Spaces:
Runtime error
Runtime error
| import os | |
| from langchain_community.document_loaders.csv_loader import CSVLoader | |
| from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader | |
| from langchain_community.document_loaders.excel import UnstructuredExcelLoader | |
| from langchain_community.document_loaders import PDFMinerLoader | |
| from langchain_community.document_loaders import TextLoader | |
| from langchain_community.document_loaders import Docx2txtLoader | |
| from langchain.docstore.document import Document | |
| import logging | |
# Thread count derived from the CPU count; os.cpu_count() can return None, in
# which case 8 is used.  Not referenced in this file -- presumably consumed by
# ingestion code elsewhere (TODO confirm).
INGEST_THREADS = os.cpu_count() or 8

# Maps a lower-case file extension (leading dot included) to the LangChain
# loader class used to read files of that type.
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": TextLoader,
    ".pdf": PDFMinerLoader,
    # The literal used to list ".csv" twice (CSVLoader, then
    # UnstructuredCSVLoader).  A dict literal keeps the LAST value for a
    # repeated key, so UnstructuredCSVLoader is the loader that was actually
    # in effect; the dead CSVLoader entry is dropped to make that explicit.
    # NOTE(review): switch this to CSVLoader if that was the intended loader.
    ".csv": UnstructuredCSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    # Add additional file types here if necessary
}

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
class DocumentLoader:
    """Load files from a directory tree into LangChain ``Document`` objects.

    The loader class for each file is looked up in the module-level
    ``DOCUMENT_MAP`` by file extension.  Extensions are compared
    case-insensitively, so ``report.PDF`` is handled like ``report.pdf``.
    """

    def __init__(self, source_dir: str):
        """
        Initializes the loader with the directory path from which to load documents.

        Args:
            source_dir (str): Root directory that will be scanned for documents.
        """
        self.source_dir = source_dir
        logger.info(f"DocumentLoader initialized with source directory: {self.source_dir}")

    def load_single_document(self, file_path: str) -> list[Document]:
        """
        Loads a single document based on its file extension using the appropriate loader.

        Args:
            file_path (str): Path to the document file.

        Returns:
            List[Document]: Loaded document(s) as LangChain Document instances.

        Raises:
            ValueError: If the file extension has no entry in DOCUMENT_MAP.
            Exception: Any error raised by the loader is logged and re-raised.
        """
        file_extension = os.path.splitext(file_path)[1]
        # DOCUMENT_MAP keys are lower-case; normalise so ".PDF" matches ".pdf".
        loader_class = DOCUMENT_MAP.get(file_extension.lower())
        if loader_class is None:
            logger.warning(f"Unsupported document type for file: {file_path}")
            raise ValueError(f"Unsupported document type: {file_extension}")

        loader = loader_class(file_path)
        logger.info(f"Loading document: {file_path}")
        try:
            documents = loader.load()
        except Exception as e:
            # Log with traceback here, then let the caller decide what to do.
            logger.error(f"Error loading document {file_path}: {e}", exc_info=True)
            raise
        logger.info(f"Successfully loaded document: {file_path}")
        return documents

    def load_all_documents(self) -> list[Document]:
        """
        Loads all documents from the source directory, including documents in subdirectories.

        A file that fails to load is logged and skipped, so one bad file does
        not abort the whole run.

        Returns:
            List[Document]: List of all loaded documents from the source directory.
        """
        paths = self._gather_file_paths()  # Gather file paths of documents to load
        all_docs = []
        logger.info(f"Loading all documents from directory: {self.source_dir}")
        for file_path in paths:
            try:
                documents = self.load_single_document(file_path)
                all_docs.extend(documents)  # Append loaded documents to the result list
            except ValueError as e:
                logger.error(f"Skipping file {file_path}: {e}")
            except Exception as e:
                logger.error(f"An unexpected error occurred while loading {file_path}: {e}", exc_info=True)
        logger.info(f"Finished loading documents. Total documents loaded: {len(all_docs)}")
        return all_docs

    def _gather_file_paths(self) -> list[str]:
        """
        Walks through the source directory and gathers file paths of documents
        that match the supported file types in DOCUMENT_MAP.

        The extension comparison is case-insensitive.

        Returns:
            List[str]: List of file paths for documents to load.
        """
        file_paths = []
        logger.debug(f"Scanning for files in directory: {self.source_dir}")
        for root, _, files in os.walk(self.source_dir):
            for file_name in files:
                # Lower-case the extension so "DATA.CSV" is not silently skipped.
                file_extension = os.path.splitext(file_name)[1].lower()
                if file_extension in DOCUMENT_MAP:
                    full_path = os.path.join(root, file_name)
                    file_paths.append(full_path)
                    logger.debug(f"Found document: {full_path}")
        logger.info(f"Total files found for loading: {len(file_paths)}")
        return file_paths
| # if __name__ == "__main__": | |
| # source_directory = os.path.join(os.path.dirname(__file__),'..','Data') | |
| # document_loader = DocumentLoader(source_directory) | |
| # documents = document_loader.load_all_documents() | |
| # from langchain_community.embeddings import OpenAIEmbeddings | |
| # from langchain_community.vectorstores import FAISS | |
| # directory_path = os.path.join(os.path.dirname(__file__),'..','Data') | |
| # documents = load_documents(directory_path) | |
| # print(documents) | |
| # print(os.path.join(os.path.dirname(__file__),'..','Data')) |