Spaces:
Sleeping
Sleeping
| import os | |
| from langchain_community.document_loaders import ( | |
| PyMuPDFLoader, | |
| TextLoader, | |
| Docx2txtLoader, | |
| DirectoryLoader, | |
| ) | |
class DocumentProcessor:
    """Load the supported documents found in a single directory.

    File types are selected by file-name suffix: ``.pdf``, ``.txt``,
    ``.docx`` and ``.doc``.
    """

    def __init__(self, path: str):
        """Remember the directory that ``files_to_texts`` will scan.

        Args:
            path: Directory containing the files to load.
        """
        self.path = path

    def files_to_texts(self) -> list:
        """Load every supported file in ``self.path``.

        A ``DirectoryLoader`` is created only for glob patterns whose suffix
        matches at least one entry name returned by ``os.listdir``, so
        patterns with no candidate files are never handed to a loader.

        Returns:
            The results of each loader's ``load()`` call, concatenated in
            pattern order (pdf, txt, docx, doc). An empty list when the
            directory has no entries or no entry matches a pattern.

        Raises:
            OSError: Propagated from ``os.listdir`` when ``self.path`` cannot
                be listed (for example ``FileNotFoundError``).
        """
        # List the directory once; the listing does not depend on the pattern,
        # so there is no reason to repeat it for every glob.
        entry_names = os.listdir(self.path)
        if not entry_names:
            return []

        # A value is either a loader class, or a ``(loader class, kwargs)``
        # tuple when the loader needs constructor arguments.
        # NOTE(review): "*.doc" is routed to Docx2txtLoader; legacy binary
        # .doc files are presumably not readable by it -- confirm.
        loaders_config = {
            "*.pdf": PyMuPDFLoader,
            "*.txt": (TextLoader, {"encoding": "utf-8"}),
            "*.docx": Docx2txtLoader,
            "*.doc": Docx2txtLoader,
        }

        loaders = []
        for pattern, spec in loaders_config.items():
            # "*.pdf" -> ".pdf": compare by suffix against the listed names.
            suffix = pattern[1:]
            if not any(name.endswith(suffix) for name in entry_names):
                continue

            if isinstance(spec, tuple):
                loader_cls, loader_kwargs = spec
            else:
                loader_cls, loader_kwargs = spec, None

            loaders.append(
                DirectoryLoader(
                    path=self.path,
                    glob=pattern,
                    loader_cls=loader_cls,
                    loader_kwargs=loader_kwargs,
                )
            )

        documents = []
        for loader in loaders:
            documents.extend(loader.load())
        return documents