# company_chatbot/community_contributions/Multi-Model-Resume–JD-Match-Analyzer/multi_file_ingestion.py
import os

from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredFileLoader,
)

def load_and_split_resume(file_path: str):
    """
    Loads a resume file with the appropriate LangChain loader and returns the
    resulting documents. The splitting step suggested by the name is currently
    left to the caller (see the commented-out return at the end of the function).

    Args:
        file_path (str): Path to the resume file (.txt, .pdf, .docx, etc.)

    Returns:
        List[Document]: LangChain documents loaded from the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    # Select the appropriate loader based on the file extension
    if ext == ".txt":
        loader = TextLoader(file_path, encoding="utf-8")
    elif ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext in [".docx", ".doc"]:
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        # Fallback for other common formats
        loader = UnstructuredFileLoader(file_path)

    # Load the file as LangChain documents
    documents = loader.load()

    return documents
    # return [doc.page_content for doc in split_docs]
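

# A minimal usage sketch, assuming "sample_resume.pdf" exists locally (the path
# is hypothetical), that chunk_size/chunk_overlap values are illustrative, and
# that RecursiveCharacterTextSplitter is importable from the same langchain
# version as the document loaders above. It mirrors the chunking step hinted at
# by the docstring and the commented-out return.
if __name__ == "__main__":
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    docs = load_and_split_resume("sample_resume.pdf")  # hypothetical resume file

    # Split the loaded documents into overlapping text chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)
    chunks = [doc.page_content for doc in split_docs]

    print(f"Loaded {len(docs)} document(s), produced {len(chunks)} chunk(s)")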