Spaces:
Running
Running
# content.py
# Simple content registry for two modes: "career" and "personal".
# Auto-loads PDF/TXT/MD files from folders and concatenates their text for prompts.
import glob
import logging
import os
from dataclasses import dataclass
from typing import List, Dict, Iterable, Optional

from pypdf import PdfReader
@dataclass
class Doc:
    """One loaded document together with the domain it belongs to.

    The ``@dataclass`` decorator generates the keyword-argument
    constructor, ``__repr__`` and ``__eq__``; without it the annotations
    below would declare no instance fields and ``Doc(domain=..., ...)``
    would raise ``TypeError``.
    """

    domain: str  # "career" or "personal"
    title: str  # human-readable label, used as the section heading
    text: str  # extracted plain-text content
    source_path: str  # where the text was read from
class ContentStore:
    """In-memory registry of documents grouped by domain.

    Documents are kept both in insertion order (``docs``) and bucketed by
    their ``domain`` attribute (``by_domain``), so the text of one or more
    domains can be concatenated into a single string.
    """

    # Suffixes read as UTF-8 text; ".pdf" is handled separately.
    _TEXT_SUFFIXES = (".txt", ".md", ".markdown")

    def __init__(self):
        self.docs: List["Doc"] = []
        self.by_domain: Dict[str, List["Doc"]] = {}

    # ---------- Loading ----------
    def add_doc(self, doc: "Doc"):
        """Register *doc* in the flat list and under ``doc.domain``."""
        self.docs.append(doc)
        self.by_domain.setdefault(doc.domain, []).append(doc)

    def load_folder(self, folder: str, domain: str):
        """Load all files directly inside *folder* into *domain*.

        Supported: .pdf, .txt, .md, .markdown

        The folder is created if it does not exist. Subdirectories,
        unsupported files and files that yield no text are skipped. Files
        are processed in sorted path order so the resulting document order
        (and therefore the concatenated text) is reproducible.
        """
        os.makedirs(folder, exist_ok=True)
        # Escape the folder part so glob metacharacters in its name
        # ("[", "*", "?") are matched literally instead of as a pattern.
        pattern = os.path.join(glob.escape(folder), "*")
        # glob.glob() returns entries in arbitrary order; sort for determinism.
        for path in sorted(glob.glob(pattern)):
            if os.path.isdir(path):
                continue
            text = self._extract_text(path)
            if not text:
                continue
            title = os.path.basename(path)
            self.add_doc(Doc(domain=domain, title=title, text=text, source_path=path))

    def _extract_text(self, path: str) -> str:
        """Return the text of the file at *path*, or ``""`` if unavailable.

        The file type is chosen from the (case-insensitive) suffix. Loading
        is best-effort: a file that cannot be read or parsed yields ``""``
        and a logged warning rather than an exception.
        """
        log = logging.getLogger(__name__)
        lower = path.lower()

        if lower.endswith(".pdf"):
            try:
                reader = PdfReader(path)
                pages = [page.extract_text() for page in reader.pages]
            except Exception:
                # Broad on purpose: this is the best-effort boundary and the
                # PDF parser can fail in many ways on malformed input.
                log.warning("Could not extract text from PDF %s", path, exc_info=True)
                return ""
            # Pages without extractable text come back empty; drop them.
            return "\n".join(t for t in pages if t)

        if lower.endswith(self._TEXT_SUFFIXES):
            try:
                # "utf-8-sig" reads plain UTF-8 identically but strips a
                # leading BOM so it does not leak into the text.
                with open(path, "r", encoding="utf-8-sig") as f:
                    return f.read()
            except (OSError, ValueError):
                # ValueError covers UnicodeDecodeError for non-UTF-8 files.
                log.warning("Could not read text file %s", path, exc_info=True)
                return ""

        return ""

    # ---------- Retrieval ----------
    def join_domain_text(self, domains: Optional[Iterable[str]]) -> str:
        """Concatenate documents for the selected domains.

        If domains is None/empty, defaults to ["career"]. A single domain
        may also be passed as a plain string. Each document is rendered as
        a ``### <title>`` heading followed by its text; unknown domains
        contribute nothing.
        """
        if isinstance(domains, str):
            # A bare string would otherwise be iterated character by
            # character and match no domain at all.
            selected = [domains] if domains else []
        elif domains is None:
            selected = []
        else:
            # Materialise so one-shot iterables (e.g. generators) can be
            # tested for emptiness before falling back to the default.
            selected = list(domains)
        if not selected:
            selected = ["career"]

        chunks: List[str] = []
        for d in selected:
            for doc in self.by_domain.get(d, []):
                chunks.append(f"### {doc.title}\n{doc.text}\n")
        return "\n".join(chunks)