Spaces:
Sleeping
Sleeping
Update document_chunker.py
Browse files- document_chunker.py +17 -2
document_chunker.py
CHANGED
|
@@ -70,9 +70,23 @@ class DocumentChunker:
|
|
| 70 |
sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
|
| 71 |
return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
|
| 72 |
|
| 73 |
-
def extract_text_from_docx(self, file_path: str) -> str:
    """Return the paragraphs of a .docx file as newline-separated text.

    A paragraph in which at least one run is bold is wrapped in ``**`` so
    that later processing can still recognise it; other paragraphs are
    emitted unchanged.

    Args:
        file_path: Location of the .docx file, passed straight to
            ``Document``.

    Returns:
        All paragraph texts joined with ``'\n'``.
    """
    rendered = []
    for paragraph in Document(file_path).paragraphs:
        has_bold_run = any(run.bold for run in paragraph.runs)
        if has_bold_run:
            rendered.append(f"**{paragraph.text}**")
        else:
            rendered.append(paragraph.text)
    return '\n'.join(rendered)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def detect_document_type(self, text: str) -> str:
|
| 78 |
keywords = ['grant', 'funding', 'mission']
|
|
@@ -144,7 +158,8 @@ class DocumentChunker:
|
|
| 144 |
|
| 145 |
def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
|
| 146 |
file_path = Path(file_path)
|
| 147 |
-
text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
|
|
|
|
| 148 |
doc_type = self.detect_document_type(text)
|
| 149 |
headers = self.extract_headers(text, doc_type)
|
| 150 |
raw_chunks = self.chunk_by_headers(text, headers)
|
|
|
|
| 70 |
sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
|
| 71 |
return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
|
| 72 |
|
| 73 |
+
# def extract_text_from_docx(self, file_path: str) -> str:
|
| 74 |
+
# doc = Document(file_path)
|
| 75 |
+
# return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
|
| 76 |
+
def extract_text(self, file_path: str) -> str:
    """Return the text content of the document at *file_path*.

    The handler is picked from the file extension, compared
    case-insensitively (so ``REPORT.DOCX`` is treated like ``report.docx``):

    * ``.docx`` - paragraphs joined with newlines; a paragraph containing
      at least one bold run is wrapped in ``**``.
    * ``.pdf`` - the text of every page, concatenated in page order.
    * anything else - the file is read as UTF-8 text.

    Args:
        file_path: Path to the document. A ``str`` or a ``pathlib.Path``
            is accepted; it is converted with ``str()`` before use.

    Returns:
        The extracted text as a single string.
    """
    path_str = str(file_path)
    # Lower-case once so both extension checks ignore case.
    lowered = path_str.lower()

    if lowered.endswith(".docx"):
        doc = Document(path_str)
        return '\n'.join(
            f"**{p.text}**" if any(r.bold for r in p.runs) else p.text
            for p in doc.paragraphs
        )

    if lowered.endswith(".pdf"):
        # Imported here so PyMuPDF is only required when a PDF is processed.
        import fitz  # PyMuPDF

        with fitz.open(path_str) as pdf:
            # The join is evaluated inside the `with`, while the PDF is open.
            return "".join(page.get_text() for page in pdf)

    # Explicit encoding: do not depend on the platform's default codec.
    return Path(path_str).read_text(encoding="utf-8")
|
| 89 |
+
|
| 90 |
|
| 91 |
def detect_document_type(self, text: str) -> str:
|
| 92 |
keywords = ['grant', 'funding', 'mission']
|
|
|
|
| 158 |
|
| 159 |
def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
|
| 160 |
file_path = Path(file_path)
|
| 161 |
+
# text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
|
| 162 |
+
text = self.extract_text(str(file_path))
|
| 163 |
doc_type = self.detect_document_type(text)
|
| 164 |
headers = self.extract_headers(text, doc_type)
|
| 165 |
raw_chunks = self.chunk_by_headers(text, headers)
|