Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Jul 29

Commit

88a026a

verified ·

1 Parent(s): f1bab1c

Rename word_updater.py to word_extractor.py

Browse files

Files changed (2) hide show

word_extractor.py +61 -0
word_updater.py +0 -22

word_extractor.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# word_extractor.py
+from docx import Document
+from docx.shared import RGBColor
+from collections import defaultdict
+from typing import List, Dict
+def is_red_font(run) -> bool:
+    """Return True when the run's font colour is predominantly red.
+
+    A run counts as red when its RGB value has a red channel above 150 and
+    green and blue channels both below 100. A run whose font has no colour
+    object, or whose colour has no RGB value, is not red.
+    """
+    color = run.font.color
+    if not color:
+        return False
+    rgb = color.rgb
+    if not rgb:
+        return False
+    red, green, blue = rgb[0], rgb[1], rgb[2]
+    return red > 150 and max(green, blue) < 100
+def get_full_text_if_red(para) -> List[str]:
+    """Collect the red text segments of a paragraph.
+
+    Consecutive red runs (as judged by ``is_red_font``) are joined into one
+    segment; any non-red run ends the current segment. Each segment is
+    whitespace-stripped, and segments that are empty after stripping are
+    dropped, so callers never receive blank entries.
+
+    Args:
+        para: Object exposing ``runs``; each run exposes ``text`` and is
+            accepted by ``is_red_font``.
+
+    Returns:
+        The red segments in run order; empty when the paragraph has none.
+    """
+    segments = []
+    pieces = []  # texts of the red runs in the segment being built
+
+    def flush():
+        # Close the current segment; keep it only if something is left
+        # after stripping (whitespace-only red runs carry no data).
+        text = "".join(pieces).strip()
+        pieces.clear()
+        if text:
+            segments.append(text)
+
+    for run in para.runs:
+        if is_red_font(run):
+            pieces.append(run.text)
+        else:
+            flush()
+    flush()  # a paragraph may end while still inside a red segment
+    return segments
+def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
+    """Extract red text from a Word document, grouped under labels.
+
+    * Red text in ``document.paragraphs`` is stored under ``"Unlabeled"``.
+    * For a table row with two or more cells, red text in the second cell is
+      stored under the first cell's text (stripped, colons removed, newlines
+      turned into spaces), or under ``"Table_<t>_Row_<r>"`` (1-based) when
+      that text is empty. Cells after the second are not inspected.
+    * For a row with a single cell, red text is stored under
+      ``"Single_Column_Table_<t>"`` (1-based).
+
+    Blank segments are skipped in every branch.
+
+    Args:
+        doc_path: Path of the document passed to ``Document``.
+
+    Returns:
+        Mapping from label to red text segments in document order. The
+        mapping is a ``defaultdict(list)``, so looking up a missing label
+        yields (and stores) an empty list.
+    """
+    def non_blank_red(paragraphs):
+        # One shared filter for paragraphs and table cells, so that
+        # whitespace-only red runs never become entries in any branch.
+        return [
+            text
+            for para in paragraphs
+            for text in get_full_text_if_red(para)
+            if text.strip()
+        ]
+
+    document = Document(doc_path)
+    results = defaultdict(list)
+
+    unlabeled = non_blank_red(document.paragraphs)
+    if unlabeled:
+        results["Unlabeled"].extend(unlabeled)
+
+    for table_idx, table in enumerate(document.tables, start=1):
+        for row_idx, row in enumerate(table.rows, start=1):
+            cells = row.cells
+            if len(cells) >= 2:
+                values = non_blank_red(cells[1].paragraphs)
+                if not values:
+                    continue
+                label = cells[0].text.strip().replace(":", "").replace("\n", " ")
+                # Fall back to a positional label when the first cell is empty.
+                key = label if label else f"Table_{table_idx}_Row_{row_idx}"
+                results[key].extend(values)
+            elif len(cells) == 1:
+                values = non_blank_red(cells[0].paragraphs)
+                if values:
+                    results[f"Single_Column_Table_{table_idx}"].extend(values)
+    return results

word_updater.py DELETED Viewed

@@ -1,22 +0,0 @@
-# word_updater.py
-from docx import Document
-from docx.shared import RGBColor
-def is_red(run):
-    """Check whether the run's font colour is exactly RGB (255, 0, 0).
-
-    When the run's font has no (truthy) colour object, that falsy object is
-    returned as-is; otherwise the result of the equality comparison is
-    returned.
-    """
-    font_color = run.font.color
-    if not font_color:
-        return font_color
-    return font_color.rgb == RGBColor(255, 0, 0)
-def fill_template_with_data(template_path, output_path, data):
-    """Fill the red placeholder runs of a Word template and save the result.
-
-    For every paragraph run that ``is_red`` accepts, the first key of
-    ``data`` (in iteration order) that occurs case-insensitively in the
-    run's text wins, and the whole run text is replaced by that key's value.
-    Only ``doc.paragraphs`` is scanned.
-
-    Args:
-        template_path: Path of the template passed to ``Document``.
-        output_path: Path the filled document is saved to.
-        data: Mapping of placeholder key to replacement text.
-
-    Returns:
-        ``output_path``, unchanged.
-    """
-    doc = Document(template_path)
-    # Lower-case every key once instead of once per red run.
-    lowered = [(key.lower(), data[key]) for key in data]
-    for para in doc.paragraphs:
-        for run in para.runs:
-            if not is_red(run):
-                continue
-            haystack = run.text.lower()
-            for needle, value in lowered:
-                if needle in haystack:
-                    # The whole run is replaced, so assign directly rather
-                    # than replacing the text with itself.
-                    run.text = value
-                    break
-    doc.save(output_path)
-    return output_path