Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

d9eda51

verified ·

1 Parent(s): 560885b

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +58 -39

updated_word.py CHANGED Viewed

@@ -1434,60 +1434,80 @@ def process_headings(document, flat_json):
     return replacements_made
-def process_red_text_in_paragraph(paragraph, context_text, flat_json):
     replacements_made = 0
     red_text_segments = []
     for run in paragraph.runs:
         if is_red(run) and run.text.strip():
             red_text_segments.append(run.text.strip())
     if not red_text_segments:
         return 0
     combined_red_text = " ".join(red_text_segments).strip()
-    print(f"      🔍 Red text found: '{combined_red_text}'")
-    kv = find_matching_json_key_and_value(combined_red_text, flat_json)
-    json_value = kv[1] if kv else None
-    if json_value is None:
-        if "NHVAS APPROVED AUDITOR" in context_text.upper():
-            auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
-            for field in auditor_fields:
-                kv = find_matching_json_key_and_value(field, flat_json)
-                if kv:
-                    print(f"      ✅ Found auditor match with field: '{kv[0]}'")
-                    json_value = kv[1]
-                    break
-        elif "OPERATOR DECLARATION" in context_text.upper():
-            operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
-            for field in operator_fields:
-                kv = find_matching_json_key_and_value(field, flat_json)
                 if kv:
-                    print(f"      ✅ Found operator match with field: '{kv[0]}'")
-                    json_value = kv[1]
                     break
-    if json_value is None:
-        context_queries = [f"{context_text} {combined_red_text}", combined_red_text, context_text]
-        for query in context_queries:
-            kv = find_matching_json_key_and_value(query, flat_json)
-            if kv:
-                print(f"      ✅ Found match with combined query -> {kv[0]}")
-                json_value = kv[1]
-                break
-    if json_value is not None:
-        replacement_text = get_value_as_string(json_value, combined_red_text)
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
-            red_runs[0].text = replacement_text
             red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
             for run in red_runs[1:]:
                 run.text = ''
             replacements_made = 1
-            print(f"      ✅ Replaced with: '{replacement_text}'")
     else:
-        print(f"      ❌ No match found for red text: '{combined_red_text}'")
     return replacements_made
 def process_red_text_in_context_paragraph(paragraph, heading_text, flat_json, operator_name):
@@ -1593,8 +1613,7 @@ def process_hf(json_file, docx_file, output_file):
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
-        red_text_para = process_red_text_in_paragraph(paragraph, context_text, flat_json)
-        total_replacements = table_replacements + paragraph_replacements + heading_replacements + red_text_para
         # Save unmatched headers for iterative improvement
         if _unmatched_headers:

     return replacements_made
+def process_red_text_in_heading_paragraph(paragraph, paragraph_text, flat_json, operator_name):
+    """Process red text found in heading paragraphs"""
     replacements_made = 0
     red_text_segments = []
     for run in paragraph.runs:
         if is_red(run) and run.text.strip():
             red_text_segments.append(run.text.strip())
     if not red_text_segments:
         return 0
     combined_red_text = " ".join(red_text_segments).strip()
+    print(f"      🔍 Red text found in heading: '{combined_red_text}'")
+    replacement_value = None
+    # Determine what to replace based on heading context
+    if any(mgmt_type in paragraph_text.upper() for mgmt_type in ["MAINTENANCE MANAGEMENT", "MASS MANAGEMENT", "FATIGUE MANAGEMENT"]):
+        # For management section headings, replace with operator name
+        if operator_name:
+            replacement_value = operator_name
+            print(f"      ✅ Using operator name for management section: '{operator_name}'")
+    elif "NHVAS APPROVED AUDITOR DECLARATION" in paragraph_text.upper():
+        # For auditor declarations, look for auditor name
+        auditor_name = None
+        for key, value in flat_json.items():
+            if "auditor" in key.lower() and "name" in key.lower():
+                if isinstance(value, list) and value:
+                    auditor_name = str(value[0]).strip()
+                elif value:
+                    auditor_name = str(value).strip()
+                break
+        if auditor_name:
+            replacement_value = auditor_name
+            print(f"      ✅ Using auditor name: '{auditor_name}'")
+    elif "OPERATOR DECLARATION" in paragraph_text.upper():
+        # For operator declarations, use operator name
+        if operator_name:
+            replacement_value = operator_name
+            print(f"      ✅ Using operator name for operator declaration: '{operator_name}'")
+    else:
+        # For other headings, try to find a relevant match
+        # First try direct match
+        kv = find_matching_json_key_and_value(combined_red_text, flat_json)
+        if kv:
+            replacement_value = get_value_as_string(kv[1], combined_red_text)
+        else:
+            # Try contextual search with heading
+            context_queries = [f"{paragraph_text} {combined_red_text}", combined_red_text, paragraph_text]
+            for query in context_queries:
+                kv = find_matching_json_key_and_value(query, flat_json)
                 if kv:
+                    replacement_value = get_value_as_string(kv[1], combined_red_text)
+                    print(f"      ✅ Found match with combined query: {kv[0]}")
                     break
+    # Apply the replacement if we found a suitable value
+    if replacement_value:
         red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
         if red_runs:
+            red_runs[0].text = replacement_value
             red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
             for run in red_runs[1:]:
                 run.text = ''
             replacements_made = 1
+            print(f"      ✅ Replaced with: '{replacement_value}'")
     else:
+        print(f"      ❌ No suitable replacement found for: '{combined_red_text}'")
     return replacements_made
 def process_red_text_in_context_paragraph(paragraph, heading_text, flat_json, operator_name):
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         heading_replacements = process_headings(doc, flat_json)
+        total_replacements = table_replacements + paragraph_replacements + heading_replacements
         # Save unmatched headers for iterative improvement
         if _unmatched_headers: