Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 12

Commit

4edca00

verified ·

1 Parent(s): 9d9d8a8

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +183 -1

updated_word.py CHANGED Viewed

@@ -3,6 +3,187 @@ from docx import Document
 from docx.shared import RGBColor
 import re
 def load_json(filepath):
     with open(filepath, 'r') as file:
         return json.load(file)
@@ -815,6 +996,7 @@ def process_hf(json_file, docx_file, output_file):
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements
         # --- Save DOCX output (file or file-like) ---
@@ -823,7 +1005,7 @@ def process_hf(json_file, docx_file, output_file):
         else:
             doc.save(output_file)
         print(f"\n✅ Document saved as: {output_file}")
-        print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")
     except FileNotFoundError as e:
         print(f"❌ File not found: {e}")

 from docx.shared import RGBColor
 import re
# Regular expressions used to recognise report headings in paragraph text.
# Whitespace between words is written as \s+ so any run of whitespace matches.
_MAIN_HEADING_PATTERNS = [
    r"NHVAS\s+Audit\s+Summary\s+Report",
    r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
    r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT",
]

_SUB_HEADING_PATTERNS = [
    r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
    r"MAINTENANCE\s+MANAGEMENT",
    r"MASS\s+MANAGEMENT",
    r"FATIGUE\s+MANAGEMENT",
    r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
    r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
    r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
    r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
    r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
    r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
    r"Operator\s+Declaration",
    r"Operator\s+Information",
]

# Category name -> list of heading regexes, in the order they are tried.
HEADING_PATTERNS = {
    "main": _MAIN_HEADING_PATTERNS,
    "sub": _SUB_HEADING_PATTERNS,
}
def process_headings(document, flat_json):
    """Replace red text found in recognised headings and in the text just below them.

    Every paragraph of *document* whose stripped text matches one of the
    regexes in ``HEADING_PATTERNS`` (case-insensitively) is treated as a
    heading.  Red text inside the heading itself is processed first; then
    up to three following paragraphs are inspected, skipping empty ones and
    stopping as soon as another heading is reached.  The heading text is
    passed on as the matching context for each of those paragraphs.

    Args:
        document: Object exposing a ``paragraphs`` sequence.
        flat_json: Lookup data forwarded to ``process_red_text_in_paragraph``.

    Returns:
        The sum of the replacement counts reported by
        ``process_red_text_in_paragraph``.
    """

    def _is_heading(text):
        # True when any configured pattern, from any category, occurs in text.
        return any(
            re.search(regex, text, re.IGNORECASE)
            for regexes in HEADING_PATTERNS.values()
            for regex in regexes
        )

    total = 0
    print(f"\n🔍 Processing headings:")
    all_paragraphs = document.paragraphs

    for idx, heading in enumerate(all_paragraphs):
        heading_text = heading.text.strip()
        if not heading_text or not _is_heading(heading_text):
            continue

        print(f"  📌 Found heading at paragraph {idx + 1}: '{heading_text}'")

        # The heading paragraph itself may carry red placeholder text.
        if has_red_text_in_paragraph(heading):
            print(f"    🔴 Found red text in heading itself")
            total += process_red_text_in_paragraph(heading, heading_text, flat_json)

        # Inspect at most the three paragraphs that follow the heading; the
        # slice simply ends early when the document does.
        followers = all_paragraphs[idx + 1:idx + 4]
        for follower_idx, follower in enumerate(followers, start=idx + 1):
            follower_text = follower.text.strip()
            if not follower_text:
                continue
            # A new heading starts a new section, so stop looking further.
            if _is_heading(follower_text):
                break
            if not has_red_text_in_paragraph(follower):
                continue
            print(f"    🔴 Found red text in paragraph {follower_idx + 1} after heading: '{follower_text[:50]}...'")
            # The heading text acts as context to improve matching.
            total += process_red_text_in_paragraph(follower, heading_text, flat_json)

    return total
def has_red_text_in_paragraph(paragraph):
    """Return True if any run in *paragraph* is red and has non-blank text."""
    return any(is_red(run) and run.text.strip() for run in paragraph.runs)
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
    """Replace the red text of one paragraph with a value looked up in *flat_json*.

    The stripped text of all non-blank red runs is joined with single spaces
    and looked up with ``find_matching_json_value`` using, in order:

    1. the red text itself;
    2. a fixed list of field names when *context_text* names the auditor or
       operator declaration section;
    3. the context combined with the red text, then the context alone.

    On success the first red run receives the replacement text and is
    recoloured black, and every other red run is emptied.

    Args:
        paragraph: Paragraph whose ``runs`` are inspected and modified.
        context_text: Heading text used as matching context.
        flat_json: Lookup data passed to ``find_matching_json_value``.

    Returns:
        1 if the red text was replaced, otherwise 0.
    """
    # Collect the red runs once; they serve both matching and replacement.
    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
    if not red_runs:
        return 0

    combined_red_text = " ".join(run.text.strip() for run in red_runs).strip()
    print(f"      🔍 Red text found: '{combined_red_text}'")

    # Strategy 1: direct red text matching.
    json_value = find_matching_json_value(combined_red_text, flat_json)

    # Strategy 2: context-based matching for specific headings.
    if json_value is None:
        # Headings are detected with \s+ between words, so collapse whitespace
        # here too; otherwise a heading with a double space or a tab would be
        # recognised as a heading but never reach its context-specific fields.
        normalized_context = " ".join(context_text.upper().split())
        context_fields = (
            (
                "NHVAS APPROVED AUDITOR",
                "auditor",
                ["auditor name", "auditor", "nhvas auditor", "approved auditor"],
            ),
            (
                "OPERATOR DECLARATION",
                "operator",
                ["operator name", "operator", "company name", "organisation name"],
            ),
        )
        for marker, label, fields in context_fields:
            if marker not in normalized_context:
                continue
            for field in fields:
                json_value = find_matching_json_value(field, flat_json)
                if json_value is not None:
                    print(f"      ✅ Found {label} match with field: '{field}'")
                    break
            # Only the first matching context is tried (if/elif semantics).
            break

    # Strategy 3: combine context with red text, then fall back to context.
    # The bare red text is not retried: Strategy 1 already found no match.
    if json_value is None:
        for query in (f"{context_text} {combined_red_text}", context_text):
            json_value = find_matching_json_value(query, flat_json)
            if json_value is not None:
                print(f"      ✅ Found match with combined query: '{query[:50]}...'")
                break

    if json_value is None:
        print(f"      ❌ No match found for red text: '{combined_red_text}'")
        return 0

    replacement_text = get_value_as_string(json_value, combined_red_text)

    # Put the whole replacement into the first red run and blank the others
    # so the value appears exactly once in the paragraph.
    first_run, *other_runs = red_runs
    first_run.text = replacement_text
    first_run.font.color.rgb = RGBColor(0, 0, 0)  # Change to black
    for run in other_runs:
        run.text = ''

    print(f"      ✅ Replaced with: '{replacement_text}'")
    return 1
 def load_json(filepath):
     with open(filepath, 'r') as file:
         return json.load(file)
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
+        heading_replacements = process_headings(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements
         # --- Save DOCX output (file or file-like) ---
         else:
             doc.save(output_file)
         print(f"\n✅ Document saved as: {output_file}")
+        print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs, {heading_replacements} in headings)")
     except FileNotFoundError as e:
         print(f"❌ File not found: {e}")