Spaces: Running

Update updated_word.py

updated_word.py (CHANGED: +345 additions, -114 deletions)
@@ -37,104 +37,71 @@ def get_value_as_string(value, field_name=""):
     return str(value)
 
 def find_matching_json_value(field_name, flat_json):
-    """
     field_name = field_name.strip()
 
-    # Manual mapping for specific sections that need special handling
-    manual_mappings = {
-        "attendance list name and position title": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
-        "attendance list (names and position titles)": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
-        "nature of the operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
-        "nature of the operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
-        "nature of operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
-        "nature of operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
-        # Paragraph-level mappings
-        "mass management": "paragraphs.MASS MANAGEMENT",
-        "liam herbig": "paragraphs.MASS MANAGEMENT",  # Name should be replaced with company name
-        "date": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
-        # Date-related mappings
-        "13.11.2024": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
-        "auditor signature": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
-        "operator signature": "paragraphs.I hereby consent to information relating to my Accreditation to be shared with other law enforcement agencies, including a service provider authorised under the Heavy Vehicle National Law.",
-        # Specific data mappings
-        "jodie jones": "Audit Information.Auditor name",
-        "13th november 2024": "Audit Information.Date of Audit",
-        "adelaide barossa transport & warehousing pty ltd": "Operator Information.Operator name (Legal entity)",
-        "manager": "Operator Information.Operator name (Legal entity)",  # Replace manager title with company name
-        "liam herbig –manager": "Operator Information.Operator name (Legal entity)",
-        "liam herbig – manager": "Operator Information.Operator name (Legal entity)",
-        "deborah herbig – manager": "Operator Information.Operator name (Legal entity)",
-        # Contact information mappings (old data in red text -> new data from JSON)
-        "141 sitz road callington sa 5254": "Operator Information.Operator business address",  # Replace old address with new
-        "po box 743 mt barker sa": "Operator Information.Operator Postal address",  # Replace old postal with new
-        "debherbig@bigpond.com": "Operator Information.Email address",  # Replace old email with new
-        "0447 710 602": "Operator Information.Operator Telephone Number",  # Replace old phone with new
-        # Manual/Version mappings (old version -> new version)
-        "mahlo 092021v1": "Operator Information.NHVAS Manual (Policies and Procedures) developed by",  # Replace old manual with new
-        # These should stay as they are (no replacement needed, just different format)
-        "511840": "Operator Information.NHVAS Accreditation No. (If applicable)",  # Keep accreditation number
-        "26th october 2023": "Audit Information.Date of Audit",  # Use audit date instead
-        # Std 5 and Std 6 mappings
-        "the latest verification was dated 23rdnovember 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
-        "the latest verification was dated 23rd november 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
-        "internal review was dated 23rd august 2023 with 0 ncr": "Mass Management Summary of Audit findings.Std 6. Internal Review",
-        "23rd august2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
-        "23rd august 2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
-    }
-
-    # Check manual mappings first
-    normalized_field = field_name.lower().strip()
-    if normalized_field in manual_mappings:
-        mapped_key = manual_mappings[normalized_field]
-        if mapped_key in flat_json:
-            print(f" ✅ Manual mapping found for '{field_name}' -> '{mapped_key}'")
-            return flat_json[mapped_key]
-
     # Try exact match first
     if field_name in flat_json:
-        print(f" Direct match found for key '{field_name}'")
         return flat_json[field_name]
 
     # Try case-insensitive exact match
     for key, value in flat_json.items():
         if key.lower() == field_name.lower():
-            print(f" Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
             return value
 
-    # Try
     for key, value in flat_json.items():
-        if key.
-            print(f" Suffix match found for key '{field_name}' with JSON key '{key}'")
             return value
 
-    # Try partial matching
-    clean_field = re.sub(r'\s
     for key, value in flat_json.items():
-        clean_key = re.sub(r'\s
-
-
         return value
 
-    #
     field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
     best_match = None
     best_score = 0
 
     for key, value in flat_json.items():
         key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
-
         common_words = field_words.intersection(key_words)
         if common_words:
-
-
-
-
 
-    if best_match and best_score >= 0.
-        print(f"
-        return best_match
 
-    # No match found
     print(f" ❌ No match found for '{field_name}'")
     return None
@@ -152,38 +119,291 @@ def has_red_text(cell):
         return True
     return False
 
-def
-
 
-
-
-
-
         if is_red(run):
-
 
-    if
-
 
-    #
-
-
-
-
-    if
-    #
-
-
-
-
-
-
-
-
 
-    # Clear
-    for run in
             run.text = ''
 
     return replacements_made
 
 def handle_australian_company_number(row, company_numbers):
@@ -568,27 +788,41 @@ def process_paragraphs(document, flat_json):
         replacements_made += 1
     return replacements_made
 
-def
-
     try:
-
         flat_json = flatten_json(json_data)
         print("📄 Available JSON keys (sample):")
-
-
-            if count < 10:
                 print(f" - {key}: {value}")
-
-        print(f" ... and {len(flat_json) - count} more keys\n")
 
-
 
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements
 
-
-
         print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")
 
     except FileNotFoundError as e:
@@ -600,10 +834,7 @@ def main(json_path, docx_path, output_path):
 
 if __name__ == "__main__":
     import sys
-    if len(sys.argv)
-
-
-
-        json_path = sys.argv[2]
-        output_path = sys.argv[3]
-        main(json_path, docx_path, output_path)
Updated side of the diff:

@@ -37,104 +37,71 @@ def get_value_as_string(value, field_name=""):
     return str(value)
 
 def find_matching_json_value(field_name, flat_json):
+    """Completely dynamic matching without manual mappings"""
     field_name = field_name.strip()
 
     # Try exact match first
     if field_name in flat_json:
+        print(f" ✅ Direct match found for key '{field_name}'")
         return flat_json[field_name]
 
     # Try case-insensitive exact match
     for key, value in flat_json.items():
         if key.lower() == field_name.lower():
+            print(f" ✅ Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
             return value
 
+    # Try suffix matching (for nested keys like "section.field")
     for key, value in flat_json.items():
+        if '.' in key and key.split('.')[-1].lower() == field_name.lower():
+            print(f" ✅ Suffix match found for key '{field_name}' with JSON key '{key}'")
             return value
 
+    # Try partial matching - remove parentheses and special chars
+    clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
+    clean_field = re.sub(r'\s+', ' ', clean_field)  # Multiple spaces to single
+
     for key, value in flat_json.items():
+        clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
+        clean_key = re.sub(r'\s+', ' ', clean_key)
+
+        if clean_field == clean_key:
+            print(f" ✅ Clean match found for key '{field_name}' with JSON key '{key}'")
            return value
 
+    # Word-based fuzzy matching
     field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
+    if not field_words:
+        return None
+
     best_match = None
     best_score = 0
+    best_key = None
 
     for key, value in flat_json.items():
         key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
+        if not key_words:
+            continue
+
+        # Calculate similarity score
         common_words = field_words.intersection(key_words)
         if common_words:
+            # Use Jaccard similarity: intersection / union
+            similarity = len(common_words) / len(field_words.union(key_words))
+
+            # Bonus for high word coverage in field_name
+            coverage = len(common_words) / len(field_words)
+            final_score = (similarity * 0.6) + (coverage * 0.4)
+
+            if final_score > best_score:
+                best_score = final_score
+                best_match = value
+                best_key = key
 
+    if best_match and best_score >= 0.3:  # Lowered threshold for more matches
+        print(f" ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
+        return best_match
 
     print(f" ❌ No match found for '{field_name}'")
     return None
 
@@ -152,38 +119,291 @@ def has_red_text(cell):
                 return True
     return False
 
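A note on the scoring used by the new fuzzy matcher above: it blends Jaccard similarity of the word sets with how much of the field name is covered. A minimal, self-contained sketch of that arithmetic on a made-up field name and JSON key (both hypothetical, not taken from the audit template):

import re

def fuzzy_score(field_name, key):
    # Same tokenisation as find_matching_json_value: lowercase words longer than 2 characters
    field_words = set(w.lower() for w in re.findall(r'\b\w+\b', field_name) if len(w) > 2)
    key_words = set(w.lower() for w in re.findall(r'\b\w+\b', key) if len(w) > 2)
    common = field_words & key_words
    if not common:
        return 0.0
    similarity = len(common) / len(field_words | key_words)  # Jaccard similarity
    coverage = len(common) / len(field_words)                # share of the field's words that matched
    return similarity * 0.6 + coverage * 0.4

# Hypothetical example: 3 shared words out of a 4-word union scores roughly 0.85,
# comfortably above the 0.3 acceptance threshold used above.
print(fuzzy_score("Operator business address", "Operator Information.Operator business address"))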
+def extract_red_text_segments(cell):
+    """Extract all red text segments from a cell with better multi-line handling"""
+    red_segments = []
 
+    for para_idx, paragraph in enumerate(cell.paragraphs):
+        current_segment = ""
+        segment_runs = []
+
+        for run_idx, run in enumerate(paragraph.runs):
             if is_red(run):
+                if run.text:  # Include even empty red runs for proper replacement
+                    current_segment += run.text
+                    segment_runs.append((para_idx, run_idx, run))
+            else:
+                # End of current red segment
+                if segment_runs:  # Changed from current_segment.strip() to segment_runs
+                    red_segments.append({
+                        'text': current_segment,
+                        'runs': segment_runs.copy(),
+                        'paragraph_idx': para_idx
+                    })
+                current_segment = ""
+                segment_runs = []
+
+        # Handle segment at end of paragraph
+        if segment_runs:  # Changed from current_segment.strip() to segment_runs
+            red_segments.append({
+                'text': current_segment,
+                'runs': segment_runs.copy(),
+                'paragraph_idx': para_idx
+            })
+
+    return red_segments
+
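extract_red_text_segments above (and has_red_text earlier in the file) rely on an is_red(run) helper that sits outside this diff. A minimal sketch of what such a check usually looks like with python-docx; the real helper may also accept other reddish shades:

from docx.shared import RGBColor

def is_red(run):
    # Treat a run as red text when an explicit pure-red font colour is set (assumed behaviour)
    rgb = run.font.color.rgb if run.font.color is not None else None
    return rgb is not None and rgb == RGBColor(0xFF, 0x00, 0x00)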
+def replace_red_text_in_cell(cell, replacement_text):
+    """Enhanced cell replacement with better multi-line and multi-segment handling"""
+    red_segments = extract_red_text_segments(cell)
 
+    if not red_segments:
+        return 0
 
+    # If we have multiple segments, try to match each individually first
+    if len(red_segments) > 1:
+        replacements_made = 0
+        for segment in red_segments:
+            segment_text = segment['text'].strip()
+            if segment_text:
+                # Try to find specific match for this segment
+                # This would require access to flat_json, so we'll handle it in the calling function
+                pass
+
+        # If no individual matches, replace all with the single replacement
+        if replacements_made == 0:
+            return replace_all_red_segments(red_segments, replacement_text)
+
+    # Single segment or fallback - replace all red text with the replacement
+    return replace_all_red_segments(red_segments, replacement_text)
+
+def replace_all_red_segments(red_segments, replacement_text):
+    """Replace all red segments with the replacement text"""
+    if not red_segments:
+        return 0
+
+    # Handle multi-line replacement text
+    if '\n' in replacement_text:
+        replacement_lines = replacement_text.split('\n')
+    else:
+        replacement_lines = [replacement_text]
+
+    replacements_made = 0
+
+    # Replace first segment with first line
+    if red_segments and replacement_lines:
+        first_segment = red_segments[0]
+        if first_segment['runs']:
+            first_run = first_segment['runs'][0][2]  # (para_idx, run_idx, run)
+            first_run.text = replacement_lines[0]
+            first_run.font.color.rgb = RGBColor(0, 0, 0)
+            replacements_made = 1
 
+            # Clear other runs in first segment
+            for _, _, run in first_segment['runs'][1:]:
                 run.text = ''
 
+    # Clear all other red segments
+    for segment in red_segments[1:]:
+        for _, _, run in segment['runs']:
+            run.text = ''
+
+    # If we have multiple lines, add them to the same paragraph or create new runs
+    if len(replacement_lines) > 1 and red_segments:
+        try:
+            # Get the paragraph that contains the first run
+            first_run = red_segments[0]['runs'][0][2]
+            paragraph = first_run.element.getparent()  # Get the paragraph element
+
+            # Add remaining lines as new runs in the same paragraph with line breaks
+            for line in replacement_lines[1:]:
+                if line.strip():  # Only add non-empty lines
+                    # Add a line break run
+                    from docx.oxml import OxmlElement, ns
+                    br = OxmlElement('w:br')
+                    first_run.element.append(br)
+
+                    # Add the text as a new run
+                    new_run = paragraph.add_run(line.strip())
+                    new_run.font.color.rgb = RGBColor(0, 0, 0)
+        except:
+            # If we can't add line breaks, just put everything in the first run
+            if red_segments and red_segments[0]['runs']:
+                first_run = red_segments[0]['runs'][0][2]
+                # Join all lines with spaces instead of line breaks
+                first_run.text = ' '.join(replacement_lines)
+                first_run.font.color.rgb = RGBColor(0, 0, 0)
+
+    return replacements_made
+
+def handle_multiple_red_segments_in_cell(cell, flat_json):
+    """Handle cells with multiple red text segments dynamically"""
+    red_segments = extract_red_text_segments(cell)
+
+    if not red_segments:
+        return 0
+
+    print(f" 🔍 Found {len(red_segments)} red text segments in cell")
+    replacements_made = 0
+    unmatched_segments = []
+
+    # Try to match each segment individually
+    for i, segment in enumerate(red_segments):
+        segment_text = segment['text'].strip()
+        if not segment_text:
+            continue
+
+        print(f" Segment {i+1}: '{segment_text[:50]}...'")
+
+        # Find JSON match for this segment
+        json_value = find_matching_json_value(segment_text, flat_json)
+
+        if json_value is not None:
+            replacement_text = get_value_as_string(json_value, segment_text)
+
+            # Handle list values
+            if isinstance(json_value, list) and len(json_value) > 1:
+                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
+
+            success = replace_single_segment(segment, replacement_text)
+            if success:
+                replacements_made += 1
+                print(f" ✅ Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")
+        else:
+            unmatched_segments.append(segment)
+            print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")
+
+    # If we have unmatched segments, try to match the combined text
+    if unmatched_segments and replacements_made == 0:
+        combined_text = " ".join(seg['text'] for seg in red_segments).strip()
+        print(f" 🔄 Trying combined text match: '{combined_text[:50]}...'")
+
+        json_value = find_matching_json_value(combined_text, flat_json)
+        if json_value is not None:
+            replacement_text = get_value_as_string(json_value, combined_text)
+            if isinstance(json_value, list) and len(json_value) > 1:
+                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
+
+            # Replace all segments with the combined replacement
+            replacements_made = replace_all_red_segments(red_segments, replacement_text)
+            print(f" ✅ Replaced combined text with '{replacement_text[:50]}...'")
+
+    return replacements_made
+
+def replace_single_segment(segment, replacement_text):
+    """Replace a single red text segment"""
+    if not segment['runs']:
+        return False
+
+    # Replace first run with new text
+    first_run = segment['runs'][0][2]  # (para_idx, run_idx, run)
+    first_run.text = replacement_text
+    first_run.font.color.rgb = RGBColor(0, 0, 0)
+
+    # Clear remaining runs in the segment
+    for _, _, run in segment['runs'][1:]:
+        run.text = ''
+
+    return True
+
+def process_tables(document, flat_json):
+    """Enhanced table processing with better dynamic detection"""
+    replacements_made = 0
+
+    for table_idx, table in enumerate(document.tables):
+        print(f"\n🔍 Processing table {table_idx + 1}:")
+
+        # Dynamically detect table type by analyzing content
+        table_type = detect_table_type(table)
+        print(f" 📋 Detected table type: {table_type}")
+
+        if table_type == "vehicle_registration":
+            vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
+            replacements_made += vehicle_replacements
+            continue
+        elif table_type == "print_accreditation":
+            print_replacements = handle_print_accreditation_section(table, flat_json)
+            replacements_made += print_replacements
+            continue
+
+        # Process as regular key-value table
+        for row_idx, row in enumerate(table.rows):
+            if len(row.cells) < 1:
+                continue
+
+            # Process each cell for red text
+            for cell_idx, cell in enumerate(row.cells):
+                if has_red_text(cell):
+                    cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
+                    replacements_made += cell_replacements
+
+                    # If no individual segment matches found, try context-based matching
+                    if cell_replacements == 0:
+                        context_replacements = try_context_based_replacement(cell, row, table, flat_json)
+                        replacements_made += context_replacements
+
+    return replacements_made
+
+def detect_table_type(table):
+    """Dynamically detect table type based on content"""
+    # Get text from first few rows
+    sample_text = ""
+    for row in table.rows[:3]:
+        for cell in row.cells:
+            sample_text += get_clean_text(cell).lower() + " "
+
+    # Vehicle registration indicators
+    vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
+    vehicle_score = sum(1 for indicator in vehicle_indicators if indicator in sample_text)
+
+    # Print accreditation indicators
+    print_indicators = ["print name", "position title"]
+    print_score = sum(1 for indicator in print_indicators if indicator in sample_text)
+
+    if vehicle_score >= 3:
+        return "vehicle_registration"
+    elif print_score >= 2:
+        return "print_accreditation"
+    else:
+        return "key_value"
+
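Both detect_table_type above and try_context_based_replacement below call a get_clean_text(cell) helper that is not part of this diff. A plausible minimal version, assuming it simply joins and normalises the cell's paragraph text:

def get_clean_text(cell):
    # Concatenate the cell's paragraphs and collapse runs of whitespace (assumed behaviour)
    text = " ".join(p.text for p in cell.paragraphs)
    return " ".join(text.split())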
+def try_context_based_replacement(cell, row, table, flat_json):
+    """Try to find replacement using context from surrounding cells"""
+    replacements_made = 0
+
+    # Get context from row headers/labels
+    row_context = ""
+    if len(row.cells) > 1:
+        # First cell might be a label
+        first_cell_text = get_clean_text(row.cells[0]).strip()
+        if first_cell_text and not has_red_text(row.cells[0]):
+            row_context = first_cell_text
+
+    # Get red text from the cell
+    red_segments = extract_red_text_segments(cell)
+    for segment in red_segments:
+        red_text = segment['text'].strip()
+        if not red_text:
+            continue
+
+        # Try combining context with red text
+        if row_context:
+            context_queries = [
+                f"{row_context} {red_text}",
+                f"{row_context}",
+                red_text
+            ]
+
+            for query in context_queries:
+                json_value = find_matching_json_value(query, flat_json)
+                if json_value is not None:
+                    replacement_text = get_value_as_string(json_value, query)
+                    success = replace_single_segment(segment, replacement_text)
+                    if success:
+                        replacements_made += 1
+                        print(f" ✅ Context-based replacement: '{query}' -> '{replacement_text[:30]}...'")
+                        break
+
     return replacements_made
 
 def handle_australian_company_number(row, company_numbers):
 
@@ -568,27 +788,41 @@ def process_paragraphs(document, flat_json):
         replacements_made += 1
     return replacements_made
 
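process_hf below flattens the parsed JSON before matching. flatten_json is defined earlier in the file and is not shown in this diff; the dot-joined keys it produces (for example "Operator Information.Email address") are what the suffix matching in find_matching_json_value expects. A rough sketch of the assumed behaviour:

def flatten_json(data, parent_key=""):
    # Flatten nested dicts into {"Section.Field": value}; lists and scalars are kept as values (assumed)
    flat = {}
    for key, value in data.items():
        full_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_json(value, full_key))
        else:
            flat[full_key] = value
    return flat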
+def process_hf(json_file, docx_file, output_file):
+    """
+    Accepts file-like objects or file paths.
+    For Hugging Face: json_file, docx_file, output_file will be file-like objects.
+    """
     try:
+        # --- Load JSON (file or file-like) ---
+        if hasattr(json_file, "read"):
+            json_data = json.load(json_file)
+        else:
+            with open(json_file, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
         flat_json = flatten_json(json_data)
         print("📄 Available JSON keys (sample):")
+        for i, (key, value) in enumerate(sorted(flat_json.items())):
+            if i < 10:
                 print(f" - {key}: {value}")
+        print(f" ... and {len(flat_json) - 10} more keys\n")
 
+        # --- Load DOCX (file or file-like) ---
+        if hasattr(docx_file, "read"):
+            doc = Document(docx_file)
+        else:
+            doc = Document(docx_file)
 
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
         total_replacements = table_replacements + paragraph_replacements
 
+        # --- Save DOCX output (file or file-like) ---
+        if hasattr(output_file, "write"):
+            doc.save(output_file)
+        else:
+            doc.save(output_file)
+            print(f"\n✅ Document saved as: {output_file}")
         print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")
 
     except FileNotFoundError as e:
 
@@ -600,10 +834,7 @@ def main(json_path, docx_path, output_path):
 
 if __name__ == "__main__":
     import sys
+    if len(sys.argv) == 4:
+        process_hf(sys.argv[1], sys.argv[2], sys.argv[3])
+    else:
+        print("Usage: python updated_word.py <input_json> <input_docx> <output_docx>")
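For a Hugging Face Space, or any in-memory pipeline, process_hf can also be driven entirely with file-like objects rather than paths; the file names below are placeholders:

import io

with open("audit.json", "rb") as jf, open("template.docx", "rb") as df:
    json_buf = io.BytesIO(jf.read())
    docx_buf = io.BytesIO(df.read())

output_buf = io.BytesIO()
process_hf(json_buf, docx_buf, output_buf)
output_buf.seek(0)  # rewind before returning or uploading the generated .docx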