Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 20

Commit

ddb37e5

verified ·

1 Parent(s): 412e2ed

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +202 -749

updated_word.py CHANGED Viewed

@@ -3,50 +3,60 @@ from docx import Document
 from docx.shared import RGBColor
 import re
-# Your original heading patterns (unchanged)
-HEADING_PATTERNS = {
-    "main": [
-        r"NHVAS\s+Audit\s+Summary\s+Report",
-        r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
-        r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT"
-    ],
-    "sub": [
-        r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
-        r"MAINTENANCE\s+MANAGEMENT",
-        r"MASS\s+MANAGEMENT",
-        r"FATIGUE\s+MANAGEMENT",
-        r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
-        r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
-        r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
-        r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
-        r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
-        r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
-        r"Operator\s+Declaration",
-        r"Operator\s+Information",
-        r"Driver\s*/\s*Scheduler\s+Records\s+Examined"
-    ]
-}
 def load_json(filepath):
     """Read the JSON document stored at *filepath* and return the parsed object.

     The file is opened explicitly as UTF-8 so that non-ASCII values decode
     the same way on every platform instead of depending on the locale's
     default encoding.

     Raises:
         OSError: if the file cannot be opened.
         json.JSONDecodeError: if the content is not valid JSON.
     """
     with open(filepath, 'r', encoding='utf-8') as file:
         return json.load(file)
-def flatten_json(y, prefix=''):
-    out = {}
-    for key, val in y.items():
-        new_key = f"{prefix}.{key}" if prefix else key
-        if isinstance(val, dict):
-            out.update(flatten_json(val, new_key))
-        else:
-            out[new_key] = val
-            out[key] = val
-    return out
 def is_red(run):
     """Tell whether *run*'s font colour counts as red.

     A run counts as red when its colour's ``rgb`` equals
     ``RGBColor(255, 0, 0)`` or when the colour's ``theme_color`` attribute
     (if it has one) equals 1. If the colour object itself is falsy, that
     falsy object is returned unchanged rather than ``False``.
     """
     font_color = run.font.color
     if not font_color:
         # Same result as the short-circuiting `and`: return the falsy value itself.
         return font_color
     is_pure_red = font_color.rgb == RGBColor(255, 0, 0)
     return is_pure_red or getattr(font_color, "theme_color", None) == 1
 def get_value_as_string(value, field_name=""):
     if isinstance(value, list):
         if len(value) == 0:
             return ""
@@ -54,46 +64,56 @@ def get_value_as_string(value, field_name=""):
             return str(value[0])
         else:
             if "australian company number" in field_name.lower() or "company number" in field_name.lower():
-                return value
             else:
                 return " ".join(str(v) for v in value)
     else:
         return str(value)
 def find_matching_json_value(field_name, flat_json):
-    """Your original matching function (unchanged)"""
     field_name = field_name.strip()
-    # Try exact match first
     if field_name in flat_json:
         print(f"    ✅ Direct match found for key '{field_name}'")
         return flat_json[field_name]
-    # Try case-insensitive exact match
     for key, value in flat_json.items():
         if key.lower() == field_name.lower():
             print(f"    ✅ Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
             return value
-    # Try suffix matching (for nested keys like "section.field")
-    for key, value in flat_json.items():
-        if '.' in key and key.split('.')[-1].lower() == field_name.lower():
-            print(f"    ✅ Suffix match found for key '{field_name}' with JSON key '{key}'")
-            return value
-    # Try partial matching - remove parentheses and special chars
-    clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
-    clean_field = re.sub(r'\s+', ' ', clean_field)
-    for key, value in flat_json.items():
-        clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
-        clean_key = re.sub(r'\s+', ' ', clean_key)
-        if clean_field == clean_key:
-            print(f"    ✅ Clean match found for key '{field_name}' with JSON key '{key}'")
-            return value
-    # Enhanced fuzzy matching with better scoring
     field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
     if not field_words:
         return None
@@ -107,13 +127,9 @@ def find_matching_json_value(field_name, flat_json):
         if not key_words:
             continue
-        # Calculate similarity score
         common_words = field_words.intersection(key_words)
         if common_words:
-            # Use Jaccard similarity: intersection / union
             similarity = len(common_words) / len(field_words.union(key_words))
-            # Bonus for high word coverage in field_name
             coverage = len(common_words) / len(field_words)
             final_score = (similarity * 0.6) + (coverage * 0.4)
@@ -130,6 +146,7 @@ def find_matching_json_value(field_name, flat_json):
     return None
 def get_clean_text(cell):
     text = ""
     for paragraph in cell.paragraphs:
         for run in paragraph.runs:
@@ -137,522 +154,130 @@ def get_clean_text(cell):
     return text.strip()
 def has_red_text(cell):
     """Return True when any run in *cell* is red and has non-blank text.

     Every run of every paragraph in the cell is checked with ``is_red``.
     Runs whose text is empty or whitespace-only are ignored. Returns False
     when no run qualifies, including when the cell has no paragraphs or runs.
     """
     return any(
         is_red(candidate) and candidate.text.strip()
         for para in cell.paragraphs
         for candidate in para.runs
     )
-def extract_red_text_segments(cell):
-    """Your original red text extraction (unchanged)"""
-    red_segments = []
-    for para_idx, paragraph in enumerate(cell.paragraphs):
-        current_segment = ""
-        segment_runs = []
-        for run_idx, run in enumerate(paragraph.runs):
-            if is_red(run):
-                if run.text:
-                    current_segment += run.text
-                segment_runs.append((para_idx, run_idx, run))
-            else:
-                # End of current red segment
-                if segment_runs:
-                    red_segments.append({
-                        'text': current_segment,
-                        'runs': segment_runs.copy(),
-                        'paragraph_idx': para_idx
-                    })
-                    current_segment = ""
-                    segment_runs = []
-        # Handle segment at end of paragraph
-        if segment_runs:
-            red_segments.append({
-                'text': current_segment,
-                'runs': segment_runs.copy(),
-                'paragraph_idx': para_idx
-            })
-    return red_segments
 def replace_red_text_in_cell(cell, replacement_text):
-    """Your original replacement function (unchanged)"""
-    red_segments = extract_red_text_segments(cell)
-    if not red_segments:
-        return 0
-    if len(red_segments) > 1:
-        replacements_made = 0
-        for segment in red_segments:
-            segment_text = segment['text'].strip()
-            if segment_text:
-                pass
-        if replacements_made == 0:
-            return replace_all_red_segments(red_segments, replacement_text)
-    return replace_all_red_segments(red_segments, replacement_text)
-def replace_all_red_segments(red_segments, replacement_text):
-    """Your original function (unchanged)"""
-    if not red_segments:
-        return 0
-    if '\n' in replacement_text:
-        replacement_lines = replacement_text.split('\n')
-    else:
-        replacement_lines = [replacement_text]
     replacements_made = 0
-    if red_segments and replacement_lines:
-        first_segment = red_segments[0]
-        if first_segment['runs']:
-            first_run = first_segment['runs'][0][2]
-            first_run.text = replacement_lines[0]
-            first_run.font.color.rgb = RGBColor(0, 0, 0)
-            replacements_made = 1
-            for _, _, run in first_segment['runs'][1:]:
-                run.text = ''
-    for segment in red_segments[1:]:
-        for _, _, run in segment['runs']:
-            run.text = ''
-    if len(replacement_lines) > 1 and red_segments:
-        try:
-            first_run = red_segments[0]['runs'][0][2]
-            paragraph = first_run.element.getparent()
-            for line in replacement_lines[1:]:
-                if line.strip():
-                    from docx.oxml import OxmlElement, ns
-                    br = OxmlElement('w:br')
-                    first_run.element.append(br)
-                    new_run = paragraph.add_run(line.strip())
-                    new_run.font.color.rgb = RGBColor(0, 0, 0)
-        except:
-            if red_segments and red_segments[0]['runs']:
-                first_run = red_segments[0]['runs'][0][2]
-                first_run.text = ' '.join(replacement_lines)
-                first_run.font.color.rgb = RGBColor(0, 0, 0)
     return replacements_made
-def replace_single_segment(segment, replacement_text):
-    """Your original function (unchanged)"""
-    if not segment['runs']:
-        return False
-    first_run = segment['runs'][0][2]
-    first_run.text = replacement_text
-    first_run.font.color.rgb = RGBColor(0, 0, 0)
-    for _, _, run in segment['runs'][1:]:
-        run.text = ''
-    return True
-def handle_multiple_red_segments_in_cell(cell, flat_json):
-    """Your original function (unchanged)"""
-    red_segments = extract_red_text_segments(cell)
-    if not red_segments:
-        return 0
-    print(f"      🔍 Found {len(red_segments)} red text segments in cell")
     replacements_made = 0
-    unmatched_segments = []
-    for i, segment in enumerate(red_segments):
-        segment_text = segment['text'].strip()
-        if not segment_text:
-            continue
-        print(f"        Segment {i+1}: '{segment_text[:50]}...'")
-        json_value = find_matching_json_value(segment_text, flat_json)
-        if json_value is not None:
-            replacement_text = get_value_as_string(json_value, segment_text)
-            if isinstance(json_value, list) and len(json_value) > 1:
-                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
-            success = replace_single_segment(segment, replacement_text)
-            if success:
-                replacements_made += 1
-                print(f"        ✅ Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")
-        else:
-            unmatched_segments.append(segment)
-            print(f"        ⏳ No individual match for segment '{segment_text[:30]}...'")
-    if unmatched_segments and replacements_made == 0:
-        combined_text = " ".join(seg['text'] for seg in red_segments).strip()
-        print(f"      🔄 Trying combined text match: '{combined_text[:50]}...'")
-        json_value = find_matching_json_value(combined_text, flat_json)
-        if json_value is not None:
-            replacement_text = get_value_as_string(json_value, combined_text)
-            if isinstance(json_value, list) and len(json_value) > 1:
-                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
-            replacements_made = replace_all_red_segments(red_segments, replacement_text)
-            print(f"      ✅ Replaced combined text with '{replacement_text[:50]}...'")
     return replacements_made
-# 🎯 SURGICAL FIX 1: Handle Nature of Business multi-line red text
-def handle_nature_business_multiline_fix(cell, flat_json):
-    """SURGICAL FIX: Handle multi-line red text in Nature of Business cells"""
     if not has_red_text(cell):
         return 0
-    # Check if this cell contains "Nature of the Operators Business"
     cell_text = get_clean_text(cell).lower()
     if "nature of the operators business" not in cell_text and "nature of the operator business" not in cell_text:
         return 0
-    print(f"    🎯 SURGICAL FIX: Nature of Business multi-line processing")
-    # Look for sub-fields like "Accreditation Number:" and "Expiry Date:"
-    red_segments = extract_red_text_segments(cell)
-    replacements_made = 0
-    # Try to replace each segment individually first
-    for segment in red_segments:
-        segment_text = segment['text'].strip()
-        if not segment_text:
-            continue
-        json_value = find_matching_json_value(segment_text, flat_json)
-        if json_value is not None:
-            replacement_text = get_value_as_string(json_value, segment_text)
-            success = replace_single_segment(segment, replacement_text)
-            if success:
-                replacements_made += 1
-                print(f"        ✅ Fixed segment: '{segment_text[:30]}...'")
-    # If no individual matches, try combined approach
-    if replacements_made == 0 and red_segments:
-        combined_text = " ".join(seg['text'] for seg in red_segments).strip()
-        json_value = find_matching_json_value(combined_text, flat_json)
-        if json_value is not None:
-            replacement_text = get_value_as_string(json_value, combined_text)
-            replacements_made = replace_all_red_segments(red_segments, replacement_text)
-            print(f"        ✅ Fixed combined text")
-    return replacements_made
-# 🎯 SURGICAL FIX 2: Handle Operator Declaration table
-def handle_operator_declaration_fix(table, flat_json):
-    """SURGICAL FIX: Handle Operator Declaration Print Name and Position Title"""
     replacements_made = 0
-    # Very specific detection: must have EXACTLY these headers
     for row_idx, row in enumerate(table.rows):
         if len(row.cells) >= 2:
             cell1_text = get_clean_text(row.cells[0]).strip()
             cell2_text = get_clean_text(row.cells[1]).strip()
-            # VERY specific match for operator declaration table
-            if ("print name" in cell1_text.lower() and "position title" in cell2_text.lower() and
-                len(table.rows) <= 4):  # Small table only
-                print(f"    🎯 SURGICAL FIX: Operator Declaration table detected")
-                # Look for the data row (should be next row)
                 if row_idx + 1 < len(table.rows):
                     data_row = table.rows[row_idx + 1]
                     if len(data_row.cells) >= 2:
                         name_cell = data_row.cells[0]
                         position_cell = data_row.cells[1]
-                        # Fix Print Name (first column)
                         if has_red_text(name_cell):
-                            red_text = ""
-                            for paragraph in name_cell.paragraphs:
-                                for run in paragraph.runs:
-                                    if is_red(run):
-                                        red_text += run.text
-                            if red_text.strip():
-                                json_value = find_matching_json_value(red_text.strip(), flat_json)
-                                if json_value is not None:
-                                    replacement_text = get_value_as_string(json_value)
-                                    cell_replacements = replace_red_text_in_cell(name_cell, replacement_text)
-                                    replacements_made += cell_replacements
-                                    print(f"        ✅ Fixed Print Name: '{replacement_text}'")
-                        # Fix Position Title (second column)
                         if has_red_text(position_cell):
-                            red_text = ""
-                            for paragraph in position_cell.paragraphs:
-                                for run in paragraph.runs:
-                                    if is_red(run):
-                                        red_text += run.text
-                            if red_text.strip():
-                                json_value = find_matching_json_value(red_text.strip(), flat_json)
-                                if json_value is not None:
-                                    replacement_text = get_value_as_string(json_value)
-                                    cell_replacements = replace_red_text_in_cell(position_cell, replacement_text)
-                                    replacements_made += cell_replacements
-                                    print(f"        ✅ Fixed Position Title: '{replacement_text}'")
-                break  # Found the table, stop looking
-    return replacements_made
-def handle_australian_company_number(row, company_numbers):
-    """Your original function (unchanged)"""
-    replacements_made = 0
-    for i, digit in enumerate(company_numbers):
-        cell_idx = i + 1
-        if cell_idx < len(row.cells):
-            cell = row.cells[cell_idx]
-            if has_red_text(cell):
-                cell_replacements = replace_red_text_in_cell(cell, str(digit))
-                replacements_made += cell_replacements
-                print(f"      -> Placed digit '{digit}' in cell {cell_idx + 1}")
-    return replacements_made
-def handle_vehicle_registration_table(table, flat_json):
-    """Your original function (unchanged)"""
-    replacements_made = 0
-    # Try to find vehicle registration data
-    vehicle_section = None
-    for key, value in flat_json.items():
-        if "vehicle registration numbers of records examined" in key.lower():
-            if isinstance(value, dict):
-                vehicle_section = value
-                print(f"    ✅ Found vehicle data in key: '{key}'")
-                break
-    if not vehicle_section:
-        potential_columns = {}
-        for key, value in flat_json.items():
-            if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
-                if "." in key:
-                    column_name = key.split(".")[-1]
-                else:
-                    column_name = key
-                potential_columns[column_name] = value
-        if potential_columns:
-            vehicle_section = potential_columns
-            print(f"    ✅ Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
-        else:
-            print(f"    ❌ Vehicle registration data not found in JSON")
-            return 0
-    print(f"    ✅ Found vehicle registration data with {len(vehicle_section)} columns")
-    # Find header row
-    header_row_idx = -1
-    header_row = None
-    for row_idx, row in enumerate(table.rows):
-        row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
-        if "registration" in row_text and "number" in row_text:
-            header_row_idx = row_idx
-            header_row = row
-            break
-    if header_row_idx == -1:
-        print(f"    ❌ Could not find header row in vehicle table")
-        return 0
-    print(f"    ✅ Found header row at index {header_row_idx}")
-    # Enhanced column mapping
-    column_mapping = {}
-    for col_idx, cell in enumerate(header_row.cells):
-        header_text = get_clean_text(cell).strip()
-        if not header_text or header_text.lower() == "no.":
-            continue
-        best_match = None
-        best_score = 0
-        normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
-        for json_key in vehicle_section.keys():
-            normalized_json = json_key.lower().strip()
-            if normalized_header == normalized_json:
-                best_match = json_key
-                best_score = 1.0
-                break
-            header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
-            json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
-            if header_words and json_words:
-                common_words = header_words.intersection(json_words)
-                score = len(common_words) / max(len(header_words), len(json_words))
-                if score > best_score and score >= 0.3:
-                    best_score = score
-                    best_match = json_key
-            header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
-            json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
-            if header_clean in json_clean or json_clean in header_clean:
-                if len(header_clean) > 5 and len(json_clean) > 5:
-                    substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
-                    if substring_score > best_score and substring_score >= 0.6:
-                        best_score = substring_score
-                        best_match = json_key
-        if best_match:
-            column_mapping[col_idx] = best_match
-            print(f"      📌 Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
-    if not column_mapping:
-        print(f"    ❌ No column mappings found")
-        return 0
-    # Determine data rows needed
-    max_data_rows = 0
-    for json_key, data in vehicle_section.items():
-        if isinstance(data, list):
-            max_data_rows = max(max_data_rows, len(data))
-    print(f"    📌 Need to populate {max_data_rows} data rows")
-    # Process data rows
-    for data_row_index in range(max_data_rows):
-        table_row_idx = header_row_idx + 1 + data_row_index
-        if table_row_idx >= len(table.rows):
-            print(f"    ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
-            print(f"    ➕ Adding new row for vehicle {data_row_index + 1}")
-            new_row = table.add_row()
-            print(f"    ✅ Successfully added row {len(table.rows)} to the table")
-        row = table.rows[table_row_idx]
-        print(f"    📌 Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
-        for col_idx, json_key in column_mapping.items():
-            if col_idx < len(row.cells):
-                cell = row.cells[col_idx]
-                column_data = vehicle_section.get(json_key, [])
-                if isinstance(column_data, list) and data_row_index < len(column_data):
-                    replacement_value = str(column_data[data_row_index])
-                    cell_text = get_clean_text(cell)
-                    if has_red_text(cell) or not cell_text.strip():
-                        if not cell_text.strip():
-                            cell.text = replacement_value
-                            replacements_made += 1
-                            print(f"      -> Added '{replacement_value}' to empty cell (column '{json_key}')")
-                        else:
-                            cell_replacements = replace_red_text_in_cell(cell, replacement_value)
-                            replacements_made += cell_replacements
-                            if cell_replacements > 0:
-                                print(f"      -> Replaced red text with '{replacement_value}' (column '{json_key}')")
-    return replacements_made
-def handle_print_accreditation_section(table, flat_json):
-    """Your original function (unchanged)"""
-    replacements_made = 0
-    print_data = flat_json.get("print accreditation name.print accreditation name", [])
-    if not isinstance(print_data, list) or len(print_data) < 2:
-        return 0
-    name_value = print_data[0]
-    position_value = print_data[1]
-    print(f"    📋 Print accreditation data: Name='{name_value}', Position='{position_value}'")
-    for row_idx, row in enumerate(table.rows):
-        if len(row.cells) >= 2:
-            cell1_text = get_clean_text(row.cells[0]).lower()
-            cell2_text = get_clean_text(row.cells[1]).lower()
-            if "print name" in cell1_text and "position title" in cell2_text:
-                print(f"    📍 Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")
-                if row_idx + 1 < len(table.rows):
-                    data_row = table.rows[row_idx + 1]
-                    if len(data_row.cells) >= 2:
-                        if has_red_text(data_row.cells[0]):
-                            cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
-                            replacements_made += cell_replacements
-                            if cell_replacements > 0:
-                                print(f"      ✅ Replaced Print Name: '{name_value}'")
-                        if has_red_text(data_row.cells[1]):
-                            cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
-                            replacements_made += cell_replacements
-                            if cell_replacements > 0:
-                                print(f"      ✅ Replaced Position Title: '{position_value}'")
                 break
     return replacements_made
-def process_single_column_sections(cell, field_name, flat_json):
-    """Your original function (unchanged)"""
-    json_value = find_matching_json_value(field_name, flat_json)
-    if json_value is not None:
-        replacement_text = get_value_as_string(json_value, field_name)
-        if isinstance(json_value, list) and len(json_value) > 1:
-            replacement_text = "\n".join(str(item) for item in json_value)
-        if has_red_text(cell):
-            print(f"    ✅ Replacing red text in single-column section: '{field_name}'")
-            print(f"    ✅ Replacement text:\n{replacement_text}")
-            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
-            if cell_replacements > 0:
-                print(f"    -> Replaced with: '{replacement_text[:100]}...'")
-                return cell_replacements
-    return 0
 def process_tables(document, flat_json):
-    """Your original function with minimal surgical fixes added"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
-        # Your original logic
-        table_text = ""
-        for row in table.rows[:3]:
-            for cell in row.cells:
-                table_text += get_clean_text(cell).lower() + " "
-        # Enhanced vehicle registration detection
-        vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
-        indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
-        if indicator_count >= 2:
-            print(f"    🚗 Detected Vehicle Registration table")
-            vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
-            replacements_made += vehicle_replacements
-            continue
-        # Enhanced print accreditation detection
-        print_accreditation_indicators = ["print name", "position title"]
-        indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
-        if indicator_count >= 1:
-            print(f"    📋 Detected Print Accreditation table")
-            print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
-            replacements_made += print_accreditation_replacements
-            continue
-        # Your existing row processing
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
@@ -665,261 +290,90 @@ def process_tables(document, flat_json):
             print(f"  📌 Row {row_idx + 1}: Key = '{key_text}'")
             json_value = find_matching_json_value(key_text, flat_json)
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value, key_text)
-                # Enhanced ACN handling
                 if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                     cell_replacements = handle_australian_company_number(row, json_value)
                     replacements_made += cell_replacements
-                # Enhanced section header handling
-                elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
-                    print(f"    ✅ Section header detected, checking next row for content...")
-                    next_row = table.rows[row_idx + 1]
-                    for cell_idx, cell in enumerate(next_row.cells):
                         if has_red_text(cell):
-                            print(f"    ✅ Found red text in next row, cell {cell_idx + 1}")
-                            if isinstance(json_value, list):
-                                replacement_text = "\n".join(str(item) for item in json_value)
                             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
-                                print(f"    -> Replaced section content with: '{replacement_text[:100]}...'")
-                elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
-                    if has_red_text(key_cell):
-                        cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
-                        replacements_made += cell_replacements
-                else:
-                    for cell_idx in range(1, len(row.cells)):
-                        value_cell = row.cells[cell_idx]
-                        if has_red_text(value_cell):
-                            print(f"    ✅ Found red text in column {cell_idx + 1}")
-                            cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
-                            replacements_made += cell_replacements
             else:
-                # Enhanced fallback processing for unmatched keys
-                if len(row.cells) == 1 and has_red_text(key_cell):
-                    red_text = ""
-                    for paragraph in key_cell.paragraphs:
-                        for run in paragraph.runs:
-                            if is_red(run):
-                                red_text += run.text
-                    if red_text.strip():
-                        section_value = find_matching_json_value(red_text.strip(), flat_json)
-                        if section_value is not None:
-                            section_replacement = get_value_as_string(section_value, red_text.strip())
-                            cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
-                            replacements_made += cell_replacements
-                # Enhanced red text processing for all cells
                 for cell_idx in range(len(row.cells)):
                     cell = row.cells[cell_idx]
                     if has_red_text(cell):
-                        cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
-                        replacements_made += cell_replacements
-                        # 🎯 SURGICAL FIX 1: Only if no replacements were made
-                        if cell_replacements == 0:
-                            surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
-                            replacements_made += surgical_fix
-    # 🎯 SURGICAL FIX 2: Handle Operator Declaration tables (only check last few tables)
-    print(f"\n🎯 SURGICAL FIX: Checking for Operator Declaration tables...")
-    for table in document.tables[-3:]:  # Only check last 3 tables
-        if len(table.rows) <= 4:  # Only small tables
-            declaration_fix = handle_operator_declaration_fix(table, flat_json)
-            replacements_made += declaration_fix
     return replacements_made
 def process_paragraphs(document, flat_json):
-    """Your original function (unchanged)"""
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
     for para_idx, paragraph in enumerate(document.paragraphs):
-        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
-        if red_runs:
-            full_text = paragraph.text.strip()
-            red_text_only = "".join(run.text for run in red_runs).strip()
-            print(f"  📌 Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
-            # Your existing matching logic
-            json_value = find_matching_json_value(red_text_only, flat_json)
-            if json_value is None:
-                # Enhanced pattern matching for signatures and dates
-                if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
-                    json_value = find_matching_json_value("auditor signature", flat_json)
-                elif "OPERATOR SIGNATURE" in red_text_only.upper():
-                    json_value = find_matching_json_value("operator signature", flat_json)
-            if json_value is not None:
-                replacement_text = get_value_as_string(json_value)
-                print(f"    ✅ Replacing red text with: '{replacement_text}'")
-                red_runs[0].text = replacement_text
-                red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
-                for run in red_runs[1:]:
-                    run.text = ''
-                replacements_made += 1
-    return replacements_made
-def process_headings(document, flat_json):
-    """Your original function (unchanged)"""
-    replacements_made = 0
-    print(f"\n🔍 Processing headings:")
-    paragraphs = document.paragraphs
-    for para_idx, paragraph in enumerate(paragraphs):
-        paragraph_text = paragraph.text.strip()
-        if not paragraph_text:
-            continue
-        # Enhanced heading detection
-        matched_heading = None
-        for category, patterns in HEADING_PATTERNS.items():
-            for pattern in patterns:
-                if re.search(pattern, paragraph_text, re.IGNORECASE):
-                    matched_heading = pattern
-                    break
-            if matched_heading:
-                break
-        if matched_heading:
-            print(f"  📌 Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
-            # Check current heading paragraph
-            if has_red_text_in_paragraph(paragraph):
-                print(f"    🔴 Found red text in heading itself")
-                heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
-                replacements_made += heading_replacements
-            # Enhanced: Look further ahead for related content
-            for next_para_offset in range(1, 6):  # Extended range
-                next_para_idx = para_idx + next_para_offset
-                if next_para_idx >= len(paragraphs):
-                    break
-                next_paragraph = paragraphs[next_para_idx]
-                next_text = next_paragraph.text.strip()
-                if not next_text:
-                    continue
-                # Stop if we hit another heading
-                is_another_heading = False
-                for category, patterns in HEADING_PATTERNS.items():
-                    for pattern in patterns:
-                        if re.search(pattern, next_text, re.IGNORECASE):
-                            is_another_heading = True
-                            break
-                    if is_another_heading:
-                        break
-                if is_another_heading:
-                    break
-                # Process red text with enhanced context
-                if has_red_text_in_paragraph(next_paragraph):
-                    print(f"    🔴 Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")
-                    context_replacements = process_red_text_in_paragraph(
-                        next_paragraph,
-                        paragraph_text,
-                        flat_json
-                    )
-                    replacements_made += context_replacements
-    return replacements_made
-def has_red_text_in_paragraph(paragraph):
-    """Your original function (unchanged)"""
-    for run in paragraph.runs:
-        if is_red(run) and run.text.strip():
-            return True
-    return False
-def process_red_text_in_paragraph(paragraph, context_text, flat_json):
-    """Your original function (unchanged)"""
-    replacements_made = 0
-    red_text_segments = []
-    for run in paragraph.runs:
-        if is_red(run) and run.text.strip():
-            red_text_segments.append(run.text.strip())
-    if not red_text_segments:
-        return 0
-    combined_red_text = " ".join(red_text_segments).strip()
-    print(f"      🔍 Red text found: '{combined_red_text}'")
-    json_value = None
-    # Strategy 1: Direct matching
-    json_value = find_matching_json_value(combined_red_text, flat_json)
-    # Strategy 2: Enhanced context-based matching
-    if json_value is None:
-        if "NHVAS APPROVED AUDITOR" in context_text.upper():
-            auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
-            for field in auditor_fields:
-                json_value = find_matching_json_value(field, flat_json)
-                if json_value is not None:
-                    print(f"      ✅ Found auditor match with field: '{field}'")
-                    break
-        elif "OPERATOR DECLARATION" in context_text.upper():
-            operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
-            for field in operator_fields:
-                json_value = find_matching_json_value(field, flat_json)
-                if json_value is not None:
-                    print(f"      ✅ Found operator match with field: '{field}'")
-                    break
-    # Strategy 3: Enhanced context combination
-    if json_value is None:
-        context_queries = [
-            f"{context_text} {combined_red_text}",
-            combined_red_text,
-            context_text
-        ]
-        for query in context_queries:
-            json_value = find_matching_json_value(query, flat_json)
             if json_value is not None:
-                print(f"      ✅ Found match with combined query: '{query[:50]}...'")
-                break
-    # Replace if match found
-    if json_value is not None:
-        replacement_text = get_value_as_string(json_value, combined_red_text)
-        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
-        if red_runs:
-            red_runs[0].text = replacement_text
-            red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
-            for run in red_runs[1:]:
-                run.text = ''
-            replacements_made = 1
-            print(f"      ✅ Replaced with: '{replacement_text}'")
-    else:
-        print(f"      ❌ No match found for red text: '{combined_red_text}'")
     return replacements_made
 def process_hf(json_file, docx_file, output_file):
-    """Your original main function (unchanged)"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
@@ -928,7 +382,8 @@ def process_hf(json_file, docx_file, output_file):
             with open(json_file, 'r', encoding='utf-8') as f:
                 json_data = json.load(f)
-        flat_json = flatten_json(json_data)
         print("📄 Available JSON keys (sample):")
         for i, (key, value) in enumerate(sorted(flat_json.items())):
             if i < 10:
@@ -941,14 +396,13 @@ def process_hf(json_file, docx_file, output_file):
         else:
             doc = Document(docx_file)
-        # Your original processing
-        print("🚀 Starting processing with surgical fixes...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
-        heading_replacements = process_headings(doc, flat_json)
-        total_replacements = table_replacements + paragraph_replacements + heading_replacements
         # Save output
         if hasattr(output_file, "write"):
@@ -960,7 +414,6 @@ def process_hf(json_file, docx_file, output_file):
         print(f"✅ Total replacements: {total_replacements}")
         print(f"   📊 Tables: {table_replacements}")
         print(f"   📝 Paragraphs: {paragraph_replacements}")
-        print(f"   📋 Headings: {heading_replacements}")
         print(f"🎉 Processing complete!")
     except FileNotFoundError as e:
@@ -973,7 +426,7 @@ def process_hf(json_file, docx_file, output_file):
 if __name__ == "__main__":
     import sys
     if len(sys.argv) != 4:
-        print("Usage: python pipeline.py <input_docx> <updated_json> <output_docx>")
         exit(1)
     docx_path = sys.argv[1]
     json_path = sys.argv[2]

 from docx.shared import RGBColor
 import re
 def load_json(filepath):
     with open(filepath, 'r') as file:
         return json.load(file)
def flatten_json_new_system(json_data):
    """Flatten ``{schema: {field: values}}`` JSON into a single lookup dict.

    For every field of every schema the value is stored under several keys so
    that later matching can find it: the field name as written, its lowercase
    form, its lowercase-and-stripped form, ``"<schema>.<field>"`` and the
    lowercase form of that dotted key.  Field names containing one of the
    trigger phrases below are additionally stored under short alias keys.
    When two schemas produce the same key, the later one overwrites the
    earlier one.

    A list holding exactly one element is unwrapped to that element; every
    other value (including empty and multi-element lists) is stored as is.

    Top-level entries whose value is not a dict are kept under their own key
    and its lowercase form, but only where no field-derived key already uses
    that name, so they can never change a value produced from a schema.

    Args:
        json_data: Mapping of schema name to either a mapping of field name
            to value(s), or a plain value.

    Returns:
        dict mapping lookup keys to values.
    """
    # (phrase looked for in the lowercase field name, alias keys to write)
    alias_groups = (
        ("print name", ("print name", "operator name", "name")),
        ("position title", ("position title", "position", "title")),
        ("accreditation number", ("accreditation number", "nhvas accreditation no")),
        ("expiry date", ("expiry date", "expiry")),
    )

    def unwrap(values):
        # A one-element list stands for a scalar; anything else is kept whole.
        if isinstance(values, list) and len(values) == 1:
            return values[0]
        return values

    flat_json = {}
    top_level_entries = []
    for schema_name, schema_data in json_data.items():
        if not isinstance(schema_data, dict):
            # Deferred so that field-derived keys always take precedence.
            top_level_entries.append((schema_name, unwrap(schema_data)))
            continue

        schema_lower = schema_name.lower()
        for field_name, values in schema_data.items():
            value = unwrap(values)
            field_lower = field_name.lower()

            # Key variations of the field name itself.
            flat_json[field_name] = value
            flat_json[field_lower] = value
            flat_json[field_lower.strip()] = value

            # Schema-qualified keys.
            flat_json[f"{schema_name}.{field_name}"] = value
            flat_json[f"{schema_lower}.{field_lower}"] = value

            # Short aliases for common fields.
            for trigger, aliases in alias_groups:
                if trigger in field_lower:
                    for alias in aliases:
                        flat_json[alias] = value

    for name, value in top_level_entries:
        flat_json.setdefault(name, value)
        flat_json.setdefault(name.lower(), value)

    return flat_json
 def is_red(run):
+    """Detect red colored text"""
     color = run.font.color
     return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
 def get_value_as_string(value, field_name=""):
+    """Convert value to string, handling lists appropriately"""
     if isinstance(value, list):
         if len(value) == 0:
             return ""
             return str(value[0])
         else:
             if "australian company number" in field_name.lower() or "company number" in field_name.lower():
+                return value  # Return as list for ACN processing
             else:
                 return " ".join(str(v) for v in value)
     else:
         return str(value)
 def find_matching_json_value(field_name, flat_json):
+    """Enhanced matching for your new JSON structure"""
     field_name = field_name.strip()
+    # Direct match (exact)
     if field_name in flat_json:
         print(f"    ✅ Direct match found for key '{field_name}'")
         return flat_json[field_name]
+    # Case-insensitive exact match
     for key, value in flat_json.items():
         if key.lower() == field_name.lower():
             print(f"    ✅ Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
             return value
+    # Partial matching for common field names
+    field_lower = field_name.lower().strip()
+    # Handle common variations
+    if "print name" in field_lower:
+        for key in ["Print Name", "print name", "operator name", "name"]:
+            if key in flat_json:
+                print(f"    ✅ Print name match: '{field_name}' -> '{key}'")
+                return flat_json[key]
+    if "position title" in field_lower:
+        for key in ["Position Title", "position title", "position", "title"]:
+            if key in flat_json:
+                print(f"    ✅ Position title match: '{field_name}' -> '{key}'")
+                return flat_json[key]
+    if "accreditation number" in field_lower:
+        for key in flat_json.keys():
+            if "accreditation" in key.lower() and "number" in key.lower():
+                print(f"    ✅ Accreditation number match: '{field_name}' -> '{key}'")
+                return flat_json[key]
+    if "expiry date" in field_lower:
+        for key in flat_json.keys():
+            if "expiry" in key.lower():
+                print(f"    ✅ Expiry date match: '{field_name}' -> '{key}'")
+                return flat_json[key]
+    # Fuzzy matching
     field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
     if not field_words:
         return None
         if not key_words:
             continue
         common_words = field_words.intersection(key_words)
         if common_words:
             similarity = len(common_words) / len(field_words.union(key_words))
             coverage = len(common_words) / len(field_words)
             final_score = (similarity * 0.6) + (coverage * 0.4)
     return None
 def get_clean_text(cell):
+    """Extract clean text from cell"""
     text = ""
     for paragraph in cell.paragraphs:
         for run in paragraph.runs:
     return text.strip()
 def has_red_text(cell):
+    """Check if cell has red text"""
     for paragraph in cell.paragraphs:
         for run in paragraph.runs:
             if is_red(run) and run.text.strip():
                 return True
     return False
 def replace_red_text_in_cell(cell, replacement_text):
+    """Replace red text in cell with new text"""
     replacements_made = 0
+    for paragraph in cell.paragraphs:
+        for run in paragraph.runs:
+            if is_red(run) and run.text.strip():
+                run.text = replacement_text
+                run.font.color.rgb = RGBColor(0, 0, 0)  # Change to black
+                replacements_made += 1
+                break  # Only replace first red text found
     return replacements_made
def handle_australian_company_number(row, company_numbers):
    """Place each item of ``company_numbers`` into the row cell at the same position.

    Item ``i`` (0-based) targets ``row.cells[i + 1]``, i.e. the cells after
    the first one.  A cell is written only if it exists and contains red
    text.  Returns the total number of replacements reported by
    ``replace_red_text_in_cell``.
    """
    placed = 0
    for cell_idx, digit in enumerate(company_numbers, start=1):
        if cell_idx >= len(row.cells):
            continue
        target_cell = row.cells[cell_idx]
        if not has_red_text(target_cell):
            continue
        placed += replace_red_text_in_cell(target_cell, str(digit))
        print(f"      -> Placed digit '{digit}' in cell {cell_idx + 1}")
    return placed
def handle_nature_business_section(cell, flat_json):
    """Fill the red text of a "Nature of the Operators Business" cell.

    The cell is processed only if it contains red text and its text contains
    one of the two accepted heading spellings.  The first ``flat_json`` key
    containing either spelling whose value yields a successful replacement is
    used.  The same spellings are accepted for the JSON key as for the cell
    text, so a key written as "operator business" is not missed.

    Args:
        cell: Table cell to update.
        flat_json: Flattened lookup dict of field name to value.

    Returns:
        Number of replacements made (0 if the cell does not qualify or no
        matching key produced a replacement).
    """
    heading_phrases = (
        "nature of the operators business",
        "nature of the operator business",
    )

    if not has_red_text(cell):
        return 0

    cell_text = get_clean_text(cell).lower()
    if not any(phrase in cell_text for phrase in heading_phrases):
        return 0

    print(f"    🎯 Found Nature of Business section")

    # Check for business description
    for key, business_value in flat_json.items():
        key_lower = key.lower()
        if not any(phrase in key_lower for phrase in heading_phrases):
            continue
        replacement_text = get_value_as_string(business_value)
        cell_replacements = replace_red_text_in_cell(cell, replacement_text)
        if cell_replacements > 0:
            print(f"      ✅ Updated business description")
            return cell_replacements

    return 0
def _fill_declaration_cell(cell, flat_json, candidate_keys, label):
    """Fill one red declaration cell from the first candidate key present in ``flat_json``.

    Returns the number of replacements made; 0 when the cell has no red text,
    no candidate key is present, or the value found is falsy.
    """
    if not has_red_text(cell):
        return 0
    value = next((flat_json[key] for key in candidate_keys if key in flat_json), None)
    if not value:
        return 0
    text = get_value_as_string(value)
    cell_replacements = replace_red_text_in_cell(cell, text)
    print(f"        ✅ Updated {label}: '{text}'")
    return cell_replacements


def handle_operator_declaration_table(table, flat_json):
    """Fill the Print Name / Position Title row of an Operator Declaration table.

    Scans for the first row whose first cell contains "print name" and whose
    second cell contains "position title" (case-insensitive).  The row after
    that header, if it has at least two cells, is treated as the data row:
    its first cell receives the print name and its second the position title,
    each only when the cell contains red text.  Scanning stops at the first
    header row found.

    The schema-qualified key ("Operator Declaration.<field>") is tried before
    the generic field name, because the generic key may hold a value that
    came from a different schema with a field of the same name.

    Args:
        table: Table whose rows are inspected.
        flat_json: Flattened lookup dict of field name to value.

    Returns:
        Total number of replacements made in the data row.
    """
    replacements_made = 0

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue

        cell1_text = get_clean_text(row.cells[0]).strip()
        cell2_text = get_clean_text(row.cells[1]).strip()

        # Only the Print Name / Position Title header row is of interest.
        if not ("print name" in cell1_text.lower() and "position title" in cell2_text.lower()):
            continue

        print(f"    🎯 Found Operator Declaration table")

        # The data row directly follows the header row.
        if row_idx + 1 < len(table.rows):
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                replacements_made += _fill_declaration_cell(
                    data_row.cells[0],
                    flat_json,
                    ("Operator Declaration.Print Name", "Print Name", "print name"),
                    "Print Name",
                )
                replacements_made += _fill_declaration_cell(
                    data_row.cells[1],
                    flat_json,
                    ("Operator Declaration.Position Title", "Position Title", "position title"),
                    "Position Title",
                )
        break

    return replacements_made
 def process_tables(document, flat_json):
+    """Process all tables in document"""
     replacements_made = 0
     for table_idx, table in enumerate(document.tables):
         print(f"\n🔍 Processing table {table_idx + 1}:")
+        # Check for Operator Declaration table first (priority fix)
+        if len(table.rows) <= 4:  # Small tables
+            declaration_replacements = handle_operator_declaration_table(table, flat_json)
+            if declaration_replacements > 0:
+                replacements_made += declaration_replacements
+                continue
+        # Process all rows
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) < 1:
                 continue
             print(f"  📌 Row {row_idx + 1}: Key = '{key_text}'")
+            # Handle Nature of Business section
+            if "nature of the operators business" in key_text.lower():
+                nature_replacements = handle_nature_business_section(key_cell, flat_json)
+                replacements_made += nature_replacements
+                continue
+            # Regular field matching
             json_value = find_matching_json_value(key_text, flat_json)
             if json_value is not None:
                 replacement_text = get_value_as_string(json_value, key_text)
+                # Handle Australian Company Number specially
                 if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                     cell_replacements = handle_australian_company_number(row, json_value)
                     replacements_made += cell_replacements
+                else:
+                    # Handle regular fields
+                    for cell_idx in range(len(row.cells)):
+                        cell = row.cells[cell_idx]
                         if has_red_text(cell):
                             cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                             replacements_made += cell_replacements
                             if cell_replacements > 0:
+                                print(f"    ✅ Updated cell {cell_idx + 1}: '{replacement_text}'")
             else:
+                # Process any red text in row cells
                 for cell_idx in range(len(row.cells)):
                     cell = row.cells[cell_idx]
                     if has_red_text(cell):
+                        # Try to extract red text and match it
+                        red_text = ""
+                        for paragraph in cell.paragraphs:
+                            for run in paragraph.runs:
+                                if is_red(run):
+                                    red_text += run.text
+                        if red_text.strip():
+                            json_value = find_matching_json_value(red_text.strip(), flat_json)
+                            if json_value is not None:
+                                replacement_text = get_value_as_string(json_value)
+                                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
+                                replacements_made += cell_replacements
+                                if cell_replacements > 0:
+                                    print(f"    ✅ Replaced red text: '{red_text.strip()}' -> '{replacement_text}'")
     return replacements_made
 def process_paragraphs(document, flat_json):
+    """Process paragraphs for red text"""
     replacements_made = 0
     print(f"\n🔍 Processing paragraphs:")
     for para_idx, paragraph in enumerate(document.paragraphs):
+        red_text = ""
+        red_runs = []
+        for run in paragraph.runs:
+            if is_red(run) and run.text.strip():
+                red_text += run.text
+                red_runs.append(run)
+        if red_text.strip():
+            print(f"  📌 Paragraph {para_idx + 1}: Found red text: '{red_text.strip()}'")
+            json_value = find_matching_json_value(red_text.strip(), flat_json)
             if json_value is not None:
+                replacement_text = get_value_as_string(json_value)
+                print(f"    ✅ Replacing with: '{replacement_text}'")
+                # Replace in first red run only
+                if red_runs:
+                    red_runs[0].text = replacement_text
+                    red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
+                    # Clear other red runs
+                    for run in red_runs[1:]:
+                        run.text = ''
+                    replacements_made += 1
     return replacements_made
 def process_hf(json_file, docx_file, output_file):
+    """Main processing function compatible with your new system"""
     try:
         # Load JSON
         if hasattr(json_file, "read"):
             with open(json_file, 'r', encoding='utf-8') as f:
                 json_data = json.load(f)
+        # Flatten your new JSON structure
+        flat_json = flatten_json_new_system(json_data)
         print("📄 Available JSON keys (sample):")
         for i, (key, value) in enumerate(sorted(flat_json.items())):
             if i < 10:
         else:
             doc = Document(docx_file)
+        # Process document
+        print("🚀 Starting processing compatible with your new system...")
         table_replacements = process_tables(doc, flat_json)
         paragraph_replacements = process_paragraphs(doc, flat_json)
+        total_replacements = table_replacements + paragraph_replacements
         # Save output
         if hasattr(output_file, "write"):
         print(f"✅ Total replacements: {total_replacements}")
         print(f"   📊 Tables: {table_replacements}")
         print(f"   📝 Paragraphs: {paragraph_replacements}")
         print(f"🎉 Processing complete!")
     except FileNotFoundError as e:
 if __name__ == "__main__":
     import sys
     if len(sys.argv) != 4:
+        print("Usage: python compatible_pipeline.py <input_docx> <updated_json> <output_docx>")
         exit(1)
     docx_path = sys.argv[1]
     json_path = sys.argv[2]