Shami96 committed on
Commit
7755a4a
Β·
verified Β·
1 Parent(s): 4edca00

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +400 -343
updated_word.py CHANGED
@@ -3,7 +3,7 @@ from docx import Document
3
  from docx.shared import RGBColor
4
  import re
5
 
6
- # Add these heading patterns at the top of your file with other constants
7
  HEADING_PATTERNS = {
8
  "main": [
9
  r"NHVAS\s+Audit\s+Summary\s+Report",
@@ -22,168 +22,11 @@ HEADING_PATTERNS = {
22
  r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
23
  r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
24
  r"Operator\s+Declaration",
25
- r"Operator\s+Information"
 
26
  ]
27
  }
28
 
29
def process_headings(document, flat_json):
    """Scan the document for known headings and replace red text found in
    the heading itself and in the few paragraphs that follow it.

    The heading text is passed down as matching context so that fields
    like auditor/operator names can be resolved from the JSON.
    Returns the number of replacements performed.
    """

    def _match_heading(text):
        # Return the first HEADING_PATTERNS regex that matches, else None.
        for patterns in HEADING_PATTERNS.values():
            for pattern in patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    return pattern
        return None

    replacements_made = 0
    print(f"\nπŸ” Processing headings:")

    paragraphs = document.paragraphs
    for para_idx, paragraph in enumerate(paragraphs):
        paragraph_text = paragraph.text.strip()
        if not paragraph_text:
            continue

        matched_heading = _match_heading(paragraph_text)
        if not matched_heading:
            continue

        print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")

        # Red text inside the heading paragraph itself.
        if has_red_text_in_paragraph(paragraph):
            print(f" πŸ”΄ Found red text in heading itself")
            replacements_made += process_red_text_in_paragraph(
                paragraph, paragraph_text, flat_json
            )

        # Look at up to three paragraphs following the heading.
        for next_para_idx in range(para_idx + 1, min(para_idx + 4, len(paragraphs))):
            next_paragraph = paragraphs[next_para_idx]
            next_text = next_paragraph.text.strip()

            if not next_text:
                continue

            # Another heading starts a new section; stop looking.
            if _match_heading(next_text):
                break

            if has_red_text_in_paragraph(next_paragraph):
                print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")
                # The heading text serves as matching context.
                replacements_made += process_red_text_in_paragraph(
                    next_paragraph,
                    paragraph_text,
                    flat_json
                )

    return replacements_made
100
-
101
def has_red_text_in_paragraph(paragraph):
    """Return True when the paragraph contains at least one non-empty red run."""
    return any(is_red(run) and run.text.strip() for run in paragraph.runs)
107
-
108
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
    """Replace the red text of a single paragraph with a value from the JSON.

    Matching is attempted in three stages: (1) the red text itself,
    (2) context-specific fallback fields when the heading mentions the
    auditor or operator declaration, (3) queries combining the context
    text with the red text.  Returns 1 when a replacement was made,
    otherwise 0.
    """
    red_text_segments = [
        run.text.strip()
        for run in paragraph.runs
        if is_red(run) and run.text.strip()
    ]
    if not red_text_segments:
        return 0

    combined_red_text = " ".join(red_text_segments).strip()
    print(f" πŸ” Red text found: '{combined_red_text}'")

    # Strategy 1: match the red text directly.
    json_value = find_matching_json_value(combined_red_text, flat_json)

    # Strategy 2: heading-specific fallback fields.
    if json_value is None:
        upper_context = context_text.upper()
        if "NHVAS APPROVED AUDITOR" in upper_context:
            for field in ("auditor name", "auditor", "nhvas auditor", "approved auditor"):
                json_value = find_matching_json_value(field, flat_json)
                if json_value is not None:
                    print(f" βœ… Found auditor match with field: '{field}'")
                    break
        elif "OPERATOR DECLARATION" in upper_context:
            for field in ("operator name", "operator", "company name", "organisation name"):
                json_value = find_matching_json_value(field, flat_json)
                if json_value is not None:
                    print(f" βœ… Found operator match with field: '{field}'")
                    break

    # Strategy 3: combine the context text with the red text.
    if json_value is None:
        for query in (f"{context_text} {combined_red_text}",
                      combined_red_text,
                      context_text):
            json_value = find_matching_json_value(query, flat_json)
            if json_value is not None:
                print(f" βœ… Found match with combined query: '{query[:50]}...'")
                break

    if json_value is None:
        print(f" ❌ No match found for red text: '{combined_red_text}'")
        return 0

    replacement_text = get_value_as_string(json_value, combined_red_text)

    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
    if not red_runs:
        return 0

    # First red run carries the replacement; the rest are emptied.
    red_runs[0].text = replacement_text
    red_runs[0].font.color.rgb = RGBColor(0, 0, 0)  # repaint as black
    for run in red_runs[1:]:
        run.text = ''

    print(f" βœ… Replaced with: '{replacement_text}'")
    return 1
186
-
187
def load_json(filepath):
    """Load and return the JSON document stored at *filepath*.

    The file is read as UTF-8 explicitly; without it, `open()` falls back
    to the platform locale encoding, which breaks non-ASCII payloads on
    Windows.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)
@@ -218,7 +61,7 @@ def get_value_as_string(value, field_name=""):
218
  return str(value)
219
 
220
  def find_matching_json_value(field_name, flat_json):
221
- """Completely dynamic matching without manual mappings"""
222
  field_name = field_name.strip()
223
 
224
  # Try exact match first
@@ -240,7 +83,7 @@ def find_matching_json_value(field_name, flat_json):
240
 
241
  # Try partial matching - remove parentheses and special chars
242
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
243
- clean_field = re.sub(r'\s+', ' ', clean_field) # Multiple spaces to single
244
 
245
  for key, value in flat_json.items():
246
  clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
@@ -250,7 +93,7 @@ def find_matching_json_value(field_name, flat_json):
250
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
251
  return value
252
 
253
- # Word-based fuzzy matching
254
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
255
  if not field_words:
256
  return None
@@ -279,7 +122,7 @@ def find_matching_json_value(field_name, flat_json):
279
  best_match = value
280
  best_key = key
281
 
282
- if best_match and best_score >= 0.3: # Lowered threshold for more matches
283
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
284
  return best_match
285
 
@@ -301,7 +144,7 @@ def has_red_text(cell):
301
  return False
302
 
303
  def extract_red_text_segments(cell):
304
- """Extract all red text segments from a cell with better multi-line handling"""
305
  red_segments = []
306
 
307
  for para_idx, paragraph in enumerate(cell.paragraphs):
@@ -310,12 +153,12 @@ def extract_red_text_segments(cell):
310
 
311
  for run_idx, run in enumerate(paragraph.runs):
312
  if is_red(run):
313
- if run.text: # Include even empty red runs for proper replacement
314
  current_segment += run.text
315
  segment_runs.append((para_idx, run_idx, run))
316
  else:
317
  # End of current red segment
318
- if segment_runs: # Changed from current_segment.strip() to segment_runs
319
  red_segments.append({
320
  'text': current_segment,
321
  'runs': segment_runs.copy(),
@@ -325,7 +168,7 @@ def extract_red_text_segments(cell):
325
  segment_runs = []
326
 
327
  # Handle segment at end of paragraph
328
- if segment_runs: # Changed from current_segment.strip() to segment_runs
329
  red_segments.append({
330
  'text': current_segment,
331
  'runs': segment_runs.copy(),
@@ -335,35 +178,29 @@ def extract_red_text_segments(cell):
335
  return red_segments
336
 
337
def replace_red_text_in_cell(cell, replacement_text):
    """Replace every red text segment in *cell* with *replacement_text*.

    Returns the number of replacements made (0 when the cell has no red
    text).  Per-segment JSON matching is handled separately by
    handle_multiple_red_segments_in_cell(); this function always applies
    the single given replacement across all red segments.

    Note: the previous multi-segment branch looped over the segments with
    `pass` and then fell through to the identical call below, so it was a
    no-op; that dead code has been removed.
    """
    red_segments = extract_red_text_segments(cell)

    if not red_segments:
        return 0

    return replace_all_red_segments(red_segments, replacement_text)
360
 
361
  def replace_all_red_segments(red_segments, replacement_text):
362
- """Replace all red segments with the replacement text"""
363
  if not red_segments:
364
  return 0
365
 
366
- # Handle multi-line replacement text
367
  if '\n' in replacement_text:
368
  replacement_lines = replacement_text.split('\n')
369
  else:
@@ -371,54 +208,91 @@ def replace_all_red_segments(red_segments, replacement_text):
371
 
372
  replacements_made = 0
373
 
374
- # Replace first segment with first line
375
  if red_segments and replacement_lines:
376
  first_segment = red_segments[0]
377
  if first_segment['runs']:
378
- first_run = first_segment['runs'][0][2] # (para_idx, run_idx, run)
379
  first_run.text = replacement_lines[0]
380
  first_run.font.color.rgb = RGBColor(0, 0, 0)
381
  replacements_made = 1
382
 
383
- # Clear other runs in first segment
384
  for _, _, run in first_segment['runs'][1:]:
385
  run.text = ''
386
 
387
- # Clear all other red segments
388
  for segment in red_segments[1:]:
389
  for _, _, run in segment['runs']:
390
  run.text = ''
391
 
392
- # If we have multiple lines, add them to the same paragraph or create new runs
393
  if len(replacement_lines) > 1 and red_segments:
394
  try:
395
- # Get the paragraph that contains the first run
396
  first_run = red_segments[0]['runs'][0][2]
397
- paragraph = first_run.element.getparent() # Get the paragraph element
398
 
399
- # Add remaining lines as new runs in the same paragraph with line breaks
400
  for line in replacement_lines[1:]:
401
- if line.strip(): # Only add non-empty lines
402
- # Add a line break run
403
  from docx.oxml import OxmlElement, ns
404
  br = OxmlElement('w:br')
405
  first_run.element.append(br)
406
 
407
- # Add the text as a new run
408
  new_run = paragraph.add_run(line.strip())
409
  new_run.font.color.rgb = RGBColor(0, 0, 0)
410
  except:
411
- # If we can't add line breaks, just put everything in the first run
412
  if red_segments and red_segments[0]['runs']:
413
  first_run = red_segments[0]['runs'][0][2]
414
- # Join all lines with spaces instead of line breaks
415
  first_run.text = ' '.join(replacement_lines)
416
  first_run.font.color.rgb = RGBColor(0, 0, 0)
417
 
418
  return replacements_made
419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  def handle_multiple_red_segments_in_cell(cell, flat_json):
421
- """Handle cells with multiple red text segments dynamically"""
422
  red_segments = extract_red_text_segments(cell)
423
 
424
  if not red_segments:
@@ -428,7 +302,6 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
428
  replacements_made = 0
429
  unmatched_segments = []
430
 
431
- # Try to match each segment individually
432
  for i, segment in enumerate(red_segments):
433
  segment_text = segment['text'].strip()
434
  if not segment_text:
@@ -436,13 +309,11 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
436
 
437
  print(f" Segment {i+1}: '{segment_text[:50]}...'")
438
 
439
- # Find JSON match for this segment
440
  json_value = find_matching_json_value(segment_text, flat_json)
441
 
442
  if json_value is not None:
443
  replacement_text = get_value_as_string(json_value, segment_text)
444
 
445
- # Handle list values
446
  if isinstance(json_value, list) and len(json_value) > 1:
447
  replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
448
 
@@ -454,7 +325,6 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
454
  unmatched_segments.append(segment)
455
  print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")
456
 
457
- # If we have unmatched segments, try to match the combined text
458
  if unmatched_segments and replacements_made == 0:
459
  combined_text = " ".join(seg['text'] for seg in red_segments).strip()
460
  print(f" πŸ”„ Trying combined text match: '{combined_text[:50]}...'")
@@ -465,109 +335,46 @@ def handle_multiple_red_segments_in_cell(cell, flat_json):
465
  if isinstance(json_value, list) and len(json_value) > 1:
466
  replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
467
 
468
- # Replace all segments with the combined replacement
469
  replacements_made = replace_all_red_segments(red_segments, replacement_text)
470
  print(f" βœ… Replaced combined text with '{replacement_text[:50]}...'")
471
 
472
  return replacements_made
473
 
474
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red segment and repaint it black.

    Returns True when a run was updated, False when the segment has no runs.
    """
    runs = segment['runs']
    if not runs:
        return False

    # The lead run carries the replacement text; tuples are (para_idx, run_idx, run).
    _, _, lead_run = runs[0]
    lead_run.text = replacement_text
    lead_run.font.color.rgb = RGBColor(0, 0, 0)

    # Any remaining runs of the segment are emptied out.
    for _, _, trailing_run in runs[1:]:
        trailing_run.text = ''

    return True
489
 
490
def process_tables(document, flat_json):
    """Process every table: dispatch special layouts (vehicle registration,
    print accreditation), then treat the rest as key/value tables whose
    red cells get replaced.  Returns the number of replacements made.

    NOTE(review): a later definition of process_tables in this module
    shadows this one at import time — confirm which version is intended.
    """
    replacements_made = 0

    for table_idx, table in enumerate(document.tables):
        print(f"\nπŸ” Processing table {table_idx + 1}:")

        table_type = detect_table_type(table)
        print(f" πŸ“‹ Detected table type: {table_type}")

        if table_type == "vehicle_registration":
            replacements_made += handle_vehicle_registration_table(table, flat_json)
            continue
        if table_type == "print_accreditation":
            replacements_made += handle_print_accreditation_section(table, flat_json)
            continue

        # Generic key-value table: scan each cell for red text.
        for row in table.rows:
            if not row.cells:
                continue

            for cell in row.cells:
                if not has_red_text(cell):
                    continue

                cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
                replacements_made += cell_replacements

                # No per-segment match: fall back to surrounding-cell context.
                if cell_replacements == 0:
                    replacements_made += try_context_based_replacement(cell, row, table, flat_json)

    return replacements_made
527
-
528
def detect_table_type(table):
    """Classify a table as 'vehicle_registration', 'print_accreditation',
    or 'key_value' by counting indicator phrases in its first three rows."""
    sample_parts = []
    for row in table.rows[:3]:
        for cell in row.cells:
            sample_parts.append(get_clean_text(cell).lower())
    sample_text = " ".join(sample_parts)

    vehicle_hits = sum(
        phrase in sample_text
        for phrase in ("registration number", "sub-contractor",
                       "weight verification", "rfs suspension")
    )
    print_hits = sum(
        phrase in sample_text
        for phrase in ("print name", "position title")
    )

    # At least 3 vehicle indicators (resp. 2 print indicators) are required
    # to avoid false positives on ordinary key/value tables.
    if vehicle_hits >= 3:
        return "vehicle_registration"
    if print_hits >= 2:
        return "print_accreditation"
    return "key_value"
550
 
551
  def try_context_based_replacement(cell, row, table, flat_json):
552
- """Try to find replacement using context from surrounding cells"""
553
  replacements_made = 0
554
 
555
- # Get context from row headers/labels
556
  row_context = ""
557
  if len(row.cells) > 1:
558
- # First cell might be a label
559
  first_cell_text = get_clean_text(row.cells[0]).strip()
560
  if first_cell_text and not has_red_text(row.cells[0]):
561
  row_context = first_cell_text
562
 
563
- # Get red text from the cell
564
  red_segments = extract_red_text_segments(cell)
565
  for segment in red_segments:
566
  red_text = segment['text'].strip()
567
  if not red_text:
568
  continue
569
 
570
- # Try combining context with red text
571
  if row_context:
572
  context_queries = [
573
  f"{row_context} {red_text}",
@@ -587,7 +394,56 @@ def try_context_based_replacement(cell, row, table, flat_json):
587
 
588
  return replacements_made
589
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  def handle_australian_company_number(row, company_numbers):
 
591
  replacements_made = 0
592
  for i, digit in enumerate(company_numbers):
593
  cell_idx = i + 1
@@ -600,26 +456,23 @@ def handle_australian_company_number(row, company_numbers):
600
  return replacements_made
601
 
602
  def handle_vehicle_registration_table(table, flat_json):
603
- """Handle the Vehicle Registration Numbers table with column-based data"""
604
  replacements_made = 0
605
 
606
- # Look for the vehicle registration data in the flattened JSON
607
  vehicle_section = None
608
 
609
- # Try to find the vehicle registration section
610
  for key, value in flat_json.items():
611
  if "vehicle registration numbers of records examined" in key.lower():
612
- if isinstance(value, dict): # This should be the nested structure
613
  vehicle_section = value
614
  print(f" βœ… Found vehicle data in key: '{key}'")
615
  break
616
 
617
  if not vehicle_section:
618
- # Try alternative approach - look for individual column keys
619
  potential_columns = {}
620
  for key, value in flat_json.items():
621
  if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
622
- # Extract the column name from the flattened key
623
  if "." in key:
624
  column_name = key.split(".")[-1]
625
  else:
@@ -635,7 +488,7 @@ def handle_vehicle_registration_table(table, flat_json):
635
 
636
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
637
 
638
- # Find header row (usually row 0 or 1)
639
  header_row_idx = -1
640
  header_row = None
641
 
@@ -652,30 +505,26 @@ def handle_vehicle_registration_table(table, flat_json):
652
 
653
  print(f" βœ… Found header row at index {header_row_idx}")
654
 
655
- # Create mapping between column indices and JSON keys
656
  column_mapping = {}
657
  for col_idx, cell in enumerate(header_row.cells):
658
  header_text = get_clean_text(cell).strip()
659
  if not header_text or header_text.lower() == "no.":
660
  continue
661
 
662
- # Try to match header text with JSON keys
663
  best_match = None
664
  best_score = 0
665
 
666
- # Normalize header text for better matching
667
  normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
668
 
669
  for json_key in vehicle_section.keys():
670
  normalized_json = json_key.lower().strip()
671
 
672
- # Try exact match first (after normalization)
673
  if normalized_header == normalized_json:
674
  best_match = json_key
675
  best_score = 1.0
676
  break
677
 
678
- # Try word-based matching
679
  header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
680
  json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
681
 
@@ -683,16 +532,15 @@ def handle_vehicle_registration_table(table, flat_json):
683
  common_words = header_words.intersection(json_words)
684
  score = len(common_words) / max(len(header_words), len(json_words))
685
 
686
- if score > best_score and score >= 0.3: # At least 30% match
687
  best_score = score
688
  best_match = json_key
689
 
690
- # Try substring matching for cases like "RegistrationNumber" vs "Registration Number"
691
  header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
692
  json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
693
 
694
  if header_clean in json_clean or json_clean in header_clean:
695
- if len(header_clean) > 5 and len(json_clean) > 5: # Only for meaningful matches
696
  substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
697
  if substring_score > best_score and substring_score >= 0.6:
698
  best_score = substring_score
@@ -706,7 +554,7 @@ def handle_vehicle_registration_table(table, flat_json):
706
  print(f" ❌ No column mappings found")
707
  return 0
708
 
709
- # Determine how many data rows we need based on the JSON arrays
710
  max_data_rows = 0
711
  for json_key, data in vehicle_section.items():
712
  if isinstance(data, list):
@@ -714,42 +562,35 @@ def handle_vehicle_registration_table(table, flat_json):
714
 
715
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
716
 
717
- # Process all required data rows
718
  for data_row_index in range(max_data_rows):
719
  table_row_idx = header_row_idx + 1 + data_row_index
720
 
721
- # Check if this table row exists, if not, add it
722
  if table_row_idx >= len(table.rows):
723
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
724
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
725
 
726
- # Add a new row to the table
727
  new_row = table.add_row()
728
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
729
 
730
  row = table.rows[table_row_idx]
731
  print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
732
 
733
- # Fill in data for each mapped column
734
  for col_idx, json_key in column_mapping.items():
735
  if col_idx < len(row.cells):
736
  cell = row.cells[col_idx]
737
 
738
- # Get the data for this column and row
739
  column_data = vehicle_section.get(json_key, [])
740
  if isinstance(column_data, list) and data_row_index < len(column_data):
741
  replacement_value = str(column_data[data_row_index])
742
 
743
- # Check if cell has red text or is empty (needs data)
744
  cell_text = get_clean_text(cell)
745
  if has_red_text(cell) or not cell_text.strip():
746
- # If cell is empty, add the text directly
747
  if not cell_text.strip():
748
  cell.text = replacement_value
749
  replacements_made += 1
750
  print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
751
  else:
752
- # If cell has red text, replace it
753
  cell_replacements = replace_red_text_in_cell(cell, replacement_value)
754
  replacements_made += cell_replacements
755
  if cell_replacements > 0:
@@ -758,52 +599,47 @@ def handle_vehicle_registration_table(table, flat_json):
758
  return replacements_made
759
 
760
def handle_print_accreditation_section(table, flat_json):
    """Fill the Print Name / Position Title cells from the two-element list
    stored under 'print accreditation name.print accreditation name'.

    Locates the header row containing both labels and replaces red text in
    the row directly below it.  Returns the number of replacements made.
    """
    print_data = flat_json.get("print accreditation name.print accreditation name", [])
    if not isinstance(print_data, list) or len(print_data) < 2:
        return 0

    name_value, position_value = print_data[0], print_data[1]
    print(f" πŸ“‹ Print accreditation data: Name='{name_value}', Position='{position_value}'")

    replacements_made = 0

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue

        cell1_text = get_clean_text(row.cells[0]).lower()
        cell2_text = get_clean_text(row.cells[1]).lower()
        if "print name" not in cell1_text or "position title" not in cell2_text:
            continue

        print(f" πŸ“ Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")

        # The values live in the row directly below the header row.
        if row_idx + 1 < len(table.rows):
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                if has_red_text(data_row.cells[0]):
                    cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
                    replacements_made += cell_replacements
                    if cell_replacements > 0:
                        print(f" βœ… Replaced Print Name: '{name_value}'")

                if has_red_text(data_row.cells[1]):
                    cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
                    replacements_made += cell_replacements
                    if cell_replacements > 0:
                        print(f" βœ… Replaced Position Title: '{position_value}'")

        # Section found; no need to keep scanning.
        break

    return replacements_made
805
 
806
  def process_single_column_sections(cell, field_name, flat_json):
 
807
  json_value = find_matching_json_value(field_name, flat_json)
808
  if json_value is not None:
809
  replacement_text = get_value_as_string(json_value, field_name)
@@ -819,41 +655,45 @@ def process_single_column_sections(cell, field_name, flat_json):
819
  return 0
820
 
821
  def process_tables(document, flat_json):
822
- """Process tables to find key-value pairs and replace red values"""
823
  replacements_made = 0
824
 
825
  for table_idx, table in enumerate(document.tables):
826
  print(f"\nπŸ” Processing table {table_idx + 1}:")
827
 
828
- # Check if this is the vehicle registration table
 
 
 
 
829
  table_text = ""
830
- for row in table.rows[:3]: # Check first 3 rows
831
  for cell in row.cells:
832
  table_text += get_clean_text(cell).lower() + " "
833
 
834
- # Look for vehicle registration indicators (need multiple indicators to avoid false positives)
835
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
836
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
837
- if indicator_count >= 3: # Require at least 3 indicators to be sure it's a vehicle table
838
  print(f" πŸš— Detected Vehicle Registration table")
839
  vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
840
  replacements_made += vehicle_replacements
841
- continue # Skip normal processing for this table
842
 
843
- # Check if this is the print accreditation table
844
  print_accreditation_indicators = ["print name", "position title"]
845
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
846
- if indicator_count >= 2: # Require at least 2 indicators to be sure it's a print accreditation table
847
  print(f" πŸ“‹ Detected Print Accreditation table")
848
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
849
  replacements_made += print_accreditation_replacements
850
- continue # Skip normal processing for this table
851
 
 
852
  for row_idx, row in enumerate(table.rows):
853
- if len(row.cells) < 1: # Skip empty rows
854
  continue
855
 
856
- # Get the key from the first column
857
  key_cell = row.cells[0]
858
  key_text = get_clean_text(key_cell)
859
 
@@ -862,27 +702,24 @@ def process_tables(document, flat_json):
862
 
863
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
864
 
865
- # Check if this key exists in our JSON
866
  json_value = find_matching_json_value(key_text, flat_json)
867
 
868
  if json_value is not None:
869
  replacement_text = get_value_as_string(json_value, key_text)
870
 
871
- # Special handling for Australian Company Number
872
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
873
  cell_replacements = handle_australian_company_number(row, json_value)
874
  replacements_made += cell_replacements
875
 
876
- # Handle section headers (like Attendance List, Nature of Business) where content is in next row
877
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
878
  print(f" βœ… Section header detected, checking next row for content...")
879
  next_row = table.rows[row_idx + 1]
880
 
881
- # Check all cells in the next row for red text
882
  for cell_idx, cell in enumerate(next_row.cells):
883
  if has_red_text(cell):
884
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
885
- # For list values, join with line breaks
886
  if isinstance(json_value, list):
887
  replacement_text = "\n".join(str(item) for item in json_value)
888
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
@@ -902,6 +739,7 @@ def process_tables(document, flat_json):
902
  cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
903
  replacements_made += cell_replacements
904
  else:
 
905
  if len(row.cells) == 1 and has_red_text(key_cell):
906
  red_text = ""
907
  for paragraph in key_cell.paragraphs:
@@ -915,32 +753,30 @@ def process_tables(document, flat_json):
915
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
916
  replacements_made += cell_replacements
917
 
918
- # Handle tables where red text appears in multiple columns (like contact info tables)
919
  for cell_idx in range(len(row.cells)):
920
  cell = row.cells[cell_idx]
921
  if has_red_text(cell):
922
- # Get the red text from this cell
923
- red_text = ""
924
- for paragraph in cell.paragraphs:
925
- for run in paragraph.runs:
926
- if is_red(run):
927
- red_text += run.text
928
 
929
- if red_text.strip():
930
- # Try to find a direct mapping for this red text
931
- section_value = find_matching_json_value(red_text.strip(), flat_json)
932
- if section_value is not None:
933
- section_replacement = get_value_as_string(section_value, red_text.strip())
934
- cell_replacements = replace_red_text_in_cell(cell, section_replacement)
935
- replacements_made += cell_replacements
936
- if cell_replacements > 0:
937
- print(f" βœ… Replaced red text '{red_text.strip()[:30]}...' with '{section_replacement[:30]}...' in cell {cell_idx + 1}")
938
 
939
  return replacements_made
940
 
941
  def process_paragraphs(document, flat_json):
 
942
  replacements_made = 0
943
  print(f"\nπŸ” Processing paragraphs:")
 
944
  for para_idx, paragraph in enumerate(document.paragraphs):
945
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
946
  if red_runs:
@@ -948,16 +784,18 @@ def process_paragraphs(document, flat_json):
948
  red_text_only = "".join(run.text for run in red_runs).strip()
949
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
950
 
951
- # Try to match the red text specifically first
952
  json_value = find_matching_json_value(red_text_only, flat_json)
953
 
954
- # If no match, try some common patterns
955
  if json_value is None:
956
- # Check for signature patterns
957
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
958
  json_value = find_matching_json_value("auditor signature", flat_json)
959
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
960
  json_value = find_matching_json_value("operator signature", flat_json)
 
 
 
961
 
962
  if json_value is not None:
963
  replacement_text = get_value_as_string(json_value)
@@ -967,20 +805,225 @@ def process_paragraphs(document, flat_json):
967
  for run in red_runs[1:]:
968
  run.text = ''
969
  replacements_made += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  return replacements_made
971
 
972
  def process_hf(json_file, docx_file, output_file):
973
- """
974
- Accepts file-like objects or file paths.
975
- For Hugging Face: json_file, docx_file, output_file will be file-like objects.
976
- """
977
  try:
978
- # --- Load JSON (file or file-like) ---
979
  if hasattr(json_file, "read"):
980
  json_data = json.load(json_file)
981
  else:
982
  with open(json_file, 'r', encoding='utf-8') as f:
983
  json_data = json.load(f)
 
984
  flat_json = flatten_json(json_data)
985
  print("πŸ“„ Available JSON keys (sample):")
986
  for i, (key, value) in enumerate(sorted(flat_json.items())):
@@ -988,24 +1031,38 @@ def process_hf(json_file, docx_file, output_file):
988
  print(f" - {key}: {value}")
989
  print(f" ... and {len(flat_json) - 10} more keys\n")
990
 
991
- # --- Load DOCX (file or file-like) ---
992
  if hasattr(docx_file, "read"):
993
  doc = Document(docx_file)
994
  else:
995
  doc = Document(docx_file)
996
 
 
 
 
 
997
  table_replacements = process_tables(doc, flat_json)
998
  paragraph_replacements = process_paragraphs(doc, flat_json)
999
  heading_replacements = process_headings(doc, flat_json)
1000
- total_replacements = table_replacements + paragraph_replacements
 
 
 
 
1001
 
1002
- # --- Save DOCX output (file or file-like) ---
1003
  if hasattr(output_file, "write"):
1004
  doc.save(output_file)
1005
  else:
1006
  doc.save(output_file)
 
1007
  print(f"\nβœ… Document saved as: {output_file}")
1008
- print(f"βœ… Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs, {heading_replacements} in headings)")
 
 
 
 
 
1009
 
1010
  except FileNotFoundError as e:
1011
  print(f"❌ File not found: {e}")
@@ -1017,9 +1074,9 @@ def process_hf(json_file, docx_file, output_file):
1017
  if __name__ == "__main__":
1018
  import sys
1019
  if len(sys.argv) != 4:
1020
- print("Usage: python updated_word.py <input_docx> <updated_json> <output_docx>")
1021
  exit(1)
1022
  docx_path = sys.argv[1]
1023
  json_path = sys.argv[2]
1024
  output_path = sys.argv[3]
1025
- process_hf(json_path, docx_path, output_path) # <--- if your main function is called process_hf!
 
3
  from docx.shared import RGBColor
4
  import re
5
 
6
+ # Enhanced heading patterns (ADDITIVE - keeps your existing ones)
7
  HEADING_PATTERNS = {
8
  "main": [
9
  r"NHVAS\s+Audit\s+Summary\s+Report",
 
22
  r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
23
  r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
24
  r"Operator\s+Declaration",
25
+ r"Operator\s+Information",
26
+ r"Driver\s*/\s*Scheduler\s+Records\s+Examined"
27
  ]
28
  }
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def load_json(filepath):
31
  with open(filepath, 'r') as file:
32
  return json.load(file)
 
61
  return str(value)
62
 
63
  def find_matching_json_value(field_name, flat_json):
64
+ """Enhanced dynamic matching without manual mappings"""
65
  field_name = field_name.strip()
66
 
67
  # Try exact match first
 
83
 
84
  # Try partial matching - remove parentheses and special chars
85
  clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
86
+ clean_field = re.sub(r'\s+', ' ', clean_field)
87
 
88
  for key, value in flat_json.items():
89
  clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
 
93
  print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
94
  return value
95
 
96
+ # Enhanced fuzzy matching with better scoring
97
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
98
  if not field_words:
99
  return None
 
122
  best_match = value
123
  best_key = key
124
 
125
+ if best_match and best_score >= 0.25: # Lowered threshold for better coverage
126
  print(f" βœ… Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
127
  return best_match
128
 
 
144
  return False
145
 
146
  def extract_red_text_segments(cell):
147
+ """Enhanced red text extraction with better multi-line handling"""
148
  red_segments = []
149
 
150
  for para_idx, paragraph in enumerate(cell.paragraphs):
 
153
 
154
  for run_idx, run in enumerate(paragraph.runs):
155
  if is_red(run):
156
+ if run.text:
157
  current_segment += run.text
158
  segment_runs.append((para_idx, run_idx, run))
159
  else:
160
  # End of current red segment
161
+ if segment_runs:
162
  red_segments.append({
163
  'text': current_segment,
164
  'runs': segment_runs.copy(),
 
168
  segment_runs = []
169
 
170
  # Handle segment at end of paragraph
171
+ if segment_runs:
172
  red_segments.append({
173
  'text': current_segment,
174
  'runs': segment_runs.copy(),
 
178
  return red_segments
179
 
180
  def replace_red_text_in_cell(cell, replacement_text):
181
+ """Enhanced cell replacement with improved multi-line handling"""
182
  red_segments = extract_red_text_segments(cell)
183
 
184
  if not red_segments:
185
  return 0
186
 
 
187
  if len(red_segments) > 1:
188
  replacements_made = 0
189
  for segment in red_segments:
190
  segment_text = segment['text'].strip()
191
  if segment_text:
 
 
192
  pass
193
 
 
194
  if replacements_made == 0:
195
  return replace_all_red_segments(red_segments, replacement_text)
196
 
 
197
  return replace_all_red_segments(red_segments, replacement_text)
198
 
199
  def replace_all_red_segments(red_segments, replacement_text):
200
+ """Enhanced replacement with better line handling"""
201
  if not red_segments:
202
  return 0
203
 
 
204
  if '\n' in replacement_text:
205
  replacement_lines = replacement_text.split('\n')
206
  else:
 
208
 
209
  replacements_made = 0
210
 
 
211
  if red_segments and replacement_lines:
212
  first_segment = red_segments[0]
213
  if first_segment['runs']:
214
+ first_run = first_segment['runs'][0][2]
215
  first_run.text = replacement_lines[0]
216
  first_run.font.color.rgb = RGBColor(0, 0, 0)
217
  replacements_made = 1
218
 
 
219
  for _, _, run in first_segment['runs'][1:]:
220
  run.text = ''
221
 
 
222
  for segment in red_segments[1:]:
223
  for _, _, run in segment['runs']:
224
  run.text = ''
225
 
 
226
  if len(replacement_lines) > 1 and red_segments:
227
  try:
 
228
  first_run = red_segments[0]['runs'][0][2]
229
+ paragraph = first_run.element.getparent()
230
 
 
231
  for line in replacement_lines[1:]:
232
+ if line.strip():
 
233
  from docx.oxml import OxmlElement, ns
234
  br = OxmlElement('w:br')
235
  first_run.element.append(br)
236
 
 
237
  new_run = paragraph.add_run(line.strip())
238
  new_run.font.color.rgb = RGBColor(0, 0, 0)
239
  except:
 
240
  if red_segments and red_segments[0]['runs']:
241
  first_run = red_segments[0]['runs'][0][2]
 
242
  first_run.text = ' '.join(replacement_lines)
243
  first_run.font.color.rgb = RGBColor(0, 0, 0)
244
 
245
  return replacements_made
246
 
247
def analyze_table_structure(table):
    """Inspect a docx table and describe its layout.

    Returns a dict with the detected table type, row/column counts, the
    coordinates of every cell that still contains red text, and a guess
    at whether the table has a header row.
    """
    info = {
        'type': 'unknown',
        'orientation': 'unknown',
        'has_headers': False,
        'column_count': 0,
        'row_count': 0,
        'red_text_locations': []
    }

    rows = table.rows
    if not rows:
        return info

    info['row_count'] = len(rows)
    info['column_count'] = len(rows[0].cells)

    # Classify the table by keywords appearing in its first row.
    header_blob = " ".join(
        get_clean_text(cell).strip() for cell in rows[0].cells
    ).lower()

    def _contains(words):
        # True when any of *words* occurs in the first-row text.
        return any(word in header_blob for word in words)

    if _contains(("registration", "vehicle", "maintenance", "mass")):
        info['type'] = 'vehicle_registration'
    elif _contains(("print name", "position", "auditor", "operator")):
        info['type'] = 'declaration'
    elif _contains(("std", "standard", "compliance")):
        info['type'] = 'compliance_matrix'
    elif len(rows[0].cells) == 2 and not _contains(("no.", "number")):
        info['type'] = 'key_value'
    else:
        info['type'] = 'data_grid'

    # Record every (row, column) coordinate that holds red text.
    info['red_text_locations'] = [
        (row_idx, col_idx)
        for row_idx, row in enumerate(rows)
        for col_idx, cell in enumerate(row.cells)
        if has_red_text(cell)
    ]

    # Heuristic: red text exists somewhere, but not in the top-left cell.
    info['has_headers'] = (
        bool(info['red_text_locations'])
        and (0, 0) not in info['red_text_locations']
    )

    return info
293
+
294
  def handle_multiple_red_segments_in_cell(cell, flat_json):
295
+ """Enhanced multi-segment handling"""
296
  red_segments = extract_red_text_segments(cell)
297
 
298
  if not red_segments:
 
302
  replacements_made = 0
303
  unmatched_segments = []
304
 
 
305
  for i, segment in enumerate(red_segments):
306
  segment_text = segment['text'].strip()
307
  if not segment_text:
 
309
 
310
  print(f" Segment {i+1}: '{segment_text[:50]}...'")
311
 
 
312
  json_value = find_matching_json_value(segment_text, flat_json)
313
 
314
  if json_value is not None:
315
  replacement_text = get_value_as_string(json_value, segment_text)
316
 
 
317
  if isinstance(json_value, list) and len(json_value) > 1:
318
  replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
319
 
 
325
  unmatched_segments.append(segment)
326
  print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")
327
 
 
328
  if unmatched_segments and replacements_made == 0:
329
  combined_text = " ".join(seg['text'] for seg in red_segments).strip()
330
  print(f" πŸ”„ Trying combined text match: '{combined_text[:50]}...'")
 
335
  if isinstance(json_value, list) and len(json_value) > 1:
336
  replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
337
 
 
338
  replacements_made = replace_all_red_segments(red_segments, replacement_text)
339
  print(f" βœ… Replaced combined text with '{replacement_text[:50]}...'")
340
 
341
  return replacements_made
342
 
343
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red-text segment.

    The segment's first run receives the new text recoloured to black;
    every remaining run is blanked so the text is not duplicated.
    Returns True when a replacement was performed, False when the
    segment has no runs.
    """
    runs = segment['runs']
    if not runs:
        return False

    # Entries are (para_idx, run_idx, run); index 2 is the run object.
    lead = runs[0][2]
    lead.text = replacement_text
    lead.font.color.rgb = RGBColor(0, 0, 0)

    for _, _, trailing in runs[1:]:
        trailing.text = ''

    return True
356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
def detect_table_type(table):
    """Classify *table* by delegating to the structural analyzer."""
    return analyze_table_structure(table)['type']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
  def try_context_based_replacement(cell, row, table, flat_json):
363
+ """Enhanced context-based replacement"""
364
  replacements_made = 0
365
 
 
366
  row_context = ""
367
  if len(row.cells) > 1:
 
368
  first_cell_text = get_clean_text(row.cells[0]).strip()
369
  if first_cell_text and not has_red_text(row.cells[0]):
370
  row_context = first_cell_text
371
 
 
372
  red_segments = extract_red_text_segments(cell)
373
  for segment in red_segments:
374
  red_text = segment['text'].strip()
375
  if not red_text:
376
  continue
377
 
 
378
  if row_context:
379
  context_queries = [
380
  f"{row_context} {red_text}",
 
394
 
395
  return replacements_made
396
 
397
def smart_fallback_processor(element, flat_json):
    """Last-chance pass over an element's red runs using semantic matching.

    *element* may be any container exposing a ``paragraphs`` attribute
    (cell or document part); anything else is ignored.  Each non-blank
    red run is looked up via semantic keyword matching and, on success,
    rewritten in black.  Returns the number of runs replaced.
    """
    if not hasattr(element, 'paragraphs'):
        return 0

    replaced = 0
    for para in element.paragraphs:
        for run in para.runs:
            if not (is_red(run) and run.text.strip()):
                continue

            target = run.text.strip()
            match = semantic_text_matching(target, flat_json)
            if not match:
                continue

            new_text = get_value_as_string(match, target)
            run.text = new_text
            run.font.color.rgb = RGBColor(0, 0, 0)
            replaced += 1
            print(f" 🎯 Fallback match: '{target}' -> '{new_text[:30]}...'")

    return replaced
419
+
420
def semantic_text_matching(text, flat_json):
    """Heuristic matcher for red text that has no direct JSON key.

    Buckets *text* into a semantic category (name/date/address/number/
    email/position) by substring keyword, then returns the value of the
    first JSON key sharing a keyword from the same category.  Categories
    are tried in declaration order; returns None when nothing matches.
    """
    needle = text.lower().strip()

    categories = {
        'name': ['name', 'manager', 'operator', 'auditor', 'driver'],
        'date': ['date', 'expiry', 'conducted', 'completed'],
        'address': ['address', 'location', 'road', 'street'],
        'number': ['number', 'registration', 'phone', 'telephone'],
        'email': ['email', 'mail'],
        'position': ['position', 'title', 'role']
    }

    for keywords in categories.values():
        if not any(keyword in needle for keyword in keywords):
            continue
        # Text fits this bucket -> look for a JSON key in the same bucket.
        for key, value in flat_json.items():
            if any(keyword in key.lower() for keyword in keywords):
                return value

    return None
444
+
445
  def handle_australian_company_number(row, company_numbers):
446
+ """Enhanced ACN handling"""
447
  replacements_made = 0
448
  for i, digit in enumerate(company_numbers):
449
  cell_idx = i + 1
 
456
  return replacements_made
457
 
458
  def handle_vehicle_registration_table(table, flat_json):
459
+ """Enhanced vehicle registration table handling"""
460
  replacements_made = 0
461
 
462
+ # Try to find vehicle registration data
463
  vehicle_section = None
464
 
 
465
  for key, value in flat_json.items():
466
  if "vehicle registration numbers of records examined" in key.lower():
467
+ if isinstance(value, dict):
468
  vehicle_section = value
469
  print(f" βœ… Found vehicle data in key: '{key}'")
470
  break
471
 
472
  if not vehicle_section:
 
473
  potential_columns = {}
474
  for key, value in flat_json.items():
475
  if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
 
476
  if "." in key:
477
  column_name = key.split(".")[-1]
478
  else:
 
488
 
489
  print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
490
 
491
+ # Find header row
492
  header_row_idx = -1
493
  header_row = None
494
 
 
505
 
506
  print(f" βœ… Found header row at index {header_row_idx}")
507
 
508
+ # Enhanced column mapping
509
  column_mapping = {}
510
  for col_idx, cell in enumerate(header_row.cells):
511
  header_text = get_clean_text(cell).strip()
512
  if not header_text or header_text.lower() == "no.":
513
  continue
514
 
 
515
  best_match = None
516
  best_score = 0
517
 
 
518
  normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
519
 
520
  for json_key in vehicle_section.keys():
521
  normalized_json = json_key.lower().strip()
522
 
 
523
  if normalized_header == normalized_json:
524
  best_match = json_key
525
  best_score = 1.0
526
  break
527
 
 
528
  header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
529
  json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
530
 
 
532
  common_words = header_words.intersection(json_words)
533
  score = len(common_words) / max(len(header_words), len(json_words))
534
 
535
+ if score > best_score and score >= 0.3:
536
  best_score = score
537
  best_match = json_key
538
 
 
539
  header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
540
  json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
541
 
542
  if header_clean in json_clean or json_clean in header_clean:
543
+ if len(header_clean) > 5 and len(json_clean) > 5:
544
  substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
545
  if substring_score > best_score and substring_score >= 0.6:
546
  best_score = substring_score
 
554
  print(f" ❌ No column mappings found")
555
  return 0
556
 
557
+ # Determine data rows needed
558
  max_data_rows = 0
559
  for json_key, data in vehicle_section.items():
560
  if isinstance(data, list):
 
562
 
563
  print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
564
 
565
+ # Process data rows
566
  for data_row_index in range(max_data_rows):
567
  table_row_idx = header_row_idx + 1 + data_row_index
568
 
 
569
  if table_row_idx >= len(table.rows):
570
  print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
571
  print(f" βž• Adding new row for vehicle {data_row_index + 1}")
572
 
 
573
  new_row = table.add_row()
574
  print(f" βœ… Successfully added row {len(table.rows)} to the table")
575
 
576
  row = table.rows[table_row_idx]
577
  print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
578
 
 
579
  for col_idx, json_key in column_mapping.items():
580
  if col_idx < len(row.cells):
581
  cell = row.cells[col_idx]
582
 
 
583
  column_data = vehicle_section.get(json_key, [])
584
  if isinstance(column_data, list) and data_row_index < len(column_data):
585
  replacement_value = str(column_data[data_row_index])
586
 
 
587
  cell_text = get_clean_text(cell)
588
  if has_red_text(cell) or not cell_text.strip():
 
589
  if not cell_text.strip():
590
  cell.text = replacement_value
591
  replacements_made += 1
592
  print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
593
  else:
 
594
  cell_replacements = replace_red_text_in_cell(cell, replacement_value)
595
  replacements_made += cell_replacements
596
  if cell_replacements > 0:
 
599
  return replacements_made
600
 
601
  def handle_print_accreditation_section(table, flat_json):
602
+ """Enhanced print accreditation handling"""
603
  replacements_made = 0
604
 
 
605
  print_data = flat_json.get("print accreditation name.print accreditation name", [])
606
  if not isinstance(print_data, list) or len(print_data) < 2:
607
  return 0
608
 
609
+ name_value = print_data[0]
610
+ position_value = print_data[1]
611
 
612
  print(f" πŸ“‹ Print accreditation data: Name='{name_value}', Position='{position_value}'")
613
 
 
614
  for row_idx, row in enumerate(table.rows):
615
  if len(row.cells) >= 2:
 
616
  cell1_text = get_clean_text(row.cells[0]).lower()
617
  cell2_text = get_clean_text(row.cells[1]).lower()
618
 
619
  if "print name" in cell1_text and "position title" in cell2_text:
620
  print(f" πŸ“ Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")
621
 
 
622
  if row_idx + 1 < len(table.rows):
623
  data_row = table.rows[row_idx + 1]
624
  if len(data_row.cells) >= 2:
 
625
  if has_red_text(data_row.cells[0]):
626
  cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
627
  replacements_made += cell_replacements
628
  if cell_replacements > 0:
629
  print(f" βœ… Replaced Print Name: '{name_value}'")
630
 
 
631
  if has_red_text(data_row.cells[1]):
632
  cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
633
  replacements_made += cell_replacements
634
  if cell_replacements > 0:
635
  print(f" βœ… Replaced Position Title: '{position_value}'")
636
 
637
+ break
638
 
639
  return replacements_made
640
 
641
  def process_single_column_sections(cell, field_name, flat_json):
642
+ """Enhanced single column processing"""
643
  json_value = find_matching_json_value(field_name, flat_json)
644
  if json_value is not None:
645
  replacement_text = get_value_as_string(json_value, field_name)
 
655
  return 0
656
 
657
  def process_tables(document, flat_json):
658
+ """ENHANCED: Your existing function + smart enhancements"""
659
  replacements_made = 0
660
 
661
  for table_idx, table in enumerate(document.tables):
662
  print(f"\nπŸ” Processing table {table_idx + 1}:")
663
 
664
+ # ENHANCED: Dynamic table analysis
665
+ table_structure = analyze_table_structure(table)
666
+ print(f" πŸ“Š Table structure: {table_structure['type']} ({table_structure['row_count']}x{table_structure['column_count']})")
667
+
668
+ # Your existing logic with enhancements
669
  table_text = ""
670
+ for row in table.rows[:3]:
671
  for cell in row.cells:
672
  table_text += get_clean_text(cell).lower() + " "
673
 
674
+ # Enhanced vehicle registration detection
675
  vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
676
  indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
677
+ if indicator_count >= 2 or table_structure['type'] == 'vehicle_registration': # Lowered threshold
678
  print(f" πŸš— Detected Vehicle Registration table")
679
  vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
680
  replacements_made += vehicle_replacements
681
+ continue
682
 
683
+ # Enhanced print accreditation detection
684
  print_accreditation_indicators = ["print name", "position title"]
685
  indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
686
+ if indicator_count >= 1 or table_structure['type'] == 'declaration': # Lowered threshold
687
  print(f" πŸ“‹ Detected Print Accreditation table")
688
  print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
689
  replacements_made += print_accreditation_replacements
690
+ continue
691
 
692
+ # Your existing row processing with enhancements
693
  for row_idx, row in enumerate(table.rows):
694
+ if len(row.cells) < 1:
695
  continue
696
 
 
697
  key_cell = row.cells[0]
698
  key_text = get_clean_text(key_cell)
699
 
 
702
 
703
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
704
 
 
705
  json_value = find_matching_json_value(key_text, flat_json)
706
 
707
  if json_value is not None:
708
  replacement_text = get_value_as_string(json_value, key_text)
709
 
710
+ # Enhanced ACN handling
711
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
712
  cell_replacements = handle_australian_company_number(row, json_value)
713
  replacements_made += cell_replacements
714
 
715
+ # Enhanced section header handling
716
  elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
717
  print(f" βœ… Section header detected, checking next row for content...")
718
  next_row = table.rows[row_idx + 1]
719
 
 
720
  for cell_idx, cell in enumerate(next_row.cells):
721
  if has_red_text(cell):
722
  print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
 
723
  if isinstance(json_value, list):
724
  replacement_text = "\n".join(str(item) for item in json_value)
725
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
 
739
  cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
740
  replacements_made += cell_replacements
741
  else:
742
+ # Enhanced fallback processing for unmatched keys
743
  if len(row.cells) == 1 and has_red_text(key_cell):
744
  red_text = ""
745
  for paragraph in key_cell.paragraphs:
 
753
  cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
754
  replacements_made += cell_replacements
755
 
756
+ # Enhanced red text processing for all cells
757
  for cell_idx in range(len(row.cells)):
758
  cell = row.cells[cell_idx]
759
  if has_red_text(cell):
760
+ cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
761
+ replacements_made += cell_replacements
 
 
 
 
762
 
763
+ # ENHANCED: Fallback for still unmatched red text
764
+ if cell_replacements == 0:
765
+ context_replacements = try_context_based_replacement(cell, row, table, flat_json)
766
+ replacements_made += context_replacements
767
+
768
+ # ENHANCED: Smart fallback processor
769
+ if context_replacements == 0:
770
+ fallback_replacements = smart_fallback_processor(cell, flat_json)
771
+ replacements_made += fallback_replacements
772
 
773
  return replacements_made
774
 
775
  def process_paragraphs(document, flat_json):
776
+ """ENHANCED: Your existing function + smart fallbacks"""
777
  replacements_made = 0
778
  print(f"\nπŸ” Processing paragraphs:")
779
+
780
  for para_idx, paragraph in enumerate(document.paragraphs):
781
  red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
782
  if red_runs:
 
784
  red_text_only = "".join(run.text for run in red_runs).strip()
785
  print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
786
 
787
+ # Your existing matching logic
788
  json_value = find_matching_json_value(red_text_only, flat_json)
789
 
 
790
  if json_value is None:
791
+ # Enhanced pattern matching for signatures and dates
792
  if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
793
  json_value = find_matching_json_value("auditor signature", flat_json)
794
  elif "OPERATOR SIGNATURE" in red_text_only.upper():
795
  json_value = find_matching_json_value("operator signature", flat_json)
796
+ # ENHANCED: Try semantic matching
797
+ elif json_value is None:
798
+ json_value = semantic_text_matching(red_text_only, flat_json)
799
 
800
  if json_value is not None:
801
  replacement_text = get_value_as_string(json_value)
 
805
  for run in red_runs[1:]:
806
  run.text = ''
807
  replacements_made += 1
808
+ else:
809
+ # ENHANCED: Try smart fallback
810
+ fallback_replacements = smart_fallback_processor(paragraph, flat_json)
811
+ replacements_made += fallback_replacements
812
+
813
+ return replacements_made
814
+
815
def process_headings(document, flat_json):
    """Replace red text in recognised headings and the paragraphs after them.

    For every paragraph matching a pattern in HEADING_PATTERNS, red text
    in the heading itself is replaced first.  Then up to the next five
    paragraphs are scanned (stopping early when another heading starts)
    and processed with the heading text as matching context; if the
    context match fails, the semantic fallback is tried.  Returns the
    total number of replacements made.
    """

    def _heading_pattern(text):
        # Return the first HEADING_PATTERNS regex (any category) matching
        # *text*, or None.  Extracted because the original duplicated this
        # double loop verbatim for the heading and the lookahead checks.
        for patterns in HEADING_PATTERNS.values():
            for pattern in patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    return pattern
        return None

    replacements_made = 0
    print(f"\nπŸ” Processing headings:")

    paragraphs = document.paragraphs
    for para_idx, paragraph in enumerate(paragraphs):
        paragraph_text = paragraph.text.strip()
        if not paragraph_text:
            continue

        if _heading_pattern(paragraph_text) is None:
            continue

        print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")

        # Red text inside the heading line itself.
        if has_red_text_in_paragraph(paragraph):
            print(f" πŸ”΄ Found red text in heading itself")
            replacements_made += process_red_text_in_paragraph(
                paragraph, paragraph_text, flat_json
            )

        # Scan up to five following paragraphs for content tied to this
        # heading, stopping when the next heading begins.
        for next_para_idx in range(para_idx + 1, min(para_idx + 6, len(paragraphs))):
            next_paragraph = paragraphs[next_para_idx]
            next_text = next_paragraph.text.strip()
            if not next_text:
                continue

            if _heading_pattern(next_text) is not None:
                break

            if has_red_text_in_paragraph(next_paragraph):
                print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")

                context_replacements = process_red_text_in_paragraph(
                    next_paragraph,
                    paragraph_text,
                    flat_json
                )
                replacements_made += context_replacements

                # Nothing matched with heading context -> semantic fallback.
                if context_replacements == 0:
                    replacements_made += smart_fallback_processor(next_paragraph, flat_json)

    return replacements_made
889
+
890
def has_red_text_in_paragraph(paragraph):
    """Return True when the paragraph holds at least one non-blank red run."""
    return any(is_red(run) and run.text.strip() for run in paragraph.runs)
896
+
897
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
    """Resolve and replace the red text of a single paragraph.

    Matching strategies, tried in order:
      1. direct lookup of the combined red text;
      2. well-known field names implied by declaration headings in
         *context_text* (auditor / operator);
      3. queries combining the heading context with the red text;
      4. semantic keyword-category matching.
    On success the first red run receives the replacement (recoloured
    black) and the remaining red runs are blanked.  Returns 1 when a
    replacement happened, else 0.
    """
    fragments = [
        run.text.strip()
        for run in paragraph.runs
        if is_red(run) and run.text.strip()
    ]
    if not fragments:
        return 0

    combined_red_text = " ".join(fragments).strip()
    print(f" πŸ” Red text found: '{combined_red_text}'")

    # Strategy 1: direct lookup of the red text itself.
    json_value = find_matching_json_value(combined_red_text, flat_json)

    # Strategy 2: declaration headings imply well-known field names.
    if json_value is None:
        upper_context = context_text.upper()
        candidates, label = [], None
        if "NHVAS APPROVED AUDITOR" in upper_context:
            candidates = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
            label = "auditor"
        elif "OPERATOR DECLARATION" in upper_context:
            candidates = ["operator name", "operator", "company name", "organisation name", "print name"]
            label = "operator"
        for field in candidates:
            json_value = find_matching_json_value(field, flat_json)
            if json_value is not None:
                print(f" βœ… Found {label} match with field: '{field}'")
                break

    # Strategy 3: combine the heading context with the red text.
    if json_value is None:
        for query in (f"{context_text} {combined_red_text}", combined_red_text, context_text):
            json_value = find_matching_json_value(query, flat_json)
            if json_value is not None:
                print(f" βœ… Found match with combined query: '{query[:50]}...'")
                break

    # Strategy 4: keyword-category fallback.
    if json_value is None:
        json_value = semantic_text_matching(combined_red_text, flat_json)
        if json_value:
            print(f" βœ… Found semantic match for: '{combined_red_text}'")

    if json_value is None:
        print(f" ❌ No match found for red text: '{combined_red_text}'")
        return 0

    replacement_text = get_value_as_string(json_value, combined_red_text)
    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
    if not red_runs:
        return 0

    red_runs[0].text = replacement_text
    red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
    for run in red_runs[1:]:
        run.text = ''
    print(f" βœ… Replaced with: '{replacement_text}'")
    return 1
973
+
974
def comprehensive_document_scan(document, flat_json):
    """Final sweep of the whole document body for red text missed earlier.

    Walks every top-level XML element, maps tables and paragraphs back to
    their python-docx wrappers, and retries each red-text hit through the
    smart fallback processor.

    Args:
        document: the python-docx ``Document`` being processed.
        flat_json: flattened ``key -> value`` mapping from the source JSON.

    Returns:
        int: number of additional replacements performed by this pass.
    """
    print(f"\nπŸ” Comprehensive final scan for missed red text:")
    additional = 0

    for element in document.element.body:
        tag = element.tag

        if tag.endswith('tbl'):
            # Resolve the raw XML element to its Table wrapper, if any.
            table_obj = next(
                (tbl for tbl in document.tables if tbl._element == element),
                None,
            )
            if table_obj:
                for row in table_obj.rows:
                    for cell in row.cells:
                        if has_red_text(cell):
                            # One more attempt with the enhanced fallback.
                            additional += smart_fallback_processor(cell, flat_json)

        elif tag.endswith('p'):
            # Resolve the raw XML element to its Paragraph wrapper, if any.
            paragraph_obj = next(
                (para for para in document.paragraphs if para._element == element),
                None,
            )
            if paragraph_obj and has_red_text_in_paragraph(paragraph_obj):
                additional += smart_fallback_processor(paragraph_obj, flat_json)

    if additional > 0:
        print(f" βœ… Final scan caught {additional} additional replacements!")
    else:
        print(f" βœ… No additional red text found - document fully processed!")

    return additional
1016
 
1017
  def process_hf(json_file, docx_file, output_file):
1018
+ """ENHANCED: Your existing main function + comprehensive processing"""
 
 
 
1019
  try:
1020
+ # Load JSON
1021
  if hasattr(json_file, "read"):
1022
  json_data = json.load(json_file)
1023
  else:
1024
  with open(json_file, 'r', encoding='utf-8') as f:
1025
  json_data = json.load(f)
1026
+
1027
  flat_json = flatten_json(json_data)
1028
  print("πŸ“„ Available JSON keys (sample):")
1029
  for i, (key, value) in enumerate(sorted(flat_json.items())):
 
1031
  print(f" - {key}: {value}")
1032
  print(f" ... and {len(flat_json) - 10} more keys\n")
1033
 
1034
+ # Load DOCX
1035
  if hasattr(docx_file, "read"):
1036
  doc = Document(docx_file)
1037
  else:
1038
  doc = Document(docx_file)
1039
 
1040
+ # ENHANCED: Multi-pass processing for 100% coverage
1041
+ print("πŸš€ Starting enhanced multi-pass processing...")
1042
+
1043
+ # Pass 1: Your existing processors (enhanced)
1044
  table_replacements = process_tables(doc, flat_json)
1045
  paragraph_replacements = process_paragraphs(doc, flat_json)
1046
  heading_replacements = process_headings(doc, flat_json)
1047
+
1048
+ # Pass 2: NEW - Comprehensive final scan
1049
+ final_scan_replacements = comprehensive_document_scan(doc, flat_json)
1050
+
1051
+ total_replacements = table_replacements + paragraph_replacements + heading_replacements + final_scan_replacements
1052
 
1053
+ # Save output
1054
  if hasattr(output_file, "write"):
1055
  doc.save(output_file)
1056
  else:
1057
  doc.save(output_file)
1058
+
1059
  print(f"\nβœ… Document saved as: {output_file}")
1060
+ print(f"βœ… Total replacements: {total_replacements}")
1061
+ print(f" πŸ“Š Tables: {table_replacements}")
1062
+ print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1063
+ print(f" πŸ“‹ Headings: {heading_replacements}")
1064
+ print(f" 🎯 Final scan: {final_scan_replacements}")
1065
+ print(f"πŸŽ‰ Processing complete with enhanced coverage!")
1066
 
1067
  except FileNotFoundError as e:
1068
  print(f"❌ File not found: {e}")
 
1074
if __name__ == "__main__":
    import sys

    # Expect exactly three positional arguments.
    if len(sys.argv) != 4:
        print("Usage: python enhanced_pipeline.py <input_docx> <updated_json> <output_docx>")
        # FIX: use sys.exit — the bare exit() builtin is injected by the
        # `site` module and is not guaranteed under -S or frozen builds.
        sys.exit(1)

    docx_path = sys.argv[1]
    json_path = sys.argv[2]
    output_path = sys.argv[3]

    # Note argument order: process_hf takes (json_file, docx_file, output_file).
    process_hf(json_path, docx_path, output_path)