Shami96 committed on
Commit
ddb37e5
·
verified ·
1 Parent(s): 412e2ed

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +202 -749
updated_word.py CHANGED
@@ -3,50 +3,60 @@ from docx import Document
3
  from docx.shared import RGBColor
4
  import re
5
 
6
- # Your original heading patterns (unchanged)
7
- HEADING_PATTERNS = {
8
- "main": [
9
- r"NHVAS\s+Audit\s+Summary\s+Report",
10
- r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
11
- r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT"
12
- ],
13
- "sub": [
14
- r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
15
- r"MAINTENANCE\s+MANAGEMENT",
16
- r"MASS\s+MANAGEMENT",
17
- r"FATIGUE\s+MANAGEMENT",
18
- r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
19
- r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
20
- r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
21
- r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
22
- r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
23
- r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
24
- r"Operator\s+Declaration",
25
- r"Operator\s+Information",
26
- r"Driver\s*/\s*Scheduler\s+Records\s+Examined"
27
- ]
28
- }
29
-
30
  def load_json(filepath):
31
  with open(filepath, 'r') as file:
32
  return json.load(file)
33
 
34
- def flatten_json(y, prefix=''):
35
- out = {}
36
- for key, val in y.items():
37
- new_key = f"{prefix}.{key}" if prefix else key
38
- if isinstance(val, dict):
39
- out.update(flatten_json(val, new_key))
40
- else:
41
- out[new_key] = val
42
- out[key] = val
43
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def is_red(run):
 
46
  color = run.font.color
47
  return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
48
 
49
  def get_value_as_string(value, field_name=""):
 
50
  if isinstance(value, list):
51
  if len(value) == 0:
52
  return ""
@@ -54,46 +64,56 @@ def get_value_as_string(value, field_name=""):
54
  return str(value[0])
55
  else:
56
  if "australian company number" in field_name.lower() or "company number" in field_name.lower():
57
- return value
58
  else:
59
  return " ".join(str(v) for v in value)
60
  else:
61
  return str(value)
62
 
63
  def find_matching_json_value(field_name, flat_json):
64
- """Your original matching function (unchanged)"""
65
  field_name = field_name.strip()
66
 
67
- # Try exact match first
68
  if field_name in flat_json:
69
  print(f" βœ… Direct match found for key '{field_name}'")
70
  return flat_json[field_name]
71
 
72
- # Try case-insensitive exact match
73
  for key, value in flat_json.items():
74
  if key.lower() == field_name.lower():
75
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
76
  return value
77
 
78
- # Try suffix matching (for nested keys like "section.field")
79
- for key, value in flat_json.items():
80
- if '.' in key and key.split('.')[-1].lower() == field_name.lower():
81
- print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
82
- return value
83
-
84
- # Try partial matching - remove parentheses and special chars
85
- clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
86
- clean_field = re.sub(r'\s+', ' ', clean_field)
87
-
88
- for key, value in flat_json.items():
89
- clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
90
- clean_key = re.sub(r'\s+', ' ', clean_key)
91
-
92
- if clean_field == clean_key:
93
- print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
94
- return value
95
-
96
- # Enhanced fuzzy matching with better scoring
 
 
 
 
 
 
 
 
 
 
97
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
98
  if not field_words:
99
  return None
@@ -107,13 +127,9 @@ def find_matching_json_value(field_name, flat_json):
107
  if not key_words:
108
  continue
109
 
110
- # Calculate similarity score
111
  common_words = field_words.intersection(key_words)
112
  if common_words:
113
- # Use Jaccard similarity: intersection / union
114
  similarity = len(common_words) / len(field_words.union(key_words))
115
-
116
- # Bonus for high word coverage in field_name
117
  coverage = len(common_words) / len(field_words)
118
  final_score = (similarity * 0.6) + (coverage * 0.4)
119
 
@@ -130,6 +146,7 @@ def find_matching_json_value(field_name, flat_json):
130
  return None
131
 
132
  def get_clean_text(cell):
 
133
  text = ""
134
  for paragraph in cell.paragraphs:
135
  for run in paragraph.runs:
@@ -137,522 +154,130 @@ def get_clean_text(cell):
137
  return text.strip()
138
 
139
  def has_red_text(cell):
 
140
  for paragraph in cell.paragraphs:
141
  for run in paragraph.runs:
142
  if is_red(run) and run.text.strip():
143
  return True
144
  return False
145
 
146
- def extract_red_text_segments(cell):
147
- """Your original red text extraction (unchanged)"""
148
- red_segments = []
149
-
150
- for para_idx, paragraph in enumerate(cell.paragraphs):
151
- current_segment = ""
152
- segment_runs = []
153
-
154
- for run_idx, run in enumerate(paragraph.runs):
155
- if is_red(run):
156
- if run.text:
157
- current_segment += run.text
158
- segment_runs.append((para_idx, run_idx, run))
159
- else:
160
- # End of current red segment
161
- if segment_runs:
162
- red_segments.append({
163
- 'text': current_segment,
164
- 'runs': segment_runs.copy(),
165
- 'paragraph_idx': para_idx
166
- })
167
- current_segment = ""
168
- segment_runs = []
169
-
170
- # Handle segment at end of paragraph
171
- if segment_runs:
172
- red_segments.append({
173
- 'text': current_segment,
174
- 'runs': segment_runs.copy(),
175
- 'paragraph_idx': para_idx
176
- })
177
-
178
- return red_segments
179
-
180
  def replace_red_text_in_cell(cell, replacement_text):
181
- """Your original replacement function (unchanged)"""
182
- red_segments = extract_red_text_segments(cell)
183
-
184
- if not red_segments:
185
- return 0
186
-
187
- if len(red_segments) > 1:
188
- replacements_made = 0
189
- for segment in red_segments:
190
- segment_text = segment['text'].strip()
191
- if segment_text:
192
- pass
193
-
194
- if replacements_made == 0:
195
- return replace_all_red_segments(red_segments, replacement_text)
196
-
197
- return replace_all_red_segments(red_segments, replacement_text)
198
-
199
- def replace_all_red_segments(red_segments, replacement_text):
200
- """Your original function (unchanged)"""
201
- if not red_segments:
202
- return 0
203
-
204
- if '\n' in replacement_text:
205
- replacement_lines = replacement_text.split('\n')
206
- else:
207
- replacement_lines = [replacement_text]
208
-
209
  replacements_made = 0
210
 
211
- if red_segments and replacement_lines:
212
- first_segment = red_segments[0]
213
- if first_segment['runs']:
214
- first_run = first_segment['runs'][0][2]
215
- first_run.text = replacement_lines[0]
216
- first_run.font.color.rgb = RGBColor(0, 0, 0)
217
- replacements_made = 1
218
-
219
- for _, _, run in first_segment['runs'][1:]:
220
- run.text = ''
221
-
222
- for segment in red_segments[1:]:
223
- for _, _, run in segment['runs']:
224
- run.text = ''
225
-
226
- if len(replacement_lines) > 1 and red_segments:
227
- try:
228
- first_run = red_segments[0]['runs'][0][2]
229
- paragraph = first_run.element.getparent()
230
-
231
- for line in replacement_lines[1:]:
232
- if line.strip():
233
- from docx.oxml import OxmlElement, ns
234
- br = OxmlElement('w:br')
235
- first_run.element.append(br)
236
-
237
- new_run = paragraph.add_run(line.strip())
238
- new_run.font.color.rgb = RGBColor(0, 0, 0)
239
- except:
240
- if red_segments and red_segments[0]['runs']:
241
- first_run = red_segments[0]['runs'][0][2]
242
- first_run.text = ' '.join(replacement_lines)
243
- first_run.font.color.rgb = RGBColor(0, 0, 0)
244
 
245
  return replacements_made
246
 
247
- def replace_single_segment(segment, replacement_text):
248
- """Your original function (unchanged)"""
249
- if not segment['runs']:
250
- return False
251
-
252
- first_run = segment['runs'][0][2]
253
- first_run.text = replacement_text
254
- first_run.font.color.rgb = RGBColor(0, 0, 0)
255
-
256
- for _, _, run in segment['runs'][1:]:
257
- run.text = ''
258
-
259
- return True
260
-
261
- def handle_multiple_red_segments_in_cell(cell, flat_json):
262
- """Your original function (unchanged)"""
263
- red_segments = extract_red_text_segments(cell)
264
-
265
- if not red_segments:
266
- return 0
267
-
268
- print(f" πŸ” Found {len(red_segments)} red text segments in cell")
269
  replacements_made = 0
270
- unmatched_segments = []
271
-
272
- for i, segment in enumerate(red_segments):
273
- segment_text = segment['text'].strip()
274
- if not segment_text:
275
- continue
276
-
277
- print(f" Segment {i+1}: '{segment_text[:50]}...'")
278
-
279
- json_value = find_matching_json_value(segment_text, flat_json)
280
-
281
- if json_value is not None:
282
- replacement_text = get_value_as_string(json_value, segment_text)
283
-
284
- if isinstance(json_value, list) and len(json_value) > 1:
285
- replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
286
-
287
- success = replace_single_segment(segment, replacement_text)
288
- if success:
289
- replacements_made += 1
290
- print(f" βœ… Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")
291
- else:
292
- unmatched_segments.append(segment)
293
- print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")
294
-
295
- if unmatched_segments and replacements_made == 0:
296
- combined_text = " ".join(seg['text'] for seg in red_segments).strip()
297
- print(f" πŸ”„ Trying combined text match: '{combined_text[:50]}...'")
298
-
299
- json_value = find_matching_json_value(combined_text, flat_json)
300
- if json_value is not None:
301
- replacement_text = get_value_as_string(json_value, combined_text)
302
- if isinstance(json_value, list) and len(json_value) > 1:
303
- replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
304
-
305
- replacements_made = replace_all_red_segments(red_segments, replacement_text)
306
- print(f" βœ… Replaced combined text with '{replacement_text[:50]}...'")
307
-
308
  return replacements_made
309
 
310
- # 🎯 SURGICAL FIX 1: Handle Nature of Business multi-line red text
311
- def handle_nature_business_multiline_fix(cell, flat_json):
312
- """SURGICAL FIX: Handle multi-line red text in Nature of Business cells"""
313
  if not has_red_text(cell):
314
  return 0
315
 
316
- # Check if this cell contains "Nature of the Operators Business"
317
  cell_text = get_clean_text(cell).lower()
318
  if "nature of the operators business" not in cell_text and "nature of the operator business" not in cell_text:
319
  return 0
320
 
321
- print(f" 🎯 SURGICAL FIX: Nature of Business multi-line processing")
322
 
323
- # Look for sub-fields like "Accreditation Number:" and "Expiry Date:"
324
- red_segments = extract_red_text_segments(cell)
325
- replacements_made = 0
326
-
327
- # Try to replace each segment individually first
328
- for segment in red_segments:
329
- segment_text = segment['text'].strip()
330
- if not segment_text:
331
- continue
332
-
333
- json_value = find_matching_json_value(segment_text, flat_json)
334
- if json_value is not None:
335
- replacement_text = get_value_as_string(json_value, segment_text)
336
- success = replace_single_segment(segment, replacement_text)
337
- if success:
338
- replacements_made += 1
339
- print(f" βœ… Fixed segment: '{segment_text[:30]}...'")
340
-
341
- # If no individual matches, try combined approach
342
- if replacements_made == 0 and red_segments:
343
- combined_text = " ".join(seg['text'] for seg in red_segments).strip()
344
- json_value = find_matching_json_value(combined_text, flat_json)
345
- if json_value is not None:
346
- replacement_text = get_value_as_string(json_value, combined_text)
347
- replacements_made = replace_all_red_segments(red_segments, replacement_text)
348
- print(f" βœ… Fixed combined text")
349
 
350
- return replacements_made
351
 
352
- # 🎯 SURGICAL FIX 2: Handle Operator Declaration table
353
- def handle_operator_declaration_fix(table, flat_json):
354
- """SURGICAL FIX: Handle Operator Declaration Print Name and Position Title"""
355
  replacements_made = 0
356
 
357
- # Very specific detection: must have EXACTLY these headers
358
  for row_idx, row in enumerate(table.rows):
359
  if len(row.cells) >= 2:
360
  cell1_text = get_clean_text(row.cells[0]).strip()
361
  cell2_text = get_clean_text(row.cells[1]).strip()
362
 
363
- # VERY specific match for operator declaration table
364
- if ("print name" in cell1_text.lower() and "position title" in cell2_text.lower() and
365
- len(table.rows) <= 4): # Small table only
366
 
367
- print(f" 🎯 SURGICAL FIX: Operator Declaration table detected")
368
-
369
- # Look for the data row (should be next row)
370
  if row_idx + 1 < len(table.rows):
371
  data_row = table.rows[row_idx + 1]
372
  if len(data_row.cells) >= 2:
373
  name_cell = data_row.cells[0]
374
  position_cell = data_row.cells[1]
375
 
376
- # Fix Print Name (first column)
377
  if has_red_text(name_cell):
378
- red_text = ""
379
- for paragraph in name_cell.paragraphs:
380
- for run in paragraph.runs:
381
- if is_red(run):
382
- red_text += run.text
383
 
384
- if red_text.strip():
385
- json_value = find_matching_json_value(red_text.strip(), flat_json)
386
- if json_value is not None:
387
- replacement_text = get_value_as_string(json_value)
388
- cell_replacements = replace_red_text_in_cell(name_cell, replacement_text)
389
- replacements_made += cell_replacements
390
- print(f" βœ… Fixed Print Name: '{replacement_text}'")
391
 
392
- # Fix Position Title (second column)
393
  if has_red_text(position_cell):
394
- red_text = ""
395
- for paragraph in position_cell.paragraphs:
396
- for run in paragraph.runs:
397
- if is_red(run):
398
- red_text += run.text
399
 
400
- if red_text.strip():
401
- json_value = find_matching_json_value(red_text.strip(), flat_json)
402
- if json_value is not None:
403
- replacement_text = get_value_as_string(json_value)
404
- cell_replacements = replace_red_text_in_cell(position_cell, replacement_text)
405
- replacements_made += cell_replacements
406
- print(f" βœ… Fixed Position Title: '{replacement_text}'")
407
-
408
- break # Found the table, stop looking
409
-
410
- return replacements_made
411
-
412
- def handle_australian_company_number(row, company_numbers):
413
- """Your original function (unchanged)"""
414
- replacements_made = 0
415
- for i, digit in enumerate(company_numbers):
416
- cell_idx = i + 1
417
- if cell_idx < len(row.cells):
418
- cell = row.cells[cell_idx]
419
- if has_red_text(cell):
420
- cell_replacements = replace_red_text_in_cell(cell, str(digit))
421
- replacements_made += cell_replacements
422
- print(f" -> Placed digit '{digit}' in cell {cell_idx + 1}")
423
- return replacements_made
424
-
425
- def handle_vehicle_registration_table(table, flat_json):
426
- """Your original function (unchanged)"""
427
- replacements_made = 0
428
-
429
- # Try to find vehicle registration data
430
- vehicle_section = None
431
-
432
- for key, value in flat_json.items():
433
- if "vehicle registration numbers of records examined" in key.lower():
434
- if isinstance(value, dict):
435
- vehicle_section = value
436
- print(f" βœ… Found vehicle data in key: '{key}'")
437
- break
438
-
439
- if not vehicle_section:
440
- potential_columns = {}
441
- for key, value in flat_json.items():
442
- if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
443
- if "." in key:
444
- column_name = key.split(".")[-1]
445
- else:
446
- column_name = key
447
- potential_columns[column_name] = value
448
-
449
- if potential_columns:
450
- vehicle_section = potential_columns
451
- print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
452
- else:
453
- print(f" ❌ Vehicle registration data not found in JSON")
454
- return 0
455
-
456
- print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
457
-
458
- # Find header row
459
- header_row_idx = -1
460
- header_row = None
461
-
462
- for row_idx, row in enumerate(table.rows):
463
- row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
464
- if "registration" in row_text and "number" in row_text:
465
- header_row_idx = row_idx
466
- header_row = row
467
- break
468
-
469
- if header_row_idx == -1:
470
- print(f" ❌ Could not find header row in vehicle table")
471
- return 0
472
-
473
- print(f" βœ… Found header row at index {header_row_idx}")
474
-
475
- # Enhanced column mapping
476
- column_mapping = {}
477
- for col_idx, cell in enumerate(header_row.cells):
478
- header_text = get_clean_text(cell).strip()
479
- if not header_text or header_text.lower() == "no.":
480
- continue
481
-
482
- best_match = None
483
- best_score = 0
484
-
485
- normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
486
-
487
- for json_key in vehicle_section.keys():
488
- normalized_json = json_key.lower().strip()
489
-
490
- if normalized_header == normalized_json:
491
- best_match = json_key
492
- best_score = 1.0
493
- break
494
-
495
- header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
496
- json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
497
-
498
- if header_words and json_words:
499
- common_words = header_words.intersection(json_words)
500
- score = len(common_words) / max(len(header_words), len(json_words))
501
-
502
- if score > best_score and score >= 0.3:
503
- best_score = score
504
- best_match = json_key
505
-
506
- header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
507
- json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
508
-
509
- if header_clean in json_clean or json_clean in header_clean:
510
- if len(header_clean) > 5 and len(json_clean) > 5:
511
- substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
512
- if substring_score > best_score and substring_score >= 0.6:
513
- best_score = substring_score
514
- best_match = json_key
515
-
516
- if best_match:
517
- column_mapping[col_idx] = best_match
518
- print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
519
-
520
- if not column_mapping:
521
- print(f" ❌ No column mappings found")
522
- return 0
523
-
524
- # Determine data rows needed
525
- max_data_rows = 0
526
- for json_key, data in vehicle_section.items():
527
- if isinstance(data, list):
528
- max_data_rows = max(max_data_rows, len(data))
529
-
530
- print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
531
-
532
- # Process data rows
533
- for data_row_index in range(max_data_rows):
534
- table_row_idx = header_row_idx + 1 + data_row_index
535
-
536
- if table_row_idx >= len(table.rows):
537
- print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
538
- print(f" βž• Adding new row for vehicle {data_row_index + 1}")
539
-
540
- new_row = table.add_row()
541
- print(f" βœ… Successfully added row {len(table.rows)} to the table")
542
-
543
- row = table.rows[table_row_idx]
544
- print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
545
-
546
- for col_idx, json_key in column_mapping.items():
547
- if col_idx < len(row.cells):
548
- cell = row.cells[col_idx]
549
-
550
- column_data = vehicle_section.get(json_key, [])
551
- if isinstance(column_data, list) and data_row_index < len(column_data):
552
- replacement_value = str(column_data[data_row_index])
553
-
554
- cell_text = get_clean_text(cell)
555
- if has_red_text(cell) or not cell_text.strip():
556
- if not cell_text.strip():
557
- cell.text = replacement_value
558
- replacements_made += 1
559
- print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
560
- else:
561
- cell_replacements = replace_red_text_in_cell(cell, replacement_value)
562
- replacements_made += cell_replacements
563
- if cell_replacements > 0:
564
- print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")
565
-
566
- return replacements_made
567
-
568
- def handle_print_accreditation_section(table, flat_json):
569
- """Your original function (unchanged)"""
570
- replacements_made = 0
571
-
572
- print_data = flat_json.get("print accreditation name.print accreditation name", [])
573
- if not isinstance(print_data, list) or len(print_data) < 2:
574
- return 0
575
-
576
- name_value = print_data[0]
577
- position_value = print_data[1]
578
-
579
- print(f" πŸ“‹ Print accreditation data: Name='{name_value}', Position='{position_value}'")
580
-
581
- for row_idx, row in enumerate(table.rows):
582
- if len(row.cells) >= 2:
583
- cell1_text = get_clean_text(row.cells[0]).lower()
584
- cell2_text = get_clean_text(row.cells[1]).lower()
585
-
586
- if "print name" in cell1_text and "position title" in cell2_text:
587
- print(f" πŸ“ Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")
588
-
589
- if row_idx + 1 < len(table.rows):
590
- data_row = table.rows[row_idx + 1]
591
- if len(data_row.cells) >= 2:
592
- if has_red_text(data_row.cells[0]):
593
- cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
594
- replacements_made += cell_replacements
595
- if cell_replacements > 0:
596
- print(f" βœ… Replaced Print Name: '{name_value}'")
597
-
598
- if has_red_text(data_row.cells[1]):
599
- cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
600
- replacements_made += cell_replacements
601
- if cell_replacements > 0:
602
- print(f" βœ… Replaced Position Title: '{position_value}'")
603
 
604
  break
605
 
606
  return replacements_made
607
 
608
- def process_single_column_sections(cell, field_name, flat_json):
609
- """Your original function (unchanged)"""
610
- json_value = find_matching_json_value(field_name, flat_json)
611
- if json_value is not None:
612
- replacement_text = get_value_as_string(json_value, field_name)
613
- if isinstance(json_value, list) and len(json_value) > 1:
614
- replacement_text = "\n".join(str(item) for item in json_value)
615
- if has_red_text(cell):
616
- print(f" βœ… Replacing red text in single-column section: '{field_name}'")
617
- print(f" βœ… Replacement text:\n{replacement_text}")
618
- cell_replacements = replace_red_text_in_cell(cell, replacement_text)
619
- if cell_replacements > 0:
620
- print(f" -> Replaced with: '{replacement_text[:100]}...'")
621
- return cell_replacements
622
- return 0
623
-
624
  def process_tables(document, flat_json):
625
- """Your original function with minimal surgical fixes added"""
626
  replacements_made = 0
627
 
628
  for table_idx, table in enumerate(document.tables):
629
  print(f"\nπŸ” Processing table {table_idx + 1}:")
630
 
631
- # Your original logic
632
- table_text = ""
633
- for row in table.rows[:3]:
634
- for cell in row.cells:
635
- table_text += get_clean_text(cell).lower() + " "
636
-
637
- # Enhanced vehicle registration detection
638
- vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
639
- indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
640
- if indicator_count >= 2:
641
- print(f" πŸš— Detected Vehicle Registration table")
642
- vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
643
- replacements_made += vehicle_replacements
644
- continue
645
-
646
- # Enhanced print accreditation detection
647
- print_accreditation_indicators = ["print name", "position title"]
648
- indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
649
- if indicator_count >= 1:
650
- print(f" πŸ“‹ Detected Print Accreditation table")
651
- print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
652
- replacements_made += print_accreditation_replacements
653
- continue
654
 
655
- # Your existing row processing
656
  for row_idx, row in enumerate(table.rows):
657
  if len(row.cells) < 1:
658
  continue
@@ -665,261 +290,90 @@ def process_tables(document, flat_json):
665
 
666
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
667
 
 
 
 
 
 
 
 
668
  json_value = find_matching_json_value(key_text, flat_json)
669
 
670
  if json_value is not None:
671
  replacement_text = get_value_as_string(json_value, key_text)
672
 
673
- # Enhanced ACN handling
674
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
675
  cell_replacements = handle_australian_company_number(row, json_value)
676
  replacements_made += cell_replacements
677
-
678
- # Enhanced section header handling
679
- elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
680
- print(f" βœ… Section header detected, checking next row for content...")
681
- next_row = table.rows[row_idx + 1]
682
-
683
- for cell_idx, cell in enumerate(next_row.cells):
684
  if has_red_text(cell):
685
- print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
686
- if isinstance(json_value, list):
687
- replacement_text = "\n".join(str(item) for item in json_value)
688
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
689
  replacements_made += cell_replacements
690
  if cell_replacements > 0:
691
- print(f" -> Replaced section content with: '{replacement_text[:100]}...'")
692
-
693
- elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
694
- if has_red_text(key_cell):
695
- cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
696
- replacements_made += cell_replacements
697
- else:
698
- for cell_idx in range(1, len(row.cells)):
699
- value_cell = row.cells[cell_idx]
700
- if has_red_text(value_cell):
701
- print(f" βœ… Found red text in column {cell_idx + 1}")
702
- cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
703
- replacements_made += cell_replacements
704
  else:
705
- # Enhanced fallback processing for unmatched keys
706
- if len(row.cells) == 1 and has_red_text(key_cell):
707
- red_text = ""
708
- for paragraph in key_cell.paragraphs:
709
- for run in paragraph.runs:
710
- if is_red(run):
711
- red_text += run.text
712
- if red_text.strip():
713
- section_value = find_matching_json_value(red_text.strip(), flat_json)
714
- if section_value is not None:
715
- section_replacement = get_value_as_string(section_value, red_text.strip())
716
- cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
717
- replacements_made += cell_replacements
718
-
719
- # Enhanced red text processing for all cells
720
  for cell_idx in range(len(row.cells)):
721
  cell = row.cells[cell_idx]
722
  if has_red_text(cell):
723
- cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
724
- replacements_made += cell_replacements
 
 
 
 
725
 
726
- # 🎯 SURGICAL FIX 1: Only if no replacements were made
727
- if cell_replacements == 0:
728
- surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
729
- replacements_made += surgical_fix
730
-
731
- # 🎯 SURGICAL FIX 2: Handle Operator Declaration tables (only check last few tables)
732
- print(f"\n🎯 SURGICAL FIX: Checking for Operator Declaration tables...")
733
- for table in document.tables[-3:]: # Only check last 3 tables
734
- if len(table.rows) <= 4: # Only small tables
735
- declaration_fix = handle_operator_declaration_fix(table, flat_json)
736
- replacements_made += declaration_fix
737
 
738
  return replacements_made
739
 
740
  def process_paragraphs(document, flat_json):
741
- """Your original function (unchanged)"""
742
  replacements_made = 0
743
  print(f"\nπŸ” Processing paragraphs:")
744
 
745
  for para_idx, paragraph in enumerate(document.paragraphs):
746
- red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
747
- if red_runs:
748
- full_text = paragraph.text.strip()
749
- red_text_only = "".join(run.text for run in red_runs).strip()
750
- print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
751
-
752
- # Your existing matching logic
753
- json_value = find_matching_json_value(red_text_only, flat_json)
754
-
755
- if json_value is None:
756
- # Enhanced pattern matching for signatures and dates
757
- if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
758
- json_value = find_matching_json_value("auditor signature", flat_json)
759
- elif "OPERATOR SIGNATURE" in red_text_only.upper():
760
- json_value = find_matching_json_value("operator signature", flat_json)
761
-
762
- if json_value is not None:
763
- replacement_text = get_value_as_string(json_value)
764
- print(f" βœ… Replacing red text with: '{replacement_text}'")
765
- red_runs[0].text = replacement_text
766
- red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
767
- for run in red_runs[1:]:
768
- run.text = ''
769
- replacements_made += 1
770
-
771
- return replacements_made
772
-
773
- def process_headings(document, flat_json):
774
- """Your original function (unchanged)"""
775
- replacements_made = 0
776
- print(f"\nπŸ” Processing headings:")
777
-
778
- paragraphs = document.paragraphs
779
-
780
- for para_idx, paragraph in enumerate(paragraphs):
781
- paragraph_text = paragraph.text.strip()
782
 
783
- if not paragraph_text:
784
- continue
785
-
786
- # Enhanced heading detection
787
- matched_heading = None
788
- for category, patterns in HEADING_PATTERNS.items():
789
- for pattern in patterns:
790
- if re.search(pattern, paragraph_text, re.IGNORECASE):
791
- matched_heading = pattern
792
- break
793
- if matched_heading:
794
- break
795
 
796
- if matched_heading:
797
- print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
798
 
799
- # Check current heading paragraph
800
- if has_red_text_in_paragraph(paragraph):
801
- print(f" πŸ”΄ Found red text in heading itself")
802
- heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
803
- replacements_made += heading_replacements
804
 
805
- # Enhanced: Look further ahead for related content
806
- for next_para_offset in range(1, 6): # Extended range
807
- next_para_idx = para_idx + next_para_offset
808
- if next_para_idx >= len(paragraphs):
809
- break
810
-
811
- next_paragraph = paragraphs[next_para_idx]
812
- next_text = next_paragraph.text.strip()
813
-
814
- if not next_text:
815
- continue
816
-
817
- # Stop if we hit another heading
818
- is_another_heading = False
819
- for category, patterns in HEADING_PATTERNS.items():
820
- for pattern in patterns:
821
- if re.search(pattern, next_text, re.IGNORECASE):
822
- is_another_heading = True
823
- break
824
- if is_another_heading:
825
- break
826
-
827
- if is_another_heading:
828
- break
829
-
830
- # Process red text with enhanced context
831
- if has_red_text_in_paragraph(next_paragraph):
832
- print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")
833
-
834
- context_replacements = process_red_text_in_paragraph(
835
- next_paragraph,
836
- paragraph_text,
837
- flat_json
838
- )
839
- replacements_made += context_replacements
840
-
841
- return replacements_made
842
-
843
- def has_red_text_in_paragraph(paragraph):
844
- """Your original function (unchanged)"""
845
- for run in paragraph.runs:
846
- if is_red(run) and run.text.strip():
847
- return True
848
- return False
849
-
850
- def process_red_text_in_paragraph(paragraph, context_text, flat_json):
851
- """Your original function (unchanged)"""
852
- replacements_made = 0
853
-
854
- red_text_segments = []
855
- for run in paragraph.runs:
856
- if is_red(run) and run.text.strip():
857
- red_text_segments.append(run.text.strip())
858
-
859
- if not red_text_segments:
860
- return 0
861
-
862
- combined_red_text = " ".join(red_text_segments).strip()
863
- print(f" πŸ” Red text found: '{combined_red_text}'")
864
-
865
- json_value = None
866
-
867
- # Strategy 1: Direct matching
868
- json_value = find_matching_json_value(combined_red_text, flat_json)
869
-
870
- # Strategy 2: Enhanced context-based matching
871
- if json_value is None:
872
- if "NHVAS APPROVED AUDITOR" in context_text.upper():
873
- auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
874
- for field in auditor_fields:
875
- json_value = find_matching_json_value(field, flat_json)
876
- if json_value is not None:
877
- print(f" βœ… Found auditor match with field: '{field}'")
878
- break
879
-
880
- elif "OPERATOR DECLARATION" in context_text.upper():
881
- operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
882
- for field in operator_fields:
883
- json_value = find_matching_json_value(field, flat_json)
884
- if json_value is not None:
885
- print(f" βœ… Found operator match with field: '{field}'")
886
- break
887
-
888
- # Strategy 3: Enhanced context combination
889
- if json_value is None:
890
- context_queries = [
891
- f"{context_text} {combined_red_text}",
892
- combined_red_text,
893
- context_text
894
- ]
895
-
896
- for query in context_queries:
897
- json_value = find_matching_json_value(query, flat_json)
898
  if json_value is not None:
899
- print(f" βœ… Found match with combined query: '{query[:50]}...'")
900
- break
901
-
902
- # Replace if match found
903
- if json_value is not None:
904
- replacement_text = get_value_as_string(json_value, combined_red_text)
905
-
906
- red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
907
- if red_runs:
908
- red_runs[0].text = replacement_text
909
- red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
910
-
911
- for run in red_runs[1:]:
912
- run.text = ''
913
-
914
- replacements_made = 1
915
- print(f" βœ… Replaced with: '{replacement_text}'")
916
- else:
917
- print(f" ❌ No match found for red text: '{combined_red_text}'")
918
 
919
  return replacements_made
920
 
921
  def process_hf(json_file, docx_file, output_file):
922
- """Your original main function (unchanged)"""
923
  try:
924
  # Load JSON
925
  if hasattr(json_file, "read"):
@@ -928,7 +382,8 @@ def process_hf(json_file, docx_file, output_file):
928
  with open(json_file, 'r', encoding='utf-8') as f:
929
  json_data = json.load(f)
930
 
931
- flat_json = flatten_json(json_data)
 
932
  print("πŸ“„ Available JSON keys (sample):")
933
  for i, (key, value) in enumerate(sorted(flat_json.items())):
934
  if i < 10:
@@ -941,14 +396,13 @@ def process_hf(json_file, docx_file, output_file):
941
  else:
942
  doc = Document(docx_file)
943
 
944
- # Your original processing
945
- print("πŸš€ Starting processing with surgical fixes...")
946
 
947
  table_replacements = process_tables(doc, flat_json)
948
  paragraph_replacements = process_paragraphs(doc, flat_json)
949
- heading_replacements = process_headings(doc, flat_json)
950
 
951
- total_replacements = table_replacements + paragraph_replacements + heading_replacements
952
 
953
  # Save output
954
  if hasattr(output_file, "write"):
@@ -960,7 +414,6 @@ def process_hf(json_file, docx_file, output_file):
960
  print(f"βœ… Total replacements: {total_replacements}")
961
  print(f" πŸ“Š Tables: {table_replacements}")
962
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
963
- print(f" πŸ“‹ Headings: {heading_replacements}")
964
  print(f"πŸŽ‰ Processing complete!")
965
 
966
  except FileNotFoundError as e:
@@ -973,7 +426,7 @@ def process_hf(json_file, docx_file, output_file):
973
  if __name__ == "__main__":
974
  import sys
975
  if len(sys.argv) != 4:
976
- print("Usage: python pipeline.py <input_docx> <updated_json> <output_docx>")
977
  exit(1)
978
  docx_path = sys.argv[1]
979
  json_path = sys.argv[2]
 
3
  from docx.shared import RGBColor
4
  import re
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
def load_json(filepath):
    """Read the JSON document stored at ``filepath`` and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding; this matches how ``process_hf`` opens
    its JSON input.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)
9
 
10
def flatten_json_new_system(json_data):
    """Flatten a ``{schema: {field: values}}`` mapping into one lookup dict.

    For every field of every dict-valued schema the value is registered under:

    * the field name as written, and its lower-cased / stripped form;
    * ``"<schema>.<field>"`` as written and fully lower-cased;
    * a few fixed alias keys when the lower-cased field name contains one of
      the trigger phrases listed in ``alias_table``.

    A non-empty list holding exactly one element is unwrapped to that
    element; any other value (longer lists, empty lists, scalars) is stored
    unchanged. Top-level entries whose value is not a dict are ignored.
    Later fields overwrite earlier ones when they produce the same key.
    """
    # trigger phrase -> alias keys written when the phrase occurs in the field name
    alias_table = (
        ("print name", ("print name", "operator name", "name")),
        ("position title", ("position title", "position", "title")),
        ("accreditation number", ("accreditation number", "nhvas accreditation no")),
        ("expiry date", ("expiry date", "expiry")),
    )

    flat_json = {}

    for schema_name, schema_data in json_data.items():
        if not isinstance(schema_data, dict):
            continue

        for field_name, values in schema_data.items():
            # Single-element lists collapse to their only element.
            if isinstance(values, list) and len(values) == 1:
                value = values[0]
            else:
                value = values

            lowered = field_name.lower()

            # Plain field-name keys in several spellings.
            flat_json[field_name] = value
            flat_json[lowered] = value
            flat_json[lowered.strip()] = value

            # Schema-qualified keys.
            flat_json[f"{schema_name}.{field_name}"] = value
            flat_json[f"{schema_name.lower()}.{lowered}"] = value

            # Convenience aliases for common fields.
            for trigger, aliases in alias_table:
                if trigger in lowered:
                    for alias in aliases:
                        flat_json[alias] = value

    return flat_json
52
 
53
def is_red(run):
    """Tell whether ``run`` carries the red formatting used for placeholders.

    A run counts as red when its font colour is exactly RGB (255, 0, 0) or
    when the colour object exposes a ``theme_color`` equal to ``1``. If the
    run has no colour object, that falsy object is returned as-is.
    """
    font_color = run.font.color
    if not font_color:
        return font_color
    if font_color.rgb == RGBColor(255, 0, 0):
        return True
    # NOTE(review): theme index 1 may not denote a red theme colour - confirm
    # against the python-docx MSO_THEME_COLOR_INDEX enumeration.
    return getattr(font_color, "theme_color", None) == 1
57
 
58
  def get_value_as_string(value, field_name=""):
59
+ """Convert value to string, handling lists appropriately"""
60
  if isinstance(value, list):
61
  if len(value) == 0:
62
  return ""
 
64
  return str(value[0])
65
  else:
66
  if "australian company number" in field_name.lower() or "company number" in field_name.lower():
67
+ return value # Return as list for ACN processing
68
  else:
69
  return " ".join(str(v) for v in value)
70
  else:
71
  return str(value)
72
 
73
  def find_matching_json_value(field_name, flat_json):
74
+ """Enhanced matching for your new JSON structure"""
75
  field_name = field_name.strip()
76
 
77
+ # Direct match (exact)
78
  if field_name in flat_json:
79
  print(f" βœ… Direct match found for key '{field_name}'")
80
  return flat_json[field_name]
81
 
82
+ # Case-insensitive exact match
83
  for key, value in flat_json.items():
84
  if key.lower() == field_name.lower():
85
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
86
  return value
87
 
88
+ # Partial matching for common field names
89
+ field_lower = field_name.lower().strip()
90
+
91
+ # Handle common variations
92
+ if "print name" in field_lower:
93
+ for key in ["Print Name", "print name", "operator name", "name"]:
94
+ if key in flat_json:
95
+ print(f" βœ… Print name match: '{field_name}' -> '{key}'")
96
+ return flat_json[key]
97
+
98
+ if "position title" in field_lower:
99
+ for key in ["Position Title", "position title", "position", "title"]:
100
+ if key in flat_json:
101
+ print(f" βœ… Position title match: '{field_name}' -> '{key}'")
102
+ return flat_json[key]
103
+
104
+ if "accreditation number" in field_lower:
105
+ for key in flat_json.keys():
106
+ if "accreditation" in key.lower() and "number" in key.lower():
107
+ print(f" βœ… Accreditation number match: '{field_name}' -> '{key}'")
108
+ return flat_json[key]
109
+
110
+ if "expiry date" in field_lower:
111
+ for key in flat_json.keys():
112
+ if "expiry" in key.lower():
113
+ print(f" βœ… Expiry date match: '{field_name}' -> '{key}'")
114
+ return flat_json[key]
115
+
116
+ # Fuzzy matching
117
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
118
  if not field_words:
119
  return None
 
127
  if not key_words:
128
  continue
129
 
 
130
  common_words = field_words.intersection(key_words)
131
  if common_words:
 
132
  similarity = len(common_words) / len(field_words.union(key_words))
 
 
133
  coverage = len(common_words) / len(field_words)
134
  final_score = (similarity * 0.6) + (coverage * 0.4)
135
 
 
146
  return None
147
 
148
  def get_clean_text(cell):
149
+ """Extract clean text from cell"""
150
  text = ""
151
  for paragraph in cell.paragraphs:
152
  for run in paragraph.runs:
 
154
  return text.strip()
155
 
156
def has_red_text(cell):
    """Return True when any run in ``cell`` is red and has non-blank text."""
    return any(
        is_red(run) and run.text.strip()
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
def replace_red_text_in_cell(cell, replacement_text):
    """Swap the red placeholder text of ``cell`` for ``replacement_text``.

    All non-blank red runs in the cell are collected. The first one receives
    ``replacement_text`` and is recoloured black; every other collected run
    is emptied so that no fragment of the placeholder survives when the
    placeholder is split across several runs or paragraphs. This mirrors
    what ``process_paragraphs`` does for body paragraphs.

    Returns 1 when a replacement was written, otherwise 0.
    """
    red_runs = [
        run
        for paragraph in cell.paragraphs
        for run in paragraph.runs
        if is_red(run) and run.text.strip()
    ]
    if not red_runs:
        return 0

    first_run, *leftover_runs = red_runs
    first_run.text = replacement_text
    first_run.font.color.rgb = RGBColor(0, 0, 0)  # Change to black

    # Remove the remaining pieces of the placeholder.
    for run in leftover_runs:
        run.text = ''

    return 1
177
 
178
def handle_australian_company_number(row, company_numbers):
    """Write the ACN digits into the cells of ``row``, one digit per cell.

    Digit ``i`` (0-based) of ``company_numbers`` targets cell ``i + 1`` of the
    row, so the first cell is never written. A digit is written only when its
    target cell contains red text; digits that have no corresponding cell are
    dropped.

    Returns the total reported by ``replace_red_text_in_cell`` over all
    written cells.
    """
    # Read the cell sequence once instead of on every loop iteration.
    cells = row.cells
    cell_count = len(cells)

    replacements_made = 0
    for cell_idx, digit in enumerate(company_numbers, start=1):
        if cell_idx >= cell_count:
            # Indices only grow, so no later digit can fit either.
            break
        cell = cells[cell_idx]
        if has_red_text(cell):
            replacements_made += replace_red_text_in_cell(cell, str(digit))
            print(f" -> Placed digit '{digit}' in cell {cell_idx + 1}")
    return replacements_made
190
 
191
def handle_nature_business_section(cell, flat_json):
    """Fill the "Nature of the Operators Business" cell from ``flat_json``.

    Nothing is done (0 is returned) unless ``cell`` has red text and its
    text contains one of the two accepted heading spellings. The first key of
    ``flat_json`` containing either spelling supplies the replacement value.

    Returns the count reported by ``replace_red_text_in_cell``, or 0 when no
    replacement happened.
    """
    # Both spellings are accepted for the cell text and for the JSON key.
    heading_variants = (
        "nature of the operators business",
        "nature of the operator business",
    )

    if not has_red_text(cell):
        return 0

    cell_text = get_clean_text(cell).lower()
    if not any(variant in cell_text for variant in heading_variants):
        return 0

    print(" 🎯 Found Nature of Business section")

    # Check for business description
    for key, business_value in flat_json.items():
        key_lower = key.lower()
        if not any(variant in key_lower for variant in heading_variants):
            continue
        replacement_text = get_value_as_string(business_value)
        cell_replacements = replace_red_text_in_cell(cell, replacement_text)
        if cell_replacements > 0:
            print(" ✅ Updated business description")
            return cell_replacements

    return 0
213
 
214
def _fill_declaration_cell(cell, flat_json, label):
    """Fill one Operator Declaration data cell; return the replacement count.

    The schema-qualified key is tried before the bare ``label`` key, since
    the bare key can be overwritten by another schema during flattening.
    """
    if not has_red_text(cell):
        return 0

    qualified = f"Operator Declaration.{label}"
    candidate_keys = (qualified, qualified.lower(), label, label.lower())
    value = next((flat_json[key] for key in candidate_keys if key in flat_json), None)
    if not value:
        return 0

    text = get_value_as_string(value)
    count = replace_red_text_in_cell(cell, text)
    print(f" ✅ Updated {label}: '{text}'")
    return count


def handle_operator_declaration_table(table, flat_json):
    """Fill the Operator Declaration table (Print Name / Position Title).

    The table is scanned for a header row whose first cell contains
    "print name" and whose second cell contains "position title". The row
    directly below it is treated as the data row: red text in its first two
    cells is replaced with the matching values from ``flat_json``. Scanning
    stops at the first header row found.

    Returns the number of replacements made (0 when no header row exists).
    """
    replacements_made = 0
    rows = list(table.rows)

    for row_idx, row in enumerate(rows):
        header_cells = row.cells
        if len(header_cells) < 2:
            continue

        first_header = get_clean_text(header_cells[0]).strip().lower()
        second_header = get_clean_text(header_cells[1]).strip().lower()

        # Check if this is the Print Name / Position Title header row
        if "print name" not in first_header or "position title" not in second_header:
            continue

        print(" 🎯 Found Operator Declaration table")

        # Look for data row
        if row_idx + 1 < len(rows):
            data_cells = rows[row_idx + 1].cells
            if len(data_cells) >= 2:
                replacements_made += _fill_declaration_cell(
                    data_cells[0], flat_json, "Print Name"
                )
                replacements_made += _fill_declaration_cell(
                    data_cells[1], flat_json, "Position Title"
                )

        break

    return replacements_made
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  def process_tables(document, flat_json):
267
+ """Process all tables in document"""
268
  replacements_made = 0
269
 
270
  for table_idx, table in enumerate(document.tables):
271
  print(f"\nπŸ” Processing table {table_idx + 1}:")
272
 
273
+ # Check for Operator Declaration table first (priority fix)
274
+ if len(table.rows) <= 4: # Small tables
275
+ declaration_replacements = handle_operator_declaration_table(table, flat_json)
276
+ if declaration_replacements > 0:
277
+ replacements_made += declaration_replacements
278
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
+ # Process all rows
281
  for row_idx, row in enumerate(table.rows):
282
  if len(row.cells) < 1:
283
  continue
 
290
 
291
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
292
 
293
+ # Handle Nature of Business section
294
+ if "nature of the operators business" in key_text.lower():
295
+ nature_replacements = handle_nature_business_section(key_cell, flat_json)
296
+ replacements_made += nature_replacements
297
+ continue
298
+
299
+ # Regular field matching
300
  json_value = find_matching_json_value(key_text, flat_json)
301
 
302
  if json_value is not None:
303
  replacement_text = get_value_as_string(json_value, key_text)
304
 
305
+ # Handle Australian Company Number specially
306
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
307
  cell_replacements = handle_australian_company_number(row, json_value)
308
  replacements_made += cell_replacements
309
+ else:
310
+ # Handle regular fields
311
+ for cell_idx in range(len(row.cells)):
312
+ cell = row.cells[cell_idx]
 
 
 
313
  if has_red_text(cell):
 
 
 
314
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
315
  replacements_made += cell_replacements
316
  if cell_replacements > 0:
317
+ print(f" βœ… Updated cell {cell_idx + 1}: '{replacement_text}'")
 
 
 
 
 
 
 
 
 
 
 
 
318
  else:
319
+ # Process any red text in row cells
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  for cell_idx in range(len(row.cells)):
321
  cell = row.cells[cell_idx]
322
  if has_red_text(cell):
323
+ # Try to extract red text and match it
324
+ red_text = ""
325
+ for paragraph in cell.paragraphs:
326
+ for run in paragraph.runs:
327
+ if is_red(run):
328
+ red_text += run.text
329
 
330
+ if red_text.strip():
331
+ json_value = find_matching_json_value(red_text.strip(), flat_json)
332
+ if json_value is not None:
333
+ replacement_text = get_value_as_string(json_value)
334
+ cell_replacements = replace_red_text_in_cell(cell, replacement_text)
335
+ replacements_made += cell_replacements
336
+ if cell_replacements > 0:
337
+ print(f" βœ… Replaced red text: '{red_text.strip()}' -> '{replacement_text}'")
 
 
 
338
 
339
  return replacements_made
340
 
341
def process_paragraphs(document, flat_json):
    """Replace red placeholder text in the document's paragraphs.

    For each paragraph of ``document.paragraphs`` the non-blank red runs are
    joined into one lookup string. When ``find_matching_json_value`` finds a
    value for it, the first red run receives the replacement (recoloured
    black) and the other red runs are emptied.

    Returns the number of paragraphs in which a replacement was made.
    """
    replacements_made = 0
    print("\n🔍 Processing paragraphs:")

    for para_idx, paragraph in enumerate(document.paragraphs):
        red_runs = [
            run for run in paragraph.runs if is_red(run) and run.text.strip()
        ]
        red_text = "".join(run.text for run in red_runs).strip()
        if not red_text:
            continue

        print(f" 📌 Paragraph {para_idx + 1}: Found red text: '{red_text}'")

        json_value = find_matching_json_value(red_text, flat_json)
        if json_value is None:
            continue

        replacement_text = get_value_as_string(json_value)
        print(f" ✅ Replacing with: '{replacement_text}'")

        # Replace in first red run only, then clear other red runs.
        first_run, *other_runs = red_runs
        first_run.text = replacement_text
        first_run.font.color.rgb = RGBColor(0, 0, 0)
        for run in other_runs:
            run.text = ''
        replacements_made += 1

    return replacements_made
374
 
375
  def process_hf(json_file, docx_file, output_file):
376
+ """Main processing function compatible with your new system"""
377
  try:
378
  # Load JSON
379
  if hasattr(json_file, "read"):
 
382
  with open(json_file, 'r', encoding='utf-8') as f:
383
  json_data = json.load(f)
384
 
385
+ # Flatten your new JSON structure
386
+ flat_json = flatten_json_new_system(json_data)
387
  print("πŸ“„ Available JSON keys (sample):")
388
  for i, (key, value) in enumerate(sorted(flat_json.items())):
389
  if i < 10:
 
396
  else:
397
  doc = Document(docx_file)
398
 
399
+ # Process document
400
+ print("πŸš€ Starting processing compatible with your new system...")
401
 
402
  table_replacements = process_tables(doc, flat_json)
403
  paragraph_replacements = process_paragraphs(doc, flat_json)
 
404
 
405
+ total_replacements = table_replacements + paragraph_replacements
406
 
407
  # Save output
408
  if hasattr(output_file, "write"):
 
414
  print(f"βœ… Total replacements: {total_replacements}")
415
  print(f" πŸ“Š Tables: {table_replacements}")
416
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
 
417
  print(f"πŸŽ‰ Processing complete!")
418
 
419
  except FileNotFoundError as e:
 
426
  if __name__ == "__main__":
427
  import sys
428
  if len(sys.argv) != 4:
429
+ print("Usage: python compatible_pipeline.py <input_docx> <updated_json> <output_docx>")
430
  exit(1)
431
  docx_path = sys.argv[1]
432
  json_path = sys.argv[2]