Shami96 committed on
Commit
5b2b3a8
·
verified ·
1 Parent(s): 25603c9

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +345 -114
updated_word.py CHANGED
@@ -37,104 +37,71 @@ def get_value_as_string(value, field_name=""):
37
  return str(value)
38
 
39
  def find_matching_json_value(field_name, flat_json):
40
- """Find matching JSON value based on field name (key)"""
41
  field_name = field_name.strip()
42
 
43
- # Manual mapping for specific sections that need special handling
44
- manual_mappings = {
45
- "attendance list name and position title": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
46
- "attendance list (names and position titles)": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
47
- "nature of the operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
48
- "nature of the operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
49
- "nature of operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
50
- "nature of operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
51
- # Paragraph-level mappings
52
- "mass management": "paragraphs.MASS MANAGEMENT",
53
- "liam herbig": "paragraphs.MASS MANAGEMENT", # Name should be replaced with company name
54
- "date": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
55
- # Date-related mappings
56
- "13.11.2024": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
57
- "auditor signature": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
58
- "operator signature": "paragraphs.I hereby consent to information relating to my Accreditation to be shared with other law enforcement agencies, including a service provider authorised under the Heavy Vehicle National Law.",
59
- # Specific data mappings
60
- "jodie jones": "Audit Information.Auditor name",
61
- "13th november 2024": "Audit Information.Date of Audit",
62
- "adelaide barossa transport & warehousing pty ltd": "Operator Information.Operator name (Legal entity)",
63
- "manager": "Operator Information.Operator name (Legal entity)", # Replace manager title with company name
64
- "liam herbig –manager": "Operator Information.Operator name (Legal entity)",
65
- "liam herbig – manager": "Operator Information.Operator name (Legal entity)",
66
- "deborah herbig – manager": "Operator Information.Operator name (Legal entity)",
67
- # Contact information mappings (old data in red text -> new data from JSON)
68
- "141 sitz road callington sa 5254": "Operator Information.Operator business address", # Replace old address with new
69
- "po box 743 mt barker sa": "Operator Information.Operator Postal address", # Replace old postal with new
70
- "debherbig@bigpond.com": "Operator Information.Email address", # Replace old email with new
71
- "0447 710 602": "Operator Information.Operator Telephone Number", # Replace old phone with new
72
- # Manual/Version mappings (old version -> new version)
73
- "mahlo 092021v1": "Operator Information.NHVAS Manual (Policies and Procedures) developed by", # Replace old manual with new
74
- # These should stay as they are (no replacement needed, just different format)
75
- "511840": "Operator Information.NHVAS Accreditation No. (If applicable)", # Keep accreditation number
76
- "26th october 2023": "Audit Information.Date of Audit", # Use audit date instead
77
- # Std 5 and Std 6 mappings
78
- "the latest verification was dated 23rdnovember 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
79
- "the latest verification was dated 23rd november 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
80
- "internal review was dated 23rd august 2023 with 0 ncr": "Mass Management Summary of Audit findings.Std 6. Internal Review",
81
- "23rd august2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
82
- "23rd august 2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
83
- }
84
-
85
- # Check manual mappings first
86
- normalized_field = field_name.lower().strip()
87
- if normalized_field in manual_mappings:
88
- mapped_key = manual_mappings[normalized_field]
89
- if mapped_key in flat_json:
90
- print(f" ✅ Manual mapping found for '{field_name}' -> '{mapped_key}'")
91
- return flat_json[mapped_key]
92
-
93
  # Try exact match first
94
  if field_name in flat_json:
95
- print(f" Direct match found for key '{field_name}'")
96
  return flat_json[field_name]
97
 
98
  # Try case-insensitive exact match
99
  for key, value in flat_json.items():
100
  if key.lower() == field_name.lower():
101
- print(f" Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
102
  return value
103
 
104
- # Try to find a key that ends with this field name
105
  for key, value in flat_json.items():
106
- if key.endswith('.' + field_name):
107
- print(f" Suffix match found for key '{field_name}' with JSON key '{key}'")
108
  return value
109
 
110
- # Try partial matching for fields with parentheses or additional text
111
- clean_field = re.sub(r'\s*\([^)]*\)', '', field_name).strip() # Remove parentheses content
 
 
112
  for key, value in flat_json.items():
113
- clean_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
114
- if clean_field.lower() == clean_key.lower():
115
- print(f" Clean match found for key '{field_name}' with JSON key '{key}'")
 
 
116
  return value
117
 
118
- # Try word-based matching - more flexible approach
119
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
 
 
 
120
  best_match = None
121
  best_score = 0
 
122
 
123
  for key, value in flat_json.items():
124
  key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
125
- # Calculate how many words match
 
 
 
126
  common_words = field_words.intersection(key_words)
127
  if common_words:
128
- score = len(common_words) / max(len(field_words), len(key_words)) # Normalized score
129
- if score > best_score:
130
- best_score = score
131
- best_match = (key, value)
 
 
 
 
 
 
 
132
 
133
- if best_match and best_score >= 0.5: # At least 50% word overlap
134
- print(f" Word-based match found for key '{field_name}' with JSON key '{best_match[0]}' (score: {best_score:.2f})")
135
- return best_match[1]
136
 
137
- # No match found
138
  print(f" ❌ No match found for '{field_name}'")
139
  return None
140
 
@@ -152,38 +119,291 @@ def has_red_text(cell):
152
  return True
153
  return False
154
 
155
- def replace_red_text_in_cell(cell, replacement_text):
156
- replacements_made = 0
 
157
 
158
- # First, collect all red text to show what we're replacing
159
- all_red_text = ""
160
- for paragraph in cell.paragraphs:
161
- for run in paragraph.runs:
 
162
  if is_red(run):
163
- all_red_text += run.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- if all_red_text.strip():
166
- print(f" ✅ Replacing red text: '{all_red_text[:50]}...' → '{replacement_text[:50]}...'")
167
 
168
- # Now replace all red text in the cell with the replacement text
169
- first_replacement_done = False
170
- for paragraph in cell.paragraphs:
171
- red_runs = [run for run in paragraph.runs if is_red(run)]
172
- if red_runs:
173
- if not first_replacement_done:
174
- # Replace the first red run with our text
175
- red_runs[0].text = replacement_text
176
- red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
177
- first_replacement_done = True
178
- replacements_made = 1
179
- else:
180
- # Clear the first red run since we already replaced content
181
- red_runs[0].text = ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- # Clear all other red runs in this paragraph
184
- for run in red_runs[1:]:
185
  run.text = ''
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  return replacements_made
188
 
189
  def handle_australian_company_number(row, company_numbers):
@@ -568,27 +788,41 @@ def process_paragraphs(document, flat_json):
568
  replacements_made += 1
569
  return replacements_made
570
 
571
- def main(json_path, docx_path, output_path):
572
-
 
 
 
573
  try:
574
- json_data = load_json(json_path)
 
 
 
 
 
575
  flat_json = flatten_json(json_data)
576
  print("📄 Available JSON keys (sample):")
577
- count = 0
578
- for key, value in sorted(flat_json.items()):
579
- if count < 10:
580
  print(f" - {key}: {value}")
581
- count += 1
582
- print(f" ... and {len(flat_json) - count} more keys\n")
583
 
584
- doc = Document(docx_path)
 
 
 
 
585
 
586
  table_replacements = process_tables(doc, flat_json)
587
  paragraph_replacements = process_paragraphs(doc, flat_json)
588
  total_replacements = table_replacements + paragraph_replacements
589
 
590
- doc.save(output_path)
591
- print(f"\n✅ Document saved as: {output_path}")
 
 
 
 
592
  print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")
593
 
594
  except FileNotFoundError as e:
@@ -600,10 +834,7 @@ def main(json_path, docx_path, output_path):
600
 
601
  if __name__ == "__main__":
602
  import sys
603
- if len(sys.argv) != 4:
604
- print("Usage: python updated_word.py <input_docx> <updated_json> <output_docx>")
605
- exit(1)
606
- docx_path = sys.argv[1]
607
- json_path = sys.argv[2]
608
- output_path = sys.argv[3]
609
- main(json_path, docx_path, output_path)
 
37
  return str(value)
38
 
39
def find_matching_json_value(field_name, flat_json):
    """Resolve *field_name* to a value in *flat_json* without manual mappings.

    Tries a cascade of increasingly fuzzy strategies, in order:
      1. exact key match,
      2. case-insensitive key match,
      3. suffix match on flattened nested keys ("Section.Field"),
      4. punctuation-insensitive comparison,
      5. word-overlap fuzzy scoring (Jaccard similarity blended with coverage).

    Args:
        field_name: label text extracted from the document.
        flat_json: flattened {dotted-key: value} mapping.

    Returns:
        The matched JSON value, or None when no strategy scores a match.
    """
    field_name = field_name.strip()

    # 1. Exact key match.
    if field_name in flat_json:
        print(f" Direct match found for key '{field_name}'")
        return flat_json[field_name]

    # 2. Case-insensitive exact match.
    for key, value in flat_json.items():
        if key.lower() == field_name.lower():
            print(f" Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
            return value

    # 3. Suffix matching (for nested keys like "section.field").
    for key, value in flat_json.items():
        if '.' in key and key.split('.')[-1].lower() == field_name.lower():
            print(f" Suffix match found for key '{field_name}' with JSON key '{key}'")
            return value

    # 4. Partial matching - strip punctuation/special chars, collapse spaces.
    clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
    clean_field = re.sub(r'\s+', ' ', clean_field)

    for key, value in flat_json.items():
        clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
        clean_key = re.sub(r'\s+', ' ', clean_key)

        if clean_field == clean_key:
            print(f" ✅ Clean match found for key '{field_name}' with JSON key '{key}'")
            return value

    # 5. Word-based fuzzy matching (only words longer than 2 chars count).
    field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
    if not field_words:
        return None

    best_value = None
    best_score = 0.0
    best_key = None

    for key, value in flat_json.items():
        key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
        if not key_words:
            continue

        common_words = field_words.intersection(key_words)
        if common_words:
            # Jaccard similarity (intersection / union) blended with how much
            # of the field's own vocabulary is covered by the key.
            similarity = len(common_words) / len(field_words.union(key_words))
            coverage = len(common_words) / len(field_words)
            final_score = (similarity * 0.6) + (coverage * 0.4)

            if final_score > best_score:
                best_score = final_score
                best_value = value
                best_key = key

    # BUG FIX: test the tracked key, not the value's truthiness — otherwise a
    # fuzzy match whose JSON value is falsy ("", 0, []) was silently dropped.
    if best_key is not None and best_score >= 0.3:  # lowered threshold for more matches
        print(f" Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
        return best_value

    print(f" ❌ No match found for '{field_name}'")
    return None
107
 
 
119
  return True
120
  return False
121
 
122
def extract_red_text_segments(cell):
    """Collect contiguous runs of red text from *cell*, paragraph by paragraph.

    Returns a list of segment dicts, each with:
      'text'          - concatenated text of the red runs in the segment,
      'runs'          - list of (paragraph_idx, run_idx, run) tuples,
      'paragraph_idx' - index of the paragraph the segment belongs to.

    A red run with empty text still joins the segment's run list so that it
    can be rewritten/cleared during replacement.
    """
    segments = []

    for p_idx, para in enumerate(cell.paragraphs):
        text_parts = []
        run_refs = []

        for r_idx, run in enumerate(para.runs):
            if is_red(run):
                if run.text:
                    text_parts.append(run.text)
                run_refs.append((p_idx, r_idx, run))
            else:
                # A non-red run terminates any open segment.
                if run_refs:
                    segments.append({
                        'text': ''.join(text_parts),
                        'runs': list(run_refs),
                        'paragraph_idx': p_idx,
                    })
                text_parts = []
                run_refs = []

        # Flush a segment that reaches the end of the paragraph.
        if run_refs:
            segments.append({
                'text': ''.join(text_parts),
                'runs': list(run_refs),
                'paragraph_idx': p_idx,
            })

    return segments
155
+
156
def replace_red_text_in_cell(cell, replacement_text):
    """Replace every red text segment in *cell* with *replacement_text*.

    Args:
        cell: a python-docx table cell.
        replacement_text: the text to write in place of the red content.

    Returns:
        The number of replacements made (0 when the cell has no red text).

    Note:
        The previous version contained a dead ``len(red_segments) > 1`` branch
        whose loop only ``pass``-ed and whose counter never changed, so both
        paths always ended in the same ``replace_all_red_segments`` call.
        Per-segment matching lives in ``handle_multiple_red_segments_in_cell``,
        which has access to the JSON data this function does not.
    """
    red_segments = extract_red_text_segments(cell)

    if not red_segments:
        return 0

    # Replace all red text with the single replacement value.
    return replace_all_red_segments(red_segments, replacement_text)
179
+
180
def replace_all_red_segments(red_segments, replacement_text):
    """Replace all red segments with the replacement text.

    The first run of the first segment receives the first line of
    *replacement_text* (recolored black); every other red run is emptied.
    Extra lines are appended with ``w:br`` breaks when possible, otherwise
    joined with spaces.  Returns 1 when a replacement was written, else 0.
    """
    if not red_segments:
        return 0

    # Handle multi-line replacement text: split so each line can be laid out.
    if '\n' in replacement_text:
        replacement_lines = replacement_text.split('\n')
    else:
        replacement_lines = [replacement_text]

    replacements_made = 0

    # Replace first segment with first line
    if red_segments and replacement_lines:
        first_segment = red_segments[0]
        if first_segment['runs']:
            first_run = first_segment['runs'][0][2]  # (para_idx, run_idx, run)
            first_run.text = replacement_lines[0]
            first_run.font.color.rgb = RGBColor(0, 0, 0)  # recolor from red to black
            replacements_made = 1

            # Clear other runs in first segment
            for _, _, run in first_segment['runs'][1:]:
                run.text = ''

    # Clear all other red segments
    for segment in red_segments[1:]:
        for _, _, run in segment['runs']:
            run.text = ''

    # If we have multiple lines, add them to the same paragraph or create new runs
    if len(replacement_lines) > 1 and red_segments:
        try:
            # Get the paragraph that contains the first run
            first_run = red_segments[0]['runs'][0][2]
            paragraph = first_run.element.getparent()  # Get the paragraph element
            # NOTE(review): getparent() yields a raw lxml element, which has no
            # add_run(); if that (or .element) raises, the bare except below
            # silently falls back to space-joining — confirm this is intended.

            # Add remaining lines as new runs in the same paragraph with line breaks
            for line in replacement_lines[1:]:
                if line.strip():  # Only add non-empty lines
                    # Add a line break run
                    from docx.oxml import OxmlElement, ns
                    br = OxmlElement('w:br')
                    first_run.element.append(br)

                    # Add the text as a new run
                    new_run = paragraph.add_run(line.strip())
                    new_run.font.color.rgb = RGBColor(0, 0, 0)
        except:
            # If we can't add line breaks, just put everything in the first run
            if red_segments and red_segments[0]['runs']:
                first_run = red_segments[0]['runs'][0][2]
                # Join all lines with spaces instead of line breaks
                first_run.text = ' '.join(replacement_lines)
                first_run.font.color.rgb = RGBColor(0, 0, 0)

    return replacements_made
238
+
239
def handle_multiple_red_segments_in_cell(cell, flat_json):
    """Handle cells with multiple red text segments dynamically.

    First tries to match each red segment against *flat_json* individually;
    if no segment matched, retries with all segments joined into one query.
    Returns the number of replacements made.
    """
    red_segments = extract_red_text_segments(cell)

    if not red_segments:
        return 0

    print(f" 🔍 Found {len(red_segments)} red text segments in cell")
    replacements_made = 0
    unmatched_segments = []

    # Try to match each segment individually
    for i, segment in enumerate(red_segments):
        segment_text = segment['text'].strip()
        if not segment_text:
            continue

        print(f" Segment {i+1}: '{segment_text[:50]}...'")

        # Find JSON match for this segment
        json_value = find_matching_json_value(segment_text, flat_json)

        if json_value is not None:
            replacement_text = get_value_as_string(json_value, segment_text)

            # Multi-item lists become one line per non-blank item.
            if isinstance(json_value, list) and len(json_value) > 1:
                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())

            success = replace_single_segment(segment, replacement_text)
            if success:
                replacements_made += 1
                print(f" ✅ Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")
        else:
            unmatched_segments.append(segment)
            print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")

    # Fallback: if nothing matched individually, try the combined text of ALL
    # segments (not just the unmatched ones) as a single lookup.
    if unmatched_segments and replacements_made == 0:
        combined_text = " ".join(seg['text'] for seg in red_segments).strip()
        print(f" 🔄 Trying combined text match: '{combined_text[:50]}...'")

        json_value = find_matching_json_value(combined_text, flat_json)
        if json_value is not None:
            replacement_text = get_value_as_string(json_value, combined_text)
            if isinstance(json_value, list) and len(json_value) > 1:
                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())

            # Replace all segments with the combined replacement
            replacements_made = replace_all_red_segments(red_segments, replacement_text)
            print(f" ✅ Replaced combined text with '{replacement_text[:50]}...'")

    return replacements_made
292
+
293
def replace_single_segment(segment, replacement_text):
    """Write *replacement_text* into one red segment.

    The segment's first underlying run receives the new text and is recolored
    black; every remaining run in the segment is emptied.  Returns True when a
    replacement happened, False for a segment without runs.
    """
    runs = segment['runs']
    if not runs:
        return False

    # Entries are (para_idx, run_idx, run) tuples — the run object is last.
    target = runs[0][2]
    target.text = replacement_text
    target.font.color.rgb = RGBColor(0, 0, 0)

    # Blank out the rest of the segment so no stale red text survives.
    for entry in runs[1:]:
        entry[2].text = ''

    return True
308
+
309
def process_tables(document, flat_json):
    """Walk every table in *document*, replacing red placeholder text.

    Special-cased table types (vehicle registration, print accreditation) are
    routed to their dedicated handlers; everything else is treated as a
    key/value table whose red cells are matched against *flat_json*.
    Returns the total number of replacements made.
    """
    total = 0

    for idx, table in enumerate(document.tables, start=1):
        print(f"\n🔍 Processing table {idx}:")

        # Content-based classification of the table.
        kind = detect_table_type(table)
        print(f" 📋 Detected table type: {kind}")

        if kind == "vehicle_registration":
            total += handle_vehicle_registration_table(table, flat_json)
            continue
        if kind == "print_accreditation":
            total += handle_print_accreditation_section(table, flat_json)
            continue

        # Generic key/value table: scan each cell for red text.
        for row in table.rows:
            if not row.cells:
                continue

            for cell in row.cells:
                if not has_red_text(cell):
                    continue

                made = handle_multiple_red_segments_in_cell(cell, flat_json)
                total += made

                # Fall back to label-based context matching when nothing hit.
                if made == 0:
                    total += try_context_based_replacement(cell, row, table, flat_json)

    return total
346
+
347
def detect_table_type(table):
    """Classify a table by scanning the text of its first three rows.

    Returns one of "vehicle_registration", "print_accreditation" or
    "key_value" depending on which indicator phrases appear in the sample.
    """
    # Build a lowercase text sample from the leading rows.
    pieces = []
    for row in table.rows[:3]:
        pieces.extend(get_clean_text(cell).lower() + " " for cell in row.cells)
    sample = "".join(pieces)

    # Count indicator phrases for each known table flavor.
    vehicle_hits = sum(
        marker in sample
        for marker in ("registration number", "sub-contractor",
                       "weight verification", "rfs suspension")
    )
    print_hits = sum(
        marker in sample for marker in ("print name", "position title")
    )

    if vehicle_hits >= 3:
        return "vehicle_registration"
    if print_hits >= 2:
        return "print_accreditation"
    return "key_value"
369
+
370
def try_context_based_replacement(cell, row, table, flat_json):
    """Try to find replacement using context from surrounding cells.

    Uses the row's first (non-red) cell as a label and queries *flat_json*
    with "label + red text", the label alone, then the red text alone.
    Returns the number of replacements made.  The *table* parameter is
    currently unused by the visible logic.
    """
    replacements_made = 0

    # Get context from row headers/labels
    row_context = ""
    if len(row.cells) > 1:
        # First cell might be a label
        first_cell_text = get_clean_text(row.cells[0]).strip()
        if first_cell_text and not has_red_text(row.cells[0]):
            row_context = first_cell_text

    # Get red text from the cell
    red_segments = extract_red_text_segments(cell)
    for segment in red_segments:
        red_text = segment['text'].strip()
        if not red_text:
            continue

        # Try combining context with red text
        # NOTE(review): segments are only tried when a row label exists —
        # cells with no usable label are skipped entirely; confirm intended.
        if row_context:
            context_queries = [
                f"{row_context} {red_text}",
                f"{row_context}",
                red_text
            ]

            for query in context_queries:
                json_value = find_matching_json_value(query, flat_json)
                if json_value is not None:
                    replacement_text = get_value_as_string(json_value, query)
                    success = replace_single_segment(segment, replacement_text)
                    if success:
                        replacements_made += 1
                        print(f" ✅ Context-based replacement: '{query}' -> '{replacement_text[:30]}...'")
                    # Stop at the first query that resolves to a JSON value.
                    break

    return replacements_made
408
 
409
  def handle_australian_company_number(row, company_numbers):
 
788
  replacements_made += 1
789
  return replacements_made
790
 
791
+ def process_hf(json_file, docx_file, output_file):
792
+ """
793
+ Accepts file-like objects or file paths.
794
+ For Hugging Face: json_file, docx_file, output_file will be file-like objects.
795
+ """
796
  try:
797
+ # --- Load JSON (file or file-like) ---
798
+ if hasattr(json_file, "read"):
799
+ json_data = json.load(json_file)
800
+ else:
801
+ with open(json_file, 'r', encoding='utf-8') as f:
802
+ json_data = json.load(f)
803
  flat_json = flatten_json(json_data)
804
  print("📄 Available JSON keys (sample):")
805
+ for i, (key, value) in enumerate(sorted(flat_json.items())):
806
+ if i < 10:
 
807
  print(f" - {key}: {value}")
808
+ print(f" ... and {len(flat_json) - 10} more keys\n")
 
809
 
810
+ # --- Load DOCX (file or file-like) ---
811
+ if hasattr(docx_file, "read"):
812
+ doc = Document(docx_file)
813
+ else:
814
+ doc = Document(docx_file)
815
 
816
  table_replacements = process_tables(doc, flat_json)
817
  paragraph_replacements = process_paragraphs(doc, flat_json)
818
  total_replacements = table_replacements + paragraph_replacements
819
 
820
+ # --- Save DOCX output (file or file-like) ---
821
+ if hasattr(output_file, "write"):
822
+ doc.save(output_file)
823
+ else:
824
+ doc.save(output_file)
825
+ print(f"\n✅ Document saved as: {output_file}")
826
  print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")
827
 
828
  except FileNotFoundError as e:
 
834
 
835
# Script entry point: expects exactly three CLI arguments
# (input JSON path, input DOCX path, output DOCX path).
if __name__ == "__main__":
    import sys
    if len(sys.argv) == 4:
        # process_hf accepts either file paths or file-like objects;
        # from the CLI we pass plain paths.
        process_hf(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        print("Usage: python updated_word.py <input_json> <input_docx> <output_docx>")