Shami96 committed on
Commit
6aa8b72
·
verified ·
1 Parent(s): da7e8af

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +0 -162
updated_word.py CHANGED
@@ -1300,169 +1300,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
1300
 
1301
  return replacements_made
1302
 
1303
- def force_red_text_replacement(document, flat_json):
1304
- """Force replacement of any remaining red text by trying ALL JSON values"""
1305
- replacements_made = 0
1306
- print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
1307
-
1308
- # Collect all possible replacement values from JSON
1309
- all_values = {}
1310
- for key, value in flat_json.items():
1311
- if value:
1312
- value_str = get_value_as_string(value, key)
1313
-
1314
- if value_str and isinstance(value_str, str) and value_str.strip():
1315
- all_values[key] = value_str.strip()
1316
-
1317
- # Store individual items from lists for partial matching
1318
- if isinstance(value, list):
1319
- for i, item in enumerate(value):
1320
- item_str = str(item).strip() if item else ""
1321
- if item_str:
1322
- all_values[f"{key}_item_{i}"] = item_str
1323
-
1324
- print(f" Found {len(all_values)} potential replacement values")
1325
-
1326
- # Process all tables
1327
- for table_idx, table in enumerate(document.tables):
1328
- for row_idx, row in enumerate(table.rows):
1329
- for cell_idx, cell in enumerate(row.cells):
1330
- if has_red_text(cell):
1331
- print(f" 🔍 Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")
1332
-
1333
- # Extract all red text from this cell
1334
- red_text_parts = []
1335
- for paragraph in cell.paragraphs:
1336
- for run in paragraph.runs:
1337
- if is_red(run) and run.text.strip():
1338
- red_text_parts.append(run.text.strip())
1339
-
1340
- combined_red_text = " ".join(red_text_parts).strip()
1341
- print(f" Red text: '{combined_red_text}'")
1342
-
1343
- # safety: when red text is very short, avoid replacing with very long multi-item values
1344
- red_len_words = len(combined_red_text.split())
1345
-
1346
- # Find best match
1347
- best_match = None
1348
- best_key = None
1349
-
1350
- # Exact matching (prefer exact)
1351
- for key, value in all_values.items():
1352
- if combined_red_text.lower() == value.lower():
1353
- best_match = value
1354
- best_key = key
1355
- break
1356
 
1357
- # Partial matching (skip aggressive short->long mapping)
1358
- if not best_match:
1359
- for key, value in all_values.items():
1360
- # <<< PATCH: skip matching single-word red_text to multi-item candidate values
1361
- if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
1362
- continue
1363
- if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1364
- (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1365
- best_match = value
1366
- best_key = key
1367
- break
1368
-
1369
- # Word-by-word matching for names/dates
1370
- if not best_match:
1371
- red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1372
- best_score = 0
1373
-
1374
- for key, value in all_values.items():
1375
- # skip aggressive substitution for short red tokens vs long values
1376
- if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
1377
- continue
1378
- value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
1379
- if red_words and value_words:
1380
- common_words = red_words.intersection(value_words)
1381
- if common_words:
1382
- score = len(common_words) / len(red_words)
1383
- if score > best_score and score >= 0.5: # At least 50% match
1384
- best_score = score
1385
- best_match = value
1386
- best_key = key
1387
-
1388
- # Replace if we found a match
1389
- if best_match:
1390
- print(f" ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
1391
- cell_replacements = replace_red_text_in_cell(cell, best_match)
1392
- replacements_made += cell_replacements
1393
- print(f" Made {cell_replacements} replacements")
1394
- else:
1395
- print(f" ❌ No suitable replacement found")
1396
-
1397
- # Process all paragraphs
1398
- for para_idx, paragraph in enumerate(document.paragraphs):
1399
- if has_red_text_in_paragraph(paragraph):
1400
- red_text_parts = []
1401
- for run in paragraph.runs:
1402
- if is_red(run) and run.text.strip():
1403
- red_text_parts.append(run.text.strip())
1404
-
1405
- combined_red_text = " ".join(red_text_parts).strip()
1406
- if combined_red_text:
1407
- print(f" 🔍 Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")
1408
-
1409
- # Same matching logic as above
1410
- best_match = None
1411
- best_key = None
1412
-
1413
- red_len_words = len(combined_red_text.split())
1414
-
1415
- # Exact match
1416
- for key, value in all_values.items():
1417
- if combined_red_text.lower() == value.lower():
1418
- best_match = value
1419
- best_key = key
1420
- break
1421
-
1422
- # Partial match
1423
- if not best_match:
1424
- for key, value in all_values.items():
1425
- if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
1426
- continue
1427
- if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
1428
- (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
1429
- best_match = value
1430
- best_key = key
1431
- break
1432
-
1433
- # Word match
1434
- if not best_match:
1435
- red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
1436
- best_score = 0
1437
-
1438
- for key, value in all_values.items():
1439
- if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
1440
- continue
1441
- value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
1442
- if red_words and value_words:
1443
- common_words = red_words.intersection(value_words)
1444
- if common_words:
1445
- score = len(common_words) / len(red_words)
1446
- if score > best_score and score >= 0.5:
1447
- best_score = score
1448
- best_match = value
1449
- best_key = key
1450
-
1451
- # Replace if found
1452
- if best_match:
1453
- print(f" ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
1454
- red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
1455
- if red_runs:
1456
- red_runs[0].text = best_match
1457
- red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
1458
- for run in red_runs[1:]:
1459
- run.text = ''
1460
- replacements_made += 1
1461
- print(f" Made 1 paragraph replacement")
1462
- else:
1463
- print(f" ❌ No suitable replacement found")
1464
-
1465
- return replacements_made
1466
 
1467
  def process_hf(json_file, docx_file, output_file):
1468
  """Main processing function with comprehensive error handling"""
 
1300
 
1301
  return replacements_made
1302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1304
 
1305
  def process_hf(json_file, docx_file, output_file):
1306
  """Main processing function with comprehensive error handling"""