Update updated_word.py
updated_word.py +0 -162
updated_word.py
CHANGED
@@ -1300,169 +1300,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
 
     return replacements_made
 
-def force_red_text_replacement(document, flat_json):
-    """Force replacement of any remaining red text by trying ALL JSON values"""
-    replacements_made = 0
-    print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
-
-    # Collect all possible replacement values from JSON
-    all_values = {}
-    for key, value in flat_json.items():
-        if value:
-            value_str = get_value_as_string(value, key)
-
-            if value_str and isinstance(value_str, str) and value_str.strip():
-                all_values[key] = value_str.strip()
-
-            # Store individual items from lists for partial matching
-            if isinstance(value, list):
-                for i, item in enumerate(value):
-                    item_str = str(item).strip() if item else ""
-                    if item_str:
-                        all_values[f"{key}_item_{i}"] = item_str
-
-    print(f" Found {len(all_values)} potential replacement values")
-
-    # Process all tables
-    for table_idx, table in enumerate(document.tables):
-        for row_idx, row in enumerate(table.rows):
-            for cell_idx, cell in enumerate(row.cells):
-                if has_red_text(cell):
-                    print(f" 🔍 Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")
-
-                    # Extract all red text from this cell
-                    red_text_parts = []
-                    for paragraph in cell.paragraphs:
-                        for run in paragraph.runs:
-                            if is_red(run) and run.text.strip():
-                                red_text_parts.append(run.text.strip())
-
-                    combined_red_text = " ".join(red_text_parts).strip()
-                    print(f" Red text: '{combined_red_text}'")
-
-                    # safety: when red text is very short, avoid replacing with very long multi-item values
-                    red_len_words = len(combined_red_text.split())
-
-                    # Find best match
-                    best_match = None
-                    best_key = None
-
-                    # Exact matching (prefer exact)
-                    for key, value in all_values.items():
-                        if combined_red_text.lower() == value.lower():
-                            best_match = value
-                            best_key = key
-                            break
-
-                    # Partial matching (skip aggressive short->long mapping)
-                    if not best_match:
-                        for key, value in all_values.items():
-                            # <<< PATCH: skip matching single-word red_text to multi-item candidate values
-                            if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
-                                continue
-                            if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
-                               (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
-                                best_match = value
-                                best_key = key
-                                break
-
-                    # Word-by-word matching for names/dates
-                    if not best_match:
-                        red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
-                        best_score = 0
-
-                        for key, value in all_values.items():
-                            # skip aggressive substitution for short red tokens vs long values
-                            if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
-                                continue
-                            value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
-                            if red_words and value_words:
-                                common_words = red_words.intersection(value_words)
-                                if common_words:
-                                    score = len(common_words) / len(red_words)
-                                    if score > best_score and score >= 0.5:  # At least 50% match
-                                        best_score = score
-                                        best_match = value
-                                        best_key = key
-
-                    # Replace if we found a match
-                    if best_match:
-                        print(f" ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
-                        cell_replacements = replace_red_text_in_cell(cell, best_match)
-                        replacements_made += cell_replacements
-                        print(f" Made {cell_replacements} replacements")
-                    else:
-                        print(f" ❌ No suitable replacement found")
-
-    # Process all paragraphs
-    for para_idx, paragraph in enumerate(document.paragraphs):
-        if has_red_text_in_paragraph(paragraph):
-            red_text_parts = []
-            for run in paragraph.runs:
-                if is_red(run) and run.text.strip():
-                    red_text_parts.append(run.text.strip())
-
-            combined_red_text = " ".join(red_text_parts).strip()
-            if combined_red_text:
-                print(f" 🔍 Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")
-
-                # Same matching logic as above
-                best_match = None
-                best_key = None
-
-                red_len_words = len(combined_red_text.split())
-
-                # Exact match
-                for key, value in all_values.items():
-                    if combined_red_text.lower() == value.lower():
-                        best_match = value
-                        best_key = key
-                        break
-
-                # Partial match
-                if not best_match:
-                    for key, value in all_values.items():
-                        if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
-                            continue
-                        if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
-                           (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
-                            best_match = value
-                            best_key = key
-                            break
-
-                # Word match
-                if not best_match:
-                    red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
-                    best_score = 0
-
-                    for key, value in all_values.items():
-                        if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
-                            continue
-                        value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
-                        if red_words and value_words:
-                            common_words = red_words.intersection(value_words)
-                            if common_words:
-                                score = len(common_words) / len(red_words)
-                                if score > best_score and score >= 0.5:
-                                    best_score = score
-                                    best_match = value
-                                    best_key = key
-
-                # Replace if found
-                if best_match:
-                    print(f" ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
-                    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
-                    if red_runs:
-                        red_runs[0].text = best_match
-                        red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
-                        for run in red_runs[1:]:
-                            run.text = ''
-                        replacements_made += 1
-                        print(f" Made 1 paragraph replacement")
-                else:
-                    print(f" ❌ No suitable replacement found")
-
-    return replacements_made
 
 def process_hf(json_file, docx_file, output_file):
     """Main processing function with comprehensive error handling"""
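
For context, the sketch below isolates the word-overlap scoring that the removed force_red_text_replacement used as its final matching pass (words longer than two characters, case-insensitive, accepted at a 50% overlap threshold). It is illustrative only: the helper name score_overlap is hypothetical and does not exist in updated_word.py.

# Hypothetical helper, not part of updated_word.py: reproduces the removed
# function's word-by-word matching pass in isolation.
def score_overlap(red_text: str, candidate: str) -> float:
    # Compare only words longer than two characters, case-insensitively.
    red_words = {w.lower() for w in red_text.split() if len(w) > 2}
    value_words = {w.lower() for w in candidate.split() if len(w) > 2}
    if not red_words or not value_words:
        return 0.0
    # Fraction of red-text words that also appear in the candidate value.
    return len(red_words & value_words) / len(red_words)

# A candidate was accepted only when it covered at least half of the red words.
print(score_overlap("John Smith", "Smith John Alexander"))  # 1.0 -> accepted
print(score_overlap("Date of birth", "Unrelated value"))    # 0.0 -> rejected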