Spaces:
Running
Running
Update updated_word.py
Browse files- updated_word.py +107 -31
updated_word.py
CHANGED
|
@@ -878,6 +878,7 @@ def handle_management_summary_fix(cell, flat_json):
|
|
| 878 |
def handle_operator_declaration_fix(table, flat_json):
|
| 879 |
"""Wrapper for small declaration tables. Delegate to canonical fix first.
|
| 880 |
If canonical did not change anything, fall back to the small-table auditor handling.
|
|
|
|
| 881 |
"""
|
| 882 |
replacements_made = 0
|
| 883 |
|
|
@@ -886,7 +887,7 @@ def handle_operator_declaration_fix(table, flat_json):
|
|
| 886 |
print(f" ⏭️ Skipping - Operator Declaration table already processed")
|
| 887 |
return 0
|
| 888 |
|
| 889 |
-
# only intended for small tables; if large, skip
|
| 890 |
if len(table.rows) > 4:
|
| 891 |
return 0
|
| 892 |
|
|
@@ -897,46 +898,121 @@ def handle_operator_declaration_fix(table, flat_json):
|
|
| 897 |
# canonical handled it and set the processed flag
|
| 898 |
return replacements_made
|
| 899 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
# fallback: original small-table behaviour (auditor declaration etc.)
|
| 901 |
-
# (This mirrors your earlier auditor-specific logic but will not run if canonical updated table)
|
| 902 |
print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
|
| 903 |
|
| 904 |
for row_idx, row in enumerate(table.rows):
|
| 905 |
for cell_idx, cell in enumerate(row.cells):
|
| 906 |
-
if has_red_text(cell):
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
"Auditor name",
|
| 910 |
-
"Signature",
|
| 911 |
-
"Date"
|
| 912 |
-
]
|
| 913 |
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
|
|
|
| 926 |
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
for run in paragraph.runs:
|
| 931 |
-
if is_red(run):
|
| 932 |
-
red_text += run.text
|
| 933 |
|
| 934 |
-
|
| 935 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
replacements_made += cell_replacements
|
| 937 |
-
|
| 938 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 939 |
replacements_made += cell_replacements
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 940 |
|
| 941 |
# if any replacements made here, mark processed
|
| 942 |
if replacements_made > 0:
|
|
|
|
| 878 |
def handle_operator_declaration_fix(table, flat_json):
|
| 879 |
"""Wrapper for small declaration tables. Delegate to canonical fix first.
|
| 880 |
If canonical did not change anything, fall back to the small-table auditor handling.
|
| 881 |
+
Safeguards: do not replace with date-like values; prefer person/role candidates.
|
| 882 |
"""
|
| 883 |
replacements_made = 0
|
| 884 |
|
|
|
|
| 887 |
print(f" ⏭️ Skipping - Operator Declaration table already processed")
|
| 888 |
return 0
|
| 889 |
|
| 890 |
+
# only intended for small tables; if large, skip
|
| 891 |
if len(table.rows) > 4:
|
| 892 |
return 0
|
| 893 |
|
|
|
|
| 898 |
# canonical handled it and set the processed flag
|
| 899 |
return replacements_made
|
| 900 |
|
| 901 |
+
# --- Helper validators (local, minimal, safe) ---
|
| 902 |
+
def is_date_like(s: str) -> bool:
    """Return True when *s* looks like a date or a bare "Date" label.

    Recognised shapes:
      * the word "Date" (case-insensitive), e.g. a column label
      * day followed by a month name, e.g. "2nd November"
      * month name followed later by a four-digit year, e.g. "November 2023"
      * numeric dates, e.g. "02/11/2023" or "2023-11-02"
      * a lone four-digit year, e.g. "2023"

    Month names are matched as whole words only, so text that merely
    contains a month abbreviation inside another word ("Marketing 2023",
    "12 Marshall Street") is not misclassified as a date.
    """
    if not s:
        return False
    s = s.strip()

    # Whole-word month tokens. The trailing \b forces the regex engine to
    # backtrack from "jan" to "january" instead of accepting a prefix match.
    month_names = (
        r"\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
        r"january|february|march|april|may|june|july|august|september|"
        r"october|november|december)\b"
    )

    # A literal "Date" label counts as date-like.
    if re.search(r"\bDate\b", s, re.IGNORECASE):
        return True
    # "2nd November", "2 Nov"
    if re.search(r"\b\d{1,2}(?:st|nd|rd|th)?\b\s+" + month_names, s, re.IGNORECASE):
        return True
    # "November 2023", "Nov. 2023"
    if re.search(month_names + r".*\b\d{4}\b", s, re.IGNORECASE):
        return True
    # "02/11/2023", "2.11.23", "02-11-2023"
    if re.search(r"\b\d{1,2}[\/\.\-]\d{1,2}[\/\.\-]\d{2,4}\b", s):
        return True
    # "2023-11-02", "2023/11/02"
    if re.search(r"\b\d{4}[\/\.\-]\d{1,2}[\/\.\-]\d{1,2}\b", s):
        return True
    # single 4-digit year alone
    if re.fullmatch(r"\d{4}", s):
        return True
    return False
|
| 923 |
+
|
| 924 |
+
def looks_like_person_name(s: str) -> bool:
    """Return True when *s* could plausibly be a person's name.

    The check is deliberately loose: the text must not contain an
    organisation/company term, must be at least three characters long after
    stripping, and must contain at least one ASCII letter.

    Organisation terms are matched on letter boundaries rather than as raw
    substrings, so surnames that merely embed a term (for example "Farmer")
    are not rejected. The plural forms "farms" and "trusts" are still
    treated as organisation terms.
    """
    if not s:
        return False
    low = s.lower().strip()

    # reject org/company terms; the lookarounds keep the match from firing
    # inside a longer alphabetic word (input is already lower-cased)
    org_terms = (
        r"(?<![a-z])"
        r"(?:pty ltd|p/l|plc|company|farming|farms?|trusts?|ltd)"
        r"(?![a-z])"
    )
    if re.search(org_terms, low):
        return False

    # minimal length check and presence of alphabetic characters
    if len(low) < 3:
        return False
    return bool(re.search(r"[a-zA-Z]", low))
|
| 936 |
+
|
| 937 |
+
def looks_like_position(s: str) -> bool:
    """Return True when *s* contains a known job-role keyword.

    Matching is a case-insensitive substring test against a fixed list of
    role words; empty or missing input yields False.
    """
    role_keywords = (
        "manager",
        "auditor",
        "owner",
        "director",
        "supervisor",
        "coordinator",
        "driver",
        "operator",
        "representative",
        "chief",
    )
    if s:
        lowered = s.lower()
        for keyword in role_keywords:
            if keyword in lowered:
                return True
    return False
|
| 943 |
+
|
| 944 |
# fallback: original small-table behaviour (auditor declaration etc.)
|
|
|
|
| 945 |
print(f" 🎯 Processing other declaration table (fallback small-table behavior)")
|
| 946 |
|
| 947 |
for row_idx, row in enumerate(table.rows):
|
| 948 |
for cell_idx, cell in enumerate(row.cells):
|
| 949 |
+
if not has_red_text(cell):
|
| 950 |
+
# do not overwrite non-red content in fallback
|
| 951 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
|
| 953 |
+
# Try auditor-specific fields first
|
| 954 |
+
declaration_fields = [
|
| 955 |
+
"NHVAS Approved Auditor Declaration.Print Name",
|
| 956 |
+
"Auditor name",
|
| 957 |
+
"Signature",
|
| 958 |
+
"Date"
|
| 959 |
+
]
|
| 960 |
+
|
| 961 |
+
replaced_this_cell = False
|
| 962 |
+
for field in declaration_fields:
|
| 963 |
+
field_value = find_matching_json_value(field, flat_json)
|
| 964 |
+
if field_value is None:
|
| 965 |
+
continue
|
| 966 |
|
| 967 |
+
replacement_text = get_value_as_string(field_value, field).strip()
|
| 968 |
+
if not replacement_text:
|
| 969 |
+
continue
|
|
|
|
|
|
|
|
|
|
| 970 |
|
| 971 |
+
# SAFEGUARD: do not replace with date-like text for name/position cells
|
| 972 |
+
if is_date_like(replacement_text):
|
| 973 |
+
# allow genuinely date-targeted cells (if red text explicitly contains 'date')
|
| 974 |
+
# but skip using a date string to fill 'name' or 'position' slots
|
| 975 |
+
# check the red text in the cell to see if it expects a date
|
| 976 |
+
red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip()
|
| 977 |
+
if "date" not in red_text.lower():
|
| 978 |
+
print(f" ⚠️ Skipping date-like replacement for field '{field}' -> '{replacement_text[:30]}...'")
|
| 979 |
+
continue
|
| 980 |
+
|
| 981 |
+
# Further safeguard: if replacement looks like a person or role, only then write into name/position cells
|
| 982 |
+
if (looks_like_person_name(replacement_text) or looks_like_position(replacement_text) or "signature" in field.lower() or "date" in field.lower()):
|
| 983 |
+
# Replace only red runs (safe)
|
| 984 |
+
cell_replacements = replace_red_text_in_cell(cell, replacement_text)
|
| 985 |
+
if cell_replacements > 0:
|
| 986 |
replacements_made += cell_replacements
|
| 987 |
+
replaced_this_cell = True
|
| 988 |
+
print(f" ✅ Fixed declaration field: {field} -> '{replacement_text}'")
|
| 989 |
+
break
|
| 990 |
+
else:
|
| 991 |
+
# Not a person or role-looking text — skip to avoid clobbering name/position with unrelated content
|
| 992 |
+
print(f" ⚠️ Replacement for field '{field}' does not look like name/role, skipping: '{replacement_text[:30]}...'")
|
| 993 |
+
continue
|
| 994 |
+
|
| 995 |
+
# If not replaced by the declared fields, try to infer from the cell's red text (date/signature fallback)
|
| 996 |
+
if not replaced_this_cell:
|
| 997 |
+
red_text = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red(run)).strip().lower()
|
| 998 |
+
if "signature" in red_text:
|
| 999 |
+
cell_replacements = replace_red_text_in_cell(cell, "[Signature]")
|
| 1000 |
+
if cell_replacements > 0:
|
| 1001 |
replacements_made += cell_replacements
|
| 1002 |
+
print(f" ✅ Inserted placeholder [Signature]")
|
| 1003 |
+
elif "date" in red_text:
|
| 1004 |
+
# Try to find a date value in JSON for an explicit date slot else skip
|
| 1005 |
+
date_value = find_matching_json_value("Date", flat_json) or find_matching_json_value("Date of Audit", flat_json) or find_matching_json_value("Audit was conducted on", flat_json)
|
| 1006 |
+
if date_value is not None:
|
| 1007 |
+
date_text = get_value_as_string(date_value)
|
| 1008 |
+
if not is_date_like(date_text):
|
| 1009 |
+
# defensive: if the date value is not date-like, skip
|
| 1010 |
+
print(f" ⚠️ Found date-value but not date-like, skipping: '{date_text}'")
|
| 1011 |
+
else:
|
| 1012 |
+
cell_replacements = replace_red_text_in_cell(cell, date_text)
|
| 1013 |
+
if cell_replacements > 0:
|
| 1014 |
+
replacements_made += cell_replacements
|
| 1015 |
+
print(f" ✅ Inserted date value: '{date_text}'")
|
| 1016 |
|
| 1017 |
# if any replacements made here, mark processed
|
| 1018 |
if replacements_made > 0:
|