Shami96 committed on
Commit
4edca00
·
verified ·
1 Parent(s): 9d9d8a8

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +183 -1
updated_word.py CHANGED
@@ -3,6 +3,187 @@ from docx import Document
3
  from docx.shared import RGBColor
4
  import re
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
def load_json(filepath):
    """Read *filepath* and return its parsed JSON content.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform default encoding, so non-ASCII values load identically on
    every OS.  ``utf-8-sig`` is used because it also accepts (and strips)
    a leading byte-order mark, which ``json.load`` would otherwise reject.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        return json.load(file)
@@ -815,6 +996,7 @@ def process_hf(json_file, docx_file, output_file):
815
 
816
  table_replacements = process_tables(doc, flat_json)
817
  paragraph_replacements = process_paragraphs(doc, flat_json)
 
818
  total_replacements = table_replacements + paragraph_replacements
819
 
820
  # --- Save DOCX output (file or file-like) ---
@@ -823,7 +1005,7 @@ def process_hf(json_file, docx_file, output_file):
823
  else:
824
  doc.save(output_file)
825
  print(f"\nβœ… Document saved as: {output_file}")
826
- print(f"βœ… Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")
827
 
828
  except FileNotFoundError as e:
829
  print(f"❌ File not found: {e}")
 
3
  from docx.shared import RGBColor
4
  import re
5
 
6
# Regular-expression fragments used to recognise report headings.
# process_headings() applies each one with re.search(..., re.IGNORECASE),
# so a pattern matches anywhere in a paragraph and regardless of case;
# "\s+" lets the words be separated by any run of whitespace.
HEADING_PATTERNS = {
    # Document title variants.
    "main": [
        r"NHVAS\s+Audit\s+Summary\s+Report",
        r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
        r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT",
    ],
    # Section headings inside the report.
    "sub": [
        r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
        r"MAINTENANCE\s+MANAGEMENT",
        r"MASS\s+MANAGEMENT",
        r"FATIGUE\s+MANAGEMENT",
        r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
        r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
        r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
        r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
        r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
        r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
        r"Operator\s+Declaration",
        r"Operator\s+Information",
    ],
}
28
+
29
def _matches_heading_pattern(text):
    """Return True if *text* matches any regex listed in HEADING_PATTERNS.

    Every category is consulted; matching is case-insensitive and uses
    ``re.search``, so the pattern may occur anywhere inside *text*.
    """
    return any(
        re.search(pattern, text, re.IGNORECASE)
        for patterns in HEADING_PATTERNS.values()
        for pattern in patterns
    )


def process_headings(document, flat_json):
    """Replace red text found in known headings and in the text right after them.

    Each non-empty paragraph of *document* is tested against
    HEADING_PATTERNS.  For a paragraph that matches, red text in the heading
    itself is processed first; then up to three following paragraphs are
    inspected (blank ones are skipped but still count towards the three),
    stopping early when another heading is reached.  The heading text is
    handed to ``process_red_text_in_paragraph`` as matching context.

    Args:
        document: Object exposing ``paragraphs``; each paragraph must expose
            ``text`` (presumably a python-docx ``Document`` - confirm at the
            call site).
        flat_json: Lookup data forwarded unchanged to
            ``process_red_text_in_paragraph``.

    Returns:
        Sum of the replacement counts reported by
        ``process_red_text_in_paragraph``.
    """
    total = 0
    print(f"\nπŸ” Processing headings:")

    # Read the paragraph list once and reuse it for the look-ahead below.
    all_paragraphs = document.paragraphs
    paragraph_count = len(all_paragraphs)

    for idx, heading in enumerate(all_paragraphs):
        heading_text = heading.text.strip()

        # Guard clause: only non-empty paragraphs that look like a heading.
        if not heading_text or not _matches_heading_pattern(heading_text):
            continue

        print(f" πŸ“Œ Found heading at paragraph {idx + 1}: '{heading_text}'")

        # Red text inside the heading paragraph itself comes first.
        if has_red_text_in_paragraph(heading):
            print(f" πŸ”΄ Found red text in heading itself")
            total += process_red_text_in_paragraph(heading, heading_text, flat_json)

        # Then look at the (at most) three paragraphs that follow.
        last_exclusive = min(idx + 4, paragraph_count)
        for follower_idx in range(idx + 1, last_exclusive):
            follower = all_paragraphs[follower_idx]
            follower_text = follower.text.strip()

            if not follower_text:
                continue

            # The next heading ends this heading's section.
            if _matches_heading_pattern(follower_text):
                break

            if has_red_text_in_paragraph(follower):
                print(f" πŸ”΄ Found red text in paragraph {follower_idx + 1} after heading: '{follower_text[:50]}...'")
                # The heading text (captured before any replacement) is the context.
                total += process_red_text_in_paragraph(
                    follower,
                    heading_text,
                    flat_json,
                )

    return total
100
+
101
def has_red_text_in_paragraph(paragraph):
    """Tell whether *paragraph* holds at least one red, non-blank run.

    A run counts when ``is_red(run)`` is truthy and its text is not
    empty after stripping whitespace.

    Args:
        paragraph: Object exposing ``runs``; each run must expose ``text``.

    Returns:
        True if such a run exists, otherwise False.
    """
    return any(is_red(run) and run.text.strip() for run in paragraph.runs)
107
+
108
def process_red_text_in_paragraph(paragraph, context_text, flat_json):
    """Replace the red text of one paragraph with a value looked up in *flat_json*.

    The stripped text of all red, non-blank runs is joined with spaces and
    resolved through ``find_matching_json_value`` using three strategies,
    stopping at the first that yields a value:

    1. the red text itself;
    2. fixed field names chosen from the heading context (auditor or
       operator declaration sections);
    3. the context combined with the red text, then the context alone.

    On success the first red run receives the replacement text and is
    recoloured black; every other red run is emptied.

    Args:
        paragraph: Object exposing ``runs``; each run must expose ``text``
            and ``font.color.rgb`` (presumably a python-docx paragraph -
            confirm at the call site).
        context_text: Text of the heading this paragraph belongs to.
        flat_json: Lookup data passed through to ``find_matching_json_value``.

    Returns:
        1 if the paragraph's red text was replaced, otherwise 0.
    """
    # Collect the red runs once; they drive both extraction and replacement.
    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
    if not red_runs:
        return 0

    combined_red_text = " ".join(run.text.strip() for run in red_runs).strip()
    print(f" πŸ” Red text found: '{combined_red_text}'")

    # Strategy 1: direct red text matching.
    json_value = find_matching_json_value(combined_red_text, flat_json)

    # Strategy 2: context-based matching for specific headings.
    if json_value is None:
        # Headings are detected with "\s+" between words, so collapse any
        # whitespace run (double spaces, tabs, line breaks) before the
        # substring test; otherwise such headings would never match here.
        normalized_context = " ".join(context_text.upper().split())
        context_field_hints = (
            (
                "NHVAS APPROVED AUDITOR",
                "auditor",
                ["auditor name", "auditor", "nhvas auditor", "approved auditor"],
            ),
            (
                "OPERATOR DECLARATION",
                "operator",
                ["operator name", "operator", "company name", "organisation name"],
            ),
        )
        for marker, label, fields in context_field_hints:
            if marker not in normalized_context:
                continue
            for field in fields:
                json_value = find_matching_json_value(field, flat_json)
                if json_value is not None:
                    print(f" βœ… Found {label} match with field: '{field}'")
                    break
            # Only the first matching context is tried (if/elif semantics).
            break

    # Strategy 3: combine context with red text, then fall back to context.
    # The bare red text is not retried: Strategy 1 already failed on it.
    if json_value is None:
        context_queries = [
            f"{context_text} {combined_red_text}",
            context_text,
        ]
        for query in context_queries:
            json_value = find_matching_json_value(query, flat_json)
            if json_value is not None:
                print(f" βœ… Found match with combined query: '{query[:50]}...'")
                break

    if json_value is None:
        print(f" ❌ No match found for red text: '{combined_red_text}'")
        return 0

    replacement_text = get_value_as_string(json_value, combined_red_text)

    # Put the whole replacement in the first red run and blank the rest so
    # the value is not duplicated across runs.
    first_run, *other_runs = red_runs
    first_run.text = replacement_text
    first_run.font.color.rgb = RGBColor(0, 0, 0)  # Change to black
    for run in other_runs:
        run.text = ''

    print(f" βœ… Replaced with: '{replacement_text}'")
    return 1
186
+
187
def load_json(filepath):
    """Read *filepath* and return its parsed JSON content.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform default encoding, so non-ASCII values load identically on
    every OS.  ``utf-8-sig`` is used because it also accepts (and strips)
    a leading byte-order mark, which ``json.load`` would otherwise reject.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        return json.load(file)
 
996
 
997
  table_replacements = process_tables(doc, flat_json)
998
  paragraph_replacements = process_paragraphs(doc, flat_json)
999
+ heading_replacements = process_headings(doc, flat_json)
1000
  total_replacements = table_replacements + paragraph_replacements
1001
 
1002
  # --- Save DOCX output (file or file-like) ---
 
1005
  else:
1006
  doc.save(output_file)
1007
  print(f"\nβœ… Document saved as: {output_file}")
1008
+ print(f"βœ… Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs, {heading_replacements} in headings)")
1009
 
1010
  except FileNotFoundError as e:
1011
  print(f"❌ File not found: {e}")