Shami96 committed on
Commit
24ad2d2
·
verified ·
1 Parent(s): 8001b1f

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +119 -91
extract_red_text.py CHANGED
@@ -1,29 +1,36 @@
1
  #!/usr/bin/env python3
 
 
 
 
 
 
2
  import re
3
  import json
4
  import sys
5
  from docx import Document
6
  from docx.oxml.ns import qn
 
 
7
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
8
 
9
- def is_red_font(run):
10
- """Enhanced red font detection with better color checking"""
11
- col = run.font.color
12
- if col and col.rgb:
13
- r, g, b = col.rgb
14
- if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
15
- return True
16
- rPr = getattr(run._element, "rPr", None)
17
- if rPr is not None:
18
- clr = rPr.find(qn('w:color'))
19
- if clr is not None:
20
- val = clr.get(qn('w:val'))
21
- if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
22
- rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
23
- if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
24
- return True
25
- return False
26
 
 
 
 
27
  def _prev_para_text(tbl):
28
  """Get text from previous paragraph before table"""
29
  prev = tbl._tbl.getprevious()
@@ -33,23 +40,30 @@ def _prev_para_text(tbl):
33
  return ""
34
  return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
35
 
36
- def normalize_text(text):
37
- """Normalize text for better matching"""
38
- return re.sub(r'\s+', ' ', text.strip())
39
-
40
  def fuzzy_match_heading(heading, patterns):
41
  """Check if heading matches any pattern with fuzzy matching"""
42
- heading_norm = normalize_text(heading.upper())
 
 
43
  for pattern in patterns:
44
- if re.search(pattern, heading_norm, re.IGNORECASE):
45
- return True
 
 
 
 
 
46
  return False
47
 
48
  def get_table_context(tbl):
49
  """Get comprehensive context information for table"""
50
  heading = normalize_text(_prev_para_text(tbl))
51
- headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()]
52
- col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells[0].text.strip()]
 
53
  first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
54
  all_cells = []
55
  for row in tbl.rows:
@@ -67,33 +81,35 @@ def get_table_context(tbl):
67
  'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
68
  }
69
 
 
 
 
70
  def calculate_schema_match_score(schema_name, spec, context):
71
  """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
72
  score = 0
73
  reasons = []
74
-
75
- # 🎯 VEHICLE REGISTRATION BOOST
76
  if "Vehicle Registration" in schema_name:
77
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
78
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
79
-
80
  keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
81
  if keyword_matches >= 2:
82
- score += 150 # Very high boost for vehicle tables
83
  reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
84
  elif keyword_matches >= 1:
85
- score += 75 # Medium boost
86
  reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
87
-
88
- # 🎯 SUMMARY TABLE BOOST (existing logic)
89
  if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
90
  score += 100
91
  reasons.append(f"Summary schema with DETAILS column - perfect match")
92
-
93
  if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
94
  score -= 75
95
  reasons.append(f"Non-summary schema penalized for DETAILS column presence")
96
-
97
  # Context exclusions
98
  if spec.get("context_exclusions"):
99
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
@@ -101,7 +117,7 @@ def calculate_schema_match_score(schema_name, spec, context):
101
  if exclusion.lower() in table_text:
102
  score -= 50
103
  reasons.append(f"Context exclusion penalty: '{exclusion}' found")
104
-
105
  # Context keywords
106
  if spec.get("context_keywords"):
107
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
@@ -109,24 +125,23 @@ def calculate_schema_match_score(schema_name, spec, context):
109
  for keyword in spec["context_keywords"]:
110
  if keyword.lower() in table_text:
111
  keyword_matches += 1
112
-
113
  if keyword_matches > 0:
114
  score += keyword_matches * 15
115
  reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
116
-
117
  # Direct first cell match
118
  if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
119
  score += 100
120
  reasons.append(f"Direct first cell match: '{context['first_cell']}'")
121
-
122
  # Heading pattern matching
123
  if spec.get("headings"):
124
  for h in spec["headings"]:
125
- if fuzzy_match_heading(context['heading'], [h["text"]]):
126
  score += 50
127
  reasons.append(f"Heading match: '{context['heading']}'")
128
  break
129
-
130
  # Column header matching
131
  if spec.get("columns"):
132
  cols = [normalize_text(col) for col in spec["columns"]]
@@ -140,7 +155,7 @@ def calculate_schema_match_score(schema_name, spec, context):
140
  elif matches > 0:
141
  score += matches * 20
142
  reasons.append(f"Partial column matches: {matches}/{len(cols)}")
143
-
144
  # Label matching for left-oriented tables
145
  if spec.get("orientation") == "left":
146
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
@@ -151,24 +166,21 @@ def calculate_schema_match_score(schema_name, spec, context):
151
  if matches > 0:
152
  score += (matches / len(labels)) * 30
153
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
154
-
155
- # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
156
  elif spec.get("orientation") == "row1":
157
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
158
  matches = 0
159
  for lbl in labels:
160
- # More flexible matching for vehicle tables
161
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
162
  matches += 1
163
- # Also check for partial keyword matches
164
  elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
165
- matches += 0.5 # Partial credit
166
-
167
  if matches > 0:
168
- score += (matches / len(labels)) * 40 # Higher weight for row1 tables
169
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
170
-
171
- # Special handling for Declaration tables (existing logic)
172
  if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
173
  if "OPERATOR DECLARATION" in context['heading'].upper():
174
  score += 80
@@ -176,12 +188,12 @@ def calculate_schema_match_score(schema_name, spec, context):
176
  elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
177
  score += 60
178
  reasons.append("Manager found in cells (likely Operator Declaration)")
179
-
180
  if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
181
  if any("MANAGER" in cell.upper() for cell in context['all_cells']):
182
  score -= 50
183
  reasons.append("Penalty: Manager found (not auditor)")
184
-
185
  return score, reasons
186
 
187
  def match_table_schema(tbl):
@@ -198,6 +210,9 @@ def match_table_schema(tbl):
198
  return best_match
199
  return None
200
 
 
 
 
201
  def check_multi_schema_table(tbl):
202
  """Check if table contains multiple schemas and split appropriately"""
203
  context = get_table_context(tbl)
@@ -244,117 +259,107 @@ def extract_multi_schema_table(tbl, schemas):
244
  result[schema_name] = schema_data
245
  return result
246
 
 
 
 
247
  def extract_table_data(tbl, schema_name, spec):
248
  """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
249
-
250
- # 🎯 SPECIAL HANDLING for Vehicle Registration tables
251
  if "Vehicle Registration" in schema_name:
252
  print(f" πŸš— EXTRACTION FIX: Processing Vehicle Registration table")
253
-
254
  labels = spec["labels"]
255
  collected = {lbl: [] for lbl in labels}
256
  seen = {lbl: set() for lbl in labels}
257
-
258
- # For Vehicle Registration, orientation is "row1" - headers in first row
259
  if len(tbl.rows) < 2:
260
  print(f" ❌ Vehicle table has less than 2 rows")
261
  return {}
262
-
263
- # Map header cells to labels
264
  header_row = tbl.rows[0]
265
  column_mapping = {}
266
-
267
  print(f" πŸ“‹ Mapping {len(header_row.cells)} header cells to labels")
268
-
269
  for col_idx, cell in enumerate(header_row.cells):
270
  header_text = normalize_text(cell.text).strip()
271
  if not header_text:
272
  continue
273
-
274
  print(f" Column {col_idx}: '{header_text}'")
275
-
276
- # Find best matching label
277
  best_match = None
278
  best_score = 0
279
-
280
  for label in labels:
281
- # Direct match
282
  if header_text.upper() == label.upper():
283
  best_match = label
284
  best_score = 1.0
285
  break
286
-
287
- # Partial keyword matching
288
  header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
289
  label_words = set(word.upper() for word in label.split() if len(word) > 2)
290
-
291
  if header_words and label_words:
292
  common_words = header_words.intersection(label_words)
293
  if common_words:
294
  score = len(common_words) / max(len(header_words), len(label_words))
295
- if score > best_score and score >= 0.4: # Lower threshold for vehicle tables
296
  best_score = score
297
  best_match = label
298
-
299
  if best_match:
300
  column_mapping[col_idx] = best_match
301
  print(f" βœ… Mapped to: '{best_match}' (score: {best_score:.2f})")
302
  else:
303
  print(f" ⚠️ No mapping found for '{header_text}'")
304
-
305
  print(f" πŸ“Š Total column mappings: {len(column_mapping)}")
306
-
307
  # Extract red text from data rows (skip header)
308
  for row_idx in range(1, len(tbl.rows)):
309
  row = tbl.rows[row_idx]
310
  print(f" πŸ“Œ Processing data row {row_idx}")
311
-
312
  for col_idx, cell in enumerate(row.cells):
313
  if col_idx in column_mapping:
314
  label = column_mapping[col_idx]
315
-
316
- # Extract red text
317
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
318
-
319
  if red_txt:
320
  print(f" πŸ”΄ Found red text in '{label}': '{red_txt}'")
321
-
322
  if red_txt not in seen[label]:
323
  seen[label].add(red_txt)
324
  collected[label].append(red_txt)
325
-
326
- # Return only non-empty collections
327
  result = {k: v for k, v in collected.items() if v}
328
  print(f" βœ… Vehicle Registration extracted: {len(result)} columns with data")
329
  return result
330
-
331
- # 🎯 ORIGINAL CODE for all other tables (unchanged)
332
- labels = spec["labels"] + [schema_name]
333
  collected = {lbl: [] for lbl in labels}
334
  seen = {lbl: set() for lbl in labels}
335
- by_col = (spec["orientation"] == "row1")
336
  start_row = 1 if by_col else 0
337
  rows = tbl.rows[start_row:]
338
-
339
  for ri, row in enumerate(rows):
340
  for ci, cell in enumerate(row.cells):
341
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
342
  if not red_txt:
343
  continue
344
  if by_col:
345
- if ci < len(spec["labels"]):
346
  lbl = spec["labels"][ci]
347
  else:
348
  lbl = schema_name
349
  else:
350
  raw_label = normalize_text(row.cells[0].text)
351
  lbl = None
352
- for spec_label in spec["labels"]:
353
  if normalize_text(spec_label).upper() == raw_label.upper():
354
  lbl = spec_label
355
  break
356
  if not lbl:
357
- for spec_label in spec["labels"]:
358
  spec_norm = normalize_text(spec_label).upper()
359
  raw_norm = raw_label.upper()
360
  if spec_norm in raw_norm or raw_norm in spec_norm:
@@ -367,16 +372,24 @@ def extract_table_data(tbl, schema_name, spec):
367
  collected[lbl].append(red_txt)
368
  return {k: v for k, v in collected.items() if v}
369
 
 
 
 
370
  def extract_red_text(input_doc):
371
- # input_doc: docx.Document object or file path
 
 
 
372
  if isinstance(input_doc, str):
373
  doc = Document(input_doc)
374
  else:
375
  doc = input_doc
376
  out = {}
377
  table_count = 0
 
378
  for tbl in doc.tables:
379
  table_count += 1
 
380
  multi_schemas = check_multi_schema_table(tbl)
381
  if multi_schemas:
382
  multi_data = extract_multi_schema_table(tbl, multi_schemas)
@@ -391,8 +404,10 @@ def extract_red_text(input_doc):
391
  else:
392
  out[schema_name] = schema_data
393
  continue
 
394
  schema = match_table_schema(tbl)
395
  if not schema:
 
396
  continue
397
  spec = TABLE_SCHEMAS[schema]
398
  data = extract_table_data(tbl, schema, spec)
@@ -405,11 +420,15 @@ def extract_red_text(input_doc):
405
  out[schema][k] = v
406
  else:
407
  out[schema] = data
 
 
408
  paras = {}
409
  for idx, para in enumerate(doc.paragraphs):
410
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
411
  if not red_txt:
412
  continue
 
 
413
  context = None
414
  for j in range(idx-1, -1, -1):
415
  txt = normalize_text(doc.paragraphs[j].text)
@@ -418,15 +437,22 @@ def extract_red_text(input_doc):
418
  if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
419
  context = txt
420
  break
 
 
421
  if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
422
  context = "Date"
 
423
  if not context:
424
  context = "(para)"
425
  paras.setdefault(context, []).append(red_txt)
 
426
  if paras:
427
  out["paragraphs"] = paras
428
  return out
429
 
 
 
 
430
  def extract_red_text_filelike(input_file, output_file):
431
  """
432
  Accepts:
@@ -445,8 +471,10 @@ def extract_red_text_filelike(input_file, output_file):
445
  json.dump(result, f, indent=2, ensure_ascii=False)
446
  return result
447
 
 
 
 
448
  if __name__ == "__main__":
449
- # Support both script and app/file-like usage
450
  if len(sys.argv) == 3:
451
  input_docx = sys.argv[1]
452
  output_json = sys.argv[2]
 
1
  #!/usr/bin/env python3
2
+ """
3
+ extract_red_text.py
4
+ Improved version that reuses hf_utils for shared heuristics while preserving
5
+ the original schema logic, logging and behavior.
6
+ """
7
+
8
  import re
9
  import json
10
  import sys
11
  from docx import Document
12
  from docx.oxml.ns import qn
13
+
14
+ # master schema & patterns (unchanged)
15
  from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
16
 
17
+ # canonical helpers (from your new hf_utils.py)
18
+ from hf_utils import (
19
+ is_red_font,
20
+ normalize_text,
21
+ normalize_header_text,
22
+ flatten_json,
23
+ find_matching_json_key_and_value,
24
+ get_clean_text,
25
+ has_red_text,
26
+ extract_red_text_segments,
27
+ replace_red_text_in_cell,
28
+ key_is_forbidden_for_position,
29
+ )
 
 
 
 
30
 
31
+ # -------------------------------------------------------------------
32
+ # Small XML helper (kept exactly as before — low-level)
33
+ # -------------------------------------------------------------------
34
  def _prev_para_text(tbl):
35
  """Get text from previous paragraph before table"""
36
  prev = tbl._tbl.getprevious()
 
40
  return ""
41
  return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
42
 
43
+ # -------------------------------------------------------------------
44
+ # Table context helpers (use normalize_text from hf_utils)
45
+ # -------------------------------------------------------------------
 
46
  def fuzzy_match_heading(heading, patterns):
47
  """Check if heading matches any pattern with fuzzy matching"""
48
+ if not heading:
49
+ return False
50
+ heading_norm = normalize_text(heading).upper()
51
  for pattern in patterns:
52
+ try:
53
+ if re.search(pattern, heading_norm, re.IGNORECASE):
54
+ return True
55
+ except re.error:
56
+ # fallback simple substring if pattern isn't a valid re
57
+ if pattern.upper() in heading_norm:
58
+ return True
59
  return False
60
 
61
  def get_table_context(tbl):
62
  """Get comprehensive context information for table"""
63
  heading = normalize_text(_prev_para_text(tbl))
64
+ # first row headers
65
+ headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()] if tbl.rows else []
66
+ col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells and r.cells[0].text.strip()]
67
  first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
68
  all_cells = []
69
  for row in tbl.rows:
 
81
  'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
82
  }
83
 
84
+ # -------------------------------------------------------------------
85
+ # Scoring / matching logic (kept your behavior but using normalize_text)
86
+ # -------------------------------------------------------------------
87
  def calculate_schema_match_score(schema_name, spec, context):
88
  """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
89
  score = 0
90
  reasons = []
91
+
92
+ # VEHICLE REGISTRATION BOOST
93
  if "Vehicle Registration" in schema_name:
94
  vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
95
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
 
96
  keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
97
  if keyword_matches >= 2:
98
+ score += 150
99
  reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
100
  elif keyword_matches >= 1:
101
+ score += 75
102
  reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
103
+
104
+ # SUMMARY TABLE BOOST (existing logic)
105
  if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
106
  score += 100
107
  reasons.append(f"Summary schema with DETAILS column - perfect match")
108
+
109
  if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
110
  score -= 75
111
  reasons.append(f"Non-summary schema penalized for DETAILS column presence")
112
+
113
  # Context exclusions
114
  if spec.get("context_exclusions"):
115
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
 
117
  if exclusion.lower() in table_text:
118
  score -= 50
119
  reasons.append(f"Context exclusion penalty: '{exclusion}' found")
120
+
121
  # Context keywords
122
  if spec.get("context_keywords"):
123
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
 
125
  for keyword in spec["context_keywords"]:
126
  if keyword.lower() in table_text:
127
  keyword_matches += 1
 
128
  if keyword_matches > 0:
129
  score += keyword_matches * 15
130
  reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
131
+
132
  # Direct first cell match
133
  if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
134
  score += 100
135
  reasons.append(f"Direct first cell match: '{context['first_cell']}'")
136
+
137
  # Heading pattern matching
138
  if spec.get("headings"):
139
  for h in spec["headings"]:
140
+ if fuzzy_match_heading(context['heading'], [h.get("text", "")]):
141
  score += 50
142
  reasons.append(f"Heading match: '{context['heading']}'")
143
  break
144
+
145
  # Column header matching
146
  if spec.get("columns"):
147
  cols = [normalize_text(col) for col in spec["columns"]]
 
155
  elif matches > 0:
156
  score += matches * 20
157
  reasons.append(f"Partial column matches: {matches}/{len(cols)}")
158
+
159
  # Label matching for left-oriented tables
160
  if spec.get("orientation") == "left":
161
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
 
166
  if matches > 0:
167
  score += (matches / len(labels)) * 30
168
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
169
+
170
+ # Enhanced Label matching for row1-oriented tables (Vehicle Registration)
171
  elif spec.get("orientation") == "row1":
172
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
173
  matches = 0
174
  for lbl in labels:
 
175
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
176
  matches += 1
 
177
  elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
178
+ matches += 0.5
 
179
  if matches > 0:
180
+ score += (matches / len(labels)) * 40
181
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
182
+
183
+ # Special handling for Declaration tables
184
  if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
185
  if "OPERATOR DECLARATION" in context['heading'].upper():
186
  score += 80
 
188
  elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
189
  score += 60
190
  reasons.append("Manager found in cells (likely Operator Declaration)")
191
+
192
  if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
193
  if any("MANAGER" in cell.upper() for cell in context['all_cells']):
194
  score -= 50
195
  reasons.append("Penalty: Manager found (not auditor)")
196
+
197
  return score, reasons
198
 
199
  def match_table_schema(tbl):
 
210
  return best_match
211
  return None
212
 
213
+ # -------------------------------------------------------------------
214
+ # Multi-schema detection & extraction (kept behavior)
215
+ # -------------------------------------------------------------------
216
  def check_multi_schema_table(tbl):
217
  """Check if table contains multiple schemas and split appropriately"""
218
  context = get_table_context(tbl)
 
259
  result[schema_name] = schema_data
260
  return result
261
 
262
+ # -------------------------------------------------------------------
263
+ # Table extraction for schemas (kept your specialized vehicle handling)
264
+ # -------------------------------------------------------------------
265
  def extract_table_data(tbl, schema_name, spec):
266
  """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
267
+
268
+ # Special handling for vehicle registration tables
269
  if "Vehicle Registration" in schema_name:
270
  print(f" πŸš— EXTRACTION FIX: Processing Vehicle Registration table")
 
271
  labels = spec["labels"]
272
  collected = {lbl: [] for lbl in labels}
273
  seen = {lbl: set() for lbl in labels}
274
+
 
275
  if len(tbl.rows) < 2:
276
  print(f" ❌ Vehicle table has less than 2 rows")
277
  return {}
278
+
 
279
  header_row = tbl.rows[0]
280
  column_mapping = {}
281
+
282
  print(f" πŸ“‹ Mapping {len(header_row.cells)} header cells to labels")
283
+
284
  for col_idx, cell in enumerate(header_row.cells):
285
  header_text = normalize_text(cell.text).strip()
286
  if not header_text:
287
  continue
288
+
289
  print(f" Column {col_idx}: '{header_text}'")
290
+
 
291
  best_match = None
292
  best_score = 0
293
+
294
  for label in labels:
 
295
  if header_text.upper() == label.upper():
296
  best_match = label
297
  best_score = 1.0
298
  break
299
+
 
300
  header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
301
  label_words = set(word.upper() for word in label.split() if len(word) > 2)
302
+
303
  if header_words and label_words:
304
  common_words = header_words.intersection(label_words)
305
  if common_words:
306
  score = len(common_words) / max(len(header_words), len(label_words))
307
+ if score > best_score and score >= 0.4:
308
  best_score = score
309
  best_match = label
310
+
311
  if best_match:
312
  column_mapping[col_idx] = best_match
313
  print(f" βœ… Mapped to: '{best_match}' (score: {best_score:.2f})")
314
  else:
315
  print(f" ⚠️ No mapping found for '{header_text}'")
316
+
317
  print(f" πŸ“Š Total column mappings: {len(column_mapping)}")
318
+
319
  # Extract red text from data rows (skip header)
320
  for row_idx in range(1, len(tbl.rows)):
321
  row = tbl.rows[row_idx]
322
  print(f" πŸ“Œ Processing data row {row_idx}")
 
323
  for col_idx, cell in enumerate(row.cells):
324
  if col_idx in column_mapping:
325
  label = column_mapping[col_idx]
 
 
326
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
 
327
  if red_txt:
328
  print(f" πŸ”΄ Found red text in '{label}': '{red_txt}'")
 
329
  if red_txt not in seen[label]:
330
  seen[label].add(red_txt)
331
  collected[label].append(red_txt)
 
 
332
  result = {k: v for k, v in collected.items() if v}
333
  print(f" βœ… Vehicle Registration extracted: {len(result)} columns with data")
334
  return result
335
+
336
+ # FALLBACK: original extraction logic for other tables
337
+ labels = spec.get("labels", []) + [schema_name]
338
  collected = {lbl: [] for lbl in labels}
339
  seen = {lbl: set() for lbl in labels}
340
+ by_col = (spec.get("orientation") == "row1")
341
  start_row = 1 if by_col else 0
342
  rows = tbl.rows[start_row:]
343
+
344
  for ri, row in enumerate(rows):
345
  for ci, cell in enumerate(row.cells):
346
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
347
  if not red_txt:
348
  continue
349
  if by_col:
350
+ if ci < len(spec.get("labels", [])):
351
  lbl = spec["labels"][ci]
352
  else:
353
  lbl = schema_name
354
  else:
355
  raw_label = normalize_text(row.cells[0].text)
356
  lbl = None
357
+ for spec_label in spec.get("labels", []):
358
  if normalize_text(spec_label).upper() == raw_label.upper():
359
  lbl = spec_label
360
  break
361
  if not lbl:
362
+ for spec_label in spec.get("labels", []):
363
  spec_norm = normalize_text(spec_label).upper()
364
  raw_norm = raw_label.upper()
365
  if spec_norm in raw_norm or raw_norm in spec_norm:
 
372
  collected[lbl].append(red_txt)
373
  return {k: v for k, v in collected.items() if v}
374
 
375
+ # -------------------------------------------------------------------
376
+ # Main extraction: iterate tables & paragraphs
377
+ # -------------------------------------------------------------------
378
  def extract_red_text(input_doc):
379
+ """
380
+ input_doc: docx.Document object or file path
381
+ returns: dict
382
+ """
383
  if isinstance(input_doc, str):
384
  doc = Document(input_doc)
385
  else:
386
  doc = input_doc
387
  out = {}
388
  table_count = 0
389
+
390
  for tbl in doc.tables:
391
  table_count += 1
392
+ # Check multi-schema table first
393
  multi_schemas = check_multi_schema_table(tbl)
394
  if multi_schemas:
395
  multi_data = extract_multi_schema_table(tbl, multi_schemas)
 
404
  else:
405
  out[schema_name] = schema_data
406
  continue
407
+
408
  schema = match_table_schema(tbl)
409
  if not schema:
410
+ # keep scanning for tables even if no schema matched
411
  continue
412
  spec = TABLE_SCHEMAS[schema]
413
  data = extract_table_data(tbl, schema, spec)
 
420
  out[schema][k] = v
421
  else:
422
  out[schema] = data
423
+
424
+ # paragraphs
425
  paras = {}
426
  for idx, para in enumerate(doc.paragraphs):
427
  red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
428
  if not red_txt:
429
  continue
430
+
431
+ # find context heading by scanning backward
432
  context = None
433
  for j in range(idx-1, -1, -1):
434
  txt = normalize_text(doc.paragraphs[j].text)
 
437
  if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
438
  context = txt
439
  break
440
+
441
+ # if it's date-like and matches date pattern, set context to Date
442
  if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
443
  context = "Date"
444
+
445
  if not context:
446
  context = "(para)"
447
  paras.setdefault(context, []).append(red_txt)
448
+
449
  if paras:
450
  out["paragraphs"] = paras
451
  return out
452
 
453
+ # -------------------------------------------------------------------
454
+ # File-like wrapper (keeps API used elsewhere)
455
+ # -------------------------------------------------------------------
456
  def extract_red_text_filelike(input_file, output_file):
457
  """
458
  Accepts:
 
471
  json.dump(result, f, indent=2, ensure_ascii=False)
472
  return result
473
 
474
+ # -------------------------------------------------------------------
475
+ # CLI entrypoint (preserve original UX)
476
+ # -------------------------------------------------------------------
477
  if __name__ == "__main__":
 
478
  if len(sys.argv) == 3:
479
  input_docx = sys.argv[1]
480
  output_json = sys.argv[2]