Shami96 committed on
Commit
1487325
·
verified ·
1 Parent(s): d4200b4

Update extract_red_text.py

Browse files
Files changed (1) hide show
  1. extract_red_text.py +117 -16
extract_red_text.py CHANGED
@@ -68,21 +68,33 @@ def get_table_context(tbl):
68
  }
69
 
70
  def calculate_schema_match_score(schema_name, spec, context):
71
- """Enhanced calculate match score for a schema against table context with Summary table detection"""
72
  score = 0
73
  reasons = []
74
 
75
- # 🎯 CRITICAL: Boost Summary schemas when DETAILS column is detected
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
77
- score += 100 # Very high boost for summary tables with DETAILS column
78
  reasons.append(f"Summary schema with DETAILS column - perfect match")
79
 
80
- # 🎯 CRITICAL: Heavily penalize non-Summary schemas when DETAILS column is present
81
  if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
82
- score -= 75 # Heavy penalty to prevent basic schemas from matching summary tables
83
  reasons.append(f"Non-summary schema penalized for DETAILS column presence")
84
 
85
- # Check for context exclusions (prevents basic Management from matching Summary tables)
86
  if spec.get("context_exclusions"):
87
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
88
  for exclusion in spec["context_exclusions"]:
@@ -90,7 +102,7 @@ def calculate_schema_match_score(schema_name, spec, context):
90
  score -= 50
91
  reasons.append(f"Context exclusion penalty: '{exclusion}' found")
92
 
93
- # Check for context keywords (boosts matching for relevant tables)
94
  if spec.get("context_keywords"):
95
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
96
  keyword_matches = 0
@@ -99,7 +111,7 @@ def calculate_schema_match_score(schema_name, spec, context):
99
  keyword_matches += 1
100
 
101
  if keyword_matches > 0:
102
- score += keyword_matches * 15 # Boost for each matching keyword
103
  reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
104
 
105
  # Direct first cell match
@@ -115,7 +127,7 @@ def calculate_schema_match_score(schema_name, spec, context):
115
  reasons.append(f"Heading match: '{context['heading']}'")
116
  break
117
 
118
- # Column header matching (important for Summary tables)
119
  if spec.get("columns"):
120
  cols = [normalize_text(col) for col in spec["columns"]]
121
  matches = 0
@@ -123,10 +135,10 @@ def calculate_schema_match_score(schema_name, spec, context):
123
  if any(col.upper() in h.upper() for h in context['headers']):
124
  matches += 1
125
  if matches == len(cols):
126
- score += 60 # High boost for exact column matches
127
  reasons.append(f"All column headers match: {cols}")
128
  elif matches > 0:
129
- score += matches * 20 # Partial column matches
130
  reasons.append(f"Partial column matches: {matches}/{len(cols)}")
131
 
132
  # Label matching for left-oriented tables
@@ -140,18 +152,23 @@ def calculate_schema_match_score(schema_name, spec, context):
140
  score += (matches / len(labels)) * 30
141
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
142
 
143
- # Label matching for row1-oriented tables
144
  elif spec.get("orientation") == "row1":
145
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
146
  matches = 0
147
  for lbl in labels:
 
148
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
149
  matches += 1
 
 
 
 
150
  if matches > 0:
151
- score += (matches / len(labels)) * 30
152
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
153
 
154
- # Special handling for Declaration tables
155
  if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
156
  if "OPERATOR DECLARATION" in context['heading'].upper():
157
  score += 80
@@ -162,7 +179,7 @@ def calculate_schema_match_score(schema_name, spec, context):
162
 
163
  if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
164
  if any("MANAGER" in cell.upper() for cell in context['all_cells']):
165
- score -= 50 # Penalty because auditors shouldn't be managers
166
  reasons.append("Penalty: Manager found (not auditor)")
167
 
168
  return score, reasons
@@ -228,13 +245,97 @@ def extract_multi_schema_table(tbl, schemas):
228
  return result
229
 
230
  def extract_table_data(tbl, schema_name, spec):
231
- """Extract red text data from table based on schema"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  labels = spec["labels"] + [schema_name]
233
  collected = {lbl: [] for lbl in labels}
234
  seen = {lbl: set() for lbl in labels}
235
  by_col = (spec["orientation"] == "row1")
236
  start_row = 1 if by_col else 0
237
  rows = tbl.rows[start_row:]
 
238
  for ri, row in enumerate(rows):
239
  for ci, cell in enumerate(row.cells):
240
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
 
68
  }
69
 
70
  def calculate_schema_match_score(schema_name, spec, context):
71
+ """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
72
  score = 0
73
  reasons = []
74
 
75
+ # 🎯 VEHICLE REGISTRATION BOOST
76
+ if "Vehicle Registration" in schema_name:
77
+ vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
78
+ table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
79
+
80
+ keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
81
+ if keyword_matches >= 2:
82
+ score += 150 # Very high boost for vehicle tables
83
+ reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
84
+ elif keyword_matches >= 1:
85
+ score += 75 # Medium boost
86
+ reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
87
+
88
+ # 🎯 SUMMARY TABLE BOOST (existing logic)
89
  if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
90
+ score += 100
91
  reasons.append(f"Summary schema with DETAILS column - perfect match")
92
 
 
93
  if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
94
+ score -= 75
95
  reasons.append(f"Non-summary schema penalized for DETAILS column presence")
96
 
97
+ # Context exclusions
98
  if spec.get("context_exclusions"):
99
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
100
  for exclusion in spec["context_exclusions"]:
 
102
  score -= 50
103
  reasons.append(f"Context exclusion penalty: '{exclusion}' found")
104
 
105
+ # Context keywords
106
  if spec.get("context_keywords"):
107
  table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
108
  keyword_matches = 0
 
111
  keyword_matches += 1
112
 
113
  if keyword_matches > 0:
114
+ score += keyword_matches * 15
115
  reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
116
 
117
  # Direct first cell match
 
127
  reasons.append(f"Heading match: '{context['heading']}'")
128
  break
129
 
130
+ # Column header matching
131
  if spec.get("columns"):
132
  cols = [normalize_text(col) for col in spec["columns"]]
133
  matches = 0
 
135
  if any(col.upper() in h.upper() for h in context['headers']):
136
  matches += 1
137
  if matches == len(cols):
138
+ score += 60
139
  reasons.append(f"All column headers match: {cols}")
140
  elif matches > 0:
141
+ score += matches * 20
142
  reasons.append(f"Partial column matches: {matches}/{len(cols)}")
143
 
144
  # Label matching for left-oriented tables
 
152
  score += (matches / len(labels)) * 30
153
  reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
154
 
155
+ # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
156
  elif spec.get("orientation") == "row1":
157
  labels = [normalize_text(lbl) for lbl in spec["labels"]]
158
  matches = 0
159
  for lbl in labels:
160
+ # More flexible matching for vehicle tables
161
  if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
162
  matches += 1
163
+ # Also check for partial keyword matches
164
+ elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
165
+ matches += 0.5 # Partial credit
166
+
167
  if matches > 0:
168
+ score += (matches / len(labels)) * 40 # Higher weight for row1 tables
169
  reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
170
 
171
+ # Special handling for Declaration tables (existing logic)
172
  if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
173
  if "OPERATOR DECLARATION" in context['heading'].upper():
174
  score += 80
 
179
 
180
  if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
181
  if any("MANAGER" in cell.upper() for cell in context['all_cells']):
182
+ score -= 50
183
  reasons.append("Penalty: Manager found (not auditor)")
184
 
185
  return score, reasons
 
245
  return result
246
 
247
  def extract_table_data(tbl, schema_name, spec):
248
+ """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
249
+
250
+ # 🎯 SPECIAL HANDLING for Vehicle Registration tables
251
+ if "Vehicle Registration" in schema_name:
252
+ print(f" πŸš— EXTRACTION FIX: Processing Vehicle Registration table")
253
+
254
+ labels = spec["labels"]
255
+ collected = {lbl: [] for lbl in labels}
256
+ seen = {lbl: set() for lbl in labels}
257
+
258
+ # For Vehicle Registration, orientation is "row1" - headers in first row
259
+ if len(tbl.rows) < 2:
260
+ print(f" ❌ Vehicle table has less than 2 rows")
261
+ return {}
262
+
263
+ # Map header cells to labels
264
+ header_row = tbl.rows[0]
265
+ column_mapping = {}
266
+
267
+ print(f" πŸ“‹ Mapping {len(header_row.cells)} header cells to labels")
268
+
269
+ for col_idx, cell in enumerate(header_row.cells):
270
+ header_text = normalize_text(cell.text).strip()
271
+ if not header_text:
272
+ continue
273
+
274
+ print(f" Column {col_idx}: '{header_text}'")
275
+
276
+ # Find best matching label
277
+ best_match = None
278
+ best_score = 0
279
+
280
+ for label in labels:
281
+ # Direct match
282
+ if header_text.upper() == label.upper():
283
+ best_match = label
284
+ best_score = 1.0
285
+ break
286
+
287
+ # Partial keyword matching
288
+ header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
289
+ label_words = set(word.upper() for word in label.split() if len(word) > 2)
290
+
291
+ if header_words and label_words:
292
+ common_words = header_words.intersection(label_words)
293
+ if common_words:
294
+ score = len(common_words) / max(len(header_words), len(label_words))
295
+ if score > best_score and score >= 0.4: # Lower threshold for vehicle tables
296
+ best_score = score
297
+ best_match = label
298
+
299
+ if best_match:
300
+ column_mapping[col_idx] = best_match
301
+ print(f" βœ… Mapped to: '{best_match}' (score: {best_score:.2f})")
302
+ else:
303
+ print(f" ⚠️ No mapping found for '{header_text}'")
304
+
305
+ print(f" πŸ“Š Total column mappings: {len(column_mapping)}")
306
+
307
+ # Extract red text from data rows (skip header)
308
+ for row_idx in range(1, len(tbl.rows)):
309
+ row = tbl.rows[row_idx]
310
+ print(f" πŸ“Œ Processing data row {row_idx}")
311
+
312
+ for col_idx, cell in enumerate(row.cells):
313
+ if col_idx in column_mapping:
314
+ label = column_mapping[col_idx]
315
+
316
+ # Extract red text
317
+ red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
318
+
319
+ if red_txt:
320
+ print(f" πŸ”΄ Found red text in '{label}': '{red_txt}'")
321
+
322
+ if red_txt not in seen[label]:
323
+ seen[label].add(red_txt)
324
+ collected[label].append(red_txt)
325
+
326
+ # Return only non-empty collections
327
+ result = {k: v for k, v in collected.items() if v}
328
+ print(f" βœ… Vehicle Registration extracted: {len(result)} columns with data")
329
+ return result
330
+
331
+ # 🎯 ORIGINAL CODE for all other tables (unchanged)
332
  labels = spec["labels"] + [schema_name]
333
  collected = {lbl: [] for lbl in labels}
334
  seen = {lbl: set() for lbl in labels}
335
  by_col = (spec["orientation"] == "row1")
336
  start_row = 1 if by_col else 0
337
  rows = tbl.rows[start_row:]
338
+
339
  for ri, row in enumerate(rows):
340
  for ci, cell in enumerate(row.cells):
341
  red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()