Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 21

Commit

1487325

verified ·

1 Parent(s): d4200b4

Update extract_red_text.py

Browse files

Files changed (1) hide show

extract_red_text.py +117 -16

extract_red_text.py CHANGED Viewed

@@ -68,21 +68,33 @@ def get_table_context(tbl):
     }
 def calculate_schema_match_score(schema_name, spec, context):
-    """Enhanced calculate match score for a schema against table context with Summary table detection"""
     score = 0
     reasons = []
-    # 🎯 CRITICAL: Boost Summary schemas when DETAILS column is detected
     if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
-        score += 100  # Very high boost for summary tables with DETAILS column
         reasons.append(f"Summary schema with DETAILS column - perfect match")
-    # 🎯 CRITICAL: Heavily penalize non-Summary schemas when DETAILS column is present
     if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
-        score -= 75  # Heavy penalty to prevent basic schemas from matching summary tables
         reasons.append(f"Non-summary schema penalized for DETAILS column presence")
-    # Check for context exclusions (prevents basic Management from matching Summary tables)
     if spec.get("context_exclusions"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         for exclusion in spec["context_exclusions"]:
@@ -90,7 +102,7 @@ def calculate_schema_match_score(schema_name, spec, context):
                 score -= 50
                 reasons.append(f"Context exclusion penalty: '{exclusion}' found")
-    # Check for context keywords (boosts matching for relevant tables)
     if spec.get("context_keywords"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         keyword_matches = 0
@@ -99,7 +111,7 @@ def calculate_schema_match_score(schema_name, spec, context):
                 keyword_matches += 1
         if keyword_matches > 0:
-            score += keyword_matches * 15  # Boost for each matching keyword
             reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
     # Direct first cell match
@@ -115,7 +127,7 @@ def calculate_schema_match_score(schema_name, spec, context):
                 reasons.append(f"Heading match: '{context['heading']}'")
                 break
-    # Column header matching (important for Summary tables)
     if spec.get("columns"):
         cols = [normalize_text(col) for col in spec["columns"]]
         matches = 0
@@ -123,10 +135,10 @@ def calculate_schema_match_score(schema_name, spec, context):
             if any(col.upper() in h.upper() for h in context['headers']):
                 matches += 1
         if matches == len(cols):
-            score += 60  # High boost for exact column matches
             reasons.append(f"All column headers match: {cols}")
         elif matches > 0:
-            score += matches * 20  # Partial column matches
             reasons.append(f"Partial column matches: {matches}/{len(cols)}")
     # Label matching for left-oriented tables
@@ -140,18 +152,23 @@ def calculate_schema_match_score(schema_name, spec, context):
             score += (matches / len(labels)) * 30
             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
-    # Label matching for row1-oriented tables
     elif spec.get("orientation") == "row1":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
         for lbl in labels:
             if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
                 matches += 1
         if matches > 0:
-            score += (matches / len(labels)) * 30
             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
-    # Special handling for Declaration tables
     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if "OPERATOR DECLARATION" in context['heading'].upper():
             score += 80
@@ -162,7 +179,7 @@ def calculate_schema_match_score(schema_name, spec, context):
     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
-            score -= 50  # Penalty because auditors shouldn't be managers
             reasons.append("Penalty: Manager found (not auditor)")
     return score, reasons
@@ -228,13 +245,97 @@ def extract_multi_schema_table(tbl, schemas):
     return result
 def extract_table_data(tbl, schema_name, spec):
-    """Extract red text data from table based on schema"""
     labels = spec["labels"] + [schema_name]
     collected = {lbl: [] for lbl in labels}
     seen = {lbl: set() for lbl in labels}
     by_col = (spec["orientation"] == "row1")
     start_row = 1 if by_col else 0
     rows = tbl.rows[start_row:]
     for ri, row in enumerate(rows):
         for ci, cell in enumerate(row.cells):
             red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()

     }
 def calculate_schema_match_score(schema_name, spec, context):
+    """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
     score = 0
     reasons = []
+    # 🎯 VEHICLE REGISTRATION BOOST
+    if "Vehicle Registration" in schema_name:
+        vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
+        table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
+        keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
+        if keyword_matches >= 2:
+            score += 150  # Very high boost for vehicle tables
+            reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
+        elif keyword_matches >= 1:
+            score += 75   # Medium boost
+            reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
+    # 🎯 SUMMARY TABLE BOOST (existing logic)
     if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
+        score += 100
         reasons.append(f"Summary schema with DETAILS column - perfect match")
     if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
+        score -= 75
         reasons.append(f"Non-summary schema penalized for DETAILS column presence")
+    # Context exclusions
     if spec.get("context_exclusions"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         for exclusion in spec["context_exclusions"]:
                 score -= 50
                 reasons.append(f"Context exclusion penalty: '{exclusion}' found")
+    # Context keywords
     if spec.get("context_keywords"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         keyword_matches = 0
                 keyword_matches += 1
         if keyword_matches > 0:
+            score += keyword_matches * 15
             reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
     # Direct first cell match
                 reasons.append(f"Heading match: '{context['heading']}'")
                 break
+    # Column header matching
     if spec.get("columns"):
         cols = [normalize_text(col) for col in spec["columns"]]
         matches = 0
             if any(col.upper() in h.upper() for h in context['headers']):
                 matches += 1
         if matches == len(cols):
+            score += 60
             reasons.append(f"All column headers match: {cols}")
         elif matches > 0:
+            score += matches * 20
             reasons.append(f"Partial column matches: {matches}/{len(cols)}")
     # Label matching for left-oriented tables
             score += (matches / len(labels)) * 30
             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
+    # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
     elif spec.get("orientation") == "row1":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
         for lbl in labels:
+            # More flexible matching for vehicle tables
             if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
                 matches += 1
+            # Also check for partial keyword matches
+            elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
+                matches += 0.5  # Partial credit
         if matches > 0:
+            score += (matches / len(labels)) * 40  # Higher weight for row1 tables
             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
+    # Special handling for Declaration tables (existing logic)
     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if "OPERATOR DECLARATION" in context['heading'].upper():
             score += 80
     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
+            score -= 50
             reasons.append("Penalty: Manager found (not auditor)")
     return score, reasons
     return result
 def extract_table_data(tbl, schema_name, spec):
+    """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
+    # 🎯 SPECIAL HANDLING for Vehicle Registration tables
+    if "Vehicle Registration" in schema_name:
+        print(f"    🚗 EXTRACTION FIX: Processing Vehicle Registration table")
+        labels = spec["labels"]
+        collected = {lbl: [] for lbl in labels}
+        seen = {lbl: set() for lbl in labels}
+        # For Vehicle Registration, orientation is "row1" - headers in first row
+        if len(tbl.rows) < 2:
+            print(f"    ❌ Vehicle table has less than 2 rows")
+            return {}
+        # Map header cells to labels
+        header_row = tbl.rows[0]
+        column_mapping = {}
+        print(f"    📋 Mapping {len(header_row.cells)} header cells to labels")
+        for col_idx, cell in enumerate(header_row.cells):
+            header_text = normalize_text(cell.text).strip()
+            if not header_text:
+                continue
+            print(f"      Column {col_idx}: '{header_text}'")
+            # Find best matching label
+            best_match = None
+            best_score = 0
+            for label in labels:
+                # Direct match
+                if header_text.upper() == label.upper():
+                    best_match = label
+                    best_score = 1.0
+                    break
+                # Partial keyword matching
+                header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
+                label_words = set(word.upper() for word in label.split() if len(word) > 2)
+                if header_words and label_words:
+                    common_words = header_words.intersection(label_words)
+                    if common_words:
+                        score = len(common_words) / max(len(header_words), len(label_words))
+                        if score > best_score and score >= 0.4:  # Lower threshold for vehicle tables
+                            best_score = score
+                            best_match = label
+            if best_match:
+                column_mapping[col_idx] = best_match
+                print(f"        ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
+            else:
+                print(f"        ⚠️ No mapping found for '{header_text}'")
+        print(f"    📊 Total column mappings: {len(column_mapping)}")
+        # Extract red text from data rows (skip header)
+        for row_idx in range(1, len(tbl.rows)):
+            row = tbl.rows[row_idx]
+            print(f"      📌 Processing data row {row_idx}")
+            for col_idx, cell in enumerate(row.cells):
+                if col_idx in column_mapping:
+                    label = column_mapping[col_idx]
+                    # Extract red text
+                    red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
+                    if red_txt:
+                        print(f"        🔴 Found red text in '{label}': '{red_txt}'")
+                        if red_txt not in seen[label]:
+                            seen[label].add(red_txt)
+                            collected[label].append(red_txt)
+        # Return only non-empty collections
+        result = {k: v for k, v in collected.items() if v}
+        print(f"    ✅ Vehicle Registration extracted: {len(result)} columns with data")
+        return result
+    # 🎯 ORIGINAL CODE for all other tables (unchanged)
     labels = spec["labels"] + [schema_name]
     collected = {lbl: [] for lbl in labels}
     seen = {lbl: set() for lbl in labels}
     by_col = (spec["orientation"] == "row1")
     start_row = 1 if by_col else 0
     rows = tbl.rows[start_row:]
     for ri, row in enumerate(rows):
         for ci, cell in enumerate(row.cells):
             red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()