Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 22

Commit

24ad2d2

verified ·

1 Parent(s): 8001b1f

Update extract_red_text.py

Browse files

Files changed (1) hide show

extract_red_text.py +119 -91

extract_red_text.py CHANGED Viewed

@@ -1,29 +1,36 @@
 #!/usr/bin/env python3
 import re
 import json
 import sys
 from docx import Document
 from docx.oxml.ns import qn
 from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
-def is_red_font(run):
-    """Enhanced red font detection with better color checking"""
-    col = run.font.color
-    if col and col.rgb:
-        r, g, b = col.rgb
-        if r > 150 and g < 100 and b < 100 and (r-g) > 30 and (r-b) > 30:
-            return True
-    rPr = getattr(run._element, "rPr", None)
-    if rPr is not None:
-        clr = rPr.find(qn('w:color'))
-        if clr is not None:
-            val = clr.get(qn('w:val'))
-            if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
-                rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
-                if rr > 150 and gg < 100 and bb < 100 and (rr-gg) > 30 and (rr-bb) > 30:
-                    return True
-    return False
 def _prev_para_text(tbl):
     """Get text from previous paragraph before table"""
     prev = tbl._tbl.getprevious()
@@ -33,23 +40,30 @@ def _prev_para_text(tbl):
         return ""
     return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
-def normalize_text(text):
-    """Normalize text for better matching"""
-    return re.sub(r'\s+', ' ', text.strip())
 def fuzzy_match_heading(heading, patterns):
     """Check if heading matches any pattern with fuzzy matching"""
-    heading_norm = normalize_text(heading.upper())
     for pattern in patterns:
-        if re.search(pattern, heading_norm, re.IGNORECASE):
-            return True
     return False
 def get_table_context(tbl):
     """Get comprehensive context information for table"""
     heading = normalize_text(_prev_para_text(tbl))
-    headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()]
-    col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells[0].text.strip()]
     first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
     all_cells = []
     for row in tbl.rows:
@@ -67,33 +81,35 @@ def get_table_context(tbl):
         'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
     }
 def calculate_schema_match_score(schema_name, spec, context):
     """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
     score = 0
     reasons = []
-    # 🎯 VEHICLE REGISTRATION BOOST
     if "Vehicle Registration" in schema_name:
         vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
         if keyword_matches >= 2:
-            score += 150  # Very high boost for vehicle tables
             reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
         elif keyword_matches >= 1:
-            score += 75   # Medium boost
             reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
-    # 🎯 SUMMARY TABLE BOOST (existing logic)
     if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
         score += 100
         reasons.append(f"Summary schema with DETAILS column - perfect match")
     if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
         score -= 75
         reasons.append(f"Non-summary schema penalized for DETAILS column presence")
     # Context exclusions
     if spec.get("context_exclusions"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
@@ -101,7 +117,7 @@ def calculate_schema_match_score(schema_name, spec, context):
             if exclusion.lower() in table_text:
                 score -= 50
                 reasons.append(f"Context exclusion penalty: '{exclusion}' found")
     # Context keywords
     if spec.get("context_keywords"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
@@ -109,24 +125,23 @@ def calculate_schema_match_score(schema_name, spec, context):
         for keyword in spec["context_keywords"]:
             if keyword.lower() in table_text:
                 keyword_matches += 1
         if keyword_matches > 0:
             score += keyword_matches * 15
             reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
     # Direct first cell match
     if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
         score += 100
         reasons.append(f"Direct first cell match: '{context['first_cell']}'")
     # Heading pattern matching
     if spec.get("headings"):
         for h in spec["headings"]:
-            if fuzzy_match_heading(context['heading'], [h["text"]]):
                 score += 50
                 reasons.append(f"Heading match: '{context['heading']}'")
                 break
     # Column header matching
     if spec.get("columns"):
         cols = [normalize_text(col) for col in spec["columns"]]
@@ -140,7 +155,7 @@ def calculate_schema_match_score(schema_name, spec, context):
         elif matches > 0:
             score += matches * 20
             reasons.append(f"Partial column matches: {matches}/{len(cols)}")
     # Label matching for left-oriented tables
     if spec.get("orientation") == "left":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
@@ -151,24 +166,21 @@ def calculate_schema_match_score(schema_name, spec, context):
         if matches > 0:
             score += (matches / len(labels)) * 30
             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
-    # 🎯 ENHANCED Label matching for row1-oriented tables (Vehicle Registration)
     elif spec.get("orientation") == "row1":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
         for lbl in labels:
-            # More flexible matching for vehicle tables
             if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
                 matches += 1
-            # Also check for partial keyword matches
             elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
-                matches += 0.5  # Partial credit
         if matches > 0:
-            score += (matches / len(labels)) * 40  # Higher weight for row1 tables
             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
-    # Special handling for Declaration tables (existing logic)
     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if "OPERATOR DECLARATION" in context['heading'].upper():
             score += 80
@@ -176,12 +188,12 @@ def calculate_schema_match_score(schema_name, spec, context):
         elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score += 60
             reasons.append("Manager found in cells (likely Operator Declaration)")
     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score -= 50
             reasons.append("Penalty: Manager found (not auditor)")
     return score, reasons
 def match_table_schema(tbl):
@@ -198,6 +210,9 @@ def match_table_schema(tbl):
         return best_match
     return None
 def check_multi_schema_table(tbl):
     """Check if table contains multiple schemas and split appropriately"""
     context = get_table_context(tbl)
@@ -244,117 +259,107 @@ def extract_multi_schema_table(tbl, schemas):
             result[schema_name] = schema_data
     return result
 def extract_table_data(tbl, schema_name, spec):
     """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
-    # 🎯 SPECIAL HANDLING for Vehicle Registration tables
     if "Vehicle Registration" in schema_name:
         print(f"    🚗 EXTRACTION FIX: Processing Vehicle Registration table")
         labels = spec["labels"]
         collected = {lbl: [] for lbl in labels}
         seen = {lbl: set() for lbl in labels}
-        # For Vehicle Registration, orientation is "row1" - headers in first row
         if len(tbl.rows) < 2:
             print(f"    ❌ Vehicle table has less than 2 rows")
             return {}
-        # Map header cells to labels
         header_row = tbl.rows[0]
         column_mapping = {}
         print(f"    📋 Mapping {len(header_row.cells)} header cells to labels")
         for col_idx, cell in enumerate(header_row.cells):
             header_text = normalize_text(cell.text).strip()
             if not header_text:
                 continue
             print(f"      Column {col_idx}: '{header_text}'")
-            # Find best matching label
             best_match = None
             best_score = 0
             for label in labels:
-                # Direct match
                 if header_text.upper() == label.upper():
                     best_match = label
                     best_score = 1.0
                     break
-                # Partial keyword matching
                 header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
                 label_words = set(word.upper() for word in label.split() if len(word) > 2)
                 if header_words and label_words:
                     common_words = header_words.intersection(label_words)
                     if common_words:
                         score = len(common_words) / max(len(header_words), len(label_words))
-                        if score > best_score and score >= 0.4:  # Lower threshold for vehicle tables
                             best_score = score
                             best_match = label
             if best_match:
                 column_mapping[col_idx] = best_match
                 print(f"        ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
             else:
                 print(f"        ⚠️ No mapping found for '{header_text}'")
         print(f"    📊 Total column mappings: {len(column_mapping)}")
         # Extract red text from data rows (skip header)
         for row_idx in range(1, len(tbl.rows)):
             row = tbl.rows[row_idx]
             print(f"      📌 Processing data row {row_idx}")
             for col_idx, cell in enumerate(row.cells):
                 if col_idx in column_mapping:
                     label = column_mapping[col_idx]
-                    # Extract red text
                     red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
                     if red_txt:
                         print(f"        🔴 Found red text in '{label}': '{red_txt}'")
                         if red_txt not in seen[label]:
                             seen[label].add(red_txt)
                             collected[label].append(red_txt)
-        # Return only non-empty collections
         result = {k: v for k, v in collected.items() if v}
         print(f"    ✅ Vehicle Registration extracted: {len(result)} columns with data")
         return result
-    # 🎯 ORIGINAL CODE for all other tables (unchanged)
-    labels = spec["labels"] + [schema_name]
     collected = {lbl: [] for lbl in labels}
     seen = {lbl: set() for lbl in labels}
-    by_col = (spec["orientation"] == "row1")
     start_row = 1 if by_col else 0
     rows = tbl.rows[start_row:]
     for ri, row in enumerate(rows):
         for ci, cell in enumerate(row.cells):
             red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
             if not red_txt:
                 continue
             if by_col:
-                if ci < len(spec["labels"]):
                     lbl = spec["labels"][ci]
                 else:
                     lbl = schema_name
             else:
                 raw_label = normalize_text(row.cells[0].text)
                 lbl = None
-                for spec_label in spec["labels"]:
                     if normalize_text(spec_label).upper() == raw_label.upper():
                         lbl = spec_label
                         break
                 if not lbl:
-                    for spec_label in spec["labels"]:
                         spec_norm = normalize_text(spec_label).upper()
                         raw_norm = raw_label.upper()
                         if spec_norm in raw_norm or raw_norm in spec_norm:
@@ -367,16 +372,24 @@ def extract_table_data(tbl, schema_name, spec):
                 collected[lbl].append(red_txt)
     return {k: v for k, v in collected.items() if v}
 def extract_red_text(input_doc):
-    # input_doc: docx.Document object or file path
     if isinstance(input_doc, str):
         doc = Document(input_doc)
     else:
         doc = input_doc
     out = {}
     table_count = 0
     for tbl in doc.tables:
         table_count += 1
         multi_schemas = check_multi_schema_table(tbl)
         if multi_schemas:
             multi_data = extract_multi_schema_table(tbl, multi_schemas)
@@ -391,8 +404,10 @@ def extract_red_text(input_doc):
                     else:
                         out[schema_name] = schema_data
             continue
         schema = match_table_schema(tbl)
         if not schema:
             continue
         spec = TABLE_SCHEMAS[schema]
         data = extract_table_data(tbl, schema, spec)
@@ -405,11 +420,15 @@ def extract_red_text(input_doc):
                         out[schema][k] = v
             else:
                 out[schema] = data
     paras = {}
     for idx, para in enumerate(doc.paragraphs):
         red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
         if not red_txt:
             continue
         context = None
         for j in range(idx-1, -1, -1):
             txt = normalize_text(doc.paragraphs[j].text)
@@ -418,15 +437,22 @@ def extract_red_text(input_doc):
                 if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
                     context = txt
                     break
         if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
             context = "Date"
         if not context:
             context = "(para)"
         paras.setdefault(context, []).append(red_txt)
     if paras:
         out["paragraphs"] = paras
     return out
 def extract_red_text_filelike(input_file, output_file):
     """
     Accepts:
@@ -445,8 +471,10 @@ def extract_red_text_filelike(input_file, output_file):
             json.dump(result, f, indent=2, ensure_ascii=False)
     return result
 if __name__ == "__main__":
-    # Support both script and app/file-like usage
     if len(sys.argv) == 3:
         input_docx = sys.argv[1]
         output_json = sys.argv[2]

 #!/usr/bin/env python3
+"""
+extract_red_text.py
+Improved version that reuses hf_utils for shared heuristics while preserving
+the original schema logic, logging and behavior.
+"""
 import re
 import json
 import sys
 from docx import Document
 from docx.oxml.ns import qn
+# master schema & patterns (unchanged)
 from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
+# canonical helpers (from your new hf_utils.py)
+from hf_utils import (
+    is_red_font,
+    normalize_text,
+    normalize_header_text,
+    flatten_json,
+    find_matching_json_key_and_value,
+    get_clean_text,
+    has_red_text,
+    extract_red_text_segments,
+    replace_red_text_in_cell,
+    key_is_forbidden_for_position,
+)
+# -------------------------------------------------------------------
+# Small XML helper (kept exactly as before — low-level)
+# -------------------------------------------------------------------
 def _prev_para_text(tbl):
     """Get text from previous paragraph before table"""
     prev = tbl._tbl.getprevious()
         return ""
     return "".join(node.text for node in prev.iter() if node.tag.endswith("}t") and node.text).strip()
+# -------------------------------------------------------------------
+# Table context helpers (use normalize_text from hf_utils)
+# -------------------------------------------------------------------
 def fuzzy_match_heading(heading, patterns):
     """Return True if the heading matches any of the given patterns.

     The heading is passed through ``normalize_text`` and upper-cased, then
     each pattern is tried against it as a case-insensitive regular
     expression. A pattern that is not a valid regular expression falls back
     to a plain upper-cased substring test.

     Args:
         heading: Heading text to test. A falsy heading never matches.
         patterns: Iterable of pattern strings. Empty or ``None`` entries are
             skipped: an empty regex matches every string, so a heading spec
             with no text would otherwise count as a match for any heading.

     Returns:
         bool: True as soon as one pattern matches, otherwise False.
     """
     if not heading:
         return False
     # Drop blank patterns up front; nothing to do if none remain.
     usable = [p for p in patterns if p]
     if not usable:
         return False
     heading_norm = normalize_text(heading).upper()
     for pattern in usable:
         try:
             if re.search(pattern, heading_norm, re.IGNORECASE):
                 return True
         except re.error:
             # Not a valid regex: treat the pattern as literal text instead.
             if pattern.upper() in heading_norm:
                 return True
     return False
 def get_table_context(tbl):
     """Get comprehensive context information for table"""
     heading = normalize_text(_prev_para_text(tbl))
+    # first row headers
+    headers = [normalize_text(c.text) for c in tbl.rows[0].cells if c.text.strip()] if tbl.rows else []
+    col0 = [normalize_text(r.cells[0].text) for r in tbl.rows if r.cells and r.cells[0].text.strip()]
     first_cell = normalize_text(tbl.rows[0].cells[0].text) if tbl.rows else ""
     all_cells = []
     for row in tbl.rows:
         'num_cols': len(tbl.rows[0].cells) if tbl.rows else 0
     }
+# -------------------------------------------------------------------
+# Scoring / matching logic (kept your behavior but using normalize_text)
+# -------------------------------------------------------------------
 def calculate_schema_match_score(schema_name, spec, context):
     """Enhanced calculate match score - IMPROVED for Vehicle Registration tables"""
     score = 0
     reasons = []
+    # VEHICLE REGISTRATION BOOST
     if "Vehicle Registration" in schema_name:
         vehicle_keywords = ["registration", "vehicle", "sub-contractor", "weight verification", "rfs suspension"]
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         keyword_matches = sum(1 for keyword in vehicle_keywords if keyword in table_text)
         if keyword_matches >= 2:
+            score += 150
             reasons.append(f"Vehicle Registration keywords: {keyword_matches}/5")
         elif keyword_matches >= 1:
+            score += 75
             reasons.append(f"Some Vehicle Registration keywords: {keyword_matches}/5")
+    # SUMMARY TABLE BOOST (existing logic)
     if "Summary" in schema_name and "details" in " ".join(context['headers']).lower():
         score += 100
         reasons.append(f"Summary schema with DETAILS column - perfect match")
     if "Summary" not in schema_name and "details" in " ".join(context['headers']).lower():
         score -= 75
         reasons.append(f"Non-summary schema penalized for DETAILS column presence")
     # Context exclusions
     if spec.get("context_exclusions"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
             if exclusion.lower() in table_text:
                 score -= 50
                 reasons.append(f"Context exclusion penalty: '{exclusion}' found")
     # Context keywords
     if spec.get("context_keywords"):
         table_text = " ".join(context['headers']).lower() + " " + context['heading'].lower()
         for keyword in spec["context_keywords"]:
             if keyword.lower() in table_text:
                 keyword_matches += 1
         if keyword_matches > 0:
             score += keyword_matches * 15
             reasons.append(f"Context keyword matches: {keyword_matches}/{len(spec['context_keywords'])}")
     # Direct first cell match
     if context['first_cell'] and context['first_cell'].upper() == schema_name.upper():
         score += 100
         reasons.append(f"Direct first cell match: '{context['first_cell']}'")
     # Heading pattern matching
     if spec.get("headings"):
         for h in spec["headings"]:
+            if fuzzy_match_heading(context['heading'], [h.get("text", "")]):
                 score += 50
                 reasons.append(f"Heading match: '{context['heading']}'")
                 break
     # Column header matching
     if spec.get("columns"):
         cols = [normalize_text(col) for col in spec["columns"]]
         elif matches > 0:
             score += matches * 20
             reasons.append(f"Partial column matches: {matches}/{len(cols)}")
     # Label matching for left-oriented tables
     if spec.get("orientation") == "left":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         if matches > 0:
             score += (matches / len(labels)) * 30
             reasons.append(f"Left orientation label matches: {matches}/{len(labels)}")
+    # Enhanced Label matching for row1-oriented tables (Vehicle Registration)
     elif spec.get("orientation") == "row1":
         labels = [normalize_text(lbl) for lbl in spec["labels"]]
         matches = 0
         for lbl in labels:
             if any(lbl.upper() in h.upper() or h.upper() in lbl.upper() for h in context['headers']):
                 matches += 1
             elif any(word.upper() in " ".join(context['headers']).upper() for word in lbl.split() if len(word) > 3):
+                matches += 0.5
         if matches > 0:
+            score += (matches / len(labels)) * 40
             reasons.append(f"Row1 orientation header matches: {matches}/{len(labels)}")
+    # Special handling for Declaration tables
     if schema_name == "Operator Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if "OPERATOR DECLARATION" in context['heading'].upper():
             score += 80
         elif any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score += 60
             reasons.append("Manager found in cells (likely Operator Declaration)")
     if schema_name == "NHVAS Approved Auditor Declaration" and context['first_cell'].upper() == "PRINT NAME":
         if any("MANAGER" in cell.upper() for cell in context['all_cells']):
             score -= 50
             reasons.append("Penalty: Manager found (not auditor)")
     return score, reasons
 def match_table_schema(tbl):
         return best_match
     return None
+# -------------------------------------------------------------------
+# Multi-schema detection & extraction (kept behavior)
+# -------------------------------------------------------------------
 def check_multi_schema_table(tbl):
     """Check if table contains multiple schemas and split appropriately"""
     context = get_table_context(tbl)
             result[schema_name] = schema_data
     return result
+# -------------------------------------------------------------------
+# Table extraction for schemas (kept your specialized vehicle handling)
+# -------------------------------------------------------------------
 def extract_table_data(tbl, schema_name, spec):
     """Extract red text data from table based on schema - ENHANCED for Vehicle Registration"""
+    # Special handling for vehicle registration tables
     if "Vehicle Registration" in schema_name:
         print(f"    🚗 EXTRACTION FIX: Processing Vehicle Registration table")
         labels = spec["labels"]
         collected = {lbl: [] for lbl in labels}
         seen = {lbl: set() for lbl in labels}
         if len(tbl.rows) < 2:
             print(f"    ❌ Vehicle table has less than 2 rows")
             return {}
         header_row = tbl.rows[0]
         column_mapping = {}
         print(f"    📋 Mapping {len(header_row.cells)} header cells to labels")
         for col_idx, cell in enumerate(header_row.cells):
             header_text = normalize_text(cell.text).strip()
             if not header_text:
                 continue
             print(f"      Column {col_idx}: '{header_text}'")
             best_match = None
             best_score = 0
             for label in labels:
                 if header_text.upper() == label.upper():
                     best_match = label
                     best_score = 1.0
                     break
                 header_words = set(word.upper() for word in header_text.split() if len(word) > 2)
                 label_words = set(word.upper() for word in label.split() if len(word) > 2)
                 if header_words and label_words:
                     common_words = header_words.intersection(label_words)
                     if common_words:
                         score = len(common_words) / max(len(header_words), len(label_words))
+                        if score > best_score and score >= 0.4:
                             best_score = score
                             best_match = label
             if best_match:
                 column_mapping[col_idx] = best_match
                 print(f"        ✅ Mapped to: '{best_match}' (score: {best_score:.2f})")
             else:
                 print(f"        ⚠️ No mapping found for '{header_text}'")
         print(f"    📊 Total column mappings: {len(column_mapping)}")
         # Extract red text from data rows (skip header)
         for row_idx in range(1, len(tbl.rows)):
             row = tbl.rows[row_idx]
             print(f"      📌 Processing data row {row_idx}")
             for col_idx, cell in enumerate(row.cells):
                 if col_idx in column_mapping:
                     label = column_mapping[col_idx]
                     red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
                     if red_txt:
                         print(f"        🔴 Found red text in '{label}': '{red_txt}'")
                         if red_txt not in seen[label]:
                             seen[label].add(red_txt)
                             collected[label].append(red_txt)
         result = {k: v for k, v in collected.items() if v}
         print(f"    ✅ Vehicle Registration extracted: {len(result)} columns with data")
         return result
+    # FALLBACK: original extraction logic for other tables
+    labels = spec.get("labels", []) + [schema_name]
     collected = {lbl: [] for lbl in labels}
     seen = {lbl: set() for lbl in labels}
+    by_col = (spec.get("orientation") == "row1")
     start_row = 1 if by_col else 0
     rows = tbl.rows[start_row:]
     for ri, row in enumerate(rows):
         for ci, cell in enumerate(row.cells):
             red_txt = "".join(run.text for p in cell.paragraphs for run in p.runs if is_red_font(run)).strip()
             if not red_txt:
                 continue
             if by_col:
+                if ci < len(spec.get("labels", [])):
                     lbl = spec["labels"][ci]
                 else:
                     lbl = schema_name
             else:
                 raw_label = normalize_text(row.cells[0].text)
                 lbl = None
+                for spec_label in spec.get("labels", []):
                     if normalize_text(spec_label).upper() == raw_label.upper():
                         lbl = spec_label
                         break
                 if not lbl:
+                    for spec_label in spec.get("labels", []):
                         spec_norm = normalize_text(spec_label).upper()
                         raw_norm = raw_label.upper()
                         if spec_norm in raw_norm or raw_norm in spec_norm:
                 collected[lbl].append(red_txt)
     return {k: v for k, v in collected.items() if v}
+# -------------------------------------------------------------------
+# Main extraction: iterate tables & paragraphs
+# -------------------------------------------------------------------
 def extract_red_text(input_doc):
+    """
+    input_doc: docx.Document object or file path
+    returns: dict
+    """
     if isinstance(input_doc, str):
         doc = Document(input_doc)
     else:
         doc = input_doc
     out = {}
     table_count = 0
     for tbl in doc.tables:
         table_count += 1
+        # Check multi-schema table first
         multi_schemas = check_multi_schema_table(tbl)
         if multi_schemas:
             multi_data = extract_multi_schema_table(tbl, multi_schemas)
                     else:
                         out[schema_name] = schema_data
             continue
         schema = match_table_schema(tbl)
         if not schema:
+            # keep scanning for tables even if no schema matched
             continue
         spec = TABLE_SCHEMAS[schema]
         data = extract_table_data(tbl, schema, spec)
                         out[schema][k] = v
             else:
                 out[schema] = data
+    # paragraphs
     paras = {}
     for idx, para in enumerate(doc.paragraphs):
         red_txt = "".join(r.text for r in para.runs if is_red_font(r)).strip()
         if not red_txt:
             continue
+        # find context heading by scanning backward
         context = None
         for j in range(idx-1, -1, -1):
             txt = normalize_text(doc.paragraphs[j].text)
                 if any(re.search(p, txt, re.IGNORECASE) for p in all_patterns):
                     context = txt
                     break
+        # if it's date-like and matches date pattern, set context to Date
         if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
             context = "Date"
         if not context:
             context = "(para)"
         paras.setdefault(context, []).append(red_txt)
     if paras:
         out["paragraphs"] = paras
     return out
+# -------------------------------------------------------------------
+# File-like wrapper (keeps API used elsewhere)
+# -------------------------------------------------------------------
 def extract_red_text_filelike(input_file, output_file):
     """
     Accepts:
             json.dump(result, f, indent=2, ensure_ascii=False)
     return result
+# -------------------------------------------------------------------
+# CLI entrypoint (preserve original UX)
+# -------------------------------------------------------------------
 if __name__ == "__main__":
     if len(sys.argv) == 3:
         input_docx = sys.argv[1]
         output_json = sys.argv[2]