Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 21

Commit

6f54dab

verified ·

1 Parent(s): 364a368

Update extract_red_text.py

Browse files

Files changed (1) hide show

extract_red_text.py +0 -90

extract_red_text.py CHANGED Viewed

@@ -279,96 +279,6 @@ def extract_red_text(input_doc):
         out["paragraphs"] = paras
     return out
-def handle_management_summary_table(table, flat_json):
-    """Enhanced function to handle Management Summary tables specifically"""
-    replacements_made = 0
-    # Check if this is a Management Summary table
-    table_text = ""
-    for row in table.rows[:3]:
-        for cell in row.cells:
-            table_text += get_clean_text(cell).lower() + " "
-    # Detect which type of management summary
-    management_type = None
-    if "mass management" in table_text and "details" in table_text:
-        management_type = "Mass Management"
-    elif "maintenance management" in table_text and "details" in table_text:
-        management_type = "Maintenance Management"
-    elif "fatigue management" in table_text and "details" in table_text:
-        management_type = "Fatigue Management"
-    if not management_type:
-        return 0
-    print(f"    📋 Detected {management_type} Summary table with DETAILS column")
-    # Process each row to find standards and update DETAILS column
-    for row_idx, row in enumerate(table.rows):
-        if len(row.cells) < 2:
-            continue
-        # Skip header row
-        if row_idx == 0:
-            continue
-        standard_cell = row.cells[0]
-        details_cell = row.cells[1]
-        standard_text = get_clean_text(standard_cell).strip()
-        # Check if this row contains a standard (Std 1., Std 2., etc.)
-        if not re.match(r'Std \d+\.', standard_text):
-            continue
-        print(f"    📌 Processing {standard_text}")
-        # Only process if DETAILS cell has red text
-        if not has_red_text(details_cell):
-            continue
-        # Try multiple approaches to find matching data
-        json_value = None
-        # Approach 1: Try direct standard match in the base management section
-        base_management_data = flat_json.get(management_type, {})
-        if isinstance(base_management_data, dict):
-            for key, value in base_management_data.items():
-                if standard_text in key and isinstance(value, list) and len(value) > 0:
-                    json_value = value
-                    print(f"        ✅ Found match in {management_type}: '{key}'")
-                    break
-        # Approach 2: Try the summary section
-        if json_value is None:
-            summary_section = flat_json.get(f"{management_type} Summary", {})
-            if isinstance(summary_section, dict):
-                for key, value in summary_section.items():
-                    if standard_text in key and isinstance(value, list) and len(value) > 0:
-                        json_value = value
-                        print(f"        ✅ Found match in {management_type} Summary: '{key}'")
-                        break
-        # Approach 3: Try fuzzy matching with all keys
-        if json_value is None:
-            json_value = find_matching_json_value(standard_text, flat_json)
-        # Replace red text if we found data
-        if json_value is not None:
-            replacement_text = get_value_as_string(json_value, standard_text)
-            if isinstance(json_value, list):
-                replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
-            cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
-            replacements_made += cell_replacements
-            if cell_replacements > 0:
-                print(f"        ✅ Updated DETAILS for {standard_text}")
-        else:
-            print(f"        ❌ No data found for {standard_text}")
-    return replacements_made
 def extract_red_text_filelike(input_file, output_file):
     """
     Accepts:

         out["paragraphs"] = paras
     return out
 def extract_red_text_filelike(input_file, output_file):
     """
     Accepts: