Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 commited on Aug 22

Commit

0d57117

verified ·

1 Parent(s): 4091415

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +75 -69

updated_word.py CHANGED Viewed

@@ -570,7 +570,7 @@ def handle_attendance_list_table_enhanced(table, flat_json):
     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
-    """FIXED: Enhanced management summary processing that handles both dict and flattened JSON structures"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
@@ -600,96 +600,86 @@ def fix_management_summary_details_column(table, flat_json):
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
-        # FIXED: Build management data dict from multiple sources
         mgmt_data = {}
-        # Strategy 1: Look for direct nested dict in original JSON (before flattening)
-        if mgmt_type in flat_json:
-            direct_data = flat_json[mgmt_type]
-            if isinstance(direct_data, dict):
-                mgmt_data = direct_data
-                print(f"    ✅ Found direct nested dict for: '{mgmt_type}' with {len(mgmt_data)} standards")
-        # Strategy 2: Look for flattened keys like "Mass Management Summary.Std 5. Verification"
-        if not mgmt_data:
-            for key, value in flat_json.items():
-                if key.startswith(mgmt_type + "."):
-                    # Extract the standard part (after the management type)
-                    std_key = key[len(mgmt_type) + 1:]  # Remove "Mass Management Summary." prefix
-                    mgmt_data[std_key] = value
-                    print(f"    ✅ Found flattened standard: '{std_key}' = {value}")
-            if mgmt_data:
-                print(f"    ✅ Collected {len(mgmt_data)} standards from flattened keys for {mgmt_type}")
-        # Strategy 3: Search for keys that contain the management type
-        if not mgmt_data:
-            for key, value in flat_json.items():
-                if mgmt_type.lower().replace(" ", "") in key.lower().replace(" ", ""):
-                    if isinstance(value, dict):
-                        mgmt_data = value
-                        print(f"    ✅ Found data using key variation: '{key}'")
-                        break
         if not mgmt_data:
             print(f"    ⚠️ No JSON data found for {mgmt_type}")
             continue
         print(f"    📋 Processing {mgmt_type} with standards: {list(mgmt_data.keys())}")
-        # Process the table rows
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) >= 2:
                 standard_cell = row.cells[0]
                 details_cell = row.cells[1]
-                standard_text = get_clean_text(standard_cell).strip().lower()
                 # Skip header rows
-                if "standard" in standard_text or "requirement" in standard_text or "details" in standard_text:
                     continue
                 if not has_red_text(details_cell):
                     continue
-                print(f"      🔍 Processing standard: '{standard_text}'")
-                # FIXED: Better standard matching with multiple strategies
                 replacement_value = None
-                # Strategy 1: Direct standard matching
-                if "std 1" in standard_text and ("daily" in standard_text or "check" in standard_text):
-                    replacement_value = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Std 1", "Daily Check"])
-                    print(f"      🎯 Looking for Std 1 Daily Check")
-                elif "std 5" in standard_text:
-                    if "mass" in mgmt_type.lower():
-                        replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Std 5", "Verification"])
-                        print(f"      🎯 Looking for Std 5 Verification (Mass)")
-                    else:
-                        replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Internal Review", "Std 5", "Internal Review"])
-                        print(f"      🎯 Looking for Std 5 Internal Review (Fatigue)")
-                elif "std 6" in standard_text or "internal review" in standard_text:
-                    replacement_value = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 6", "Internal Review"])
-                    print(f"      🎯 Looking for Std 6 Internal Review")
-                elif "std 7" in standard_text:
-                    replacement_value = find_best_standard_value(mgmt_data, ["Std 7. Internal Review", "Std 7", "Internal Review"])
-                    print(f"      🎯 Looking for Std 7 Internal Review")
-                # Strategy 2: Fuzzy matching if direct doesn't work
                 if not replacement_value:
-                    print(f"      🔍 No direct match, trying fuzzy matching...")
                     for std_key, std_value in mgmt_data.items():
-                        std_key_lower = std_key.lower()
-                        if "std" in standard_text:
-                            # Extract std number from both
-                            std_match = re.search(r'std\s*(\d+)', standard_text)
-                            key_match = re.search(r'std\s*(\d+)', std_key_lower)
-                            if std_match and key_match and std_match.group(1) == key_match.group(1):
-                                replacement_value = std_value
-                                print(f"      ✅ Fuzzy matched by std number: {std_key}")
-                                break
                 # Apply replacement if found
                 if replacement_value:
@@ -702,18 +692,24 @@ def fix_management_summary_details_column(table, flat_json):
                     else:
                         replacement_text = str(replacement_value)
                     cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                     replacements_made += cell_replacements
-                    if cell_replacements:
-                        print(f"      ✅ Replaced '{standard_text}' details in {mgmt_type} with: '{replacement_text[:50]}...'")
                 else:
                     print(f"      ⚠️ No replacement found for '{standard_text}' in {mgmt_type}")
                     print(f"      📋 Available standards: {list(mgmt_data.keys())}")
     return replacements_made
 def find_best_standard_value(mgmt_data, candidate_keys):
-    """FIXED: Find the best matching value for a standard from management data"""
     print(f"        🔍 Searching for candidates: {candidate_keys}")
     print(f"        📋 In available keys: {list(mgmt_data.keys())}")
@@ -730,13 +726,23 @@ def find_best_standard_value(mgmt_data, candidate_keys):
                 print(f"        ✅ Case-insensitive match found: '{key}' for '{candidate}'")
                 return value
-    # Partial match
     for candidate in candidate_keys:
         for key, value in mgmt_data.items():
             if candidate.lower() in key.lower() or key.lower() in candidate.lower():
                 print(f"        ✅ Partial match found: '{key}' for '{candidate}'")
                 return value
     print(f"        ❌ No match found for any candidate")
     return None

     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
+    """FINAL FIX: Enhanced management summary processing that processes ALL standards correctly"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
+        # FIXED: Build management data dict from flattened keys
         mgmt_data = {}
+        # Look for flattened keys like "Mass Management Summary.Std 5. Verification"
+        for key, value in flat_json.items():
+            if key.startswith(mgmt_type + "."):
+                # Extract the standard part (after the management type)
+                std_key = key[len(mgmt_type) + 1:]  # Remove "Mass Management Summary." prefix
+                mgmt_data[std_key] = value
+                print(f"    ✅ Found flattened standard: '{std_key}' = {value}")
         if not mgmt_data:
             print(f"    ⚠️ No JSON data found for {mgmt_type}")
             continue
         print(f"    📋 Processing {mgmt_type} with standards: {list(mgmt_data.keys())}")
+        # Process the table rows - FIXED: Better row processing
         for row_idx, row in enumerate(table.rows):
             if len(row.cells) >= 2:
                 standard_cell = row.cells[0]
                 details_cell = row.cells[1]
+                standard_text = get_clean_text(standard_cell).strip()
+                standard_text_lower = standard_text.lower()
                 # Skip header rows
+                if any(header in standard_text_lower for header in ["standard", "requirement", "details", "management"]):
                     continue
+                # Only process cells with red text in details column
                 if not has_red_text(details_cell):
                     continue
+                print(f"      🔍 Processing row {row_idx + 1}: '{standard_text}'")
+                # FIXED: Comprehensive standard matching
                 replacement_value = None
+                matched_std = None
+                # Strategy 1: Extract standard number and match
+                std_match = re.search(r'std\s*(\d+)', standard_text_lower)
+                if std_match:
+                    std_num = std_match.group(1)
+                    print(f"      🎯 Looking for Standard {std_num}")
+                    # Look for matching standard in mgmt_data
+                    for std_key, std_value in mgmt_data.items():
+                        if f"std {std_num}" in std_key.lower():
+                            replacement_value = std_value
+                            matched_std = std_key
+                            print(f"      ✅ Found match by std number: '{std_key}'")
+                            break
+                # Strategy 2: Keyword-based matching if std number doesn't work
                 if not replacement_value:
+                    if "daily" in standard_text_lower and "check" in standard_text_lower:
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Daily Check"])
+                        matched_std = "Daily Check related"
+                    elif "verification" in standard_text_lower:
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Verification"])
+                        matched_std = "Verification related"
+                    elif "internal review" in standard_text_lower:
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 7. Internal Review", "Std 5. Internal Review", "Internal Review"])
+                        matched_std = "Internal Review related"
+                    elif "fault" in standard_text_lower and "recording" in standard_text_lower:
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 2. Fault Recording", "Fault Recording"])
+                        matched_std = "Fault Recording related"
+                    elif "fault" in standard_text_lower and "repair" in standard_text_lower:
+                        replacement_value = find_best_standard_value(mgmt_data, ["Std 3. Fault Repair", "Fault Repair"])
+                        matched_std = "Fault Repair related"
+                # Strategy 3: Try all available standards if nothing specific matches
+                if not replacement_value and mgmt_data:
+                    print(f"      🔍 No specific match, trying all available standards...")
+                    # Just take the first available standard for this row
                     for std_key, std_value in mgmt_data.items():
+                        replacement_value = std_value
+                        matched_std = std_key
+                        print(f"      ⚡ Using available standard: '{std_key}'")
+                        break
                 # Apply replacement if found
                 if replacement_value:
                     else:
                         replacement_text = str(replacement_value)
+                    print(f"      🎯 About to replace red text with: '{replacement_text[:100]}...'")
                     cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                     replacements_made += cell_replacements
+                    if cell_replacements > 0:
+                        print(f"      ✅ SUCCESSFULLY replaced '{standard_text}' details in {mgmt_type}")
+                        print(f"      📋 Used data from: '{matched_std}'")
+                    else:
+                        print(f"      ❌ Failed to replace red text in cell")
                 else:
                     print(f"      ⚠️ No replacement found for '{standard_text}' in {mgmt_type}")
                     print(f"      📋 Available standards: {list(mgmt_data.keys())}")
+    print(f"    📊 Total management summary replacements: {replacements_made}")
     return replacements_made
 def find_best_standard_value(mgmt_data, candidate_keys):
+    """ENHANCED: Find the best matching value for a standard from management data"""
     print(f"        🔍 Searching for candidates: {candidate_keys}")
     print(f"        📋 In available keys: {list(mgmt_data.keys())}")
                 print(f"        ✅ Case-insensitive match found: '{key}' for '{candidate}'")
                 return value
+    # Partial match (contains)
     for candidate in candidate_keys:
         for key, value in mgmt_data.items():
             if candidate.lower() in key.lower() or key.lower() in candidate.lower():
                 print(f"        ✅ Partial match found: '{key}' for '{candidate}'")
                 return value
+    # Extract number and match by number
+    for candidate in candidate_keys:
+        candidate_num = re.search(r'(\d+)', candidate)
+        if candidate_num:
+            for key, value in mgmt_data.items():
+                key_num = re.search(r'(\d+)', key)
+                if key_num and candidate_num.group(1) == key_num.group(1):
+                    print(f"        ✅ Number match found: '{key}' for '{candidate}'")
+                    return value
     print(f"        ❌ No match found for any candidate")
     return None