Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 22

Commit

878a622

verified ·

1 Parent(s): 575fdf9

Update updated_word.py

Browse files

Files changed (1) hide show

updated_word.py +135 -50

updated_word.py CHANGED Viewed

@@ -570,9 +570,10 @@ def handle_attendance_list_table_enhanced(table, flat_json):
     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
-    """DEBUG VERSION: Enhanced management summary processing with detailed debugging - FIXED FOR FLATTENED JSON"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
     # Determine which type of management summary this is
     table_text = ""
@@ -600,24 +601,39 @@ def fix_management_summary_details_column(table, flat_json):
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
-        # Build management data dict from flattened keys
         mgmt_data = {}
         # Look for flattened keys like "Mass Management Summary.Std 5. Verification"
         for key, value in flat_json.items():
             if key.startswith(mgmt_type + "."):
                 # Extract the standard part (after the management type)
                 std_key = key[len(mgmt_type) + 1:]  # Remove "Mass Management Summary." prefix
-                mgmt_data[std_key] = value
-                print(f"    ✅ Found flattened standard: '{std_key}' = {value}")
         if not mgmt_data:
-            print(f"    ⚠️ No JSON data found for {mgmt_type}")
             continue
-        print(f"    📋 Processing {mgmt_type} with standards: {list(mgmt_data.keys())}")
-        # DEBUG: Check every row in the table
         print(f"    🔍 Analyzing all {len(table.rows)} rows in table:")
         for row_idx, row in enumerate(table.rows):
@@ -630,59 +646,94 @@ def fix_management_summary_details_column(table, flat_json):
                 print(f"      📋 Row {row_idx + 1}:")
                 print(f"         📄 Standard: '{standard_text}'")
-                print(f"         📄 Details: '{details_text[:50]}...' (length: {len(details_text)})")
-                print(f"         🔴 Has red text: {has_red_text(details_cell)}")
-                # Skip header rows
-                if any(header in standard_text_lower for header in ["standard", "requirement", "details", "management"]):
                     print(f"         ⏭️ Skipping header row")
                     continue
-                # Check if this row has red text
                 if not has_red_text(details_cell):
-                    print(f"         ⏭️ No red text found, skipping")
                     continue
-                print(f"      🎯 PROCESSING row {row_idx + 1}: '{standard_text}'")
-                # Extract standard number and match
                 replacement_value = None
                 matched_std = None
                 std_match = re.search(r'std\s*(\d+)', standard_text_lower)
                 if std_match:
                     std_num = std_match.group(1)
-                    print(f"      🎯 Looking for Standard {std_num}")
-                    # Look for matching standard in mgmt_data
                     for std_key, std_value in mgmt_data.items():
                         if f"std {std_num}" in std_key.lower():
                             replacement_value = std_value
                             matched_std = std_key
-                            print(f"      ✅ Found match by std number: '{std_key}'")
                             break
-                # Keyword-based matching if std number doesn't work
                 if not replacement_value:
-                    print(f"      🔍 No std number match, trying keyword matching...")
-                    if "daily" in standard_text_lower and "check" in standard_text_lower:
-                        replacement_value = find_best_standard_value(mgmt_data, ["Std 1. Daily Check", "Daily Check"])
-                        matched_std = "Daily Check related"
-                    elif "verification" in standard_text_lower:
-                        replacement_value = find_best_standard_value(mgmt_data, ["Std 5. Verification", "Verification"])
-                        matched_std = "Verification related"
-                    elif "internal review" in standard_text_lower:
-                        replacement_value = find_best_standard_value(mgmt_data, ["Std 6. Internal Review", "Std 7. Internal Review", "Std 5. Internal Review", "Internal Review"])
-                        matched_std = "Internal Review related"
-                # Last resort: use any available standard
-                if not replacement_value and mgmt_data:
-                    print(f"      🔍 No specific match, using first available standard...")
                     for std_key, std_value in mgmt_data.items():
-                        replacement_value = std_value
-                        matched_std = std_key
-                        print(f"      ⚡ Using available standard: '{std_key}'")
-                        break
                 # Apply replacement if found
                 if replacement_value:
@@ -695,36 +746,70 @@ def fix_management_summary_details_column(table, flat_json):
                     else:
                         replacement_text = str(replacement_value)
-                    print(f"      🎯 About to replace red text with: '{replacement_text[:100]}...'")
-                    # DEBUG: Show red text segments before replacement
-                    red_segments = extract_red_text_segments(details_cell)
-                    print(f"      🔍 Found {len(red_segments)} red text segments:")
-                    for i, segment in enumerate(red_segments):
-                        print(f"         Segment {i+1}: '{segment['text'][:50]}...'")
                     cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
                     replacements_made += cell_replacements
                     if cell_replacements > 0:
-                        print(f"      ✅ SUCCESSFULLY replaced '{standard_text}' details in {mgmt_type}")
-                        print(f"      📋 Used data from: '{matched_std}'")
                         # Verify the replacement worked
                         new_details_text = get_clean_text(details_cell).strip()
-                        print(f"      🔍 New details text: '{new_details_text[:100]}...'")
                     else:
                         print(f"      ❌ Failed to replace red text in cell")
-                        print(f"      🔍 Cell still contains: '{get_clean_text(details_cell)[:100]}...'")
                 else:
-                    print(f"      ⚠️ No replacement found for '{standard_text}' in {mgmt_type}")
-                    print(f"      📋 Available standards: {list(mgmt_data.keys())}")
             else:
                 print(f"      ⚠️ Row {row_idx + 1} has insufficient columns ({len(row.cells)})")
-    print(f"    📊 Total management summary replacements: {replacements_made}")
     return replacements_made
 def find_best_standard_value(mgmt_data, candidate_keys):
     """ENHANCED: Find the best matching value for a standard from management data"""
     print(f"        🔍 Searching for candidates: {candidate_keys}")

     return replacements_made
 def fix_management_summary_details_column(table, flat_json):
+    """CORRECTED VERSION: Replace red text with UPDATED values from JSON (not old extracted values)"""
     replacements_made = 0
     print(f"    🎯 FIX: Management Summary DETAILS column processing")
+    print(f"    📋 NOTE: JSON contains UPDATED values to replace red text with")
     # Determine which type of management summary this is
     table_text = ""
     for mgmt_type in mgmt_types:
         print(f"    ✅ Confirmed {mgmt_type} table processing")
+        # Build management data dict from flattened keys - these contain UPDATED values
         mgmt_data = {}
         # Look for flattened keys like "Mass Management Summary.Std 5. Verification"
+        # IMPORTANT: Prioritize longer, more detailed values over shorter ones
         for key, value in flat_json.items():
             if key.startswith(mgmt_type + "."):
                 # Extract the standard part (after the management type)
                 std_key = key[len(mgmt_type) + 1:]  # Remove "Mass Management Summary." prefix
+                # Check if this is a longer, more detailed version than what we already have
+                if std_key in mgmt_data:
+                    # Compare value lengths - prefer longer, more detailed content
+                    existing_value = mgmt_data[std_key]
+                    existing_length = len(str(existing_value)) if not isinstance(existing_value, list) else len(str(existing_value[0]) if existing_value else "")
+                    new_length = len(str(value)) if not isinstance(value, list) else len(str(value[0]) if value else "")
+                    if new_length > existing_length:
+                        mgmt_data[std_key] = value
+                        print(f"    ✅ UPDATED to longer standard: '{std_key}' = {value}")
+                    else:
+                        print(f"    ⏭️ Keeping existing longer standard: '{std_key}'")
+                else:
+                    mgmt_data[std_key] = value
+                    print(f"    ✅ Found UPDATED standard: '{std_key}' = {value}")
         if not mgmt_data:
+            print(f"    ⚠️ No UPDATED JSON data found for {mgmt_type}")
             continue
+        print(f"    📋 Processing {mgmt_type} with {len(mgmt_data)} updated standards: {list(mgmt_data.keys())}")
+        # Process each row looking for red text in details column
         print(f"    🔍 Analyzing all {len(table.rows)} rows in table:")
         for row_idx, row in enumerate(table.rows):
                 print(f"      📋 Row {row_idx + 1}:")
                 print(f"         📄 Standard: '{standard_text}'")
+                print(f"         📄 Current Details: '{details_text[:50]}...' (length: {len(details_text)})")
+                print(f"         🔴 Has red text (OLD data): {has_red_text(details_cell)}")
+                # Skip header rows - be more specific about what constitutes a header
+                header_indicators = ["standard", "requirement", "details", mgmt_type.lower().split()[0]]
+                if any(header in standard_text_lower for header in header_indicators) and len(standard_text) < 50:
                     print(f"         ⏭️ Skipping header row")
                     continue
+                # IMPORTANT: We want to replace red text (old data) with updated data from JSON
+                # Check if this row has red text in details cell - this is what we need to replace
                 if not has_red_text(details_cell):
+                    print(f"         ⏭️ No red text found in details cell (already updated?), skipping")
                     continue
+                print(f"      🎯 PROCESSING row {row_idx + 1} - REPLACING OLD red text with NEW data")
+                # Extract current red text (this is the OLD data we're replacing)
+                red_segments = extract_red_text_segments(details_cell)
+                current_red_text = ""
+                for segment in red_segments:
+                    current_red_text += segment['text']
+                print(f"      🔴 Current red text (OLD): '{current_red_text[:100]}...'")
+                # Find the UPDATED replacement value from JSON
                 replacement_value = None
                 matched_std = None
+                # Strategy 1: Extract standard number and match
                 std_match = re.search(r'std\s*(\d+)', standard_text_lower)
                 if std_match:
                     std_num = std_match.group(1)
+                    print(f"      🎯 Looking for UPDATED Standard {std_num} data")
+                    # Look for matching standard in mgmt_data (contains UPDATED values)
                     for std_key, std_value in mgmt_data.items():
                         if f"std {std_num}" in std_key.lower():
                             replacement_value = std_value
                             matched_std = std_key
+                            print(f"      ✅ Found UPDATED data for std {std_num}: '{std_key}'")
                             break
+                # Strategy 2: Keyword-based matching if std number doesn't work
                 if not replacement_value:
+                    print(f"      🔍 No std number match, trying keyword matching for UPDATED data...")
+                    # More comprehensive keyword matching
+                    keyword_mappings = {
+                        "daily check": ["Std 1. Daily Check", "Daily Check"],
+                        "verification": ["Std 5. Verification", "Verification"],
+                        "internal review": ["Std 6. Internal Review", "Std 7. Internal Review", "Std 5. Internal Review", "Internal Review"],
+                        "fault recording": ["Std 2. Fault Recording", "Fault Recording/ Reporting"],
+                        "fault repair": ["Std 3. Fault Repair", "Fault Repair"],
+                        "maintenance schedules": ["Std 4. Maintenance Schedules", "Maintenance Schedules"],
+                        "responsibilities": ["Std 1. Responsibilities", "Std 6. Responsibilities"],
+                        "vehicle control": ["Std 2. Vehicle Control", "Vehicle Control"],
+                        "vehicle use": ["Std 3. Vehicle Use", "Vehicle Use"],
+                        "records and documentation": ["Std 4. Records", "Std 5. Records", "Records and Documentation"],
+                        "training": ["Std 8. Training", "Std 3. Training", "Training"],
+                        "suspension": ["Std 8. Maintenance of Suspension", "Suspension"],
+                        "scheduling": ["Std 1. Scheduling", "Scheduling"],
+                        "health and wellbeing": ["Std 2. Health", "Health and wellbeing"],
+                        "workplace conditions": ["Std 7. Workplace", "Workplace conditions"]
+                    }
+                    for keyword, candidates in keyword_mappings.items():
+                        if keyword in standard_text_lower:
+                            replacement_value = find_best_standard_value(mgmt_data, candidates)
+                            if replacement_value:
+                                matched_std = f"{keyword} related"
+                                print(f"      ✅ Found UPDATED data for keyword '{keyword}'")
+                                break
+                # Strategy 3: Try exact standard name matching
+                if not replacement_value:
+                    print(f"      🔍 Trying exact standard name matching for UPDATED data...")
+                    # Clean the standard text for better matching
+                    clean_standard = re.sub(r'\([^)]*\)', '', standard_text).strip()
                     for std_key, std_value in mgmt_data.items():
+                        # Try partial matching
+                        if (clean_standard.lower() in std_key.lower() or
+                            std_key.lower() in clean_standard.lower()):
+                            replacement_value = std_value
+                            matched_std = std_key
+                            print(f"      ✅ Found UPDATED data via partial match: '{std_key}'")
+                            break
                 # Apply replacement if found
                 if replacement_value:
                     else:
                         replacement_text = str(replacement_value)
+                    print(f"      🎯 REPLACING old red text with UPDATED data: '{replacement_text[:100]}...'")
+                    # Use robust red text replacement
                     cell_replacements = replace_red_text_in_cell(details_cell, replacement_text)
+                    # FALLBACK: If replace_red_text_in_cell fails, try manual replacement
+                    if cell_replacements == 0:
+                        print(f"      ⚠️ Standard replacement failed, trying manual approach...")
+                        # Try to replace red text manually
+                        for paragraph in details_cell.paragraphs:
+                            for run in paragraph.runs:
+                                if is_red(run) and run.text.strip():
+                                    print(f"      🔧 Manually replacing red run: '{run.text[:50]}...'")
+                                    run.text = replacement_text
+                                    run.font.color.rgb = RGBColor(0, 0, 0)
+                                    cell_replacements = 1
+                                    break
+                            if cell_replacements > 0:
+                                break
                     replacements_made += cell_replacements
                     if cell_replacements > 0:
+                        print(f"      ✅ SUCCESSFULLY UPDATED '{standard_text}' with NEW data in {mgmt_type}")
+                        print(f"      📋 Used UPDATED data from: '{matched_std}'")
                         # Verify the replacement worked
                         new_details_text = get_clean_text(details_cell).strip()
+                        print(f"      🔍 NEW details text: '{new_details_text[:100]}...'")
+                        print(f"      🎉 OLD red text replaced with UPDATED data!")
                     else:
                         print(f"      ❌ Failed to replace red text in cell")
+                        print(f"      🔍 Cell still contains OLD data: '{get_clean_text(details_cell)[:100]}...'")
                 else:
+                    print(f"      ⚠️ No UPDATED replacement found for '{standard_text}' in {mgmt_type}")
+                    print(f"      📋 Available UPDATED standards: {list(mgmt_data.keys())}")
+                    # FALLBACK: Try to find ANY available standard that might fit
+                    if mgmt_data and current_red_text:
+                        print(f"      🔄 Trying fallback - any available UPDATED standard...")
+                        # Use the first available standard as a fallback
+                        first_std_key = list(mgmt_data.keys())[0]
+                        fallback_value = mgmt_data[first_std_key]
+                        if isinstance(fallback_value, list):
+                            fallback_text = "\n".join(str(item) for item in fallback_value)
+                        else:
+                            fallback_text = str(fallback_value)
+                        print(f"      🔄 Using fallback UPDATED data: '{fallback_text[:100]}...'")
+                        cell_replacements = replace_red_text_in_cell(details_cell, fallback_text)
+                        if cell_replacements > 0:
+                            replacements_made += cell_replacements
+                            print(f"      ✅ Applied fallback UPDATED data successfully")
             else:
                 print(f"      ⚠️ Row {row_idx + 1} has insufficient columns ({len(row.cells)})")
+    print(f"    📊 Total management summary UPDATES: {replacements_made}")
     return replacements_made
 def find_best_standard_value(mgmt_data, candidate_keys):
     """ENHANCED: Find the best matching value for a standard from management data"""
     print(f"        🔍 Searching for candidates: {candidate_keys}")