Update updated_word.py
updated_word.py +0 -162
updated_word.py
CHANGED
@@ -1300,169 +1300,7 @@ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
 
     return replacements_made
 
-def force_red_text_replacement(document, flat_json):
-    """Force replacement of any remaining red text by trying ALL JSON values"""
-    replacements_made = 0
-    print(f"\n🎯 FORCE FIX: Scanning for any remaining red text...")
-
-    # Collect all possible replacement values from JSON
-    all_values = {}
-    for key, value in flat_json.items():
-        if value:
-            value_str = get_value_as_string(value, key)
-
-            if value_str and isinstance(value_str, str) and value_str.strip():
-                all_values[key] = value_str.strip()
-
-            # Store individual items from lists for partial matching
-            if isinstance(value, list):
-                for i, item in enumerate(value):
-                    item_str = str(item).strip() if item else ""
-                    if item_str:
-                        all_values[f"{key}_item_{i}"] = item_str
-
-    print(f" Found {len(all_values)} potential replacement values")
-
-    # Process all tables
-    for table_idx, table in enumerate(document.tables):
-        for row_idx, row in enumerate(table.rows):
-            for cell_idx, cell in enumerate(row.cells):
-                if has_red_text(cell):
-                    print(f" 🔍 Found red text in Table {table_idx + 1}, Row {row_idx + 1}, Cell {cell_idx + 1}")
-
-                    # Extract all red text from this cell
-                    red_text_parts = []
-                    for paragraph in cell.paragraphs:
-                        for run in paragraph.runs:
-                            if is_red(run) and run.text.strip():
-                                red_text_parts.append(run.text.strip())
-
-                    combined_red_text = " ".join(red_text_parts).strip()
-                    print(f" Red text: '{combined_red_text}'")
-
-                    # safety: when red text is very short, avoid replacing with very long multi-item values
-                    red_len_words = len(combined_red_text.split())
-
-                    # Find best match
-                    best_match = None
-                    best_key = None
-
-                    # Exact matching (prefer exact)
-                    for key, value in all_values.items():
-                        if combined_red_text.lower() == value.lower():
-                            best_match = value
-                            best_key = key
-                            break
-
-                    # Partial matching (skip aggressive short->long mapping)
-                    if not best_match:
-                        for key, value in all_values.items():
-                            # <<< PATCH: skip matching single-word red_text to multi-item candidate values
-                            if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
-                                continue
-                            if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
-                               (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
-                                best_match = value
-                                best_key = key
-                                break
-
-                    # Word-by-word matching for names/dates
-                    if not best_match:
-                        red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
-                        best_score = 0
-
-                        for key, value in all_values.items():
-                            # skip aggressive substitution for short red tokens vs long values
-                            if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
-                                continue
-                            value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
-                            if red_words and value_words:
-                                common_words = red_words.intersection(value_words)
-                                if common_words:
-                                    score = len(common_words) / len(red_words)
-                                    if score > best_score and score >= 0.5:  # At least 50% match
-                                        best_score = score
-                                        best_match = value
-                                        best_key = key
-
-                    # Replace if we found a match
-                    if best_match:
-                        print(f" ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
-                        cell_replacements = replace_red_text_in_cell(cell, best_match)
-                        replacements_made += cell_replacements
-                        print(f" Made {cell_replacements} replacements")
-                    else:
-                        print(f" ❌ No suitable replacement found")
-
-    # Process all paragraphs
-    for para_idx, paragraph in enumerate(document.paragraphs):
-        if has_red_text_in_paragraph(paragraph):
-            red_text_parts = []
-            for run in paragraph.runs:
-                if is_red(run) and run.text.strip():
-                    red_text_parts.append(run.text.strip())
-
-            combined_red_text = " ".join(red_text_parts).strip()
-            if combined_red_text:
-                print(f" 🔍 Found red text in Paragraph {para_idx + 1}: '{combined_red_text}'")
-
-                # Same matching logic as above
-                best_match = None
-                best_key = None
-
-                red_len_words = len(combined_red_text.split())
-
-                # Exact match
-                for key, value in all_values.items():
-                    if combined_red_text.lower() == value.lower():
-                        best_match = value
-                        best_key = key
-                        break
-
-                # Partial match
-                if not best_match:
-                    for key, value in all_values.items():
-                        if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 3:
-                            continue
-                        if (len(value) > 3 and value.lower() in combined_red_text.lower()) or \
-                           (len(combined_red_text) > 3 and combined_red_text.lower() in value.lower()):
-                            best_match = value
-                            best_key = key
-                            break
-
-                # Word match
-                if not best_match:
-                    red_words = set(word.lower() for word in combined_red_text.split() if len(word) > 2)
-                    best_score = 0
-
-                    for key, value in all_values.items():
-                        if red_len_words <= 2 and isinstance(value, str) and len(value.split()) > 4:
-                            continue
-                        value_words = set(word.lower() for word in str(value).split() if len(word) > 2)
-                        if red_words and value_words:
-                            common_words = red_words.intersection(value_words)
-                            if common_words:
-                                score = len(common_words) / len(red_words)
-                                if score > best_score and score >= 0.5:
-                                    best_score = score
-                                    best_match = value
-                                    best_key = key
-
-                # Replace if found
-                if best_match:
-                    print(f" ✅ Replacing with: '{best_match}' (from key: '{best_key}')")
-                    red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
-                    if red_runs:
-                        red_runs[0].text = best_match
-                        red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
-                        for run in red_runs[1:]:
-                            run.text = ''
-                        replacements_made += 1
-                        print(f" Made 1 paragraph replacement")
-                else:
-                    print(f" ❌ No suitable replacement found")
-
-    return replacements_made
 
 def process_hf(json_file, docx_file, output_file):
     """Main processing function with comprehensive error handling"""
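
For context, the sketch below isolates the word-overlap scoring that the removed force_red_text_replacement used as its final matching pass (words longer than two characters, case-insensitive, accepted at a 50% overlap threshold). It is illustrative only: the helper name score_overlap is hypothetical and does not exist in updated_word.py.

# Hypothetical helper, not part of updated_word.py: reproduces the removed
# function's word-by-word matching pass in isolation.
def score_overlap(red_text: str, candidate: str) -> float:
    # Compare only words longer than two characters, case-insensitively.
    red_words = {w.lower() for w in red_text.split() if len(w) > 2}
    value_words = {w.lower() for w in candidate.split() if len(w) > 2}
    if not red_words or not value_words:
        return 0.0
    # Fraction of red-text words that also appear in the candidate value.
    return len(red_words & value_words) / len(red_words)

# A candidate was accepted only when it covered at least half of the red words.
print(score_overlap("John Smith", "Smith John Alexander"))  # 1.0 -> accepted
print(score_overlap("Date of birth", "Unrelated value"))    # 0.0 -> rejected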