Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

PDF-Data_Extractor / updated_word.py

Shami96

Update updated_word.py

ddb37e5 verified 3 months ago

raw

history blame

17.9 kB

	import json
	from docx import Document
	from docx.shared import RGBColor
	import re

	def load_json(filepath):
	    """Load and parse a JSON file.

	    Args:
	        filepath: Path of the JSON file to read.

	    Returns:
	        The decoded JSON document (whatever ``json.load`` produces).

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the content is not valid JSON.
	    """
	    # Read as UTF-8 explicitly (as process_hf does) instead of relying on
	    # the platform default encoding, which mangles non-ASCII values on
	    # systems whose locale is not UTF-8.
	    with open(filepath, 'r', encoding='utf-8') as file:
	        return json.load(file)

	def flatten_json_new_system(json_data):
	    """Flatten ``{schema: {field: values}}`` JSON into one lookup dict.

	    Every field is registered under several spellings so later matching
	    can succeed on loosely formatted labels: the raw field name, its
	    lower-cased and stripped forms, and ``schema.field`` in original and
	    lower case. Fields whose name contains one of a few well-known
	    phrases are additionally stored under short aliases.

	    A one-element list is unwrapped to its single item; every other value
	    is stored unchanged. Top-level entries that are not dicts are
	    skipped. When two fields produce the same key, the later one wins.

	    Args:
	        json_data: Mapping of schema name to a mapping of field values.

	    Returns:
	        dict mapping each key variation to the field's value.
	    """
	    # (phrase looked for in the lower-cased field name, aliases to register)
	    alias_groups = (
	        ("print name", ("print name", "operator name", "name")),
	        ("position title", ("position title", "position", "title")),
	        ("accreditation number", ("accreditation number", "nhvas accreditation no")),
	        ("expiry date", ("expiry date", "expiry")),
	    )

	    lookup = {}
	    for schema_name, fields in json_data.items():
	        if not isinstance(fields, dict):
	            continue

	        for field_name, raw in fields.items():
	            # Single-item lists collapse to the item itself.
	            single_item = isinstance(raw, list) and len(raw) == 1
	            stored = raw[0] if single_item else raw

	            lowered = field_name.lower()
	            keys = [
	                field_name,
	                lowered,
	                lowered.strip(),
	                f"{schema_name}.{field_name}",
	                f"{schema_name.lower()}.{field_name.lower()}",
	            ]
	            for phrase, aliases in alias_groups:
	                if phrase in lowered:
	                    keys.extend(aliases)

	            # Assign in order so dict insertion order stays predictable.
	            for key in keys:
	                lookup[key] = stored

	    return lookup

	def is_red(run):
	    """Tell whether a run is coloured red.

	    A run counts as red when its font colour is exactly RGB (255, 0, 0)
	    or when the colour's ``theme_color`` equals 1.

	    Returns:
	        ``True``/``False`` when the run has a colour object; otherwise the
	        falsy colour value itself (e.g. ``None``).
	    """
	    color = run.font.color
	    if not color:
	        return color
	    if color.rgb == RGBColor(255, 0, 0):
	        return True
	    # NOTE(review): theme colour index 1 appears to be DARK_1 in
	    # python-docx's MSO_THEME_COLOR enumeration rather than a red accent --
	    # confirm this check is intended.
	    return getattr(color, "theme_color", None) == 1

	def get_value_as_string(value, field_name=""):
	    """Render a JSON value as replacement text.

	    Args:
	        value: Scalar or list taken from the flattened JSON.
	        field_name: Label of the field being filled; only consulted for
	            lists with more than one item.

	    Returns:
	        ``str(value)`` for non-lists, ``""`` for an empty list, the single
	        item as a string for one-element lists, and the items joined with
	        spaces for longer lists. Exception: a multi-item list for a
	        "company number" field is returned unchanged (still a list) so the
	        caller can place the items one by one.
	    """
	    if not isinstance(value, list):
	        return str(value)
	    if not value:
	        return ""
	    if len(value) == 1:
	        return str(value[0])

	    label = field_name.lower()
	    keep_as_list = "australian company number" in label or "company number" in label
	    if keep_as_list:
	        return value  # Return as list for ACN processing
	    return " ".join(map(str, value))

	def find_matching_json_value(field_name, flat_json):
	    """Look up the JSON value that best corresponds to a document label.

	    Strategies are tried in this order and the first hit is returned:

	    1. exact key match on the stripped label;
	    2. case-insensitive key match;
	    3. phrase-specific lookups for "print name", "position title",
	       "accreditation number" and "expiry date";
	    4. word-overlap scoring against every key, where the score is
	       ``0.6 * |common| / |union| + 0.4 * |common| / |label words|`` over
	       lower-cased words longer than two characters. The best key is
	       accepted when its score is at least 0.25 and its value is truthy.

	    Each decision is reported with ``print``.

	    Args:
	        field_name: Label text taken from the document.
	        flat_json: Flat key -> value mapping to search.

	    Returns:
	        The matched value, or ``None`` when nothing qualifies.
	    """

	    def _keywords(text):
	        # Lower-cased words of three or more characters.
	        return {tok.lower() for tok in re.findall(r'\b\w+\b', text) if len(tok) > 2}

	    field_name = field_name.strip()
	    field_lower = field_name.lower().strip()

	    # 1. Exact key.
	    if field_name in flat_json:
	        print(f" ✅ Direct match found for key '{field_name}'")
	        return flat_json[field_name]

	    # 2. Same key ignoring case.
	    for key, value in flat_json.items():
	        if key.lower() == field_lower:
	            print(f" ✅ Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
	            return value

	    # 3. Phrase-specific fallbacks; each falls through when it finds nothing.
	    if "print name" in field_lower:
	        candidates = ("Print Name", "print name", "operator name", "name")
	        key = next((k for k in candidates if k in flat_json), None)
	        if key is not None:
	            print(f" ✅ Print name match: '{field_name}' -> '{key}'")
	            return flat_json[key]

	    if "position title" in field_lower:
	        candidates = ("Position Title", "position title", "position", "title")
	        key = next((k for k in candidates if k in flat_json), None)
	        if key is not None:
	            print(f" ✅ Position title match: '{field_name}' -> '{key}'")
	            return flat_json[key]

	    if "accreditation number" in field_lower:
	        key = next(
	            (k for k in flat_json if "accreditation" in k.lower() and "number" in k.lower()),
	            None,
	        )
	        if key is not None:
	            print(f" ✅ Accreditation number match: '{field_name}' -> '{key}'")
	            return flat_json[key]

	    if "expiry date" in field_lower:
	        key = next((k for k in flat_json if "expiry" in k.lower()), None)
	        if key is not None:
	            print(f" ✅ Expiry date match: '{field_name}' -> '{key}'")
	            return flat_json[key]

	    # 4. Word-overlap scoring.
	    field_words = _keywords(field_name)
	    if not field_words:
	        return None

	    best_match, best_key, best_score = None, None, 0
	    for key, value in flat_json.items():
	        shared = field_words & _keywords(key)
	        if not shared:
	            continue

	        similarity = len(shared) / len(field_words | _keywords(key))
	        coverage = len(shared) / len(field_words)
	        final_score = (similarity * 0.6) + (coverage * 0.4)

	        # Strictly greater: on a tie the earliest key keeps the lead.
	        if final_score > best_score:
	            best_match, best_key, best_score = value, key, final_score

	    if best_match and best_score >= 0.25:
	        print(f" ✅ Fuzzy match found for key '{field_name}' with JSON key '{best_key}' (score: {best_score:.2f})")
	        return best_match

	    print(f" ❌ No match found for '{field_name}'")
	    return None

	def get_clean_text(cell):
	    """Return the text of all runs in the cell, joined and stripped.

	    Run texts are concatenated in document order with no separator, so
	    paragraph boundaries do not add whitespace.
	    """
	    pieces = [run.text for paragraph in cell.paragraphs for run in paragraph.runs]
	    return "".join(pieces).strip()

	def has_red_text(cell):
	    """Return True when any run in the cell is red and has non-blank text."""
	    return any(
	        is_red(run) and run.text.strip()
	        for paragraph in cell.paragraphs
	        for run in paragraph.runs
	    )

	def replace_red_text_in_cell(cell, replacement_text):
	    """Replace the red placeholder text of a cell with new text.

	    In every paragraph that contains red, non-blank runs, the first such
	    run receives ``replacement_text`` and is recoloured black. The other
	    red runs of that paragraph are emptied, the same way
	    ``process_paragraphs`` treats split placeholders; otherwise the rest
	    of a placeholder spread over several runs would stay in the document,
	    still red, right after the new value.

	    Args:
	        cell: Table cell exposing ``paragraphs`` whose items expose ``runs``.
	        replacement_text: Text to write into the first red run.

	    Returns:
	        Number of paragraphs in which a replacement was made.
	    """
	    replacements_made = 0

	    for paragraph in cell.paragraphs:
	        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
	        if not red_runs:
	            continue

	        first_run, leftover_runs = red_runs[0], red_runs[1:]
	        first_run.text = replacement_text
	        first_run.font.color.rgb = RGBColor(0, 0, 0)  # Change to black

	        # Drop the remaining fragments of the old placeholder.
	        for run in leftover_runs:
	            run.text = ''

	        # NOTE(review): a cell with red text in several paragraphs still
	        # gets one copy of the replacement per paragraph -- confirm whether
	        # only the first paragraph should be filled.
	        replacements_made += 1

	    return replacements_made

	def handle_australian_company_number(row, company_numbers):
	    """Spread the items of ``company_numbers`` over the cells of a row.

	    Item ``i`` is written to cell ``i + 1`` (cell 0 is left alone), but
	    only where that cell currently holds red text. Items beyond the last
	    cell are ignored.

	    Returns:
	        Total number of replacements reported by
	        ``replace_red_text_in_cell``.
	    """
	    total = 0
	    target_cells = row.cells[1:]
	    for cell_idx, (digit, cell) in enumerate(zip(company_numbers, target_cells), start=1):
	        if not has_red_text(cell):
	            continue
	        total += replace_red_text_in_cell(cell, str(digit))
	        print(f" -> Placed digit '{digit}' in cell {cell_idx + 1}")
	    return total

	def handle_nature_business_section(cell, flat_json):
	    """Fill in the "Nature of the Operators Business" cell.

	    Nothing happens unless the cell holds red text and its text mentions
	    the section heading. The value of the first JSON key containing
	    "nature of the operators business" is then written over the red text.

	    Returns:
	        Number of replacements made, 0 when the cell does not qualify or
	        no JSON key matches.
	    """
	    if not has_red_text(cell):
	        return 0

	    heading = get_clean_text(cell).lower()
	    accepted_headings = (
	        "nature of the operators business",
	        "nature of the operator business",
	    )
	    if not any(phrase in heading for phrase in accepted_headings):
	        return 0

	    print(f" 🎯 Found Nature of Business section")

	    # Only the first matching key is used.
	    matching_key = next(
	        (key for key in flat_json if "nature of the operators business" in key.lower()),
	        None,
	    )
	    if matching_key is None:
	        return 0

	    replacement_text = get_value_as_string(flat_json[matching_key])
	    replaced = replace_red_text_in_cell(cell, replacement_text)
	    if replaced > 0:
	        print(f" ✅ Updated business description")
	    return replaced

	def handle_operator_declaration_table(table, flat_json):
	    """Fill the data row of an Operator Declaration table.

	    The table is scanned for a header row whose first cell mentions
	    "print name" and whose second cell mentions "position title". The row
	    directly below it is treated as the data row: red text in its first
	    cell is replaced by the Print Name value and red text in its second
	    cell by the Position Title value, when those exist in ``flat_json``
	    and are truthy.

	    Returns:
	        Number of replacements made (0 when no header row is found).
	    """
	    replacements_made = 0

	    for row_idx, row in enumerate(table.rows):
	        if len(row.cells) < 2:
	            continue

	        cell1_text = get_clean_text(row.cells[0]).strip()
	        cell2_text = get_clean_text(row.cells[1]).strip()

	        is_header = "print name" in cell1_text.lower() and "position title" in cell2_text.lower()
	        if not is_header:
	            continue

	        print(f" 🎯 Found Operator Declaration table")

	        # The values live in the row right after the header, if any.
	        if row_idx + 1 < len(table.rows):
	            data_row = table.rows[row_idx + 1]
	            if len(data_row.cells) >= 2:
	                name_cell = data_row.cells[0]
	                position_cell = data_row.cells[1]

	                if has_red_text(name_cell):
	                    name_keys = ("Print Name", "print name", "Operator Declaration.Print Name")
	                    name_value = next((flat_json[k] for k in name_keys if k in flat_json), None)
	                    if name_value:
	                        name_text = get_value_as_string(name_value)
	                        replacements_made += replace_red_text_in_cell(name_cell, name_text)
	                        print(f" ✅ Updated Print Name: '{name_text}'")

	                if has_red_text(position_cell):
	                    position_keys = ("Position Title", "position title", "Operator Declaration.Position Title")
	                    position_value = next((flat_json[k] for k in position_keys if k in flat_json), None)
	                    if position_value:
	                        position_text = get_value_as_string(position_value)
	                        replacements_made += replace_red_text_in_cell(position_cell, position_text)
	                        print(f" ✅ Updated Position Title: '{position_text}'")

	        # NOTE(review): nesting was reconstructed from a listing without
	        # indentation; the break is assumed to end the scan once the
	        # header row has been handled -- confirm against the real file.
	        break

	    return replacements_made

	def process_tables(document, flat_json):
	    """Replace red placeholder text in every table of the document.

	    For each table:

	    * tables with at most four rows are first offered to
	      ``handle_operator_declaration_table``; if it replaced anything the
	      table is considered done;
	    * otherwise each row is keyed by the text of its first cell. A
	      "nature of the operators business" row goes to its dedicated
	      handler. When the key matches a JSON value, every red cell of the
	      row is filled with it (company-number lists are spread over the
	      cells instead). When the key matches nothing, each red cell's own
	      red text is used as the lookup label.

	    Returns:
	        Total number of replacements made across all tables.
	    """

	    def _red_text_of(cell):
	        # Text of all red runs in the cell, blank ones included.
	        return "".join(
	            run.text
	            for paragraph in cell.paragraphs
	            for run in paragraph.runs
	            if is_red(run)
	        )

	    replacements_made = 0

	    for table_idx, table in enumerate(document.tables):
	        print(f"\n🔍 Processing table {table_idx + 1}:")

	        # Small tables may be the Operator Declaration table; try that first.
	        if len(table.rows) <= 4:
	            declaration_replacements = handle_operator_declaration_table(table, flat_json)
	            if declaration_replacements > 0:
	                replacements_made += declaration_replacements
	                continue

	        for row_idx, row in enumerate(table.rows):
	            cells = row.cells
	            if not cells:
	                continue

	            key_cell = cells[0]
	            key_text = get_clean_text(key_cell)
	            if not key_text:
	                continue

	            print(f" 📌 Row {row_idx + 1}: Key = '{key_text}'")

	            if "nature of the operators business" in key_text.lower():
	                replacements_made += handle_nature_business_section(key_cell, flat_json)
	                continue

	            json_value = find_matching_json_value(key_text, flat_json)

	            if json_value is None:
	                # No value for the row label: look each red cell up by its
	                # own red text instead.
	                for cell in cells:
	                    if not has_red_text(cell):
	                        continue
	                    red_text = _red_text_of(cell)
	                    if not red_text.strip():
	                        continue
	                    json_value = find_matching_json_value(red_text.strip(), flat_json)
	                    if json_value is None:
	                        continue
	                    replacement_text = get_value_as_string(json_value)
	                    cell_replacements = replace_red_text_in_cell(cell, replacement_text)
	                    replacements_made += cell_replacements
	                    if cell_replacements > 0:
	                        print(f" ✅ Replaced red text: '{red_text.strip()}' -> '{replacement_text}'")
	                continue

	            replacement_text = get_value_as_string(json_value, key_text)

	            is_company_number = "australian company number" in key_text.lower() or "company number" in key_text.lower()
	            if is_company_number and isinstance(json_value, list):
	                # One list item per cell rather than one string everywhere.
	                replacements_made += handle_australian_company_number(row, json_value)
	                continue

	            for cell_idx, cell in enumerate(cells):
	                if not has_red_text(cell):
	                    continue
	                cell_replacements = replace_red_text_in_cell(cell, replacement_text)
	                replacements_made += cell_replacements
	                if cell_replacements > 0:
	                    print(f" ✅ Updated cell {cell_idx + 1}: '{replacement_text}'")

	    return replacements_made

	def process_paragraphs(document, flat_json):
	    """Replace red placeholder text in the document's body paragraphs.

	    The red, non-blank runs of each paragraph are joined into one label
	    and looked up with ``find_matching_json_value``. On a match the first
	    red run receives the replacement (recoloured black) and the remaining
	    red runs are emptied.

	    Returns:
	        Number of paragraphs in which a replacement was made.
	    """
	    replacements_made = 0
	    print(f"\n🔍 Processing paragraphs:")

	    for para_idx, paragraph in enumerate(document.paragraphs):
	        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
	        red_text = "".join(run.text for run in red_runs)
	        if not red_text.strip():
	            continue

	        print(f" 📌 Paragraph {para_idx + 1}: Found red text: '{red_text.strip()}'")

	        json_value = find_matching_json_value(red_text.strip(), flat_json)
	        if json_value is None:
	            continue

	        replacement_text = get_value_as_string(json_value)
	        print(f" ✅ Replacing with: '{replacement_text}'")

	        # red_runs cannot be empty here: red_text came from those runs.
	        first_run = red_runs[0]
	        first_run.text = replacement_text
	        first_run.font.color.rgb = RGBColor(0, 0, 0)
	        # Blank out the rest of the old placeholder.
	        for leftover in red_runs[1:]:
	            leftover.text = ''
	        replacements_made += 1

	    return replacements_made

	def process_hf(json_file, docx_file, output_file):
	    """Fill a DOCX template's red placeholders from a JSON file.

	    Loads the JSON, flattens it, replaces red text in the document's
	    tables and paragraphs, and saves the result. Progress and a summary
	    are printed.

	    Args:
	        json_file: Path to the JSON file, or an object with ``read()``.
	        docx_file: Path or file-like object handed to ``Document``.
	        output_file: Path or file-like object handed to ``doc.save``.

	    Returns:
	        None. Errors are not raised to the caller: they are printed (with
	        a traceback for anything other than a missing file).
	    """
	    sample_size = 10  # how many flattened keys to show in the preview

	    try:
	        # Load JSON from an open stream or from a path.
	        if hasattr(json_file, "read"):
	            json_data = json.load(json_file)
	        else:
	            with open(json_file, 'r', encoding='utf-8') as f:
	                json_data = json.load(f)

	        flat_json = flatten_json_new_system(json_data)
	        print("📄 Available JSON keys (sample):")
	        for key in sorted(flat_json)[:sample_size]:
	            print(f" - {key}: {flat_json[key]}")
	        # Only mention further keys when there are any; the previous
	        # unconditional message showed a negative count for small inputs.
	        remaining = len(flat_json) - sample_size
	        if remaining > 0:
	            print(f" ... and {remaining} more keys\n")
	        else:
	            print()

	        # The same call is used for paths and file-like objects, so no
	        # branching on the argument type is needed for loading or saving.
	        doc = Document(docx_file)

	        print("🚀 Starting processing compatible with your new system...")

	        table_replacements = process_tables(doc, flat_json)
	        paragraph_replacements = process_paragraphs(doc, flat_json)
	        total_replacements = table_replacements + paragraph_replacements

	        doc.save(output_file)

	        print(f"\n✅ Document saved as: {output_file}")
	        print(f"✅ Total replacements: {total_replacements}")
	        print(f" 📊 Tables: {table_replacements}")
	        print(f" 📝 Paragraphs: {paragraph_replacements}")
	        print(f"🎉 Processing complete!")

	    except FileNotFoundError as e:
	        print(f"❌ File not found: {e}")
	    except Exception as e:
	        # Top-level boundary: report the failure instead of propagating it.
	        print(f"❌ Error: {e}")
	        import traceback
	        traceback.print_exc()

	if __name__ == "__main__":
	    import sys

	    # Exactly three arguments are required after the script name.
	    if len(sys.argv) != 4:
	        # Show the name the script was actually invoked with; the usage
	        # text previously named a different file.
	        print(f"Usage: python {sys.argv[0]} <input_docx> <updated_json> <output_docx>")
	        # sys.exit is always available; the bare exit() helper is only
	        # injected by the site module.
	        sys.exit(1)
	    docx_path = sys.argv[1]
	    json_path = sys.argv[2]
	    output_path = sys.argv[3]
	    # process_hf takes the JSON source first, unlike the command line.
	    process_hf(json_path, docx_path, output_path)