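"""Fill a docx-derived JSON template with data extracted from PDF text.

The script loads a JSON template (produced from a Word document) and the plain
text of a PDF audit report, asks an OpenAI chat model to populate every template
field using only the PDF text, validates and lightly repairs the returned JSON,
and writes the result to the given output file.
"""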
from openai import OpenAI
import json
import os
import re
def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
"""
word_json_file: file-like object or file path (docx extraction JSON)
pdf_txt_file: file-like object or file path (PDF plain text)
output_file: file-like object (opened for writing) or file path
"""
# --- Load files ---
def read_any(f):
if hasattr(f, "read"):
f.seek(0)
content = f.read()
if isinstance(content, bytes):
content = content.decode("utf-8")
return content
else:
with open(f, "r", encoding="utf-8") as fh:
return fh.read()
word_json = read_any(word_json_file)
pdf_txt = read_any(pdf_txt_file)
# --- Build prompt ---
user_prompt = f"""Here is a JSON template with fields that need updating with data from the PDF:
{word_json}
Here is the extracted text from a PDF document:
{pdf_txt}
EXTRACTION INSTRUCTIONS:
1. COMPREHENSIVE EXTRACTION: Extract data for EVERY field present in the JSON template. Do not skip any field.
2. FIELD-SPECIFIC EXTRACTION RULES:
- Dates: Look for patterns like "5th July 2023", "28th February 2024"
- Company Names: Extract the EXACT company name from the current PDF document
- Registration Numbers: Look for vehicle registrations (format: XX ## XX)
- Contact Details: Extract addresses, phone numbers, emails exactly as written
- ACN Numbers: Extract 9-digit Australian Company Numbers
- Audit Numbers: Look for audit matrix identifiers, CAR numbers
3. TABLE DATA EXTRACTION:
- For Vehicle Registration tables: Extract ALL columns including maintenance records, weight verification, suspension data
- For attendance lists: Extract ALL names with their positions/roles
- For management summaries: Extract specific dates, numbers, and compliance details
4. MISSING DATA HANDLING:
- If data is not found in PDF, use "Not Available" instead of "Entry"
- For empty date ranges, use "Date range not specified"
- For missing numbers, use "Not provided"
- Only use actual data found in the PDF text
5. OPERATOR DECLARATION CRITICAL RULES:
- "Print Name": Must be the COMPANY REPRESENTATIVE signing the operator declaration (NOT the auditor)
- Look for "OPERATOR DECLARATION" section - the person signing this is from the company
- "Position Title": Their job role within the company (Director, Compliance Officer, Manager, etc.)
- NEVER use the auditor's name (Greg Dyer) for operator declaration
6. DATA CONSISTENCY:
- Ensure the same company name appears throughout all sections
- Ensure the same people appear consistently with correct roles
- Cross-reference data between sections for accuracy
7. QUALITY VALIDATION:
- Verify extracted company name matches throughout the document
- Check that dates are logical and properly formatted
- Ensure vehicle registrations follow proper format
CRITICAL: Extract data ONLY from the provided PDF text. Do not use any external knowledge or previous document data.
Output ONLY the updated JSON with all fields filled using the extracted data. No markdown, no explanations, just valid JSON."""
    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a precise data extraction assistant specializing in audit documents. Extract data EXACTLY as it appears in the source document. Only reply with valid JSON - no markdown, no explanations, no extra formatting. Be thorough and extract ALL requested fields from the provided text."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=6000,  # generous limit so large templates are not truncated
        temperature=0.1   # low temperature keeps extraction deterministic while tolerating minor text variations
    )
    updated_json_str = response.choices[0].message.content.strip()
    # Clean up common formatting issues (model sometimes wraps output in a Markdown code fence)
    if updated_json_str.startswith("```json"):
        updated_json_str = updated_json_str[7:]
    if updated_json_str.endswith("```"):
        updated_json_str = updated_json_str[:-3]
    updated_json_str = updated_json_str.strip()

    # --- Try to parse as JSON ---
    try:
        parsed = json.loads(updated_json_str)

        # Basic validation
        print("🔍 Validating extracted data...")
        original_data = json.loads(word_json)

        # Check if we have the same structure
        original_keys = set(original_data.keys()) if isinstance(original_data, dict) else set()
        parsed_keys = set(parsed.keys()) if isinstance(parsed, dict) else set()
        if original_keys and parsed_keys:
            missing_keys = original_keys - parsed_keys
            if missing_keys:
                print(f"⚠️ Warning: Missing keys in extraction: {missing_keys}")
            added_keys = parsed_keys - original_keys
            if added_keys:
                print(f"⚠️ Warning: Unexpected keys added: {added_keys}")

        # Save the parsed JSON
        if hasattr(output_file, "write"):
            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
        print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))

        # Print extraction summary
        print("📊 Extraction Summary:")
        if isinstance(parsed, dict):
            total_fields = sum(len(v) if isinstance(v, list) else 1 for v in parsed.values())
            print(f"  - Total sections: {len(parsed)}")
            print(f"  - Total data points extracted: {total_fields}")

        # Debug: print a preview of the updated JSON content
        print("\n📋 UPDATED JSON CONTENT:")
        print("=" * 80)
        preview = json.dumps(parsed, indent=2, ensure_ascii=False)
        print(preview[:3000] + ("..." if len(preview) > 3000 else ""))
        print("=" * 80)
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str[:1000] + "..." if len(updated_json_str) > 1000 else updated_json_str)
        print(f"\n❌ JSON Parse Error: {e}")
        print("🔧 Attempting to fix common JSON issues...")

        # Try to fix common issues
        try:
            # Remove trailing commas
            fixed_json = re.sub(r',(\s*[}\]])', r'\1', updated_json_str)
            parsed = json.loads(fixed_json)
            print("✅ Fixed JSON formatting issues")
            if hasattr(output_file, "write"):
                json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                output_file.flush()
            else:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(parsed, f, indent=2, ensure_ascii=False)
            print("✅ JSON saved after fixes")
        except Exception as fix_error:
            print(f"❌ Could not fix JSON: {fix_error}")
            raise e
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        raise e

if __name__ == "__main__":
    import sys

    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(1)

    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
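
# --- Example: calling update_json_with_pdf with in-memory file objects ---
# A minimal sketch (kept commented out so it never runs on import) showing that the
# function also accepts file-like objects, since read_any() handles anything with a
# .read() method. The template/PDF strings and "hypothetical_output.json" below are
# illustrative placeholders, and the call still requires OPENAI_API_KEY to be set.
#
#   import io
#
#   template = io.StringIO('{"Company Name": "Entry", "Audit Date": "Entry"}')
#   pdf_text = io.StringIO("Audit conducted for Example Transport Pty Ltd on 5th July 2023 ...")
#   with open("hypothetical_output.json", "w", encoding="utf-8") as out:
#       update_json_with_pdf(template, pdf_text, out)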