from openai import OpenAI
import json
import os
import re


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """
    word_json_file: file-like object or file path (docx extraction JSON)
    pdf_txt_file: file-like object or file path (PDF plain text)
    output_file: file-like object (opened for writing) or file path
    """
    # --- Load files ---
    def read_any(f):
        if hasattr(f, "read"):
            f.seek(0)
            content = f.read()
            if isinstance(content, bytes):
                content = content.decode("utf-8")
            return content
        else:
            with open(f, "r", encoding="utf-8") as fh:
                return fh.read()

    word_json = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)
    # --- Build prompt ---
    user_prompt = f"""Here is a JSON template with fields that need updating with data from the PDF:
{word_json}
Here is the extracted text from a PDF document:
{pdf_txt}
EXTRACTION INSTRUCTIONS:
1. COMPREHENSIVE EXTRACTION: Extract data for EVERY field present in the JSON template. Do not skip any field.
2. FIELD-SPECIFIC EXTRACTION RULES:
- Dates: Look for patterns like "5th July 2023", "28th February 2024"
- Company Names: Extract the EXACT company name from the current PDF document
- Registration Numbers: Look for vehicle registrations (format: XX ## XX)
- Contact Details: Extract addresses, phone numbers, emails exactly as written
- ACN Numbers: Extract 9-digit Australian Company Numbers
- Audit Numbers: Look for audit matrix identifiers, CAR numbers
3. TABLE DATA EXTRACTION:
- For Vehicle Registration tables: Extract ALL columns including maintenance records, weight verification, suspension data
- For attendance lists: Extract ALL names with their positions/roles
- For management summaries: Extract specific dates, numbers, and compliance details
4. MISSING DATA HANDLING:
- If data is not found in the PDF, use "Not Available" instead of "Entry"
- For empty date ranges, use "Date range not specified"
- For missing numbers, use "Not provided"
- Only use actual data found in the PDF text
5. OPERATOR DECLARATION CRITICAL RULES:
- "Print Name": Must be the COMPANY REPRESENTATIVE signing the operator declaration (NOT the auditor)
- Look for the "OPERATOR DECLARATION" section - the person signing this is from the company
- "Position Title": Their job role within the company (Director, Compliance Officer, Manager, etc.)
- NEVER use the auditor's name (Greg Dyer) for the operator declaration
6. DATA CONSISTENCY:
- Ensure the same company name appears throughout all sections
- Ensure the same people appear consistently with correct roles
- Cross-reference data between sections for accuracy
7. QUALITY VALIDATION:
- Verify the extracted company name matches throughout the document
- Check that dates are logical and properly formatted
- Ensure vehicle registrations follow the proper format
CRITICAL: Extract data ONLY from the provided PDF text. Do not use any external knowledge or previous document data.
Output ONLY the updated JSON with all fields filled using the extracted data. No markdown, no explanations, just valid JSON."""
    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a precise data extraction assistant specializing in audit documents. Extract data EXACTLY as it appears in the source document. Only reply with valid JSON - no markdown, no explanations, no extra formatting. Be thorough and extract ALL requested fields from the provided text."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=6000,  # large output budget for comprehensive extraction
        temperature=0.1   # low temperature for consistent, near-deterministic extraction
    )
    updated_json_str = response.choices[0].message.content.strip()

    # Clean up common formatting issues
    if updated_json_str.startswith("```json"):
        updated_json_str = updated_json_str[7:]
    if updated_json_str.endswith("```"):
        updated_json_str = updated_json_str[:-3]
    updated_json_str = updated_json_str.strip()
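    # Alternative sketch: the fence stripping above could equally be done in one regex pass
    # with the already-imported re module (kept as a comment, not wired into the flow):
    #   updated_json_str = re.sub(r"^```(?:json)?\s*|\s*```$", "", updated_json_str).strip()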
    # --- Try to parse as JSON ---
    try:
        parsed = json.loads(updated_json_str)

        # Basic validation
        print("Validating extracted data...")
        original_data = json.loads(word_json)

        # Check if we have the same structure
        original_keys = set(original_data.keys()) if isinstance(original_data, dict) else set()
        parsed_keys = set(parsed.keys()) if isinstance(parsed, dict) else set()

        if original_keys and parsed_keys:
            missing_keys = original_keys - parsed_keys
            if missing_keys:
                print(f"Warning: Missing keys in extraction: {missing_keys}")

            added_keys = parsed_keys - original_keys
            if added_keys:
                print(f"Warning: Unexpected keys added: {added_keys}")

        # Save the parsed JSON
        if hasattr(output_file, "write"):
            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)

        print("JSON updated and saved to", getattr(output_file, "name", output_file))
        # Print extraction summary
        print("Extraction Summary:")
        if isinstance(parsed, dict):
            total_fields = sum(len(v) if isinstance(v, list) else 1 for v in parsed.values())
            print(f" - Total sections: {len(parsed)}")
            print(f" - Total data points extracted: {total_fields}")

        # Debug: print the updated JSON content, truncated to 3000 characters
        pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
        print("\nUPDATED JSON CONTENT:")
        print("=" * 80)
        print(pretty[:3000] + ("..." if len(pretty) > 3000 else ""))
        print("=" * 80)
    except json.JSONDecodeError as e:
        print("Warning: Model did not return valid JSON. Raw output below:\n")
        print((updated_json_str[:1000] + "...") if len(updated_json_str) > 1000 else updated_json_str)
        print(f"\nJSON Parse Error: {e}")
        print("Attempting to fix common JSON issues...")

        # Try to fix common issues
        try:
            # Remove trailing commas, e.g. '{"a": 1,}' -> '{"a": 1}'
            fixed_json = re.sub(r',(\s*[}\]])', r'\1', updated_json_str)
            parsed = json.loads(fixed_json)
            print("Fixed JSON formatting issues")

            if hasattr(output_file, "write"):
                json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                output_file.flush()
            else:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(parsed, f, indent=2, ensure_ascii=False)

            print("JSON saved after fixes")
        except Exception as fix_error:
            print(f"Could not fix JSON: {fix_error}")
            raise e
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(1)

    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
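# Minimal usage sketch (file names are hypothetical): because read_any() and the output
# handling check for .read()/.write(), file-like objects work as well as plain paths:
#
#   with open("audit_template.json") as wj, \
#           open("audit_report.txt") as pt, \
#           open("updated_audit.json", "w", encoding="utf-8") as out:
#       update_json_with_pdf(wj, pt, out)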