Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 22

Commit

df67c09

verified ·

1 Parent(s): 878a622

Update update_docx_with_pdf.py

Browse files

Files changed (1) hide show

update_docx_with_pdf.py +109 -28

update_docx_with_pdf.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from openai import OpenAI
 import json
 import os
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     """
@@ -24,35 +25,53 @@ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     pdf_txt = read_any(pdf_txt_file)
     # --- Build prompt ---
-    user_prompt = f"""Here is a JSON template. It contains only the fields that need updating:
 {word_json}
-Here is the extracted text from a PDF:
 {pdf_txt}
-Instructions:
-- ONLY update the fields present in the JSON template, using information from the PDF text.
-- DO NOT add any extra fields, and do not change the JSON structure.
-- Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
-- Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")
-CRITICAL - For Operator Declaration section:
-- The "Print Name" should be the OPERATOR/COMPANY REPRESENTATIVE's name, NOT the auditor's name
-- Look for the "OPERATOR DECLARATION" section at the end of the document
-- The person signing the operator declaration is usually someone from the company like a manager, compliance officer, or director
-- Common examples: "Peter Sheppard", "Jeff Nitschke", etc.
-- AVOID using the auditor's name (typically "Greg Dyer" in these documents)
-- The "Position Title" should be their job role (e.g., "Compliance Officer", "Director", "Manager", "WHSE Compliance Officer")
-For Attendance List:
-- Extract all people listed with their roles (e.g., "Peter Sheppard - Compliance", "Greg Dyer - Auditor")
-- Include both operator staff and auditor in the attendance list
-- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
-- Make sure the JSON is valid and ready to use.
-- Update operator names, auditor names, and all personal details consistently throughout all sections."""
     # --- Call OpenAI API ---
     api_key = os.environ.get("OPENAI_API_KEY")
@@ -64,18 +83,44 @@ For Attendance List:
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
-            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
             {"role": "user", "content": user_prompt}
         ],
-        max_tokens=4096,
-        temperature=0
     )
     updated_json_str = response.choices[0].message.content.strip()
     # --- Try to parse as JSON ---
     try:
         parsed = json.loads(updated_json_str)
         if hasattr(output_file, "write"):
             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
@@ -83,10 +128,46 @@ For Attendance List:
             with open(output_file, "w", encoding="utf-8") as f:
                 json.dump(parsed, f, indent=2, ensure_ascii=False)
         print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
-    except Exception as e:
         print("⚠️ Model did not return valid JSON. Raw output below:\n")
-        print(updated_json_str)
-        print("\n❌ Failed to parse updated JSON:", e)
 if __name__ == "__main__":
     import sys

 from openai import OpenAI
 import json
 import os
+import re
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     """
     pdf_txt = read_any(pdf_txt_file)
     # --- Build prompt ---
+    user_prompt = f"""Here is a JSON template with fields that need updating with data from the PDF:
 {word_json}
+Here is the extracted text from a PDF document:
 {pdf_txt}
+EXTRACTION INSTRUCTIONS:
+1. COMPREHENSIVE EXTRACTION: Extract data for EVERY field present in the JSON template. Do not skip any field.
+2. FIELD-SPECIFIC EXTRACTION RULES:
+   - Dates: Look for patterns like "5th July 2023", "28th February 2024"
+   - Company Names: Extract the EXACT company name from the current PDF document
+   - Registration Numbers: Look for vehicle registrations (format: XX ## XX)
+   - Contact Details: Extract addresses, phone numbers, emails exactly as written
+   - ACN Numbers: Extract 9-digit Australian Company Numbers
+   - Audit Numbers: Look for audit matrix identifiers, CAR numbers
+3. TABLE DATA EXTRACTION:
+   - For Vehicle Registration tables: Extract ALL columns including maintenance records, weight verification, suspension data
+   - For attendance lists: Extract ALL names with their positions/roles
+   - For management summaries: Extract specific dates, numbers, and compliance details
+4. MISSING DATA HANDLING:
+   - If data is not found in PDF, use "Not Available" instead of "Entry"
+   - For empty date ranges, use "Date range not specified"
+   - For missing numbers, use "Not provided"
+   - Only use actual data found in the PDF text
+5. OPERATOR DECLARATION CRITICAL RULES:
+   - "Print Name": Must be the COMPANY REPRESENTATIVE signing the operator declaration (NOT the auditor)
+   - Look for "OPERATOR DECLARATION" section - the person signing this is from the company
+   - "Position Title": Their job role within the company (Director, Compliance Officer, Manager, etc.)
+   - NEVER use the auditor's name (Greg Dyer) for operator declaration
+6. DATA CONSISTENCY:
+   - Ensure the same company name appears throughout all sections
+   - Ensure the same people appear consistently with correct roles
+   - Cross-reference data between sections for accuracy
+7. QUALITY VALIDATION:
+   - Verify extracted company name matches throughout the document
+   - Check that dates are logical and properly formatted
+   - Ensure vehicle registrations follow proper format
+CRITICAL: Extract data ONLY from the provided PDF text. Do not use any external knowledge or previous document data.
+Output ONLY the updated JSON with all fields filled using the extracted data. No markdown, no explanations, just valid JSON."""
     # --- Call OpenAI API ---
     api_key = os.environ.get("OPENAI_API_KEY")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
+            {"role": "system", "content": "You are a precise data extraction assistant specializing in audit documents. Extract data EXACTLY as it appears in the source document. Only reply with valid JSON - no markdown, no explanations, no extra formatting. Be thorough and extract ALL requested fields from the provided text."},
             {"role": "user", "content": user_prompt}
         ],
+        max_tokens=6000,  # Increased for more comprehensive extraction
+        temperature=0.1   # Slightly increased for better handling of variations in text
     )
     updated_json_str = response.choices[0].message.content.strip()
+    # Clean up common formatting issues
+    if updated_json_str.startswith("```json"):
+        updated_json_str = updated_json_str[7:]
+    if updated_json_str.endswith("```"):
+        updated_json_str = updated_json_str[:-3]
+    updated_json_str = updated_json_str.strip()
     # --- Try to parse as JSON ---
     try:
         parsed = json.loads(updated_json_str)
+        # Basic validation
+        print("🔍 Validating extracted data...")
+        original_data = json.loads(word_json)
+        # Check if we have the same structure
+        original_keys = set(original_data.keys()) if isinstance(original_data, dict) else set()
+        parsed_keys = set(parsed.keys()) if isinstance(parsed, dict) else set()
+        if original_keys and parsed_keys:
+            missing_keys = original_keys - parsed_keys
+            if missing_keys:
+                print(f"⚠️ Warning: Missing keys in extraction: {missing_keys}")
+            added_keys = parsed_keys - original_keys
+            if added_keys:
+                print(f"⚠️ Warning: Unexpected keys added: {added_keys}")
+        # Save the parsed JSON
         if hasattr(output_file, "write"):
             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
             with open(output_file, "w", encoding="utf-8") as f:
                 json.dump(parsed, f, indent=2, ensure_ascii=False)
         print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
+        # Print extraction summary
+        print(f"📊 Extraction Summary:")
+        if isinstance(parsed, dict):
+            total_fields = sum(len(v) if isinstance(v, list) else 1 for v in parsed.values())
+            print(f"   - Total sections: {len(parsed)}")
+            print(f"   - Total data points extracted: {total_fields}")
+        # Debug: Print the updated JSON content
+        print("\n🔍 UPDATED JSON CONTENT:")
+        print("=" * 80)
+        print(json.dumps(parsed, indent=2, ensure_ascii=False)[:3000] + ("..." if len(json.dumps(parsed, indent=2)) > 3000 else ""))
+        print("=" * 80)
+    except json.JSONDecodeError as e:
         print("⚠️ Model did not return valid JSON. Raw output below:\n")
+        print(updated_json_str[:1000] + "..." if len(updated_json_str) > 1000 else updated_json_str)
+        print(f"\n❌ JSON Parse Error: {e}")
+        print("🔧 Attempting to fix common JSON issues...")
+        # Try to fix common issues
+        try:
+            # Remove trailing commas
+            fixed_json = re.sub(r',(\s*[}\]])', r'\1', updated_json_str)
+            parsed = json.loads(fixed_json)
+            print("✅ Fixed JSON formatting issues")
+            if hasattr(output_file, "write"):
+                json.dump(parsed, output_file, indent=2, ensure_ascii=False)
+                output_file.flush()
+            else:
+                with open(output_file, "w", encoding="utf-8") as f:
+                    json.dump(parsed, f, indent=2, ensure_ascii=False)
+            print("✅ JSON saved after fixes")
+        except Exception as fix_error:
+            print(f"❌ Could not fix JSON: {fix_error}")
+            raise e
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        raise e
 if __name__ == "__main__":
     import sys