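"""Fill a docx-derived JSON template with data extracted from PDF text.

The script loads a JSON template (produced from a Word document) and the plain
text of a PDF audit report, asks an OpenAI chat model to populate every template
field using only the PDF text, validates and lightly repairs the returned JSON,
and writes the result to the given output file.
"""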
from openai import OpenAI
import json
import os
import re
def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
"""
word_json_file: file-like object or file path (docx extraction JSON)
pdf_txt_file: file-like object or file path (PDF plain text)
output_file: file-like object (opened for writing) or file path
"""
# --- Load files ---
def read_any(f):
if hasattr(f, "read"):
f.seek(0)
content = f.read()
if isinstance(content, bytes):
content = content.decode("utf-8")
return content
else:
with open(f, "r", encoding="utf-8") as fh:
return fh.read()
word_json = read_any(word_json_file)
pdf_txt = read_any(pdf_txt_file)
# --- Build prompt ---
user_prompt = f"""Here is a JSON template with fields that need updating with data from the PDF:
{word_json}
Here is the extracted text from a PDF document:
{pdf_txt}
EXTRACTION INSTRUCTIONS:
1. COMPREHENSIVE EXTRACTION: Extract data for EVERY field present in the JSON template. Do not skip any field.
2. FIELD-SPECIFIC EXTRACTION RULES:
- Dates: Look for patterns like "5th July 2023", "28th February 2024"
- Company Names: Extract the EXACT company name from the current PDF document
- Registration Numbers: Look for vehicle registrations (format: XX ## XX)
- Contact Details: Extract addresses, phone numbers, emails exactly as written
- ACN Numbers: Extract 9-digit Australian Company Numbers
- Audit Numbers: Look for audit matrix identifiers, CAR numbers
3. TABLE DATA EXTRACTION:
- For Vehicle Registration tables: Extract ALL columns including maintenance records, weight verification, suspension data
- For attendance lists: Extract ALL names with their positions/roles
- For management summaries: Extract specific dates, numbers, and compliance details
4. MISSING DATA HANDLING:
- If data is not found in PDF, use "Not Available" instead of "Entry"
- For empty date ranges, use "Date range not specified"
- For missing numbers, use "Not provided"
- Only use actual data found in the PDF text
5. OPERATOR DECLARATION CRITICAL RULES:
- "Print Name": Must be the COMPANY REPRESENTATIVE signing the operator declaration (NOT the auditor)
- Look for "OPERATOR DECLARATION" section - the person signing this is from the company
- "Position Title": Their job role within the company (Director, Compliance Officer, Manager, etc.)
- NEVER use the auditor's name (Greg Dyer) for operator declaration
6. DATA CONSISTENCY:
- Ensure the same company name appears throughout all sections
- Ensure the same people appear consistently with correct roles
- Cross-reference data between sections for accuracy
7. QUALITY VALIDATION:
- Verify extracted company name matches throughout the document
- Check that dates are logical and properly formatted
- Ensure vehicle registrations follow proper format
CRITICAL: Extract data ONLY from the provided PDF text. Do not use any external knowledge or previous document data.
Output ONLY the updated JSON with all fields filled using the extracted data. No markdown, no explanations, just valid JSON."""
    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a precise data extraction assistant specializing in audit documents. Extract data EXACTLY as it appears in the source document. Only reply with valid JSON - no markdown, no explanations, no extra formatting. Be thorough and extract ALL requested fields from the provided text."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=6000,  # generous limit so large templates are not truncated
        temperature=0.1   # low temperature keeps extraction deterministic while tolerating minor text variations
    )
    updated_json_str = response.choices[0].message.content.strip()
    # Clean up common formatting issues (model sometimes wraps output in a Markdown code fence)
    if updated_json_str.startswith("```json"):
        updated_json_str = updated_json_str[7:]
    if updated_json_str.endswith("```"):
        updated_json_str = updated_json_str[:-3]
    updated_json_str = updated_json_str.strip()

    # --- Try to parse as JSON ---
    try:
        parsed = json.loads(updated_json_str)

        # Basic validation
        print("🔍 Validating extracted data...")
        original_data = json.loads(word_json)

        # Check if we have the same structure
        original_keys = set(original_data.keys()) if isinstance(original_data, dict) else set()
        parsed_keys = set(parsed.keys()) if isinstance(parsed, dict) else set()
        if original_keys and parsed_keys:
            missing_keys = original_keys - parsed_keys
            if missing_keys:
                print(f"⚠️ Warning: Missing keys in extraction: {missing_keys}")
            added_keys = parsed_keys - original_keys
            if added_keys:
                print(f"⚠️ Warning: Unexpected keys added: {added_keys}")

        # Save the parsed JSON
        if hasattr(output_file, "write"):
            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
        print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))

        # Print extraction summary
        print("📊 Extraction Summary:")
        if isinstance(parsed, dict):
            total_fields = sum(len(v) if isinstance(v, list) else 1 for v in parsed.values())
            print(f"  - Total sections: {len(parsed)}")
            print(f"  - Total data points extracted: {total_fields}")

        # Debug: print a preview of the updated JSON content
        print("\n📋 UPDATED JSON CONTENT:")
        print("=" * 80)
        preview = json.dumps(parsed, indent=2, ensure_ascii=False)
        print(preview[:3000] + ("..." if len(preview) > 3000 else ""))
        print("=" * 80)
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str[:1000] + "..." if len(updated_json_str) > 1000 else updated_json_str)
        print(f"\n❌ JSON Parse Error: {e}")
        print("🔧 Attempting to fix common JSON issues...")

        # Try to fix common issues
        try:
            # Remove trailing commas
            fixed_json = re.sub(r',(\s*[}\]])', r'\1', updated_json_str)
            parsed = json.loads(fixed_json)
            print("✅ Fixed JSON formatting issues")
            if hasattr(output_file, "write"):
                json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                output_file.flush()
            else:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(parsed, f, indent=2, ensure_ascii=False)
            print("✅ JSON saved after fixes")
        except Exception as fix_error:
            print(f"❌ Could not fix JSON: {fix_error}")
            raise e
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        raise e

if __name__ == "__main__":
    import sys

    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(1)

    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
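
# --- Example: calling update_json_with_pdf with in-memory file objects ---
# A minimal sketch (kept commented out so it never runs on import) showing that the
# function also accepts file-like objects, since read_any() handles anything with a
# .read() method. The template/PDF strings and "hypothetical_output.json" below are
# illustrative placeholders, and the call still requires OPENAI_API_KEY to be set.
#
#   import io
#
#   template = io.StringIO('{"Company Name": "Entry", "Audit Date": "Entry"}')
#   pdf_text = io.StringIO("Audit conducted for Example Transport Pty Ltd on 5th July 2023 ...")
#   with open("hypothetical_output.json", "w", encoding="utf-8") as out:
#       update_json_with_pdf(template, pdf_text, out)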