File size: 7,780 Bytes
5244c54
3edd648
5244c54
df67c09
89ec944
5244c54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89ec944
5244c54
 
89ec944
5244c54
df67c09
5244c54
f486b52
df67c09
5244c54
e8b46b5
df67c09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5393f7
df67c09
 
 
 
 
f5393f7
df67c09
 
 
 
f5393f7
df67c09
 
 
 
 
 
 
 
3edd648
5244c54
876a319
 
5244c54
25603c9
89ec944
5244c54
 
 
 
df67c09
5244c54
 
df67c09
 
5244c54
f486b52
5244c54
3edd648
df67c09
 
 
 
 
 
 
5244c54
3edd648
5244c54
df67c09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3edd648
5244c54
3edd648
 
89ec944
5244c54
 
df67c09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5244c54
df67c09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f486b52
876a319
5244c54
876a319
 
5244c54
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from openai import OpenAI
import json
import os
import re

def _read_text(source):
    """Return the full text of *source*, a file path or a readable file-like object.

    Bytes returned by a binary handle are decoded as UTF-8.
    """
    if hasattr(source, "read"):
        seek = getattr(source, "seek", None)
        if callable(seek):
            try:
                # Rewind so a handle that was already consumed is read from the start.
                seek(0)
            except (OSError, ValueError):
                # Non-seekable stream (pipe, socket): read from the current position.
                pass
        content = source.read()
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        return content
    with open(source, "r", encoding="utf-8") as fh:
        return fh.read()


def _strip_code_fences(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from *text*."""
    text = text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\s*```$", "", text)
    return text.strip()


def _parse_model_json(raw):
    """Parse *raw* as JSON, retrying once with trailing commas removed.

    Raises:
        json.JSONDecodeError: the original parse error, if the repair attempt
            does not produce valid JSON either.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError as parse_error:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(raw[:1000] + "..." if len(raw) > 1000 else raw)
        print(f"\n❌ JSON Parse Error: {parse_error}")
        print("🔧 Attempting to fix common JSON issues...")

        # Only trailing commas are repaired. The pattern is not string-aware, so
        # it is applied solely as a fallback after strict parsing has failed.
        repaired = re.sub(r',(\s*[}\]])', r'\1', raw)
        try:
            parsed = json.loads(repaired)
        except json.JSONDecodeError as fix_error:
            print(f"❌ Could not fix JSON: {fix_error}")
            raise parse_error from fix_error
        print("✅ Fixed JSON formatting issues")
        return parsed


def _report_key_differences(template_text, parsed):
    """Print warnings for top-level keys missing from or added to *parsed*."""
    try:
        original_data = json.loads(template_text)
    except json.JSONDecodeError as template_error:
        # A malformed template must not be reported as a model failure.
        print(f"⚠️ Warning: Template is not valid JSON; skipping key validation: {template_error}")
        return

    original_keys = set(original_data) if isinstance(original_data, dict) else set()
    parsed_keys = set(parsed) if isinstance(parsed, dict) else set()
    if not (original_keys and parsed_keys):
        return

    missing_keys = original_keys - parsed_keys
    if missing_keys:
        print(f"⚠️ Warning: Missing keys in extraction: {missing_keys}")

    added_keys = parsed_keys - original_keys
    if added_keys:
        print(f"⚠️ Warning: Unexpected keys added: {added_keys}")


def _write_json(data, output_file):
    """Serialise *data* as indented JSON to a writable handle or a file path."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=2, ensure_ascii=False)


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file, *, client=None, model="gpt-4o"):
    """Fill a JSON template with data extracted from PDF text via a chat model.

    The template and the PDF text are embedded in a prompt, sent to the chat
    completions API, and the reply is parsed as JSON (after stripping Markdown
    code fences and, if needed, trailing commas) and written to *output_file*.
    Progress, warnings and a preview of the result are printed to stdout.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for text writing) or file path.
        client: optional object exposing ``chat.completions.create``. When
            omitted, an ``OpenAI`` client is built from ``OPENAI_API_KEY``.
        model: name of the chat model to request.

    Returns:
        The parsed JSON data that was written to *output_file*.

    Raises:
        RuntimeError: if no *client* is given and ``OPENAI_API_KEY`` is unset.
        json.JSONDecodeError: if the model reply is not valid JSON even after
            the trailing-comma repair.
    """
    # --- Load files ---
    word_json = _read_text(word_json_file)
    pdf_txt = _read_text(pdf_txt_file)

    # --- Build prompt ---
    user_prompt = f"""Here is a JSON template with fields that need updating with data from the PDF:
{word_json}

Here is the extracted text from a PDF document:
{pdf_txt}

EXTRACTION INSTRUCTIONS:
1. COMPREHENSIVE EXTRACTION: Extract data for EVERY field present in the JSON template. Do not skip any field.

2. FIELD-SPECIFIC EXTRACTION RULES:
   - Dates: Look for patterns like "5th July 2023", "28th February 2024"
   - Company Names: Extract the EXACT company name from the current PDF document
   - Registration Numbers: Look for vehicle registrations (format: XX ## XX)
   - Contact Details: Extract addresses, phone numbers, emails exactly as written
   - ACN Numbers: Extract 9-digit Australian Company Numbers
   - Audit Numbers: Look for audit matrix identifiers, CAR numbers

3. TABLE DATA EXTRACTION:
   - For Vehicle Registration tables: Extract ALL columns including maintenance records, weight verification, suspension data
   - For attendance lists: Extract ALL names with their positions/roles
   - For management summaries: Extract specific dates, numbers, and compliance details

4. MISSING DATA HANDLING:
   - If data is not found in PDF, use "Not Available" instead of "Entry"
   - For empty date ranges, use "Date range not specified"
   - For missing numbers, use "Not provided"
   - Only use actual data found in the PDF text

5. OPERATOR DECLARATION CRITICAL RULES:
   - "Print Name": Must be the COMPANY REPRESENTATIVE signing the operator declaration (NOT the auditor)
   - Look for "OPERATOR DECLARATION" section - the person signing this is from the company
   - "Position Title": Their job role within the company (Director, Compliance Officer, Manager, etc.)
   - NEVER use the auditor's name (Greg Dyer) for operator declaration

6. DATA CONSISTENCY:
   - Ensure the same company name appears throughout all sections
   - Ensure the same people appear consistently with correct roles
   - Cross-reference data between sections for accuracy

7. QUALITY VALIDATION:
   - Verify extracted company name matches throughout the document
   - Check that dates are logical and properly formatted
   - Ensure vehicle registrations follow proper format

CRITICAL: Extract data ONLY from the provided PDF text. Do not use any external knowledge or previous document data.

Output ONLY the updated JSON with all fields filled using the extracted data. No markdown, no explanations, just valid JSON."""

    # --- Call OpenAI API ---
    if client is None:
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
        client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a precise data extraction assistant specializing in audit documents. Extract data EXACTLY as it appears in the source document. Only reply with valid JSON - no markdown, no explanations, no extra formatting. Be thorough and extract ALL requested fields from the provided text."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=6000,  # Increased for more comprehensive extraction
        temperature=0.1   # Slightly increased for better handling of variations in text
    )

    # The message content may be None (e.g. a refusal); treat that like an
    # empty reply so it takes the same invalid-JSON path as any other bad output.
    updated_json_str = _strip_code_fences(response.choices[0].message.content or "")

    # --- Parse (with one repair attempt), validate and save ---
    parsed = _parse_model_json(updated_json_str)

    print("🔍 Validating extracted data...")
    _report_key_differences(word_json, parsed)

    try:
        _write_json(parsed, output_file)
    except Exception as error:
        print(f"❌ Unexpected error: {error}")
        raise
    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))

    # Print extraction summary
    print("📊 Extraction Summary:")
    if isinstance(parsed, dict):
        total_fields = sum(len(v) if isinstance(v, list) else 1 for v in parsed.values())
        print(f"   - Total sections: {len(parsed)}")
        print(f"   - Total data points extracted: {total_fields}")

    # Debug: print a bounded preview of the updated JSON content
    serialized = json.dumps(parsed, indent=2, ensure_ascii=False)
    print("\n🔍 UPDATED JSON CONTENT:")
    print("=" * 80)
    print(serialized[:3000] + ("..." if len(serialized) > 3000 else ""))
    print("=" * 80)

    return parsed

if __name__ == "__main__":
    # Command-line entry point: expects exactly three positional arguments
    # (template JSON path, PDF text path, output JSON path).
    import sys

    if len(sys.argv) != 4:
        # Usage errors go to stderr so they do not mix with normal output.
        print(
            "Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>",
            file=sys.stderr,
        )
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing under `python -S` or frozen builds.
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])