Shami96 committed on
Commit
df67c09
·
verified ·
1 Parent(s): 878a622

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +109 -28
update_docx_with_pdf.py CHANGED
@@ -1,6 +1,7 @@
1
  from openai import OpenAI
2
  import json
3
  import os
 
4
 
5
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
6
  """
@@ -24,35 +25,53 @@ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
24
  pdf_txt = read_any(pdf_txt_file)
25
 
26
  # --- Build prompt ---
27
- user_prompt = f"""Here is a JSON template. It contains only the fields that need updating:
28
-
29
  {word_json}
30
 
31
- Here is the extracted text from a PDF:
32
-
33
  {pdf_txt}
34
 
35
- Instructions:
36
- - ONLY update the fields present in the JSON template, using information from the PDF text.
37
- - DO NOT add any extra fields, and do not change the JSON structure.
38
- - Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
39
- - Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- CRITICAL - For Operator Declaration section:
42
- - The "Print Name" should be the OPERATOR/COMPANY REPRESENTATIVE's name, NOT the auditor's name
43
- - Look for the "OPERATOR DECLARATION" section at the end of the document
44
- - The person signing the operator declaration is usually someone from the company like a manager, compliance officer, or director
45
- - Common examples: "Peter Sheppard", "Jeff Nitschke", etc.
46
- - AVOID using the auditor's name (typically "Greg Dyer" in these documents)
47
- - The "Position Title" should be their job role (e.g., "Compliance Officer", "Director", "Manager", "WHSE Compliance Officer")
48
 
49
- For Attendance List:
50
- - Extract all people listed with their roles (e.g., "Peter Sheppard - Compliance", "Greg Dyer - Auditor")
51
- - Include both operator staff and auditor in the attendance list
 
52
 
53
- - Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
54
- - Make sure the JSON is valid and ready to use.
55
- - Update operator names, auditor names, and all personal details consistently throughout all sections."""
 
 
 
 
 
56
 
57
  # --- Call OpenAI API ---
58
  api_key = os.environ.get("OPENAI_API_KEY")
@@ -64,18 +83,44 @@ For Attendance List:
64
  response = client.chat.completions.create(
65
  model="gpt-4o",
66
  messages=[
67
- {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
68
  {"role": "user", "content": user_prompt}
69
  ],
70
- max_tokens=4096,
71
- temperature=0
72
  )
73
 
74
  updated_json_str = response.choices[0].message.content.strip()
75
 
 
 
 
 
 
 
 
76
  # --- Try to parse as JSON ---
77
  try:
78
  parsed = json.loads(updated_json_str)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  if hasattr(output_file, "write"):
80
  json.dump(parsed, output_file, indent=2, ensure_ascii=False)
81
  output_file.flush()
@@ -83,10 +128,46 @@ For Attendance List:
83
  with open(output_file, "w", encoding="utf-8") as f:
84
  json.dump(parsed, f, indent=2, ensure_ascii=False)
85
  print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
86
- except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  print("⚠️ Model did not return valid JSON. Raw output below:\n")
88
- print(updated_json_str)
89
- print("\n❌ Failed to parse updated JSON:", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  if __name__ == "__main__":
92
  import sys
 
1
  from openai import OpenAI
2
  import json
3
  import os
4
+ import re
5
 
6
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
7
  """
 
25
  pdf_txt = read_any(pdf_txt_file)
26
 
27
  # --- Build prompt ---
28
+ user_prompt = f"""Here is a JSON template with fields that need updating with data from the PDF:
 
29
  {word_json}
30
 
31
+ Here is the extracted text from a PDF document:
 
32
  {pdf_txt}
33
 
34
+ EXTRACTION INSTRUCTIONS:
35
+ 1. COMPREHENSIVE EXTRACTION: Extract data for EVERY field present in the JSON template. Do not skip any field.
36
+
37
+ 2. FIELD-SPECIFIC EXTRACTION RULES:
38
+ - Dates: Look for patterns like "5th July 2023", "28th February 2024"
39
+ - Company Names: Extract the EXACT company name from the current PDF document
40
+ - Registration Numbers: Look for vehicle registrations (format: XX ## XX)
41
+ - Contact Details: Extract addresses, phone numbers, emails exactly as written
42
+ - ACN Numbers: Extract 9-digit Australian Company Numbers
43
+ - Audit Numbers: Look for audit matrix identifiers, CAR numbers
44
+
45
+ 3. TABLE DATA EXTRACTION:
46
+ - For Vehicle Registration tables: Extract ALL columns including maintenance records, weight verification, suspension data
47
+ - For attendance lists: Extract ALL names with their positions/roles
48
+ - For management summaries: Extract specific dates, numbers, and compliance details
49
+
50
+ 4. MISSING DATA HANDLING:
51
+ - If data is not found in PDF, use "Not Available" instead of "Entry"
52
+ - For empty date ranges, use "Date range not specified"
53
+ - For missing numbers, use "Not provided"
54
+ - Only use actual data found in the PDF text
55
 
56
+ 5. OPERATOR DECLARATION CRITICAL RULES:
57
+ - "Print Name": Must be the COMPANY REPRESENTATIVE signing the operator declaration (NOT the auditor)
58
+ - Look for "OPERATOR DECLARATION" section - the person signing this is from the company
59
+ - "Position Title": Their job role within the company (Director, Compliance Officer, Manager, etc.)
60
+ - NEVER use the auditor's name (Greg Dyer) for operator declaration
 
 
61
 
62
+ 6. DATA CONSISTENCY:
63
+ - Ensure the same company name appears throughout all sections
64
+ - Ensure the same people appear consistently with correct roles
65
+ - Cross-reference data between sections for accuracy
66
 
67
+ 7. QUALITY VALIDATION:
68
+ - Verify extracted company name matches throughout the document
69
+ - Check that dates are logical and properly formatted
70
+ - Ensure vehicle registrations follow proper format
71
+
72
+ CRITICAL: Extract data ONLY from the provided PDF text. Do not use any external knowledge or previous document data.
73
+
74
+ Output ONLY the updated JSON with all fields filled using the extracted data. No markdown, no explanations, just valid JSON."""
75
 
76
  # --- Call OpenAI API ---
77
  api_key = os.environ.get("OPENAI_API_KEY")
 
83
  response = client.chat.completions.create(
84
  model="gpt-4o",
85
  messages=[
86
+ {"role": "system", "content": "You are a precise data extraction assistant specializing in audit documents. Extract data EXACTLY as it appears in the source document. Only reply with valid JSON - no markdown, no explanations, no extra formatting. Be thorough and extract ALL requested fields from the provided text."},
87
  {"role": "user", "content": user_prompt}
88
  ],
89
+ max_tokens=6000, # Increased for more comprehensive extraction
90
+ temperature=0.1 # Slightly increased for better handling of variations in text
91
  )
92
 
93
  updated_json_str = response.choices[0].message.content.strip()
94
 
95
+ # Clean up common formatting issues
96
+ if updated_json_str.startswith("```json"):
97
+ updated_json_str = updated_json_str[7:]
98
+ if updated_json_str.endswith("```"):
99
+ updated_json_str = updated_json_str[:-3]
100
+ updated_json_str = updated_json_str.strip()
101
+
102
  # --- Try to parse as JSON ---
103
  try:
104
  parsed = json.loads(updated_json_str)
105
+
106
+ # Basic validation
107
+ print("🔍 Validating extracted data...")
108
+ original_data = json.loads(word_json)
109
+
110
+ # Check if we have the same structure
111
+ original_keys = set(original_data.keys()) if isinstance(original_data, dict) else set()
112
+ parsed_keys = set(parsed.keys()) if isinstance(parsed, dict) else set()
113
+
114
+ if original_keys and parsed_keys:
115
+ missing_keys = original_keys - parsed_keys
116
+ if missing_keys:
117
+ print(f"⚠️ Warning: Missing keys in extraction: {missing_keys}")
118
+
119
+ added_keys = parsed_keys - original_keys
120
+ if added_keys:
121
+ print(f"⚠️ Warning: Unexpected keys added: {added_keys}")
122
+
123
+ # Save the parsed JSON
124
  if hasattr(output_file, "write"):
125
  json.dump(parsed, output_file, indent=2, ensure_ascii=False)
126
  output_file.flush()
 
128
  with open(output_file, "w", encoding="utf-8") as f:
129
  json.dump(parsed, f, indent=2, ensure_ascii=False)
130
  print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
131
+
132
+ # Print extraction summary
133
+ print(f"📊 Extraction Summary:")
134
+ if isinstance(parsed, dict):
135
+ total_fields = sum(len(v) if isinstance(v, list) else 1 for v in parsed.values())
136
+ print(f" - Total sections: {len(parsed)}")
137
+ print(f" - Total data points extracted: {total_fields}")
138
+
139
+ # Debug: Print the updated JSON content
140
+ print("\n🔍 UPDATED JSON CONTENT:")
141
+ print("=" * 80)
142
+ print(json.dumps(parsed, indent=2, ensure_ascii=False)[:3000] + ("..." if len(json.dumps(parsed, indent=2)) > 3000 else ""))
143
+ print("=" * 80)
144
+
145
+ except json.JSONDecodeError as e:
146
  print("⚠️ Model did not return valid JSON. Raw output below:\n")
147
+ print(updated_json_str[:1000] + "..." if len(updated_json_str) > 1000 else updated_json_str)
148
+ print(f"\n❌ JSON Parse Error: {e}")
149
+ print("🔧 Attempting to fix common JSON issues...")
150
+
151
+ # Try to fix common issues
152
+ try:
153
+ # Remove trailing commas
154
+ fixed_json = re.sub(r',(\s*[}\]])', r'\1', updated_json_str)
155
+ parsed = json.loads(fixed_json)
156
+ print("✅ Fixed JSON formatting issues")
157
+
158
+ if hasattr(output_file, "write"):
159
+ json.dump(parsed, output_file, indent=2, ensure_ascii=False)
160
+ output_file.flush()
161
+ else:
162
+ with open(output_file, "w", encoding="utf-8") as f:
163
+ json.dump(parsed, f, indent=2, ensure_ascii=False)
164
+ print("✅ JSON saved after fixes")
165
+ except Exception as fix_error:
166
+ print(f"❌ Could not fix JSON: {fix_error}")
167
+ raise e
168
+ except Exception as e:
169
+ print(f"❌ Unexpected error: {e}")
170
+ raise e
171
 
172
  if __name__ == "__main__":
173
  import sys