Shami96 committed (verified)
Commit f486b52 · 1 Parent(s): 704d2a2

Update update_docx_with_pdf.py

Files changed (1):
  update_docx_with_pdf.py (+199 -112)
update_docx_with_pdf.py CHANGED
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
 import os
 import sys
 import json
@@ -6,14 +10,19 @@ import time
 import re
 from typing import Optional

-# Try to import OpenAI client in the style you used previously
 try:
     from openai import OpenAI
-except Exception as e:
     OpenAI = None

 RETRIES = 3
 RETRY_DELAY = 1.0 # seconds between retries

 def read_any(path_or_file):
     """Read content from file path or file-like object."""
@@ -27,47 +36,102 @@ def read_any(path_or_file):
     with open(path_or_file, "r", encoding="utf-8") as fh:
         return fh.read()

-def extract_json_substring(s: str) -> Optional[str]:
     """
-    Attempt to find the first balanced JSON object substring in s.
-    Returns the substring or None.
     """
     if not s:
         return None
-    # Find first '{' then walk forward counting braces
-    start = s.find("{")
-    if start == -1:
-        return None
-    depth = 0
-    in_string = False
-    escape = False
-    for i in range(start, len(s)):
-        ch = s[i]
-        if ch == '"' and not escape:
-            in_string = not in_string
-        if in_string:
-            if ch == "\\" and not escape:
-                escape = True
-            else:
-                escape = False
-            continue
-        if ch == "{":
-            depth += 1
-        elif ch == "}":
-            depth -= 1
-            if depth == 0:
-                candidate = s[start:i+1]
-                return candidate
     return None

-def try_parse_json(s: str):
-    """Try standard json.loads, return parsed or raise."""
     return json.loads(s)

 def safe_write(path: str, data):
     with open(path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=2, ensure_ascii=False)

 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     # --- load inputs ---
     word_json_text = read_any(word_json_file)
@@ -86,20 +150,28 @@ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
             f.write(word_json_text)
         return

-    # --- build prompt ---
-    user_prompt = f"""
-Here is a JSON template. It contains only the fields that need updating:
-{json.dumps(word_json, ensure_ascii=False)}

-Here is the extracted text from a PDF:
-{pdf_txt}

-Instructions:
-- ONLY update the fields present in the JSON template, using information from the PDF text.
-- DO NOT add any extra fields, and do not change the JSON structure.
-- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
-- Ensure your output is valid JSON. If you cannot find data for a field, keep its existing value in the template.
-"""

     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
@@ -120,116 +192,132 @@ Instructions:
         safe_write(output_file, word_json)
         return

     client = OpenAI(api_key=api_key)

-    system_msgs = [
-        "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON.",
-    ]
-
-    # Progressive user prompts: first attempt standard, later attempt stricter guidance
-    additional_user_variants = [
-        user_prompt,
-        user_prompt + "\nIf you must, you may output only a minimal JSON by keeping unspecified fields unchanged.",
-        user_prompt + "\nIMPORTANT: Output must be exactly and only valid JSON. If you append anything else, I will ignore it.",
-    ]
-
-    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o") # keep same default you used

     raw_outputs = []
     parsed = None

-    for attempt in range(RETRIES):
-        user_content = additional_user_variants[min(attempt, len(additional_user_variants)-1)]
        try:
-            print(f"🛰️ Calling LLM (attempt {attempt+1}/{RETRIES}) with model {model_name}...")
-            response = client.chat.completions.create(
-                model=model_name,
-                messages=[
-                    {"role": "system", "content": system_msgs[0]},
-                    {"role": "user", "content": user_content}
-                ],
-                max_tokens=4096,
-                temperature=0.0
-            )
-            # The SDK returns different shapes; attempt to access responsibly
-            raw_text = None
-            try:
-                # preferred: choices[0].message.content
-                raw_text = response.choices[0].message.content
-            except Exception:
-                try:
-                    raw_text = response.choices[0].text
-                except Exception:
-                    raw_text = str(response)
-            if isinstance(raw_text, bytes):
-                raw_text = raw_text.decode("utf-8", errors="replace")
-            raw_text = raw_text.strip()
             raw_outputs.append(raw_text)
             # Try parse as JSON directly
             try:
-                parsed = json.loads(raw_text)
                 print("✅ Model returned valid JSON.")
-                # write output and exit
                 if hasattr(output_file, "write"):
                     json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                     output_file.flush()
                 else:
                     safe_write(output_file, parsed)
                 return parsed
-            except Exception as e:
-                print("⚠️ Model output was not valid JSON. Will attempt to extract JSON substring.")
-                # try extracting json substring
                 candidate = extract_json_substring(raw_text)
                 if candidate:
                     try:
-                        parsed = json.loads(candidate)
                         print("✅ Successfully extracted and parsed JSON substring from model output.")
                         if hasattr(output_file, "write"):
                             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                             output_file.flush()
                         else:
                             safe_write(output_file, parsed)
-                        # save raw for debugging too
-                        raw_path = f"{output_file}.model_raw.txt" if isinstance(output_file, str) else f"{getattr(output_file, 'name', 'output')}.model_raw.txt"
-                        with open(raw_path, "w", encoding="utf-8") as rf:
-                            rf.write(raw_text)
                         return parsed
                     except Exception:
-                        print("⚠️ Extracted substring still not valid JSON.")
                 else:
                     print("⚠️ Could not find a balanced JSON substring in the model output.")
-            # if here, wait and retry
-        except Exception as e:
-            print(f"⚠️ Exception while calling model: {e}")
         time.sleep(RETRY_DELAY)

     # If we've reached here, all attempts failed
-    # Save raw outputs for debugging
     print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
-    # write raw outputs to file next to output_file
-    raw_path = None
     try:
-        raw_path = f"{output_file}.model_raw.txt" if isinstance(output_file, str) else f"{getattr(output_file, 'name', 'output')}.model_raw.txt"
         with open(raw_path, "w", encoding="utf-8") as rf:
             rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
-            for i, out in enumerate(raw_outputs):
-                rf.write(f"--- ATTEMPT {i+1} ---\n")
-                rf.write(out + "\n\n")
             rf.write("\n=== END ===\n\n")
-            rf.write("\n\n=== PDF TEXT USED ===\n\n")
-            rf.write(pdf_txt or "")
         print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
     except Exception as e:
-        print(f"⚠️ Failed to save raw model output: {e}")

-    # Also create a salvage file for manual inspection
-    salvage_path = None
     try:
-        salvage_path = f"{output_file}.salvage.json" if isinstance(output_file, str) else f"{getattr(output_file, 'name', 'output')}.salvage.json"
         salvage_bundle = {
             "original_word_json": word_json,
-            "pdf_text_sample": (pdf_txt[:2000] + "...") if pdf_txt else "",
-            "raw_outputs_path": raw_path
         }
         with open(salvage_path, "w", encoding="utf-8") as sf:
             json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
@@ -250,6 +338,7 @@ Instructions:

     return None

 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
@@ -258,14 +347,12 @@ if __name__ == "__main__":
     try:
         update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
     except Exception as e:
-        # This top-level catch ensures the script exits successfully while logging the issue.
         print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
-        # Attempt to copy original json to output before exiting
         try:
             with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                 outf.write(inf.read())
             print("Wrote original input JSON to output due to exception.")
         except Exception:
             pass
-        # exit with status 0 so calling process doesn't crash (preserve pipeline behavior)
         sys.exit(0)

 #!/usr/bin/env python3
+"""
+update_docx_with_pdf.py
+"""
+
 import os
 import sys
 import json

 import re
 from typing import Optional

+# Try to import OpenAI client in the style used previously
 try:
     from openai import OpenAI
+except Exception:
     OpenAI = None

+# Config
 RETRIES = 3
 RETRY_DELAY = 1.0 # seconds between retries
+DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
+MAX_TOKENS = 4096
+TEMPERATURE = 0.0
+

 def read_any(path_or_file):
     """Read content from file path or file-like object."""

     with open(path_or_file, "r", encoding="utf-8") as fh:
         return fh.read()

+
+def find_first_balanced_json(s: str) -> Optional[str]:
     """
+    Scan the input string and return the first substring that is a balanced JSON object
+    starting with '{' and ending with the matching '}' that parses successfully.
     """
     if not s:
         return None
+    # Find all possible '{' positions
+    for m in re.finditer(r"\{", s):
+        start = m.start()
+        depth = 0
+        in_str = False
+        escape = False
+        for i in range(start, len(s)):
+            ch = s[i]
+            if ch == '"' and not escape:
+                in_str = not in_str
+            if in_str:
+                # handle escape toggling but don't treat braces inside strings
+                if ch == "\\" and not escape:
+                    escape = True
+                else:
+                    escape = False
+                continue
+            if ch == "{":
+                depth += 1
+            elif ch == "}":
+                depth -= 1
+                if depth == 0:
+                    candidate = s[start : i + 1]
+                    try:
+                        json.loads(candidate)
+                        return candidate
+                    except Exception:
+                        # candidate not valid JSON (maybe trailing commas etc.) -> continue searching
+                        break
     return None

+
+def extract_json_substring(s: str) -> Optional[str]:
+    """
+    Wrapper for find_first_balanced_json kept for compatibility with existing naming.
+    """
+    return find_first_balanced_json(s)
+
+
+def try_parse_json_str(s: str):
+    """Attempt to parse JSON string, raising the same exceptions as json.loads."""
     return json.loads(s)

+
 def safe_write(path: str, data):
     with open(path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=2, ensure_ascii=False)

+
+def save_raw(path: str, text: str):
+    try:
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(text)
+    except Exception:
+        # best-effort; don't crash
+        pass
+
+
+def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
+    """
+    Call the model and return raw text content. Support variations in SDK response shape.
+    """
+    resp = client.chat.completions.create(
+        model=model_name,
+        messages=[
+            {"role": "system", "content": system_msg},
+            {"role": "user", "content": user_msg},
+        ],
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
+    )
+
+    # Try to extract raw text in a few shapes
+    raw_text = ""
+    try:
+        # New-style: resp.choices[0].message.content
+        raw_text = resp.choices[0].message.content
+    except Exception:
+        try:
+            # Older shape: resp.choices[0].text
+            raw_text = resp.choices[0].text
+        except Exception:
+            raw_text = str(resp)
+    if isinstance(raw_text, bytes):
+        raw_text = raw_text.decode("utf-8", errors="replace")
+    return (raw_text or "").strip()
+
+
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     # --- load inputs ---
     word_json_text = read_any(word_json_file)

             f.write(word_json_text)
         return

+    # --- build base prompts ---
+    system_msg = (
+        "You are a strict JSON extraction assistant. Only output valid JSON with no surrounding text, "
+        "no markdown, no explanation. The JSON must be parseable by json.loads()."
+    )

+    user_prompt_template = (
+        "Here is a JSON template that must be updated (DO NOT change structure or keys):\n\n"
+        "{word_json}\n\n"
+        "Here is the extracted text from a PDF (use this to fill/update fields):\n\n"
+        "{pdf_text}\n\n"
+        "Instructions:\n"
+        "- ONLY update fields that already exist in the JSON template using evidence from the PDF text.\n"
+        "- DO NOT add new top-level keys or alter the structure.\n"
+        "- If you cannot find a value for an existing field, leave it unchanged.\n"
+        "- OUTPUT EXACTLY one JSON object and NOTHING else.\n"
+    )

+    user_prompt = user_prompt_template.format(
+        word_json=json.dumps(word_json, ensure_ascii=False),
+        pdf_text=(pdf_txt or "")[:120000], # cap size to avoid truncation/hitting token limits
+    )

     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:

         safe_write(output_file, word_json)
         return

+    # Create client (constructor signature can be adapted if your OpenAI wrapper differs)
     client = OpenAI(api_key=api_key)

+    model_name = DEFAULT_MODEL

     raw_outputs = []
     parsed = None

+    # Try multiple attempts (progressive instructions)
+    for attempt in range(1, RETRIES + 1):
+        variant_user_prompt = user_prompt
+        # On later attempts, append stricter instruction
+        if attempt == 2:
+            variant_user_prompt += "\nIMPORTANT: Return ONLY valid JSON. If you cannot find new values, keep existing template values."
+        elif attempt >= 3:
+            variant_user_prompt += "\nLAST ATTEMPT: Output exactly one JSON object and nothing else. If uncertain, keep fields unchanged."
+
+        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
         try:
+            raw_text = call_model_and_get_raw(client, model_name, system_msg, variant_user_prompt)
             raw_outputs.append(raw_text)
+            # Save raw model output for diagnostics
+            out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
+            raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
+            save_raw(raw_save_path, raw_text)
+
             # Try parse as JSON directly
             try:
+                parsed = try_parse_json_str(raw_text)
                 print("✅ Model returned valid JSON.")
+                # write and return
                 if hasattr(output_file, "write"):
                     json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                     output_file.flush()
                 else:
                     safe_write(output_file, parsed)
                 return parsed
+            except Exception:
+                # try extracting a balanced JSON substring
                 candidate = extract_json_substring(raw_text)
                 if candidate:
                     try:
+                        parsed = try_parse_json_str(candidate)
                         print("✅ Successfully extracted and parsed JSON substring from model output.")
                         if hasattr(output_file, "write"):
                             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                             output_file.flush()
                         else:
                             safe_write(output_file, parsed)
                         return parsed
                     except Exception:
+                        print("⚠️ Extracted substring was not valid JSON after parsing attempt.")
                 else:
                     print("⚠️ Could not find a balanced JSON substring in the model output.")
+
+                # If we get here, the model output is not parseable - attempt a repair pass once per attempt
+                print("🔧 Attempting repair pass: sending model its raw output and asking for VALID JSON only...")
+                repair_system = "You are a JSON repair assistant. The previous model output (possibly with commentary) is provided. Extract and return a single VALID JSON object and NOTHING else. If you cannot produce valid JSON, return {}."
+                # Provide the model its own raw output for repair
+                repair_user = f"Raw model output:\n\n{raw_text}\n\nReturn only valid JSON object."
+                repair_raw = ""
+                try:
+                    repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
+                    # Save repair output
+                    repair_save_path = f"{out_base}.model_raw_attempt{attempt}_repair.txt"
+                    save_raw(repair_save_path, repair_raw)
+
+                    # Try parse repair output
+                    try:
+                        parsed = try_parse_json_str(repair_raw)
+                        print("✅ Repair pass succeeded with valid JSON.")
+                        if hasattr(output_file, "write"):
+                            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
+                            output_file.flush()
+                        else:
+                            safe_write(output_file, parsed)
+                        return parsed
+                    except Exception:
+                        candidate = extract_json_substring(repair_raw)
+                        if candidate:
+                            try:
+                                parsed = try_parse_json_str(candidate)
+                                print("✅ Successfully extracted JSON substring from repair output.")
+                                if hasattr(output_file, "write"):
+                                    json.dump(parsed, output_file, indent=2, ensure_ascii=False)
+                                    output_file.flush()
+                                else:
+                                    safe_write(output_file, parsed)
+                                return parsed
+                            except Exception:
+                                print("⚠️ Repair output contained JSON-like substring but could not be parsed.")
+                        else:
+                            print("⚠️ Repair pass did not produce a parseable JSON substring.")
+                except Exception as rep_err:
+                    print(f"⚠️ Exception during repair pass: {rep_err}")
+
+        except Exception as call_err:
+            print(f"⚠️ Exception while calling model: {call_err}")
+
+        # Wait before next attempt
         time.sleep(RETRY_DELAY)

     # If we've reached here, all attempts failed
     print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
     try:
+        out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
+        raw_path = f"{out_base}.model_raw.txt"
         with open(raw_path, "w", encoding="utf-8") as rf:
             rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
+            for i, out in enumerate(raw_outputs, start=1):
+                rf.write(f"--- ATTEMPT {i} ---\n")
+                rf.write((out or "") + "\n\n")
             rf.write("\n=== END ===\n\n")
+            rf.write("\n\n=== PDF TEXT USED (truncated) ===\n\n")
+            rf.write((pdf_txt or "")[:20000])
         print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
     except Exception as e:
+        print(f"⚠️ Failed to save raw model outputs: {e}")

+    # Also create a salvage bundle for manual inspection
     try:
+        salvage_path = f"{out_base}.salvage.json"
         salvage_bundle = {
             "original_word_json": word_json,
+            "pdf_text_sample": (pdf_txt or "")[:2000],
+            "raw_outputs_path": raw_path,
         }
         with open(salvage_path, "w", encoding="utf-8") as sf:
             json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)


     return None

+
 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")

     try:
         update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
     except Exception as e:
+        # Top-level catch to avoid crashing the pipeline; write original input as fallback.
         print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
         try:
             with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                 outf.write(inf.read())
             print("Wrote original input JSON to output due to exception.")
         except Exception:
             pass
         sys.exit(0)
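
For reference, a minimal sketch of how the balanced-JSON extraction introduced in this commit can be exercised on its own. The import path and the sample string are illustrative assumptions, not part of the commit; the script itself is run as python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>.

# Illustrative sketch (assumes update_docx_with_pdf.py is importable from the working directory).
import json
from update_docx_with_pdf import extract_json_substring

noisy_output = 'Sure, here is the JSON:\n{"title": "Quarterly Report", "year": 2024}\nHope that helps!'
candidate = extract_json_substring(noisy_output)  # first balanced {...} block that json.loads() accepts
if candidate is not None:
    print(json.loads(candidate)["title"])  # -> Quarterly Report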