#!/usr/bin/env python3
"""Fill a JSON template with values found in PDF text by asking an LLM.

The script reads a JSON template and a plain-text PDF extraction, asks an
OpenAI chat model to update the template's fields, and writes the resulting
JSON to the output. Every failure path falls back to writing the original
template (or the raw input text) and the command-line entry point always
exits with status 0.
"""
import json
import os
import re
import sys
import time
from typing import Optional

# The OpenAI SDK is optional at import time: when it cannot be imported the
# script still runs and falls back to writing the unmodified template.
# The broad except is deliberate - any import-time failure means "no SDK".
try:
    from openai import OpenAI
except Exception:
    OpenAI = None

RETRIES = 3
RETRY_DELAY = 1.0  # seconds between retries


def read_any(path_or_file):
    """Return the full text content of a file path or a file-like object.

    Args:
        path_or_file: Either a filesystem path, or an object with a ``read``
            method. File-like objects are rewound to the start when they
            support seeking; bytes content is decoded as UTF-8.

    Returns:
        The content as ``str``.
    """
    if hasattr(path_or_file, "read"):
        try:
            path_or_file.seek(0)
        except (AttributeError, OSError):
            # Non-seekable stream (pipe, stdin, ...): read from the current
            # position instead of failing.
            pass
        content = path_or_file.read()
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        return content
    with open(path_or_file, "r", encoding="utf-8") as fh:
        return fh.read()


def _balanced_object_end(s: str, start: int) -> int:
    """Return the index one past the ``}`` matching ``s[start] == '{'``.

    Braces inside double-quoted strings are ignored and backslash escapes
    inside strings are honoured. Returns -1 when the braces never balance.
    """
    depth = 0
    in_string = False
    escape = False
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return i + 1
    return -1


def extract_json_substring(s: str) -> Optional[str]:
    """Find a balanced ``{...}`` substring of *s* that holds a JSON object.

    Brace groups are examined left to right. The first group that parses as
    JSON is returned. A balanced group that does not parse (for example
    ``{placeholder}`` in prose preceding the real answer) is skipped as a
    whole - its nested groups are never returned on their own, so a fragment
    of a broken object is not mistaken for the answer.

    Returns:
        The first balanced substring that is valid JSON; otherwise the first
        balanced substring (even though it does not parse); ``None`` when *s*
        is empty, has no ``{``, or the first brace group never closes.
    """
    if not s:
        return None
    fallback = None
    start = s.find("{")
    while start != -1:
        end = _balanced_object_end(s, start)
        if end == -1:
            # Unclosed group: everything after it is nested inside it, so
            # there is nothing trustworthy left to examine.
            break
        candidate = s[start:end]
        try:
            json.loads(candidate)
        except (ValueError, RecursionError):
            if fallback is None:
                fallback = candidate
        else:
            return candidate
        start = s.find("{", end)
    return fallback


def try_parse_json(s: str):
    """Parse *s* with ``json.loads``; return the result or let it raise."""
    return json.loads(s)


def safe_write(path: str, data):
    """Serialize *data* as indented UTF-8 JSON into the file at *path*."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _write_json_output(output_file, data):
    """Write *data* as JSON to a path or to a writable file-like object."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        safe_write(output_file, data)


def _try_write_json_output(output_file, data) -> bool:
    """Write *data* to the output; log and return False instead of raising."""
    try:
        _write_json_output(output_file, data)
    except Exception as e:
        print(f"⚠️ Failed to write updated JSON to output: {e}")
        return False
    return True


def _sidecar_path(output_file, suffix: str) -> str:
    """Build the path of a diagnostics file that sits next to the output.

    Path-like outputs keep their full path; file-like outputs use their
    ``name`` attribute, or the literal ``output`` when they have none.
    """
    if isinstance(output_file, (str, os.PathLike)):
        base = os.fspath(output_file)
    else:
        base = getattr(output_file, "name", "output")
    return f"{base}{suffix}"


def _response_text(response) -> str:
    """Pull the reply text out of a chat completion response, stripped.

    Tries ``choices[0].message.content`` first, then ``choices[0].text``,
    and finally the string form of the whole response.
    """
    try:
        raw_text = response.choices[0].message.content
    except Exception:
        try:
            raw_text = response.choices[0].text
        except Exception:
            raw_text = str(response)
    if raw_text is None:
        # An absent content is treated as an empty reply so it is recorded
        # in the diagnostics rather than reported as an API exception.
        raw_text = ""
    if isinstance(raw_text, bytes):
        raw_text = raw_text.decode("utf-8", errors="replace")
    return raw_text.strip()


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Ask the LLM to update a JSON template from PDF text and write the result.

    Args:
        word_json_file: Path or file-like object holding the JSON template.
        pdf_txt_file: Path or file-like object holding the extracted PDF text.
        output_file: Path or writable file-like object receiving the JSON.

    Returns:
        The parsed JSON produced by the model, or ``None`` when the template
        was written unchanged (invalid input, missing API key, missing SDK,
        or every attempt failed).

    Side effects:
        Writes the output. When JSON had to be extracted from surrounding
        text, the raw reply is saved to ``<output>.model_raw.txt``. When all
        attempts fail, ``<output>.model_raw.txt`` and
        ``<output>.salvage.json`` are written for manual inspection.
    """
    # --- load inputs ---
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    try:
        word_json = json.loads(word_json_text)
    except Exception:
        # The template is unusable; pass it through untouched and stop early.
        print("⚠️ Input word_json is not valid JSON. Writing raw input to output and exiting.")
        if hasattr(output_file, "write"):
            output_file.write(word_json_text)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(word_json_text)
        return

    # --- build prompt ---
    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{json.dumps(word_json, ensure_ascii=False)}

Here is the extracted text from a PDF:
{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Ensure your output is valid JSON. If you cannot find data for a field, keep its existing value in the template.
"""

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found in environment variables! Writing original JSON to output and exiting.")
        _write_json_output(output_file, word_json)
        return

    if OpenAI is None:
        print("⚠️ OpenAI SDK not available (could not import OpenAI). Writing original JSON to output and exiting.")
        _write_json_output(output_file, word_json)
        return

    client = OpenAI(api_key=api_key)

    system_msgs = [
        "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON.",
    ]

    # Progressive user prompts: the first attempt is standard, later attempts
    # add stricter guidance.
    additional_user_variants = [
        user_prompt,
        user_prompt + "\nIf you must, you may output only a minimal JSON by keeping unspecified fields unchanged.",
        user_prompt + "\nIMPORTANT: Output must be exactly and only valid JSON. If you append anything else, I will ignore it.",
    ]

    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o")

    raw_outputs = []

    for attempt in range(RETRIES):
        if attempt:
            # Pause only between attempts, never after the final one.
            time.sleep(RETRY_DELAY)

        user_content = additional_user_variants[min(attempt, len(additional_user_variants) - 1)]
        try:
            print(f"🛰️ Calling LLM (attempt {attempt+1}/{RETRIES}) with model {model_name}...")
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_msgs[0]},
                    {"role": "user", "content": user_content},
                ],
                max_tokens=4096,
                temperature=0.0,
            )
            raw_text = _response_text(response)
        except Exception as e:
            print(f"⚠️ Exception while calling model: {e}")
            continue

        raw_outputs.append(raw_text)

        # Only the parse is guarded here, so an I/O problem while writing is
        # never misreported as invalid JSON and never triggers another call.
        try:
            parsed = json.loads(raw_text)
        except (ValueError, RecursionError):
            print("⚠️ Model output was not valid JSON. Will attempt to extract JSON substring.")
        else:
            print("✅ Model returned valid JSON.")
            _try_write_json_output(output_file, parsed)
            return parsed

        candidate = extract_json_substring(raw_text)
        if not candidate:
            print("⚠️ Could not find a balanced JSON substring in the model output.")
            continue

        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            print("⚠️ Extracted substring still not valid JSON.")
            continue

        print("✅ Successfully extracted and parsed JSON substring from model output.")
        _try_write_json_output(output_file, parsed)
        # Keep the untouched reply for debugging; this is best-effort only.
        try:
            with open(_sidecar_path(output_file, ".model_raw.txt"), "w", encoding="utf-8") as rf:
                rf.write(raw_text)
        except Exception as e:
            print(f"⚠️ Failed to save raw model output: {e}")
        return parsed

    # Every attempt failed: keep diagnostics and fall back to the template.
    print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")

    raw_path = None
    try:
        raw_path = _sidecar_path(output_file, ".model_raw.txt")
        with open(raw_path, "w", encoding="utf-8") as rf:
            rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
            for i, out in enumerate(raw_outputs):
                rf.write(f"--- ATTEMPT {i+1} ---\n")
                rf.write(out + "\n\n")
            rf.write("\n=== END ===\n\n")
            rf.write("\n\n=== PDF TEXT USED ===\n\n")
            rf.write(pdf_txt or "")
        print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
    except Exception as e:
        print(f"⚠️ Failed to save raw model output: {e}")

    # A small bundle for manual inspection.
    try:
        salvage_path = _sidecar_path(output_file, ".salvage.json")
        if pdf_txt:
            # Mark the sample with an ellipsis only when text was cut off.
            pdf_sample = pdf_txt[:2000] + ("..." if len(pdf_txt) > 2000 else "")
        else:
            pdf_sample = ""
        salvage_bundle = {
            "original_word_json": word_json,
            "pdf_text_sample": pdf_sample,
            "raw_outputs_path": raw_path,
        }
        with open(salvage_path, "w", encoding="utf-8") as sf:
            json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
        print(f"ℹ️ Salvage bundle saved to: {salvage_path}")
    except Exception as e:
        print(f"⚠️ Failed to save salvage bundle: {e}")

    # Write the original template so the calling process still gets output.
    try:
        _write_json_output(output_file, word_json)
        print("✅ Original JSON template written to output (no updates applied).")
    except Exception as e:
        print(f"⚠️ Failed to write original JSON to output: {e}")

    return None


def main(argv=None):
    """Command-line entry point; always exits with status 0.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Expects the template path, the PDF text path
            and the output path, in that order.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 3:
        print("Usage: python update_docx_with_pdf.py WORD_JSON_FILE PDF_TXT_FILE OUTPUT_FILE")
        sys.exit(0)

    word_json_path, pdf_txt_path, output_path = args
    try:
        update_json_with_pdf(word_json_path, pdf_txt_path, output_path)
    except Exception as e:
        # Top-level boundary: log the problem, then try to pass the input
        # through so the calling pipeline still finds an output file.
        print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
        try:
            with open(word_json_path, "r", encoding="utf-8") as inf, \
                    open(output_path, "w", encoding="utf-8") as outf:
                outf.write(inf.read())
            print("Wrote original input JSON to output due to exception.")
        except (OSError, UnicodeError) as copy_err:
            print(f"⚠️ Could not copy original input JSON to output: {copy_err}")
        # Exit with status 0 so the calling process doesn't crash
        # (preserves pipeline behavior).
        sys.exit(0)


if __name__ == "__main__":
    main()