Spaces:
Sleeping
Sleeping
# update_docx_with_pdf.py
| from openai import OpenAI | |
| import json | |
| import os | |
| import time | |
def read_any(f):
    """Return the full text content of *f*.

    Args:
        f: Either an object exposing ``read()`` (binary or text mode) or a
            filesystem path accepted by ``open()``.

    Returns:
        The content as ``str``. Bytes are decoded as UTF-8, and a leading
        byte-order mark (BOM) is removed so the text can be handed straight
        to ``json.loads``.
    """
    if not hasattr(f, "read"):
        # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM.
        with open(f, "r", encoding="utf-8-sig") as fh:
            return fh.read()

    # Rewind only when the stream supports it; pipes and similar
    # non-seekable streams would otherwise raise on seek().
    seekable = getattr(f, "seekable", None)
    if hasattr(f, "seek") and (seekable is None or seekable()):
        f.seek(0)

    content = f.read()
    if isinstance(content, (bytes, bytearray)):
        return content.decode("utf-8-sig")
    if isinstance(content, str) and content.startswith("\ufeff"):
        # Text-mode stream that was opened without BOM handling.
        content = content[1:]
    return content
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if len(text) >= 6 and text.startswith("```") and text.endswith("```"):
        inner = text[3:-3]
        first_line, newline, rest = inner.partition("\n")
        tag = first_line.strip()
        # Drop an empty opening line or a language tag such as "json".
        if newline and (not tag or tag.isalnum()):
            inner = rest
        return inner.strip()
    return text


def _write_json(data, output_file):
    """Serialize *data* as indented JSON to a writable object or a path."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=2, ensure_ascii=False)


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template from PDF text using the OpenAI chat API.

    The template and the PDF text are embedded in a prompt sent to the
    ``gpt-4o`` model. The reply is parsed as JSON, top-level keys that are
    not in the template are pruned, and the result is written to
    *output_file*. Up to three attempts are made when the reply is unusable.

    Args:
        word_json_file: Path or readable object holding the JSON template.
        pdf_txt_file: Path or readable object holding the extracted PDF text.
        output_file: Path or writable object that receives the updated JSON.

    Raises:
        RuntimeError: If the template is not valid JSON, if
            ``OPENAI_API_KEY`` is not set, or if no attempt produced usable
            JSON.
    """
    word_json = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    # Parse the template once, before any API call. A broken template would
    # otherwise be reported as bad model output and burn every retry.
    try:
        template = json.loads(word_json)
    except json.JSONDecodeError as err:
        raise RuntimeError("Template JSON is not valid JSON.") from err

    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- If a field cannot be populated, keep its original value.
"""
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)

    # Try a small number of attempts if the model returns text instead of JSON
    max_attempts = 3
    for attempt in range(max_attempts):
        if attempt:
            # Pause between attempts only; no sleep after the final failure.
            time.sleep(1)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=4096,
            temperature=0,
        )
        # The message content may be None (e.g. a refusal); treat it as empty.
        raw_reply = (response.choices[0].message.content or "").strip()
        try:
            parsed = json.loads(_strip_code_fence(raw_reply))
        except json.JSONDecodeError:
            print("⚠️ Model output was not valid JSON. Raw output (truncated):")
            print(raw_reply[:2000])
            continue

        if isinstance(template, dict):
            if not isinstance(parsed, dict):
                # Valid JSON of the wrong shape cannot be matched to the template.
                print("⚠️ Model output was not a JSON object. Raw output (truncated):")
                print(raw_reply[:2000])
                continue
            added = set(parsed) - set(template)
            if added:
                print("⚠️ Model returned extra top-level keys; pruning:", added)
                parsed = {key: value for key, value in parsed.items() if key in template}

        _write_json(parsed, output_file)
        print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
        return
    raise RuntimeError("Model failed to return valid JSON after retries.")
if __name__ == "__main__":
    import sys

    # Command-line entry point: template JSON, PDF text, and output path.
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>", file=sys.stderr)
        # sys.exit() is always available; the bare exit() helper is injected
        # by the site module and is missing under `python -S` or when frozen.
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])