Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| import os | |
| import sys | |
| import json | |
| import time | |
| import re | |
| from typing import Optional | |
| # Try to import OpenAI client in the style you used previously | |
# Optional dependency: the OpenAI SDK may be missing in some environments.
# Degrade gracefully — OpenAI stays None and callers must check for that
# before constructing a client.
try:
    from openai import OpenAI
except Exception:  # ImportError, or any failure during SDK import
    OpenAI = None

# Retry policy for the LLM call.
RETRIES = 3          # number of attempts before giving up
RETRY_DELAY = 1.0    # seconds between retries
def read_any(path_or_file):
    """Return the full text of *path_or_file*.

    Accepts either a filesystem path or an open file-like object.  A
    file-like object is rewound before reading, and bytes content is
    decoded as UTF-8.
    """
    if not hasattr(path_or_file, "read"):
        # Plain path: open, read, and close via the context manager.
        with open(path_or_file, "r", encoding="utf-8") as handle:
            return handle.read()
    # File-like object: rewind so repeated calls see the whole content.
    path_or_file.seek(0)
    data = path_or_file.read()
    return data.decode("utf-8") if isinstance(data, bytes) else data
def extract_json_substring(s: str) -> Optional[str]:
    """Return the first valid JSON object embedded in *s*, or None.

    Tries a strict parse at each '{' in turn using
    json.JSONDecoder.raw_decode, which correctly handles braces, quotes,
    and escape sequences inside string literals.  Unlike a naive brace
    counter, the returned substring is guaranteed to be parseable JSON,
    and leading non-JSON '{' characters (e.g. prose) are skipped.
    """
    if not s:
        return None
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            # raw_decode returns (obj, end_index); end_index is exclusive.
            _, end = decoder.raw_decode(s, start)
            return s[start:end]
        except ValueError:
            # Not valid JSON at this brace — try the next one.
            start = s.find("{", start + 1)
    return None
def try_parse_json(s: str):
    """Parse *s* as JSON, propagating any decoding error to the caller."""
    return json.loads(s)
def safe_write(path: str, data):
    """Serialize *data* to *path* as pretty-printed, UTF-8 JSON."""
    text = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(text)
def _write_json_output(output_file, data):
    """Write *data* as pretty-printed JSON to a path or a file-like object."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        safe_write(output_file, data)


def _sidecar_path(output_file, suffix):
    """Derive a diagnostics file path next to *output_file* (path or file-like)."""
    base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
    return f"{base}{suffix}"


def _response_text(response):
    """Best-effort extraction of the text payload from an SDK response object.

    Different SDK versions return different shapes; fall back from
    choices[0].message.content to choices[0].text to str(response).
    """
    try:
        raw = response.choices[0].message.content
    except Exception:
        try:
            raw = response.choices[0].text
        except Exception:
            raw = str(response)
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", errors="replace")
    return raw.strip()


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill the fields of a JSON template using text extracted from a PDF.

    Reads a JSON template and PDF text (each a path or a file-like object),
    asks an LLM to update only the template's existing fields, and writes the
    result to *output_file*.  Designed never to crash the calling pipeline:
    on any failure the original template (or raw input) is written to
    *output_file* and diagnostics are saved alongside it.

    Returns the updated JSON object on success, otherwise None.
    """
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    try:
        word_json = json.loads(word_json_text)
    except Exception:
        # The template itself is broken: pass the raw text through so the
        # downstream process still receives an output file.
        print("⚠️ Input word_json is not valid JSON. Writing raw input to output and exiting.")
        if hasattr(output_file, "write"):
            output_file.write(word_json_text)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(word_json_text)
        return

    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{json.dumps(word_json, ensure_ascii=False)}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Ensure your output is valid JSON. If you cannot find data for a field, keep its existing value in the template.
"""

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found in environment variables! Writing original JSON to output and exiting.")
        _write_json_output(output_file, word_json)
        return
    if OpenAI is None:
        print("⚠️ OpenAI SDK not available (could not import OpenAI). Writing original JSON to output and exiting.")
        _write_json_output(output_file, word_json)
        return

    client = OpenAI(api_key=api_key)
    system_msg = "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."

    # Progressively stricter prompts: later attempts add firmer guidance.
    prompt_variants = [
        user_prompt,
        user_prompt + "\nIf you must, you may output only a minimal JSON by keeping unspecified fields unchanged.",
        user_prompt + "\nIMPORTANT: Output must be exactly and only valid JSON. If you append anything else, I will ignore it.",
    ]
    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o")

    raw_outputs = []  # every raw model reply, kept for diagnostics
    for attempt in range(RETRIES):
        user_content = prompt_variants[min(attempt, len(prompt_variants) - 1)]
        try:
            print(f"🛰️ Calling LLM (attempt {attempt+1}/{RETRIES}) with model {model_name}...")
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": user_content},
                ],
                max_tokens=4096,
                temperature=0.0,
            )
            raw_text = _response_text(response)
            raw_outputs.append(raw_text)

            # First, hope the reply is already pure JSON.
            try:
                parsed = json.loads(raw_text)
                print("✅ Model returned valid JSON.")
                _write_json_output(output_file, parsed)
                return parsed
            except Exception:
                print("⚠️ Model output was not valid JSON. Will attempt to extract JSON substring.")

            # Second, try to carve a JSON object out of surrounding prose.
            candidate = extract_json_substring(raw_text)
            if candidate:
                try:
                    parsed = json.loads(candidate)
                    print("✅ Successfully extracted and parsed JSON substring from model output.")
                    _write_json_output(output_file, parsed)
                    # Keep the raw reply next to the result for debugging.
                    with open(_sidecar_path(output_file, ".model_raw.txt"), "w", encoding="utf-8") as rf:
                        rf.write(raw_text)
                    return parsed
                except Exception:
                    print("⚠️ Extracted substring still not valid JSON.")
            else:
                print("⚠️ Could not find a balanced JSON substring in the model output.")
        except Exception as e:
            print(f"⚠️ Exception while calling model: {e}")
        # Wait before the next attempt (success paths have already returned).
        time.sleep(RETRY_DELAY)

    # All attempts failed: save diagnostics, then fall back to the original
    # template so the calling process does not crash.
    print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
    raw_path = None
    try:
        raw_path = _sidecar_path(output_file, ".model_raw.txt")
        with open(raw_path, "w", encoding="utf-8") as rf:
            rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
            for i, out in enumerate(raw_outputs):
                rf.write(f"--- ATTEMPT {i+1} ---\n")
                rf.write(out + "\n\n")
            rf.write("\n=== END ===\n\n")
            rf.write("\n\n=== PDF TEXT USED ===\n\n")
            rf.write(pdf_txt or "")
        print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
    except Exception as e:
        print(f"⚠️ Failed to save raw model output: {e}")

    try:
        salvage_path = _sidecar_path(output_file, ".salvage.json")
        salvage_bundle = {
            "original_word_json": word_json,
            "pdf_text_sample": (pdf_txt[:2000] + "...") if pdf_txt else "",
            "raw_outputs_path": raw_path,
        }
        with open(salvage_path, "w", encoding="utf-8") as sf:
            json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
        print(f"ℹ️ Salvage bundle saved to: {salvage_path}")
    except Exception as e:
        print(f"⚠️ Failed to save salvage bundle: {e}")

    try:
        _write_json_output(output_file, word_json)
        print("✅ Original JSON template written to output (no updates applied).")
    except Exception as e:
        print(f"⚠️ Failed to write original JSON to output: {e}")
    return None
if __name__ == "__main__":
    # Script entry point: <word_json_file> <pdf_txt_file> <output_json_file>.
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(0)
    try:
        update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
    except Exception as e:
        # Never fail the pipeline: log the problem, try to pass the input
        # through to the output, and exit with status 0.
        print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
        try:
            with open(sys.argv[1], "r", encoding="utf-8") as src:
                original_text = src.read()
            with open(sys.argv[3], "w", encoding="utf-8") as dst:
                dst.write(original_text)
            print("Wrote original input JSON to output due to exception.")
        except Exception:
            # Best effort only — nothing more we can do here.
            pass
        sys.exit(0)