Spaces:

Shami96
/

PDF-Data_Extractor

Sleeping

App Files Files Community

PDF-Data_Extractor / update_docx_with_pdf.py

Shami96

Update update_docx_with_pdf.py

8001b1f verified 3 months ago

raw

history blame

3.11 kB

	# update_docx_with_pdf.py
	from openai import OpenAI
	import json
	import os
	import time

	def read_any(f):
	    """Return the full text content of *f*.

	    Args:
	        f: Either an object exposing ``read()`` (text or binary mode) or a
	            filesystem path accepted by ``open()``.

	    Returns:
	        The content as ``str``. Bytes returned by ``read()`` are decoded as
	        UTF-8; paths are opened in text mode with UTF-8 encoding.

	    Raises:
	        UnicodeDecodeError: If binary content is not valid UTF-8.
	        OSError: If a path cannot be opened or read.
	    """
	    if not hasattr(f, "read"):
	        with open(f, "r", encoding="utf-8") as fh:
	            return fh.read()

	    # Rewind so a stream that was already consumed is read from the start.
	    # This is best-effort: pipes/stdin raise io.UnsupportedOperation (an
	    # OSError subclass) and minimal reader objects may not define seek() at
	    # all; in both cases reading from the current position is the best we
	    # can do, so the failure must not abort the read.
	    try:
	        f.seek(0)
	    except (AttributeError, OSError):
	        pass

	    content = f.read()
	    if isinstance(content, bytes):
	        content = content.decode("utf-8")
	    return content

	def _strip_code_fence(text):
	    """Return *text* without a surrounding Markdown code fence, if present.

	    Valid JSON never starts with a backtick, so dropping a leading fence
	    line (with or without a language tag) and a trailing fence line cannot
	    damage a reply that was already plain JSON.
	    """
	    if not text.startswith("```"):
	        return text
	    lines = text.splitlines()[1:]  # drop the opening fence line
	    if lines and lines[-1].strip() == "```":
	        lines = lines[:-1]
	    return "\n".join(lines).strip()


	def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
	    """Fill a JSON template with values found in extracted PDF text.

	    The template and the PDF text are sent to the ``gpt-4o`` chat model,
	    which is asked to return the template with its fields updated. The
	    reply is parsed as JSON, top-level keys that are not in the template
	    are removed, and the result is written to *output_file*. Up to three
	    requests are made if the reply is unusable.

	    Args:
	        word_json_file: Path or readable object holding the JSON template.
	        pdf_txt_file: Path or readable object holding the PDF text.
	        output_file: Path, or an object with ``write()``, that receives the
	            updated JSON (indented, non-ASCII characters kept as-is).

	    Raises:
	        RuntimeError: If the template is not valid JSON, if the
	            ``OPENAI_API_KEY`` environment variable is missing, or if no
	            usable JSON was returned after all attempts.
	    """
	    word_json = read_any(word_json_file)
	    pdf_txt = read_any(pdf_txt_file)

	    # Parse the template once, before any API call. Doing it inside the
	    # retry loop would spend every attempt on a broken template and then
	    # blame the model for the invalid JSON.
	    try:
	        template = json.loads(word_json)
	    except json.JSONDecodeError as err:
	        raise RuntimeError(f"Template is not valid JSON: {err}") from err

	    user_prompt = f"""
	Here is a JSON template. It contains only the fields that need updating:
	{word_json}
	Here is the extracted text from a PDF:
	{pdf_txt}
	Instructions:
	- ONLY update the fields present in the JSON template, using information from the PDF text.
	- DO NOT add any extra fields, and do not change the JSON structure.
	- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
	- If a field cannot be populated, keep its original value.
	"""

	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
	    client = OpenAI(api_key=api_key)

	    # Try a small number of attempts if the model returns text instead of JSON
	    max_attempts = 3
	    for attempt in range(1, max_attempts + 1):
	        response = client.chat.completions.create(
	            model="gpt-4o",
	            messages=[
	                {"role":"system","content":"You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
	                {"role":"user","content":user_prompt}
	            ],
	            max_tokens=4096,
	            temperature=0
	        )
	        # The message content may be None (for example on a refusal);
	        # treat that as an empty reply so it goes through the retry path.
	        updated_json_str = (response.choices[0].message.content or "").strip()

	        try:
	            parsed = json.loads(_strip_code_fence(updated_json_str))
	        except json.JSONDecodeError:
	            problem = "Model output was not valid JSON."
	        else:
	            # An object template must come back as an object; anything
	            # else means the model changed the structure.
	            if isinstance(template, dict) and not isinstance(parsed, dict):
	                problem = "Model output was not a JSON object like the template."
	            else:
	                problem = None

	        if problem is not None:
	            print(f"⚠️ {problem} Raw output (truncated):")
	            print(updated_json_str[:2000])
	            if attempt < max_attempts:
	                # Pause only when another request will actually follow.
	                time.sleep(1)
	            continue

	        # Key pruning only makes sense for object templates.
	        if isinstance(template, dict):
	            added = set(parsed) - set(template)
	            if added:
	                print("⚠️ Model returned extra top-level keys; pruning:", added)
	                for ak in added:
	                    parsed.pop(ak, None)

	        if hasattr(output_file, "write"):
	            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
	            output_file.flush()
	        else:
	            with open(output_file, "w", encoding="utf-8") as f:
	                json.dump(parsed, f, indent=2, ensure_ascii=False)
	        print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
	        return

	    raise RuntimeError("Model failed to return valid JSON after retries.")

	if __name__ == "__main__":
	    # Command-line entry point: expects exactly three arguments, the
	    # template JSON path, the PDF text path and the output JSON path.
	    import sys

	    if len(sys.argv) != 4:
	        # Usage errors go to stderr so they never mix with normal output.
	        print(
	            "Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>",
	            file=sys.stderr,
	        )
	        # sys.exit is always available; the bare exit() helper is injected
	        # by the site module and is missing under "python -S" or in frozen
	        # builds.
	        sys.exit(1)
	    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])