Shami96 committed (verified)
Commit f486b52 · 1 Parent(s): 704d2a2

Update update_docx_with_pdf.py

Files changed (1):
  update_docx_with_pdf.py (+199 -112)
update_docx_with_pdf.py CHANGED
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
 import os
 import sys
 import json
@@ -6,14 +10,19 @@ import time
 import re
 from typing import Optional

-# Try to import OpenAI client in the style you used previously
 try:
     from openai import OpenAI
-except Exception as e:
     OpenAI = None

 RETRIES = 3
 RETRY_DELAY = 1.0 # seconds between retries

 def read_any(path_or_file):
     """Read content from file path or file-like object."""
@@ -27,47 +36,102 @@ def read_any(path_or_file):
     with open(path_or_file, "r", encoding="utf-8") as fh:
         return fh.read()

-def extract_json_substring(s: str) -> Optional[str]:
     """
-    Attempt to find the first balanced JSON object substring in s.
-    Returns the substring or None.
     """
     if not s:
         return None
-    # Find first '{' then walk forward counting braces
-    start = s.find("{")
-    if start == -1:
-        return None
-    depth = 0
-    in_string = False
-    escape = False
-    for i in range(start, len(s)):
-        ch = s[i]
-        if ch == '"' and not escape:
-            in_string = not in_string
-        if in_string:
-            if ch == "\\" and not escape:
-                escape = True
-            else:
-                escape = False
-            continue
-        if ch == "{":
-            depth += 1
-        elif ch == "}":
-            depth -= 1
-            if depth == 0:
-                candidate = s[start:i+1]
-                return candidate
     return None

-def try_parse_json(s: str):
-    """Try standard json.loads, return parsed or raise."""
     return json.loads(s)

 def safe_write(path: str, data):
     with open(path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=2, ensure_ascii=False)

 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     # --- load inputs ---
     word_json_text = read_any(word_json_file)
@@ -86,20 +150,28 @@ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
             f.write(word_json_text)
         return

-    # --- build prompt ---
-    user_prompt = f"""
-Here is a JSON template. It contains only the fields that need updating:
-{json.dumps(word_json, ensure_ascii=False)}

-Here is the extracted text from a PDF:
-{pdf_txt}

-Instructions:
-- ONLY update the fields present in the JSON template, using information from the PDF text.
-- DO NOT add any extra fields, and do not change the JSON structure.
-- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
-- Ensure your output is valid JSON. If you cannot find data for a field, keep its existing value in the template.
-"""

     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
@@ -120,116 +192,132 @@ Instructions:
         safe_write(output_file, word_json)
         return

     client = OpenAI(api_key=api_key)

-    system_msgs = [
-        "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON.",
-    ]
-
-    # Progressive user prompts: first attempt standard, later attempt stricter guidance
-    additional_user_variants = [
-        user_prompt,
-        user_prompt + "\nIf you must, you may output only a minimal JSON by keeping unspecified fields unchanged.",
-        user_prompt + "\nIMPORTANT: Output must be exactly and only valid JSON. If you append anything else, I will ignore it.",
-    ]
-
-    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o") # keep same default you used

     raw_outputs = []
     parsed = None

-    for attempt in range(RETRIES):
-        user_content = additional_user_variants[min(attempt, len(additional_user_variants)-1)]
        try:
-            print(f"🛰️ Calling LLM (attempt {attempt+1}/{RETRIES}) with model {model_name}...")
-            response = client.chat.completions.create(
-                model=model_name,
-                messages=[
-                    {"role": "system", "content": system_msgs[0]},
-                    {"role": "user", "content": user_content}
-                ],
-                max_tokens=4096,
-                temperature=0.0
-            )
-            # The SDK returns different shapes; attempt to access responsibly
-            raw_text = None
-            try:
-                # preferred: choices[0].message.content
-                raw_text = response.choices[0].message.content
-            except Exception:
-                try:
-                    raw_text = response.choices[0].text
-                except Exception:
-                    raw_text = str(response)
-            if isinstance(raw_text, bytes):
-                raw_text = raw_text.decode("utf-8", errors="replace")
-            raw_text = raw_text.strip()
             raw_outputs.append(raw_text)
             # Try parse as JSON directly
             try:
-                parsed = json.loads(raw_text)
                 print("✅ Model returned valid JSON.")
-                # write output and exit
                 if hasattr(output_file, "write"):
                     json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                     output_file.flush()
                 else:
                     safe_write(output_file, parsed)
                 return parsed
-            except Exception as e:
-                print("⚠️ Model output was not valid JSON. Will attempt to extract JSON substring.")
-                # try extracting json substring
                 candidate = extract_json_substring(raw_text)
                 if candidate:
                     try:
-                        parsed = json.loads(candidate)
                         print("✅ Successfully extracted and parsed JSON substring from model output.")
                         if hasattr(output_file, "write"):
                             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                             output_file.flush()
                         else:
                             safe_write(output_file, parsed)
-                        # save raw for debugging too
-                        raw_path = f"{output_file}.model_raw.txt" if isinstance(output_file, str) else f"{getattr(output_file, 'name', 'output')}.model_raw.txt"
-                        with open(raw_path, "w", encoding="utf-8") as rf:
-                            rf.write(raw_text)
                         return parsed
                     except Exception:
-                        print("⚠️ Extracted substring still not valid JSON.")
                 else:
                     print("⚠️ Could not find a balanced JSON substring in the model output.")
-            # if here, wait and retry
-        except Exception as e:
-            print(f"⚠️ Exception while calling model: {e}")
         time.sleep(RETRY_DELAY)

     # If we've reached here, all attempts failed
-    # Save raw outputs for debugging
     print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
-    # write raw outputs to file next to output_file
-    raw_path = None
     try:
-        raw_path = f"{output_file}.model_raw.txt" if isinstance(output_file, str) else f"{getattr(output_file, 'name', 'output')}.model_raw.txt"
         with open(raw_path, "w", encoding="utf-8") as rf:
             rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
-            for i, out in enumerate(raw_outputs):
-                rf.write(f"--- ATTEMPT {i+1} ---\n")
-                rf.write(out + "\n\n")
             rf.write("\n=== END ===\n\n")
-            rf.write("\n\n=== PDF TEXT USED ===\n\n")
-            rf.write(pdf_txt or "")
         print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
     except Exception as e:
-        print(f"⚠️ Failed to save raw model output: {e}")

-    # Also create a salvage file for manual inspection
-    salvage_path = None
     try:
-        salvage_path = f"{output_file}.salvage.json" if isinstance(output_file, str) else f"{getattr(output_file, 'name', 'output')}.salvage.json"
         salvage_bundle = {
             "original_word_json": word_json,
-            "pdf_text_sample": (pdf_txt[:2000] + "...") if pdf_txt else "",
-            "raw_outputs_path": raw_path
         }
         with open(salvage_path, "w", encoding="utf-8") as sf:
             json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
@@ -250,6 +338,7 @@ Instructions:

     return None

 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
@@ -258,14 +347,12 @@ if __name__ == "__main__":
     try:
         update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
     except Exception as e:
-        # This top-level catch ensures the script exits successfully while logging the issue.
         print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
-        # Attempt to copy original json to output before exiting
         try:
             with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                 outf.write(inf.read())
             print("Wrote original input JSON to output due to exception.")
         except Exception:
             pass
-        # exit with status 0 so calling process doesn't crash (preserve pipeline behavior)
         sys.exit(0)

 #!/usr/bin/env python3
+"""
+update_docx_with_pdf.py
+"""
+
 import os
 import sys
 import json

 import re
 from typing import Optional

+# Try to import OpenAI client in the style used previously
 try:
     from openai import OpenAI
+except Exception:
     OpenAI = None

+# Config
 RETRIES = 3
 RETRY_DELAY = 1.0 # seconds between retries
+DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
+MAX_TOKENS = 4096
+TEMPERATURE = 0.0
+

 def read_any(path_or_file):
     """Read content from file path or file-like object."""

     with open(path_or_file, "r", encoding="utf-8") as fh:
         return fh.read()

+
+def find_first_balanced_json(s: str) -> Optional[str]:
     """
+    Scan the input string and return the first substring that is a balanced JSON object
+    starting with '{' and ending with the matching '}' that parses successfully.
     """
     if not s:
         return None
+    # Find all possible '{' positions
+    for m in re.finditer(r"\{", s):
+        start = m.start()
+        depth = 0
+        in_str = False
+        escape = False
+        for i in range(start, len(s)):
+            ch = s[i]
+            if ch == '"' and not escape:
+                in_str = not in_str
+            if in_str:
+                # handle escape toggling but don't treat braces inside strings
+                if ch == "\\" and not escape:
+                    escape = True
+                else:
+                    escape = False
+                continue
+            if ch == "{":
+                depth += 1
+            elif ch == "}":
+                depth -= 1
+                if depth == 0:
+                    candidate = s[start : i + 1]
+                    try:
+                        json.loads(candidate)
+                        return candidate
+                    except Exception:
+                        # candidate not valid JSON (maybe trailing commas etc.) -> continue searching
+                        break
     return None

+
+def extract_json_substring(s: str) -> Optional[str]:
+    """
+    Wrapper for find_first_balanced_json kept for compatibility with existing naming.
+    """
+    return find_first_balanced_json(s)
+
+
+def try_parse_json_str(s: str):
+    """Attempt to parse JSON string, raising the same exceptions as json.loads."""
     return json.loads(s)

+
 def safe_write(path: str, data):
     with open(path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=2, ensure_ascii=False)

+
+def save_raw(path: str, text: str):
+    try:
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(text)
+    except Exception:
+        # best-effort; don't crash
+        pass
+
+
+def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
+    """
+    Call the model and return raw text content. Support variations in SDK response shape.
+    """
+    resp = client.chat.completions.create(
+        model=model_name,
+        messages=[
+            {"role": "system", "content": system_msg},
+            {"role": "user", "content": user_msg},
+        ],
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
+    )
+
+    # Try to extract raw text in a few shapes
+    raw_text = ""
+    try:
+        # New-style: resp.choices[0].message.content
+        raw_text = resp.choices[0].message.content
+    except Exception:
+        try:
+            # Older shape: resp.choices[0].text
+            raw_text = resp.choices[0].text
+        except Exception:
+            raw_text = str(resp)
+    if isinstance(raw_text, bytes):
+        raw_text = raw_text.decode("utf-8", errors="replace")
+    return (raw_text or "").strip()
+
+
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
     # --- load inputs ---
     word_json_text = read_any(word_json_file)

             f.write(word_json_text)
         return

+    # --- build base prompts ---
+    system_msg = (
+        "You are a strict JSON extraction assistant. Only output valid JSON with no surrounding text, "
+        "no markdown, no explanation. The JSON must be parseable by json.loads()."
+    )

+    user_prompt_template = (
+        "Here is a JSON template that must be updated (DO NOT change structure or keys):\n\n"
+        "{word_json}\n\n"
+        "Here is the extracted text from a PDF (use this to fill/update fields):\n\n"
+        "{pdf_text}\n\n"
+        "Instructions:\n"
+        "- ONLY update fields that already exist in the JSON template using evidence from the PDF text.\n"
+        "- DO NOT add new top-level keys or alter the structure.\n"
+        "- If you cannot find a value for an existing field, leave it unchanged.\n"
+        "- OUTPUT EXACTLY one JSON object and NOTHING else.\n"
+    )

+    user_prompt = user_prompt_template.format(
+        word_json=json.dumps(word_json, ensure_ascii=False),
+        pdf_text=(pdf_txt or "")[:120000], # cap size to avoid truncation/hitting token limits
+    )

     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:

         safe_write(output_file, word_json)
         return

+    # Create client (constructor signature can be adapted if your OpenAI wrapper differs)
     client = OpenAI(api_key=api_key)

+    model_name = DEFAULT_MODEL

     raw_outputs = []
     parsed = None

+    # Try multiple attempts (progressive instructions)
+    for attempt in range(1, RETRIES + 1):
+        variant_user_prompt = user_prompt
+        # On later attempts, append stricter instruction
+        if attempt == 2:
+            variant_user_prompt += "\nIMPORTANT: Return ONLY valid JSON. If you cannot find new values, keep existing template values."
+        elif attempt >= 3:
+            variant_user_prompt += "\nLAST ATTEMPT: Output exactly one JSON object and nothing else. If uncertain, keep fields unchanged."
+
+        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
         try:
+            raw_text = call_model_and_get_raw(client, model_name, system_msg, variant_user_prompt)
             raw_outputs.append(raw_text)
+            # Save raw model output for diagnostics
+            out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
+            raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
+            save_raw(raw_save_path, raw_text)
+
             # Try parse as JSON directly
             try:
+                parsed = try_parse_json_str(raw_text)
                 print("✅ Model returned valid JSON.")
+                # write and return
                 if hasattr(output_file, "write"):
                     json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                     output_file.flush()
                 else:
                     safe_write(output_file, parsed)
                 return parsed
+            except Exception:
+                # try extracting a balanced JSON substring
                 candidate = extract_json_substring(raw_text)
                 if candidate:
                     try:
+                        parsed = try_parse_json_str(candidate)
                         print("✅ Successfully extracted and parsed JSON substring from model output.")
                         if hasattr(output_file, "write"):
                             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                             output_file.flush()
                         else:
                             safe_write(output_file, parsed)
                         return parsed
                     except Exception:
+                        print("⚠️ Extracted substring was not valid JSON after parsing attempt.")
                 else:
                     print("⚠️ Could not find a balanced JSON substring in the model output.")
+
+                # If we get here, the model output is not parseable - attempt a repair pass once per attempt
+                print("🔧 Attempting repair pass: sending model its raw output and asking for VALID JSON only...")
+                repair_system = "You are a JSON repair assistant. The previous model output (possibly with commentary) is provided. Extract and return a single VALID JSON object and NOTHING else. If you cannot produce valid JSON, return {}."
+                # Provide the model its own raw output for repair
+                repair_user = f"Raw model output:\n\n{raw_text}\n\nReturn only valid JSON object."
+                repair_raw = ""
+                try:
+                    repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
+                    # Save repair output
+                    repair_save_path = f"{out_base}.model_raw_attempt{attempt}_repair.txt"
+                    save_raw(repair_save_path, repair_raw)
+
+                    # Try parse repair output
+                    try:
+                        parsed = try_parse_json_str(repair_raw)
+                        print("✅ Repair pass succeeded with valid JSON.")
+                        if hasattr(output_file, "write"):
+                            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
+                            output_file.flush()
+                        else:
+                            safe_write(output_file, parsed)
+                        return parsed
+                    except Exception:
+                        candidate = extract_json_substring(repair_raw)
+                        if candidate:
+                            try:
+                                parsed = try_parse_json_str(candidate)
+                                print("✅ Successfully extracted JSON substring from repair output.")
+                                if hasattr(output_file, "write"):
+                                    json.dump(parsed, output_file, indent=2, ensure_ascii=False)
+                                    output_file.flush()
+                                else:
+                                    safe_write(output_file, parsed)
+                                return parsed
+                            except Exception:
+                                print("⚠️ Repair output contained JSON-like substring but could not be parsed.")
+                        else:
+                            print("⚠️ Repair pass did not produce a parseable JSON substring.")
+                except Exception as rep_err:
+                    print(f"⚠️ Exception during repair pass: {rep_err}")
+
+        except Exception as call_err:
+            print(f"⚠️ Exception while calling model: {call_err}")
+
+        # Wait before next attempt
         time.sleep(RETRY_DELAY)

     # If we've reached here, all attempts failed
     print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
     try:
+        out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
+        raw_path = f"{out_base}.model_raw.txt"
         with open(raw_path, "w", encoding="utf-8") as rf:
             rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
+            for i, out in enumerate(raw_outputs, start=1):
+                rf.write(f"--- ATTEMPT {i} ---\n")
+                rf.write((out or "") + "\n\n")
             rf.write("\n=== END ===\n\n")
+            rf.write("\n\n=== PDF TEXT USED (truncated) ===\n\n")
+            rf.write((pdf_txt or "")[:20000])
         print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
     except Exception as e:
+        print(f"⚠️ Failed to save raw model outputs: {e}")

+    # Also create a salvage bundle for manual inspection
     try:
+        salvage_path = f"{out_base}.salvage.json"
         salvage_bundle = {
             "original_word_json": word_json,
+            "pdf_text_sample": (pdf_txt or "")[:2000],
+            "raw_outputs_path": raw_path,
         }
         with open(salvage_path, "w", encoding="utf-8") as sf:
             json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)


     return None

+
 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")

     try:
         update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
     except Exception as e:
+        # Top-level catch to avoid crashing the pipeline; write original input as fallback.
         print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
         try:
             with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                 outf.write(inf.read())
             print("Wrote original input JSON to output due to exception.")
         except Exception:
             pass
         sys.exit(0)
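
For reference, a minimal sketch of how the balanced-JSON extraction introduced in this commit can be exercised on its own. The import path and the sample string are illustrative assumptions, not part of the commit; the script itself is run as python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>.

# Illustrative sketch (assumes update_docx_with_pdf.py is importable from the working directory).
import json
from update_docx_with_pdf import extract_json_substring

noisy_output = 'Sure, here is the JSON:\n{"title": "Quarterly Report", "year": 2024}\nHope that helps!'
candidate = extract_json_substring(noisy_output)  # first balanced {...} block that json.loads() accepts
if candidate is not None:
    print(json.loads(candidate)["title"])  # -> Quarterly Report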