Spaces:

lucacadalora
/

jatevo

Running on Zero

App Files Files Community

lucacadalora committed on 14 days ago

Commit

aa2b99c

verified ·

1 Parent(s): d7ea6d3

Update app.py

Browse files

Files changed (1) hide show

app.py +231 -285

app.py CHANGED Viewed

@@ -2,10 +2,10 @@ import os
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")  # disable hf_transfer if missing
 import io
-import base64
 import re
 import tempfile
-from typing import List, Tuple, Optional
 import gradio as gr
 import torch
@@ -13,38 +13,13 @@ from transformers import AutoModel, AutoTokenizer
 import spaces
 from PIL import Image, ImageDraw
-# Optional plotting & parsing libs (graceful fallbacks if missing)
-try:
-    import matplotlib.pyplot as plt
-    _HAS_MPL = True
-except Exception:
-    _HAS_MPL = False
 try:
     import pandas as pd
     _HAS_PANDAS = True
 except Exception:
     _HAS_PANDAS = False
-# read_html needs bs4 + lxml; we'll try but don't hard-require
-_HAS_READ_HTML = False
-if _HAS_PANDAS:
-    try:
-        import bs4  # noqa: F401
-        import lxml  # noqa: F401
-        _HAS_READ_HTML = True
-    except Exception:
-        _HAS_READ_HTML = False
-# RDKit (optional)
-_HAS_RDKIT = False
-try:
-    from rdkit import Chem
-    from rdkit.Chem import Draw
-    _HAS_RDKIT = True
-except Exception:
-    _HAS_RDKIT = False
 from gradio.themes import Soft
 from gradio.themes.utils import fonts
@@ -71,96 +46,108 @@ except Exception:
             pass
-# ===== Helpers for deep parsing rendering =====
-def _img_to_data_uri(pil_img, fmt="PNG"):
-    buf = io.BytesIO()
-    pil_img.save(buf, format=fmt)
-    return f"data:image/{fmt.lower()};base64,{base64.b64encode(buf.getvalue()).decode()}"
-def _df_to_chart_data_uri(df: "pd.DataFrame") -> Optional[str]:
-    """Render a simple chart from a DataFrame using matplotlib (single figure, no explicit colors)."""
-    if not _HAS_MPL or not _HAS_PANDAS:
-        return None
-    try:
-        # basic heuristics: first col is x-axis if non-numeric-ish
-        plt.figure()  # one plot per chart (no subplots; no explicit colors)
-        df_plot = df.copy()
-        # If first column is non-numeric, set it as index
-        if df_plot.shape[1] >= 2:
-            xcol = df_plot.columns[0]
-            numeric_x = pd.to_numeric(df_plot[xcol], errors="coerce")
-            if numeric_x.isna().any():
-                df_plot = df_plot.set_index(xcol)
-        # bar for <=5 series, else line
-        if df_plot.shape[1] <= 5:
-            df_plot.plot(kind="bar")
-        else:
-            df_plot.plot()
-        buf = io.BytesIO()
-        plt.tight_layout()
-        plt.savefig(buf, format="PNG", dpi=160)
-        buf.seek(0)
-        return "data:image/png;base64," + base64.b64encode(buf.read()).decode()
-    except Exception:
         return None
-def _html_table_to_df(html: str) -> Optional["pd.DataFrame"]:
-    """Pick the largest <table> from HTML; return as DataFrame or None."""
-    if not _HAS_READ_HTML:
         return None
     try:
-        tables = pd.read_html(html)  # list[DataFrame]
-        if not tables:
-            return None
-        return max(tables, key=lambda t: (t.shape[0] * t.shape[1]))
     except Exception:
         return None
-# --- Fallbacks when we don't get HTML tables ---
-_MD_TABLE_BLOCK_RE = re.compile(
-    r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
-    flags=re.MULTILINE
-)
-def _md_table_to_df(md_text: str) -> Optional["pd.DataFrame"]:
-    """Parse the first Markdown pipe-table into a DataFrame."""
     if not _HAS_PANDAS:
         return None
-    try:
-        m = _MD_TABLE_BLOCK_RE.search(md_text or "")
-        if not m:
-            return None
-        block = m.group(0).strip()
-        # Normalize: remove alignment row, split by pipes
-        lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
-        if len(lines) < 2:
-            return None
-        header = [h.strip() for h in lines[0].strip("|").split("|")]
-        align_or_sep = lines[1]
-        data_lines = lines[2:] if re.search(r":?-+:?", align_or_sep) else lines[1:]
-        rows = []
-        for ln in data_lines:
-            parts = [p.strip() for p in ln.strip("|").split("|")]
-            if len(parts) == len(header):
-                rows.append(parts)
-        if not rows:
-            return None
-        df = pd.DataFrame(rows, columns=header)
-        # try cast numerics where possible
-        for c in df.columns:
-            df[c] = pd.to_numeric(df[c], errors="ignore")
-        return df
-    except Exception:
         return None
-def _numeric_block_to_df(text: str) -> Optional["pd.DataFrame"]:
-    """Very rough fallback: parse whitespace/csv-ish numeric blocks into a DataFrame."""
     if not _HAS_PANDAS:
         return None
-    # grab the largest numeric-ish block: lines containing numbers and separators
     blocks = []
     cur = []
     for ln in (text or "").splitlines():
@@ -168,100 +155,59 @@ def _numeric_block_to_df(text: str) -> Optional["pd.DataFrame"]:
             cur.append(ln)
         else:
             if cur:
-                blocks.append("\n".join(cur))
-                cur = []
-    if cur:
-        blocks.append("\n".join(cur))
-    if not blocks:
-        return None
-    block = max(blocks, key=len)
-    # try CSV first
     try:
-        from io import StringIO
         df = pd.read_csv(StringIO(block))
-        if df.shape[1] >= 2:
-            return df
     except Exception:
         pass
-    # try whitespace sep
     try:
-        from io import StringIO
         df = pd.read_csv(StringIO(block), sep=r"\s+", engine="python")
-        if df.shape[1] >= 2:
-            return df
     except Exception:
         pass
     return None
-_SMILES_REGEX = re.compile(r"(?:SMILES|Smiles)\s*[:：]\s*([A-Za-z0-9@\[\]\(\)\+\-\=\\\/%]+)")
-def _render_smiles_block(text: str) -> List[Tuple[str, str]]:
-    """Find SMILES in text, render with RDKit, return list[(title, data_uri)]."""
-    assets: List[Tuple[str, str]] = []
-    if not _HAS_RDKIT:
-        return assets
-    try:
-        found = _SMILES_REGEX.findall(text or "")
-        for s in found[:6]:  # safety cap
-            mol = Chem.MolFromSmiles(s)
-            if mol is None:
-                continue
-            im = Draw.MolToImage(mol, size=(520, 260))
-            assets.append((f"Molecule (SMILES: {s})", _img_to_data_uri(im)))
-    except Exception:
-        pass
-    return assets
-def _assets_to_markdown_section(assets: List[Tuple[str, str]], parsed_text: str) -> str:
-    out = ["\n\n---\n\n### Parsed Figures (auto-rendered)\n"]
-    if not assets and not parsed_text.strip():
-        out.append("_No deep-parsed content detected._\n")
-        return "".join(out)
-    for title, data_uri in assets:
-        if data_uri:
-            out.append(f"**{title}**\n\n![]({data_uri})\n\n")
-    # Always expose a short snippet so you can see what the model returned
-    snippet = parsed_text.strip()
-    if len(snippet) > 4000:
-        snippet = snippet[:4000] + "\n<!-- truncated -->"
-    if snippet:
-        out.append("**Raw deep-parse output (snippet)**\n\n```text\n")
-        out.append(snippet)
-        out.append("\n```\n")
-    return "".join(out)
-def _assets_to_html_section(assets: List[Tuple[str, str]], parsed_text: str) -> str:
-    """Simple HTML block for the HTML Preview tab; ensures images render even if Markdown sanitizer blocks data URIs."""
-    parts = [
-        '<section class="parsed-figures"><h3>Parsed Figures (auto-rendered)</h3>'
-    ]
-    if not assets and not parsed_text.strip():
-        parts.append("<p><em>No deep-parsed content detected.</em></p></section>")
-        return "".join(parts)
-    for title, data_uri in assets:
-        if data_uri:
-            parts.append(f'<figure><figcaption><strong>{title}</strong></figcaption><img style="max-width:100%;height:auto" src="{data_uri}"/></figure>')
-    if parsed_text.strip():
-        safe = parsed_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-        parts.append(f"<details><summary>Raw deep-parse output (snippet)</summary><pre>{safe[:8000]}</pre></details>")
-    parts.append("</section>")
-    return "".join(parts)
 # ===== Inference Function =====
 @spaces.GPU
 def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
     """
-    Process image with DeepSeek-OCR and return annotated image, markdown, html, and text.
-    Adds deep parsing for figures to render charts (from tables) and chemistry (from SMILES).
     """
     if image is None:
-        return None, "Please upload an image first.", "<p>Please upload an image first.</p>", "Please upload an image first."
-    # handle CPU/GPU
     if torch.cuda.is_available():
         model_runtime = model.to("cuda", dtype=torch.bfloat16)
     else:
@@ -283,10 +229,11 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
             prompt = "<image>\nFree OCR."
         # save image
         temp_image_path = os.path.join(output_path, "temp_image.jpg")
         image.save(temp_image_path)
-        # ===== size config =====
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -296,9 +243,9 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
         }
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
-        # ===== inference (primary pass) =====
         with torch.no_grad():
-            plain_text_result = model_runtime.infer(
                 tokenizer,
                 prompt=prompt,
                 image_file=temp_image_path,
@@ -311,7 +258,7 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
                 eval_mode=is_eval_mode,
             )
-        # ===== collect results =====
         image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
         markdown_result_path = os.path.join(output_path, "result.mmd")
@@ -326,39 +273,49 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
         result_image = None
         if os.path.exists(image_result_path):
             try:
                 result_image = Image.open(image_result_path)
                 result_image.load()
             except Exception:
                 result_image = None
-        # ===== draw bounding boxes if <|det|> tags exist (optional) =====
-        pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
-        matches = list(pattern.finditer(plain_text_result or ""))
         if matches:
-            image_with_bboxes = image.copy()
-            draw = ImageDraw.Draw(image_with_bboxes)
             w, h = image.size
-            for match in matches:
-                x1, y1, x2, y2 = [int(c) for c in match.groups()]
-                x1 = int(x1 / 1000 * w)
-                y1 = int(y1 / 1000 * h)
-                x2 = int(x2 / 1000 * w)
-                y2 = int(y2 / 1000 * h)
-                draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
-            result_image = image_with_bboxes
-        # ===== DEEP PARSING & RENDERING (secondary pass) =====
-        should_run_deep = deep_parse and task_type in {"📄 Convert to Markdown", "📈 Parse Figure"}
-        deep_assets: List[Tuple[str, str]] = []  # (title, data_uri)
-        parsed_text = ""
-        def _run_deep_parse(prompt_text):
             with torch.no_grad():
-                return model_runtime.infer(
                     tokenizer,
-                    prompt=prompt_text,
                     image_file=temp_image_path,
                     output_path=output_path,
                     base_size=config["base_size"],
@@ -367,59 +324,70 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
                     save_results=False,
                     test_compress=True,
                     eval_mode=True,
-                )
-        if should_run_deep:
-            try:
-                parsed_text = _run_deep_parse("<image>\nParse the figure.") or ""
-            except Exception:
-                parsed_text = ""
-            # 1) Charts/tables:
-            df = None
-            if "<table" in parsed_text.lower() and _HAS_PANDAS:
-                df = _html_table_to_df(parsed_text)
-            if df is None:  # fallback: markdown pipe-table
-                df = _md_table_to_df(parsed_text)
-            if df is None:  # fallback: generic numeric block
-                df = _numeric_block_to_df(parsed_text)
-            if df is not None:
-                chart_uri = _df_to_chart_data_uri(df)
-                if chart_uri:
-                    deep_assets.append(("Figure (re-rendered from parsed data)", chart_uri))
-            # 2) Chemistry (SMILES)
-            deep_assets.extend(_render_smiles_block(parsed_text))
-        # ===== Append deep assets into the Markdown + build HTML preview =====
-        html_preview = ""  # for HTML tab
-        if task_type == "📄 Convert to Markdown":
-            # extend markdown with a diagnostic/asset section regardless of success,
-            # so you can see whether deep-parse attempted
-            markdown_content = markdown_content + _assets_to_markdown_section(deep_assets, parsed_text)
-            html_preview = _assets_to_html_section(deep_assets, parsed_text)
-        elif task_type == "📈 Parse Figure":
-            # just show what we got from deep parse
-            header = "# Parse Figure\n\n"
-            body = _assets_to_markdown_section(deep_assets, parsed_text)
-            markdown_content = header + body
-            html_preview = _assets_to_html_section(deep_assets, parsed_text)
-        else:
-            # other tasks: keep as-is, but still provide an HTML tab with any assets
-            if deep_assets or parsed_text.strip():
-                markdown_content = markdown_content + _assets_to_markdown_section(deep_assets, parsed_text)
-                html_preview = _assets_to_html_section(deep_assets, parsed_text)
             else:
-                html_preview = "<p>No parsed-figure content.</p>"
-        # ===== Decide what to show in the "Markdown Source (or Eval Output)" tab =====
-        text_result = plain_text_result if plain_text_result else markdown_content
-        # return (image, markdown, html, text)
-        return result_image, markdown_content, html_preview, text_result
 # ===== Theme and UI =====
@@ -460,45 +428,32 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(
-                type="pil", label="Upload Image", sources=["upload", "clipboard"]
-            )
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Gundam (Recommended)",
                 label="Model Size",
             )
             task_type = gr.Dropdown(
-                choices=[
-                    "📝 Free OCR",
-                    "📄 Convert to Markdown",
-                    "📈 Parse Figure",
-                    "🔍 Locate Object by Reference",
-                ],
                 value="📄 Convert to Markdown",
                 label="Task Type",
             )
             ref_text_input = gr.Textbox(
                 label="Reference Text (for Locate task)",
                 placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
                 visible=False,
             )
             eval_mode_checkbox = gr.Checkbox(
                 value=False,
                 label="Enable Evaluation Mode",
                 info="Returns only plain text (faster). Uncheck to get annotated image and markdown.",
             )
             deep_parse_checkbox = gr.Checkbox(
                 value=True,
-                label="Deep parse and re-render figures (charts/molecules)",
-                info="Runs a secondary pass to parse tables/SMILES and embeds rendered visuals into Markdown.",
             )
             submit_btn = gr.Button("Process Image", variant="primary")
         with gr.Column(scale=2):
@@ -507,27 +462,18 @@ with gr.Blocks(
                     output_image = gr.Image(interactive=False)
                 with gr.TabItem("Markdown Preview"):
                     output_markdown = gr.Markdown()
-                with gr.TabItem("Rendered HTML (figures)"):
-                    output_html = gr.HTML()
                 with gr.TabItem("Markdown Source (or Eval Output)"):
-                    output_text = gr.Textbox(
-                        lines=20, show_copy_button=True, interactive=False
-                    )
-    # show/hide reference text box based on selected task
     def toggle_ref_text_visibility(task):
         return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
-    task_type.change(
-        fn=toggle_ref_text_visibility,
-        inputs=task_type,
-        outputs=ref_text_input,
-    )
     submit_btn.click(
         fn=process_image,
         inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
-        outputs=[output_image, output_markdown, output_html, output_text],
     )

 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")  # disable hf_transfer if missing
 import io
+import json
 import re
 import tempfile
+from typing import List, Dict, Any, Optional, Tuple
 import gradio as gr
 import torch
 import spaces
 from PIL import Image, ImageDraw
+# Optional: pandas for better table handling (not mandatory)
 try:
     import pandas as pd
     _HAS_PANDAS = True
 except Exception:
     _HAS_PANDAS = False
 from gradio.themes import Soft
 from gradio.themes.utils import fonts
             pass
+# ====== Utilities: JSON/table parsing ======
+_JSON_FENCE_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
+_ANY_JSON_RE   = re.compile(r"(\{(?:[^{}]|(?1))*\})", re.DOTALL)  # recursive-ish best-effort
+_MD_TABLE_BLOCK_RE = re.compile(
+    r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
+    flags=re.MULTILINE
+)
+def _extract_json(text: str) -> Optional[Dict[str, Any]]:
+    """
+    Try to extract a single JSON object from text.
+    1) prefer ```json fenced block
+    2) fallback to first top-level-looking {...}
+    """
+    if not text:
         return None
+    m = _JSON_FENCE_RE.search(text)
+    candidate = None
+    if m:
+        candidate = m.group(1).strip()
+    else:
+        m2 = _ANY_JSON_RE.search(text)
+        if m2:
+            candidate = m2.group(1).strip()
+    if not candidate:
         return None
     try:
+        return json.loads(candidate)
     except Exception:
         return None
+def _json_to_markdown_table(js: Dict[str, Any]) -> Optional[str]:
+    """
+    Convert a chart-style JSON into a Markdown pipe table.
+    Expected schema (flexible):
+      {
+        "type": "bar|line|...",
+        "title": "...",
+        "x": ["Germany","France",...],  # categories (or "categories")
+        "series": [{"name":"2024","data":[...]} , ...]
+      }
+    We handle keys: x|categories; y ignored (derived from series).
+    """
+    if not js:
+        return None
+    x = js.get("x") or js.get("categories")
+    series = js.get("series")
+    if not isinstance(x, list) or not isinstance(series, list):
+        return None
+    # build rows: first col is x category, next cols are series values
+    headers = ["Category"] + [str(s.get("name", f"series{i}")) for i, s in enumerate(series)]
+    rows: List[List[str]] = []
+    for i, cat in enumerate(x):
+        row = [str(cat)]
+        for s in series:
+            data = s.get("data", [])
+            val = data[i] if i < len(data) else ""
+            row.append(str(val))
+        rows.append(row)
+    # to markdown pipe table
+    header_line = "| " + " | ".join(headers) + " |"
+    align_line  = "| " + " | ".join([":---"] * len(headers)) + " |"
+    data_lines  = ["| " + " | ".join(r) + " |" for r in rows]
+    return "\n".join([header_line, align_line, *data_lines])
def _md_table_to_df(md_text: str):
    """
    Parse the first Markdown pipe table found in `md_text` into a DataFrame.

    Returns None when pandas is unavailable, when no pipe table is found, or
    when no data row has the same number of cells as the header. Rows whose
    cell count differs from the header are dropped.

    The first column is kept as text (category labels); every other column is
    converted to a numeric dtype when all of its values parse as numbers and
    is otherwise left as strings.
    """
    if not _HAS_PANDAS:
        return None
    m = _MD_TABLE_BLOCK_RE.search(md_text or "")
    if not m:
        return None
    block = m.group(0).strip()
    lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
    if len(lines) < 2:
        return None
    header = [h.strip() for h in lines[0].strip("|").split("|")]
    align_or_sep = lines[1]
    data_lines = lines[2:] if re.search(r":?-+:?", align_or_sep) else lines[1:]
    rows = []
    for ln in data_lines:
        parts = [p.strip() for p in ln.strip("|").split("|")]
        if len(parts) == len(header):
            rows.append(parts)
    if not rows:
        return None

    # Build the frame column by column, addressing columns by position:
    # looking columns up by name breaks when the header has duplicate names
    # (df[name] is then a DataFrame and to_numeric raises).
    # errors="ignore" is deprecated in pandas, so the "keep the original
    # strings on failure" behaviour is spelled out with try/except instead.
    columns = []
    for pos, values in enumerate(zip(*rows)):
        col = pd.Series(list(values))
        if pos > 0:
            try:
                col = pd.to_numeric(col)
            except (ValueError, TypeError):
                pass  # non-numeric column: keep the raw strings
        columns.append(col)
    df = pd.concat(columns, axis=1)
    df.columns = header
    return df
+def _numeric_block_to_df(text: str):
+    """Rough fallback: largest numeric-ish block into a DataFrame."""
     if not _HAS_PANDAS:
         return None
     blocks = []
     cur = []
     for ln in (text or "").splitlines():
             cur.append(ln)
         else:
             if cur:
+                blocks.append("\n".join(cur)); cur = []
+    if cur: blocks.append("\n".join(cur))
+    if not blocks: return None
+    block = max(blocks, key=len)
+    from io import StringIO
+    # CSV
     try:
         df = pd.read_csv(StringIO(block))
+        if df.shape[1] >= 2: return df
     except Exception:
         pass
+    # whitespace
     try:
         df = pd.read_csv(StringIO(block), sep=r"\s+", engine="python")
+        if df.shape[1] >= 2: return df
     except Exception:
         pass
     return None
def _df_to_markdown_and_csv(df) -> Tuple[str, str]:
    """
    Serialize a DataFrame twice: as a Markdown pipe table and as CSV text.

    Returns a `(markdown_pipe_table, csv_text)` tuple. Both strings are empty
    when pandas is not available or `df` is None. The CSV is written without
    the index column; the Markdown table left-aligns every column.
    """
    if df is None or not _HAS_PANDAS:
        return "", ""

    def _pipe_row(cells) -> str:
        # One Markdown table line from an iterable of cell values.
        return "| " + " | ".join(str(cell) for cell in cells) + " |"

    column_names = list(df.columns)
    table_lines = [
        _pipe_row(column_names),
        _pipe_row([":---"] * len(column_names)),
    ]
    table_lines.extend(_pipe_row(record.tolist()) for _, record in df.iterrows())

    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    return "\n".join(table_lines), csv_buffer.getvalue()
 # ===== Inference Function =====
 @spaces.GPU
 def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
     """
+    Process image with DeepSeek-OCR and return annotated image, markdown, and text.
+    Adds deep parsing to extract structured DATA from figures (JSON + Table + CSV)
+    and appends it inside the Markdown for RAG indexing.
     """
     if image is None:
+        return None, "Please upload an image first.", "Please upload an image first."
+    # device
     if torch.cuda.is_available():
         model_runtime = model.to("cuda", dtype=torch.bfloat16)
     else:
             prompt = "<image>\nFree OCR."
         # save image
+        os.makedirs(output_path, exist_ok=True)
         temp_image_path = os.path.join(output_path, "temp_image.jpg")
         image.save(temp_image_path)
+        # size
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
         }
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
+        # ===== primary pass =====
         with torch.no_grad():
+            primary_text = model_runtime.infer(
                 tokenizer,
                 prompt=prompt,
                 image_file=temp_image_path,
                 eval_mode=is_eval_mode,
             )
+        # collect results
         image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
         markdown_result_path = os.path.join(output_path, "result.mmd")
         result_image = None
         if os.path.exists(image_result_path):
             try:
+                from PIL import Image
                 result_image = Image.open(image_result_path)
                 result_image.load()
             except Exception:
                 result_image = None
+        # draw bboxes if <|det|>
+        det_pat = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
+        matches = list(det_pat.finditer(primary_text or ""))
         if matches:
+            img_with_boxes = image.copy()
+            draw = ImageDraw.Draw(img_with_boxes)
             w, h = image.size
+            for m in matches:
+                x1,y1,x2,y2 = [int(c) for c in m.groups()]
+                x1 = int(x1/1000*w); y1=int(y1/1000*h); x2=int(x2/1000*w); y2=int(y2/1000*h)
+                draw.rectangle([x1,y1,x2,y2], outline="red", width=3)
+            result_image = img_with_boxes
+        # ===== deep parse for DATA (not rendering) =====
+        # Deep parse runs only when the checkbox is enabled AND the task is Convert to Markdown or Parse Figure.
+        should_deep = deep_parse and (task_type in {"📄 Convert to Markdown", "📈 Parse Figure"})
+        extracted_md_section = ""
+        if should_deep:
+            # ask model for STRICT JSON for charts
+            strict_json_prompt = (
+                "<image>\n"
+                "Parse the figure. If it's a chart, return ONLY a single JSON object with keys:\n"
+                "{\n"
+                '  "type": "bar|line|area|scatter|pie|table|unknown",\n'
+                '  "title": "string",\n'
+                '  "x": ["category", ...],\n'
+                '  "series": [{"name": "string", "data": [number|null, ...]}, ...]\n'
+                "}\n"
+                "If it's a table, return the same JSON using 'type': 'table' and fill x from the first column and series from remaining columns.\n"
+                "Do not include any explanation text. Return ONLY the JSON."
+            )
             with torch.no_grad():
+                deep_text = model_runtime.infer(
                     tokenizer,
+                    prompt=strict_json_prompt,
                     image_file=temp_image_path,
                     output_path=output_path,
                     base_size=config["base_size"],
                     save_results=False,
                     test_compress=True,
                     eval_mode=True,
+                ) or ""
+            js = _extract_json(deep_text)
+            md_table, csv_text = "", ""
+            if js:
+                # Prefer JSON → Markdown table
+                md_table = _json_to_markdown_table(js) or ""
+                if _HAS_PANDAS and md_table:
+                    df = _md_table_to_df(md_table)
+                    if df is not None:
+                        md_table, csv_text = _df_to_markdown_and_csv(df)
+                # Build Markdown block with JSON + (optional) table + CSV
+                extracted_md_section = "### Extracted Figure Data\n\n"
+                extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js, ensure_ascii=False, indent=2) + "\n```\n\n"
+                if md_table:
+                    extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
+                if csv_text:
+                    extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
             else:
+                # Fallback: ask for generic figure parse, then try to pull Markdown tables / numeric blocks
+                with torch.no_grad():
+                    fallback_text = model_runtime.infer(
+                        tokenizer,
+                        prompt="<image>\nParse the figure.",
+                        image_file=temp_image_path,
+                        output_path=output_path,
+                        base_size=config["base_size"],
+                        image_size=config["image_size"],
+                        crop_mode=config["crop_mode"],
+                        save_results=False,
+                        test_compress=True,
+                        eval_mode=True,
+                    ) or ""
+                df = _md_table_to_df(fallback_text)
+                if df is None:
+                    df = _numeric_block_to_df(fallback_text)
+                if df is not None:
+                    md_table, csv_text = _df_to_markdown_and_csv(df)
+                    js_fallback = {
+                        "type": "table",
+                        "title": "",
+                        "x": df.iloc[:,0].astype(str).tolist(),
+                        "series": [{"name": c, "data": [None if pd.isna(v) else (float(v) if str(v).replace('.','',1).isdigit() else v) for v in df[c].tolist()]}
+                                   for c in df.columns[1:]] if _HAS_PANDAS else []
+                    }
+                    extracted_md_section = "### Extracted Figure Data\n\n"
+                    extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js_fallback, ensure_ascii=False, indent=2) + "\n```\n\n"
+                    extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
+                    extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
+                else:
+                    # Nothing structured; keep a short diagnostic (plain text only)
+                    extracted_md_section = "### Extracted Figure Data\n\n_No structured table/series detected. You may need to adjust the deep-parse prompt for this figure type._\n"
+        # ===== Merge into final Markdown =====
+        if extracted_md_section:
+            markdown_content = markdown_content.rstrip() + "\n\n---\n\n" + extracted_md_section
+        # For the “Markdown Source (or Eval Output)” tab
+        text_result = primary_text if primary_text else markdown_content
+        return result_image, markdown_content, text_result
 # ===== Theme and UI =====
     with gr.Row():
         with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Gundam (Recommended)",
                 label="Model Size",
             )
             task_type = gr.Dropdown(
+                choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
                 value="📄 Convert to Markdown",
                 label="Task Type",
             )
             ref_text_input = gr.Textbox(
                 label="Reference Text (for Locate task)",
                 placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
                 visible=False,
             )
             eval_mode_checkbox = gr.Checkbox(
                 value=False,
                 label="Enable Evaluation Mode",
                 info="Returns only plain text (faster). Uncheck to get annotated image and markdown.",
             )
             deep_parse_checkbox = gr.Checkbox(
                 value=True,
+                label="Deep parse and extract figure data (JSON + table + CSV)",
+                info="Adds a second pass that extracts machine-readable data for RAG.",
             )
             submit_btn = gr.Button("Process Image", variant="primary")
         with gr.Column(scale=2):
                     output_image = gr.Image(interactive=False)
                 with gr.TabItem("Markdown Preview"):
                     output_markdown = gr.Markdown()
                 with gr.TabItem("Markdown Source (or Eval Output)"):
+                    output_text = gr.Textbox(lines=20, show_copy_button=True, interactive=False)
     def toggle_ref_text_visibility(task):
         """Return a Textbox update that is visible only for the Locate task."""
         is_locate_task = task == "🔍 Locate Object by Reference"
         return gr.Textbox(visible=is_locate_task)
+    task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
     submit_btn.click(
         fn=process_image,
         inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
+        outputs=[output_image, output_markdown, output_text],
     )