Spaces:

lucacadalora
/

jatevo

Running on Zero

App Files Files Community

lucacadalora committed 7 days ago

Commit

7e1f32d

verified ·

1 Parent(s): aa2b99c

Update app.py

Browse files

Files changed (1) hide show

app.py +201 -369

app.py CHANGED Viewed

@@ -1,28 +1,16 @@
 import os
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")  # disable hf_transfer if missing
-import io
-import json
-import re
-import tempfile
-from typing import List, Dict, Any, Optional, Tuple
 import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 import spaces
-from PIL import Image, ImageDraw
-# Optional: pandas for better table handling (not mandatory)
-try:
-    import pandas as pd
-    _HAS_PANDAS = True
-except Exception:
-    _HAS_PANDAS = False
 from gradio.themes import Soft
 from gradio.themes.utils import fonts
 # ===== Model Load =====
 model_name = "deepseek-ai/DeepSeek-OCR"
@@ -46,348 +34,160 @@ except Exception:
             pass
-# ====== Utilities: JSON/table parsing ======
-_JSON_FENCE_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
-_ANY_JSON_RE   = re.compile(r"(\{(?:[^{}]|(?1))*\})", re.DOTALL)  # recursive-ish best-effort
-_MD_TABLE_BLOCK_RE = re.compile(
-    r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
-    flags=re.MULTILINE
-)
-def _extract_json(text: str) -> Optional[Dict[str, Any]]:
     """
-    Try to extract a single JSON object from text.
-    1) prefer ```json fenced block
-    2) fallback to first top-level-looking {...}
     """
-    if not text:
-        return None
-    m = _JSON_FENCE_RE.search(text)
-    candidate = None
-    if m:
-        candidate = m.group(1).strip()
     else:
-        m2 = _ANY_JSON_RE.search(text)
-        if m2:
-            candidate = m2.group(1).strip()
-    if not candidate:
-        return None
-    try:
-        return json.loads(candidate)
-    except Exception:
-        return None
-def _json_to_markdown_table(js: Dict[str, Any]) -> Optional[str]:
-    """
-    Convert a chart-style JSON into a Markdown pipe table.
-    Expected schema (flexible):
-      {
-        "type": "bar|line|...",
-        "title": "...",
-        "x": ["Germany","France",...],  # categories (or "categories")
-        "series": [{"name":"2024","data":[...]} , ...]
-      }
-    We handle keys: x|categories; y ignored (derived from series).
-    """
-    if not js:
-        return None
-    x = js.get("x") or js.get("categories")
-    series = js.get("series")
-    if not isinstance(x, list) or not isinstance(series, list):
-        return None
-    # build rows: first col is x category, next cols are series values
-    headers = ["Category"] + [str(s.get("name", f"series{i}")) for i, s in enumerate(series)]
-    rows: List[List[str]] = []
-    for i, cat in enumerate(x):
-        row = [str(cat)]
-        for s in series:
-            data = s.get("data", [])
-            val = data[i] if i < len(data) else ""
-            row.append(str(val))
-        rows.append(row)
-    # to markdown pipe table
-    header_line = "| " + " | ".join(headers) + " |"
-    align_line  = "| " + " | ".join([":---"] * len(headers)) + " |"
-    data_lines  = ["| " + " | ".join(r) + " |" for r in rows]
-    return "\n".join([header_line, align_line, *data_lines])
-def _md_table_to_df(md_text: str):
-    if not _HAS_PANDAS:
-        return None
-    m = _MD_TABLE_BLOCK_RE.search(md_text or "")
-    if not m:
-        return None
-    block = m.group(0).strip()
-    lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
-    if len(lines) < 2:
-        return None
-    header = [h.strip() for h in lines[0].strip("|").split("|")]
-    align_or_sep = lines[1]
-    data_lines = lines[2:] if re.search(r":?-+:?", align_or_sep) else lines[1:]
-    rows = []
-    for ln in data_lines:
-        parts = [p.strip() for p in ln.strip("|").split("|")]
-        if len(parts) == len(header):
-            rows.append(parts)
-    if not rows:
-        return None
-    df = pd.DataFrame(rows, columns=header)
-    # try cast numerics
-    for c in df.columns[1:]:
-        df[c] = pd.to_numeric(df[c], errors="ignore")
-    return df
-def _numeric_block_to_df(text: str):
-    """Rough fallback: largest numeric-ish block into a DataFrame."""
-    if not _HAS_PANDAS:
-        return None
-    blocks = []
-    cur = []
-    for ln in (text or "").splitlines():
-        if re.search(r"\d", ln) and ("," in ln or "\t" in ln or "  " in ln or "|" in ln):
-            cur.append(ln)
-        else:
-            if cur:
-                blocks.append("\n".join(cur)); cur = []
-    if cur: blocks.append("\n".join(cur))
-    if not blocks: return None
-    block = max(blocks, key=len)
-    from io import StringIO
-    # CSV
-    try:
-        df = pd.read_csv(StringIO(block))
-        if df.shape[1] >= 2: return df
-    except Exception:
-        pass
-    # whitespace
-    try:
-        df = pd.read_csv(StringIO(block), sep=r"\s+", engine="python")
-        if df.shape[1] >= 2: return df
-    except Exception:
-        pass
-    return None
-def _df_to_markdown_and_csv(df) -> Tuple[str, str]:
-    """Return (markdown_pipe_table, csv_text)."""
-    if not _HAS_PANDAS or df is None:
-        return "", ""
-    # Markdown
-    md = []
-    headers = list(df.columns)
-    md.append("| " + " | ".join(map(str, headers)) + " |")
-    md.append("| " + " | ".join([":---"] * len(headers)) + " |")
-    for _, row in df.iterrows():
-        md.append("| " + " | ".join(map(lambda x: str(x), row.tolist())) + " |")
-    md_text = "\n".join(md)
-    # CSV
-    buf = io.StringIO()
-    df.to_csv(buf, index=False)
-    csv_text = buf.getvalue()
-    return md_text, csv_text
-# ===== Inference Function =====
 @spaces.GPU
-def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
     """
-    Process image with DeepSeek-OCR and return annotated image, markdown, and text.
-    Adds deep parsing to extract structured DATA from figures (JSON + Table + CSV)
-    and appends it inside the Markdown for RAG indexing.
     """
-    if image is None:
-        return None, "Please upload an image first.", "Please upload an image first."
-    # device
     if torch.cuda.is_available():
         model_runtime = model.to("cuda", dtype=torch.bfloat16)
     else:
         model_runtime = model.to("cpu", dtype=torch.float32)
-    with tempfile.TemporaryDirectory() as output_path:
-        # ===== choose task prompt =====
-        if task_type == "📝 Free OCR":
-            prompt = "<image>\nFree OCR."
-        elif task_type == "📄 Convert to Markdown":
-            prompt = "<image>\n<|grounding|>Convert the document to markdown."
-        elif task_type == "📈 Parse Figure":
-            prompt = "<image>\nParse the figure."
-        elif task_type == "🔍 Locate Object by Reference":
-            if not ref_text or ref_text.strip() == "":
-                raise gr.Error("Please provide reference text for the Locate task!")
-            prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
-        else:
-            prompt = "<image>\nFree OCR."
-        # save image
-        os.makedirs(output_path, exist_ok=True)
-        temp_image_path = os.path.join(output_path, "temp_image.jpg")
-        image.save(temp_image_path)
-        # size
-        size_configs = {
-            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-        }
-        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
-        # ===== primary pass =====
-        with torch.no_grad():
-            primary_text = model_runtime.infer(
-                tokenizer,
-                prompt=prompt,
-                image_file=temp_image_path,
-                output_path=output_path,
-                base_size=config["base_size"],
-                image_size=config["image_size"],
-                crop_mode=config["crop_mode"],
-                save_results=True,
-                test_compress=True,
-                eval_mode=is_eval_mode,
-            )
-        # collect results
-        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
-        markdown_result_path = os.path.join(output_path, "result.mmd")
-        markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."
-        if os.path.exists(markdown_result_path):
-            try:
-                with open(markdown_result_path, "r", encoding="utf-8") as f:
-                    markdown_content = f.read()
-            except Exception:
-                pass
-        result_image = None
-        if os.path.exists(image_result_path):
-            try:
-                from PIL import Image
-                result_image = Image.open(image_result_path)
-                result_image.load()
-            except Exception:
-                result_image = None
-        # draw bboxes if <|det|>
-        det_pat = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
-        matches = list(det_pat.finditer(primary_text or ""))
-        if matches:
-            img_with_boxes = image.copy()
-            draw = ImageDraw.Draw(img_with_boxes)
-            w, h = image.size
-            for m in matches:
-                x1,y1,x2,y2 = [int(c) for c in m.groups()]
-                x1 = int(x1/1000*w); y1=int(y1/1000*h); x2=int(x2/1000*w); y2=int(y2/1000*h)
-                draw.rectangle([x1,y1,x2,y2], outline="red", width=3)
-            result_image = img_with_boxes
-        # ===== deep parse for DATA (not rendering) =====
-        # We always try deep parse for Convert to Markdown / Parse Figure, otherwise only if checkbox is on
-        should_deep = deep_parse and (task_type in {"📄 Convert to Markdown", "📈 Parse Figure"})
-        extracted_md_section = ""
-        if should_deep:
-            # ask model for STRICT JSON for charts
-            strict_json_prompt = (
-                "<image>\n"
-                "Parse the figure. If it's a chart, return ONLY a single JSON object with keys:\n"
-                "{\n"
-                '  "type": "bar|line|area|scatter|pie|table|unknown",\n'
-                '  "title": "string",\n'
-                '  "x": ["category", ...],\n'
-                '  "series": [{"name": "string", "data": [number|null, ...]}, ...]\n'
-                "}\n"
-                "If it's a table, return the same JSON using 'type': 'table' and fill x from the first column and series from remaining columns.\n"
-                "Do not include any explanation text. Return ONLY the JSON."
-            )
-            with torch.no_grad():
-                deep_text = model_runtime.infer(
                     tokenizer,
-                    prompt=strict_json_prompt,
-                    image_file=temp_image_path,
-                    output_path=output_path,
-                    base_size=config["base_size"],
-                    image_size=config["image_size"],
-                    crop_mode=config["crop_mode"],
-                    save_results=False,
-                    test_compress=True,
-                    eval_mode=True,
-                ) or ""
-            js = _extract_json(deep_text)
-            md_table, csv_text = "", ""
-            if js:
-                # Prefer JSON → Markdown table
-                md_table = _json_to_markdown_table(js) or ""
-                if _HAS_PANDAS and md_table:
-                    df = _md_table_to_df(md_table)
-                    if df is not None:
-                        md_table, csv_text = _df_to_markdown_and_csv(df)
-                # Build Markdown block with JSON + (optional) table + CSV
-                extracted_md_section = "### Extracted Figure Data\n\n"
-                extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js, ensure_ascii=False, indent=2) + "\n```\n\n"
-                if md_table:
-                    extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
-                if csv_text:
-                    extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
-            else:
-                # Fallback: ask for generic figure parse, then try to pull Markdown tables / numeric blocks
-                with torch.no_grad():
-                    fallback_text = model_runtime.infer(
-                        tokenizer,
-                        prompt="<image>\nParse the figure.",
-                        image_file=temp_image_path,
-                        output_path=output_path,
-                        base_size=config["base_size"],
-                        image_size=config["image_size"],
-                        crop_mode=config["crop_mode"],
-                        save_results=False,
-                        test_compress=True,
-                        eval_mode=True,
-                    ) or ""
-                df = _md_table_to_df(fallback_text)
-                if df is None:
-                    df = _numeric_block_to_df(fallback_text)
-                if df is not None:
-                    md_table, csv_text = _df_to_markdown_and_csv(df)
-                    js_fallback = {
-                        "type": "table",
-                        "title": "",
-                        "x": df.iloc[:,0].astype(str).tolist(),
-                        "series": [{"name": c, "data": [None if pd.isna(v) else (float(v) if str(v).replace('.','',1).isdigit() else v) for v in df[c].tolist()]}
-                                   for c in df.columns[1:]] if _HAS_PANDAS else []
-                    }
-                    extracted_md_section = "### Extracted Figure Data\n\n"
-                    extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js_fallback, ensure_ascii=False, indent=2) + "\n```\n\n"
-                    extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
-                    extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
-                else:
-                    # Nothing structured; keep a short diagnostic (plain text only)
-                    extracted_md_section = "### Extracted Figure Data\n\n_No structured table/series detected. You may need to adjust the deep-parse prompt for this figure type._\n"
-        # ===== Merge into final Markdown =====
-        if extracted_md_section:
-            markdown_content = markdown_content.rstrip() + "\n\n---\n\n" + extracted_md_section
-        # For the “Markdown Source (or Eval Output)” tab
-        text_result = primary_text if primary_text else markdown_content
-        return result_image, markdown_content, text_result
 # ===== Theme and UI =====
@@ -407,77 +207,109 @@ custom_css = """
 # ===== Interface =====
 with gr.Blocks(
-    title="DeepSeek-OCR by Jatevo LLM Inference",
     theme=theme,
     css=custom_css,
 ) as demo:
     gr.Markdown(
         """
-        # DeepSeek-OCR by Jatevo LLM Inference
-        Upload an image to extract text using **DeepSeek-OCR**.
-        Supports documents, forms, receipts, figures, and object localization.
         **Model Sizes:**
         - **Tiny** — Fastest, lower accuracy (512×512)
         - **Small** — Fast, good accuracy (640×640)
         - **Base** — Balanced performance (1024×1024)
         - **Large** — Best accuracy, slower (1280×1280)
         - **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Gundam (Recommended)",
                 label="Model Size",
             )
             task_type = gr.Dropdown(
-                choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
                 value="📄 Convert to Markdown",
                 label="Task Type",
             )
             ref_text_input = gr.Textbox(
                 label="Reference Text (for Locate task)",
                 placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
                 visible=False,
             )
             eval_mode_checkbox = gr.Checkbox(
                 value=False,
                 label="Enable Evaluation Mode",
-                info="Returns only plain text (faster). Uncheck to get annotated image and markdown.",
-            )
-            deep_parse_checkbox = gr.Checkbox(
-                value=True,
-                label="Deep parse and extract figure data (JSON + table + CSV)",
-                info="Adds a second pass that extracts machine-readable data for RAG.",
             )
-            submit_btn = gr.Button("Process Image", variant="primary")
         with gr.Column(scale=2):
-            with gr.Tabs():
-                with gr.TabItem("Annotated Image"):
-                    output_image = gr.Image(interactive=False)
-                with gr.TabItem("Markdown Preview"):
-                    output_markdown = gr.Markdown()
-                with gr.TabItem("Markdown Source (or Eval Output)"):
-                    output_text = gr.Textbox(lines=20, show_copy_button=True, interactive=False)
     def toggle_ref_text_visibility(task):
         return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
-    task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
     submit_btn.click(
-        fn=process_image,
-        inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
-        outputs=[output_image, output_markdown, output_text],
     )
 # ===== Launch =====
 if __name__ == "__main__":
     demo.queue(max_size=20)
-    demo.launch()

 import os
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")  # disable hf_transfer if missing
 import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 import spaces
+import tempfile
+from PIL import Image
+import re
 from gradio.themes import Soft
 from gradio.themes.utils import fonts
+import fitz  # PyMuPDF for PDF processing
 # ===== Model Load =====
 model_name = "deepseek-ai/DeepSeek-OCR"
             pass
def pdf_to_images(pdf_path, dpi=200):
    """
    Render every page of a PDF to a PIL image using PyMuPDF.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Resolution for rendering (default 200).

    Returns:
        List of RGB PIL Image objects, one per page, in page order.
    """
    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page in pdf_document:
            # alpha=False keeps the sample buffer at 3 bytes per pixel,
            # which is the layout Image.frombytes("RGB", ...) expects.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    finally:
        # Release the document handle even if rendering a page fails.
        pdf_document.close()
    return images
def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path):
    """
    Run DeepSeek-OCR on a single page/image and return its markdown.

    Args:
        image: Page image; must provide ``save(path)`` (e.g. a PIL Image).
        model_runtime: Model object exposing ``infer(...)``.
        tokenizer: Tokenizer passed through to ``model_runtime.infer``.
        model_size: Key into the size presets; unknown values fall back to
            "Gundam (Recommended)".
        task_type: Task label selecting the prompt; unknown values fall back
            to the Free OCR prompt.
        ref_text: Reference text, required only for the Locate task.
        is_eval_mode: Forwarded to ``infer`` as ``eval_mode``.
        output_path: Existing directory used for the temporary image and for
            the files written by ``infer``.

    Returns:
        The contents of ``result.mmd`` produced by this call when present and
        non-empty, otherwise the value returned by ``infer`` (if truthy),
        otherwise an empty string.

    Raises:
        gr.Error: If the Locate task is selected without reference text.
    """
    # ===== choose task prompt =====
    if task_type == "🔍 Locate Object by Reference":
        if not ref_text or not ref_text.strip():
            raise gr.Error("Please provide reference text for the Locate task!")
        prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
    else:
        fixed_prompts = {
            "📝 Free OCR": "<image>\nFree OCR.",
            "📄 Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown.",
            "📈 Parse Figure": "<image>\nParse the figure.",
        }
        # Unknown task labels fall back to plain OCR.
        prompt = fixed_prompts.get(task_type, "<image>\nFree OCR.")

    # save image temporarily
    temp_image_path = os.path.join(output_path, "temp_image.jpg")
    image.save(temp_image_path)

    # ===== size config =====
    size_configs = {
        "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
        "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
        "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
        "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
        "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
    }
    config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

    # The caller may reuse one output directory for many pages. Remove any
    # result file left by an earlier call so that a page whose inference
    # writes no file cannot pick up a previous page's markdown.
    markdown_result_path = os.path.join(output_path, "result.mmd")
    try:
        os.remove(markdown_result_path)
    except FileNotFoundError:
        pass

    # ===== inference =====
    with torch.no_grad():
        plain_text_result = model_runtime.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=is_eval_mode,
        )

    # ===== collect markdown result =====
    markdown_content = ""
    if os.path.exists(markdown_result_path):
        try:
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: an unreadable result file falls through to the
            # plain-text result below instead of failing the page.
            markdown_content = ""

    # If no markdown file, use plain text result
    if not markdown_content and plain_text_result:
        markdown_content = plain_text_result
    return markdown_content
+# ===== Main Processing Function =====
 @spaces.GPU
def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, progress=gr.Progress()):
    """
    Process PDF with DeepSeek-OCR and return combined markdown from all pages.

    Args:
        pdf_file: Uploaded PDF, either a filesystem path (``str`` /
            ``os.PathLike``) or an object exposing the path as ``.name``.
        model_size: Size preset label forwarded to ``process_single_page``.
        task_type: Task label forwarded to ``process_single_page``.
        ref_text: Reference text forwarded to ``process_single_page``.
        is_eval_mode: Evaluation-mode flag forwarded to ``process_single_page``.
        progress: Gradio progress tracker used to report per-page status.

    Returns:
        A markdown string: the combined per-page results, or a human-readable
        message when no file was given, the PDF has no pages, or processing
        raised an exception.
    """
    if pdf_file is None:
        return "Please upload a PDF file first."

    # The upload component is declared with type="filepath", which hands over
    # a plain path string; a file-like object carries the path in ``.name``.
    # Accept both so that ``.name`` is never read off a str.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # handle CPU/GPU
    if torch.cuda.is_available():
        model_runtime = model.to("cuda", dtype=torch.bfloat16)
    else:
        model_runtime = model.to("cpu", dtype=torch.float32)

    try:
        # Convert PDF to images
        progress(0, desc="Converting PDF to images...")
        images = pdf_to_images(pdf_path)
        total_pages = len(images)
        if total_pages == 0:
            return "No pages found in the PDF."
        progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")

        # Process each page
        all_markdown_results = []
        with tempfile.TemporaryDirectory() as output_path:
            for page_num, image in enumerate(images, start=1):
                # Report the share of pages already finished, so the bar does
                # not reach 100% while the last page is still being processed.
                progress(
                    ((page_num - 1) / total_pages) * 0.9 + 0.1,
                    desc=f"Processing page {page_num}/{total_pages}..."
                )
                markdown_content = process_single_page(
                    image,
                    model_runtime,
                    tokenizer,
                    model_size,
                    task_type,
                    ref_text,
                    is_eval_mode,
                    output_path
                )
                # Add page separator
                page_header = f"\n\n---\n\n# Page {page_num}\n\n"
                all_markdown_results.append(page_header + markdown_content)

        # Combine all results
        progress(1.0, desc="Finalizing...")
        combined_markdown = "\n\n".join(all_markdown_results)

        # Add document header
        final_output = f"# Document OCR Results\n\n**Total Pages:** {total_pages}\n\n{combined_markdown}"
        return final_output
    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output box.
        return f"Error processing PDF: {str(e)}"
 # ===== Theme and UI =====
 # ===== Interface =====
 with gr.Blocks(
+    title="DeepSeek-OCR PDF Parser by Jatevo LLM Inference",
     theme=theme,
     css=custom_css,
 ) as demo:
     gr.Markdown(
         """
+        # 📄 DeepSeek-OCR PDF Parser by Jatevo LLM Inference
+        Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
+        Each page is processed sequentially and combined into a single markdown document.
         **Model Sizes:**
         - **Tiny** — Fastest, lower accuracy (512×512)
         - **Small** — Fast, good accuracy (640×640)
         - **Base** — Balanced performance (1024×1024)
         - **Large** — Best accuracy, slower (1280×1280)
         - **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
+        **Note:** Processing time depends on the number of pages and model size.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
+            pdf_input = gr.File(
+                label="Upload PDF",
+                file_types=[".pdf"],
+                type="filepath"
+            )
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Gundam (Recommended)",
                 label="Model Size",
             )
             task_type = gr.Dropdown(
+                choices=[
+                    "📝 Free OCR",
+                    "📄 Convert to Markdown",
+                    "📈 Parse Figure",
+                    "🔍 Locate Object by Reference",
+                ],
                 value="📄 Convert to Markdown",
                 label="Task Type",
             )
             ref_text_input = gr.Textbox(
                 label="Reference Text (for Locate task)",
                 placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
                 visible=False,
             )
             eval_mode_checkbox = gr.Checkbox(
                 value=False,
                 label="Enable Evaluation Mode",
+                info="Returns only plain text (faster).",
             )
+            submit_btn = gr.Button("🚀 Process PDF", variant="primary", size="lg")
         with gr.Column(scale=2):
+            gr.Markdown("### 📝 Markdown Output")
+            output_markdown_preview = gr.Markdown(
+                label="Rendered Markdown",
+                value="*Upload a PDF and click 'Process PDF' to see results here.*"
+            )
+            gr.Markdown("### 📄 Markdown Source (Copy/Download)")
+            output_text = gr.Textbox(
+                label="Raw Markdown",
+                lines=25,
+                show_copy_button=True,
+                interactive=False,
+                placeholder="Markdown source will appear here..."
+            )
+    # show/hide reference text box based on selected task
     def toggle_ref_text_visibility(task):
         return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
+    task_type.change(
+        fn=toggle_ref_text_visibility,
+        inputs=task_type,
+        outputs=ref_text_input,
+    )
+    def update_outputs(markdown_text):
+        """Update both markdown preview and raw text"""
+        return markdown_text, markdown_text
     submit_btn.click(
+        fn=process_pdf,
+        inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox],
+        outputs=output_text,
+    ).then(
+        fn=update_outputs,
+        inputs=output_text,
+        outputs=[output_markdown_preview, output_text]
     )
 # ===== Launch =====
 if __name__ == "__main__":
     demo.queue(max_size=20)
+    demo.launch()