lucacadalora committed on
Commit
aa2b99c
·
verified ·
1 Parent(s): d7ea6d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -285
app.py CHANGED
@@ -2,10 +2,10 @@ import os
2
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0") # disable hf_transfer if missing
3
 
4
  import io
5
- import base64
6
  import re
7
  import tempfile
8
- from typing import List, Tuple, Optional
9
 
10
  import gradio as gr
11
  import torch
@@ -13,38 +13,13 @@ from transformers import AutoModel, AutoTokenizer
13
  import spaces
14
  from PIL import Image, ImageDraw
15
 
16
- # Optional plotting & parsing libs (graceful fallbacks if missing)
17
- try:
18
- import matplotlib.pyplot as plt
19
- _HAS_MPL = True
20
- except Exception:
21
- _HAS_MPL = False
22
-
23
  try:
24
  import pandas as pd
25
  _HAS_PANDAS = True
26
  except Exception:
27
  _HAS_PANDAS = False
28
 
29
- # read_html needs bs4 + lxml; we'll try but don't hard-require
30
- _HAS_READ_HTML = False
31
- if _HAS_PANDAS:
32
- try:
33
- import bs4 # noqa: F401
34
- import lxml # noqa: F401
35
- _HAS_READ_HTML = True
36
- except Exception:
37
- _HAS_READ_HTML = False
38
-
39
- # RDKit (optional)
40
- _HAS_RDKIT = False
41
- try:
42
- from rdkit import Chem
43
- from rdkit.Chem import Draw
44
- _HAS_RDKIT = True
45
- except Exception:
46
- _HAS_RDKIT = False
47
-
48
  from gradio.themes import Soft
49
  from gradio.themes.utils import fonts
50
 
@@ -71,96 +46,108 @@ except Exception:
71
  pass
72
 
73
 
74
- # ===== Helpers for deep parsing rendering =====
75
def _img_to_data_uri(pil_img, fmt="PNG"):
    """Encode *pil_img* as a base64 ``data:image/<fmt>`` URI.

    The image is saved into an in-memory buffer using ``pil_img.save`` with
    ``format=fmt``; the lower-cased *fmt* is used as the MIME subtype.
    """
    stream = io.BytesIO()
    pil_img.save(stream, format=fmt)
    encoded = base64.b64encode(stream.getvalue()).decode()
    mime_subtype = fmt.lower()
    return f"data:image/{mime_subtype};base64,{encoded}"
79
 
80
def _df_to_chart_data_uri(df: "pd.DataFrame") -> Optional[str]:
    """Render a simple chart from a DataFrame and return it as a PNG data URI.

    Heuristics: when the frame has at least two columns and the first column
    is not fully numeric, that column becomes the index (x-axis).  Frames
    with at most five remaining columns are drawn as a bar chart, wider ones
    as a line chart.  No explicit colors are set.

    Returns None when matplotlib/pandas are unavailable or rendering fails
    (best-effort helper; failures must not break OCR output).
    """
    if not _HAS_MPL or not _HAS_PANDAS:
        return None
    fig = None
    try:
        # Draw on an explicit figure/axis so we know exactly which figure is
        # saved and can close it afterwards.  Previously figures were created
        # and never closed, so they accumulated across requests.
        fig, ax = plt.subplots()
        df_plot = df.copy()

        # If first column is non-numeric, set it as index
        if df_plot.shape[1] >= 2:
            xcol = df_plot.columns[0]
            numeric_x = pd.to_numeric(df_plot[xcol], errors="coerce")
            if numeric_x.isna().any():
                df_plot = df_plot.set_index(xcol)

        # bar for <=5 series, else line
        if df_plot.shape[1] <= 5:
            df_plot.plot(kind="bar", ax=ax)
        else:
            df_plot.plot(ax=ax)

        buf = io.BytesIO()
        fig.tight_layout()
        fig.savefig(buf, format="PNG", dpi=160)
        return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
    except Exception:
        # Deliberate best-effort: any plotting error simply yields no chart.
        return None
    finally:
        if fig is not None:
            plt.close(fig)
109
-
110
def _html_table_to_df(html: str) -> Optional["pd.DataFrame"]:
    """Pick the largest <table> from HTML; return it as a DataFrame or None.

    "Largest" means the table with the most cells (rows * columns).
    Returns None when the HTML parsing dependencies are unavailable, when no
    table is found, or when parsing fails.
    """
    if not _HAS_READ_HTML:
        return None
    try:
        # Passing literal HTML text to read_html is deprecated in recent
        # pandas; a file-like wrapper works on old and new versions alike.
        tables = pd.read_html(io.StringIO(html or ""))  # list[DataFrame]
        if not tables:
            return None
        return max(tables, key=lambda t: (t.shape[0] * t.shape[1]))
    except Exception:
        # Best-effort: read_html raises when no table is present or the
        # markup cannot be parsed; callers fall back to other parsers.
        return None
121
 
122
- # --- Fallbacks when we don't get HTML tables ---
123
- _MD_TABLE_BLOCK_RE = re.compile(
124
- r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
125
- flags=re.MULTILINE
126
- )
127
 
128
def _md_table_to_df(md_text: str) -> Optional["pd.DataFrame"]:
    """Parse the first Markdown pipe-table in *md_text* into a DataFrame.

    The header row supplies the column names; the alignment/separator row is
    dropped; data rows whose cell count differs from the header are skipped.
    Columns that are fully numeric are converted to numeric dtype, all other
    columns stay as strings.

    Returns None when pandas is unavailable, no table is found, no usable
    data row exists, or parsing fails.
    """
    if not _HAS_PANDAS:
        return None
    try:
        m = _MD_TABLE_BLOCK_RE.search(md_text or "")
        if not m:
            return None
        block = m.group(0).strip()
        # Normalize: remove alignment row, split by pipes
        lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
        if len(lines) < 2:
            return None
        header = [h.strip() for h in lines[0].strip("|").split("|")]
        align_or_sep = lines[1]
        data_lines = lines[2:] if re.search(r":?-+:?", align_or_sep) else lines[1:]
        rows = []
        for ln in data_lines:
            parts = [p.strip() for p in ln.strip("|").split("|")]
            if len(parts) == len(header):
                rows.append(parts)
        if not rows:
            return None
        df = pd.DataFrame(rows, columns=header)
        # Cast numerics where possible.  ``errors="ignore"`` is deprecated in
        # pandas; once rejected it would raise here and the outer handler
        # would turn every table into None, so emulate it per column.
        for c in df.columns:
            try:
                df[c] = pd.to_numeric(df[c])
            except (ValueError, TypeError):
                continue
        return df
    except Exception:
        # Best-effort parser: malformed input yields None, never an error.
        return None
 
 
 
 
 
158
 
159
- def _numeric_block_to_df(text: str) -> Optional["pd.DataFrame"]:
160
- """Very rough fallback: parse whitespace/csv-ish numeric blocks into a DataFrame."""
 
161
  if not _HAS_PANDAS:
162
  return None
163
- # grab the largest numeric-ish block: lines containing numbers and separators
164
  blocks = []
165
  cur = []
166
  for ln in (text or "").splitlines():
@@ -168,100 +155,59 @@ def _numeric_block_to_df(text: str) -> Optional["pd.DataFrame"]:
168
  cur.append(ln)
169
  else:
170
  if cur:
171
- blocks.append("\n".join(cur))
172
- cur = []
173
- if cur:
174
- blocks.append("\n".join(cur))
175
- if not blocks:
176
- return None
177
- block = max(blocks, key=len)
178
 
179
- # try CSV first
 
 
180
  try:
181
- from io import StringIO
182
  df = pd.read_csv(StringIO(block))
183
- if df.shape[1] >= 2:
184
- return df
185
  except Exception:
186
  pass
187
-
188
- # try whitespace sep
189
  try:
190
- from io import StringIO
191
  df = pd.read_csv(StringIO(block), sep=r"\s+", engine="python")
192
- if df.shape[1] >= 2:
193
- return df
194
  except Exception:
195
  pass
196
  return None
197
 
198
- _SMILES_REGEX = re.compile(r"(?:SMILES|Smiles)\s*[::]\s*([A-Za-z0-9@\[\]\(\)\+\-\=\\\/%]+)")
199
 
200
def _render_smiles_block(text: str) -> List[Tuple[str, str]]:
    """Find SMILES strings in *text* and render each one with RDKit.

    Returns a list of ``(title, data_uri)`` pairs, one per molecule that
    RDKit could parse and draw.  At most the first six matches of
    ``_SMILES_REGEX`` are considered.  Returns an empty list when RDKit is
    unavailable.
    """
    assets: List[Tuple[str, str]] = []
    if not _HAS_RDKIT:
        return assets
    found = _SMILES_REGEX.findall(text or "")
    for s in found[:6]:  # safety cap
        # Guard each molecule separately: previously one failing SMILES
        # aborted the loop and silently dropped every later molecule.
        try:
            mol = Chem.MolFromSmiles(s)
            if mol is None:
                continue
            im = Draw.MolToImage(mol, size=(520, 260))
            assets.append((f"Molecule (SMILES: {s})", _img_to_data_uri(im)))
        except Exception:
            # Best-effort rendering: skip this molecule, keep the rest.
            continue
    return assets
216
-
217
- def _assets_to_markdown_section(assets: List[Tuple[str, str]], parsed_text: str) -> str:
218
- out = ["\n\n---\n\n### Parsed Figures (auto-rendered)\n"]
219
- if not assets and not parsed_text.strip():
220
- out.append("_No deep-parsed content detected._\n")
221
- return "".join(out)
222
-
223
- for title, data_uri in assets:
224
- if data_uri:
225
- out.append(f"**{title}**\n\n![]({data_uri})\n\n")
226
- # Always expose a short snippet so you can see what the model returned
227
- snippet = parsed_text.strip()
228
- if len(snippet) > 4000:
229
- snippet = snippet[:4000] + "\n<!-- truncated -->"
230
- if snippet:
231
- out.append("**Raw deep-parse output (snippet)**\n\n```text\n")
232
- out.append(snippet)
233
- out.append("\n```\n")
234
- return "".join(out)
235
-
236
- def _assets_to_html_section(assets: List[Tuple[str, str]], parsed_text: str) -> str:
237
- """Simple HTML block for the HTML Preview tab; ensures images render even if Markdown sanitizer blocks data URIs."""
238
- parts = [
239
- '<section class="parsed-figures"><h3>Parsed Figures (auto-rendered)</h3>'
240
- ]
241
- if not assets and not parsed_text.strip():
242
- parts.append("<p><em>No deep-parsed content detected.</em></p></section>")
243
- return "".join(parts)
244
- for title, data_uri in assets:
245
- if data_uri:
246
- parts.append(f'<figure><figcaption><strong>{title}</strong></figcaption><img style="max-width:100%;height:auto" src="{data_uri}"/></figure>')
247
- if parsed_text.strip():
248
- safe = parsed_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
249
- parts.append(f"<details><summary>Raw deep-parse output (snippet)</summary><pre>{safe[:8000]}</pre></details>")
250
- parts.append("</section>")
251
- return "".join(parts)
252
 
253
 
254
  # ===== Inference Function =====
255
  @spaces.GPU
256
  def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
257
  """
258
- Process image with DeepSeek-OCR and return annotated image, markdown, html, and text.
259
- Adds deep parsing for figures to render charts (from tables) and chemistry (from SMILES).
 
260
  """
261
  if image is None:
262
- return None, "Please upload an image first.", "<p>Please upload an image first.</p>", "Please upload an image first."
263
 
264
- # handle CPU/GPU
265
  if torch.cuda.is_available():
266
  model_runtime = model.to("cuda", dtype=torch.bfloat16)
267
  else:
@@ -283,10 +229,11 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
283
  prompt = "<image>\nFree OCR."
284
 
285
  # save image
 
286
  temp_image_path = os.path.join(output_path, "temp_image.jpg")
287
  image.save(temp_image_path)
288
 
289
- # ===== size config =====
290
  size_configs = {
291
  "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
292
  "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -296,9 +243,9 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
296
  }
297
  config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
298
 
299
- # ===== inference (primary pass) =====
300
  with torch.no_grad():
301
- plain_text_result = model_runtime.infer(
302
  tokenizer,
303
  prompt=prompt,
304
  image_file=temp_image_path,
@@ -311,7 +258,7 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
311
  eval_mode=is_eval_mode,
312
  )
313
 
314
- # ===== collect results =====
315
  image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
316
  markdown_result_path = os.path.join(output_path, "result.mmd")
317
 
@@ -326,39 +273,49 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
326
  result_image = None
327
  if os.path.exists(image_result_path):
328
  try:
 
329
  result_image = Image.open(image_result_path)
330
  result_image.load()
331
  except Exception:
332
  result_image = None
333
 
334
- # ===== draw bounding boxes if <|det|> tags exist (optional) =====
335
- pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
336
- matches = list(pattern.finditer(plain_text_result or ""))
337
-
338
  if matches:
339
- image_with_bboxes = image.copy()
340
- draw = ImageDraw.Draw(image_with_bboxes)
341
  w, h = image.size
342
- for match in matches:
343
- x1, y1, x2, y2 = [int(c) for c in match.groups()]
344
- x1 = int(x1 / 1000 * w)
345
- y1 = int(y1 / 1000 * h)
346
- x2 = int(x2 / 1000 * w)
347
- y2 = int(y2 / 1000 * h)
348
- draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
349
- result_image = image_with_bboxes
350
-
351
- # ===== DEEP PARSING & RENDERING (secondary pass) =====
352
- should_run_deep = deep_parse and task_type in {"📄 Convert to Markdown", "📈 Parse Figure"}
353
-
354
- deep_assets: List[Tuple[str, str]] = [] # (title, data_uri)
355
- parsed_text = ""
356
-
357
- def _run_deep_parse(prompt_text):
 
 
 
 
 
 
 
 
 
 
358
  with torch.no_grad():
359
- return model_runtime.infer(
360
  tokenizer,
361
- prompt=prompt_text,
362
  image_file=temp_image_path,
363
  output_path=output_path,
364
  base_size=config["base_size"],
@@ -367,59 +324,70 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_par
367
  save_results=False,
368
  test_compress=True,
369
  eval_mode=True,
370
- )
371
-
372
- if should_run_deep:
373
- try:
374
- parsed_text = _run_deep_parse("<image>\nParse the figure.") or ""
375
- except Exception:
376
- parsed_text = ""
377
-
378
- # 1) Charts/tables:
379
- df = None
380
- if "<table" in parsed_text.lower() and _HAS_PANDAS:
381
- df = _html_table_to_df(parsed_text)
382
-
383
- if df is None: # fallback: markdown pipe-table
384
- df = _md_table_to_df(parsed_text)
385
-
386
- if df is None: # fallback: generic numeric block
387
- df = _numeric_block_to_df(parsed_text)
388
-
389
- if df is not None:
390
- chart_uri = _df_to_chart_data_uri(df)
391
- if chart_uri:
392
- deep_assets.append(("Figure (re-rendered from parsed data)", chart_uri))
393
-
394
- # 2) Chemistry (SMILES)
395
- deep_assets.extend(_render_smiles_block(parsed_text))
396
-
397
- # ===== Append deep assets into the Markdown + build HTML preview =====
398
- html_preview = "" # for HTML tab
399
- if task_type == "📄 Convert to Markdown":
400
- # extend markdown with a diagnostic/asset section regardless of success,
401
- # so you can see whether deep-parse attempted
402
- markdown_content = markdown_content + _assets_to_markdown_section(deep_assets, parsed_text)
403
- html_preview = _assets_to_html_section(deep_assets, parsed_text)
404
- elif task_type == "📈 Parse Figure":
405
- # just show what we got from deep parse
406
- header = "# Parse Figure\n\n"
407
- body = _assets_to_markdown_section(deep_assets, parsed_text)
408
- markdown_content = header + body
409
- html_preview = _assets_to_html_section(deep_assets, parsed_text)
410
- else:
411
- # other tasks: keep as-is, but still provide an HTML tab with any assets
412
- if deep_assets or parsed_text.strip():
413
- markdown_content = markdown_content + _assets_to_markdown_section(deep_assets, parsed_text)
414
- html_preview = _assets_to_html_section(deep_assets, parsed_text)
415
  else:
416
- html_preview = "<p>No parsed-figure content.</p>"
417
-
418
- # ===== Decide what to show in the "Markdown Source (or Eval Output)" tab =====
419
- text_result = plain_text_result if plain_text_result else markdown_content
420
-
421
- # return (image, markdown, html, text)
422
- return result_image, markdown_content, html_preview, text_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
424
 
425
  # ===== Theme and UI =====
@@ -460,45 +428,32 @@ with gr.Blocks(
460
 
461
  with gr.Row():
462
  with gr.Column(scale=1):
463
- image_input = gr.Image(
464
- type="pil", label="Upload Image", sources=["upload", "clipboard"]
465
- )
466
-
467
  model_size = gr.Dropdown(
468
  choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
469
  value="Gundam (Recommended)",
470
  label="Model Size",
471
  )
472
-
473
  task_type = gr.Dropdown(
474
- choices=[
475
- "📝 Free OCR",
476
- "📄 Convert to Markdown",
477
- "📈 Parse Figure",
478
- "🔍 Locate Object by Reference",
479
- ],
480
  value="📄 Convert to Markdown",
481
  label="Task Type",
482
  )
483
-
484
  ref_text_input = gr.Textbox(
485
  label="Reference Text (for Locate task)",
486
  placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
487
  visible=False,
488
  )
489
-
490
  eval_mode_checkbox = gr.Checkbox(
491
  value=False,
492
  label="Enable Evaluation Mode",
493
  info="Returns only plain text (faster). Uncheck to get annotated image and markdown.",
494
  )
495
-
496
  deep_parse_checkbox = gr.Checkbox(
497
  value=True,
498
- label="Deep parse and re-render figures (charts/molecules)",
499
- info="Runs a secondary pass to parse tables/SMILES and embeds rendered visuals into Markdown.",
500
  )
501
-
502
  submit_btn = gr.Button("Process Image", variant="primary")
503
 
504
  with gr.Column(scale=2):
@@ -507,27 +462,18 @@ with gr.Blocks(
507
  output_image = gr.Image(interactive=False)
508
  with gr.TabItem("Markdown Preview"):
509
  output_markdown = gr.Markdown()
510
- with gr.TabItem("Rendered HTML (figures)"):
511
- output_html = gr.HTML()
512
  with gr.TabItem("Markdown Source (or Eval Output)"):
513
- output_text = gr.Textbox(
514
- lines=20, show_copy_button=True, interactive=False
515
- )
516
 
517
- # show/hide reference text box based on selected task
518
  def toggle_ref_text_visibility(task):
519
  return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
520
 
521
- task_type.change(
522
- fn=toggle_ref_text_visibility,
523
- inputs=task_type,
524
- outputs=ref_text_input,
525
- )
526
 
527
  submit_btn.click(
528
  fn=process_image,
529
  inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
530
- outputs=[output_image, output_markdown, output_html, output_text],
531
  )
532
 
533
 
 
2
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0") # disable hf_transfer if missing
3
 
4
  import io
5
+ import json
6
  import re
7
  import tempfile
8
+ from typing import List, Dict, Any, Optional, Tuple
9
 
10
  import gradio as gr
11
  import torch
 
13
  import spaces
14
  from PIL import Image, ImageDraw
15
 
16
+ # Optional: pandas for better table handling (not mandatory)
 
 
 
 
 
 
17
  try:
18
  import pandas as pd
19
  _HAS_PANDAS = True
20
  except Exception:
21
  _HAS_PANDAS = False
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  from gradio.themes import Soft
24
  from gradio.themes.utils import fonts
25
 
 
46
  pass
47
 
48
 
49
+ # ====== Utilities: JSON/table parsing ======
50
+ _JSON_FENCE_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
51
+ _ANY_JSON_RE = re.compile(r"(\{(?:[^{}]|(?1))*\})", re.DOTALL) # recursive-ish best-effort
 
 
52
 
53
+ _MD_TABLE_BLOCK_RE = re.compile(
54
+ r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
55
+ flags=re.MULTILINE
56
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ def _extract_json(text: str) -> Optional[Dict[str, Any]]:
59
+ """
60
+ Try to extract a single JSON object from text.
61
+ 1) prefer ```json fenced block
62
+ 2) fallback to first top-level-looking {...}
63
+ """
64
+ if not text:
65
  return None
66
+ m = _JSON_FENCE_RE.search(text)
67
+ candidate = None
68
+ if m:
69
+ candidate = m.group(1).strip()
70
+ else:
71
+ m2 = _ANY_JSON_RE.search(text)
72
+ if m2:
73
+ candidate = m2.group(1).strip()
74
+ if not candidate:
75
  return None
76
  try:
77
+ return json.loads(candidate)
 
 
 
78
  except Exception:
79
  return None
80
 
 
 
 
 
 
81
 
82
+ def _json_to_markdown_table(js: Dict[str, Any]) -> Optional[str]:
83
+ """
84
+ Convert a chart-style JSON into a Markdown pipe table.
85
+ Expected schema (flexible):
86
+ {
87
+ "type": "bar|line|...",
88
+ "title": "...",
89
+ "x": ["Germany","France",...], # categories (or "categories")
90
+ "series": [{"name":"2024","data":[...]} , ...]
91
+ }
92
+ We handle keys: x|categories; y ignored (derived from series).
93
+ """
94
+ if not js:
95
+ return None
96
+
97
+ x = js.get("x") or js.get("categories")
98
+ series = js.get("series")
99
+ if not isinstance(x, list) or not isinstance(series, list):
100
+ return None
101
+
102
+ # build rows: first col is x category, next cols are series values
103
+ headers = ["Category"] + [str(s.get("name", f"series{i}")) for i, s in enumerate(series)]
104
+ rows: List[List[str]] = []
105
+ for i, cat in enumerate(x):
106
+ row = [str(cat)]
107
+ for s in series:
108
+ data = s.get("data", [])
109
+ val = data[i] if i < len(data) else ""
110
+ row.append(str(val))
111
+ rows.append(row)
112
+
113
+ # to markdown pipe table
114
+ header_line = "| " + " | ".join(headers) + " |"
115
+ align_line = "| " + " | ".join([":---"] * len(headers)) + " |"
116
+ data_lines = ["| " + " | ".join(r) + " |" for r in rows]
117
+ return "\n".join([header_line, align_line, *data_lines])
118
+
119
+
120
def _md_table_to_df(md_text: str):
    """Parse the first Markdown pipe table in *md_text* into a DataFrame.

    The header row supplies the column names; the separator row is dropped;
    data rows whose cell count differs from the header are skipped.  Every
    column after the first is converted to a numeric dtype when all of its
    values are numeric; otherwise it is left as strings.

    Returns None when pandas is unavailable, no table is found, or no data
    row matches the header width.
    """
    if not _HAS_PANDAS:
        return None
    m = _MD_TABLE_BLOCK_RE.search(md_text or "")
    if not m:
        return None
    block = m.group(0).strip()
    lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
    if len(lines) < 2:
        return None
    header = [h.strip() for h in lines[0].strip("|").split("|")]
    align_or_sep = lines[1]
    data_lines = lines[2:] if re.search(r":?-+:?", align_or_sep) else lines[1:]
    rows = []
    for ln in data_lines:
        parts = [p.strip() for p in ln.strip("|").split("|")]
        if len(parts) == len(header):
            rows.append(parts)
    if not rows:
        return None
    df = pd.DataFrame(rows, columns=header)
    # Try to cast numerics.  ``errors="ignore"`` is deprecated in pandas, so
    # emulate it: keep the original column when conversion fails.
    for c in df.columns[1:]:
        try:
            df[c] = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            continue
    return df
145
 
146
+
147
+ def _numeric_block_to_df(text: str):
148
+ """Rough fallback: largest numeric-ish block into a DataFrame."""
149
  if not _HAS_PANDAS:
150
  return None
 
151
  blocks = []
152
  cur = []
153
  for ln in (text or "").splitlines():
 
155
  cur.append(ln)
156
  else:
157
  if cur:
158
+ blocks.append("\n".join(cur)); cur = []
159
+ if cur: blocks.append("\n".join(cur))
160
+ if not blocks: return None
 
 
 
 
161
 
162
+ block = max(blocks, key=len)
163
+ from io import StringIO
164
+ # CSV
165
  try:
 
166
  df = pd.read_csv(StringIO(block))
167
+ if df.shape[1] >= 2: return df
 
168
  except Exception:
169
  pass
170
+ # whitespace
 
171
  try:
 
172
  df = pd.read_csv(StringIO(block), sep=r"\s+", engine="python")
173
+ if df.shape[1] >= 2: return df
 
174
  except Exception:
175
  pass
176
  return None
177
 
 
178
 
179
+ def _df_to_markdown_and_csv(df) -> Tuple[str, str]:
180
+ """Return (markdown_pipe_table, csv_text)."""
181
+ if not _HAS_PANDAS or df is None:
182
+ return "", ""
183
+ # Markdown
184
+ md = []
185
+ headers = list(df.columns)
186
+ md.append("| " + " | ".join(map(str, headers)) + " |")
187
+ md.append("| " + " | ".join([":---"] * len(headers)) + " |")
188
+ for _, row in df.iterrows():
189
+ md.append("| " + " | ".join(map(lambda x: str(x), row.tolist())) + " |")
190
+ md_text = "\n".join(md)
191
+
192
+ # CSV
193
+ buf = io.StringIO()
194
+ df.to_csv(buf, index=False)
195
+ csv_text = buf.getvalue()
196
+ return md_text, csv_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
 
199
  # ===== Inference Function =====
200
  @spaces.GPU
201
  def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
202
  """
203
+ Process image with DeepSeek-OCR and return annotated image, markdown, and text.
204
+ Adds deep parsing to extract structured DATA from figures (JSON + Table + CSV)
205
+ and appends it inside the Markdown for RAG indexing.
206
  """
207
  if image is None:
208
+ return None, "Please upload an image first.", "Please upload an image first."
209
 
210
+ # device
211
  if torch.cuda.is_available():
212
  model_runtime = model.to("cuda", dtype=torch.bfloat16)
213
  else:
 
229
  prompt = "<image>\nFree OCR."
230
 
231
  # save image
232
+ os.makedirs(output_path, exist_ok=True)
233
  temp_image_path = os.path.join(output_path, "temp_image.jpg")
234
  image.save(temp_image_path)
235
 
236
+ # size
237
  size_configs = {
238
  "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
239
  "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
 
243
  }
244
  config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
245
 
246
+ # ===== primary pass =====
247
  with torch.no_grad():
248
+ primary_text = model_runtime.infer(
249
  tokenizer,
250
  prompt=prompt,
251
  image_file=temp_image_path,
 
258
  eval_mode=is_eval_mode,
259
  )
260
 
261
+ # collect results
262
  image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
263
  markdown_result_path = os.path.join(output_path, "result.mmd")
264
 
 
273
  result_image = None
274
  if os.path.exists(image_result_path):
275
  try:
276
+ from PIL import Image
277
  result_image = Image.open(image_result_path)
278
  result_image.load()
279
  except Exception:
280
  result_image = None
281
 
282
+ # draw bboxes if <|det|>
283
+ det_pat = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
284
+ matches = list(det_pat.finditer(primary_text or ""))
 
285
  if matches:
286
+ img_with_boxes = image.copy()
287
+ draw = ImageDraw.Draw(img_with_boxes)
288
  w, h = image.size
289
+ for m in matches:
290
+ x1,y1,x2,y2 = [int(c) for c in m.groups()]
291
+ x1 = int(x1/1000*w); y1=int(y1/1000*h); x2=int(x2/1000*w); y2=int(y2/1000*h)
292
+ draw.rectangle([x1,y1,x2,y2], outline="red", width=3)
293
+ result_image = img_with_boxes
294
+
295
+ # ===== deep parse for DATA (not rendering) =====
296
+ # We always try deep parse for Convert to Markdown / Parse Figure, otherwise only if checkbox is on
297
+ should_deep = deep_parse and (task_type in {"📄 Convert to Markdown", "📈 Parse Figure"})
298
+
299
+ extracted_md_section = ""
300
+ if should_deep:
301
+ # ask model for STRICT JSON for charts
302
+ strict_json_prompt = (
303
+ "<image>\n"
304
+ "Parse the figure. If it's a chart, return ONLY a single JSON object with keys:\n"
305
+ "{\n"
306
+ ' "type": "bar|line|area|scatter|pie|table|unknown",\n'
307
+ ' "title": "string",\n'
308
+ ' "x": ["category", ...],\n'
309
+ ' "series": [{"name": "string", "data": [number|null, ...]}, ...]\n'
310
+ "}\n"
311
+ "If it's a table, return the same JSON using 'type': 'table' and fill x from the first column and series from remaining columns.\n"
312
+ "Do not include any explanation text. Return ONLY the JSON."
313
+ )
314
+
315
  with torch.no_grad():
316
+ deep_text = model_runtime.infer(
317
  tokenizer,
318
+ prompt=strict_json_prompt,
319
  image_file=temp_image_path,
320
  output_path=output_path,
321
  base_size=config["base_size"],
 
324
  save_results=False,
325
  test_compress=True,
326
  eval_mode=True,
327
+ ) or ""
328
+
329
+ js = _extract_json(deep_text)
330
+
331
+ md_table, csv_text = "", ""
332
+ if js:
333
+ # Prefer JSON → Markdown table
334
+ md_table = _json_to_markdown_table(js) or ""
335
+ if _HAS_PANDAS and md_table:
336
+ df = _md_table_to_df(md_table)
337
+ if df is not None:
338
+ md_table, csv_text = _df_to_markdown_and_csv(df)
339
+ # Build Markdown block with JSON + (optional) table + CSV
340
+ extracted_md_section = "### Extracted Figure Data\n\n"
341
+ extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js, ensure_ascii=False, indent=2) + "\n```\n\n"
342
+ if md_table:
343
+ extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
344
+ if csv_text:
345
+ extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  else:
347
+ # Fallback: ask for generic figure parse, then try to pull Markdown tables / numeric blocks
348
+ with torch.no_grad():
349
+ fallback_text = model_runtime.infer(
350
+ tokenizer,
351
+ prompt="<image>\nParse the figure.",
352
+ image_file=temp_image_path,
353
+ output_path=output_path,
354
+ base_size=config["base_size"],
355
+ image_size=config["image_size"],
356
+ crop_mode=config["crop_mode"],
357
+ save_results=False,
358
+ test_compress=True,
359
+ eval_mode=True,
360
+ ) or ""
361
+
362
+ df = _md_table_to_df(fallback_text)
363
+ if df is None:
364
+ df = _numeric_block_to_df(fallback_text)
365
+
366
+ if df is not None:
367
+ md_table, csv_text = _df_to_markdown_and_csv(df)
368
+ js_fallback = {
369
+ "type": "table",
370
+ "title": "",
371
+ "x": df.iloc[:,0].astype(str).tolist(),
372
+ "series": [{"name": c, "data": [None if pd.isna(v) else (float(v) if str(v).replace('.','',1).isdigit() else v) for v in df[c].tolist()]}
373
+ for c in df.columns[1:]] if _HAS_PANDAS else []
374
+ }
375
+ extracted_md_section = "### Extracted Figure Data\n\n"
376
+ extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js_fallback, ensure_ascii=False, indent=2) + "\n```\n\n"
377
+ extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
378
+ extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
379
+ else:
380
+ # Nothing structured; keep a short diagnostic (plain text only)
381
+ extracted_md_section = "### Extracted Figure Data\n\n_No structured table/series detected. You may need to adjust the deep-parse prompt for this figure type._\n"
382
+
383
+ # ===== Merge into final Markdown =====
384
+ if extracted_md_section:
385
+ markdown_content = markdown_content.rstrip() + "\n\n---\n\n" + extracted_md_section
386
+
387
+ # For the “Markdown Source (or Eval Output)” tab
388
+ text_result = primary_text if primary_text else markdown_content
389
+
390
+ return result_image, markdown_content, text_result
391
 
392
 
393
  # ===== Theme and UI =====
 
428
 
429
  with gr.Row():
430
  with gr.Column(scale=1):
431
+ image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
 
 
 
432
  model_size = gr.Dropdown(
433
  choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
434
  value="Gundam (Recommended)",
435
  label="Model Size",
436
  )
 
437
  task_type = gr.Dropdown(
438
+ choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
 
 
 
 
 
439
  value="📄 Convert to Markdown",
440
  label="Task Type",
441
  )
 
442
  ref_text_input = gr.Textbox(
443
  label="Reference Text (for Locate task)",
444
  placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
445
  visible=False,
446
  )
 
447
  eval_mode_checkbox = gr.Checkbox(
448
  value=False,
449
  label="Enable Evaluation Mode",
450
  info="Returns only plain text (faster). Uncheck to get annotated image and markdown.",
451
  )
 
452
  deep_parse_checkbox = gr.Checkbox(
453
  value=True,
454
+ label="Deep parse and extract figure data (JSON + table + CSV)",
455
+ info="Adds a second pass that extracts machine-readable data for RAG.",
456
  )
 
457
  submit_btn = gr.Button("Process Image", variant="primary")
458
 
459
  with gr.Column(scale=2):
 
462
  output_image = gr.Image(interactive=False)
463
  with gr.TabItem("Markdown Preview"):
464
  output_markdown = gr.Markdown()
 
 
465
  with gr.TabItem("Markdown Source (or Eval Output)"):
466
+ output_text = gr.Textbox(lines=20, show_copy_button=True, interactive=False)
 
 
467
 
 
468
  def toggle_ref_text_visibility(task):
469
  return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
470
 
471
+ task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
 
 
 
 
472
 
473
  submit_btn.click(
474
  fn=process_image,
475
  inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
476
+ outputs=[output_image, output_markdown, output_text],
477
  )
478
 
479