lucacadalora committed (verified)
Commit d7ea6d3 · 1 Parent(s): d71bc6d

Update app.py

Files changed (1):
  1. app.py (+182 -66)
app.py CHANGED
@@ -5,6 +5,7 @@ import io
 import base64
 import re
 import tempfile
+from typing import List, Tuple, Optional

 import gradio as gr
 import torch
@@ -76,32 +77,28 @@ def _img_to_data_uri(pil_img, fmt="PNG"):
     pil_img.save(buf, format=fmt)
     return f"data:image/{fmt.lower()};base64,{base64.b64encode(buf.getvalue()).decode()}"

-def _df_to_chart_data_uri(df):
+def _df_to_chart_data_uri(df: "pd.DataFrame") -> Optional[str]:
     """Render a simple chart from a DataFrame using matplotlib (single figure, no explicit colors)."""
     if not _HAS_MPL or not _HAS_PANDAS:
         return None
-    # Heuristics: first column is x-axis if non-numeric or unique labels; otherwise use index
     try:
+        # basic heuristics: first col is x-axis if non-numeric-ish
         plt.figure() # one plot per chart (no subplots; no explicit colors)
-        if df.shape[1] >= 2:
-            # Assume first column is x if not numeric or contains non-monotonic categories
-            x = df.columns[0]
-            maybe_x = df[x]
-            numeric_x = pd.to_numeric(maybe_x, errors="coerce")
-            if numeric_x.isna().any() or maybe_x.nunique() < len(maybe_x):
-                # categorical-ish: set as index
-                df_plot = df.set_index(x)
-            else:
-                # numeric x: also set as index so matplotlib draws a line plot
-                df_plot = df.set_index(x)
-            # If there are many series, default to line; if few, bar
-            if df_plot.shape[1] <= 5:
-                df_plot.plot(kind="bar")
-            else:
-                df_plot.plot()
+        df_plot = df.copy()
+
+        # If first column is non-numeric, set it as index
+        if df_plot.shape[1] >= 2:
+            xcol = df_plot.columns[0]
+            numeric_x = pd.to_numeric(df_plot[xcol], errors="coerce")
+            if numeric_x.isna().any():
+                df_plot = df_plot.set_index(xcol)
+
+        # bar for <=5 series, else line
+        if df_plot.shape[1] <= 5:
+            df_plot.plot(kind="bar")
         else:
-            # Single series: plot it
-            df.plot()
+            df_plot.plot()
+
         buf = io.BytesIO()
         plt.tight_layout()
         plt.savefig(buf, format="PNG", dpi=160)
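Reviewer note (not part of the commit): the reworked heuristic is easy to sanity-check outside the Space. A minimal standalone sketch that mirrors the committed logic, assuming only pandas is installed (names here are illustrative):

    import pandas as pd

    df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3"], "revenue": [10, 12, 15]})
    # non-numeric first column -> becomes the index, as in _df_to_chart_data_uri
    x = pd.to_numeric(df[df.columns[0]], errors="coerce")
    plot_df = df.set_index(df.columns[0]) if x.isna().any() else df
    kind = "bar" if plot_df.shape[1] <= 5 else "line"
    print(kind, list(plot_df.index))  # -> bar ['Q1', 'Q2', 'Q3']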
@@ -110,7 +107,7 @@ def _df_to_chart_data_uri(df):
     except Exception:
         return None

-def _html_table_to_df(html):
+def _html_table_to_df(html: str) -> Optional["pd.DataFrame"]:
     """Pick the largest <table> from HTML; return as DataFrame or None."""
     if not _HAS_READ_HTML:
         return None
@@ -118,16 +115,91 @@ def _html_table_to_df(html):
         tables = pd.read_html(html) # list[DataFrame]
         if not tables:
             return None
-        # choose the table with max cells
         return max(tables, key=lambda t: (t.shape[0] * t.shape[1]))
     except Exception:
         return None

+# --- Fallbacks when we don't get HTML tables ---
+_MD_TABLE_BLOCK_RE = re.compile(
+    r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
+    flags=re.MULTILINE
+)
+
+def _md_table_to_df(md_text: str) -> Optional["pd.DataFrame"]:
+    """Parse the first Markdown pipe-table into a DataFrame."""
+    if not _HAS_PANDAS:
+        return None
+    try:
+        m = _MD_TABLE_BLOCK_RE.search(md_text or "")
+        if not m:
+            return None
+        block = m.group(0).strip()
+        # Normalize: remove alignment row, split by pipes
+        lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
+        if len(lines) < 2:
+            return None
+        header = [h.strip() for h in lines[0].strip("|").split("|")]
+        align_or_sep = lines[1]
+        data_lines = lines[2:] if re.search(r":?-+:?", align_or_sep) else lines[1:]
+        rows = []
+        for ln in data_lines:
+            parts = [p.strip() for p in ln.strip("|").split("|")]
+            if len(parts) == len(header):
+                rows.append(parts)
+        if not rows:
+            return None
+        df = pd.DataFrame(rows, columns=header)
+        # try cast numerics where possible
+        for c in df.columns:
+            df[c] = pd.to_numeric(df[c], errors="ignore")
+        return df
+    except Exception:
+        return None
+
+def _numeric_block_to_df(text: str) -> Optional["pd.DataFrame"]:
+    """Very rough fallback: parse whitespace/csv-ish numeric blocks into a DataFrame."""
+    if not _HAS_PANDAS:
+        return None
+    # grab the largest numeric-ish block: lines containing numbers and separators
+    blocks = []
+    cur = []
+    for ln in (text or "").splitlines():
+        if re.search(r"\d", ln) and ("," in ln or "\t" in ln or " " in ln or "|" in ln):
+            cur.append(ln)
+        else:
+            if cur:
+                blocks.append("\n".join(cur))
+            cur = []
+    if cur:
+        blocks.append("\n".join(cur))
+    if not blocks:
+        return None
+    block = max(blocks, key=len)
+
+    # try CSV first
+    try:
+        from io import StringIO
+        df = pd.read_csv(StringIO(block))
+        if df.shape[1] >= 2:
+            return df
+    except Exception:
+        pass
+
+    # try whitespace sep
+    try:
+        from io import StringIO
+        df = pd.read_csv(StringIO(block), sep=r"\s+", engine="python")
+        if df.shape[1] >= 2:
+            return df
+    except Exception:
+        pass
+    return None
+
 _SMILES_REGEX = re.compile(r"(?:SMILES|Smiles)\s*[::]\s*([A-Za-z0-9@\[\]\(\)\+\-\=\\\/%]+)")

-def _render_smiles_block(text):
+def _render_smiles_block(text: str) -> List[Tuple[str, str]]:
     """Find SMILES in text, render with RDKit, return list[(title, data_uri)]."""
-    assets = []
+    assets: List[Tuple[str, str]] = []
     if not _HAS_RDKIT:
         return assets
     try:
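Reviewer note (not part of the commit): a standalone sketch of what the new pipe-table fallback is meant to catch, assuming only pandas is installed. The committed `_md_table_to_df` does the same thing via `_MD_TABLE_BLOCK_RE`, and additionally casts columns with `pd.to_numeric(..., errors="ignore")`, which recent pandas releases deprecate in favour of an explicit coerce/try.

    import pandas as pd

    md = "| year | sales |\n|------|-------|\n| 2022 | 10 |\n| 2023 | 14 |\n"
    lines = [ln.strip() for ln in md.splitlines() if ln.strip()]
    header = [h.strip() for h in lines[0].strip("|").split("|")]
    rows = [[c.strip() for c in ln.strip("|").split("|")] for ln in lines[2:]]  # skip alignment row
    df = pd.DataFrame(rows, columns=header).apply(pd.to_numeric, errors="coerce")
    print(df.dtypes)  # year and sales both numeric -> chartable by _df_to_chart_data_uri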
@@ -142,16 +214,52 @@ def _render_smiles_block(text):
         pass
     return assets

+def _assets_to_markdown_section(assets: List[Tuple[str, str]], parsed_text: str) -> str:
+    out = ["\n\n---\n\n### Parsed Figures (auto-rendered)\n"]
+    if not assets and not parsed_text.strip():
+        out.append("_No deep-parsed content detected._\n")
+        return "".join(out)
+
+    for title, data_uri in assets:
+        if data_uri:
+            out.append(f"**{title}**\n\n![]({data_uri})\n\n")
+    # Always expose a short snippet so you can see what the model returned
+    snippet = parsed_text.strip()
+    if len(snippet) > 4000:
+        snippet = snippet[:4000] + "\n<!-- truncated -->"
+    if snippet:
+        out.append("**Raw deep-parse output (snippet)**\n\n```text\n")
+        out.append(snippet)
+        out.append("\n```\n")
+    return "".join(out)
+
+def _assets_to_html_section(assets: List[Tuple[str, str]], parsed_text: str) -> str:
+    """Simple HTML block for the HTML Preview tab; ensures images render even if Markdown sanitizer blocks data URIs."""
+    parts = [
+        '<section class="parsed-figures"><h3>Parsed Figures (auto-rendered)</h3>'
+    ]
+    if not assets and not parsed_text.strip():
+        parts.append("<p><em>No deep-parsed content detected.</em></p></section>")
+        return "".join(parts)
+    for title, data_uri in assets:
+        if data_uri:
+            parts.append(f'<figure><figcaption><strong>{title}</strong></figcaption><img style="max-width:100%;height:auto" src="{data_uri}"/></figure>')
+    if parsed_text.strip():
+        safe = parsed_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+        parts.append(f"<details><summary>Raw deep-parse output (snippet)</summary><pre>{safe[:8000]}</pre></details>")
+    parts.append("</section>")
+    return "".join(parts)
+

 # ===== Inference Function =====
 @spaces.GPU
 def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
     """
-    Process image with DeepSeek-OCR and return annotated image, markdown, and text.
-    Adds deep parsing for figures to render charts (from HTML tables) and chemistry (from SMILES).
+    Process image with DeepSeek-OCR and return annotated image, markdown, html, and text.
+    Adds deep parsing for figures to render charts (from tables) and chemistry (from SMILES).
     """
     if image is None:
-        return None, "Please upload an image first.", "Please upload an image first."
+        return None, "Please upload an image first.", "<p>Please upload an image first.</p>", "Please upload an image first."

     # handle CPU/GPU
     if torch.cuda.is_available():
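Reviewer note (not part of the commit): both new section builders consume the same `(title, data_uri)` asset list that `process_image` accumulates; a minimal illustration of that contract (the URI below is a truncated placeholder, not real image data):

    from typing import List, Tuple

    assets: List[Tuple[str, str]] = [
        ("Figure (re-rendered from parsed data)", "data:image/png;base64,iVBORw0KGgo..."),
    ]
    parsed_text = "| year | sales |\n|------|-------|\n| 2022 | 10 |"
    # _assets_to_markdown_section(assets, parsed_text) -> "### Parsed Figures (auto-rendered)" Markdown block
    # _assets_to_html_section(assets, parsed_text)     -> '<section class="parsed-figures">...' HTML block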
@@ -241,13 +349,10 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
     result_image = image_with_bboxes

     # ===== DEEP PARSING & RENDERING (secondary pass) =====
-    # Enable when:
-    # - Task is Convert to Markdown (common flow), or
-    # - Task is Parse Figure (primary content already requested), or
-    # - User checked deep_parse (default True)
     should_run_deep = deep_parse and task_type in {"📄 Convert to Markdown", "📈 Parse Figure"}

-    deep_assets = [] # list of tuples (title, data_uri or None), appended to markdown
+    deep_assets: List[Tuple[str, str]] = [] # (title, data_uri)
+    parsed_text = ""

     def _run_deep_parse(prompt_text):
         with torch.no_grad():
@@ -261,51 +366,60 @@ def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
                 crop_mode=config["crop_mode"],
                 save_results=False,
                 test_compress=True,
-                eval_mode=True, # text is enough
+                eval_mode=True,
             )

-    parsed_text = ""
     if should_run_deep:
         try:
             parsed_text = _run_deep_parse("<image>\nParse the figure.") or ""
         except Exception:
             parsed_text = ""

-        # 1) If there is an HTML table → build a chart
+        # 1) Charts/tables:
+        df = None
         if "<table" in parsed_text.lower() and _HAS_PANDAS:
-            try:
-                df = _html_table_to_df(parsed_text)
-                if df is not None:
-                    chart_uri = _df_to_chart_data_uri(df)
-                    if chart_uri:
-                        deep_assets.append(("Figure (re-rendered from parsed table)", chart_uri))
-                    # Add raw HTML table for copy/paste even if chart failed
-                    deep_assets.append(("Parsed table (HTML)", None))
-            except Exception:
-                pass
-
-        # 2) If there are SMILES → render molecules (if RDKit available)
-        smiles_assets = _render_smiles_block(parsed_text)
-        deep_assets.extend(smiles_assets)
-
-    # ===== Append deep assets into the Markdown so they show in the preview =====
-    if deep_assets:
-        md_parts = [markdown_content, "\n\n---\n\n### Parsed Figures (auto-rendered)\n"]
-        for title, data_uri in deep_assets:
-            if data_uri:
-                md_parts.append(f"**{title}**\n\n![]({data_uri})\n")
-            else:
-                # raw parsed output (e.g., table HTML) fenced for readability
-                # Keep size sensible; truncate huge content
-                snippet = parsed_text.strip()
-                if len(snippet) > 6000:
-                    snippet = snippet[:6000] + "\n<!-- truncated -->"
-                md_parts.append(f"**{title}**\n\n```html\n{snippet}\n```\n")
-        markdown_content = "\n".join(md_parts)
+            df = _html_table_to_df(parsed_text)
+
+        if df is None: # fallback: markdown pipe-table
+            df = _md_table_to_df(parsed_text)
+
+        if df is None: # fallback: generic numeric block
+            df = _numeric_block_to_df(parsed_text)
+
+        if df is not None:
+            chart_uri = _df_to_chart_data_uri(df)
+            if chart_uri:
+                deep_assets.append(("Figure (re-rendered from parsed data)", chart_uri))
+
+        # 2) Chemistry (SMILES)
+        deep_assets.extend(_render_smiles_block(parsed_text))
+
+    # ===== Append deep assets into the Markdown + build HTML preview =====
+    html_preview = "" # for HTML tab
+    if task_type == "📄 Convert to Markdown":
+        # extend markdown with a diagnostic/asset section regardless of success,
+        # so you can see whether deep-parse attempted
+        markdown_content = markdown_content + _assets_to_markdown_section(deep_assets, parsed_text)
+        html_preview = _assets_to_html_section(deep_assets, parsed_text)
+    elif task_type == "📈 Parse Figure":
+        # just show what we got from deep parse
+        header = "# Parse Figure\n\n"
+        body = _assets_to_markdown_section(deep_assets, parsed_text)
+        markdown_content = header + body
+        html_preview = _assets_to_html_section(deep_assets, parsed_text)
+    else:
+        # other tasks: keep as-is, but still provide an HTML tab with any assets
+        if deep_assets or parsed_text.strip():
+            markdown_content = markdown_content + _assets_to_markdown_section(deep_assets, parsed_text)
+            html_preview = _assets_to_html_section(deep_assets, parsed_text)
+        else:
+            html_preview = "<p>No parsed-figure content.</p>"

     # ===== Decide what to show in the "Markdown Source (or Eval Output)" tab =====
     text_result = plain_text_result if plain_text_result else markdown_content
-    return result_image, markdown_content, text_result
+
+    # return (image, markdown, html, text)
+    return result_image, markdown_content, html_preview, text_result


 # ===== Theme and UI =====
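Reviewer note (not part of the commit): the chart-data source now degrades through three parsers before giving up. A compact sketch of that order, written as a hypothetical wrapper around the committed helpers:

    def _chart_df_from(parsed_text):  # hypothetical name; mirrors the committed control flow
        df = None
        if "<table" in parsed_text.lower():
            df = _html_table_to_df(parsed_text)      # 1) HTML <table> via pandas.read_html
        if df is None:
            df = _md_table_to_df(parsed_text)        # 2) Markdown pipe-table
        if df is None:
            df = _numeric_block_to_df(parsed_text)   # 3) raw CSV/whitespace numeric block
        return df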
@@ -393,6 +507,8 @@ with gr.Blocks(
                     output_image = gr.Image(interactive=False)
                 with gr.TabItem("Markdown Preview"):
                     output_markdown = gr.Markdown()
+                with gr.TabItem("Rendered HTML (figures)"):
+                    output_html = gr.HTML()
                 with gr.TabItem("Markdown Source (or Eval Output)"):
                     output_text = gr.Textbox(
                         lines=20, show_copy_button=True, interactive=False
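Reviewer note (not part of the commit): the new tab exists because the Markdown component may sanitize data-URI images, while `gr.HTML` renders them directly. A standalone check, assuming gradio and Pillow are installed:

    import base64, io
    import gradio as gr
    from PIL import Image

    buf = io.BytesIO()
    Image.new("RGB", (80, 40), (200, 200, 200)).save(buf, format="PNG")
    uri = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

    with gr.Blocks() as demo:
        gr.HTML(f'<img src="{uri}" alt="test figure"/>')
    # demo.launch()  # uncomment to confirm the inline image renders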
@@ -411,7 +527,7 @@ with gr.Blocks(
     submit_btn.click(
         fn=process_image,
         inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
-        outputs=[output_image, output_markdown, output_text],
+        outputs=[output_image, output_markdown, output_html, output_text],
     )
