lucacadalora committed on
Commit
7e1f32d
·
verified ·
1 Parent(s): aa2b99c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -369
app.py CHANGED
@@ -1,28 +1,16 @@
1
  import os
2
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0") # disable hf_transfer if missing
3
 
4
- import io
5
- import json
6
- import re
7
- import tempfile
8
- from typing import List, Dict, Any, Optional, Tuple
9
-
10
  import gradio as gr
11
  import torch
12
  from transformers import AutoModel, AutoTokenizer
13
  import spaces
14
- from PIL import Image, ImageDraw
15
-
16
- # Optional: pandas for better table handling (not mandatory)
17
- try:
18
- import pandas as pd
19
- _HAS_PANDAS = True
20
- except Exception:
21
- _HAS_PANDAS = False
22
-
23
  from gradio.themes import Soft
24
  from gradio.themes.utils import fonts
25
-
26
 
27
  # ===== Model Load =====
28
  model_name = "deepseek-ai/DeepSeek-OCR"
@@ -46,348 +34,160 @@ except Exception:
46
  pass
47
 
48
 
49
- # ====== Utilities: JSON/table parsing ======
50
_JSON_FENCE_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
# Python's ``re`` module has no recursive groups: the former ``(?1)`` pattern
# raised ``re.error`` as soon as this module was imported. The name is kept
# (same single capture group) as a coarse "first '{' to last '}'" matcher;
# _extract_json no longer relies on it and scans with a real JSON decoder.
_ANY_JSON_RE = re.compile(r"(\{.*\})", re.DOTALL)

_MD_TABLE_BLOCK_RE = re.compile(
    r"(?:^\s*\|.+\|\s*$\n^\s*\|(?:\s*:?-+:?\s*\|)+\s*$\n(?:^\s*\|.+\|\s*$\n?)+)",
    flags=re.MULTILINE,
)


def _first_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Return the first ``{...}`` span of *text* that decodes to a JSON object."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This brace did not open a valid object; try the next one.
        start = text.find("{", start + 1)
    return None


def _extract_json(text: str) -> Optional[Dict[str, Any]]:
    """
    Try to extract a single JSON object from text.

    1) Prefer the contents of a ```json fenced block.
    2) Otherwise (or if the fenced content does not parse) fall back to the
       first brace-delimited span anywhere in the text that decodes as a
       JSON object, nested braces included.

    Args:
        text: Raw model output; may be empty or None.

    Returns:
        The decoded dict, or None when no JSON object can be found.
    """
    if not text:
        return None

    fenced = _JSON_FENCE_RE.search(text)
    if fenced:
        try:
            parsed = json.loads(fenced.group(1).strip())
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    return _first_json_object(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
def _json_to_markdown_table(js: Dict[str, Any]) -> Optional[str]:
    """
    Convert a chart-style JSON into a Markdown pipe table.

    Expected schema (flexible):
        {
          "type": "bar|line|...",
          "title": "...",
          "x": ["Germany", "France", ...],   # categories (or "categories")
          "series": [{"name": "2024", "data": [...]}, ...]
        }
    Keys handled: ``x`` (or ``categories``) and ``series``; everything else
    is ignored.

    The input comes from model output, so it is treated defensively:
    series entries that are not dicts are skipped, a non-list ``data`` is
    treated as empty, ``None`` values render as empty cells, and ``|`` or
    newlines inside a cell are escaped so they cannot break the table.

    Returns:
        The Markdown table text, or None when the input has no usable
        ``x``/``series`` lists.
    """
    if not js or not isinstance(js, dict):
        return None

    x = js.get("x") or js.get("categories")
    series = js.get("series")
    if not isinstance(x, list) or not isinstance(series, list):
        return None

    def _cell(value) -> str:
        # Empty cell for missing values; escape characters that end a cell/row.
        if value is None:
            return ""
        return str(value).replace("|", "\\|").replace("\n", " ")

    columns = [s for s in series if isinstance(s, dict)]

    # First column is the x category, the remaining columns are series values.
    headers = ["Category"] + [_cell(s.get("name", f"series{i}")) for i, s in enumerate(columns)]
    rows: List[List[str]] = []
    for i, cat in enumerate(x):
        row = [_cell(cat)]
        for s in columns:
            data = s.get("data", [])
            if not isinstance(data, list):
                data = []
            row.append(_cell(data[i]) if i < len(data) else "")
        rows.append(row)

    header_line = "| " + " | ".join(headers) + " |"
    align_line = "| " + " | ".join([":---"] * len(headers)) + " |"
    data_lines = ["| " + " | ".join(r) + " |" for r in rows]
    return "\n".join([header_line, align_line, *data_lines])
118
-
119
-
120
def _md_table_to_df(md_text: str):
    """
    Parse the first Markdown pipe table found in *md_text* into a DataFrame.

    The table is located with ``_MD_TABLE_BLOCK_RE``. The first row becomes
    the column names, the alignment/separator row is dropped, and data rows
    whose cell count differs from the header are skipped. Every column after
    the first is converted to a numeric dtype when all of its values parse;
    otherwise it is left as text.

    Returns:
        A DataFrame, or None when pandas is unavailable, no table is found,
        or no data row matches the header width.
    """
    if not _HAS_PANDAS:
        return None
    m = _MD_TABLE_BLOCK_RE.search(md_text or "")
    if not m:
        return None

    lines = [ln.strip() for ln in m.group(0).splitlines() if ln.strip()]
    if len(lines) < 2:
        return None

    header = [h.strip() for h in lines[0].strip("|").split("|")]
    # Match the whole separator row; a bare "-+" search would also match a
    # data row that merely contains a hyphen (e.g. a negative number).
    has_separator = re.fullmatch(r"\|(?:\s*:?-+:?\s*\|)+", lines[1]) is not None
    data_lines = lines[2:] if has_separator else lines[1:]

    rows = []
    for ln in data_lines:
        parts = [p.strip() for p in ln.strip("|").split("|")]
        if len(parts) == len(header):
            rows.append(parts)
    if not rows:
        return None

    df = pd.DataFrame(rows, columns=header)
    # Best-effort numeric cast. ``errors="ignore"`` is deprecated in pandas,
    # so the same "leave the column alone on failure" rule is spelled out.
    for c in df.columns[1:]:
        try:
            df[c] = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            pass
    return df
145
-
146
-
147
def _numeric_block_to_df(text: str):
    """
    Rough fallback: turn the largest numeric-looking block of lines into a DataFrame.

    Consecutive lines that contain a digit and at least one of comma, tab,
    space or pipe are grouped into blocks. The longest block (by character
    count) is parsed first as comma-separated text, then as
    whitespace-separated text; the first parse yielding two or more columns
    wins.

    Returns:
        A DataFrame, or None when pandas is unavailable, no such block
        exists, or neither parse produces at least two columns.
    """
    if not _HAS_PANDAS:
        return None

    delimiters = (",", "\t", " ", "|")
    candidates = []
    pending = []
    for line in (text or "").splitlines():
        looks_tabular = bool(re.search(r"\d", line)) and any(d in line for d in delimiters)
        if looks_tabular:
            pending.append(line)
            continue
        # A non-tabular line ends the current run, if there is one.
        if pending:
            candidates.append("\n".join(pending))
            pending = []
    if pending:
        candidates.append("\n".join(pending))
    if not candidates:
        return None

    from io import StringIO

    longest = max(candidates, key=len)
    # Attempt order matters: plain CSV first, then whitespace-delimited.
    for reader_options in ({}, {"sep": r"\s+", "engine": "python"}):
        try:
            frame = pd.read_csv(StringIO(longest), **reader_options)
        except Exception:
            continue
        if frame.shape[1] >= 2:
            return frame
    return None
177
-
178
-
179
def _df_to_markdown_and_csv(df) -> Tuple[str, str]:
    """
    Render a DataFrame as a Markdown pipe table and as CSV text.

    Returns:
        ``(markdown_pipe_table, csv_text)``; both are empty strings when
        pandas is unavailable or *df* is None.
    """
    if df is None or not _HAS_PANDAS:
        return "", ""

    def _pipe_row(cells) -> str:
        return "| " + " | ".join(str(cell) for cell in cells) + " |"

    # Markdown: header row, left-aligned separator row, then one row per record.
    column_names = list(df.columns)
    table_lines = [_pipe_row(column_names), _pipe_row([":---"] * len(column_names))]
    table_lines.extend(_pipe_row(record.tolist()) for _, record in df.iterrows())

    # CSV, without the index column.
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    return "\n".join(table_lines), csv_buffer.getvalue()
197
-
198
-
199
- # ===== Inference Function =====
200
def _json_safe_cell(value):
    """Convert one DataFrame cell into a value ``json.dumps`` can serialize."""
    if pd.isna(value):
        return None
    if isinstance(value, bool):
        return value
    try:
        number = float(value)  # also unwraps numpy scalars and handles negatives
    except (TypeError, ValueError):
        return str(value)
    # NaN / infinity are not valid JSON numbers; keep them as text.
    if number != number or number in (float("inf"), float("-inf")):
        return str(value)
    return number


@spaces.GPU
def process_image(image, model_size, task_type, ref_text, is_eval_mode, deep_parse=True):
    """
    Process image with DeepSeek-OCR and return annotated image, markdown, and text.

    When deep parsing is enabled for the "Convert to Markdown" / "Parse Figure"
    tasks, a second inference pass asks for structured figure data, which is
    appended to the Markdown as JSON + table + CSV for RAG indexing.

    Args:
        image: PIL image to process, or None.
        model_size: Key into the size presets; unknown values fall back to
            "Gundam (Recommended)".
        task_type: One of the task labels offered by the UI; unknown values
            fall back to Free OCR.
        ref_text: Reference text, required for the Locate task.
        is_eval_mode: Forwarded to the model's ``infer`` as ``eval_mode``.
        deep_parse: Enable the structured-data pass (see above).

    Returns:
        ``(result_image, markdown_content, text_result)``.

    Raises:
        gr.Error: Locate task selected without reference text.
    """
    if image is None:
        return None, "Please upload an image first.", "Please upload an image first."

    # device
    if torch.cuda.is_available():
        model_runtime = model.to("cuda", dtype=torch.bfloat16)
    else:
        model_runtime = model.to("cpu", dtype=torch.float32)

    with tempfile.TemporaryDirectory() as output_path:
        # ===== choose task prompt =====
        if task_type == "📄 Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "📈 Parse Figure":
            prompt = "<image>\nParse the figure."
        elif task_type == "🔍 Locate Object by Reference":
            if not ref_text or ref_text.strip() == "":
                raise gr.Error("Please provide reference text for the Locate task!")
            prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
        else:
            # "📝 Free OCR" and any unrecognized task
            prompt = "<image>\nFree OCR."

        # save image; JPEG cannot store alpha/palette modes, so convert those first
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        jpeg_image = image if image.mode in ("RGB", "L") else image.convert("RGB")
        jpeg_image.save(temp_image_path)

        # size
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        def _infer(prompt_text, save_results, eval_mode):
            # Single place for the inference call shared by all passes.
            with torch.no_grad():
                return model_runtime.infer(
                    tokenizer,
                    prompt=prompt_text,
                    image_file=temp_image_path,
                    output_path=output_path,
                    base_size=config["base_size"],
                    image_size=config["image_size"],
                    crop_mode=config["crop_mode"],
                    save_results=save_results,
                    test_compress=True,
                    eval_mode=eval_mode,
                )

        # ===== primary pass =====
        primary_text = _infer(prompt, True, is_eval_mode)

        # collect results
        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
        markdown_result_path = os.path.join(output_path, "result.mmd")

        markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."
        if os.path.exists(markdown_result_path):
            try:
                with open(markdown_result_path, "r", encoding="utf-8") as f:
                    markdown_content = f.read()
            except (OSError, UnicodeDecodeError):
                pass  # keep the placeholder message

        result_image = None
        if os.path.exists(image_result_path):
            try:
                result_image = Image.open(image_result_path)
                # Force the pixel data into memory before the temp dir is removed.
                result_image.load()
            except OSError:
                result_image = None

        # draw bboxes if <|det|>; coordinates are scaled from a 0-1000 range to pixels
        det_pat = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
        matches = list(det_pat.finditer(primary_text or ""))
        if matches:
            img_with_boxes = image.copy()
            draw = ImageDraw.Draw(img_with_boxes)
            w, h = image.size
            for m in matches:
                x1, y1, x2, y2 = [int(c) for c in m.groups()]
                box = [int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h)]
                draw.rectangle(box, outline="red", width=3)
            result_image = img_with_boxes

        # ===== deep parse for DATA (not rendering) =====
        # Runs only when the checkbox is on AND the task is Convert to Markdown / Parse Figure.
        should_deep = deep_parse and (task_type in {"📄 Convert to Markdown", "📈 Parse Figure"})

        extracted_md_section = ""
        if should_deep:
            # ask model for STRICT JSON for charts
            strict_json_prompt = (
                "<image>\n"
                "Parse the figure. If it's a chart, return ONLY a single JSON object with keys:\n"
                "{\n"
                ' "type": "bar|line|area|scatter|pie|table|unknown",\n'
                ' "title": "string",\n'
                ' "x": ["category", ...],\n'
                ' "series": [{"name": "string", "data": [number|null, ...]}, ...]\n'
                "}\n"
                "If it's a table, return the same JSON using 'type': 'table' and fill x from the first column and series from remaining columns.\n"
                "Do not include any explanation text. Return ONLY the JSON."
            )
            deep_text = _infer(strict_json_prompt, False, True) or ""
            js = _extract_json(deep_text)

            md_table, csv_text = "", ""
            if js:
                # Prefer JSON → Markdown table
                md_table = _json_to_markdown_table(js) or ""
                if _HAS_PANDAS and md_table:
                    df = _md_table_to_df(md_table)
                    if df is not None:
                        md_table, csv_text = _df_to_markdown_and_csv(df)
                # Build Markdown block with JSON + (optional) table + CSV
                extracted_md_section = "### Extracted Figure Data\n\n"
                extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js, ensure_ascii=False, indent=2) + "\n```\n\n"
                if md_table:
                    extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
                if csv_text:
                    extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
            else:
                # Fallback: ask for generic figure parse, then try to pull Markdown tables / numeric blocks
                fallback_text = _infer("<image>\nParse the figure.", False, True) or ""

                df = _md_table_to_df(fallback_text)
                if df is None:
                    df = _numeric_block_to_df(fallback_text)

                if df is not None:
                    md_table, csv_text = _df_to_markdown_and_csv(df)
                    js_fallback = {
                        "type": "table",
                        "title": "",
                        "x": df.iloc[:, 0].astype(str).tolist(),
                        "series": [
                            {"name": str(c), "data": [_json_safe_cell(v) for v in df[c].tolist()]}
                            for c in df.columns[1:]
                        ],
                    }
                    extracted_md_section = "### Extracted Figure Data\n\n"
                    extracted_md_section += "**JSON (canonical for RAG)**\n\n```json\n" + json.dumps(js_fallback, ensure_ascii=False, indent=2) + "\n```\n\n"
                    extracted_md_section += "**Table (Markdown)**\n\n" + md_table + "\n\n"
                    extracted_md_section += "**CSV**\n\n```csv\n" + csv_text.strip() + "\n```\n"
                else:
                    # Nothing structured; keep a short diagnostic (plain text only)
                    extracted_md_section = "### Extracted Figure Data\n\n_No structured table/series detected. You may need to adjust the deep-parse prompt for this figure type._\n"

        # ===== Merge into final Markdown =====
        if extracted_md_section:
            markdown_content = markdown_content.rstrip() + "\n\n---\n\n" + extracted_md_section

        # For the “Markdown Source (or Eval Output)” tab
        text_result = primary_text if primary_text else markdown_content

        return result_image, markdown_content, text_result
391
 
392
 
393
  # ===== Theme and UI =====
@@ -407,77 +207,109 @@ custom_css = """
407
 
408
  # ===== Interface =====
409
  with gr.Blocks(
410
- title="DeepSeek-OCR by Jatevo LLM Inference",
411
  theme=theme,
412
  css=custom_css,
413
  ) as demo:
414
  gr.Markdown(
415
  """
416
- # DeepSeek-OCR by Jatevo LLM Inference
417
- Upload an image to extract text using **DeepSeek-OCR**.
418
- Supports documents, forms, receipts, figures, and object localization.
419
-
 
420
  **Model Sizes:**
421
  - **Tiny** — Fastest, lower accuracy (512×512)
422
  - **Small** — Fast, good accuracy (640×640)
423
  - **Base** — Balanced performance (1024×1024)
424
  - **Large** — Best accuracy, slower (1280×1280)
425
  - **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
 
 
426
  """
427
  )
428
 
429
  with gr.Row():
430
  with gr.Column(scale=1):
431
- image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
 
 
 
 
 
432
  model_size = gr.Dropdown(
433
  choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
434
  value="Gundam (Recommended)",
435
  label="Model Size",
436
  )
 
437
  task_type = gr.Dropdown(
438
- choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
 
 
 
 
 
439
  value="📄 Convert to Markdown",
440
  label="Task Type",
441
  )
 
442
  ref_text_input = gr.Textbox(
443
  label="Reference Text (for Locate task)",
444
  placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
445
  visible=False,
446
  )
 
447
  eval_mode_checkbox = gr.Checkbox(
448
  value=False,
449
  label="Enable Evaluation Mode",
450
- info="Returns only plain text (faster). Uncheck to get annotated image and markdown.",
451
- )
452
- deep_parse_checkbox = gr.Checkbox(
453
- value=True,
454
- label="Deep parse and extract figure data (JSON + table + CSV)",
455
- info="Adds a second pass that extracts machine-readable data for RAG.",
456
  )
457
- submit_btn = gr.Button("Process Image", variant="primary")
 
458
 
459
  with gr.Column(scale=2):
460
- with gr.Tabs():
461
- with gr.TabItem("Annotated Image"):
462
- output_image = gr.Image(interactive=False)
463
- with gr.TabItem("Markdown Preview"):
464
- output_markdown = gr.Markdown()
465
- with gr.TabItem("Markdown Source (or Eval Output)"):
466
- output_text = gr.Textbox(lines=20, show_copy_button=True, interactive=False)
 
 
 
 
 
 
 
467
 
 
468
  def toggle_ref_text_visibility(task):
469
  return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
470
 
471
- task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
 
 
 
 
 
 
 
 
472
 
473
  submit_btn.click(
474
- fn=process_image,
475
- inputs=[image_input, model_size, task_type, ref_text_input, eval_mode_checkbox, deep_parse_checkbox],
476
- outputs=[output_image, output_markdown, output_text],
 
 
 
 
477
  )
478
 
479
 
480
  # ===== Launch =====
481
  if __name__ == "__main__":
482
  demo.queue(max_size=20)
483
- demo.launch()
 
1
  import os
2
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0") # disable hf_transfer if missing
3
 
 
 
 
 
 
 
4
  import gradio as gr
5
  import torch
6
  from transformers import AutoModel, AutoTokenizer
7
  import spaces
8
+ import tempfile
9
+ from PIL import Image
10
+ import re
 
 
 
 
 
 
11
  from gradio.themes import Soft
12
  from gradio.themes.utils import fonts
13
+ import fitz # PyMuPDF for PDF processing
14
 
15
  # ===== Model Load =====
16
  model_name = "deepseek-ai/DeepSeek-OCR"
 
34
  pass
35
 
36
 
37
def pdf_to_images(pdf_path, dpi=200):
    """
    Convert PDF pages to PIL Images using PyMuPDF.

    Args:
        pdf_path: Path to PDF file
        dpi: Resolution for rendering (default 200); must be positive

    Returns:
        List of PIL Image objects (mode "RGB"), one per page in page order

    Raises:
        ValueError: If *dpi* is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # Scale factor relative to the 72-DPI baseline; identical for every page,
    # so the matrix is built once.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even if rendering a page raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # alpha=False keeps the samples 3 bytes/pixel, matching the "RGB" mode below.
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    return images
61
+
62
+
63
def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path):
    """
    Process a single page/image with DeepSeek-OCR.

    The image is written to ``temp_image.jpg`` inside *output_path* and passed
    to ``model_runtime.infer``. If that call leaves a ``result.mmd`` file in
    *output_path* its contents are returned; otherwise the text returned by
    ``infer`` is used.

    Args:
        image: PIL image of the page.
        model_runtime: Model object exposing ``infer``.
        tokenizer: Tokenizer forwarded to ``infer``.
        model_size: Key into the size presets; unknown values fall back to
            "Gundam (Recommended)".
        task_type: One of the task labels offered by the UI; unknown values
            fall back to Free OCR.
        ref_text: Reference text, required for the Locate task.
        is_eval_mode: Forwarded to ``infer`` as ``eval_mode``.
        output_path: Existing working directory; may be reused across pages.

    Returns:
        Markdown (or plain text) for the page; an empty string if the model
        produced nothing.

    Raises:
        gr.Error: Locate task selected without reference text.
    """
    # ===== choose task prompt =====
    if task_type == "📄 Convert to Markdown":
        prompt = "<image>\n<|grounding|>Convert the document to markdown."
    elif task_type == "📈 Parse Figure":
        prompt = "<image>\nParse the figure."
    elif task_type == "🔍 Locate Object by Reference":
        reference = (ref_text or "").strip()
        if not reference:
            raise gr.Error("Please provide reference text for the Locate task!")
        prompt = f"<image>\nLocate <|ref|>{reference}<|/ref|> in the image."
    else:
        # "📝 Free OCR" and any unrecognized task
        prompt = "<image>\nFree OCR."

    temp_image_path = os.path.join(output_path, "temp_image.jpg")
    markdown_result_path = os.path.join(output_path, "result.mmd")

    # output_path is shared between pages: drop any result.mmd left by a
    # previous page so it cannot be returned as this page's content when the
    # current pass writes no file.
    if os.path.exists(markdown_result_path):
        os.remove(markdown_result_path)

    # save image temporarily; JPEG cannot store alpha/palette modes
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    image.save(temp_image_path)

    # ===== size config =====
    size_configs = {
        "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
        "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
        "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
        "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
        "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
    }
    config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

    # ===== inference =====
    with torch.no_grad():
        plain_text_result = model_runtime.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=is_eval_mode,
        )

    # ===== collect markdown result =====
    markdown_content = ""
    if os.path.exists(markdown_result_path):
        try:
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        except (OSError, UnicodeDecodeError):
            # Unreadable result file: fall through to the plain-text result.
            markdown_content = ""

    # If no markdown file, use plain text result
    return markdown_content or plain_text_result or ""
127
 
128
 
129
+ # ===== Main Processing Function =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
@spaces.GPU
def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, progress=gr.Progress()):
    """
    Process PDF with DeepSeek-OCR and return combined markdown from all pages.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path (str/PathLike)
            or an object exposing the path as ``.name``; None when nothing
            was uploaded.
        model_size: Size preset name forwarded to ``process_single_page``.
        task_type: Task label forwarded to ``process_single_page``.
        ref_text: Reference text for the Locate task.
        is_eval_mode: Forwarded as the model's evaluation-mode flag.
        progress: Gradio progress tracker used to report per-page status.

    Returns:
        One markdown string: a document header followed by a "# Page N"
        section per page. Failures are reported as a message string
        ("Error processing PDF: ...") rather than raised.
    """
    if pdf_file is None:
        return "Please upload a PDF file first."

    # The file input in this app is declared with type="filepath", which
    # delivers a path string; objects carrying the path in ``.name`` are
    # still accepted.
    pdf_path = pdf_file if isinstance(pdf_file, (str, os.PathLike)) else pdf_file.name

    # handle CPU/GPU
    if torch.cuda.is_available():
        model_runtime = model.to("cuda", dtype=torch.bfloat16)
    else:
        model_runtime = model.to("cpu", dtype=torch.float32)

    try:
        # Convert PDF to images
        progress(0, desc="Converting PDF to images...")
        images = pdf_to_images(pdf_path)
        total_pages = len(images)

        if total_pages == 0:
            return "No pages found in the PDF."

        progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")

        # Process each page
        all_markdown_results = []

        with tempfile.TemporaryDirectory() as output_path:
            for page_num, image in enumerate(images, start=1):
                # Report the fraction of pages already finished, so the bar
                # only reaches 100% after the last page is done.
                progress(
                    ((page_num - 1) / total_pages) * 0.9 + 0.1,
                    desc=f"Processing page {page_num}/{total_pages}..."
                )

                markdown_content = process_single_page(
                    image,
                    model_runtime,
                    tokenizer,
                    model_size,
                    task_type,
                    ref_text,
                    is_eval_mode,
                    output_path,
                )

                # Add page separator
                page_header = f"\n\n---\n\n# Page {page_num}\n\n"
                all_markdown_results.append(page_header + markdown_content)

        # Combine all results
        progress(1.0, desc="Finalizing...")
        combined_markdown = "\n\n".join(all_markdown_results)

        # Add document header
        final_output = f"# Document OCR Results\n\n**Total Pages:** {total_pages}\n\n{combined_markdown}"

        return final_output

    except Exception as e:
        # UI boundary: surface the failure as text in the output box.
        return f"Error processing PDF: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
 
193
  # ===== Theme and UI =====
 
207
 
208
  # ===== Interface =====
209
  with gr.Blocks(
210
+ title="DeepSeek-OCR PDF Parser by Jatevo LLM Inference",
211
  theme=theme,
212
  css=custom_css,
213
  ) as demo:
214
  gr.Markdown(
215
  """
216
+ # 📄 DeepSeek-OCR PDF Parser by Jatevo LLM Inference
217
+
218
+ Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
219
+ Each page is processed sequentially and combined into a single markdown document.
220
+
221
  **Model Sizes:**
222
  - **Tiny** — Fastest, lower accuracy (512×512)
223
  - **Small** — Fast, good accuracy (640×640)
224
  - **Base** — Balanced performance (1024×1024)
225
  - **Large** — Best accuracy, slower (1280×1280)
226
  - **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
227
+
228
+ **Note:** Processing time depends on the number of pages and model size.
229
  """
230
  )
231
 
232
  with gr.Row():
233
  with gr.Column(scale=1):
234
+ pdf_input = gr.File(
235
+ label="Upload PDF",
236
+ file_types=[".pdf"],
237
+ type="filepath"
238
+ )
239
+
240
  model_size = gr.Dropdown(
241
  choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
242
  value="Gundam (Recommended)",
243
  label="Model Size",
244
  )
245
+
246
  task_type = gr.Dropdown(
247
+ choices=[
248
+ "📝 Free OCR",
249
+ "📄 Convert to Markdown",
250
+ "📈 Parse Figure",
251
+ "🔍 Locate Object by Reference",
252
+ ],
253
  value="📄 Convert to Markdown",
254
  label="Task Type",
255
  )
256
+
257
  ref_text_input = gr.Textbox(
258
  label="Reference Text (for Locate task)",
259
  placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
260
  visible=False,
261
  )
262
+
263
  eval_mode_checkbox = gr.Checkbox(
264
  value=False,
265
  label="Enable Evaluation Mode",
266
+ info="Returns only plain text (faster).",
 
 
 
 
 
267
  )
268
+
269
+ submit_btn = gr.Button("🚀 Process PDF", variant="primary", size="lg")
270
 
271
  with gr.Column(scale=2):
272
+ gr.Markdown("### 📝 Markdown Output")
273
+ output_markdown_preview = gr.Markdown(
274
+ label="Rendered Markdown",
275
+ value="*Upload a PDF and click 'Process PDF' to see results here.*"
276
+ )
277
+
278
+ gr.Markdown("### 📄 Markdown Source (Copy/Download)")
279
+ output_text = gr.Textbox(
280
+ label="Raw Markdown",
281
+ lines=25,
282
+ show_copy_button=True,
283
+ interactive=False,
284
+ placeholder="Markdown source will appear here..."
285
+ )
286
 
287
+ # show/hide reference text box based on selected task
288
  def toggle_ref_text_visibility(task):
289
  return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
290
 
291
+ task_type.change(
292
+ fn=toggle_ref_text_visibility,
293
+ inputs=task_type,
294
+ outputs=ref_text_input,
295
+ )
296
+
297
+ def update_outputs(markdown_text):
298
+ """Update both markdown preview and raw text"""
299
+ return markdown_text, markdown_text
300
 
301
  submit_btn.click(
302
+ fn=process_pdf,
303
+ inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox],
304
+ outputs=output_text,
305
+ ).then(
306
+ fn=update_outputs,
307
+ inputs=output_text,
308
+ outputs=[output_markdown_preview, output_text]
309
  )
310
 
311
 
312
  # ===== Launch =====
313
  if __name__ == "__main__":
314
  demo.queue(max_size=20)
315
+ demo.launch()