Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -94,23 +94,35 @@ def extract_images_from_pdf(pdf_path):
|
|
| 94 |
return images_by_page
|
| 95 |
|
| 96 |
|
| 97 |
-
def image_to_base64(pil_image, format='
|
| 98 |
"""
|
| 99 |
Convert PIL Image to base64 string for markdown embedding
|
|
|
|
| 100 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
buffered = BytesIO()
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
img_str = base64.b64encode(buffered.getvalue()).decode()
|
| 104 |
return f"data:image/{format.lower()};base64,{img_str}"
|
| 105 |
|
| 106 |
|
| 107 |
def detect_figure_regions(text_result, original_image):
|
| 108 |
"""
|
| 109 |
-
Detect figure regions from OCR output
|
| 110 |
Returns list of cropped figure images
|
| 111 |
"""
|
| 112 |
-
# Look for figure-related patterns in the text
|
| 113 |
-
# This is a heuristic approach - adjust based on your needs
|
| 114 |
figure_images = []
|
| 115 |
|
| 116 |
# Pattern to detect bounding boxes (if model returns them)
|
|
@@ -200,40 +212,40 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type,
|
|
| 200 |
# ===== Embed images if requested =====
|
| 201 |
if embed_images and markdown_content:
|
| 202 |
# Check if markdown mentions figures/charts/images
|
| 203 |
-
figure_keywords = ['figure', 'chart', 'graph', 'diagram', 'image', 'plot', 'illustration']
|
| 204 |
has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
|
| 205 |
|
| 206 |
if has_figure_mention:
|
| 207 |
-
# Try to detect figure regions
|
| 208 |
figure_images = detect_figure_regions(plain_text_result, image)
|
| 209 |
|
| 210 |
-
# If
|
| 211 |
-
if
|
| 212 |
-
|
| 213 |
-
base64_img = image_to_base64(image, format='JPEG')
|
| 214 |
-
figure_markdown = f"\n\n\n\n"
|
| 215 |
-
|
| 216 |
-
# Insert image after first mention of figure/chart
|
| 217 |
-
for keyword in figure_keywords:
|
| 218 |
-
if keyword in markdown_content.lower():
|
| 219 |
-
# Find the line with the keyword
|
| 220 |
-
lines = markdown_content.split('\n')
|
| 221 |
-
for i, line in enumerate(lines):
|
| 222 |
-
if keyword in line.lower():
|
| 223 |
-
# Insert image after this line
|
| 224 |
-
lines.insert(i + 1, figure_markdown)
|
| 225 |
-
markdown_content = '\n'.join(lines)
|
| 226 |
-
break
|
| 227 |
-
break
|
| 228 |
-
|
| 229 |
-
# If we found specific figure regions, embed them
|
| 230 |
-
elif figure_images:
|
| 231 |
for idx, fig_img in enumerate(figure_images):
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
return markdown_content
|
| 237 |
|
| 238 |
|
| 239 |
# ===== Main Processing Function =====
|
|
@@ -241,9 +253,10 @@ def process_single_page(image, model_runtime, tokenizer, model_size, task_type,
|
|
| 241 |
def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_images, progress=gr.Progress()):
|
| 242 |
"""
|
| 243 |
Process PDF with DeepSeek-OCR and return combined markdown from all pages.
|
|
|
|
| 244 |
"""
|
| 245 |
if pdf_file is None:
|
| 246 |
-
return "Please upload a PDF file first."
|
| 247 |
|
| 248 |
# handle CPU/GPU
|
| 249 |
if torch.cuda.is_available():
|
|
@@ -258,13 +271,17 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
|
|
| 258 |
total_pages = len(images)
|
| 259 |
|
| 260 |
if total_pages == 0:
|
| 261 |
-
return "No pages found in the PDF."
|
| 262 |
|
| 263 |
# Extract embedded images if needed
|
| 264 |
embedded_images = {}
|
| 265 |
if embed_images:
|
| 266 |
-
progress(0.05, desc="Extracting embedded images...")
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")
|
| 270 |
|
|
@@ -279,7 +296,7 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
|
|
| 279 |
desc=f"Processing page {page_num}/{total_pages}..."
|
| 280 |
)
|
| 281 |
|
| 282 |
-
markdown_content = process_single_page(
|
| 283 |
image,
|
| 284 |
model_runtime,
|
| 285 |
tokenizer,
|
|
@@ -294,12 +311,15 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
|
|
| 294 |
|
| 295 |
# Add embedded images from PDF if any
|
| 296 |
if embed_images and (page_num - 1) in embedded_images:
|
| 297 |
-
markdown_content += "\n\n### Embedded Images from
|
| 298 |
for idx, img in enumerate(embedded_images[page_num - 1]):
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
-
# Add page separator
|
| 303 |
page_header = f"\n\n---\n\n# Page {page_num}\n\n"
|
| 304 |
all_markdown_results.append(page_header + markdown_content)
|
| 305 |
|
|
@@ -309,24 +329,61 @@ def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_i
|
|
| 309 |
gc.collect()
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
-
error_msg = f"\n\n---\n\n# Page {page_num}\n\n
|
| 313 |
all_markdown_results.append(error_msg)
|
| 314 |
print(f"Error on page {page_num}: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
continue
|
| 316 |
|
| 317 |
# Combine all results
|
| 318 |
-
progress(1.0, desc="Finalizing...")
|
| 319 |
combined_markdown = "\n\n".join(all_markdown_results)
|
| 320 |
|
| 321 |
-
# Add document header
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
-
return final_output
|
| 325 |
|
| 326 |
except Exception as e:
|
| 327 |
-
error_message = f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
print(f"Fatal error: {str(e)}")
|
| 329 |
-
|
|
|
|
|
|
|
| 330 |
|
| 331 |
|
| 332 |
# ===== Theme and UI =====
|
|
@@ -334,6 +391,7 @@ theme = Soft(
|
|
| 334 |
font=fonts.GoogleFont("Inter"),
|
| 335 |
font_mono=fonts.GoogleFont("JetBrains Mono"),
|
| 336 |
)
|
|
|
|
| 337 |
custom_css = """
|
| 338 |
.gradio-container, body {
|
| 339 |
font-family: 'Inter', ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', 'Apple Color Emoji','Segoe UI Emoji','Segoe UI Symbol','Noto Color Emoji' !important;
|
|
@@ -341,6 +399,7 @@ custom_css = """
|
|
| 341 |
.prose h1 { font-weight: 800; letter-spacing: -0.02em; }
|
| 342 |
.prose h2, .prose h3 { font-weight: 700; letter-spacing: -0.01em; }
|
| 343 |
.gr-button { border-radius: 12px; font-weight: 600; }
|
|
|
|
| 344 |
"""
|
| 345 |
|
| 346 |
|
|
@@ -357,27 +416,36 @@ with gr.Blocks(
|
|
| 357 |
Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
|
| 358 |
Each page is processed sequentially and combined into a single markdown document.
|
| 359 |
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
**
|
| 363 |
-
- **
|
| 364 |
-
- **Small** — Fast, good accuracy (640×640) - Good for 20+ pages
|
| 365 |
- **Base** — Balanced performance (1024×1024) - Good for 10-20 pages
|
| 366 |
- **Large** — Best accuracy, slower (1280×1280) - Best for <10 pages
|
| 367 |
- **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
|
| 368 |
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
- Enable "Embed Images" to include charts/figures
|
| 372 |
-
-
|
|
|
|
| 373 |
- Maximum recommended: 50 pages at once
|
|
|
|
| 374 |
"""
|
| 375 |
)
|
| 376 |
|
| 377 |
with gr.Row():
|
| 378 |
with gr.Column(scale=1):
|
| 379 |
pdf_input = gr.File(
|
| 380 |
-
label="Upload PDF",
|
| 381 |
file_types=[".pdf"],
|
| 382 |
type="filepath"
|
| 383 |
)
|
|
@@ -385,7 +453,7 @@ with gr.Blocks(
|
|
| 385 |
model_size = gr.Dropdown(
|
| 386 |
choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
|
| 387 |
value="Small",
|
| 388 |
-
label="Model Size",
|
| 389 |
info="Use Tiny/Small for large PDFs (20+ pages)"
|
| 390 |
)
|
| 391 |
|
|
@@ -397,58 +465,65 @@ with gr.Blocks(
|
|
| 397 |
"🔍 Locate Object by Reference",
|
| 398 |
],
|
| 399 |
value="📄 Convert to Markdown",
|
| 400 |
-
label="Task Type",
|
| 401 |
)
|
| 402 |
|
| 403 |
ref_text_input = gr.Textbox(
|
| 404 |
-
label="Reference Text (for Locate task)",
|
| 405 |
placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
|
| 406 |
visible=False,
|
| 407 |
)
|
| 408 |
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
|
|
|
| 420 |
|
| 421 |
submit_btn = gr.Button("🚀 Process PDF", variant="primary", size="lg")
|
| 422 |
|
| 423 |
gr.Markdown(
|
| 424 |
"""
|
| 425 |
---
|
| 426 |
-
**Processing Status:**
|
| 427 |
-
Watch the progress bar above for real-time updates.
|
| 428 |
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
"""
|
| 431 |
)
|
| 432 |
|
| 433 |
with gr.Column(scale=2):
|
| 434 |
-
gr.Markdown("### 📝 Markdown Output")
|
| 435 |
output_markdown_preview = gr.Markdown(
|
| 436 |
label="Rendered Markdown",
|
| 437 |
-
value="*Upload a PDF and click 'Process PDF' to see results here.*"
|
| 438 |
)
|
| 439 |
|
| 440 |
-
gr.Markdown("### 📄 Markdown Source (Copy/Download)")
|
| 441 |
output_text = gr.Textbox(
|
| 442 |
label="Raw Markdown",
|
| 443 |
lines=25,
|
| 444 |
show_copy_button=True,
|
| 445 |
interactive=False,
|
| 446 |
-
placeholder="Markdown source will appear here..."
|
| 447 |
)
|
| 448 |
|
| 449 |
# show/hide reference text box based on selected task
|
| 450 |
def toggle_ref_text_visibility(task):
|
| 451 |
-
return gr.Textbox(visible=
|
| 452 |
|
| 453 |
task_type.change(
|
| 454 |
fn=toggle_ref_text_visibility,
|
|
@@ -456,18 +531,10 @@ with gr.Blocks(
|
|
| 456 |
outputs=ref_text_input,
|
| 457 |
)
|
| 458 |
|
| 459 |
-
def update_outputs(markdown_text):
|
| 460 |
-
"""Update both markdown preview and raw text"""
|
| 461 |
-
return markdown_text, markdown_text
|
| 462 |
-
|
| 463 |
submit_btn.click(
|
| 464 |
fn=process_pdf,
|
| 465 |
inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox, embed_images_checkbox],
|
| 466 |
-
outputs=output_text,
|
| 467 |
-
).then(
|
| 468 |
-
fn=update_outputs,
|
| 469 |
-
inputs=output_text,
|
| 470 |
-
outputs=[output_markdown_preview, output_text]
|
| 471 |
)
|
| 472 |
|
| 473 |
|
|
@@ -479,6 +546,7 @@ if __name__ == "__main__":
|
|
| 479 |
default_concurrency_limit=2
|
| 480 |
)
|
| 481 |
demo.launch(
|
| 482 |
-
max_threads=40, # Increase thread limit
|
| 483 |
-
show_error=True # Show errors in UI
|
|
|
|
| 484 |
)
|
|
|
|
| 94 |
return images_by_page
|
| 95 |
|
| 96 |
|
| 97 |
+
def image_to_base64(pil_image, format='JPEG', max_size=(1200, 1200)):
    """
    Convert a PIL Image to a base64 data URI for markdown embedding.

    The image is shrunk (aspect ratio preserved) when it exceeds ``max_size``
    so the embedded payload stays manageable. The caller's image object is
    never modified: resizing and mode conversion work on copies.

    Args:
        pil_image: Source PIL image.
        format: Encoder name, case-insensitive ('JPEG', 'PNG', ...).
            'JPG' is accepted as an alias for 'JPEG'.
        max_size: (max_width, max_height) bound in pixels.

    Returns:
        A string of the form ``data:image/<format>;base64,<payload>``.
    """
    # Normalise once so 'jpeg' / 'JPG' take the same path as 'JPEG'.
    fmt = format.upper()
    if fmt == 'JPG':
        fmt = 'JPEG'

    # Resize if image is too large (thumbnail() mutates, hence the copy).
    if pil_image.size[0] > max_size[0] or pil_image.size[1] > max_size[1]:
        pil_image = pil_image.copy()
        pil_image.thumbnail(max_size, Image.Resampling.LANCZOS)

    save_kwargs = {}
    if fmt == 'JPEG':
        # Only the JPEG encoder gets a quality setting; other encoders are
        # called without it instead of receiving quality=None.
        save_kwargs['quality'] = 85

        # JPEG cannot store alpha or palette data. Anything outside the
        # modes its encoder accepts is converted first, otherwise save()
        # raises and the image is lost.
        jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
        if pil_image.mode not in jpeg_modes:
            has_alpha = (
                pil_image.mode in ('RGBA', 'LA', 'PA')
                or 'transparency' in getattr(pil_image, 'info', {})
            )
            if has_alpha:
                # Flatten transparency onto a white background.
                rgba_image = pil_image.convert('RGBA')
                flattened = Image.new('RGB', rgba_image.size, (255, 255, 255))
                flattened.paste(rgba_image, mask=rgba_image.split()[3])
                pil_image = flattened
            else:
                pil_image = pil_image.convert('RGB')

    buffered = BytesIO()
    pil_image.save(buffered, format=fmt, **save_kwargs)

    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/{fmt.lower()};base64,{img_str}"
|
| 119 |
|
| 120 |
|
| 121 |
def detect_figure_regions(text_result, original_image):
|
| 122 |
"""
|
| 123 |
+
Detect figure regions from OCR output using bounding boxes
|
| 124 |
Returns list of cropped figure images
|
| 125 |
"""
|
|
|
|
|
|
|
| 126 |
figure_images = []
|
| 127 |
|
| 128 |
# Pattern to detect bounding boxes (if model returns them)
|
|
|
|
| 212 |
# ===== Embed images if requested =====
|
| 213 |
if embed_images and markdown_content:
|
| 214 |
# Check if markdown mentions figures/charts/images
|
| 215 |
+
figure_keywords = ['figure', 'chart', 'graph', 'diagram', 'image', 'plot', 'illustration', 'table', 'screenshot']
|
| 216 |
has_figure_mention = any(keyword in markdown_content.lower() for keyword in figure_keywords)
|
| 217 |
|
| 218 |
if has_figure_mention:
|
| 219 |
+
# Try to detect figure regions from bounding boxes
|
| 220 |
figure_images = detect_figure_regions(plain_text_result, image)
|
| 221 |
|
| 222 |
+
# If specific figures detected, embed them
|
| 223 |
+
if figure_images:
|
| 224 |
+
figures_markdown = "\n\n### Detected Figures\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
for idx, fig_img in enumerate(figure_images):
|
| 226 |
+
try:
|
| 227 |
+
base64_img = image_to_base64(fig_img, format='PNG')
|
| 228 |
+
figures_markdown += f"\n\n"
|
| 229 |
+
except Exception as e:
|
| 230 |
+
print(f"Error embedding figure {idx+1}: {e}")
|
| 231 |
+
markdown_content += figures_markdown
|
| 232 |
+
else:
|
| 233 |
+
# No specific regions detected, but figures mentioned
|
| 234 |
+
# Embed full page image for context
|
| 235 |
+
try:
|
| 236 |
+
base64_img = image_to_base64(image, format='JPEG')
|
| 237 |
+
page_image_markdown = f"\n\n### Page {page_num} Visual Content\n\n\n\n"
|
| 238 |
+
|
| 239 |
+
# Insert image after first paragraph or at the beginning
|
| 240 |
+
lines = markdown_content.split('\n\n', 1)
|
| 241 |
+
if len(lines) > 1:
|
| 242 |
+
markdown_content = lines[0] + page_image_markdown + lines[1]
|
| 243 |
+
else:
|
| 244 |
+
markdown_content = page_image_markdown + markdown_content
|
| 245 |
+
except Exception as e:
|
| 246 |
+
print(f"Error embedding page image: {e}")
|
| 247 |
|
| 248 |
+
return markdown_content, plain_text_result
|
| 249 |
|
| 250 |
|
| 251 |
# ===== Main Processing Function =====
|
|
|
|
| 253 |
def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, embed_images, progress=gr.Progress()):
|
| 254 |
"""
|
| 255 |
Process PDF with DeepSeek-OCR and return combined markdown from all pages.
|
| 256 |
+
Includes both visual images and extracted text content.
|
| 257 |
"""
|
| 258 |
if pdf_file is None:
|
| 259 |
+
return "Please upload a PDF file first.", "Please upload a PDF file first."
|
| 260 |
|
| 261 |
# handle CPU/GPU
|
| 262 |
if torch.cuda.is_available():
|
|
|
|
| 271 |
total_pages = len(images)
|
| 272 |
|
| 273 |
if total_pages == 0:
|
| 274 |
+
return "No pages found in the PDF.", "No pages found in the PDF."
|
| 275 |
|
| 276 |
# Extract embedded images if needed
|
| 277 |
embedded_images = {}
|
| 278 |
if embed_images:
|
| 279 |
+
progress(0.05, desc="Extracting embedded images from PDF...")
|
| 280 |
+
try:
|
| 281 |
+
embedded_images = extract_images_from_pdf(pdf_file.name)
|
| 282 |
+
print(f"Found embedded images on {len(embedded_images)} pages")
|
| 283 |
+
except Exception as e:
|
| 284 |
+
print(f"Could not extract embedded images: {e}")
|
| 285 |
|
| 286 |
progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")
|
| 287 |
|
|
|
|
| 296 |
desc=f"Processing page {page_num}/{total_pages}..."
|
| 297 |
)
|
| 298 |
|
| 299 |
+
markdown_content, plain_text = process_single_page(
|
| 300 |
image,
|
| 301 |
model_runtime,
|
| 302 |
tokenizer,
|
|
|
|
| 311 |
|
| 312 |
# Add embedded images from PDF if any
|
| 313 |
if embed_images and (page_num - 1) in embedded_images:
|
| 314 |
+
markdown_content += "\n\n### Embedded Images from PDF\n\n"
|
| 315 |
for idx, img in enumerate(embedded_images[page_num - 1]):
|
| 316 |
+
try:
|
| 317 |
+
base64_img = image_to_base64(img, format='PNG')
|
| 318 |
+
markdown_content += f"\n\n"
|
| 319 |
+
except Exception as e:
|
| 320 |
+
print(f"Error embedding PDF image {idx+1}: {e}")
|
| 321 |
|
| 322 |
+
# Add page separator and content
|
| 323 |
page_header = f"\n\n---\n\n# Page {page_num}\n\n"
|
| 324 |
all_markdown_results.append(page_header + markdown_content)
|
| 325 |
|
|
|
|
| 329 |
gc.collect()
|
| 330 |
|
| 331 |
except Exception as e:
|
| 332 |
+
error_msg = f"\n\n---\n\n# Page {page_num}\n\n**⚠️ Error processing this page:** {str(e)}\n\n"
|
| 333 |
all_markdown_results.append(error_msg)
|
| 334 |
print(f"Error on page {page_num}: {str(e)}")
|
| 335 |
+
|
| 336 |
+
# Clear memory even on error
|
| 337 |
+
if torch.cuda.is_available():
|
| 338 |
+
torch.cuda.empty_cache()
|
| 339 |
+
gc.collect()
|
| 340 |
continue
|
| 341 |
|
| 342 |
# Combine all results
|
| 343 |
+
progress(1.0, desc="Finalizing document...")
|
| 344 |
combined_markdown = "\n\n".join(all_markdown_results)
|
| 345 |
|
| 346 |
+
# Add document header with metadata
|
| 347 |
+
image_status = "✅ Enabled" if embed_images else "❌ Disabled"
|
| 348 |
+
final_output = f"""# 📄 Document OCR Results
|
| 349 |
+
|
| 350 |
+
**Total Pages:** {total_pages}
|
| 351 |
+
**Model Size:** {model_size}
|
| 352 |
+
**Task Type:** {task_type}
|
| 353 |
+
**Image Embedding:** {image_status}
|
| 354 |
+
|
| 355 |
+
---
|
| 356 |
+
|
| 357 |
+
{combined_markdown}
|
| 358 |
+
|
| 359 |
+
---
|
| 360 |
+
|
| 361 |
+
**End of Document** - Processed {total_pages} pages successfully.
|
| 362 |
+
"""
|
| 363 |
|
| 364 |
+
return final_output, final_output # Return twice: once for preview, once for raw text
|
| 365 |
|
| 366 |
except Exception as e:
|
| 367 |
+
error_message = f"""# ❌ Error Processing PDF
|
| 368 |
+
|
| 369 |
+
**Error:** {str(e)}
|
| 370 |
+
|
| 371 |
+
**Troubleshooting Tips:**
|
| 372 |
+
- Try using a smaller model size (Tiny or Small)
|
| 373 |
+
- Disable image embedding for faster processing
|
| 374 |
+
- Check if the PDF is corrupted or password-protected
|
| 375 |
+
- For very large PDFs (50+ pages), consider processing in batches
|
| 376 |
+
- Ensure you have enough GPU memory available
|
| 377 |
+
|
| 378 |
+
**Technical Details:**
|
| 379 |
+
```
|
| 380 |
+
{str(e)}
|
| 381 |
+
```
|
| 382 |
+
"""
|
| 383 |
print(f"Fatal error: {str(e)}")
|
| 384 |
+
import traceback
|
| 385 |
+
traceback.print_exc()
|
| 386 |
+
return error_message, error_message # Return twice: once for preview, once for raw text
|
| 387 |
|
| 388 |
|
| 389 |
# ===== Theme and UI =====
|
|
|
|
| 391 |
font=fonts.GoogleFont("Inter"),
|
| 392 |
font_mono=fonts.GoogleFont("JetBrains Mono"),
|
| 393 |
)
|
| 394 |
+
|
| 395 |
custom_css = """
|
| 396 |
.gradio-container, body {
|
| 397 |
font-family: 'Inter', ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', 'Apple Color Emoji','Segoe UI Emoji','Segoe UI Symbol','Noto Color Emoji' !important;
|
|
|
|
| 399 |
.prose h1 { font-weight: 800; letter-spacing: -0.02em; }
|
| 400 |
.prose h2, .prose h3 { font-weight: 700; letter-spacing: -0.01em; }
|
| 401 |
.gr-button { border-radius: 12px; font-weight: 600; }
|
| 402 |
+
.prose img { max-width: 100%; height: auto; border-radius: 8px; margin: 1rem 0; }
|
| 403 |
"""
|
| 404 |
|
| 405 |
|
|
|
|
| 416 |
Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
|
| 417 |
Each page is processed sequentially and combined into a single markdown document.
|
| 418 |
|
| 419 |
+
## ✨ Features
|
| 420 |
+
|
| 421 |
+
- 🖼️ **Image Embedding** - Charts, graphs, and figures embedded directly in markdown
|
| 422 |
+
- 📝 **Text Extraction** - All text content from images and charts extracted
|
| 423 |
+
- 📊 **Table Support** - Tables converted to markdown format
|
| 424 |
+
- 🔍 **Object Detection** - Locate specific elements in documents
|
| 425 |
+
- 🎯 **Multiple Models** - Choose speed vs. accuracy trade-off
|
| 426 |
+
|
| 427 |
+
## 📏 Model Sizes
|
| 428 |
|
| 429 |
+
- **Tiny** — Fastest, lower accuracy (512×512) - Best for large PDFs (30+ pages)
|
| 430 |
+
- **Small** — Fast, good accuracy (640×640) - Good for 15-30 pages
|
|
|
|
| 431 |
- **Base** — Balanced performance (1024×1024) - Good for 10-20 pages
|
| 432 |
- **Large** — Best accuracy, slower (1280×1280) - Best for <10 pages
|
| 433 |
- **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
|
| 434 |
|
| 435 |
+
## 💡 Tips
|
| 436 |
+
|
| 437 |
+
- Enable **"Embed Images"** to include charts/figures (recommended)
|
| 438 |
+
- Use **Tiny or Small** model for large PDFs (20+ pages)
|
| 439 |
+
- Processing time: ~2-5 seconds per page depending on model
|
| 440 |
- Maximum recommended: 50 pages at once
|
| 441 |
+
- Image embedding increases file size (~1-2MB per page with images)
|
| 442 |
"""
|
| 443 |
)
|
| 444 |
|
| 445 |
with gr.Row():
|
| 446 |
with gr.Column(scale=1):
|
| 447 |
pdf_input = gr.File(
|
| 448 |
+
label="📎 Upload PDF",
|
| 449 |
file_types=[".pdf"],
|
| 450 |
type="filepath"
|
| 451 |
)
|
|
|
|
| 453 |
model_size = gr.Dropdown(
|
| 454 |
choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
|
| 455 |
value="Small",
|
| 456 |
+
label="🎯 Model Size",
|
| 457 |
info="Use Tiny/Small for large PDFs (20+ pages)"
|
| 458 |
)
|
| 459 |
|
|
|
|
| 465 |
"🔍 Locate Object by Reference",
|
| 466 |
],
|
| 467 |
value="📄 Convert to Markdown",
|
| 468 |
+
label="📋 Task Type",
|
| 469 |
)
|
| 470 |
|
| 471 |
ref_text_input = gr.Textbox(
|
| 472 |
+
label="🔍 Reference Text (for Locate task)",
|
| 473 |
placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
|
| 474 |
visible=False,
|
| 475 |
)
|
| 476 |
|
| 477 |
+
with gr.Row():
|
| 478 |
+
eval_mode_checkbox = gr.Checkbox(
|
| 479 |
+
value=False,
|
| 480 |
+
label="⚡ Evaluation Mode",
|
| 481 |
+
info="Plain text only (faster)",
|
| 482 |
+
)
|
| 483 |
+
|
| 484 |
+
embed_images_checkbox = gr.Checkbox(
|
| 485 |
+
value=True,
|
| 486 |
+
label="🖼️ Embed Images",
|
| 487 |
+
info="Include charts/figures in output",
|
| 488 |
+
)
|
| 489 |
|
| 490 |
submit_btn = gr.Button("🚀 Process PDF", variant="primary", size="lg")
|
| 491 |
|
| 492 |
gr.Markdown(
|
| 493 |
"""
|
| 494 |
---
|
|
|
|
|
|
|
| 495 |
|
| 496 |
+
### 📊 Processing Status
|
| 497 |
+
|
| 498 |
+
Watch the progress bar for real-time updates.
|
| 499 |
+
|
| 500 |
+
**Note:** Image embedding provides both:
|
| 501 |
+
- 👁️ Visual image (embedded as base64)
|
| 502 |
+
- 📝 Extracted text content (OCR'd from image)
|
| 503 |
+
|
| 504 |
+
You get the best of both worlds!
|
| 505 |
"""
|
| 506 |
)
|
| 507 |
|
| 508 |
with gr.Column(scale=2):
|
| 509 |
+
gr.Markdown("### 📝 Markdown Output Preview")
|
| 510 |
output_markdown_preview = gr.Markdown(
|
| 511 |
label="Rendered Markdown",
|
| 512 |
+
value="*Upload a PDF and click 'Process PDF' to see results here.*\n\n*The output will include both images and extracted text.*"
|
| 513 |
)
|
| 514 |
|
| 515 |
+
gr.Markdown("### 📄 Raw Markdown Source (Copy/Download)")
|
| 516 |
output_text = gr.Textbox(
|
| 517 |
label="Raw Markdown",
|
| 518 |
lines=25,
|
| 519 |
show_copy_button=True,
|
| 520 |
interactive=False,
|
| 521 |
+
placeholder="Markdown source will appear here... You can copy/paste this into any markdown editor."
|
| 522 |
)
|
| 523 |
|
| 524 |
# show/hide reference text box based on selected task
|
| 525 |
def toggle_ref_text_visibility(task):
    """Return a Textbox update that is visible only for the Locate task."""
    is_locate_task = task == "🔍 Locate Object by Reference"
    return gr.Textbox(visible=is_locate_task)
|
| 527 |
|
| 528 |
task_type.change(
|
| 529 |
fn=toggle_ref_text_visibility,
|
|
|
|
| 531 |
outputs=ref_text_input,
|
| 532 |
)
|
| 533 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
submit_btn.click(
|
| 535 |
fn=process_pdf,
|
| 536 |
inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox, embed_images_checkbox],
|
| 537 |
+
outputs=[output_markdown_preview, output_text],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
)
|
| 539 |
|
| 540 |
|
|
|
|
| 546 |
default_concurrency_limit=2
|
| 547 |
)
|
| 548 |
demo.launch(
|
| 549 |
+
max_threads=40, # Increase thread limit for better concurrency
|
| 550 |
+
show_error=True, # Show errors in UI for debugging
|
| 551 |
+
share=False # Set to True to create a public link
|
| 552 |
)
|