# Spaces: Running on Zero
| import os | |
| os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0") # disable hf_transfer if missing | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModel, AutoTokenizer | |
| import spaces | |
| import tempfile | |
| from PIL import Image | |
| import re | |
| from gradio.themes import Soft | |
| from gradio.themes.utils import fonts | |
| import fitz # PyMuPDF for PDF processing | |
# ===== Model Load =====
model_name = "deepseek-ai/DeepSeek-OCR"

# Pick the attention backend BEFORE loading the model. The backend is consumed
# while the model is being constructed, so switching `config` afterwards is too
# late: a load that requests flash-attention without the package installed may
# already have failed, and existing attention layers are not rebuilt.
try:
    import flash_attn  # noqa: F401

    attn_implementation = "flash_attention_2"
except Exception:
    # Broad on purpose (as in the original fallback): a flash-attn install can
    # be present but unusable, which does not always surface as ImportError.
    # NOTE(review): confirm the model's remote code accepts "sdpa"; if it does
    # not, "eager" is the usual lowest-common-denominator choice.
    attn_implementation = "sdpa"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation=attn_implementation,
    trust_remote_code=True,
    use_safetensors=True,
)
# Inference only: switch off training-time behaviour such as dropout.
model = model.eval()
def pdf_to_images(pdf_path, dpi=200):
    """
    Convert PDF pages to PIL Images using PyMuPDF.

    Args:
        pdf_path: Path to PDF file
        dpi: Resolution for rendering (default 200)

    Returns:
        List of PIL Image objects in "RGB" mode, one per page, in page order

    Raises:
        Whatever PyMuPDF raises when the file cannot be opened or a page
        cannot be rendered; the document is closed before the error propagates.
    """
    # PyMuPDF renders at 72 DPI by default, so dpi / 72 is the zoom factor.
    # The matrix is identical for every page, so build it once.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    images = []
    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # alpha=False keeps the buffer at 3 bytes per pixel, which is what
            # Image.frombytes("RGB", ...) expects.
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def process_single_page(image, model_runtime, tokenizer, model_size, task_type, ref_text, is_eval_mode, output_path):
    """
    Process a single page/image with DeepSeek-OCR.

    Args:
        image: Page image; must provide ``save(path)`` (a PIL image in this app).
        model_runtime: Model object whose ``infer`` method runs the OCR.
        tokenizer: Tokenizer handed through to ``model_runtime.infer``.
        model_size: Size preset name; unknown names use "Gundam (Recommended)".
        task_type: Task label from the UI; unknown labels use the Free OCR prompt.
        ref_text: Reference text, required only for the Locate task.
        is_eval_mode: Forwarded to ``infer`` as ``eval_mode``.
        output_path: Existing directory for the temporary image and result files.

    Returns:
        The content of ``result.mmd`` written into ``output_path`` by this call
        when it exists and is non-empty; otherwise the value returned by
        ``infer`` when truthy; otherwise an empty string.

    Raises:
        gr.Error: If the Locate task is selected without reference text.
    """
    # ===== choose task prompt =====
    if task_type == "🔍 Locate Object by Reference":
        reference = (ref_text or "").strip()
        if not reference:
            raise gr.Error("Please provide reference text for the Locate task!")
        prompt = f"<image>\nLocate <|ref|>{reference}<|/ref|> in the image."
    else:
        task_prompts = {
            "📝 Free OCR": "<image>\nFree OCR.",
            "📄 Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown.",
            "📈 Parse Figure": "<image>\nParse the figure.",
        }
        prompt = task_prompts.get(task_type, "<image>\nFree OCR.")

    # ===== save image temporarily =====
    # The file is written as JPEG, which cannot store alpha or palette modes,
    # so convert anything that is not plain RGB / grayscale first.
    if getattr(image, "mode", "RGB") not in ("RGB", "L"):
        image = image.convert("RGB")
    temp_image_path = os.path.join(output_path, "temp_image.jpg")
    image.save(temp_image_path)

    # ===== size config =====
    size_configs = {
        "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
        "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
        "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
        "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
        "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
    }
    config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

    # The same output directory is reused for every page of a document. Drop
    # any result file left by a previous page so that a page whose inference
    # writes nothing cannot be reported with the previous page's text.
    markdown_result_path = os.path.join(output_path, "result.mmd")
    try:
        os.remove(markdown_result_path)
    except FileNotFoundError:
        pass

    # ===== inference =====
    with torch.no_grad():
        plain_text_result = model_runtime.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=is_eval_mode,
        )

    # ===== collect markdown result =====
    markdown_content = ""
    if os.path.exists(markdown_result_path):
        try:
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: an unreadable result file falls through to the
            # plain-text result below instead of failing the whole page.
            markdown_content = ""

    # If no markdown file, use plain text result
    if not markdown_content and plain_text_result:
        markdown_content = plain_text_result
    return markdown_content
# ===== Main Processing Function =====
def process_pdf(pdf_file, model_size, task_type, ref_text, is_eval_mode, progress=gr.Progress()):
    """
    Process PDF with DeepSeek-OCR and return combined markdown from all pages.

    Args:
        pdf_file: Uploaded PDF, either a filesystem path (what a
            ``gr.File(type="filepath")`` input supplies) or an object exposing
            the path as ``.name``. ``None`` means nothing was uploaded.
        model_size: Size preset name forwarded to ``process_single_page``.
        task_type: Task label forwarded to ``process_single_page``.
        ref_text: Reference text forwarded to ``process_single_page``.
        is_eval_mode: Evaluation-mode flag forwarded to ``process_single_page``.
        progress: Gradio progress tracker used to report per-page progress.

    Returns:
        A markdown string: a document header followed by one section per page,
        or a human-readable message when there is no input, the PDF has no
        pages, or processing raised an exception.

    NOTE(review): `spaces` is imported by this file but this function is not
    wrapped in `@spaces.GPU` — confirm whether the deployment needs it.
    """
    if pdf_file is None:
        return "Please upload a PDF file first."

    # The upload component is declared with type="filepath", which passes a
    # plain string; reading `.name` on it raised AttributeError on every
    # request. Keep accepting objects that carry the path in `.name` as well.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # handle CPU/GPU
    if torch.cuda.is_available():
        model_runtime = model.to("cuda", dtype=torch.bfloat16)
    else:
        model_runtime = model.to("cpu", dtype=torch.float32)

    try:
        # Convert PDF to images
        progress(0, desc="Converting PDF to images...")
        images = pdf_to_images(pdf_path)
        total_pages = len(images)
        if total_pages == 0:
            return "No pages found in the PDF."
        progress(0.1, desc=f"Found {total_pages} pages. Starting OCR...")

        # Process each page
        all_markdown_results = []
        with tempfile.TemporaryDirectory() as output_path:
            for page_num, image in enumerate(images, start=1):
                # Report the fraction of pages already finished, so the bar
                # only reaches 100% after the last page has been processed.
                progress(
                    ((page_num - 1) / total_pages) * 0.9 + 0.1,
                    desc=f"Processing page {page_num}/{total_pages}...",
                )
                markdown_content = process_single_page(
                    image,
                    model_runtime,
                    tokenizer,
                    model_size,
                    task_type,
                    ref_text,
                    is_eval_mode,
                    output_path,
                )
                # Add page separator
                page_header = f"\n\n---\n\n# Page {page_num}\n\n"
                all_markdown_results.append(page_header + markdown_content)

        # Combine all results
        progress(1.0, desc="Finalizing...")
        combined_markdown = "\n\n".join(all_markdown_results)
        # Add document header
        final_output = f"# Document OCR Results\n\n**Total Pages:** {total_pages}\n\n{combined_markdown}"
        return final_output
    except Exception as e:
        # Top-level UI boundary: show the failure in the output box instead of
        # letting the request crash.
        return f"Error processing PDF: {str(e)}"
# ===== Theme and UI =====
# Soft base theme using Inter for regular text and JetBrains Mono for
# monospace text, both requested as Google Fonts.
theme = Soft(font=fonts.GoogleFont("Inter"), font_mono=fonts.GoogleFont("JetBrains Mono"))

# Extra CSS handed to gr.Blocks: font stack for the container, heavier
# headings inside rendered markdown, and rounded, bold buttons.
custom_css = (
    "\n"
    ".gradio-container, body {\n"
    "font-family: 'Inter', ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', 'Apple Color Emoji','Segoe UI Emoji','Segoe UI Symbol','Noto Color Emoji' !important;\n"
    "}\n"
    ".prose h1 { font-weight: 800; letter-spacing: -0.02em; }\n"
    ".prose h2, .prose h3 { font-weight: 700; letter-spacing: -0.01em; }\n"
    ".gr-button { border-radius: 12px; font-weight: 600; }\n"
)
# ===== Interface =====
# Components are created in the order they should appear: intro text, then a
# row with the controls on the left and the two result views on the right.
with gr.Blocks(
    css=custom_css,
    theme=theme,
    title="DeepSeek-OCR PDF Parser by Jatevo LLM Inference",
) as demo:
    gr.Markdown(
        """
# 📄 DeepSeek-OCR PDF Parser by Jatevo LLM Inference
Upload a PDF to extract text and convert to Markdown using **DeepSeek-OCR**.
Each page is processed sequentially and combined into a single markdown document.
**Model Sizes:**
- **Tiny** — Fastest, lower accuracy (512×512)
- **Small** — Fast, good accuracy (640×640)
- **Base** — Balanced performance (1024×1024)
- **Large** — Best accuracy, slower (1280×1280)
- **Gundam (Recommended)** — Optimized for documents (1024 base, 640 image, crop mode)
**Note:** Processing time depends on the number of pages and model size.
"""
    )

    with gr.Row():
        # Left column: upload and processing options.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                type="filepath",
                file_types=[".pdf"],
                label="Upload PDF",
            )
            model_size = gr.Dropdown(
                label="Model Size",
                value="Gundam (Recommended)",
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
            )
            task_type = gr.Dropdown(
                label="Task Type",
                value="📄 Convert to Markdown",
                choices=[
                    "📝 Free OCR",
                    "📄 Convert to Markdown",
                    "📈 Parse Figure",
                    "🔍 Locate Object by Reference",
                ],
            )
            # Hidden until the Locate task is chosen (see the change handler).
            ref_text_input = gr.Textbox(
                visible=False,
                label="Reference Text (for Locate task)",
                placeholder="e.g., 'the teacher', '20-10', 'a red car'...",
            )
            eval_mode_checkbox = gr.Checkbox(
                label="Enable Evaluation Mode",
                info="Returns only plain text (faster).",
                value=False,
            )
            submit_btn = gr.Button("🚀 Process PDF", size="lg", variant="primary")

        # Right column: rendered preview above the raw markdown source.
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Markdown Output")
            output_markdown_preview = gr.Markdown(
                value="*Upload a PDF and click 'Process PDF' to see results here.*",
                label="Rendered Markdown",
            )
            gr.Markdown("### 📄 Markdown Source (Copy/Download)")
            output_text = gr.Textbox(
                label="Raw Markdown",
                placeholder="Markdown source will appear here...",
                interactive=False,
                show_copy_button=True,
                lines=25,
            )

    def toggle_ref_text_visibility(task):
        """Show the reference-text box only while the Locate task is selected."""
        wants_reference = task == "🔍 Locate Object by Reference"
        if wants_reference:
            return gr.Textbox(visible=True)
        return gr.Textbox(visible=False)

    task_type.change(
        fn=toggle_ref_text_visibility,
        inputs=task_type,
        outputs=ref_text_input,
    )

    def update_outputs(markdown_text):
        """Return the same markdown twice: once for the preview, once for the raw box."""
        preview_value = raw_value = markdown_text
        return preview_value, raw_value

    # Step 1 writes the OCR result into the raw textbox; step 2 copies that
    # text into the rendered preview (and back into the textbox unchanged).
    ocr_event = submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, model_size, task_type, ref_text_input, eval_mode_checkbox],
        outputs=output_text,
    )
    ocr_event.then(
        fn=update_outputs,
        inputs=output_text,
        outputs=[output_markdown_preview, output_text],
    )
# ===== Launch =====
# Only start the server when run as a script; enable the request queue
# (capped at 20 waiting requests) before launching.
if __name__ == "__main__":
    demo.queue(max_size=20).launch()