"""Gradio demo for DeepSeek-OCR.

Loads the model once at import time, exposes ``process_image`` as the
inference entry point, and builds a small Blocks UI around it.
"""

import os
import tempfile

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Load model and tokenizer once at import time; they are reused by every request.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()

# Name of the preset used when an unknown model size is requested.
DEFAULT_MODEL_SIZE = "Gundam (Recommended)"

# Inference parameters for each preset offered in the "Model Size" dropdown.
SIZE_CONFIGS = {
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "Gundam (Recommended)": {
        "base_size": 1024,
        "image_size": 640,
        "crop_mode": True,
    },
}

# Introductory text rendered at the top of the page.
DESCRIPTION_MD = """
# DeepSeek-OCR Demo

Upload an image to extract text using DeepSeek-OCR model.
Supports various document types and handwriting recognition.

**Model Sizes:**
- **Tiny**: Fastest, lower accuracy (512x512)
- **Small**: Fast, good accuracy (640x640)
- **Base**: Balanced performance (1024x1024)
- **Large**: Best accuracy, slower (1280x1280)
- **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
"""


def _build_prompt(task_type):
    """Return the model prompt for *task_type*; unknown tasks use free OCR."""
    # NOTE(review): the upstream DeepSeek-OCR usage examples start prompts with
    # an `<image>` placeholder. It is absent from these strings - confirm it
    # was not lost (e.g. stripped as an HTML tag) before relying on them.
    if task_type == "Convert to Markdown":
        return "\n<|grounding|>Convert the document to markdown. "
    return "\nFree OCR. "


def _save_as_jpeg(image, path):
    """Save *image* to *path*, converting modes the JPEG writer rejects."""
    # PNG and clipboard uploads are frequently RGBA or palette ("P") images,
    # which Pillow cannot write as JPEG; normalise those to RGB first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    image.save(path)


@spaces.GPU
def process_image(image, model_size, task_type, is_eval_mode=False):
    """
    Process image with DeepSeek-OCR and return multiple output formats.

    Args:
        image: PIL Image to run OCR on, or None when nothing was uploaded.
        model_size: Key of SIZE_CONFIGS; unknown values fall back to
            DEFAULT_MODEL_SIZE.
        task_type: "Free OCR" or "Convert to Markdown"; any other value is
            treated as "Free OCR".
        is_eval_mode: Forwarded to the model's ``infer`` call as ``eval_mode``.

    Returns:
        A tuple containing:
        - The annotated image (PIL Image) if the model wrote one, else None.
        - The content of the markdown result file, or a notice if it is absent.
        - The value returned by ``infer`` if truthy, otherwise the markdown
          content.
    """
    if image is None:
        return None, "Please upload an image first.", "Please upload an image first."

    model_gpu = model.cuda().to(torch.bfloat16)

    # Everything the model reads or writes lives in a throw-away directory.
    with tempfile.TemporaryDirectory() as output_path:
        prompt = _build_prompt(task_type)

        # Save uploaded image temporarily so it can be passed as a file path.
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        _save_as_jpeg(image, temp_image_path)

        config = SIZE_CONFIGS.get(model_size, SIZE_CONFIGS[DEFAULT_MODEL_SIZE])

        # Run inference
        plain_text_result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,  # Ensure results are saved to disk
            test_compress=True,
            eval_mode=is_eval_mode,
        )

        # Paths of the files the model is expected to write into output_path.
        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
        markdown_result_path = os.path.join(output_path, "result.mmd")

        # Read the markdown file content if it exists
        if os.path.exists(markdown_result_path):
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        else:
            markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."

        # Load the annotated image fully into memory: the temporary directory
        # (and the file with it) is deleted as soon as this block exits.
        result_image = None
        if os.path.exists(image_result_path):
            result_image = Image.open(image_result_path)
            result_image.load()

    # Prefer the text returned by infer(); fall back to the markdown content.
    text_result = plain_text_result if plain_text_result else markdown_content
    return result_image, markdown_content, text_result


# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(DESCRIPTION_MD)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )
            model_size = gr.Dropdown(
                choices=list(SIZE_CONFIGS),
                value=DEFAULT_MODEL_SIZE,
                label="Model Size",
            )
            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )
            eval_mode_checkbox = gr.Checkbox(
                value=False,
                label="Enable Evaluation Mode",
                info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
            )
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Annotated Image"):
                    output_image = gr.Image(interactive=False)
                with gr.TabItem("Markdown Preview"):
                    output_markdown = gr.Markdown()
                with gr.TabItem("Markdown Source(or Eval Output)"):
                    output_text = gr.Textbox(
                        lines=20,
                        show_copy_button=True,
                        interactive=False,
                    )

    # Examples: one value per input component, in the same order as `inputs`
    # (image, model size, task type, evaluation mode).
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown", False],
            ["examples/receipt.jpg", "Base", "Convert to Markdown", False],
            ["examples/receipt-2.png", "Base", "Convert to Markdown", False],
        ],
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
        fn=process_image,
        cache_examples=True,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
    )

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()