import os
import tempfile

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Pin the visible GPU and pick a device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer. flash_attention_2 needs the flash-attn package and
# a CUDA device, so fall back to the default eager implementation on CPU.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2" if device == "cuda" else "eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(device)
if device == "cuda":
    model = model.to(torch.bfloat16)


def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """
    Process an image and extract text using the DeepSeek-OCR model.

    Args:
        image_input: Input image
        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
        base_size: Base size for model processing
        image_size: Target image size
        crop_mode: Whether to use crop mode

    Returns:
        Extracted text or markdown content
    """
    if image_input is None:
        return "Please upload an image first."

    try:
        # Save the image to a unique temp file; converting to RGB first lets
        # RGBA/palette images be written as JPEG, and the unique name avoids
        # collisions between concurrent requests.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            temp_image_path = tmp.name
        image_input.convert("RGB").save(temp_image_path)

        # Set the prompt based on task type; the <image> placeholder is part of
        # DeepSeek-OCR's expected prompt format.
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        try:
            # Run inference through the model's custom infer() method
            # (provided via trust_remote_code)
            output = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=temp_image_path,
                output_path="",
                base_size=base_size,
                image_size=image_size,
                crop_mode=crop_mode,
                save_results=False,
                test_compress=False,
            )
        finally:
            # Clean up the temp file even if inference fails
            if os.path.exists(temp_image_path):
                os.remove(temp_image_path)

        return output if output else "No text detected in image."

    except Exception as e:
        return f"Error processing image: {str(e)}"
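
# Direct-call usage (a sketch, not part of the UI): ocr_process can be exercised
# from a REPL or a test without launching Gradio. "sample_doc.png" is a
# hypothetical placeholder path.
#
#     text = ocr_process(Image.open("sample_doc.png"), task_type="markdown")
#     print(text)
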
# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>🔍 DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with anycoder</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )
            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )
            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Enable crop mode for better processing",
            )

            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )
            copy_btn = gr.Button("📋 Copy Output")

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )

    # The copy button works client-side: the js snippet writes the current
    # output to the clipboard, and the identity fn passes the text through.
    copy_btn.click(
        fn=lambda text: text,
        inputs=output_text,
        outputs=output_text,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
    )

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                "https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500",
                "ocr",
            ],
            [
                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
                "markdown",
            ],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )


if __name__ == "__main__":
    demo.launch(share=False)