import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import spaces
import tempfile
import json
from pathlib import Path

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()


@spaces.GPU(duration=120)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    preset: str = "gundam",
) -> str:
    """
    Process image and extract text using DeepSeek-OCR model.

    Args:
        image_input: Input image
        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
        preset: Preset configuration for model parameters

    Returns:
        Extracted text or markdown content
    """
    if image_input is None:
        return "Please upload an image first."

    # Move model to GPU and set dtype
    model.cuda().to(torch.bfloat16)

    # Create temp directory for this session
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save image with proper format
        temp_image_path = os.path.join(temp_dir, "input_image.jpg")

        # Convert RGBA to RGB if necessary
        if image_input.mode in ('RGBA', 'LA', 'P'):
            rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
            # Handle different image modes
            if image_input.mode == 'RGBA':
                rgb_image.paste(image_input, mask=image_input.split()[3])
            else:
                rgb_image.paste(image_input)
            rgb_image.save(temp_image_path, 'JPEG', quality=95)
        else:
            image_input.save(temp_image_path, 'JPEG', quality=95)

        # Set parameters based on preset
        presets = {
            "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = presets[preset]

        # Set prompt based on task type; the "<image>" placeholder marks where
        # the image is injected and is required by the model's infer interface
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "
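
        # NOTE: `model.infer` is provided by the model's trust_remote_code
        # implementation; its signature, return type, and output filenames are
        # not a stable API and may vary between model revisions, hence the
        # defensive result parsing below.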
" # Run inference with save_results=True to save output result = model.infer( tokenizer, prompt=prompt, image_file=temp_image_path, output_path=temp_dir, base_size=config["base_size"], image_size=config["image_size"], crop_mode=config["crop_mode"], save_results=True, test_compress=True, ) # Try to read the saved results extracted_text = "" # Check for saved JSON results json_path = Path(temp_dir) / "input_image_outputs.json" if json_path.exists(): try: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) # Extract text from the JSON structure if isinstance(data, dict): if 'text' in data: extracted_text = data['text'] elif 'output' in data: extracted_text = data['output'] elif 'result' in data: extracted_text = data['result'] else: # If the structure is different, try to get the first string value for key, value in data.items(): if isinstance(value, str) and len(value) > 10: extracted_text = value break elif isinstance(data, list) and len(data) > 0: extracted_text = str(data[0]) else: extracted_text = str(data) except Exception as e: print(f"Error reading JSON: {e}") # If no JSON, check for text file if not extracted_text: txt_path = Path(temp_dir) / "input_image_outputs.txt" if txt_path.exists(): try: with open(txt_path, 'r', encoding='utf-8') as f: extracted_text = f.read() except Exception as e: print(f"Error reading text file: {e}") # If still no text, check for any output files if not extracted_text: output_files = list(Path(temp_dir).glob("*output*")) for file_path in output_files: if file_path.suffix in ['.txt', '.json', '.md']: try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() if content.strip(): extracted_text = content break except Exception as e: print(f"Error reading {file_path}: {e}") # If we still don't have text but result is not None, use result directly if not extracted_text and result is not None: if isinstance(result, str): extracted_text = result elif isinstance(result, (list, tuple)) and len(result) > 0: extracted_text = str(result[0]) else: extracted_text = str(result) # Move model back to CPU to free GPU memory model.to("cpu") torch.cuda.empty_cache() # Return the extracted text return extracted_text if extracted_text else "No text could be extracted from the image. Please try a different preset or check if the image contains readable text." # Create Gradio interface with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo: gr.HTML( """

# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>🔍 DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with anycoder</p>
        </div>
""" ) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### 📤 Upload Image") image_input = gr.Image( label="Input Image", type="pil", sources=["upload", "webcam", "clipboard"], height=300, ) gr.Markdown("### âš™ī¸ Settings") task_type = gr.Radio( choices=["ocr", "markdown"], value="ocr", label="Task Type", info="OCR: Extract plain text | Markdown: Convert to formatted markdown", ) preset = gr.Radio( choices=["gundam", "base", "large", "small", "tiny"], value="gundam", label="Model Preset", info="Start with 'gundam' - it's optimized for most documents", ) with gr.Accordion("â„šī¸ Preset Details", open=False): gr.Markdown(""" - **Gundam** (Recommended): Balanced performance with crop mode - **Base**: Standard quality without cropping - **Large**: Highest quality for complex documents - **Small**: Faster processing, good for simple text - **Tiny**: Fastest, suitable for clear printed text """) submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg") clear_btn = gr.ClearButton([image_input], value="đŸ—‘ī¸ Clear") with gr.Column(scale=1): gr.Markdown("### 📝 Extracted Text") output_text = gr.Textbox( label="Output", lines=15, max_lines=30, interactive=False, placeholder="Extracted text will appear here...", show_copy_button=True, ) # Event handlers submit_btn.click( fn=ocr_process, inputs=[image_input, task_type, preset], outputs=output_text, ) # Example section with receipt image gr.Markdown("### 📚 Example") gr.Examples( examples=[ ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"], ], inputs=[image_input, task_type, preset], label="Try this receipt example", ) gr.Markdown(""" ### 💡 Tips for Best Results - **For receipts**: Use "ocr" mode with "gundam" or "base" preset - **For documents with tables**: Use "markdown" mode with "large" preset - **If text is not detected**: Try different presets in this order: gundam → base → large - **For handwritten text**: Use "large" preset for better accuracy - Ensure images are clear and well-lit for optimal results """) if __name__ == "__main__": demo.launch(share=False)