import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os

# Set device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"
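# Deployment note: the Space page labels this app "Running on Zero" (ZeroGPU).
# On ZeroGPU hardware the GPU-bound function is normally wrapped with
# @spaces.GPU from the `spaces` package; that decorator is absent here, so
# this is an assumption about the intended deployment, not original code.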
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(device)
if device == "cuda":
    model = model.to(torch.bfloat16)
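# Note: flash_attention_2 requires the flash-attn package and a CUDA GPU.
# On a CPU-only host, swapping in _attn_implementation="eager" above is a
# plausible fallback (assumption: the model's remote code accepts the
# standard transformers attention-implementation flags).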

def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """
    Process an image and extract text using the DeepSeek-OCR model.

    Args:
        image_input: Input image
        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
        base_size: Base size for model processing
        image_size: Target image size
        crop_mode: Whether to use crop mode

    Returns:
        Extracted text or markdown content
    """
    if image_input is None:
        return "Please upload an image first."

    temp_image_path = "/tmp/temp_ocr_image.jpg"
    try:
        # Save image temporarily; model.infer expects a file path.
        # Convert to RGB first, since PIL cannot write RGBA images as JPEG.
        image_input.convert("RGB").save(temp_image_path)

        # Set prompt based on task type
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Run inference
        output = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path="",
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
        )
        return output if output else "No text detected in image."
    except Exception as e:
        return f"Error processing image: {str(e)}"
    finally:
        # Clean up the temp file even if inference fails
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
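
# Hypothetical local smoke test (assumes a sample image at ./sample.jpg,
# which is not shipped with this Space):
#   from PIL import Image
#   print(ocr_process(Image.open("sample.jpg"), task_type="markdown"))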

# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )
            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )
            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Enable crop mode for better processing",
            )
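            # The defaults above (base_size=1024, image_size=640,
            # crop_mode=True) appear to match the "Gundam" preset from the
            # DeepSeek-OCR model card (assumption; the other presets use
            # equal base/image sizes with crop_mode=False).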
            submit_btn = gr.Button("Extract Text", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )
            copy_btn = gr.Button("Copy Output")
    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )
    copy_btn.click(
        fn=lambda text: text,
        inputs=output_text,
        outputs=output_text,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
    )
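    # Note: navigator.clipboard is only available in secure contexts (HTTPS
    # or localhost), so the copy button may fail when served over plain HTTP.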
    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
            [
                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
                "markdown",
            ],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )

if __name__ == "__main__":
    demo.launch(share=False)
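# Suggested Space dependencies, inferred from the imports above (versions
# unpinned; this list is an assumption, not shipped with the file):
#   gradio, torch, transformers, pillow, flash-attn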