# DeepSeek-OCR / app.py
# Committed by akhaliq (HF Staff): "Update app.py", commit 2643bec (verified), 5.48 kB.
import io
import os
import tempfile
from typing import Optional

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Load the tokenizer and the model once, at import time, so every request
# reuses the same objects. `trust_remote_code=True` lets transformers run the
# modelling code that ships with the checkpoint repository.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# `.eval()` returns the module itself, so the call can be chained onto loading.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_safetensors=True,
    _attn_implementation="flash_attention_2",
).eval()
def _flatten_to_rgb(image: Image.Image) -> Image.Image:
    """Return *image* as an RGB image that the JPEG encoder can write.

    Images that carry transparency are composited onto a white background so
    that whatever colour sits underneath transparent pixels does not leak into
    the result; any other non-RGB mode is converted directly.
    """
    if image.mode == "RGB":
        return image
    has_alpha = "A" in image.getbands() or "transparency" in image.info
    if not has_alpha:
        return image.convert("RGB")
    rgba = image.convert("RGBA")
    background = Image.new("RGB", rgba.size, (255, 255, 255))
    background.paste(rgba, mask=rgba.getchannel("A"))
    return background


@spaces.GPU(duration=120)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """Process an image and extract text using the DeepSeek-OCR model.

    Args:
        image_input: Input image; ``None`` when nothing has been uploaded.
        task_type: ``"markdown"`` selects the document-to-markdown prompt;
            any other value selects the plain "Free OCR" prompt.
        base_size: Base size forwarded to ``model.infer``.
        image_size: Target image size forwarded to ``model.infer``.
        crop_mode: Whether to enable crop mode in ``model.infer``.

    Returns:
        The text returned by the model, a fallback message when the model
        returns nothing, a prompt to upload an image when ``image_input`` is
        ``None``, or an ``"Error processing image: ..."`` message when any
        step raises.
    """
    if image_input is None:
        return "Please upload an image first."

    # Set prompt based on task type
    if task_type == "markdown":
        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    else:
        prompt = "<image>\nFree OCR. "

    temp_image_path = None
    try:
        # Use a unique file per call: a fixed path would let concurrent
        # requests overwrite each other's image before inference reads it.
        fd, temp_image_path = tempfile.mkstemp(
            prefix="temp_ocr_image_", suffix=".jpg"
        )
        os.close(fd)
        # JPEG cannot store alpha/palette modes, so normalise to RGB first.
        _flatten_to_rgb(image_input).save(temp_image_path, format="JPEG")

        # Move model to GPU and set dtype
        model.to("cuda")
        model.to(torch.bfloat16)

        # Run inference
        # NOTE(review): the result is only useful if infer() returns the
        # recognised text for these arguments — confirm against the model's
        # remote code (an `eval_mode` flag may be required).
        output = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path="",
            # Sliders may deliver numeric values as floats; coerce to int.
            base_size=int(base_size),
            image_size=int(image_size),
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
        )
        return output if output else "No text detected in image."
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error processing image: {e}"
    finally:
        # Clean up temp file on success and on failure alike.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless
        # Move model back to CPU to free GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
# Create Gradio interface: inputs and settings in the left column, the
# extracted text in the right column, with examples underneath.
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🔍 DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",  # ocr_process expects a PIL image
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )
            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )
            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Enable crop mode for better processing",
            )
            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )
            copy_btn = gr.Button("📋 Copy Output")

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )
    # The copy itself happens in the browser via `js`; the Python callback
    # just passes the text through unchanged.
    copy_btn.click(
        fn=lambda text: text,
        inputs=output_text,
        outputs=output_text,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
    )

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
            [
                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
                "markdown",
            ],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )

if __name__ == "__main__":
    demo.launch(share=False)