DeepSeek-OCR / app.py
akhaliq's picture
akhaliq HF Staff
Deploy Gradio app with multiple files
086e346 verified
raw
history blame
5.3 kB
import io
import os
import tempfile
from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Set device
# Expose only the first GPU to this process; the variable is assigned before
# the first CUDA query on the next line.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    # FlashAttention-2 kernels are CUDA-only, so requesting them on the CPU
    # fallback path makes loading fail; use the plain attention there.
    # NOTE(review): confirm the model's remote code accepts "eager".
    _attn_implementation="flash_attention_2" if device == "cuda" else "eager",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(device)
if device == "cuda":
    # Run in bfloat16 on GPU; on CPU the dtype chosen at load time is kept.
    model = model.to(torch.bfloat16)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """
    Process image and extract text using DeepSeek-OCR model.

    The image is written to a private scratch directory, handed to
    ``model.infer`` by path, and the scratch directory is removed again
    whether or not inference succeeds.

    Args:
        image_input: Input image; ``None`` yields a prompt to upload one.
        task_type: Type of task - "markdown" for document conversion; any
            other value (including the default "ocr") runs plain text
            extraction.
        base_size: Base size for model processing
        image_size: Target image size
        crop_mode: Whether to use crop mode

    Returns:
        Extracted text or markdown content, a "no text" message when the
        model returns a falsy result, or an "Error processing image: ..."
        message when anything in the pipeline raises.
    """
    if image_input is None:
        return "Please upload an image first."

    # Set prompt based on task type
    if task_type == "markdown":
        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    else:
        prompt = "<image>\nFree OCR. "

    try:
        # JPEG cannot store alpha or palette data, so e.g. an RGBA PNG would
        # make save() raise; normalise such images to RGB first.
        image = image_input
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")

        # A per-call scratch directory keeps concurrent requests from
        # overwriting each other's file, and the context manager removes it
        # even when saving or inference raises.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_image_path = os.path.join(work_dir, "temp_ocr_image.jpg")
            image.save(temp_image_path)

            # Run inference
            output = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=temp_image_path,
                # Give the model a writable, disposable directory instead of
                # "" in case it creates files or folders under output_path.
                # NOTE(review): confirm against the model's infer() code.
                output_path=work_dir,
                # UI sliders may deliver floats; the sizes are pixel counts.
                base_size=int(base_size),
                image_size=int(image_size),
                crop_mode=bool(crop_mode),
                save_results=False,
                test_compress=False,
            )

        # NOTE(review): confirm infer() returns the recognised text in this
        # configuration; a falsy result is reported as "no text".
        return output if output else "No text detected in image."
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing.
        return f"Error processing image: {str(e)}"
# Create Gradio interface
# Layout: a header, then one row with two equal columns (inputs and settings
# on the left, result on the right), followed by the example inputs.
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🔍 DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )
            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )
            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Enable crop mode for better processing",
            )
            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )
            copy_btn = gr.Button("📋 Copy Output")

    # Event handlers
    # The input order here must match the parameter order of ocr_process.
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )
    # Copying happens in the browser via the js snippet; the Python callback
    # only echoes the text back so the textbox content is left unchanged.
    copy_btn.click(
        fn=lambda text: text,
        inputs=output_text,
        outputs=output_text,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
    )

    # Examples section
    # Each example row supplies an image URL and the task type to preselect.
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
            [
                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
                "markdown",
            ],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )
if __name__ == "__main__":
    # Script entry point: serve the app without requesting a public share link.
    launch_options = {"share": False}
    demo.launch(**launch_options)