import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
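# `spaces` provides the @spaces.GPU decorator for Hugging Face ZeroGPU Spaces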
import os
import tempfile
from PIL import Image
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
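# The model stays on CPU at startup; it is moved to the GPU inside the
# request handler, since ZeroGPU hardware only attaches a GPU while a
# @spaces.GPU-decorated function is running.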
@spaces.GPU
def process_image(image, model_size, task_type, is_eval_mode):
"""
Process image with DeepSeek-OCR and return multiple output formats.
Args:
image: PIL Image or file path
model_size: Model size configuration
task_type: OCR task type
Returns:
A tuple containing:
- Path to the image with bounding boxes.
- The content of the markdown result file.
- The plain text OCR result.
"""
if image is None:
return None, "Please upload an image first.", "Please upload an image first."
    model_gpu = model.cuda().to(torch.bfloat16)

    # Create a temporary directory for the output files
    with tempfile.TemporaryDirectory() as output_path:
        # Set the prompt based on task type
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR. "
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Save the uploaded image temporarily
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.save(temp_image_path)
        # Configure model size parameters
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {
                "base_size": 1024,
                "image_size": 640,
                "crop_mode": True,
            },
        }
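        # "Gundam" pairs 640-pixel local crops with a 1024-pixel global view
        # (crop_mode=True), DeepSeek-OCR's dynamic-resolution mode for large
        # or dense documents.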
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
        # Run inference
        plain_text_result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,  # ensure results are saved to disk
            test_compress=True,
            eval_mode=is_eval_mode,
        )
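        # When eval_mode is enabled, infer() is expected to return the plain
        # text directly and skip the annotated-image/markdown artifacts (see
        # the evaluation-mode checkbox description in the UI below).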
        # Define paths for the generated files
        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
        markdown_result_path = os.path.join(output_path, "result.mmd")

        # Read the markdown file content if it exists
        if os.path.exists(markdown_result_path):
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        else:
            markdown_content = (
                "Markdown result was not generated. "
                "This is expected for the 'Free OCR' task."
            )

        result_image = None
        # Load the annotated image eagerly: the temporary directory is deleted
        # when the `with` block exits, so the pixel data must be read now
        if os.path.exists(image_result_path):
            result_image = Image.open(image_result_path)
            result_image.load()

        # Return all three results; the annotated image is returned as a PIL
        # Image so it survives the cleanup of the temporary directory
        text_result = plain_text_result if plain_text_result else markdown_content
        return result_image, markdown_content, text_result
# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Demo

        Upload an image to extract text with the DeepSeek-OCR model.
        Supports various document types and handwriting recognition.

        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )
            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )
            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )
            eval_mode_checkbox = gr.Checkbox(
                value=False,
                label="Enable Evaluation Mode",
                info="Returns plain text only, which may be faster. Uncheck to get the annotated image and markdown.",
            )
            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Annotated Image"):
                    output_image = gr.Image(interactive=False)
                with gr.TabItem("Markdown Preview"):
                    output_markdown = gr.Markdown()
                with gr.TabItem("Markdown Source (or Eval Output)"):
                    output_text = gr.Textbox(
                        lines=20,
                        show_copy_button=True,
                        interactive=False,
                    )
    # Examples: each row supplies a value for every input component,
    # including the evaluation-mode checkbox
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown", False],
            ["examples/receipt.jpg", "Base", "Convert to Markdown", False],
            ["examples/receipt-2.png", "Base", "Convert to Markdown", False],
        ],
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
        fn=process_image,
        cache_examples=True,
    )
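    # With cache_examples=True, Gradio runs process_image on each example once
    # and replays the cached outputs afterwards, so clicking an example does
    # not consume GPU time on every request.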
    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
    )
# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()