Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,501 Bytes
3da4f0d ca8cbba 3da4f0d ca8cbba 3da4f0d ca8cbba 3da4f0d 94fd0fd 3da4f0d ca8cbba 3da4f0d ca8cbba 3da4f0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
# Load model and tokenizer
# Model weights are fetched from the Hugging Face Hub at import time.
# trust_remote_code=True is required: DeepSeek-OCR ships its own modeling
# code (including the custom `infer` method used below) in the repo.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
model_name,
# flash-attention-2 kernels; assumes the package is installed in the Space
_attn_implementation="flash_attention_2",
trust_remote_code=True,
use_safetensors=True,
)
# Inference only: disable dropout etc. The model stays on CPU here and is
# moved to the GPU inside the @spaces.GPU-decorated handler (ZeroGPU pattern).
model = model.eval()
@spaces.GPU
def process_image(image, model_size, task_type):
    """
    Run DeepSeek-OCR on an uploaded image.

    Args:
        image: PIL.Image.Image from the Gradio image component, or None if
            the user pressed submit without uploading anything.
        model_size: One of "Tiny", "Small", "Base", "Large",
            "Gundam (Recommended)". Unknown values fall back to Gundam.
        task_type: "Free OCR" or "Convert to Markdown". Unknown values fall
            back to plain OCR.

    Returns:
        The text produced by the model's `infer` method (or a short hint
        string when no image was provided).
    """
    # Guard: Gradio passes None when the image input is empty; without this
    # the .save() call below would raise AttributeError.
    if image is None:
        return "Please upload an image first."

    # Move the CPU-resident model to the GPU *inside* the @spaces.GPU
    # function, as required by the ZeroGPU execution model.
    model_gpu = model.cuda().to(torch.bfloat16)

    # Prompt per task; unrecognized task types fall back to plain OCR.
    prompts = {
        "Free OCR": "<image>\nFree OCR. ",
        "Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown. ",
    }
    prompt = prompts.get(task_type, "<image>\nFree OCR. ")

    # Preset resolutions; "Gundam" tiles the page (crop_mode) and is the
    # recommended setting for documents.
    size_configs = {
        "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
        "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
        "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
        "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
        "Gundam (Recommended)": {
            "base_size": 1024,
            "image_size": 640,
            "crop_mode": True,
        },
    }
    config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

    # Temporary directory holds both the input snapshot and any files the
    # model writes (save_results=True); it is cleaned up automatically.
    with tempfile.TemporaryDirectory() as output_path:
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        # JPEG cannot store an alpha channel; clipboard/PNG uploads are
        # frequently RGBA, so convert to RGB before saving to avoid
        # "cannot write mode RGBA as JPEG".
        image.convert("RGB").save(temp_image_path)

        # Run inference with the custom `infer` method shipped with the
        # DeepSeek-OCR remote code.
        result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )

    print(f"====\nresult: {result}\n====\n")
    return result
# Create Gradio interface
# Create Gradio interface.
# NOTE: the rendered layout is determined by the nesting and order of the
# context-managed statements below — do not reorder them.
with gr.Blocks(title="DeepSeek-OCR") as demo:
    # Header / usage notes shown above the controls.
    gr.Markdown(
        """
    # DeepSeek-OCR Document Recognition
    Upload an image to extract text using DeepSeek-OCR model.
    Supports various document types and handwriting recognition.
    **Model Sizes:**
    - **Tiny**: Fastest, lower accuracy (512x512)
    - **Small**: Fast, good accuracy (640x640)
    - **Base**: Balanced performance (1024x1024)
    - **Large**: Best accuracy, slower (1280x1280)
    - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
    """
    )
    with gr.Row():
        # Left column: inputs. Defaults match the recommended configuration
        # used by process_image's fallbacks.
        with gr.Column():
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )
            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )
            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )
            submit_btn = gr.Button("Process Image", variant="primary")
        # Right column: OCR output.
        with gr.Column():
            output_text = gr.Textbox(
                label="OCR Result", lines=20, show_copy_button=True
            )
    # Examples
    # cache_examples=False: each example click runs inference live on the GPU
    # rather than replaying a pre-computed result.
    # NOTE(review): assumes examples/math.png and examples/receipt.jpg exist
    # in the Space repository — verify, or the example thumbnails will break.
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
            ["examples/receipt.jpg", "Base", "Free OCR"],
        ],
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
        fn=process_image,
        cache_examples=False,
    )
    # Wire the submit button to the GPU-backed handler.
    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
    )
# Script entry point: start the Gradio server with request queueing so at
# most 20 pending requests wait for the shared GPU handler.
if __name__ == "__main__":
    demo.queue(max_size=20).launch()
|