Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,475 Bytes
086e346 3d50de0 086e346 3d50de0 086e346 2643bec 086e346 3d50de0 086e346 3d50de0 086e346 3d50de0 086e346 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
import io
import os
import tempfile
from typing import Optional

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Load the tokenizer and the model once, when the module is imported.
model_name = "deepseek-ai/DeepSeek-OCR"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Options forwarded to the model loader, gathered in one place.
_model_load_options = dict(
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)

# Switch to evaluation mode right away; this module only runs inference.
model = AutoModel.from_pretrained(model_name, **_model_load_options).eval()
@spaces.GPU(duration=120)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """Run DeepSeek-OCR on an image and return the extracted text.

    The image is written to a per-request temporary JPEG file, the
    module-level ``model`` is moved to the GPU in bfloat16, inference is
    run, and then the model is moved back to the CPU. The temporary file
    is removed and the CUDA cache is emptied whether or not inference
    succeeds.

    Args:
        image_input: Image to process. ``None`` yields a prompt asking the
            user to upload an image.
        task_type: ``"markdown"`` selects the document-to-markdown prompt;
            any other value selects the plain "Free OCR" prompt.
        base_size: Forwarded to ``model.infer`` as ``base_size``.
        image_size: Forwarded to ``model.infer`` as ``image_size``.
        crop_mode: Forwarded to ``model.infer`` as ``crop_mode``.

    Returns:
        The value returned by ``model.infer`` when it is truthy, the string
        ``"No text detected in image."`` when it is falsy, or an
        ``"Error processing image: ..."`` message when any exception is
        raised along the way.
    """
    if image_input is None:
        return "Please upload an image first."

    temp_image_path = None
    try:
        # The file is saved with a .jpg suffix, and JPEG cannot store alpha
        # or palette images (e.g. RGBA PNG uploads or clipboard pastes), so
        # normalise those modes instead of failing in save().
        if image_input.mode not in ("RGB", "L"):
            image_input = image_input.convert("RGB")

        # Use a unique file per call so overlapping requests cannot
        # overwrite or delete each other's input image.
        fd, temp_image_path = tempfile.mkstemp(prefix="ocr_", suffix=".jpg")
        os.close(fd)
        image_input.save(temp_image_path)

        # Move model to GPU and set dtype
        model.to("cuda")
        model.to(torch.bfloat16)

        # Set prompt based on task type
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # NOTE(review): whether infer() returns the recognised text with
        # these flags depends on the remote model code - confirm.
        output = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path="",
            # UI sliders may deliver numeric values as floats; the
            # signature promises ints.
            base_size=int(base_size),
            image_size=int(image_size),
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
        )
        return output if output else "No text detected in image."
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing image: {e}"
    finally:
        # Clean up the temp file on success and failure alike.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask
                # the OCR result or the original error message.
                pass
        # Move model back to CPU to free GPU memory
        model.to("cpu")
        torch.cuda.empty_cache()
# Create Gradio interface

# Static page header; kept at module level so the markup carries no
# indentation from the surrounding layout code.
_HEADER_HTML = """
<div style="text-align: center; margin-bottom: 20px;">
<h1>π DeepSeek OCR</h1>
<p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
<p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
</div>
"""

# Browser-side handler for the copy button.
_COPY_JS = "(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }"

# Both size sliders share the same range and step.
_SIZE_SLIDER_RANGE = dict(minimum=512, maximum=1280, step=128)

# Example rows: (image URL, task type).
_EXAMPLE_ROWS = [
    ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
    [
        "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
        "markdown",
    ],
]

with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(_HEADER_HTML)

    with gr.Row():
        # Left column: image upload and processing settings.
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
            base_size = gr.Slider(
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
                **_SIZE_SLIDER_RANGE,
            )
            image_size = gr.Slider(
                value=640,
                label="Image Size",
                info="Target image size",
                **_SIZE_SLIDER_RANGE,
            )
            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Enable crop mode for better processing",
            )
            submit_btn = gr.Button("π Extract Text", variant="primary", size="lg")

        # Right column: read-only result box and a copy button.
        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )
            copy_btn = gr.Button("π Copy Output")

    # Event handlers
    ocr_inputs = [image_input, task_type, base_size, image_size, crop_mode]
    submit_btn.click(fn=ocr_process, inputs=ocr_inputs, outputs=output_text)

    # The copy happens in the browser via `js`; the Python side simply hands
    # the text back unchanged.
    copy_btn.click(
        fn=lambda text: text,
        inputs=output_text,
        outputs=output_text,
        js=_COPY_JS,
    )

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=_EXAMPLE_ROWS,
        inputs=[image_input, task_type],
        label="Try these examples",
    )
if __name__ == "__main__":
    # Start the app only when this file is run as a script, not on import.
    demo.launch(share=False)