# DeepSeek-OCR demo Space (Hugging Face Spaces, running on ZeroGPU)
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import tempfile
# Set device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# flash_attention_2 requires a CUDA GPU with the flash-attn package installed;
# fall back to the default "eager" implementation when running on CPU.
attn_implementation = "flash_attention_2" if device == "cuda" else "eager"
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation=attn_implementation,
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(device)
if device == "cuda":
    # bfloat16 halves memory use and speeds up inference on recent GPUs
    model = model.to(torch.bfloat16)
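# Note: `infer` (used below) is a custom method that DeepSeek-OCR provides
# through its remote code (enabled by trust_remote_code=True); it is not part
# of the standard transformers API, so its keyword arguments follow the model card.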
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """
    Process an image and extract text with the DeepSeek-OCR model.

    Args:
        image_input: Input image.
        task_type: "ocr" for plain text extraction or "markdown" for
            document-to-markdown conversion.
        base_size: Base size for model processing.
        image_size: Target image size.
        crop_mode: Whether to use crop mode.

    Returns:
        Extracted text or markdown content.
    """
    if image_input is None:
        return "Please upload an image first."

    # Save to a unique temp file so concurrent requests don't clobber each
    # other; convert to RGB first because JPEG cannot store an alpha channel.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        temp_image_path = tmp.name
    try:
        image_input.convert("RGB").save(temp_image_path)

        # Set the prompt based on the task type
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Run inference (Gradio sliders may pass floats, so cast the sizes)
        output = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path="",
            base_size=int(base_size),
            image_size=int(image_size),
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
        )
        return output if output else "No text detected in image."
    except Exception as e:
        return f"Error processing image: {str(e)}"
    finally:
        # Always clean up the temp file, even if inference raised
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)
# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )
            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )
            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )
            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Enable crop mode for better processing",
            )
            submit_btn = gr.Button("Extract Text", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )
            copy_btn = gr.Button("Copy Output")
    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )
    # The copy action runs entirely client-side via the js hook, so no Python
    # callback is needed.
    copy_btn.click(
        fn=None,
        inputs=output_text,
        outputs=None,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); }",
    )
    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
            ["https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500", "markdown"],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )
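# For long-running OCR jobs with many simultaneous users, Gradio's request
# queue (demo.queue(), called before launch) can help; it is not enabled here.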
if __name__ == "__main__":
    demo.launch(share=False)
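# To run locally (assumed dependency list; the actual Space pins its own):
#   pip install gradio torch transformers pillow
#   python app.py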