# DeepSeek-OCR demo (Hugging Face Space, running on ZeroGPU)
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import spaces
import tempfile
import sys
from io import StringIO
from contextlib import contextmanager
# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
model_name,
_attn_implementation="flash_attention_2",
trust_remote_code=True,
use_safetensors=True,
)
model = model.eval()
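
# ZeroGPU note: the model stays on CPU at import time and is moved to the GPU
# only inside the @spaces.GPU-decorated handler below. On a dedicated-GPU
# Space you could instead move it eagerly at startup; a hedged sketch (not
# applicable on ZeroGPU, where no GPU is attached yet):
#
#     model = model.eval().cuda().to(torch.bfloat16)
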
@contextmanager
def capture_stdout():
"""Capture stdout to get printed output from model"""
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
yield sys.stdout
finally:
sys.stdout = old_stdout
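
# Usage sketch for capture_stdout (illustrative only): the yielded StringIO
# keeps its buffer after stdout is restored, so getvalue() still works once
# the block has exited.
#
#     with capture_stdout() as out:
#         print("hello")
#     assert out.getvalue() == "hello\n"
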
@spaces.GPU(duration=120)
def ocr_process(
image_input: Image.Image,
task_type: str = "ocr",
preset: str = "gundam",
) -> str:
"""
Process image and extract text using DeepSeek-OCR model.
Args:
image_input: Input image
task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
preset: Preset configuration for model parameters
Returns:
Extracted text or markdown content
"""
if image_input is None:
return "Please upload an image first."
# Move model to GPU and set dtype
model.cuda().to(torch.bfloat16)
# Create temp directory for this session
with tempfile.TemporaryDirectory() as temp_dir:
# Save image with proper format
temp_image_path = os.path.join(temp_dir, "input_image.jpg")
# Convert RGBA to RGB if necessary
if image_input.mode in ('RGBA', 'LA', 'P'):
rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
# Handle different image modes
if image_input.mode == 'RGBA':
rgb_image.paste(image_input, mask=image_input.split()[3])
else:
rgb_image.paste(image_input)
rgb_image.save(temp_image_path, 'JPEG', quality=95)
else:
image_input.save(temp_image_path, 'JPEG', quality=95)
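        # Note (assumption): for "LA" and "P" inputs the paste above drops any
        # transparency; image_input.convert("RGB") would be one alternative.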
# Set parameters based on preset
presets = {
"tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
"small": {"base_size": 640, "image_size": 640, "crop_mode": False},
"base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
"large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
"gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
}
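        # Preset semantics (assumption, following the DeepSeek-OCR model card):
        # base_size is the resolution of the global view, image_size the
        # resolution of each local tile, and crop_mode enables tiling; the
        # "gundam" preset pairs 640x640 tiles with a 1024x1024 global view.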
config = presets[preset]
# Set prompt based on task type
if task_type == "markdown":
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
else:
prompt = "<image>\nFree OCR. "
# Capture stdout while running inference
captured_output = ""
with capture_stdout() as output:
result = model.infer(
tokenizer,
prompt=prompt,
image_file=temp_image_path,
output_path=temp_dir,
base_size=config["base_size"],
image_size=config["image_size"],
crop_mode=config["crop_mode"],
save_results=True,
test_compress=True,
)
captured_output = output.getvalue()
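        # Assumption: with save_results=True, the model's remote inference
        # code also writes its output under output_path (commonly a
        # "result.mmd" file). Reading that file would be a more robust
        # alternative to scraping stdout; a hedged sketch:
        #
        #     mmd_path = os.path.join(temp_dir, "result.mmd")
        #     if os.path.exists(mmd_path):
        #         with open(mmd_path, "r", encoding="utf-8") as f:
        #             extracted_text = f.read()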
# Extract the text from captured output
extracted_text = ""
# Look for the actual OCR result in the captured output
# The model prints the extracted text between certain markers
lines = captured_output.split('\n')
capture_text = False
text_lines = []
for line in lines:
# Start capturing after seeing certain patterns
if "# " in line or line.strip().startswith("**"):
capture_text = True
if capture_text:
# Stop at the separator lines
if line.startswith("====") or line.startswith("---") and len(line) > 10:
if text_lines: # Only stop if we've captured something
break
# Add non-empty lines that aren't debug output
                elif line.strip() and not line.startswith(
                    ("image size:", "valid image", "output texts", "compression")
                ):
text_lines.append(line)
if text_lines:
extracted_text = '\n'.join(text_lines)
# If we didn't get text from stdout, check if result contains text
if not extracted_text and result is not None:
if isinstance(result, str):
extracted_text = result
elif isinstance(result, (list, tuple)) and len(result) > 0:
# Try to extract text from the result
if isinstance(result[0], str):
extracted_text = result[0]
elif hasattr(result[0], 'text'):
extracted_text = result[0].text
# Clean up any remaining markers from the text
if extracted_text:
# Remove any remaining debug output patterns
clean_lines = []
for line in extracted_text.split('\n'):
if not any(pattern in line.lower() for pattern in ['image size:', 'valid image', 'compression ratio', 'save results:', 'output texts']):
clean_lines.append(line)
extracted_text = '\n'.join(clean_lines).strip()
# Move model back to CPU to free GPU memory
model.to("cpu")
torch.cuda.empty_cache()
# Return the extracted text
return extracted_text if extracted_text else "No text could be extracted from the image. Please try a different preset or check if the image contains readable text."
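
# Local usage sketch (hypothetical): outside a ZeroGPU environment the
# @spaces.GPU decorator is documented to have no effect, so on a machine with
# a CUDA GPU the handler can be called directly ("page.png" is a placeholder):
#
#     from PIL import Image
#     print(ocr_process(Image.open("page.png"), task_type="markdown", preset="base"))
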
# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
gr.HTML(
"""
<div style="text-align: center; margin-bottom: 20px;">
<h1>π DeepSeek OCR</h1>
<p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
<p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
</div>
"""
)
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### π€ Upload Image")
image_input = gr.Image(
label="Input Image",
type="pil",
sources=["upload", "webcam", "clipboard"],
height=300,
)
gr.Markdown("### βοΈ Settings")
task_type = gr.Radio(
choices=["ocr", "markdown"],
value="ocr",
label="Task Type",
info="OCR: Extract plain text | Markdown: Convert to formatted markdown",
)
preset = gr.Radio(
choices=["gundam", "base", "large", "small", "tiny"],
value="gundam",
label="Model Preset",
info="Start with 'gundam' - it's optimized for most documents",
)
with gr.Accordion("βΉοΈ Preset Details", open=False):
gr.Markdown("""
- **Gundam** (Recommended): Balanced performance with crop mode
- **Base**: Standard quality without cropping
- **Large**: Highest quality for complex documents
- **Small**: Faster processing, good for simple text
- **Tiny**: Fastest, suitable for clear printed text
""")
submit_btn = gr.Button("π Extract Text", variant="primary", size="lg")
clear_btn = gr.ClearButton([image_input], value="ποΈ Clear")
with gr.Column(scale=1):
gr.Markdown("### π Extracted Text")
output_text = gr.Textbox(
label="Output",
lines=15,
max_lines=30,
interactive=False,
placeholder="Extracted text will appear here...",
show_copy_button=True,
)
# Event handlers
submit_btn.click(
fn=ocr_process,
inputs=[image_input, task_type, preset],
outputs=output_text,
)
# Example section with receipt image
gr.Markdown("### π Example")
gr.Examples(
examples=[
["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
],
inputs=[image_input, task_type, preset],
label="Try this receipt example",
)
gr.Markdown("""
    ### 💡 Tips for Best Results
- **For receipts**: Use "ocr" mode with "gundam" or "base" preset
- **For documents with tables**: Use "markdown" mode with "large" preset
    - **If text is not detected**: Try different presets in this order: gundam → base → large
- **For handwritten text**: Use "large" preset for better accuracy
- Ensure images are clear and well-lit for optimal results
""")
if __name__ == "__main__":
demo.launch(share=False) |