# Hugging Face Space page header captured with this file: "Running on Zero".
# File size: 9,677 bytes.
# (The per-line commit-hash gutter and the 1-254 line-number gutter that
# followed here were page-extraction residue, not part of the program.)
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import spaces
import tempfile
import json
from pathlib import Path
# Expose only CUDA device 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Fetch the DeepSeek-OCR tokenizer and weights once, at import time, and put
# the model into evaluation mode straight away.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_safetensors=True,
    _attn_implementation="flash_attention_2",
).eval()
# Resolution / cropping settings selectable through the ``preset`` argument.
_PRESETS = {
    "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
}

# File suffixes treated as readable text when scanning the output directory.
_TEXT_SUFFIXES = (".txt", ".json", ".md", ".mmd")

_NO_TEXT_MESSAGE = (
    "No text could be extracted from the image. Please try a different preset "
    "or check if the image contains readable text."
)


def _save_as_rgb_jpeg(image, path):
    """Write *image* to *path* as an RGB JPEG, flattening transparency onto white."""
    if image.mode in ("RGBA", "LA", "P"):
        # Go through RGBA so the alpha channel of LA images and of palette
        # images with a transparency entry is used as the paste mask instead
        # of being silently dropped.
        rgba = image.convert("RGBA")
        flattened = Image.new("RGB", rgba.size, (255, 255, 255))
        flattened.paste(rgba, mask=rgba.split()[3])
        image = flattened
    elif image.mode != "RGB":
        # JPEG cannot store every PIL mode; normalise the rest to RGB.
        image = image.convert("RGB")
    image.save(path, "JPEG", quality=95)


def _text_from_json(data):
    """Pull the most likely text payload out of parsed JSON output."""
    if isinstance(data, dict):
        # The first of the well-known keys that is present wins.
        for key in ("text", "output", "result"):
            if key in data:
                value = data[key]
                if value is None:
                    return ""
                return value if isinstance(value, str) else str(value)
        # Unknown structure: take the first reasonably long string value.
        return next(
            (v for v in data.values() if isinstance(v, str) and len(v) > 10),
            "",
        )
    if isinstance(data, list):
        return str(data[0]) if data else ""
    return "" if data is None else str(data)


def _read_saved_output(output_dir):
    """Return the text that inference saved under *output_dir*, or ``""``."""
    output_dir = Path(output_dir)

    # Check for saved JSON results
    json_path = output_dir / "input_image_outputs.json"
    if json_path.exists():
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                text = _text_from_json(json.load(f))
        except (OSError, ValueError) as e:  # JSON/Unicode errors are ValueErrors
            print(f"Error reading JSON: {e}")
        else:
            if text:
                return text

    # If no JSON, check for text file
    txt_path = output_dir / "input_image_outputs.txt"
    if txt_path.exists():
        try:
            text = txt_path.read_text(encoding="utf-8")
        except (OSError, ValueError) as e:
            print(f"Error reading text file: {e}")
        else:
            if text:
                return text

    # Last resort: scan for any text-like output file.
    # NOTE(review): the upstream DeepSeek-OCR remote code appears to write its
    # transcript to "result.mmd" when save_results=True -- confirm against the
    # model repository. Probing "result*" is harmless if that is not the case.
    candidates = sorted(output_dir.glob("*output*")) + sorted(output_dir.glob("result*"))
    for file_path in candidates:
        if file_path.suffix not in _TEXT_SUFFIXES:
            continue
        try:
            content = file_path.read_text(encoding="utf-8")
        except (OSError, ValueError) as e:
            print(f"Error reading {file_path}: {e}")
            continue
        if content.strip():
            return content
    return ""


def _result_to_text(result):
    """Turn the direct return value of ``model.infer`` into text ("" if empty)."""
    if result is None:
        return ""
    if isinstance(result, str):
        return result
    if isinstance(result, (list, tuple)):
        return str(result[0]) if result else ""
    return str(result)


@spaces.GPU(duration=120)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    preset: str = "gundam",
) -> str:
    """
    Process image and extract text using DeepSeek-OCR model.

    The image is written to a temporary directory as a JPEG, the model is
    moved to the GPU in bfloat16 for the duration of the call, and the text is
    taken from the files the model saved (falling back to the value returned
    by ``model.infer``). The model is moved back to the CPU afterwards, even
    when inference fails.

    Args:
        image_input: Input image
        task_type: Type of task - "markdown" for document conversion; any
            other value runs plain-text OCR
        preset: Preset configuration for model parameters (one of "tiny",
            "small", "base", "large", "gundam")

    Returns:
        Extracted text or markdown content, or a user-facing message when no
        image was supplied, the preset is unknown, or nothing was extracted
    """
    if image_input is None:
        return "Please upload an image first."

    # Validate cheaply before touching the GPU.
    config = _PRESETS.get(preset)
    if config is None:
        return f"Unknown preset {preset!r}. Choose one of: {', '.join(_PRESETS)}."

    # Set prompt based on task type
    if task_type == "markdown":
        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    else:
        prompt = "<image>\nFree OCR. "

    try:
        # Move model to GPU and set dtype
        model.cuda().to(torch.bfloat16)

        # Create temp directory for this call; it is removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_image_path = os.path.join(temp_dir, "input_image.jpg")
            _save_as_rgb_jpeg(image_input, temp_image_path)

            # Run inference with save_results=True to save output
            result = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=temp_image_path,
                output_path=temp_dir,
                base_size=config["base_size"],
                image_size=config["image_size"],
                crop_mode=config["crop_mode"],
                save_results=True,
                test_compress=True,
            )

            # Saved files take precedence; the direct return value is the
            # fallback. Must happen before the temp directory disappears.
            extracted_text = _read_saved_output(temp_dir) or _result_to_text(result)
    finally:
        # Move model back to CPU to free GPU memory, also on failure.
        model.to("cpu")
        torch.cuda.empty_cache()

    return extracted_text if extracted_text else _NO_TEXT_MESSAGE
# NOTE(review): the emoji in the UI strings below were restored from
# mis-encoded text (UTF-8 bytes shown as Greek letters). 📤 ⚙️ ℹ️ 🗑️ 💡 and →
# follow from the surviving bytes; 🔍 🚀 📝 📋 are best-guess replacements for
# glyphs that could not be recovered -- adjust if the originals are known.

# Page header shown at the top of the app.
_HEADER_HTML = """
<div style="text-align: center; margin-bottom: 20px;">
<h1>🔍 DeepSeek OCR</h1>
<p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
<p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
</div>
"""

# Contents of the collapsible "Preset Details" accordion.
_PRESET_DETAILS_MD = """
- **Gundam** (Recommended): Balanced performance with crop mode
- **Base**: Standard quality without cropping
- **Large**: Highest quality for complex documents
- **Small**: Faster processing, good for simple text
- **Tiny**: Fastest, suitable for clear printed text
"""

# Usage tips rendered at the bottom of the page.
_TIPS_MD = """
### 💡 Tips for Best Results
- **For receipts**: Use "ocr" mode with "gundam" or "base" preset
- **For documents with tables**: Use "markdown" mode with "large" preset
- **If text is not detected**: Try different presets in this order: gundam → base → large
- **For handwritten text**: Use "large" preset for better accuracy
- Ensure images are clear and well-lit for optimal results
"""

# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
    gr.HTML(_HEADER_HTML)

    with gr.Row():
        # Left column: image upload and settings.
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
                height=300,
            )

            gr.Markdown("### ⚙️ Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract plain text | Markdown: Convert to formatted markdown",
            )
            preset = gr.Radio(
                choices=["gundam", "base", "large", "small", "tiny"],
                value="gundam",
                label="Model Preset",
                info="Start with 'gundam' - it's optimized for most documents",
            )

            with gr.Accordion("ℹ️ Preset Details", open=False):
                gr.Markdown(_PRESET_DETAILS_MD)

            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
            clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")

        # Right column: read-only result box.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Extracted Text")
            output_text = gr.Textbox(
                label="Output",
                lines=15,
                max_lines=30,
                interactive=False,
                placeholder="Extracted text will appear here...",
                show_copy_button=True,
            )

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, preset],
        outputs=output_text,
    )

    # Example section with receipt image
    gr.Markdown("### 📋 Example")
    gr.Examples(
        examples=[
            ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
        ],
        inputs=[image_input, task_type, preset],
        label="Try this receipt example",
    )

    gr.Markdown(_TIPS_MD)
if __name__ == "__main__":
    # Start the Gradio server; share=False means no public share link.
    demo.launch(share=False)