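"""Gradio demo for DeepSeek-OCR: free OCR, markdown conversion, figure parsing,
and locating objects by reference text, with bounding boxes drawn on the result."""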
from functools import partial
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
from PIL import Image, ImageDraw
import re  # Import the regular expression library

# --- 1. Load Model and Tokenizer (Done only once at startup) ---
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load the model to CPU first; it will be moved to GPU during processing
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
print("Model loaded successfully.")
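# NOTE: flash_attention_2 assumes the flash-attn package is installed and an
# Ampere-or-newer GPU is available once inference runs; until then the weights
# remain on CPU (process_ocr_task moves them over).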
# --- Helper function to find pre-generated result images ---
def find_result_image(path):
    for filename in os.listdir(path):
        if "grounding" in filename or "result" in filename:
            try:
                image_path = os.path.join(path, filename)
                result = Image.open(image_path)
                result.load()  # Force a full read now; the temp directory is deleted after return
                return result
            except Exception as e:
                print(f"Error opening result image {filename}: {e}")
    return None
# --- 2. Main Processing Function (draws all detected bounding boxes) ---
@spaces.GPU
def process_ocr_task(image, model_size, ref_text, task_type):
"""
Processes an image with DeepSeek-OCR for all supported tasks.
Now draws ALL detected bounding boxes for ANY task.
"""
if image is None:
return "Please upload an image first.", None
print("π Moving model to GPU...")
model_gpu = model.cuda().to(torch.bfloat16)
print("β
Model is on GPU.")
    with tempfile.TemporaryDirectory() as output_path:
        # Build the prompt for the selected task
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR."
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown."
        elif task_type == "Parse Figure":
            prompt = "<image>\nParse the figure."
        elif task_type == "Locate Object by Reference":
            if not ref_text or ref_text.strip() == "":
                raise gr.Error("For the 'Locate' task, you must provide the reference text to find!")
            prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
        else:
            prompt = "<image>\nFree OCR."

        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)
        # Configure the resolution preset
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
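        # Per the DeepSeek-OCR paper's "Gundam" mode (an assumption about the
        # model internals, not something this script controls): crop_mode=True
        # tiles the page into 640px local crops around a 1024px global view,
        # which tends to help on dense or oversized documents.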
print(f"π Running inference with prompt: {prompt}")
text_result = model_gpu.infer(
tokenizer,
prompt=prompt,
image_file=temp_image_path,
output_path=output_path,
base_size=config["base_size"],
image_size=config["image_size"],
crop_mode=config["crop_mode"],
save_results=True,
test_compress=True,
eval_mode=True,
)
print(f"====\nπ Text Result: {text_result}\n====")
        # --- Always try to find and draw all bounding boxes ---
        result_image_pil = None
        # Pattern for grounding coordinates like <|det|>[[280, 15, 696, 997]]<|/det|>
        pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
        matches = list(pattern.finditer(text_result))  # finditer yields every match
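        # e.g. a grounded span "<|ref|>the teacher<|/ref|><|det|>[[280, 15, 696, 997]]<|/det|>"
        # yields groups ("280", "15", "696", "997"), all in 0-1000 normalized space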
        if matches:
            print(f"Found {len(matches)} bounding box(es). Drawing on the original image.")
            # Draw on a copy so the uploaded image stays untouched
            image_with_bboxes = image.copy()
            draw = ImageDraw.Draw(image_with_bboxes)
            w, h = image.size  # Original image dimensions
            for match in matches:
                # Extract coordinates as integers
                coords_norm = [int(c) for c in match.groups()]
                x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
                # Scale the normalized coordinates (1000x1000 space) to the image's actual size
                x1 = int(x1_norm / 1000 * w)
                y1 = int(y1_norm / 1000 * h)
                x2 = int(x2_norm / 1000 * w)
                y2 = int(y2_norm / 1000 * h)
                # Outline each box; cropping here would discard all but one match
                draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
            result_image_pil = image_with_bboxes
        else:
            # No coordinates in the text; fall back to a pre-generated result image
            print("No bounding box coordinates found in text result. Falling back to a result image file.")
            result_image_pil = find_result_image(output_path)

    return text_result, result_image_pil
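# Minimal standalone sketch (assumes a local "sample.png" and that @spaces.GPU
# is a no-op outside HF Spaces; bypasses the Gradio UI):
#   text, annotated = process_ocr_task(Image.open("sample.png"), "Base", "", "Free OCR")
#   print(text)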
# --- 3. Build the Gradio Interface ---
with gr.Blocks(title="Text Extraction Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Full Demo of DeepSeek-OCR
        Use the tabs below to switch between Free OCR and Locate modes.
        """
    )
    with gr.Tabs():
        with gr.TabItem("Free OCR"):
            with gr.Row():
                with gr.Column(scale=1):
                    free_image = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
                    free_model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Base", label="Resolution Size")
                    free_btn = gr.Button("Run Free OCR", variant="primary")
                with gr.Column(scale=2):
                    free_output_text = gr.Textbox(label="Text Result", lines=15, show_copy_button=True)
                    free_output_image = gr.Image(label="Image Result (if any)", type="pil")
            # Wire the Free OCR button; the task is fixed via functools.partial
            free_ocr = partial(process_ocr_task, task_type="Free OCR", ref_text="")
            free_btn.click(fn=free_ocr, inputs=[free_image, free_model_size], outputs=[free_output_text, free_output_image])
        with gr.TabItem("Locate"):
            with gr.Row():
                with gr.Column(scale=1):
                    loc_image = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
                    loc_model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Base", label="Resolution Size")
                    ref_text_input = gr.Textbox(label="Reference Text (what to locate)", placeholder="e.g., the teacher, 20-10, a red car...")
                    loc_btn = gr.Button("Locate", variant="primary")
                with gr.Column(scale=2):
                    loc_output_text = gr.Textbox(label="Text Result", lines=15, show_copy_button=True)
                    loc_output_image = gr.Image(label="Image Result (if any)", type="pil")
            # Wire the Locate button; ref_text is read from the textbox
            locate_task = partial(process_ocr_task, task_type="Locate Object by Reference")
            loc_btn.click(fn=locate_task, inputs=[loc_image, loc_model_size, ref_text_input], outputs=[loc_output_text, loc_output_image])
    # Compact examples widget wired to the Free OCR tab inputs; every example
    # runs through the Free OCR pipeline (Gradio requires one example value per input)
    gr.Examples(
        examples=[
            ["doc_markdown.png", "Gundam (Recommended)"],
            ["chart.png", "Gundam (Recommended)"],
            ["teacher.jpg", "Base"],
            ["math_locate.jpg", "Small"],
            ["receipt.jpg", "Base"],
        ],
        inputs=[free_image, free_model_size],
        outputs=[free_output_text, free_output_image],
        fn=free_ocr,
        cache_examples=False,
    )
# --- 4. Launch the App ---
if __name__ == "__main__":
    if not os.path.exists("examples"):
        os.makedirs("examples")
    # Make sure to have the correct image files in your "examples" folder,
    # e.g., doc_markdown.png, chart.png, teacher.jpg, math_locate.jpg, receipt.jpg
    demo.queue(max_size=20).launch(share=True)