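"""Gradio demo for DeepSeek-OCR: upload an image and extract its text as
plain OCR output or as Markdown."""
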
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
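

# On ZeroGPU Spaces a GPU is attached only while a function decorated with
# @spaces.GPU is running, so the model is loaded on CPU at import time and
# moved to the GPU inside process_image() below.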
@spaces.GPU
def process_image(image, model_size, task_type):
    """
    Process an image with DeepSeek-OCR.

    Args:
        image: PIL Image to run OCR on.
        model_size: Model size configuration name.
        task_type: OCR task type ("Free OCR" or "Convert to Markdown").
    """
    if image is None:
        return "Please upload an image first."

    # Move the model to the GPU inside the GPU-decorated function
    model_gpu = model.cuda().to(torch.bfloat16)

    # Create a temporary directory for output files
    with tempfile.TemporaryDirectory() as output_path:
        # Set the prompt based on task type
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR. "
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            # Fall back to plain OCR for unknown task types
            prompt = "<image>\nFree OCR. "

        # Save the uploaded image temporarily; convert to RGB so images with
        # an alpha channel (e.g. PNG) can be written as JPEG
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.convert("RGB").save(temp_image_path)

        # Configure model size parameters
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            # Gundam mode combines a 1024px global view with 640px crops
            # (crop_mode=True), which suits large or multi-column documents
            "Gundam (Recommended)": {
                "base_size": 1024,
                "image_size": 640,
                "crop_mode": True,
            },
        }
        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        # Run inference; eval_mode=True makes infer() return the decoded text
        # instead of only writing result files to output_path
        result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )
        print(f"====\nresult: {result}\n====\n")
        return result


# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR") as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Document Recognition

        Upload an image to extract text with the DeepSeek-OCR model.
        Supports various document types and handwriting recognition.

        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )
            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )
            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(
                label="OCR Result", lines=20, show_copy_button=True
            )

    # Examples
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
            ["examples/receipt.jpg", "Base", "Free OCR"],
        ],
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
        fn=process_image,
        cache_examples=False,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
    )


# Launch the app
if __name__ == "__main__":
    # Queue requests so concurrent users wait their turn for the shared GPU
    demo.queue(max_size=20)
    demo.launch()
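
# Quick local sanity check (a sketch; assumes a CUDA GPU and the bundled
# examples/receipt.jpg; off ZeroGPU the @spaces.GPU decorator passes through):
#     from PIL import Image
#     print(process_image(Image.open("examples/receipt.jpg"), "Base", "Free OCR"))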