Spaces:

akhaliq
/

DeepSeek-OCR

Running on Zero

App Files Files Community

DeepSeek-OCR / app.py

akhaliq HF Staff

Update app.py

2643bec verified 21 days ago

raw

history blame

5.48 kB

	import io
	import os
	import tempfile
	from typing import Optional

	import gradio as gr
	import spaces
	import torch
	from PIL import Image
	from transformers import AutoModel, AutoTokenizer

	# The tokenizer and model are created once at import time; ocr_process reads
	# both as module-level globals on every call.
	model_name = "deepseek-ai/DeepSeek-OCR"
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

	# Load the weights and switch straight to evaluation mode in one expression.
	model = AutoModel.from_pretrained(
	    model_name,
	    _attn_implementation="flash_attention_2",
	    trust_remote_code=True,
	    use_safetensors=True,
	).eval()


	@spaces.GPU(duration=120)
	def ocr_process(
	    image_input: Image.Image,
	    task_type: str = "ocr",
	    base_size: int = 1024,
	    image_size: int = 640,
	    crop_mode: bool = True,
	) -> str:
	    """
	    Run DeepSeek-OCR on an image and return the recognised text.

	    Args:
	        image_input: Image to process. ``None`` short-circuits with a message
	            asking the user to upload an image.
	        task_type: ``"markdown"`` selects the grounded markdown-conversion
	            prompt; any other value selects the plain "Free OCR" prompt.
	        base_size: Forwarded to ``model.infer`` as ``base_size``.
	        image_size: Forwarded to ``model.infer`` as ``image_size``.
	        crop_mode: Forwarded to ``model.infer`` as ``crop_mode``.

	    Returns:
	        Whatever ``model.infer`` returns when that value is truthy, the
	        "No text detected in image." message when it is falsy, or an
	        "Error processing image: ..." message when any step raises.
	    """
	    if image_input is None:
	        return "Please upload an image first."

	    # Pick the prompt up front; it does not depend on the GPU or the file.
	    if task_type == "markdown":
	        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
	    else:
	        prompt = "<image>\nFree OCR. "

	    try:
	        # Move the shared model to the GPU in bfloat16 for this call.
	        model.to(device="cuda", dtype=torch.bfloat16)

	        # A per-call scratch directory keeps concurrent requests from
	        # overwriting each other's input file, and is removed on exit from
	        # the ``with`` block whether inference succeeds or raises.
	        with tempfile.TemporaryDirectory(prefix="deepseek_ocr_") as work_dir:
	            temp_image_path = os.path.join(work_dir, "input.jpg")

	            # JPEG cannot store alpha or palette modes, so normalise to RGB
	            # before writing the file.
	            if image_input.mode == "RGB":
	                rgb_image = image_input
	            else:
	                rgb_image = image_input.convert("RGB")
	            rgb_image.save(temp_image_path)

	            # NOTE(review): output_path used to be ""; the scratch directory
	            # is passed instead so that `infer` has a valid directory if it
	            # needs one - confirm against the model's remote code.
	            output = model.infer(
	                tokenizer,
	                prompt=prompt,
	                image_file=temp_image_path,
	                output_path=work_dir,
	                base_size=int(base_size),
	                image_size=int(image_size),
	                crop_mode=crop_mode,
	                save_results=False,
	                test_compress=False,
	            )

	        return output if output else "No text detected in image."

	    except Exception as e:
	        # UI boundary: report the failure as text instead of raising.
	        return f"Error processing image: {e}"

	    finally:
	        # Always hand the GPU back, on success and on error alike.
	        model.to("cpu")
	        torch.cuda.empty_cache()


	# Gradio interface: a two-column layout with the inputs and settings on the
	# left and the extracted text on the right, followed by clickable examples.
	with gr.Blocks(title="DeepSeek OCR") as demo:
	    gr.HTML(
	        """
	        <div style="text-align: center; margin-bottom: 20px;">
	        <h1>🔍 DeepSeek OCR</h1>
	        <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
	        <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
	        </div>
	        """
	    )

	    with gr.Row():
	        # Left column: image upload plus the options forwarded to ocr_process.
	        with gr.Column(scale=1):
	            gr.Markdown("### Upload Image")
	            image_input = gr.Image(
	                label="Input Image",
	                type="pil",
	                sources=["upload", "webcam", "clipboard"],
	            )

	            gr.Markdown("### Settings")
	            task_type = gr.Radio(
	                choices=["ocr", "markdown"],
	                value="ocr",
	                label="Task Type",
	                info="OCR: Extract text | Markdown: Convert document to markdown",
	            )

	            base_size = gr.Slider(
	                minimum=512,
	                maximum=1280,
	                step=128,
	                value=1024,
	                label="Base Size",
	                info="Model processing size (larger = better quality, slower)",
	            )

	            image_size = gr.Slider(
	                minimum=512,
	                maximum=1280,
	                step=128,
	                value=640,
	                label="Image Size",
	                info="Target image size",
	            )

	            crop_mode = gr.Checkbox(
	                value=True,
	                label="Crop Mode",
	                info="Enable crop mode for better processing",
	            )

	            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")

	        # Right column: read-only result box and a copy button.
	        with gr.Column(scale=1):
	            gr.Markdown("### Output")
	            output_text = gr.Textbox(
	                label="Extracted Text",
	                lines=10,
	                interactive=False,
	                placeholder="Text will appear here...",
	            )

	            copy_btn = gr.Button("📋 Copy Output")

	    # Event handlers
	    # The input order here must match ocr_process's positional parameters.
	    submit_btn.click(
	        fn=ocr_process,
	        inputs=[image_input, task_type, base_size, image_size, crop_mode],
	        outputs=output_text,
	    )

	    # The clipboard write happens in the browser via `js`; the Python callback
	    # only echoes the text back so the textbox content is left unchanged.
	    copy_btn.click(
	        fn=lambda text: text,
	        inputs=output_text,
	        outputs=output_text,
	        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
	    )

	    # Examples section: each row supplies an image URL and a task type.
	    gr.Markdown("### Examples")
	    gr.Examples(
	        examples=[
	            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
	            [
	                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
	                "markdown",
	            ],
	        ],
	        inputs=[image_input, task_type],
	        label="Try these examples",
	    )


	# Start the Gradio app only when this file is executed as a script;
	# `share=False` is passed explicitly to `launch`.
	if __name__ == "__main__":
	    demo.launch(
	        share=False,
	    )