Spaces:

akhaliq
/

DeepSeek-OCR

Running on Zero

App Files Files Community

DeepSeek-OCR / app.py

akhaliq HF Staff

Deploy Gradio app with multiple files

086e346 verified 11 days ago

raw

history blame

5.3 kB

	import gradio as gr
	import torch
	from transformers import AutoModel, AutoTokenizer
	from PIL import Image
	import io
	import os
	from typing import Optional

	# Set device
	# Expose only GPU 0 to this process, then use CUDA when torch can see it.
	os.environ["CUDA_VISIBLE_DEVICES"] = "0"
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load model and tokenizer
	# trust_remote_code=True executes Python shipped in the model repository.
	model_name = "deepseek-ai/DeepSeek-OCR"
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	model = AutoModel.from_pretrained(
	    model_name,
	    # FlashAttention-2 is only requested when a CUDA device was selected;
	    # the CPU fallback asks for the plain implementation instead.
	    # NOTE(review): assumes the model's remote code accepts "eager" - confirm.
	    _attn_implementation="flash_attention_2" if device == "cuda" else "eager",
	    trust_remote_code=True,
	    use_safetensors=True,
	)
	# Inference only: switch to eval mode and move the weights to the device.
	model = model.eval().to(device)
	if device == "cuda":
	    # bfloat16 is applied on the GPU path only; the CPU path keeps the
	    # dtype the checkpoint was loaded with.
	    model = model.to(torch.bfloat16)


	def ocr_process(
	    image_input: Image.Image,
	    task_type: str = "ocr",
	    base_size: int = 1024,
	    image_size: int = 640,
	    crop_mode: bool = True,
	) -> str:
	    """
	    Run the DeepSeek-OCR model on an image and return its output.

	    Args:
	        image_input: PIL image to process. ``None`` returns a message
	            asking the user to upload an image.
	        task_type: ``"markdown"`` selects the document-to-markdown prompt;
	            any other value selects the plain "Free OCR" prompt.
	        base_size: Forwarded to ``model.infer`` as ``base_size``.
	        image_size: Forwarded to ``model.infer`` as ``image_size``.
	        crop_mode: Forwarded to ``model.infer`` as ``crop_mode``.

	    Returns:
	        Whatever ``model.infer`` returns when that value is truthy,
	        otherwise "No text detected in image.". Failures are not raised:
	        they are returned as an "Error processing image: ..." string so
	        the UI can display them.
	    """
	    # Local import so this function is self-contained; the module's import
	    # block does not bring in tempfile.
	    import tempfile

	    if image_input is None:
	        return "Please upload an image first."

	    # Set prompt based on task type
	    if task_type == "markdown":
	        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
	    else:
	        prompt = "<image>\nFree OCR. "

	    try:
	        # A private directory per call: concurrent requests no longer
	        # overwrite one shared /tmp file, and the context manager removes
	        # everything even when inference raises.
	        with tempfile.TemporaryDirectory(prefix="deepseek_ocr_") as work_dir:
	            temp_image_path = os.path.join(work_dir, "input.png")

	            # PNG is lossless, and converting to RGB first keeps images
	            # with alpha or palette modes from failing on save.
	            rgb_image = (
	                image_input
	                if image_input.mode == "RGB"
	                else image_input.convert("RGB")
	            )
	            rgb_image.save(temp_image_path)

	            # Run inference
	            # NOTE(review): output_path was "" originally; the scratch
	            # directory is passed instead so infer has a valid location
	            # should it need one - confirm against the model's remote code.
	            # NOTE(review): confirm infer returns the decoded text with
	            # these flags; a falsy return maps to the fallback message.
	            output = model.infer(
	                tokenizer,
	                prompt=prompt,
	                image_file=temp_image_path,
	                output_path=work_dir,
	                # UI sliders may hand over floats; pass plain ints.
	                base_size=int(base_size),
	                image_size=int(image_size),
	                crop_mode=crop_mode,
	                save_results=False,
	                test_compress=False,
	            )
	    except Exception as e:
	        # UI boundary: report the failure in the output box, do not crash.
	        return f"Error processing image: {str(e)}"

	    return output if output else "No text detected in image."


	# Create Gradio interface
	# Layout: header, then one row with an input/settings column and an output
	# column, followed by event wiring and example inputs.
	with gr.Blocks(title="DeepSeek OCR") as demo:
	    gr.HTML(
	        """
	        <div style="text-align: center; margin-bottom: 20px;">
	            <h1>🔍 DeepSeek OCR</h1>
	            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
	            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
	        </div>
	        """
	    )

	    with gr.Row():
	        # Left column: image upload and inference settings.
	        with gr.Column(scale=1):
	            gr.Markdown("### Upload Image")
	            image_input = gr.Image(
	                label="Input Image",
	                type="pil",
	                sources=["upload", "webcam", "clipboard"],
	            )

	            gr.Markdown("### Settings")
	            task_type = gr.Radio(
	                choices=["ocr", "markdown"],
	                value="ocr",
	                label="Task Type",
	                info="OCR: Extract text | Markdown: Convert document to markdown",
	            )

	            base_size = gr.Slider(
	                minimum=512,
	                maximum=1280,
	                step=128,
	                value=1024,
	                label="Base Size",
	                info="Model processing size (larger = better quality, slower)",
	            )

	            image_size = gr.Slider(
	                minimum=512,
	                maximum=1280,
	                step=128,
	                value=640,
	                label="Image Size",
	                info="Target image size",
	            )

	            crop_mode = gr.Checkbox(
	                value=True,
	                label="Crop Mode",
	                info="Enable crop mode for better processing",
	            )

	            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")

	        # Right column: read-only result box and a copy button.
	        with gr.Column(scale=1):
	            gr.Markdown("### Output")
	            output_text = gr.Textbox(
	                label="Extracted Text",
	                lines=10,
	                interactive=False,
	                placeholder="Text will appear here...",
	            )

	            copy_btn = gr.Button("📋 Copy Output")

	    # Event handlers
	    # The input order here must match ocr_process's parameter order.
	    submit_btn.click(
	        fn=ocr_process,
	        inputs=[image_input, task_type, base_size, image_size, crop_mode],
	        outputs=output_text,
	    )

	    # The clipboard write happens in the browser via the js hook; the Python
	    # callback just passes the text through unchanged.
	    copy_btn.click(
	        fn=lambda text: text,
	        inputs=output_text,
	        outputs=output_text,
	        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
	    )

	    # Examples section
	    # Each example supplies an image URL and a task type.
	    gr.Markdown("### Examples")
	    gr.Examples(
	        examples=[
	            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
	            [
	                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
	                "markdown",
	            ],
	        ],
	        inputs=[image_input, task_type],
	        label="Try these examples",
	    )


	# Start the web server only when this file is executed as a script;
	# share=False means no public share link is requested.
	if __name__ == "__main__":
	    demo.launch(share=False)