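"""Gradio demo for DeepSeek-OCR.

Upload an image and either extract plain text ("Free OCR") or convert a
scanned document to markdown, using the deepseek-ai/DeepSeek-OCR model
from the Hugging Face Hub.
"""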
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os

# Restrict to the first GPU and fall back to CPU if CUDA is unavailable
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer. trust_remote_code pulls in DeepSeek-OCR's custom
# modeling code; flash_attention_2 requires a CUDA GPU with the flash-attn
# package installed.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(device)
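# bfloat16 halves memory versus float32 and matches the model card's GPU setup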
if device == "cuda":
    model = model.to(torch.bfloat16)


def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """
    Process image and extract text using DeepSeek-OCR model.

    Args:
        image_input: Input image
        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
        base_size: Base size for model processing
        image_size: Target image size
        crop_mode: Whether to use crop mode

    Returns:
        Extracted text or markdown content
    """
    if image_input is None:
        return "Please upload an image first."

    try:
        # model.infer expects an image path on disk, so save the upload to a
        # temp JPEG; convert to RGB first because JPEG cannot store an alpha
        # channel and webcam/clipboard sources may supply RGBA images
        temp_image_path = "/tmp/temp_ocr_image.jpg"
        image_input.convert("RGB").save(temp_image_path)

        # Both prompts come from the DeepSeek-OCR model card; <|grounding|>
        # requests layout-grounded output for document-to-markdown conversion
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Run inference
        output = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path="",
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
        )

        return output if output else "No text detected in image."

    except Exception as e:
        return f"Error processing image: {e}"
    finally:
        # Always remove the temp file, even if inference raised
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)


# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>πŸ” DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
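            # The defaults below (base_size=1024, image_size=640, crop mode
            # enabled) match the model card's dynamic-resolution "Gundam" mode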

            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )

            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )

            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Tile large documents into crops plus a global view (dynamic resolution)",
            )

            submit_btn = gr.Button("πŸš€ Extract Text", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )

            copy_btn = gr.Button("πŸ“‹ Copy Output")

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )

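    # Copy runs entirely in the browser via the js callback, so no Python
    # function or server round-trip is needed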
    copy_btn.click(
        fn=None,
        inputs=output_text,
        outputs=None,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); }",
    )

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
            [
                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
                "markdown",
            ],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )


if __name__ == "__main__":
    demo.launch(share=False)