import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import spaces
import tempfile
import json
from pathlib import Path

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
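# Note: flash_attention_2 requires the optional flash-attn package; if it is
# not installed in the environment, _attn_implementation="eager" is a slower
# but dependency-free fallback.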
model = model.eval()


@spaces.GPU(duration=120)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    preset: str = "gundam",
) -> str:
    """
    Process image and extract text using DeepSeek-OCR model.

    Args:
        image_input: Input image
        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
        preset: Preset configuration for model parameters

    Returns:
        Extracted text or markdown content
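
    Example (illustrative; "receipt.jpg" is a placeholder path and a CUDA GPU
    must be available):
        >>> from PIL import Image
        >>> text = ocr_process(Image.open("receipt.jpg"), "ocr", "gundam")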
    """
    if image_input is None:
        return "Please upload an image first."

    # Move model to GPU and set dtype
    model.cuda().to(torch.bfloat16)
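    # Note: under ZeroGPU, the @spaces.GPU decorator grants the GPU only for
    # the duration of this call, which is why the model is moved onto the
    # device here and back to CPU before returning.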
    
    # Create temp directory for this session
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save the upload to disk, since model.infer reads from a file path
        temp_image_path = os.path.join(temp_dir, "input_image.jpg")
        # Flatten any transparency onto a white background first: JPEG has no
        # alpha channel, and going through RGBA handles RGBA, LA, and P alike
        if image_input.mode in ('RGBA', 'LA', 'P'):
            rgba = image_input.convert('RGBA')
            rgb_image = Image.new('RGB', rgba.size, (255, 255, 255))
            rgb_image.paste(rgba, mask=rgba.split()[3])
            rgb_image.save(temp_image_path, 'JPEG', quality=95)
        else:
            image_input.convert('RGB').save(temp_image_path, 'JPEG', quality=95)
        
        # Set parameters based on preset
        presets = {
            "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
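        # Per the DeepSeek-OCR release, these presets trade resolution for
        # vision-token count (roughly: tiny 64, small 100, base 256, large 400
        # tokens; "gundam" tiles 640px crops around a 1024px global view).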
        
        # Fall back to the default preset if an unrecognized name is passed
        # (e.g. via a direct API call that bypasses the Radio choices)
        config = presets.get(preset, presets["gundam"])

        # Set prompt based on task type
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Run inference with save_results=True to save output
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=temp_dir,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
        )
        
        # Try to read the saved results. The filenames written by
        # save_results are not documented, so a few likely conventions are
        # tried before falling back to infer()'s raw return value.
        extracted_text = ""
        
        # Check for saved JSON results
        json_path = Path(temp_dir) / "input_image_outputs.json"
        if json_path.exists():
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    # Extract text from the JSON structure
                    if isinstance(data, dict):
                        if 'text' in data:
                            extracted_text = data['text']
                        elif 'output' in data:
                            extracted_text = data['output']
                        elif 'result' in data:
                            extracted_text = data['result']
                        else:
                            # If the structure is different, try to get the first string value
                            for key, value in data.items():
                                if isinstance(value, str) and len(value) > 10:
                                    extracted_text = value
                                    break
                    elif isinstance(data, list) and len(data) > 0:
                        extracted_text = str(data[0])
                    else:
                        extracted_text = str(data)
            except Exception as e:
                print(f"Error reading JSON: {e}")
        
        # If no JSON, check for text file
        if not extracted_text:
            txt_path = Path(temp_dir) / "input_image_outputs.txt"
            if txt_path.exists():
                try:
                    with open(txt_path, 'r', encoding='utf-8') as f:
                        extracted_text = f.read()
                except Exception as e:
                    print(f"Error reading text file: {e}")
        
        # If still no text, check for any output files
        if not extracted_text:
            output_files = list(Path(temp_dir).glob("*output*"))
            for file_path in output_files:
                if file_path.suffix in ['.txt', '.json', '.md']:
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()
                            if content.strip():
                                extracted_text = content
                                break
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # If we still don't have text but result is not None, use result directly
        if not extracted_text and result is not None:
            if isinstance(result, str):
                extracted_text = result
            elif isinstance(result, (list, tuple)) and len(result) > 0:
                extracted_text = str(result[0])
            else:
                extracted_text = str(result)

    # Move model back to CPU to free GPU memory
    model.to("cpu")
    torch.cuda.empty_cache()

    # Return the extracted text
    return extracted_text if extracted_text else "No text could be extracted from the image. Please try a different preset or check if the image contains readable text."


# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>πŸ” DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“€ Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
                height=300,
            )

            gr.Markdown("### βš™οΈ Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract plain text | Markdown: Convert to formatted markdown",
            )

            preset = gr.Radio(
                choices=["gundam", "base", "large", "small", "tiny"],
                value="gundam",
                label="Model Preset",
                info="Start with 'gundam' - it's optimized for most documents",
            )

            with gr.Accordion("ℹ️ Preset Details", open=False):
                gr.Markdown("""
                - **Gundam** (Recommended): Balanced performance with crop mode
                - **Base**: Standard quality without cropping
                - **Large**: Highest quality for complex documents
                - **Small**: Faster processing, good for simple text
                - **Tiny**: Fastest, suitable for clear printed text
                """)

            submit_btn = gr.Button("πŸš€ Extract Text", variant="primary", size="lg")
            clear_btn = gr.ClearButton([image_input], value="πŸ—‘οΈ Clear")

        with gr.Column(scale=1):
            gr.Markdown("### πŸ“ Extracted Text")
            output_text = gr.Textbox(
                label="Output",
                lines=15,
                max_lines=30,
                interactive=False,
                placeholder="Extracted text will appear here...",
                show_copy_button=True,
            )

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, preset],
        outputs=output_text,
    )

    # Example section with receipt image
    gr.Markdown("### πŸ“š Example")
    gr.Examples(
        examples=[
            ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
        ],
        inputs=[image_input, task_type, preset],
        label="Try this receipt example",
    )

    gr.Markdown("""
    ### πŸ’‘ Tips for Best Results
    - **For receipts**: Use "ocr" mode with "gundam" or "base" preset
    - **For documents with tables**: Use "markdown" mode with "large" preset
    - **If text is not detected**: Try different presets in this order: gundam β†’ base β†’ large
    - **For handwritten text**: Use "large" preset for better accuracy
    - Ensure images are clear and well-lit for optimal results
    """)


if __name__ == "__main__":
    demo.launch(share=False)
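    # To share a temporary public link when running outside Spaces, Gradio
    # supports demo.launch(share=True) instead.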