Spaces:

axiilay
/

DeepSeek-OCR-Demo

Running on Zero

App Files Files Community

axiilay committed 7 days ago

Commit

3da4f0d

1 Parent(s): 29a03ec

Add DeepSeek-OCR Gradio application files

Browse files

Files changed (3) hide show

README.md +34 -5
app.py +141 -0
requirements.txt +15 -0

README.md CHANGED Viewed

@@ -1,14 +1,43 @@
 ---
 title: DeepSeek OCR Demo
-emoji: 🏃
-colorFrom: gray
 colorTo: red
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: mit
-short_description: 'An interactive demo for the DeepSeek-OCR '
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: DeepSeek OCR Demo
+emoji: 🖼
+colorFrom: purple
 colorTo: red
 sdk: gradio
+sdk_version: 5.44.0
 app_file: app.py
 pinned: false
 license: mit
+short_description: An interactive demo for the DeepSeek-OCR model.
 ---
+# DeepSeek-OCR Document Recognition
+This Space uses the DeepSeek-OCR model for document text recognition and extraction.
+## Features
+- Multiple model size options (Tiny to Large)
+- Free OCR and Markdown conversion
+- Support for various document types
+- Powered by ZeroGPU for efficient inference
+## Usage
+1. Upload an image containing text
+2. Select model size (Gundam recommended for documents)
+3. Choose task type
+4. Click "Process Image"
+## Model Sizes
+- **Tiny**: 512x512, fastest
+- **Small**: 640x640, good balance
+- **Base**: 1024x1024, high quality
+- **Large**: 1280x1280, best quality
+- **Gundam**: Optimized for documents with crop mode
+## Credits
+Model: [deepseek-ai/DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import gradio as gr
+import torch
+from transformers import AutoModel, AutoTokenizer
+import spaces
+import os
+import tempfile
+# Load the model and tokenizer once at import time; process_image() reuses
+# these module-level objects on every request.
+model_name = "deepseek-ai/DeepSeek-OCR"
+# trust_remote_code=True lets transformers run the Python code shipped in the
+# model repository.
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    model_name,
+    # Presumably relies on the flash-attn wheel listed in requirements.txt.
+    _attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+    use_safetensors=True,
+)
+# Switch to eval mode and cast the weights to bfloat16.
+# NOTE(review): no explicit device move is visible here — confirm where the
+# model ends up on the GPU before inference.
+model = model.eval().to(torch.bfloat16)
+@spaces.GPU
+def process_image(image, model_size, task_type):
+    """
+    Process image with DeepSeek-OCR
+    Args:
+        image: PIL Image or file path
+        model_size: Model size configuration
+        task_type: OCR task type
+    """
+    # Create temporary directory for output
+    with tempfile.TemporaryDirectory() as output_path:
+        # Set prompt based on task type
+        if task_type == "Free OCR":
+            prompt = "<image>\nFree OCR. "
+        elif task_type == "Convert to Markdown":
+            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
+        elif task_type == "Extract Text":
+            prompt = "<image>\nExtract all text from the image. "
+        else:
+            prompt = "<image>\nFree OCR. "
+        # Save uploaded image temporarily
+        temp_image_path = os.path.join(output_path, "temp_image.jpg")
+        image.save(temp_image_path)
+        # Configure model size parameters
+        size_configs = {
+            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
+            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
+            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+            "Gundam (Recommended)": {
+                "base_size": 1024,
+                "image_size": 640,
+                "crop_mode": True,
+            },
+        }
+        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
+        # Run inference
+        result = model.infer(
+            tokenizer,
+            prompt=prompt,
+            image_file=temp_image_path,
+            output_path=output_path,
+            base_size=config["base_size"],
+            image_size=config["image_size"],
+            crop_mode=config["crop_mode"],
+            save_results=True,
+            test_compress=True,
+        )
+        return result
+# Create Gradio interface
+with gr.Blocks(title="DeepSeek-OCR") as demo:
+    gr.Markdown(
+        """
+        # DeepSeek-OCR Document Recognition
+        Upload an image to extract text using DeepSeek-OCR model.
+        Supports various document types and handwriting recognition.
+        **Model Sizes:**
+        - **Tiny**: Fastest, lower accuracy (512x512)
+        - **Small**: Fast, good accuracy (640x640)
+        - **Base**: Balanced performance (1024x1024)
+        - **Large**: Best accuracy, slower (1280x1280)
+        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(
+                type="pil", label="Upload Image", sources=["upload", "clipboard"]
+            )
+            model_size = gr.Dropdown(
+                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                value="Gundam (Recommended)",
+                label="Model Size",
+            )
+            task_type = gr.Dropdown(
+                choices=["Free OCR", "Convert to Markdown", "Extract Text"],
+                value="Convert to Markdown",
+                label="Task Type",
+            )
+            submit_btn = gr.Button("Process Image", variant="primary")
+        with gr.Column():
+            output_text = gr.Textbox(
+                label="OCR Result", lines=20, show_copy_button=True
+            )
+    # Examples
+    gr.Examples(
+        examples=[
+            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
+            ["examples/receipt.jpg", "Base", "Free OCR"],
+        ],
+        inputs=[image_input, model_size, task_type],
+        outputs=output_text,
+        fn=process_image,
+        cache_examples=False,
+    )
+    submit_btn.click(
+        fn=process_image,
+        inputs=[image_input, model_size, task_type],
+        outputs=output_text,
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.queue(max_size=20)
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+torch==2.6.0
+transformers==4.46.3
+tokenizers==0.20.3
+einops
+addict
+easydict
+gradio>=4.0.0
+spaces>=0.20.0
+Pillow>=10.0.0
+safetensors>=0.4.0
+accelerate>=0.24.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0
+torchvision
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl