Spaces:

akhaliq
/

DeepSeek-OCR

Running on Zero

App Files Files Community

akhaliq HF Staff committed 8 days ago

Commit

086e346

verified ·

1 Parent(s): a39355b

Deploy Gradio app with multiple files

Browse files

Files changed (2) hide show

app.py +180 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import gradio as gr
+import torch
+from transformers import AutoModel, AutoTokenizer
+from PIL import Image
+import io
+import os
+from typing import Optional
+# Set device
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load model and tokenizer
+model_name = "deepseek-ai/DeepSeek-OCR"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    model_name,
+    _attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+    use_safetensors=True,
+)
+model = model.eval().to(device)
+if device == "cuda":
+    model = model.to(torch.bfloat16)
+def ocr_process(
+    image_input: Image.Image,
+    task_type: str = "ocr",
+    base_size: int = 1024,
+    image_size: int = 640,
+    crop_mode: bool = True,
+) -> str:
+    """
+    Process image and extract text using DeepSeek-OCR model.
+    Args:
+        image_input: Input image
+        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
+        base_size: Base size for model processing
+        image_size: Target image size
+        crop_mode: Whether to use crop mode
+    Returns:
+        Extracted text or markdown content
+    """
+    if image_input is None:
+        return "Please upload an image first."
+    try:
+        # Save image temporarily
+        temp_image_path = "/tmp/temp_ocr_image.jpg"
+        image_input.save(temp_image_path)
+        # Set prompt based on task type
+        if task_type == "markdown":
+            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
+        else:
+            prompt = "<image>\nFree OCR. "
+        # Run inference
+        output = model.infer(
+            tokenizer,
+            prompt=prompt,
+            image_file=temp_image_path,
+            output_path="",
+            base_size=base_size,
+            image_size=image_size,
+            crop_mode=crop_mode,
+            save_results=False,
+            test_compress=False,
+        )
+        # Clean up temp file
+        if os.path.exists(temp_image_path):
+            os.remove(temp_image_path)
+        return output if output else "No text detected in image."
+    except Exception as e:
+        return f"Error processing image: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="DeepSeek OCR") as demo:
+    gr.HTML(
+        """
+        <div style="text-align: center; margin-bottom: 20px;">
+            <h1>🔍 DeepSeek OCR</h1>
+            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
+            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
+        </div>
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Upload Image")
+            image_input = gr.Image(
+                label="Input Image",
+                type="pil",
+                sources=["upload", "webcam", "clipboard"],
+            )
+            gr.Markdown("### Settings")
+            task_type = gr.Radio(
+                choices=["ocr", "markdown"],
+                value="ocr",
+                label="Task Type",
+                info="OCR: Extract text | Markdown: Convert document to markdown",
+            )
+            base_size = gr.Slider(
+                minimum=512,
+                maximum=1280,
+                step=128,
+                value=1024,
+                label="Base Size",
+                info="Model processing size (larger = better quality, slower)",
+            )
+            image_size = gr.Slider(
+                minimum=512,
+                maximum=1280,
+                step=128,
+                value=640,
+                label="Image Size",
+                info="Target image size",
+            )
+            crop_mode = gr.Checkbox(
+                value=True,
+                label="Crop Mode",
+                info="Enable crop mode for better processing",
+            )
+            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            gr.Markdown("### Output")
+            output_text = gr.Textbox(
+                label="Extracted Text",
+                lines=10,
+                interactive=False,
+                placeholder="Text will appear here...",
+            )
+            copy_btn = gr.Button("📋 Copy Output")
+    # Event handlers
+    submit_btn.click(
+        fn=ocr_process,
+        inputs=[image_input, task_type, base_size, image_size, crop_mode],
+        outputs=output_text,
+    )
+    copy_btn.click(
+        fn=lambda text: text,
+        inputs=output_text,
+        outputs=output_text,
+        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
+    )
+    # Examples section
+    gr.Markdown("### Examples")
+    gr.Examples(
+        examples=[
+            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
+            [
+                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
+                "markdown",
+            ],
+        ],
+        inputs=[image_input, task_type],
+        label="Try these examples",
+    )
+# Launch the app only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch(share=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+# Python dependencies for app.py.
+# NOTE(review): app.py loads the model with trust_remote_code=True; the downloaded
+# modelling code may import packages that are not listed here - verify against the model card.
+gradio>=4.0.0
+torch>=2.0.0
+transformers>=4.40.0
+Pillow>=10.0.0
+# NOTE(review): app.py never imports a "deepseek-ai" package; the model is fetched by
+# from_pretrained("deepseek-ai/DeepSeek-OCR"). Confirm this distribution exists and is needed.
+deepseek-ai
+safetensors>=0.4.0
+# Listed because app.py requests _attn_implementation="flash_attention_2".
+# NOTE(review): presumably needs a CUDA toolchain to install - confirm on the target image.
+flash-attn>=2.5.0