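"""Gradio demo for DeepSeek-OCR.

Upload an image and either extract plain text ("Free OCR") or convert a
scanned document to markdown, using the deepseek-ai/DeepSeek-OCR model
from the Hugging Face Hub.
"""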
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os

# Restrict to the first GPU and fall back to CPU if CUDA is unavailable
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer. trust_remote_code pulls in DeepSeek-OCR's custom
# modeling code; flash_attention_2 requires a CUDA GPU with the flash-attn
# package installed.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().to(device)
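# bfloat16 halves memory versus float32 and matches the model card's GPU setup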
if device == "cuda":
    model = model.to(torch.bfloat16)


def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """
    Process image and extract text using DeepSeek-OCR model.

    Args:
        image_input: Input image
        task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
        base_size: Base size for model processing
        image_size: Target image size
        crop_mode: Whether to use crop mode

    Returns:
        Extracted text or markdown content
    """
    if image_input is None:
        return "Please upload an image first."

    try:
        # model.infer expects an image path on disk, so save the upload to a
        # temp JPEG; convert to RGB first because JPEG cannot store an alpha
        # channel and webcam/clipboard sources may supply RGBA images
        temp_image_path = "/tmp/temp_ocr_image.jpg"
        image_input.convert("RGB").save(temp_image_path)

        # Both prompts come from the DeepSeek-OCR model card; <|grounding|>
        # requests layout-grounded output for document-to-markdown conversion
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Run inference
        output = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path="",
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
        )

        return output if output else "No text detected in image."

    except Exception as e:
        return f"Error processing image: {e}"
    finally:
        # Always remove the temp file, even if inference raised
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)


# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR") as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>πŸ” DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )

            gr.Markdown("### Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract text | Markdown: Convert document to markdown",
            )
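            # The defaults below (base_size=1024, image_size=640, crop mode
            # enabled) match the model card's dynamic-resolution "Gundam" mode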

            base_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=1024,
                label="Base Size",
                info="Model processing size (larger = better quality, slower)",
            )

            image_size = gr.Slider(
                minimum=512,
                maximum=1280,
                step=128,
                value=640,
                label="Image Size",
                info="Target image size",
            )

            crop_mode = gr.Checkbox(
                value=True,
                label="Crop Mode",
                info="Tile large documents into crops plus a global view (dynamic resolution)",
            )

            submit_btn = gr.Button("πŸš€ Extract Text", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Output")
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
                placeholder="Text will appear here...",
            )

            copy_btn = gr.Button("πŸ“‹ Copy Output")

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, base_size, image_size, crop_mode],
        outputs=output_text,
    )

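    # Copy runs entirely in the browser via the js callback, so no Python
    # function or server round-trip is needed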
    copy_btn.click(
        fn=None,
        inputs=output_text,
        outputs=None,
        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); }",
    )

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
            [
                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
                "markdown",
            ],
        ],
        inputs=[image_input, task_type],
        label="Try these examples",
    )


if __name__ == "__main__":
    demo.launch(share=False)