import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
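# `spaces` is the Hugging Face package that provides the @spaces.GPU decorator,
# which requests a GPU for the decorated call on ZeroGPU Spaces.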

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
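# DeepSeek-OCR ships custom modeling code, hence trust_remote_code=True.
# flash_attention_2 requires the flash-attn package to be installed; if it is
# unavailable, dropping _attn_implementation (or using "eager") is a plausible
# fallback, though that is an untested assumption here.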
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()


@spaces.GPU
def process_image(image, model_size, task_type):
    """
    Process image with DeepSeek-OCR

    Args:
        image: PIL Image or file path
        model_size: Model size configuration
        task_type: OCR task type
    """
    if image is None:
        return "Please upload an image first."

    # Move the model to the GPU inside the @spaces.GPU-decorated function
    # (on ZeroGPU, a GPU is only attached for the duration of this call)
    model_gpu = model.cuda().to(torch.bfloat16)

    # Create temporary directory for output
    with tempfile.TemporaryDirectory() as output_path:
        # Set prompt based on task type
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR. "
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Save the uploaded image temporarily; convert to RGB so images with an
        # alpha channel (e.g. RGBA PNGs) can be written as JPEG
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.convert("RGB").save(temp_image_path)

        # Configure model size parameters
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {
                "base_size": 1024,
                "image_size": 640,
                "crop_mode": True,
            },
        }
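        # "Gundam" corresponds to DeepSeek-OCR's dynamic-resolution mode: crop_mode
        # tiles the page into 640px crops around a 1024px global view, which is why
        # it is the recommended preset for dense documents.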

        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        # Run inference
        result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )
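        # In eval_mode the call returns the decoded text directly; any files written
        # by save_results=True go to output_path and are discarded with the temp directory.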

        print(f"====\nresult: {result}\n====\n")
        return result


# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR") as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Document Recognition
        
        Upload an image to extract text with the DeepSeek-OCR model.
        Supports various document types and handwriting recognition.
        
        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )

            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )

            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )

            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(
                label="OCR Result", lines=20, show_copy_button=True
            )

    # Examples
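    # (The example files below are expected to live in an examples/ directory alongside app.py.)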
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
            ["examples/receipt.jpg", "Base", "Free OCR"],
        ],
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
        fn=process_image,
        cache_examples=False,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
    )

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()