import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
import os
import tempfile
from PIL import Image

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
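# Note: the model stays on the CPU at load time. On ZeroGPU Spaces a GPU is
# only attached while a @spaces.GPU-decorated function runs, so the .cuda() /
# .to(bfloat16) move is deferred to process_image below.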


@spaces.GPU
def process_image(image, model_size, task_type, is_eval_mode):
    """
    Process image with DeepSeek-OCR and return multiple output formats.

    Args:
        image: PIL Image or file path
        model_size: Model size configuration
        task_type: OCR task type
        is_eval_mode: Whether to run inference in evaluation mode (plain text output only)

    Returns:
        A tuple containing:
        - The annotated image with bounding boxes (a PIL.Image, or None).
        - The content of the markdown result file.
        - The plain text OCR result.
    """
    if image is None:
        return None, "Please upload an image first.", "Please upload an image first."

    model_gpu = model.cuda().to(torch.bfloat16)

    # Create temporary directory for output
    with tempfile.TemporaryDirectory() as output_path:
        # Set prompt based on task type
        if task_type == "Free OCR":
            prompt = "<image>\nFree OCR. "
        elif task_type == "Convert to Markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Save uploaded image temporarily
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        image.save(temp_image_path)

        # Configure model size parameters
        size_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "Gundam (Recommended)": {
                "base_size": 1024,
                "image_size": 640,
                "crop_mode": True,
            },
        }
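        # "Gundam" combines a 1024px global view with 640px local crops
        # (crop_mode=True); the model card recommends it for dense documents.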

        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])

        # Run inference
        plain_text_result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,  # Ensure results are saved to disk
            test_compress=True,
            eval_mode=is_eval_mode,
        )
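
        # When eval_mode is True, infer() is expected to return the plain-text
        # transcription directly; otherwise results are read back from the
        # files written to output_path below.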

        # Define paths for the generated files
        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
        markdown_result_path = os.path.join(output_path, "result.mmd")

        # Read the markdown file content if it exists
        markdown_content = ""
        if os.path.exists(markdown_result_path):
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        else:
            markdown_content = "Markdown result was not generated. This is expected for the 'Free OCR' task."

        result_image = None
        # Check if the annotated image exists
        if os.path.exists(image_result_path):
            result_image = Image.open(image_result_path)
            result_image.load()

        # Return all three results. The annotated image was fully loaded above,
        # since the temporary directory is deleted when this block exits.
        text_result = plain_text_result if plain_text_result else markdown_content
        return result_image, markdown_content, text_result
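

# A minimal smoke test for local runs (hypothetical file path; assumes a CUDA
# GPU is available outside the Spaces runtime):
#
#   from PIL import Image
#   img = Image.open("examples/receipt.jpg")
#   boxes_img, md, text = process_image(img, "Gundam (Recommended)", "Convert to Markdown", False)
#   print(text)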


# Create Gradio interface
with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Demo
        
        Upload an image to extract text with the DeepSeek-OCR model.
        Supports various document types and handwriting recognition.
        
        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )

            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )

            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )

            eval_mode_checkbox = gr.Checkbox(
                value=False,
                label="Enable Evaluation Mode",
                info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
            )

            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Annotated Image"):
                    output_image = gr.Image(interactive=False)
                with gr.TabItem("Markdown Preview"):
                    output_markdown = gr.Markdown()
                with gr.TabItem("Markdown Source(or Eval Output)"):
                    output_text = gr.Textbox(
                        lines=20,
                        show_copy_button=True,
                        interactive=False,
                    )

    # Examples
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown", False],
            ["examples/receipt.jpg", "Base", "Convert to Markdown", False],
            ["examples/receipt-2.png", "Base", "Convert to Markdown", False],
        ],
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
        fn=process_image,
        cache_examples=True,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
    )

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()