lucacadalora committed on
Commit
6152a26
·
verified ·
1 Parent(s): 0bb6b0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -24
app.py CHANGED
@@ -1,33 +1,188 @@
1
- import os
2
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # optional speed-up
3
-
4
  import gradio as gr
5
  import torch
6
- from transformers import pipeline
7
-
8
- # important: trust_remote_code=True avoids the interactive prompt
9
- dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
10
 
11
- ocr = pipeline(
12
- "image-to-text",
13
- model="deepseek-ai/DeepSeek-OCR",
 
 
 
14
  trust_remote_code=True,
15
- device_map="auto", # uses GPU if available, else CPU
16
- torch_dtype=dtype,
17
  )
 
18
 
19
- def ocr_image(image):
20
- out = ocr(image)
21
- # pipeline returns a list of dicts like [{'generated_text': '...'}]
22
- return out[0]["generated_text"] if out else ""
23
-
24
- demo = gr.Interface(
25
- fn=ocr_image,
26
- inputs=gr.Image(type="pil"),
27
- outputs="text",
28
- title="DeepSeek OCR",
29
- description="Upload an image and get extracted text using DeepSeek-OCR."
30
- )
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  if __name__ == "__main__":
 
33
  demo.launch()
 
 
 
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoModel, AutoTokenizer
4
+ import spaces
5
+ import os
6
+ import tempfile
7
+ from PIL import Image
8
 
9
# Load the tokenizer and model once at import time so every request reuses them.
model_name = "deepseek-ai/DeepSeek-OCR"

# trust_remote_code=True: the checkpoint ships its own modelling code, and
# passing the flag avoids the interactive confirmation prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Put the model in inference mode right away; it is moved to the GPU per call
# inside process_image.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_safetensors=True,
    _attn_implementation="flash_attention_2",
).eval()
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
# Prompt sent to the model for each task type offered in the UI.
_TASK_PROMPTS = {
    "Free OCR": "<image>\nFree OCR. ",
    "Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown. ",
}
_DEFAULT_TASK = "Free OCR"

# Resolution presets forwarded to ``model.infer``.
_SIZE_CONFIGS = {
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "Gundam (Recommended)": {
        "base_size": 1024,
        "image_size": 640,
        "crop_mode": True,
    },
}
_DEFAULT_SIZE = "Gundam (Recommended)"


def _save_as_jpeg(image, path):
    """Write *image* (a PIL image or a path to one) to *path* as an RGB JPEG."""
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    elif image.mode != "RGB":
        # JPEG cannot store alpha or palette data; saving RGBA/P/LA images
        # directly raises OSError, which is common for PNG uploads.
        image = image.convert("RGB")
    image.save(path)


@spaces.GPU
def process_image(image, model_size, task_type, is_eval_mode=False):
    """
    Process an image with DeepSeek-OCR and return multiple output formats.

    Args:
        image: PIL Image, or a path to an image file. ``None`` yields a
            "please upload" message instead of running the model.
        model_size: Name of a resolution preset; unknown names fall back to
            "Gundam (Recommended)".
        task_type: "Free OCR" or "Convert to Markdown"; unknown values fall
            back to the "Free OCR" prompt.
        is_eval_mode: Forwarded to ``model.infer`` as ``eval_mode``. Defaults
            to ``False`` so callers that supply only the first three
            arguments (such as three-column example rows) still work.

    Returns:
        A tuple containing:
        - The annotated image with bounding boxes as a loaded PIL image, or
          ``None`` if the model did not write one.
        - The content of the markdown result file, or an explanatory message
          if no markdown file was produced.
        - The plain text returned by ``model.infer``; when that is empty,
          the markdown content is used instead.
    """
    if image is None:
        return None, "Please upload an image first.", "Please upload an image first."

    model_gpu = model.cuda().to(torch.bfloat16)

    prompt = _TASK_PROMPTS.get(task_type, _TASK_PROMPTS[_DEFAULT_TASK])
    config = _SIZE_CONFIGS.get(model_size, _SIZE_CONFIGS[_DEFAULT_SIZE])

    # Everything the model writes lives in a scratch directory that is removed
    # when this block exits, so results must be read into memory before then.
    with tempfile.TemporaryDirectory() as output_path:
        temp_image_path = os.path.join(output_path, "temp_image.jpg")
        _save_as_jpeg(image, temp_image_path)

        plain_text_result = model_gpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,  # Ensure results are saved to disk
            test_compress=True,
            eval_mode=is_eval_mode,
        )

        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
        markdown_result_path = os.path.join(output_path, "result.mmd")

        if os.path.exists(markdown_result_path):
            with open(markdown_result_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        else:
            markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."

        result_image = None
        if os.path.exists(image_result_path):
            result_image = Image.open(image_result_path)
            # Force the pixel data into memory now: the file disappears with
            # the temporary directory as soon as we return.
            result_image.load()

        text_result = plain_text_result if plain_text_result else markdown_content
        return result_image, markdown_content, text_result
105
+
106
+
107
# Build the Gradio interface: inputs on the left, tabbed results on the right.
with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Demo

        Upload an image to extract text using DeepSeek-OCR model.
        Supports various document types and handwriting recognition.

        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil", label="Upload Image", sources=["upload", "clipboard"]
            )

            model_size = gr.Dropdown(
                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                value="Gundam (Recommended)",
                label="Model Size",
            )

            task_type = gr.Dropdown(
                choices=["Free OCR", "Convert to Markdown"],
                value="Convert to Markdown",
                label="Task Type",
            )

            eval_mode_checkbox = gr.Checkbox(
                value=False,
                label="Enable Evaluation Mode",
                info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
            )

            submit_btn = gr.Button("Process Image", variant="primary")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Annotated Image"):
                    output_image = gr.Image(
                        interactive=False
                    )
                with gr.TabItem("Markdown Preview"):
                    output_markdown = gr.Markdown()
                with gr.TabItem("Markdown Source(or Eval Output)"):
                    output_text = gr.Textbox(
                        lines=20,
                        show_copy_button=True,
                        interactive=False,
                    )

    # Examples. Each row supplies one value per component in `inputs`
    # (image, model size, task type, eval-mode flag), so the cached example
    # runs call process_image with all four arguments.
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown", False],
            ["examples/receipt.jpg", "Base", "Convert to Markdown", False],
            ["examples/receipt-2.png", "Base", "Convert to Markdown", False],
        ],
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
        fn=process_image,
        cache_examples=True,
    )

    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
        outputs=[output_image, output_markdown, output_text],
    )

# Launch the app
if __name__ == "__main__":
    # Cap the number of queued requests at 20.
    demo.queue(max_size=20)
    demo.launch()