khang119966 committed on
Commit
2fd9f05
·
verified ·
1 Parent(s): 3b28ff1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -70
app.py CHANGED
@@ -32,11 +32,12 @@ def find_result_image(path):
32
  print(f"Error opening result image {filename}: {e}")
33
  return None
34
 
35
- # --- 2. Main Processing Function (UPDATED) ---
36
  @spaces.GPU
37
  def process_ocr_task(image, model_size, task_type, ref_text):
38
  """
39
  Processes an image with DeepSeek-OCR for all supported tasks.
 
40
  """
41
  if image is None:
42
  return "Please upload an image first.", None
@@ -89,48 +90,45 @@ def process_ocr_task(image, model_size, task_type, ref_text):
89
 
90
  print(f"====\n📄 Text Result: {text_result}\n====")
91
 
92
- # --- NEW: Handle the output with custom bounding box drawing ---
93
  result_image_pil = None
94
 
95
- if task_type == "🔍 Locate Object by Reference":
96
- # Define the pattern to find coordinates like [[280, 15, 696, 997]]
97
- pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
98
- match = pattern.search(text_result)
99
 
100
- if match:
101
- print("✅ Found bounding box coordinates. Drawing on the original image.")
 
 
 
 
 
 
 
102
  # Extract coordinates as integers
103
  coords_norm = [int(c) for c in match.groups()]
104
  x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
105
 
106
- # Get the original image's dimensions
107
- w, h = image.size
108
-
109
  # Scale the normalized coordinates (from 1000x1000 space) to the image's actual size
110
  x1 = int(x1_norm / 1000 * w)
111
  y1 = int(y1_norm / 1000 * h)
112
  x2 = int(x2_norm / 1000 * w)
113
  y2 = int(y2_norm / 1000 * h)
114
 
115
- # Create a copy of the original image to draw on
116
- image_with_bbox = image.copy()
117
- draw = ImageDraw.Draw(image_with_bbox)
118
-
119
  # Draw the rectangle with a red outline, 3 pixels wide
120
  draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
121
-
122
- result_image_pil = image_with_bbox
123
- else:
124
- print("⚠️ Could not parse bbox from text. Falling back to searching for a result image.")
125
- result_image_pil = find_result_image(output_path)
126
  else:
127
- # For other tasks, use the old method of finding the generated image
 
128
  result_image_pil = find_result_image(output_path)
129
 
130
  return text_result, result_image_pil
131
 
132
 
133
- # --- 3. Build the Gradio Interface ---
134
  with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
135
  gr.Markdown(
136
  """
@@ -139,37 +137,23 @@ with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
139
 
140
  **💡 How to use:**
141
  1. **Upload an image** using the upload box.
142
- 2. Select a **Model Size**. `Gundam` is recommended for most documents for a good balance of speed and accuracy.
143
  3. Choose a **Task Type**:
144
- - **📝 Free OCR**: Extracts raw text from the image. Best for simple text extraction.
145
- - **📄 Convert to Markdown**: Converts the entire document into Markdown format, preserving structure like headers, lists, and tables.
146
- - **📈 Parse Figure**: Analyzes and extracts structured data from charts, graphs, and geometric figures.
147
- - **🔍 Locate Object by Reference**: Finds a specific object or piece of text in the image. You **must** type what you're looking for into the **"Reference Text"** box that appears.
 
 
148
  """
149
  )
150
 
151
  with gr.Row():
152
  with gr.Column(scale=1):
153
  image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
154
-
155
- model_size = gr.Dropdown(
156
- choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
157
- value="Gundam (Recommended)",
158
- label="⚙️ Model Size",
159
- )
160
-
161
- task_type = gr.Dropdown(
162
- choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
163
- value="📄 Convert to Markdown",
164
- label="🚀 Task Type",
165
- )
166
-
167
- ref_text_input = gr.Textbox(
168
- label="📝 Reference Text (for Locate task)",
169
- placeholder="e.g., the teacher, 11-2=, a red car...",
170
- visible=False, # Initially hidden
171
- )
172
-
173
  submit_btn = gr.Button("Process Image", variant="primary")
174
 
175
  with gr.Column(scale=2):
@@ -178,27 +162,12 @@ with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
178
 
179
  # --- UI Interaction Logic ---
180
  def toggle_ref_text_visibility(task):
181
- # If the user selects the 'Locate' task, make the reference textbox visible
182
- if task == "🔍 Locate Object by Reference":
183
- return gr.Textbox(visible=True)
184
- else:
185
- return gr.Textbox(visible=False)
186
 
187
- # When the 'task_type' dropdown changes, call the function to update the visibility
188
- task_type.change(
189
- fn=toggle_ref_text_visibility,
190
- inputs=task_type,
191
- outputs=ref_text_input,
192
- )
193
-
194
- # Define what happens when the submit button is clicked
195
- submit_btn.click(
196
- fn=process_ocr_task,
197
- inputs=[image_input, model_size, task_type, ref_text_input],
198
- outputs=[output_text, output_image],
199
- )
200
 
201
- # --- Example Images and Tasks ---
202
  gr.Examples(
203
  examples=[
204
  ["doc_markdown.png", "Gundam (Recommended)", "📄 Convert to Markdown", ""],
@@ -215,11 +184,9 @@ with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
215
 
216
  # --- 4. Launch the App ---
217
  if __name__ == "__main__":
218
- # Create an 'examples' directory if it doesn't exist
219
  if not os.path.exists("examples"):
220
  os.makedirs("examples")
221
- # Please manually download the example images into the "examples" folder.
222
- # e.g., doc_markdown.png, chart.png, teacher.png, math_locate.png, receipt.jpg
223
 
224
- demo.queue(max_size=20)
225
- demo.launch(share=True) # Set share=True to create a public link
 
32
  print(f"Error opening result image {filename}: {e}")
33
  return None
34
 
35
+ # --- 2. Main Processing Function (UPDATED for multi-bbox drawing) ---
36
  @spaces.GPU
37
  def process_ocr_task(image, model_size, task_type, ref_text):
38
  """
39
  Processes an image with DeepSeek-OCR for all supported tasks.
40
+ Now draws ALL detected bounding boxes for ANY task.
41
  """
42
  if image is None:
43
  return "Please upload an image first.", None
 
90
 
91
  print(f"====\n📄 Text Result: {text_result}\n====")
92
 
93
+ # --- NEW LOGIC: Always try to find and draw all bounding boxes ---
94
  result_image_pil = None
95
 
96
+ # Define the pattern to find all coordinates like [[280, 15, 696, 997]]
97
+ pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
98
+ matches = list(pattern.finditer(text_result)) # Use finditer to get all matches
 
99
 
100
+ if matches:
101
+ print(f"✅ Found {len(matches)} bounding box(es). Drawing on the original image.")
102
+
103
+ # Create a copy of the original image to draw on
104
+ image_with_bboxes = image.copy()
105
+ draw = ImageDraw.Draw(image_with_bboxes)
106
+ w, h = image.size # Get original image dimensions
107
+
108
+ for match in matches:
109
  # Extract coordinates as integers
110
  coords_norm = [int(c) for c in match.groups()]
111
  x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
112
 
 
 
 
113
  # Scale the normalized coordinates (from 1000x1000 space) to the image's actual size
114
  x1 = int(x1_norm / 1000 * w)
115
  y1 = int(y1_norm / 1000 * h)
116
  x2 = int(x2_norm / 1000 * w)
117
  y2 = int(y2_norm / 1000 * h)
118
 
 
 
 
 
119
  # Draw the rectangle with a red outline, 3 pixels wide
120
  draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
121
+
122
+ result_image_pil = image_with_bboxes
 
 
 
123
  else:
124
+ # If no coordinates are found in the text, fall back to finding a pre-generated image
125
+ print("⚠️ No bounding box coordinates found in text result. Falling back to search for a result image file.")
126
  result_image_pil = find_result_image(output_path)
127
 
128
  return text_result, result_image_pil
129
 
130
 
131
+ # --- 3. Build the Gradio Interface (UPDATED) ---
132
  with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
133
  gr.Markdown(
134
  """
 
137
 
138
  **💡 How to use:**
139
  1. **Upload an image** using the upload box.
140
+ 2. Select a **Model Size**. `Gundam` is recommended for most documents.
141
  3. Choose a **Task Type**:
142
+ - **📝 Free OCR**: Extracts raw text from the image.
143
+ - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
144
+ - **📈 Parse Figure**: Extracts structured data from charts and figures.
145
+ - **🔍 Locate Object by Reference**: Finds a specific object/text.
146
+
147
+ **⭐️ New Feature**: For **ALL** tasks, if the model detects page elements (text blocks, tables, titles, etc.), it will now draw **red bounding boxes** for them on the result image!
148
  """
149
  )
150
 
151
  with gr.Row():
152
  with gr.Column(scale=1):
153
  image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
154
+ model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Model Size")
155
+ task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"], value="📄 Convert to Markdown", label="🚀 Task Type")
156
+ ref_text_input = gr.Textbox(label="📝 Reference Text (for Locate task)", placeholder="e.g., the teacher, 20-10, a red car...", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  submit_btn = gr.Button("Process Image", variant="primary")
158
 
159
  with gr.Column(scale=2):
 
162
 
163
  # --- UI Interaction Logic ---
164
  def toggle_ref_text_visibility(task):
165
+ return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
 
 
 
 
166
 
167
+ task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
168
+ submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type, ref_text_input], outputs=[output_text, output_image])
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ # --- UPDATED Example Images and Tasks ---
171
  gr.Examples(
172
  examples=[
173
  ["doc_markdown.png", "Gundam (Recommended)", "📄 Convert to Markdown", ""],
 
184
 
185
  # --- 4. Launch the App ---
186
  if __name__ == "__main__":
 
187
  if not os.path.exists("examples"):
188
  os.makedirs("examples")
189
+ # Make sure to have the correct image files in your "examples" folder
190
+ # e.g., doc_markdown.png, chart.png, teacher.jpg, math_locate.jpg, receipt.jpg
191
 
192
+ demo.queue(max_size=20).launch(share=True)