DeepSeek-OCR-DEMO

Running on Zero

App Files Community

khang119966 committed 18 days ago

Commit

2fd9f05

verified ·

1 Parent(s): 3b28ff1

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -70

app.py CHANGED Viewed

@@ -32,11 +32,12 @@ def find_result_image(path):
                 print(f"Error opening result image {filename}: {e}")
     return None
-# --- 2. Main Processing Function (UPDATED) ---
 @spaces.GPU
 def process_ocr_task(image, model_size, task_type, ref_text):
     """
     Processes an image with DeepSeek-OCR for all supported tasks.
     """
     if image is None:
         return "Please upload an image first.", None
@@ -89,48 +90,45 @@ def process_ocr_task(image, model_size, task_type, ref_text):
         print(f"====\n📄 Text Result: {text_result}\n====")
-        # --- NEW: Handle the output with custom bounding box drawing ---
         result_image_pil = None
-        if task_type == "🔍 Locate Object by Reference":
-            # Define the pattern to find coordinates like [[280, 15, 696, 997]]
-            pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
-            match = pattern.search(text_result)
-            if match:
-                print("✅ Found bounding box coordinates. Drawing on the original image.")
                 # Extract coordinates as integers
                 coords_norm = [int(c) for c in match.groups()]
                 x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
-                # Get the original image's dimensions
-                w, h = image.size
                 # Scale the normalized coordinates (from 1000x1000 space) to the image's actual size
                 x1 = int(x1_norm / 1000 * w)
                 y1 = int(y1_norm / 1000 * h)
                 x2 = int(x2_norm / 1000 * w)
                 y2 = int(y2_norm / 1000 * h)
-                # Create a copy of the original image to draw on
-                image_with_bbox = image.copy()
-                draw = ImageDraw.Draw(image_with_bbox)
                 # Draw the rectangle with a red outline, 3 pixels wide
                 draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
-                result_image_pil = image_with_bbox
-            else:
-                print("⚠️ Could not parse bbox from text. Falling back to searching for a result image.")
-                result_image_pil = find_result_image(output_path)
         else:
-            # For other tasks, use the old method of finding the generated image
             result_image_pil = find_result_image(output_path)
         return text_result, result_image_pil
-# --- 3. Build the Gradio Interface ---
 with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
@@ -139,37 +137,23 @@ with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
         **💡 How to use:**
         1.  **Upload an image** using the upload box.
-        2.  Select a **Model Size**. `Gundam` is recommended for most documents for a good balance of speed and accuracy.
         3.  Choose a **Task Type**:
-            - **📝 Free OCR**: Extracts raw text from the image. Best for simple text extraction.
-            - **📄 Convert to Markdown**: Converts the entire document into Markdown format, preserving structure like headers, lists, and tables.
-            - **📈 Parse Figure**: Analyzes and extracts structured data from charts, graphs, and geometric figures.
-            - **🔍 Locate Object by Reference**: Finds a specific object or piece of text in the image. You **must** type what you're looking for into the **"Reference Text"** box that appears.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
-            model_size = gr.Dropdown(
-                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
-                value="Gundam (Recommended)",
-                label="⚙️ Model Size",
-            )
-            task_type = gr.Dropdown(
-                choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
-                value="📄 Convert to Markdown",
-                label="🚀 Task Type",
-            )
-            ref_text_input = gr.Textbox(
-                label="📝 Reference Text (for Locate task)",
-                placeholder="e.g., the teacher, 11-2=, a red car...",
-                visible=False, # Initially hidden
-            )
             submit_btn = gr.Button("Process Image", variant="primary")
         with gr.Column(scale=2):
@@ -178,27 +162,12 @@ with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
     # --- UI Interaction Logic ---
     def toggle_ref_text_visibility(task):
-        # If the user selects the 'Locate' task, make the reference textbox visible
-        if task == "🔍 Locate Object by Reference":
-            return gr.Textbox(visible=True)
-        else:
-            return gr.Textbox(visible=False)
-    # When the 'task_type' dropdown changes, call the function to update the visibility
-    task_type.change(
-        fn=toggle_ref_text_visibility,
-        inputs=task_type,
-        outputs=ref_text_input,
-    )
-    # Define what happens when the submit button is clicked
-    submit_btn.click(
-        fn=process_ocr_task,
-        inputs=[image_input, model_size, task_type, ref_text_input],
-        outputs=[output_text, output_image],
-    )
-    # --- Example Images and Tasks ---
     gr.Examples(
         examples=[
             ["doc_markdown.png", "Gundam (Recommended)", "📄 Convert to Markdown", ""],
@@ -215,11 +184,9 @@ with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
 # --- 4. Launch the App ---
 if __name__ == "__main__":
-    # Create an 'examples' directory if it doesn't exist
     if not os.path.exists("examples"):
         os.makedirs("examples")
-    # Please manually download the example images into the "examples" folder.
-    # e.g., doc_markdown.png, chart.png, teacher.png, math_locate.png, receipt.jpg
-    demo.queue(max_size=20)
-    demo.launch(share=True) # Set share=True to create a public link

                 print(f"Error opening result image {filename}: {e}")
     return None
+# --- 2. Main Processing Function (UPDATED for multi-bbox drawing) ---
 @spaces.GPU
 def process_ocr_task(image, model_size, task_type, ref_text):
     """
     Processes an image with DeepSeek-OCR for all supported tasks.
+    Now draws ALL detected bounding boxes for ANY task.
     """
     if image is None:
         return "Please upload an image first.", None
         print(f"====\n📄 Text Result: {text_result}\n====")
+        # --- NEW LOGIC: Always try to find and draw all bounding boxes ---
         result_image_pil = None
+        # Define the pattern to find all coordinates like [[280, 15, 696, 997]]
+        pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
+        matches = list(pattern.finditer(text_result)) # Use finditer to get all matches
+        if matches:
+            print(f"✅ Found {len(matches)} bounding box(es). Drawing on the original image.")
+            # Create a copy of the original image to draw on
+            image_with_bboxes = image.copy()
+            draw = ImageDraw.Draw(image_with_bboxes)
+            w, h = image.size # Get original image dimensions
+            for match in matches:
                 # Extract coordinates as integers
                 coords_norm = [int(c) for c in match.groups()]
                 x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
                 # Scale the normalized coordinates (from 1000x1000 space) to the image's actual size
                 x1 = int(x1_norm / 1000 * w)
                 y1 = int(y1_norm / 1000 * h)
                 x2 = int(x2_norm / 1000 * w)
                 y2 = int(y2_norm / 1000 * h)
                 # Draw the rectangle with a red outline, 3 pixels wide
                 draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
+            result_image_pil = image_with_bboxes
         else:
+            # If no coordinates are found in the text, fall back to finding a pre-generated image
+            print("⚠️ No bounding box coordinates found in text result. Falling back to search for a result image file.")
             result_image_pil = find_result_image(output_path)
         return text_result, result_image_pil
+# --- 3. Build the Gradio Interface (UPDATED) ---
 with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         **💡 How to use:**
         1.  **Upload an image** using the upload box.
+        2.  Select a **Model Size**. `Gundam` is recommended for most documents.
         3.  Choose a **Task Type**:
+            - **📝 Free OCR**: Extracts raw text from the image.
+            - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
+            - **📈 Parse Figure**: Extracts structured data from charts and figures.
+            - **🔍 Locate Object by Reference**: Finds a specific object/text.
+        **⭐️ New Feature**: For **ALL** tasks, if the model detects page elements (text blocks, tables, titles, etc.), it will now draw **red bounding boxes** for them on the result image!
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
+            model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Model Size")
+            task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"], value="📄 Convert to Markdown", label="🚀 Task Type")
+            ref_text_input = gr.Textbox(label="📝 Reference Text (for Locate task)", placeholder="e.g., the teacher, 20-10, a red car...", visible=False)
             submit_btn = gr.Button("Process Image", variant="primary")
         with gr.Column(scale=2):
     # --- UI Interaction Logic ---
     def toggle_ref_text_visibility(task):
+        return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
+    task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
+    submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type, ref_text_input], outputs=[output_text, output_image])
+    # --- UPDATED Example Images and Tasks ---
     gr.Examples(
         examples=[
             ["doc_markdown.png", "Gundam (Recommended)", "📄 Convert to Markdown", ""],
 # --- 4. Launch the App ---
 if __name__ == "__main__":
     if not os.path.exists("examples"):
         os.makedirs("examples")
+    # Make sure to have the correct image files in your "examples" folder
+    # e.g., doc_markdown.png, chart.png, teacher.jpg, math_locate.jpg, receipt.jpg
+    demo.queue(max_size=20).launch(share=True)