akhaliq HF Staff committed on
Commit
ebec941
·
verified ·
1 Parent(s): 1a07c5d

Update Gradio app with multiple files

Browse files
Files changed (1) hide show
  1. app.py +54 -33
app.py CHANGED
@@ -50,12 +50,20 @@ def ocr_process(
50
  # Save image with proper format
51
  temp_image_path = os.path.join(temp_dir, "input_image.jpg")
52
  # Convert RGBA to RGB if necessary
53
- if image_input.mode == 'RGBA':
54
  rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
55
- rgb_image.paste(image_input, mask=image_input.split()[3])
56
- rgb_image.save(temp_image_path, 'JPEG')
 
 
 
 
57
  else:
58
- image_input.save(temp_image_path, 'JPEG')
 
 
 
 
59
 
60
  # Set parameters based on preset
61
  presets = {
@@ -74,12 +82,12 @@ def ocr_process(
74
  else:
75
  prompt = "<image>\nFree OCR. "
76
 
77
- # Run inference
78
  result = model.infer(
79
  tokenizer,
80
  prompt=prompt,
81
  image_file=temp_image_path,
82
- output_path=temp_dir, # Use temp directory for output
83
  base_size=config["base_size"],
84
  image_size=config["image_size"],
85
  crop_mode=config["crop_mode"],
@@ -91,17 +99,31 @@ def ocr_process(
91
  model.to("cpu")
92
  torch.cuda.empty_cache()
93
 
94
- # Return the result
95
- if result:
96
- return result
 
 
 
 
 
 
 
 
 
97
  else:
98
- return "No text detected in the image. Please try a different preset or ensure the image contains readable text."
 
 
 
 
 
99
 
100
  except Exception as e:
101
  # Ensure model is moved back to CPU on error
102
  model.to("cpu")
103
  torch.cuda.empty_cache()
104
- return f"Error processing image: {str(e)}"
105
 
106
 
107
  # Create Gradio interface
@@ -131,32 +153,32 @@ with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
131
  choices=["ocr", "markdown"],
132
  value="ocr",
133
  label="Task Type",
134
- info="OCR: Extract text | Markdown: Convert document to markdown format",
135
  )
136
 
137
  preset = gr.Radio(
138
- choices=["gundam", "tiny", "small", "base", "large"],
139
  value="gundam",
140
  label="Model Preset",
141
- info="Gundam: Optimized for mixed content | Tiny/Small: Fast | Base/Large: High quality",
142
  )
143
 
144
- with gr.Accordion("Preset Details", open=False):
145
  gr.Markdown("""
146
- - **Gundam**: base_size=1024, image_size=640, crop_mode=True (Recommended)
147
- - **Tiny**: base_size=512, image_size=512, crop_mode=False (Fastest)
148
- - **Small**: base_size=640, image_size=640, crop_mode=False
149
- - **Base**: base_size=1024, image_size=1024, crop_mode=False
150
- - **Large**: base_size=1280, image_size=1280, crop_mode=False (Best quality)
151
  """)
152
 
153
  submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
154
  clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")
155
 
156
  with gr.Column(scale=1):
157
- gr.Markdown("### 📝 Output")
158
  output_text = gr.Textbox(
159
- label="Extracted Text",
160
  lines=15,
161
  max_lines=30,
162
  interactive=False,
@@ -171,24 +193,23 @@ with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
171
  outputs=output_text,
172
  )
173
 
174
- # Examples section
175
- gr.Markdown("### 📚 Examples")
176
  gr.Examples(
177
  examples=[
178
- ["example1.jpg", "ocr", "gundam"],
179
- ["example2.jpg", "markdown", "gundam"],
180
  ],
181
  inputs=[image_input, task_type, preset],
182
- label="Try these examples (upload your own images for testing)",
183
  )
184
 
185
  gr.Markdown("""
186
- ### 💡 Tips
187
- - For general OCR, use the "gundam" preset (optimized balance)
188
- - For high-quality scanned documents, try "base" or "large" presets
189
- - For handwritten text, "large" preset may work better
190
- - Use "markdown" mode for structured documents with formatting
191
- - If processing fails, try a different preset
192
  """)
193
 
194
 
 
50
  # Save image with proper format
51
  temp_image_path = os.path.join(temp_dir, "input_image.jpg")
52
  # Convert RGBA to RGB if necessary
53
+ if image_input.mode in ('RGBA', 'LA', 'P'):
54
  rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
55
+ # Handle different image modes
56
+ if image_input.mode == 'RGBA':
57
+ rgb_image.paste(image_input, mask=image_input.split()[3])
58
+ else:
59
+ rgb_image.paste(image_input)
60
+ rgb_image.save(temp_image_path, 'JPEG', quality=95)
61
  else:
62
+ image_input.save(temp_image_path, 'JPEG', quality=95)
63
+
64
+ # Verify image was saved
65
+ if not os.path.exists(temp_image_path):
66
+ return "Error: Failed to save image for processing."
67
 
68
  # Set parameters based on preset
69
  presets = {
 
82
  else:
83
  prompt = "<image>\nFree OCR. "
84
 
85
+ # Run inference - the model returns the text directly
86
  result = model.infer(
87
  tokenizer,
88
  prompt=prompt,
89
  image_file=temp_image_path,
90
+ output_path=temp_dir,
91
  base_size=config["base_size"],
92
  image_size=config["image_size"],
93
  crop_mode=config["crop_mode"],
 
99
  model.to("cpu")
100
  torch.cuda.empty_cache()
101
 
102
+ # Process the result
103
+ if result is None:
104
+ return "No text could be extracted. The image might be too blurry or contain no readable text."
105
+
106
+ # Handle different result types
107
+ if isinstance(result, str):
108
+ output_text = result.strip()
109
+ elif isinstance(result, (list, tuple)) and len(result) > 0:
110
+ output_text = str(result[0]).strip()
111
+ elif isinstance(result, dict):
112
+ # Try to get text from common keys
113
+ output_text = result.get('text', result.get('output', result.get('result', str(result))))
114
  else:
115
+ output_text = str(result).strip()
116
+
117
+ if not output_text or output_text == "None":
118
+ return "No text detected. Try adjusting the preset or uploading a clearer image."
119
+
120
+ return output_text
121
 
122
  except Exception as e:
123
  # Ensure model is moved back to CPU on error
124
  model.to("cpu")
125
  torch.cuda.empty_cache()
126
+ return f"Error processing image: {str(e)}\n\nPlease try a different preset or check if the image is valid."
127
 
128
 
129
  # Create Gradio interface
 
153
  choices=["ocr", "markdown"],
154
  value="ocr",
155
  label="Task Type",
156
+ info="OCR: Extract plain text | Markdown: Convert to formatted markdown",
157
  )
158
 
159
  preset = gr.Radio(
160
+ choices=["gundam", "base", "large", "small", "tiny"],
161
  value="gundam",
162
  label="Model Preset",
163
+ info="Start with 'gundam' - it's optimized for most documents",
164
  )
165
 
166
+ with gr.Accordion("ℹ️ Preset Details", open=False):
167
  gr.Markdown("""
168
+ - **Gundam** (Recommended): Balanced performance with crop mode
169
+ - **Base**: Standard quality without cropping
170
+ - **Large**: Highest quality for complex documents
171
+ - **Small**: Faster processing, good for simple text
172
+ - **Tiny**: Fastest, suitable for clear printed text
173
  """)
174
 
175
  submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
176
  clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")
177
 
178
  with gr.Column(scale=1):
179
+ gr.Markdown("### 📝 Extracted Text")
180
  output_text = gr.Textbox(
181
+ label="Output",
182
  lines=15,
183
  max_lines=30,
184
  interactive=False,
 
193
  outputs=output_text,
194
  )
195
 
196
+ # Example section with receipt image
197
+ gr.Markdown("### 📚 Example")
198
  gr.Examples(
199
  examples=[
200
+ ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
 
201
  ],
202
  inputs=[image_input, task_type, preset],
203
+ label="Try this receipt example",
204
  )
205
 
206
  gr.Markdown("""
207
+ ### 💡 Tips for Best Results
208
+ - **For receipts**: Use "ocr" mode with "gundam" or "base" preset
209
+ - **For documents with tables**: Use "markdown" mode with "large" preset
210
+ - **If text is not detected**: Try different presets in this order: gundam β†’ base β†’ large
211
+ - **For handwritten text**: Use "large" preset for better accuracy
212
+ - Ensure images are clear and well-lit for optimal results
213
  """)
214
 
215