Spaces:

lucacadalora
/

jatevo

Runtime error

App Files Files Community

lucacadalora commited on Oct 21

Commit

6152a26

verified ·

1 Parent(s): 0bb6b0a

Update app.py

Browse files

Files changed (1) hide show

app.py +179 -24

app.py CHANGED Viewed

@@ -1,33 +1,188 @@
-import os
-os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # optional speed-up
 import gradio as gr
 import torch
-from transformers import pipeline
-# important: trust_remote_code=True avoids the interactive prompt
-dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-ocr = pipeline(
-    "image-to-text",
-    model="deepseek-ai/DeepSeek-OCR",
     trust_remote_code=True,
-    device_map="auto",          # uses GPU if available, else CPU
-    torch_dtype=dtype,
 )
-def ocr_image(image):
-    out = ocr(image)
-    # pipeline returns a list of dicts like [{'generated_text': '...'}]
-    return out[0]["generated_text"] if out else ""
-demo = gr.Interface(
-    fn=ocr_image,
-    inputs=gr.Image(type="pil"),
-    outputs="text",
-    title="DeepSeek OCR",
-    description="Upload an image and get extracted text using DeepSeek-OCR."
-)
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import torch
+from transformers import AutoModel, AutoTokenizer
+import spaces
+import os
+import tempfile
+from PIL import Image
+# Load model and tokenizer
+model_name = "deepseek-ai/DeepSeek-OCR"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    model_name,
+    _attn_implementation="flash_attention_2",
     trust_remote_code=True,
+    use_safetensors=True,
 )
+model = model.eval()
+@spaces.GPU
+def process_image(image, model_size, task_type, is_eval_mode):
+    """
+    Process image with DeepSeek-OCR and return multiple output formats.
+    Args:
+        image: PIL Image or file path
+        model_size: Model size configuration
+        task_type: OCR task type
+    Returns:
+        A tuple containing:
+        - Path to the image with bounding boxes.
+        - The content of the markdown result file.
+        - The plain text OCR result.
+    """
+    if image is None:
+        return None, "Please upload an image first.", "Please upload an image first."
+    model_gpu = model.cuda().to(torch.bfloat16)
+    # Create temporary directory for output
+    with tempfile.TemporaryDirectory() as output_path:
+        # Set prompt based on task type
+        if task_type == "Free OCR":
+            prompt = "<image>\nFree OCR. "
+        elif task_type == "Convert to Markdown":
+            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
+        else:
+            prompt = "<image>\nFree OCR. "
+        # Save uploaded image temporarily
+        temp_image_path = os.path.join(output_path, "temp_image.jpg")
+        image.save(temp_image_path)
+        # Configure model size parameters
+        size_configs = {
+            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
+            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
+            "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+            "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+            "Gundam (Recommended)": {
+                "base_size": 1024,
+                "image_size": 640,
+                "crop_mode": True,
+            },
+        }
+        config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
+        # Run inference
+        plain_text_result = model_gpu.infer(
+            tokenizer,
+            prompt=prompt,
+            image_file=temp_image_path,
+            output_path=output_path,
+            base_size=config["base_size"],
+            image_size=config["image_size"],
+            crop_mode=config["crop_mode"],
+            save_results=True,  # Ensure results are saved to disk
+            test_compress=True,
+            eval_mode=is_eval_mode,
+        )
+        # Define paths for the generated files
+        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
+        markdown_result_path = os.path.join(output_path, "result.mmd")
+        # Read the markdown file content if it exists
+        markdown_content = ""
+        if os.path.exists(markdown_result_path):
+            with open(markdown_result_path, "r", encoding="utf-8") as f:
+                markdown_content = f.read()
+        else:
+            markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."
+        result_image = None
+        # Check if the annotated image exists
+        if os.path.exists(image_result_path):
+            result_image = Image.open(image_result_path)
+            result_image.load()
+        # Return all three results. Gradio will handle the temporary file path for the image.
+        text_result = plain_text_result if plain_text_result else markdown_content
+        return result_image, markdown_content, text_result
+# Create Gradio interface
+with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # DeepSeek-OCR Demo
+        Upload an image to extract text using DeepSeek-OCR model.
+        Supports various document types and handwriting recognition.
+        **Model Sizes:**
+        - **Tiny**: Fastest, lower accuracy (512x512)
+        - **Small**: Fast, good accuracy (640x640)
+        - **Base**: Balanced performance (1024x1024)
+        - **Large**: Best accuracy, slower (1280x1280)
+        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(
+                type="pil", label="Upload Image", sources=["upload", "clipboard"]
+            )
+            model_size = gr.Dropdown(
+                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                value="Gundam (Recommended)",
+                label="Model Size",
+            )
+            task_type = gr.Dropdown(
+                choices=["Free OCR", "Convert to Markdown"],
+                value="Convert to Markdown",
+                label="Task Type",
+            )
+            eval_mode_checkbox = gr.Checkbox(
+                value=False,
+                label="Enable Evaluation Mode",
+                info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
+            )
+            submit_btn = gr.Button("Process Image", variant="primary")
+        with gr.Column(scale=2):
+            with gr.Tabs():
+                with gr.TabItem("Annotated Image"):
+                    output_image = gr.Image(
+                        interactive=False
+                    )
+                with gr.TabItem("Markdown Preview"):
+                    output_markdown = gr.Markdown()
+                with gr.TabItem("Markdown Source(or Eval Output)"):
+                    output_text = gr.Textbox(
+                        lines=20,
+                        show_copy_button=True,
+                        interactive=False,
+                    )
+    # Examples
+    gr.Examples(
+        examples=[
+            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
+            ["examples/receipt.jpg", "Base", "Convert to Markdown"],
+            ["examples/receipt-2.png", "Base", "Convert to Markdown"],
+        ],
+        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
+        outputs=[output_image, output_markdown, output_text],
+        fn=process_image,
+        cache_examples=True,
+    )
+    submit_btn.click(
+        fn=process_image,
+        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
+        outputs=[output_image, output_markdown, output_text],
+    )
+# Launch the app
 if __name__ == "__main__":
+    demo.queue(max_size=20)
     demo.launch()