prithivMLmods committed
Commit 7b33c74 · verified · 1 Parent(s): 0389cd8

Update app.py

Files changed (1)
  1. app.py +159 -168
app.py CHANGED
@@ -1,184 +1,175 @@
  import gradio as gr
  import torch
- from transformers import AutoModel, AutoTokenizer
- import spaces
- import os
- import tempfile
- from PIL import Image, ImageDraw
- import re  # Import the regular expression library
-
- # --- 1. Load Model and Tokenizer (Done only once at startup) ---
- print("Loading model and tokenizer...")
- model_name = "strangervisionhf/deepseek-ocr-transformers-v5"
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
- # --- FIX 1: Resolve Tokenizer Warnings ---
- # Explicitly set the pad_token_id to the eos_token_id. This is a common setup for
- # models that are used for open-ended text generation. It resolves the warning.
- tokenizer.pad_token_id = tokenizer.eos_token_id
-
- # Load the model to CPU first; it will be moved to GPU during processing
- model = AutoModel.from_pretrained(
-     model_name,
-     torch_dtype=torch.bfloat16,  # Use bfloat16 for performance and compatibility
-     trust_remote_code=True,
-     use_safetensors=True,
  )

- # --- FIX 2: Prevent AttributeError ---
- # The model's code is incompatible with the newer 'DynamicCache' in transformers.
- # Disabling the cache prevents the error-causing code path from being executed.
- # This may slightly slow down inference but ensures stability.
- model.config.use_cache = False
-
- model = model.eval()
- print("✅ Model loaded successfully.")
-
- # --- Helper function to find pre-generated result images ---
- def find_result_image(path):
-     for filename in os.listdir(path):
-         if "grounding" in filename or "result" in filename:
-             try:
-                 image_path = os.path.join(path, filename)
-                 return Image.open(image_path)
-             except Exception as e:
-                 print(f"Error opening result image {filename}: {e}")
-     return None
-
- # --- 2. Main Processing Function (UPDATED for multi-bbox drawing) ---
- @spaces.GPU
- def process_ocr_task(image, model_size, task_type, ref_text):
      """
-     Processes an image with DeepSeek-OCR for all supported tasks.
-     Now draws ALL detected bounding boxes for ANY task.
      """
      if image is None:
-         return "Please upload an image first.", None
-
-     print("🚀 Moving model to GPU...")
-     model_gpu = model.cuda()
-     print("✅ Model is on GPU.")
-
-     with tempfile.TemporaryDirectory() as output_path:
-         # Build the prompt... (same as before)
-         if task_type == "📝 Free OCR":
-             prompt = "<image>\nFree OCR."
-         elif task_type == "📄 Convert to Markdown":
-             prompt = "<image>\n<|grounding|>Convert the document to markdown."
-         elif task_type == "📈 Parse Figure":
-             prompt = "<image>\nParse the figure."
-         elif task_type == "🔍 Locate Object by Reference":
-             if not ref_text or ref_text.strip() == "":
-                 raise gr.Error("For the 'Locate' task, you must provide the reference text to find!")
-             prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
-         else:
-             prompt = "<image>\nFree OCR."
-
-         temp_image_path = os.path.join(output_path, "temp_image.png")
-         image.save(temp_image_path)
-
-         # Configure model size... (same as before)
-         size_configs = {
-             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
-             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
-             "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
-             "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
-             "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-         }
-         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
-
-         print(f"🏃 Running inference with prompt: {prompt}")
-         text_result = model_gpu.infer(
-             tokenizer,
-             prompt=prompt,
-             image_file=temp_image_path,
-             output_path=output_path,
-             base_size=config["base_size"],
-             image_size=config["image_size"],
-             crop_mode=config["crop_mode"],
-             save_results=True,
-             test_compress=True,
-             eval_mode=True,
-         )

-         print(f"====\n📄 Text Result: {text_result}\n====")
-
-         # --- NEW LOGIC: Always try to find and draw all bounding boxes ---
-         result_image_pil = None
-
-         # Define the pattern to find all coordinates like [[280, 15, 696, 997]]
-         pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
-         matches = list(pattern.finditer(text_result))  # Use finditer to get all matches
-
-         if matches:
-             print(f"✅ Found {len(matches)} bounding box(es). Drawing on the original image.")
-
-             # Create a copy of the original image to draw on
-             image_with_bboxes = image.copy()
-             draw = ImageDraw.Draw(image_with_bboxes)
-             w, h = image.size  # Get original image dimensions
-
-             for match in matches:
-                 # Extract coordinates as integers
-                 coords_norm = [int(c) for c in match.groups()]
-                 x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
-
-                 # Scale the normalized coordinates (from 1000x1000 space) to the image's actual size
-                 x1 = int(x1_norm / 1000 * w)
-                 y1 = int(y1_norm / 1000 * h)
-                 x2 = int(x2_norm / 1000 * w)
-                 y2 = int(y2_norm / 1000 * h)
-
-                 # Draw the rectangle with a red outline, 3 pixels wide
-                 draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
-
-             result_image_pil = image_with_bboxes
-         else:
-             # If no coordinates are found in the text, fall back to finding a pre-generated image
-             print("⚠️ No bounding box coordinates found in text result. Falling back to search for a result image file.")
-             result_image_pil = find_result_image(output_path)
-
-         return text_result, result_image_pil
-
-
- # --- 3. Build the Gradio Interface (UPDATED) ---
- with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
-     gr.Markdown(
-         """
-         # 🐳 Full Demo of DeepSeek-OCR 🐳
-
-         **💡 How to use:**
-         1. **Upload an image** using the upload box.
-         2. Select a **Resolution**. `Gundam` is recommended for most documents.
-         3. Choose a **Task Type**:
-             - **📝 Free OCR**: Extracts raw text from the image.
-             - **📄 Convert to Markdown**: Converts the document into Markdown, preserving structure.
-             - **📈 Parse Figure**: Extracts structured data from charts and figures.
-             - **🔍 Locate Object by Reference**: Finds a specific object/text.
-         4. If this is helpful, please give it a like! 🙏 ❤️
-         """
      )

-     with gr.Row():
-         with gr.Column(scale=1):
-             image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
-             model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Gundam (Recommended)", label="⚙️ Resolution Size")
-             task_type = gr.Dropdown(choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"], value="📄 Convert to Markdown", label="🚀 Task Type")
-             ref_text_input = gr.Textbox(label="📝 Reference Text (for Locate task)", placeholder="e.g., the teacher, 20-10, a red car...", visible=False)
-             submit_btn = gr.Button("Process Image", variant="primary")

-         with gr.Column(scale=2):
-             output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)
-             output_image = gr.Image(label="🖼️ Image Result (if any)", type="pil")

-     # --- UI Interaction Logic ---
-     def toggle_ref_text_visibility(task):
-         return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)

-     task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
-     submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type, ref_text_input], outputs=[output_text, output_image])


- # --- 4. Launch the App ---
  if __name__ == "__main__":
-     demo.queue(max_size=20).launch(share=True)
+ import os
+ import sys
+ import spaces
+ from typing import Iterable
  import gradio as gr
  import torch
+ import requests
+ from PIL import Image
+ from transformers import AutoProcessor, Florence2ForConditionalGeneration
+ from gradio.themes import Soft
+ from gradio.themes.utils import colors, fonts, sizes
+
+ colors.steel_blue = colors.Color(
+     name="steel_blue",
+     c50="#EBF3F8", c100="#D3E5F0", c200="#A8CCE1", c300="#7DB3D2",
+     c400="#529AC3", c500="#4682B4", c600="#3E72A0", c700="#36638C",
+     c800="#2E5378", c900="#264364", c950="#1E3450",
  )

+ class SteelBlueTheme(Soft):
+     def __init__(
+         self,
+         *,
+         primary_hue: colors.Color | str = colors.gray,
+         secondary_hue: colors.Color | str = colors.steel_blue,
+         neutral_hue: colors.Color | str = colors.slate,
+         text_size: sizes.Size | str = sizes.text_lg,
+         font: fonts.Font | str | Iterable[fonts.Font | str] = (
+             fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+         ),
+         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+             fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+         ),
+     ):
+         super().__init__(
+             primary_hue=primary_hue, secondary_hue=secondary_hue, neutral_hue=neutral_hue,
+             text_size=text_size, font=font, font_mono=font_mono,
+         )
+         super().set(
+             background_fill_primary="*primary_50",
+             background_fill_primary_dark="*primary_900",
+             body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
+             body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
+             button_primary_text_color="white",
+             button_primary_text_color_hover="white",
+             button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
+             button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
+             button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
+             button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
+             slider_color="*secondary_500",
+             slider_color_dark="*secondary_600",
+             block_title_text_weight="600",
+             block_border_width="3px",
+             block_shadow="*shadow_drop_lg",
+             button_primary_shadow="*shadow_drop_lg",
+             button_large_padding="11px",
+             color_accent_soft="*primary_100",
+             block_label_background_fill="*primary_200",
+         )
+
+ steel_blue_theme = SteelBlueTheme()
+
+ css = """
+ #main-title h1 {
+     font-size: 2.3em !important;
+ }
+ #output-title h2 {
+     font-size: 2.1em !important;
+ }
+ """
+
+ MODEL_IDS = {
+     "Florence-2-base": "florence-community/Florence-2-base",
+     "Florence-2-base-ft": "florence-community/Florence-2-base-ft",
+     "Florence-2-large": "florence-community/Florence-2-large",
+     "Florence-2-large-ft": "florence-community/Florence-2-large-ft",
+ }
+
+ models = {}
+ processors = {}
+
+ print("Loading Florence-2 models... This may take a while.")
+ for name, repo_id in MODEL_IDS.items():
+     print(f"Loading {name}...")
+     model = Florence2ForConditionalGeneration.from_pretrained(
+         repo_id,
+         dtype=torch.bfloat16,
+         device_map="auto",
+         trust_remote_code=True
+     )
+     processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
+     models[name] = model
+     processors[name] = processor
+     print(f"✅ Finished loading {name}.")
+
+ print("\n🎉 All models loaded successfully!")
+
+ @spaces.GPU(duration=30)
+ def run_florence2_inference(model_name: str, image: Image.Image, task_prompt: str,
+                             max_new_tokens: int = 1024, num_beams: int = 3):
      """
+     Runs inference using the selected Florence-2 model.
      """
      if image is None:
+         return "Please upload an image to get started."
 
+     model = models[model_name]
+     processor = processors[model_name]
+
+     inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16)
+
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=max_new_tokens,
+         num_beams=num_beams,
+         do_sample=False
      )

+     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

+     image_size = image.size
+     parsed_answer = processor.post_process_generation(
+         generated_text, task=task_prompt, image_size=image_size
+     )

+     return parsed_answer
 
+ florence_tasks = [
+     "<OD>", "<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>",
+     "<DENSE_REGION_CAPTION>", "<REGION_PROPOSAL>", "<OCR>", "<OCR_WITH_REGION>"
+ ]

+ url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/venice.jpg?download=true"
+ example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+
+ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+     gr.Markdown("# **Florence-2 Vision Models**", elem_id="main-title")
+     gr.Markdown("Select a model, upload an image, choose a task, and click Submit to see the results.")
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             image_upload = gr.Image(type="pil", label="Upload Image", value=example_image, height=290)
+             task_prompt = gr.Dropdown(
+                 label="Select Task",
+                 choices=florence_tasks,
+                 value="<MORE_DETAILED_CAPTION>"
+             )
+             model_choice = gr.Radio(
+                 choices=list(MODEL_IDS.keys()),
+                 label="Select Model",
+                 value="Florence-2-base"
+             )
+             image_submit = gr.Button("Submit", variant="primary")
+
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(
+                     label="Max New Tokens", minimum=128, maximum=2048, step=128, value=1024
+                 )
+                 num_beams = gr.Slider(
+                     label="Number of Beams", minimum=1, maximum=10, step=1, value=3
+                 )
+
+         with gr.Column(scale=3):
+             gr.Markdown("## Output", elem_id="output-title")
+             parsed_output = gr.JSON(label="Parsed Answer")
+
+     image_submit.click(
+         fn=run_florence2_inference,
+         inputs=[model_choice, image_upload, task_prompt, max_new_tokens, num_beams],
+         outputs=[parsed_output]
+     )

  if __name__ == "__main__":
+     demo.queue().launch(debug=True, mcp_server=True, ssr_mode=False, show_error=True)
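
For readers who want to exercise the new Florence-2 inference path outside the Gradio UI, below is a minimal standalone sketch assembled only from the calls added in this commit (the model repo, task token, and example image URL are taken from the new app.py; like the new code, it assumes a transformers release with native Florence-2 support):

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Florence2ForConditionalGeneration

# Same loading pattern as the loop in the new app.py, for a single model.
repo_id = "florence-community/Florence-2-base"
model = Florence2ForConditionalGeneration.from_pretrained(
    repo_id, dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

# Example image used as the default value of the Gradio image component.
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/venice.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

task = "<OD>"  # any entry from florence_tasks works as the prompt
inputs = processor(text=task, images=image, return_tensors="pt").to(model.device, torch.bfloat16)
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3,
    do_sample=False,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
# post_process_generation parses the raw decoded string into a task-keyed dict
# (for "<OD>", typically bounding boxes and labels), which the app displays as JSON.
result = processor.post_process_generation(generated_text, task=task, image_size=image.size)
print(result)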