Spaces:

ariG23498
/

gemma3-license-plate-detection

Runtime error

App Files Files Community

Update GUI with 2 models

by sergiopaniego HF Staff - opened Jun 23

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+95

-106

Files changed (3) hide show

README.md +1 -1
app.py +89 -101
requirements.txt +5 -4

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 📈
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 license: apache-2.0

 colorFrom: yellow
 colorTo: purple
 sdk: gradio
+sdk_version: 5.34.1
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py CHANGED Viewed

@@ -1,129 +1,117 @@
-import os
-import re
 import random
-from dataclasses import dataclass
-from functools import partial
-import torch
 import gradio as gr
-import spaces
 from datasets import load_dataset
-from torch.utils.data import DataLoader
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration
-from PIL import Image, ImageDraw
-# --- Configuration ---
-@dataclass
-class Configuration:
-    dataset_id: str = "ariG23498/license-detection-paligemma"
-    model_id: str = "google/gemma-3-4b-pt"
-    checkpoint_id: str = "ariG23498/gemma-3-4b-pt-object-detection"
-    device: str = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype: torch.dtype = torch.bfloat16
-    batch_size: int = 4
-    learning_rate: float = 2e-05
-    epochs: int = 1
-# --- Utils ---
 def parse_paligemma_label(label, width, height):
-    # Extract location codes
     loc_pattern = r"<loc(\d{4})>"
     locations = [int(loc) for loc in re.findall(loc_pattern, label)]
-    # Extract category (everything after the last location code)
     category = label.split(">")[-1].strip()
-    # Order in PaliGemma format is: y1, x1, y2, x2
-    y1_norm, x1_norm, y2_norm, x2_norm = locations
-    # Convert normalized coordinates to image coordinates
-    x1 = (x1_norm / 1024) * width
-    y1 = (y1_norm / 1024) * height
-    x2 = (x2_norm / 1024) * width
-    y2 = (y2_norm / 1024) * height
     return category, [x1, y1, x2, y2]
-def visualize_bounding_boxes(image, label, width, height):
-    # Copy image for drawing
     draw_image = image.copy()
     draw = ImageDraw.Draw(draw_image)
-    category, bbox = parse_paligemma_label(label, width, height)
-    draw.rectangle(bbox, outline="red", width=2)
-    draw.text((bbox[0], max(0, bbox[1] - 10)), category, fill="red")
     return draw_image
-def test_collate_function(batch_of_samples, processor, dtype):
-    images = []
-    prompts = []
-    for sample in batch_of_samples:
-        images.append([sample["image"]])
-        prompts.append(f"{processor.tokenizer.boi_token} detect \n\n")
-    batch = processor(images=images, text=prompts, return_tensors="pt", padding=True)
-    batch["pixel_values"] = batch["pixel_values"].to(dtype)
-    return batch, images
-# --- Initialize ---
-cfg = Configuration()
-processor = AutoProcessor.from_pretrained(cfg.checkpoint_id)
-model = Gemma3ForConditionalGeneration.from_pretrained(
-    cfg.checkpoint_id,
-    torch_dtype=cfg.dtype,
-    device_map="cpu",
-)
-model.eval()
-test_dataset = load_dataset(cfg.dataset_id, split="test")
-def get_sample():
-    sample = random.choice(test_dataset)
-    images = [[sample["image"]]]
-    prompts = [f"{processor.tokenizer.boi_token} detect \n\n"]
-    batch = processor(images=images, text=prompts, return_tensors="pt", padding=True)
-    batch["pixel_values"] = batch["pixel_values"].to(cfg.dtype)
-    return batch, sample["image"]
-# --- Prediction Logic ---
-@spaces.GPU
-def run_prediction():
-    model.to(cfg.device)
-    batch, raw_image = get_sample()
-    batch = {k: v.to(cfg.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
     with torch.no_grad():
-        generation = model.generate(**batch, max_new_tokens=100)
-    decoded = processor.batch_decode(generation, skip_special_tokens=True)[0]
-    image = raw_image  # ✅ FIXED: raw_image is already a PIL.Image
-    width, height = image.size
-    result_image = visualize_bounding_boxes(image, decoded, width, height)
-    return result_image
-# --- Gradio Interface ---
-demo = gr.Interface(
-    fn=run_prediction,
-    inputs=[],
-    outputs=gr.Image(type="pil", label="Detected Bounding Box"),
-    title="Gemma3 Object Detector",
-    description="Click 'Generate' to visualize a prediction from a randomly sampled test image.",
-)
 if __name__ == "__main__":
-    demo.launch()

 import random
+import re
+import albumentations as A
 import gradio as gr
+import numpy as np
+import torch
+from PIL import Image, ImageDraw
 from datasets import load_dataset
+from gradio.themes.soft import Soft
+from spaces import GPU
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+# --- Config ---
+# The "test" split of this dataset supplies the images sampled by the demo.
+dataset_id = "ariG23498/license-detection-paligemma"
+# Base model name; in this file it is only consulted to choose `resize_size` below.
+model_id = "google/gemma-3-4b-pt"
+# Maps the label shown in the UI to the checkpoint id loaded for that choice.
+MODEL_OPTIONS = {
+    "🔵 Fine-tuned": "sergiopaniego/gemma-3-4b-pt-object-detection",
+    "🟣 Fine-tuned (updated tokenizer with `<loc>` tokens)": "sergiopaniego/gemma-3-4b-pt-object-detection-loc-tokens",
+}
+# Side length (pixels) of the square every sampled image is resized to.
+resize_size = 512 if "SmolVLM" in model_id else 896
+transform = A.Compose([A.Resize(height=resize_size, width=resize_size)])
+dataset = load_dataset(dataset_id, split="test")
+# Cache of checkpoint_id -> (processor, model), filled on demand by load_model().
+loaded_models = {}
+def load_model(checkpoint_id):
+    """Return the ``(processor, model)`` pair for ``checkpoint_id``, loading it on first use.
+
+    The pair is stored in the module-level ``loaded_models`` dict, so a given
+    checkpoint id is only passed to ``from_pretrained`` the first time it is requested.
+
+    Args:
+        checkpoint_id: Identifier passed to ``AutoProcessor.from_pretrained`` and
+            ``Gemma3ForConditionalGeneration.from_pretrained``.
+
+    Returns:
+        Tuple ``(processor, model)``; ``model.eval()`` has been called on the model.
+    """
+    if checkpoint_id not in loaded_models:
+        processor = AutoProcessor.from_pretrained(checkpoint_id)
+        model = Gemma3ForConditionalGeneration.from_pretrained(
+            checkpoint_id,
+            torch_dtype="auto",
+            device_map="auto",
+        )
+        model.eval()
+        loaded_models[checkpoint_id] = (processor, model)
+    return loaded_models[checkpoint_id]
 def parse_paligemma_label(label, width, height):
+    """Parse a ``<locDDDD>``-style detection string into a category and a pixel box.
+
+    Args:
+        label: Text containing four-digit ``<locDDDD>`` tokens followed by the category.
+        width: Image width in pixels, used to scale the x coordinates.
+        height: Image height in pixels, used to scale the y coordinates.
+
+    Returns:
+        ``(category, [x1, y1, x2, y2])`` scaled to the image size, or
+        ``(None, None)`` when ``label`` does not contain exactly four location tokens.
+    """
     loc_pattern = r"<loc(\d{4})>"
     locations = [int(loc) for loc in re.findall(loc_pattern, label)]
+    if len(locations) != 4:
+        return None, None
+    # The four tokens are read in y1, x1, y2, x2 order.
+    y1, x1, y2, x2 = locations
+    # Location values are treated as fractions of 1024 and scaled to the image size.
+    x1, x2 = (x1 / 1024) * width, (x2 / 1024) * width
+    y1, y2 = (y1 / 1024) * height, (y2 / 1024) * height
+    # The category is whatever follows the last ">" in the label.
     category = label.split(">")[-1].strip()
     return category, [x1, y1, x2, y2]
+def visualize_bounding_boxes(image, label):
+    """Return a copy of ``image`` with the box and category parsed from ``label`` drawn in red.
+
+    Args:
+        image: Image exposing ``.size`` and ``.copy()`` (a PIL image in this app).
+        label: Detection string handed to ``parse_paligemma_label``.
+
+    Returns:
+        The annotated copy; when no box can be parsed the copy is returned unmodified.
+        The input image itself is never drawn on.
+    """
+    width, height = image.size
+    category, bbox = parse_paligemma_label(label, width, height)
     draw_image = image.copy()
     draw = ImageDraw.Draw(draw_image)
+    if bbox:
+        draw.rectangle(bbox, outline="red", width=2)
+        # Place the text 10 px above the box, clamped so it stays inside the image.
+        draw.text((bbox[0], max(0, bbox[1] - 10)), category, fill="red")
     return draw_image
+@GPU
+def detect_random_image(model_choice):
+    """Run the selected model on one random test image and return the annotated result.
+
+    Args:
+        model_choice: A key of ``MODEL_OPTIONS`` (the label selected in the UI).
+
+    Returns:
+        The resized image with the predicted box drawn on it, or the plain resized
+        image when the processor output contains no ``pixel_values``.
+
+    Raises:
+        KeyError: If ``model_choice`` is not a key of ``MODEL_OPTIONS``.
+    """
+    checkpoint_id = MODEL_OPTIONS[model_choice]
+    processor, model = load_model(checkpoint_id)
+    sample = random.choice(dataset)
+    image = sample["image"]
+    # Resize to the fixed square defined by `transform`; it operates on arrays,
+    # so the image is converted to numpy and back.
+    transformed = transform(image=np.array(image))
+    image_resized = Image.fromarray(transformed["image"])
+    prompt = f"{processor.tokenizer.boi_token} detect \n\n"
+    inputs = processor(images=[[image_resized]], text=[prompt], return_tensors="pt", padding=True)
+    if "pixel_values" not in inputs:
+        return image_resized
+    # Move every tensor input to the device the model lives on; leave other values as-is.
+    inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
     with torch.no_grad():
+        generation = model.generate(**inputs, max_new_tokens=100, disable_compile=True)
+    decoded = processor.batch_decode(generation, skip_special_tokens=True)[0]
+    return visualize_bounding_boxes(image_resized, decoded)
+css_hide_share = """
+button#gradio-share-link-button-0 {
+    display: none !important;
+}
+"""
+# --- Gradio Blocks Interface ---
+with gr.Blocks(theme=Soft(), css=css_hide_share) as demo:
+    gr.Markdown("# Gemma3 Object Detector")
+    gr.Markdown("""
+### 🔍 About the Models
+This demo compares two fine-tuned versions of **Gemma 3 (4B)** for object detection:
+- **🔵 Fine-tuned for object detection**: trained to predict bounding boxes and class labels using the original tokenizer.
+- **🟣 Fine-tuned (updated tokenizer with `<loc>` tokens)**: same task, but uses a tokenizer updated to better encode spatial information through `<locYYYY>` tokens.
+Select a model and click **Generate** to visualize its prediction on a random test image.
+""")
+    with gr.Column():
+        model_selector = gr.Radio(
+            choices=list(MODEL_OPTIONS.keys()),
+            value="With loc tokens",
+            label="Model"
+        )
+        generate_btn = gr.Button(value="Generate")
+        output_image = gr.Image(type="pil", label="Detected Bounding Box", height=500)
+    generate_btn.click(fn=detect_random_image, inputs=model_selector, outputs=output_image)
 if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
-gradio
-spaces
 transformers
-accelerate
 datasets
-torch

+torch
 transformers
 datasets
+Pillow
+albumentations
+gradio
+accelerate