Update app.py
app.py
CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
-import json, os, re, traceback, contextlib
-from typing import Any, List, Dict
+import json, os, re, traceback, contextlib, math, random
+from typing import Any, List, Dict, Optional, Tuple
 
 import spaces
 import torch
@@ -132,6 +132,10 @@ def run_inference_localization(
     pil_image_for_processing: Image.Image,
     device: str,
     dtype: torch.dtype,
+    do_sample: bool = False,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    max_new_tokens: int = 128,
 ) -> str:
     text_prompt = apply_chat_template_compat(processor, messages_for_template)
 
@@ -151,12 +155,15 @@ def run_inference_localization(
     else:
         amp_ctx = contextlib.nullcontext()
 
+    gen_kwargs = dict(
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=temperature,
+        top_p=top_p,
+    )
+
     with amp_ctx:
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=128,
-            do_sample=False,
-        )
+        generated_ids = model.generate(**inputs, **gen_kwargs)
 
     generated_ids_trimmed = trim_generated(generated_ids, inputs)
     decoded_output = batch_decode_compat(
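Note on the hunk above: it threads three standard Hugging Face generate() knobs through gen_kwargs. temperature rescales the logits before softmax, top-p (nucleus) sampling keeps the smallest set of tokens whose cumulative probability reaches p, and do_sample=False ignores both and decodes greedily. A minimal standalone sketch of the temperature/top-p arithmetic on made-up logits (nothing here is tied to the app or the model):

# Toy illustration of temperature + nucleus (top-p) filtering.
# The logits vector is invented for demonstration only.
import torch

def top_p_probs(logits: torch.Tensor, temperature: float, top_p: float) -> torch.Tensor:
    probs = torch.softmax(logits / temperature, dim=-1)
    sorted_probs, order = torch.sort(probs, descending=True)
    # Keep a token while the cumulative mass *before* it is still below p
    keep = torch.cumsum(sorted_probs, dim=-1) - sorted_probs < top_p
    filtered = torch.where(keep, sorted_probs, torch.zeros_like(sorted_probs))
    filtered = filtered / filtered.sum()  # renormalize the surviving mass
    return torch.zeros_like(probs).scatter(-1, order, filtered)

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])   # hypothetical next-token scores
print(top_p_probs(logits, temperature=0.6, top_p=0.9))  # sharper, truncated
print(top_p_probs(logits, temperature=1.2, top_p=0.9))  # flatter distribution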
@@ -167,10 +174,159 @@ def run_inference_localization(
     )
     return decoded_output[0] if decoded_output else ""
 
+# ---------- Confidence helpers ----------
+CLICK_RE = re.compile(r"Click\((\d+),\s*(\d+)\)")
+
+def parse_click(s: str) -> Optional[Tuple[int, int]]:
+    m = CLICK_RE.search(s)
+    if not m:
+        return None
+    try:
+        return int(m.group(1)), int(m.group(2))
+    except Exception:
+        return None
+
+@torch.inference_mode()
+def sample_clicks(
+    messages: List[dict],
+    img: Image.Image,
+    device: str,
+    dtype: torch.dtype,
+    n_samples: int = 7,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    seed: Optional[int] = None,
+) -> List[Optional[Tuple[int, int]]]:
+    """
+    Run multiple stochastic decodes to estimate self-consistency.
+    Returns a list of (x, y), or None where parsing failed, for each sample.
+    """
+    clicks: List[Optional[Tuple[int, int]]] = []
+    # If the model respects torch randomness, set a seed for reproducibility (optional)
+    if seed is not None:
+        torch.manual_seed(seed)
+        random.seed(seed)
+    for i in range(n_samples):
+        # Vary the seed each iteration to avoid identical sampling patterns
+        if seed is not None:
+            torch.manual_seed(seed + i + 1)
+            random.seed((seed + i + 1) & 0xFFFFFFFF)
+        out = run_inference_localization(
+            messages, img, device, dtype,
+            do_sample=True, temperature=temperature, top_p=top_p
+        )
+        clicks.append(parse_click(out))
+    return clicks
+
+def cluster_and_confidence(
+    clicks: List[Optional[Tuple[int, int]]],
+    img_w: int,
+    img_h: int,
+) -> Dict[str, Any]:
+    """
+    Simple robust consensus:
+      - Keep only valid points
+      - Compute the median point (x_med, y_med)
+      - Compute distances to the median
+      - Inlier threshold = max(8 px, 2% of min(img_w, img_h))
+      - Confidence = (#inliers / #total_samples) * clamp(1 - (rms_inlier_dist / thr), 0, 1)
+    Returns a dict with the consensus point, confidence, dispersion, and counts.
+    """
+    valid = [xy for xy in clicks if xy is not None]
+    total = len(clicks)
+    if total == 0:
+        return dict(ok=False, reason="no_samples")
+
+    if not valid:
+        return dict(ok=False, reason="no_valid_points", total=total)
+
+    xs = sorted(x for x, _ in valid)
+    ys = sorted(y for _, y in valid)
+    mid = len(valid) // 2
+    if len(valid) % 2 == 1:
+        x_med = xs[mid]
+        y_med = ys[mid]
+    else:
+        x_med = (xs[mid - 1] + xs[mid]) // 2
+        y_med = (ys[mid - 1] + ys[mid]) // 2
+
+    thr = max(8.0, 0.02 * min(img_w, img_h))  # ~2% of the smaller side, at least 8 px
+    dists = [math.hypot(x - x_med, y - y_med) for (x, y) in valid]
+    inliers = [(xy, d) for xy, d in zip(valid, dists) if d <= thr]
+    outliers = [(xy, d) for xy, d in zip(valid, dists) if d > thr]
+    inlier_count = len(inliers)
+
+    # RMS of inlier distances (0 if perfect agreement)
+    if inliers:
+        rms = math.sqrt(sum(d * d for _, d in inliers) / len(inliers))
+    else:
+        rms = float("inf")
+
+    # Confidence: agreement ratio * sharpness factor
+    if inliers:
+        sharp = max(0.0, min(1.0, 1.0 - (rms / thr)))
+    else:
+        sharp = 0.0
+    confidence = (inlier_count / total) * sharp
+
+    return dict(
+        ok=True,
+        x=x_med, y=y_med,
+        confidence=confidence,
+        total_samples=total,
+        valid_samples=len(valid),
+        inliers=inlier_count,
+        outliers=len(outliers),
+        sigma_px=rms if math.isfinite(rms) else None,
+        inlier_threshold_px=thr,
+        all_points=valid,
+        inlier_points=[xy for xy, _ in inliers],
+        outlier_points=[xy for xy, _ in outliers],
+    )
+
+def draw_samples(
+    base_img: Image.Image,
+    consensus_xy: Optional[Tuple[int, int]],
+    inliers: List[Tuple[int, int]],
+    outliers: List[Tuple[int, int]],
+    ring_color: str = "red",
+) -> Image.Image:
+    """
+    Overlay all sampled points: green = inliers, red = outliers, plus a ring for the consensus point.
+    """
+    img = base_img.copy().convert("RGB")
+    draw = ImageDraw.Draw(img)
+    w, h = img.size
+    # Dot radius scales with image size
+    r = max(3, min(w, h) // 200)
+
+    # Draw inliers
+    for (x, y) in inliers:
+        draw.ellipse((x - r, y - r, x + r, y + r), fill="green", outline=None)
+
+    # Draw outliers
+    for (x, y) in outliers:
+        draw.ellipse((x - r, y - r, x + r, y + r), fill="red", outline=None)
+
+    # Consensus ring
+    if consensus_xy is not None:
+        cx, cy = consensus_xy
+        ring_r = max(5, min(w, h) // 100, r * 3)
+        draw.ellipse((cx - ring_r, cy - ring_r, cx + ring_r, cy + ring_r), outline=ring_color, width=max(2, ring_r // 4))
+    return img
+
 # --- Gradio processing function (ZeroGPU-visible) ---
 # Decorate the function Gradio calls so Spaces detects a GPU entry point.
 @spaces.GPU(duration=120)  # keep GPU attached briefly between calls (seconds)
-def predict_click_location(input_pil_image: Image.Image, instruction: str):
+def predict_click_location(
+    input_pil_image: Image.Image,
+    instruction: str,
+    estimate_confidence: bool = True,
+    num_samples: int = 7,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    seed: Optional[int] = None,
+):
     if not model_loaded or not processor or not model:
         return f"Model not loaded. Error: {load_error_message}", None, "device: n/a | dtype: n/a"
     if not input_pil_image:
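Note on cluster_and_confidence above: the score is an agreement ratio times a sharpness term, confidence = (inliers / total_samples) * clamp(1 - rms / thr, 0, 1), so parse failures and outliers both pull it down. A self-contained replica of that arithmetic on synthetic samples (all coordinates invented for illustration):

# Worked example of the consensus math; mirrors the function above but
# operates on made-up points instead of parsed model outputs.
import math

clicks = [(100, 200), (102, 198), (99, 203), (101, 200), (180, 40), None, (100, 199)]
valid = [xy for xy in clicks if xy is not None]            # 6 valid of 7 samples
xs, ys = sorted(x for x, _ in valid), sorted(y for _, y in valid)
mid = len(valid) // 2
x_med = xs[mid] if len(valid) % 2 else (xs[mid - 1] + xs[mid]) // 2
y_med = ys[mid] if len(valid) % 2 else (ys[mid - 1] + ys[mid]) // 2

thr = max(8.0, 0.02 * min(1920, 1080))                     # 21.6 px on a 1920x1080 screenshot
dists = [math.hypot(x - x_med, y - y_med) for x, y in valid]
inlier_d = [d for d in dists if d <= thr]                  # the far-away (180, 40) point is dropped
rms = math.sqrt(sum(d * d for d in inlier_d) / len(inlier_d))
confidence = (len(inlier_d) / len(clicks)) * max(0.0, min(1.0, 1.0 - rms / thr))
print(x_med, y_med, round(confidence, 2))                  # -> 100 199 0.64

The lone outlier is rejected by the threshold, and the None (a failed parse) still counts in the denominator, which is why a tight 5-of-7 cluster lands around 0.64 rather than near 1.0.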
@@ -220,33 +376,70 @@ def predict_click_location(input_pil_image: Image.Image, instruction: str):
     # 2) Build messages with image + instruction
     messages = get_localization_prompt(resized_image, instruction)
 
-    # 3) Inference
+    # 3) Inference and (optionally) confidence estimation
     try:
-        coordinates_str = run_inference_localization(messages, resized_image, device, dtype)
+        if estimate_confidence and num_samples >= 3:
+            # Monte-Carlo sampling
+            clicks = sample_clicks(
+                messages, resized_image, device, dtype,
+                n_samples=int(num_samples),
+                temperature=float(temperature),
+                top_p=float(top_p),
+                seed=seed
+            )
+            summary = cluster_and_confidence(clicks, resized_image.width, resized_image.height)
+
+            if not summary.get("ok", False):
+                # Fallback: deterministic decode
+                coord_str = run_inference_localization(messages, resized_image, device, dtype, do_sample=False)
+                out_img = resized_image.copy().convert("RGB")
+                match = CLICK_RE.search(coord_str or "")
+                if match:
+                    x, y = int(match.group(1)), int(match.group(2))
+                    out_img = draw_samples(out_img, (x, y), [], [])
+                coords_text = f"{coord_str} | confidence=0.00 (fallback)"
+                return coords_text, out_img, f"device: {device} | dtype: {str(dtype).replace('torch.', '')}"
+
+            # Build the final string + visualization
+            x, y = int(summary["x"]), int(summary["y"])
+            conf = summary["confidence"]
+            inliers = summary["inlier_points"]
+            outliers = summary["outlier_points"]
+            sigma = summary["sigma_px"]
+            thr = summary["inlier_threshold_px"]
+            total = summary["total_samples"]
+            valid = summary["valid_samples"]
+
+            # Compose the output string in the same canonical format plus diagnostics
+            coord_str = f"Click({x}, {y})"
+            diag = (
+                f"confidence={conf:.2f} | samples(valid/total)={valid}/{total} | "
+                f"inliers={len(inliers)} | σ={sigma:.1f}px | thr={thr:.1f}px | "
+                f"T={temperature:.2f}, p={top_p:.2f}"
+            )
+
+            # Draw: all samples + consensus ring
+            out_img = draw_samples(resized_image, (x, y), inliers, outliers)
+            return f"{coord_str} | {diag}", out_img, f"device: {device} | dtype: {str(dtype).replace('torch.', '')}"
+
+        else:
+            # Fast deterministic single pass (no confidence)
+            coord_str = run_inference_localization(messages, resized_image, device, dtype, do_sample=False)
+            out_img = resized_image.copy().convert("RGB")
+            match = CLICK_RE.search(coord_str or "")
+            if match:
+                x = int(match.group(1))
+                y = int(match.group(2))
+                # Draw a simple ring around the predicted point
+                out_img = draw_samples(out_img, (x, y), [], [])
+            else:
+                print(f"Could not parse 'Click(x, y)' from model output: {coord_str}")
+            return coord_str, out_img, f"device: {device} | dtype: {str(dtype).replace('torch.', '')}"
+
     except Exception as e:
         traceback.print_exc()
         return f"Error during model inference: {e}", resized_image.copy().convert("RGB"), f"device: {device} | dtype: {dtype}"
 
-    # 4) Parse coordinates and draw marker
-    output_image_with_click = resized_image.copy().convert("RGB")
-    match = re.search(r"Click\((\d+),\s*(\d+)\)", coordinates_str)
-    if match:
-        try:
-            x = int(match.group(1))
-            y = int(match.group(2))
-            draw = ImageDraw.Draw(output_image_with_click)
-            radius = max(5, min(resized_width // 100, resized_height // 100, 15))
-            bbox = (x - radius, y - radius, x + radius, y + radius)
-            draw.ellipse(bbox, outline="red", width=max(2, radius // 4))
-            print(f"Predicted and drawn click at: ({x}, {y}) on resized image ({resized_width}x{resized_height})")
-        except Exception as e:
-            print(f"Error drawing on image: {e}")
-            traceback.print_exc()
-    else:
-        print(f"Could not parse 'Click(x, y)' from model output: {coordinates_str}")
-
-    return coordinates_str, output_image_with_click, f"device: {device} | dtype: {str(dtype).replace('torch.', '')}"
-
 # --- Load Example Data ---
 example_image = None
 example_instruction = "Enter the server address readyforquantum.com to check its security"
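Note: with confidence estimation enabled, the textbox now carries diagnostics after the canonical Click(x, y) prefix, so downstream code should parse the string rather than assume the bare format. A hedged sketch of such a parser (the sample string below is invented, following the diag format in the hunk above):

# Hypothetical downstream parser for the composed output string.
import re

s = ("Click(100, 199) | confidence=0.64 | samples(valid/total)=6/7 | "
     "inliers=5 | σ=2.2px | thr=21.6px | T=0.60, p=0.90")
m = re.search(r"Click\((\d+),\s*(\d+)\)", s)   # same regex the app uses
c = re.search(r"confidence=([0-9.]+)", s)      # diagnostics are optional
if m:
    x, y = int(m.group(1)), int(m.group(2))
    conf = float(c.group(1)) if c else None
    print(x, y, conf)  # -> 100 199 0.64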
@@ -292,16 +485,21 @@ else:
                 placeholder="e.g., Click the 'Login' button",
                 info="Type the action you want the model to localize on the image."
             )
+            estimate_conf = gr.Checkbox(value=True, label="Estimate confidence (slower)")
+            num_samples_slider = gr.Slider(3, 15, value=7, step=1, label="Samples (for confidence)")
+            temperature_slider = gr.Slider(0.2, 1.2, value=0.6, step=0.05, label="Temperature")
+            top_p_slider = gr.Slider(0.5, 0.99, value=0.9, step=0.01, label="Top-p")
+            seed_box = gr.Number(value=None, precision=0, label="Seed (optional, for reproducibility)")
             submit_button = gr.Button("Localize Click", variant="primary")
 
         with gr.Column(scale=1):
             output_coords_component = gr.Textbox(
-                label="Predicted Coordinates (Format: Click(x,y))",
+                label="Predicted Coordinates + Confidence",
                 interactive=False
             )
             output_image_component = gr.Image(
                 type="pil",
-                label="Image with Predicted Click Point",
+                label="Image with Samples (green=inliers, red=outliers) and Final Ring",
                 height=400,
                 interactive=False
             )
@@ -313,8 +511,16 @@ else:
 
     if example_image:
         gr.Examples(
-            examples=[[example_image, example_instruction]],
-            inputs=[input_image_component, instruction_component],
+            examples=[[example_image, example_instruction, True, 7, 0.6, 0.9, None]],
+            inputs=[
+                input_image_component,
+                instruction_component,
+                estimate_conf,
+                num_samples_slider,
+                temperature_slider,
+                top_p_slider,
+                seed_box,
+            ],
             outputs=[output_coords_component, output_image_component, runtime_info],
             fn=predict_click_location,
             cache_examples="lazy",
@@ -322,11 +528,17 @@ else:
 
     submit_button.click(
         fn=predict_click_location,
-        inputs=[input_image_component, instruction_component],
+        inputs=[
+            input_image_component,
+            instruction_component,
+            estimate_conf,
+            num_samples_slider,
+            temperature_slider,
+            top_p_slider,
+            seed_box,
+        ],
         outputs=[output_coords_component, output_image_component, runtime_info]
     )
 
 if __name__ == "__main__":
-    # Do NOT pass 'concurrency_count' or ZeroGPU-specific launch args.
     demo.launch(debug=True)
-
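Note: the positional order of inputs=[...] above is also the argument order a remote caller must use. A sketch with gradio_client, assuming the placeholder Space id user/space and the auto-generated endpoint name /predict_click_location (both may differ for the actual Space):

# Hedged sketch of a programmatic call against the updated endpoint.
from gradio_client import Client, handle_file

client = Client("user/space")  # placeholder: replace with the actual Space id
coords, image_path, runtime = client.predict(
    handle_file("screenshot.png"),   # input_image_component
    "Click the 'Login' button",      # instruction_component
    True,                            # estimate_conf
    7,                               # num_samples_slider
    0.6,                             # temperature_slider
    0.9,                             # top_p_slider
    None,                            # seed_box
    api_name="/predict_click_location",  # Gradio usually derives this from the fn name
)
print(coords)  # e.g. "Click(x, y) | confidence=..."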