Spaces: Running on Zero
Commit · aecac1f · Parent(s): 87e5c0e
Use session state
app.py CHANGED
@@ -129,7 +129,7 @@ class AppState:
         self.inference_session = None
         self.model: Optional[Sam2VideoModel] = None
         self.processor: Optional[Sam2VideoProcessor] = None
-        self.device: str = "
+        self.device: str = "cuda"
         self.dtype: torch.dtype = torch.bfloat16
         self.video_fps: float | None = None
         self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
@@ -158,9 +158,6 @@ class AppState:
         return len(self.video_frames)


-GLOBAL_STATE = AppState()
-
-
 def _model_repo_from_key(key: str) -> str:
     mapping = {
         "tiny": "yonigozlan/sam2.1_hiera_tiny_hf",
@@ -171,7 +168,7 @@ def _model_repo_from_key(key: str) -> str:
     return mapping.get(key, mapping["base_plus"])


-def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
+def load_model_if_needed(GLOBAL_STATE: gr.State) -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
         if GLOBAL_STATE.model_repo_id == desired_repo:
@@ -189,11 +186,13 @@ def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
     GLOBAL_STATE.processor = None
     print(f"Loading model from {desired_repo}")
     device, dtype = get_device_and_dtype()
-
-
+    # free up the gpu memory
+    torch.cuda.empty_cache()
+    gc.collect()
+    print("device", device)
+    model = Sam2VideoModel.from_pretrained(desired_repo)
     processor = Sam2VideoProcessor.from_pretrained(desired_repo)
-
-    model.to(device)
+    model.to(device, dtype=dtype)

     GLOBAL_STATE.model = model
     GLOBAL_STATE.processor = processor
@@ -204,11 +203,11 @@ def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
     return model, processor, device, dtype


-def ensure_session_for_current_model() -> None:
+def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
     """Ensure the model/processor match the selected repo and inference_session exists.
     If a video is already loaded, re-initialize the inference session when needed.
     """
-    model, processor, device, dtype = load_model_if_needed()
+    model, processor, device, dtype = load_model_if_needed(GLOBAL_STATE)
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
         if GLOBAL_STATE.video_frames:
@@ -239,7 +238,7 @@ def ensure_session_for_current_model() -> None:
     GLOBAL_STATE.session_repo_id = desired_repo


-def init_video_session(video: str | dict) -> tuple[AppState, int, int, Image.Image, str]:
+def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppState, int, int, Image.Image, str]:
     """Gradio handler: load video, init session, return state, slider bounds, and first frame."""
     # Reset ONLY video-related fields, keep model loaded
     GLOBAL_STATE.video_frames = []
@@ -247,7 +246,7 @@ def init_video_session(video: str | dict) -> tuple[AppState, int, int, Image.Image, str]:
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}

-    model, processor, device, dtype = load_model_if_needed()
+    model, processor, device, dtype = load_model_if_needed(GLOBAL_STATE)

     # Gradio Video may provide a dict with 'name' or a direct file path
     video_path: Optional[str] = None
@@ -349,9 +348,9 @@ def update_frame_display(state: AppState, frame_idx: int) -> Image.Image:
     return compose_frame(state, frame_idx)


-def _ensure_color_for_obj(obj_id: int):
-    if obj_id not in GLOBAL_STATE.color_by_obj:
-        GLOBAL_STATE.color_by_obj[obj_id] = pastel_color_for_object(obj_id)
+def _ensure_color_for_obj(state: AppState, obj_id: int):
+    if obj_id not in state.color_by_obj:
+        state.color_by_obj[obj_id] = pastel_color_for_object(obj_id)


 def on_image_click(
@@ -384,20 +383,19 @@ def on_image_click(
     if x is None or y is None:
         raise gr.Error("Could not read click coordinates.")

-    _ensure_color_for_obj(int(obj_id))
+    _ensure_color_for_obj(state, int(obj_id))

-    processor = GLOBAL_STATE.processor
-    model = GLOBAL_STATE.model
-    inference_session = GLOBAL_STATE.inference_session
+    processor = state.processor
+    model = state.model
+    inference_session = state.inference_session

     if state.current_prompt_type == "Boxes":
         # Two-click box input
         if state.pending_box_start is None:
-            #
-
-
-
-            state.composited_frames.pop(int(frame_idx), None)
+            # For boxes, always clear old inputs (points) for this object on this frame
+            frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
+            frame_clicks[int(obj_id)] = []
+            state.composited_frames.pop(int(frame_idx), None)
             state.pending_box_start = (int(x), int(y))
             state.pending_box_start_frame_idx = int(frame_idx)
             state.pending_box_start_obj_id = int(obj_id)
@@ -420,13 +418,13 @@ def on_image_click(
             frame_idx=int(frame_idx),
             obj_ids=int(obj_id),
             input_boxes=[[[x_min, y_min, x_max, y_max]]],
-            clear_old_inputs=
+            clear_old_inputs=True,  # For boxes, always clear old inputs
         )

         frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
         obj_boxes = frame_boxes.setdefault(int(obj_id), [])
-
-
+        # For boxes, always clear old inputs
+        obj_boxes.clear()
         obj_boxes.append((x_min, y_min, x_max, y_max))
         state.composited_frames.pop(int(frame_idx), None)
     else:
@@ -454,8 +452,8 @@ def on_image_click(
         state.composited_frames.pop(int(frame_idx), None)

     # Forward on that frame
-    device_type = "cuda" if GLOBAL_STATE.device == "cuda" else "cpu"
-    with torch.inference_mode(), torch.autocast(device_type=device_type, dtype=GLOBAL_STATE.dtype):
+    device_type = "cuda" if state.device == "cuda" else "cpu"
+    with torch.inference_mode(), torch.autocast(device_type=device_type, dtype=state.dtype):
         outputs = model(
             inference_session=inference_session,
             frame_idx=int(frame_idx),
@@ -477,17 +475,17 @@ def on_image_click(
         mask_2d = mask_i.cpu().numpy().squeeze()
         masks_for_frame[int(oid)] = mask_2d

-
+    state.masks_by_frame[int(frame_idx)] = masks_for_frame
     # Invalidate cache for this frame to force recomposition
-
+    state.composited_frames.pop(int(frame_idx), None)

     # Return updated preview
-    return update_frame_display(
+    return update_frame_display(state, int(frame_idx))


-def propagate_masks(state: AppState):
-    if
-        yield "Load a video first."
+def propagate_masks(GLOBAL_STATE: gr.State):
+    if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
+        yield "Load a video first.", gr.update()
         return

     processor = GLOBAL_STATE.processor
@@ -497,9 +495,11 @@ def propagate_masks(state: AppState):
     total = max(1, GLOBAL_STATE.num_frames)
     processed = 0

-
+    # Initial status; no slider change yet
+    yield f"Propagating masks: {processed}/{total}", gr.update()

     device_type = "cuda" if GLOBAL_STATE.device == "cuda" else "cpu"
+    last_frame_idx = 0
     with torch.inference_mode(), torch.autocast(device_type=device_type, dtype=GLOBAL_STATE.dtype):
         for sam2_video_output in model.propagate_in_video_iterator(inference_session):
             H = inference_session.video_height
@@ -508,6 +508,7 @@ def propagate_masks(state: AppState):
             video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]

             frame_idx = int(sam2_video_output.frame_idx)
+            last_frame_idx = frame_idx
             masks_for_frame: dict[int, np.ndarray] = {}
             obj_ids_order = list(inference_session.obj_ids)
             for i, oid in enumerate(obj_ids_order):
@@ -518,12 +519,20 @@ def propagate_masks(state: AppState):
             GLOBAL_STATE.composited_frames.pop(frame_idx, None)

             processed += 1
-
+            # Every 15th frame (or last), move slider to current frame to update preview via slider binding
+            if processed % 15 == 0 or processed == total:
+                yield f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
+            else:
+                yield f"Propagating masks: {processed}/{total}", gr.update()

-
+    # Final status; ensure slider points to last processed frame
+    yield (
+        f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects.",
+        gr.update(value=last_frame_idx),
+    )


-def reset_session() -> tuple[AppState, Image.Image, int, int, str]:
+def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, int, str]:
     # Reset only session-related state, keep uploaded video and model
     if not GLOBAL_STATE.video_frames:
         # Nothing loaded; keep behavior
@@ -551,7 +560,7 @@ def reset_session() -> tuple[AppState, Image.Image, int, int, str]:
             torch.cuda.empty_cache()
         except Exception:
             pass
-    ensure_session_for_current_model()
+    ensure_session_for_current_model(GLOBAL_STATE)

     # Keep current slider index if possible
     current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
@@ -561,20 +570,41 @@ def reset_session() -> tuple[AppState, Image.Image, int, int, str]:
     slider_value = gr.update(value=current_idx)
     status = "Session reset. Prompts cleared; video preserved."
     # clear and reload model and processor
-    GLOBAL_STATE.model = None
-    GLOBAL_STATE.processor = None
-    ensure_session_for_current_model()
     return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status


 theme = Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")

 with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:
-
+    GLOBAL_STATE = gr.State(AppState())

-    gr.Markdown(
-
-
+    gr.Markdown(
+        """
+        ### SAM2 Video Tracking · powered by Hugging Face 🤗 Transformers
+        Segment and track objects across a video with SAM2 (Segment Anything 2). This demo runs the official implementation from the Hugging Face Transformers library for interactive, promptable video segmentation.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                """
+                **Quick start**
+                - **Load a video**: Upload your own or pick an example below.
+                - **Checkpoint**: Tiny / Small / Base+ / Large (trade speed vs. accuracy).
+                - **Points mode**: Select an Object ID and point label (positive/negative), then click the frame to add guidance. You can add **multiple points per object** and define **multiple objects** across frames.
+                - **Boxes mode**: Click two opposite corners to draw a box. Old inputs for that object are cleared automatically.
+                """
+            )
+        with gr.Column():
+            gr.Markdown(
+                """
+                **Working with results**
+                - **Preview**: Use the slider to navigate frames and see the current masks.
+                - **Propagate**: Click “Propagate across video” to track all defined objects through the entire video. The preview follows progress periodically to keep things responsive.
+                - **Export**: Render an MP4 for smooth playback using the original video FPS.
+                - **Note**: More info on the Hugging Face 🤗 Transformers implementation of SAM2 can be found [here](https://huggingface.co/docs/transformers/en/main/en/model_doc/sam2_video).
+                """
+            )

     with gr.Row():
         with gr.Column(scale=1):
@@ -594,17 +624,17 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:
     with gr.Row():
         obj_id_inp = gr.Number(value=1, precision=0, label="Object ID")
         label_radio = gr.Radio(choices=["positive", "negative"], value="positive", label="Point label")
-        clear_old_chk = gr.Checkbox(value=
+        clear_old_chk = gr.Checkbox(value=False, label="Clear old inputs for this object")
         prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
     with gr.Column():
         propagate_btn = gr.Button("Propagate across video", variant="primary")
         propagate_status = gr.Markdown(visible=True)

     # Wire events
-    def _on_video_change(video):
-
+    def _on_video_change(GLOBAL_STATE: gr.State, video):
+        GLOBAL_STATE, min_idx, max_idx, first_frame, status = init_video_session(GLOBAL_STATE, video)
         return (
-
+            GLOBAL_STATE,
             gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True),
             first_frame,
             status,
@@ -612,22 +642,29 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:

     video_in.change(
         _on_video_change,
-        inputs=[video_in],
-        outputs=[
+        inputs=[GLOBAL_STATE, video_in],
+        outputs=[GLOBAL_STATE, frame_slider, preview, load_status],
         show_progress=True,
     )

     # (moved) Examples are defined above the render button
-
-
-
-
-
-
-
-
-
-
+    # Each example row must match the number of inputs (GLOBAL_STATE, video_in)
+    examples_list = [
+        [None, "./tennis.mp4"],
+        [None, "./football.mp4"],
+        [None, "./basket.mp4"],
+        [None, "./hurdles.mp4"],
+    ]
+    with gr.Row():
+        gr.Examples(
+            examples=examples_list,
+            inputs=[GLOBAL_STATE, video_in],
+            fn=_on_video_change,
+            outputs=[GLOBAL_STATE, frame_slider, preview, load_status],
+            label="Examples",
+            cache_examples=False,
+            examples_per_page=5,
+        )
     # Examples (place before the render MP4 button) — defined after handler below

     with gr.Row():
@@ -646,23 +683,23 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:
             s.processor = None
         # Stream progress text while loading (first yield shows text)
         yield gr.update(visible=True, value=f"Loading checkpoint: {key}...")
-        ensure_session_for_current_model()
+        ensure_session_for_current_model(s)
         if s is not None:
             s.is_switching_model = False
         # Final yield hides the text
        yield gr.update(visible=False, value="")

-    ckpt_radio.change(_on_ckpt_change, inputs=[
+    ckpt_radio.change(_on_ckpt_change, inputs=[GLOBAL_STATE, ckpt_radio], outputs=[ckpt_progress])

     # Also retrigger session re-init if a video already loaded
     def _rebind_session_after_ckpt(s: AppState):
-        ensure_session_for_current_model()
+        ensure_session_for_current_model(s)
         # Reset pending box corner to avoid mismatched state
         if s is not None:
             s.pending_box_start = None
         return gr.update()

-    ckpt_radio.change(_rebind_session_after_ckpt, inputs=[
+    ckpt_radio.change(_rebind_session_after_ckpt, inputs=[GLOBAL_STATE], outputs=[])

     def _sync_frame_idx(state_in: AppState, idx: int):
         if state_in is not None:
@@ -671,7 +708,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:

     frame_slider.change(
         _sync_frame_idx,
-        inputs=[
+        inputs=[GLOBAL_STATE, frame_slider],
         outputs=preview,
     )

@@ -680,26 +717,37 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:
             s.current_obj_id = int(oid)
         return gr.update()

-    obj_id_inp.change(_sync_obj_id, inputs=[
+    obj_id_inp.change(_sync_obj_id, inputs=[GLOBAL_STATE, obj_id_inp], outputs=[])

     def _sync_label(s: AppState, lab: str):
         if s is not None and lab is not None:
             s.current_label = str(lab)
         return gr.update()

-    label_radio.change(_sync_label, inputs=[
+    label_radio.change(_sync_label, inputs=[GLOBAL_STATE, label_radio], outputs=[])

     def _sync_prompt_type(s: AppState, val: str):
         if s is not None and val is not None:
             s.current_prompt_type = str(val)
             s.pending_box_start = None
-
-
-
-
+        is_points = str(val).lower() == "points"
+        # Show labels only for points; hide and disable clear_old when boxes
+        updates = [
+            gr.update(visible=is_points),
+            gr.update(interactive=is_points) if is_points else gr.update(value=True, interactive=False),
+        ]
+        return updates
+
+    prompt_type.change(
+        _sync_prompt_type,
+        inputs=[GLOBAL_STATE, prompt_type],
+        outputs=[label_radio, clear_old_chk],
+    )

     # Image click to add a point and run forward on that frame
-    preview.select(
+    preview.select(
+        on_image_click, [preview, GLOBAL_STATE, frame_slider, obj_id_inp, label_radio, clear_old_chk], preview
+    )

     # Playback via MP4 rendering only

@@ -747,14 +795,19 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:
     except Exception as e:
         raise gr.Error(f"Failed to render video: {e}")

-    render_btn.click(_render_video, inputs=[
+    render_btn.click(_render_video, inputs=[GLOBAL_STATE], outputs=[playback_video])

-
+    # While propagating, we stream two outputs: status text and slider value updates
+    propagate_btn.click(
+        propagate_masks,
+        inputs=[GLOBAL_STATE],
+        outputs=[propagate_status, frame_slider],
+    )

     reset_btn.click(
         reset_session,
-        inputs=
-        outputs=[
+        inputs=GLOBAL_STATE,
+        outputs=[GLOBAL_STATE, preview, frame_slider, frame_slider, load_status],
     )
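
Note on the pattern: this commit replaces the module-level `GLOBAL_STATE = AppState()` singleton, which every visitor of the Space shared, with a `gr.State(AppState())` created inside `gr.Blocks`. Gradio deep-copies the default value for each browser session, and each event handler now receives the state as an explicit input (and returns it as an output when it must be written back). A minimal sketch of that pattern, independent of this Space (the `Counter` and `bump` names are hypothetical, not from this app):

import gradio as gr


class Counter:
    """Per-session state object (stand-in for this Space's AppState)."""

    def __init__(self):
        self.clicks = 0


def bump(state: Counter) -> tuple[Counter, str]:
    # Each browser session gets its own deep copy of the default value,
    # so one visitor's clicks never leak into another visitor's session.
    state.clicks += 1
    return state, f"Clicked {state.clicks} times"


with gr.Blocks() as demo:
    state = gr.State(Counter())  # replaces a module-level global
    out = gr.Markdown()
    btn = gr.Button("Click me")
    # The state is wired as both an input and an output of the handler.
    btn.click(bump, inputs=[state], outputs=[state, out])

demo.launch()

This is why nearly every `.change`, `.click`, and `.select` binding in the diff gains `GLOBAL_STATE` in its `inputs` list.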
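The reworked `propagate_masks` also shows Gradio's streaming behavior: a generator handler yields once per frame, and each yielded tuple is pushed to the bound outputs (`propagate_status`, `frame_slider`), with the slider moved only every 15th frame so the preview follows progress without overwhelming the UI. A minimal sketch of that shape (the `track` function and the 45-frame count are hypothetical):

import time

import gradio as gr


def track(total: int = 45):
    # A generator handler: Gradio streams every yield to the outputs.
    for i in range(1, total + 1):
        time.sleep(0.01)  # stand-in for propagating masks on one frame
        if i % 15 == 0 or i == total:
            # Periodically move the slider so the preview follows progress
            yield f"Propagating masks: {i}/{total}", gr.update(value=i)
        else:
            # Otherwise only refresh the status text
            yield f"Propagating masks: {i}/{total}", gr.update()
    yield f"Propagated masks across {total} frames.", gr.update(value=total)


with gr.Blocks() as demo:
    status = gr.Markdown()
    frame_slider = gr.Slider(0, 45, step=1, label="Frame")
    gr.Button("Propagate across video").click(
        track, inputs=[], outputs=[status, frame_slider]
    )

demo.launch()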