aknapitsch user committed on
Commit
9f367f7
·
1 Parent(s): 9a006f8

finished mesh view

Browse files
Files changed (2) hide show
  1. app.py +165 -110
  2. mapanything/utils/hf_utils/visual_util.py +50 -137
app.py CHANGED
@@ -48,26 +48,6 @@ def get_logo_base64():
48
  return None
49
 
50
 
51
- # MapAnything Configuration
52
- # high_level_config = {
53
- # "path": "configs/train.yaml",
54
- # "hf_model_name": "facebook/MapAnything",
55
- # "model_str": "mapanything",
56
- # "config_overrides": [
57
- # "machine=aws",
58
- # "model=mapanything",
59
- # "model/task=images_only",
60
- # "model.encoder.uses_torch_hub=false",
61
- # ],
62
- # "checkpoint_name": "mapa_curri_24v_13d_48ipg_64g.pth",
63
- # "config_name": "config.json",
64
- # "trained_with_amp": True,
65
- # "trained_with_amp_dtype": "fp16",
66
- # "data_norm_type": "dinov2",
67
- # "patch_size": 14,
68
- # "resolution": 518,
69
- # }
70
-
71
  # MapAnything Configuration
72
  high_level_config = {
73
  "path": "configs/train.yaml",
@@ -96,7 +76,13 @@ model = None
96
  # 1) Core model inference
97
  # -------------------------------------------------------------------------
98
  @spaces.GPU(duration=120)
99
- def run_model(target_dir, apply_mask=True, mask_edges=True):
 
 
 
 
 
 
100
  """
101
  Run the MapAnything model on images in the 'target_dir/images' folder and return predictions.
102
  """
@@ -204,7 +190,7 @@ def run_model(target_dir, apply_mask=True, mask_edges=True):
204
 
205
  # Process data for visualization tabs (depth, normal, measure)
206
  processed_data = process_predictions_for_visualization(
207
- predictions, views, high_level_config
208
  )
209
 
210
  # Clean up
@@ -246,9 +232,7 @@ def update_depth_view(processed_data, view_index):
246
  if view_data is None or view_data["depth"] is None:
247
  return None
248
 
249
- # Use confidence filtering if available
250
- confidence = view_data.get("confidence")
251
- return colorize_depth(view_data["depth"], confidence=confidence)
252
 
253
 
254
  def update_normal_view(processed_data, view_index):
@@ -257,9 +241,7 @@ def update_normal_view(processed_data, view_index):
257
  if view_data is None or view_data["normal"] is None:
258
  return None
259
 
260
- # Use confidence filtering if available
261
- confidence = view_data.get("confidence")
262
- return colorize_normal(view_data["normal"], confidence=confidence)
263
 
264
 
265
  def update_measure_view(processed_data, view_index):
@@ -475,11 +457,11 @@ def gradio_demo(
475
  target_dir,
476
  frame_filter="All",
477
  show_cam=True,
478
- filter_sky=False,
479
  filter_black_bg=False,
480
  filter_white_bg=False,
481
  apply_mask=True,
482
  mask_edges=True,
 
483
  ):
484
  """
485
  Perform reconstruction using the already-created target_dir/images.
@@ -491,9 +473,6 @@ def gradio_demo(
491
  gc.collect()
492
  torch.cuda.empty_cache()
493
 
494
- # Always use Pointmap Branch for MapAnything
495
- prediction_mode = "Pointmap Branch"
496
-
497
  # Prepare frame_filter dropdown
498
  target_dir_images = os.path.join(target_dir, "images")
499
  all_files = (
@@ -519,7 +498,7 @@ def gradio_demo(
519
  # Build a GLB file name
520
  glbfile = os.path.join(
521
  target_dir,
522
- f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_sky{filter_sky}_black{filter_black_bg}_white{filter_white_bg}_pred{prediction_mode.replace(' ', '_')}.glb",
523
  )
524
 
525
  # Convert predictions to GLB
@@ -527,12 +506,9 @@ def gradio_demo(
527
  predictions,
528
  filter_by_frames=frame_filter,
529
  show_cam=show_cam,
530
- target_dir=target_dir,
531
- prediction_mode=prediction_mode,
532
- mask_sky=filter_sky,
533
  mask_black_bg=filter_black_bg,
534
  mask_white_bg=filter_white_bg,
535
- as_mesh=True, # Default to True for reconstruction
536
  )
537
  glbscene.export(file_obj=glbfile)
538
 
@@ -575,42 +551,19 @@ def gradio_demo(
575
  # -------------------------------------------------------------------------
576
  # 5) Helper functions for UI resets + re-visualization
577
  # -------------------------------------------------------------------------
578
- def apply_confidence_filtering(data, confidence, conf_thres):
579
- """Apply confidence filtering to data arrays"""
580
- if confidence is None or data is None:
581
- return data
582
-
583
- # Convert confidence threshold from percentage to confidence value
584
- conf_threshold = np.percentile(confidence, conf_thres)
585
- conf_mask = (confidence >= conf_threshold) & (confidence > 1e-5)
586
-
587
- # conf_mask = confidence >= (conf_thres)
588
-
589
- # Apply mask to data
590
- if len(data.shape) == 3: # 3D data (H, W, C)
591
- filtered_data = data.copy()
592
- for c in range(data.shape[2]):
593
- filtered_data[:, :, c] = np.where(conf_mask, data[:, :, c], 0)
594
- elif len(data.shape) == 2: # 2D data (H, W)
595
- filtered_data = np.where(conf_mask, data, 0)
596
- else:
597
- filtered_data = data
598
-
599
- return filtered_data
600
-
601
-
602
- def colorize_depth(depth_map, confidence=None, conf_thres=None):
603
- """Convert depth map to colorized visualization with optional confidence filtering"""
604
  if depth_map is None:
605
  return None
606
 
607
- # Apply confidence filtering if available
608
- if confidence is not None and conf_thres is not None:
609
- depth_map = apply_confidence_filtering(depth_map, confidence, conf_thres)
610
-
611
  # Normalize depth to 0-1 range
612
  depth_normalized = depth_map.copy()
613
  valid_mask = depth_normalized > 0
 
 
 
 
 
614
  if valid_mask.sum() > 0:
615
  valid_depths = depth_normalized[valid_mask]
616
  p5 = np.percentile(valid_depths, 5)
@@ -622,8 +575,6 @@ def colorize_depth(depth_map, confidence=None, conf_thres=None):
622
  import matplotlib.pyplot as plt
623
 
624
  colormap = plt.cm.turbo_r
625
- # colormap = plt.cm.plasma
626
- # colormap = plt.cm.viridis
627
  colored = colormap(depth_normalized)
628
  colored = (colored[:, :, :3] * 255).astype(np.uint8)
629
 
@@ -633,34 +584,36 @@ def colorize_depth(depth_map, confidence=None, conf_thres=None):
633
  return colored
634
 
635
 
636
- def colorize_normal(normal_map, confidence=None, conf_thres=None):
637
- """Convert normal map to colorized visualization with optional confidence filtering"""
638
  if normal_map is None:
639
  return None
640
 
641
- # Apply confidence filtering if available
642
- if confidence is not None and conf_thres is not None:
643
- normal_map = apply_confidence_filtering(normal_map, confidence, conf_thres)
 
 
 
 
644
 
645
  # Normalize normals to [0, 1] range for visualization
646
- normal_vis = (normal_map + 1.0) / 2.0
647
  normal_vis = (normal_vis * 255).astype(np.uint8)
648
 
649
  return normal_vis
650
 
651
 
652
- def process_predictions_for_visualization(predictions, views, high_level_config):
 
 
653
  """Extract depth, normal, and 3D points from predictions for visualization"""
654
  processed_data = {}
655
 
656
- # Check if confidence data is available in any view
657
- has_confidence_data = False
658
-
659
  # Process each view
660
  for view_idx, view in enumerate(views):
661
  # Get image
662
  image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
663
- # image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
664
 
665
  # Get predicted points
666
  pred_pts3d = predictions["world_points"][view_idx]
@@ -672,12 +625,32 @@ def process_predictions_for_visualization(predictions, views, high_level_config)
672
  "depth": None,
673
  "normal": None,
674
  "mask": None,
675
- "confidence": None,
676
- "has_confidence": has_confidence_data,
677
  }
678
 
679
- view_data["mask"] = predictions["final_mask"][view_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680
 
 
681
  view_data["depth"] = predictions["depth"][view_idx].squeeze()
682
 
683
  normals, _ = points_to_normals(pred_pts3d, mask=view_data["mask"])
@@ -872,7 +845,6 @@ def update_visualization(
872
  frame_filter,
873
  show_cam,
874
  is_example,
875
- filter_sky=False,
876
  filter_black_bg=False,
877
  filter_white_bg=False,
878
  show_mesh=True,
@@ -905,12 +877,9 @@ def update_visualization(
905
  loaded = np.load(predictions_path, allow_pickle=True)
906
  predictions = {key: loaded[key] for key in loaded.keys()}
907
 
908
- # Always use Pointmap Branch for MapAnything
909
- prediction_mode = "Pointmap Branch"
910
-
911
  glbfile = os.path.join(
912
  target_dir,
913
- f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_sky{filter_sky}_black{filter_black_bg}_white{filter_white_bg}_pred{prediction_mode.replace(' ', '_')}.glb",
914
  )
915
 
916
  if not os.path.exists(glbfile):
@@ -918,9 +887,6 @@ def update_visualization(
918
  predictions,
919
  filter_by_frames=frame_filter,
920
  show_cam=show_cam,
921
- target_dir=target_dir,
922
- prediction_mode=prediction_mode,
923
- mask_sky=filter_sky,
924
  mask_black_bg=filter_black_bg,
925
  mask_white_bg=filter_white_bg,
926
  as_mesh=show_mesh,
@@ -933,6 +899,77 @@ def update_visualization(
933
  )
934
 
935
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
  # -------------------------------------------------------------------------
937
  # Example scene functions
938
  # -------------------------------------------------------------------------
@@ -1147,9 +1184,6 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1147
  gr.Markdown("### Pointcloud options (live updates)")
1148
  show_cam = gr.Checkbox(label="Show Camera", value=True)
1149
  show_mesh = gr.Checkbox(label="Show mesh", value=True)
1150
- filter_sky = gr.Checkbox(
1151
- label="Filter Sky (using skyseg.onnx)", value=False
1152
- )
1153
  filter_black_bg = gr.Checkbox(
1154
  label="Filter Black Background", value=False
1155
  )
@@ -1160,7 +1194,6 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1160
  apply_mask_checkbox = gr.Checkbox(
1161
  label="Apply non-ambiguous mask", value=True
1162
  )
1163
- mask_edges_checkbox = apply_mask_checkbox
1164
  # ---------------------- Example Scenes Section ----------------------
1165
  gr.Markdown("## Example Scenes (lists all scenes in the examples folder)")
1166
  gr.Markdown("Click any thumbnail to load the scene for reconstruction.")
@@ -1223,11 +1256,10 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1223
  target_dir_output,
1224
  frame_filter,
1225
  show_cam,
1226
- filter_sky,
1227
  filter_black_bg,
1228
  filter_white_bg,
1229
  apply_mask_checkbox,
1230
- mask_edges_checkbox,
1231
  ],
1232
  outputs=[
1233
  reconstruction_output,
@@ -1258,6 +1290,9 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1258
  frame_filter,
1259
  show_cam,
1260
  is_example,
 
 
 
1261
  ],
1262
  [reconstruction_output, log_output],
1263
  )
@@ -1271,31 +1306,35 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1271
  ],
1272
  [reconstruction_output, log_output],
1273
  )
1274
- filter_sky.change(
1275
  update_visualization,
1276
  [
1277
  target_dir_output,
1278
  frame_filter,
1279
  show_cam,
1280
  is_example,
1281
- filter_sky,
1282
  filter_black_bg,
1283
  filter_white_bg,
1284
  ],
1285
  [reconstruction_output, log_output],
1286
- )
1287
- filter_black_bg.change(
1288
- update_visualization,
1289
- [
1290
  target_dir_output,
1291
- frame_filter,
1292
- show_cam,
1293
- is_example,
1294
- filter_sky,
1295
  filter_black_bg,
1296
  filter_white_bg,
 
 
 
 
 
 
 
 
 
 
 
1297
  ],
1298
- [reconstruction_output, log_output],
1299
  )
1300
  filter_white_bg.change(
1301
  update_visualization,
@@ -1304,12 +1343,29 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1304
  frame_filter,
1305
  show_cam,
1306
  is_example,
1307
- filter_sky,
1308
  filter_black_bg,
1309
  filter_white_bg,
1310
  show_mesh,
1311
  ],
1312
  [reconstruction_output, log_output],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1313
  )
1314
 
1315
  show_mesh.change(
@@ -1319,7 +1375,6 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1319
  frame_filter,
1320
  show_cam,
1321
  is_example,
1322
- filter_sky,
1323
  filter_black_bg,
1324
  filter_white_bg,
1325
  show_mesh,
 
48
  return None
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # MapAnything Configuration
52
  high_level_config = {
53
  "path": "configs/train.yaml",
 
76
  # 1) Core model inference
77
  # -------------------------------------------------------------------------
78
  @spaces.GPU(duration=120)
79
+ def run_model(
80
+ target_dir,
81
+ apply_mask=True,
82
+ mask_edges=True,
83
+ filter_black_bg=False,
84
+ filter_white_bg=False,
85
+ ):
86
  """
87
  Run the MapAnything model on images in the 'target_dir/images' folder and return predictions.
88
  """
 
190
 
191
  # Process data for visualization tabs (depth, normal, measure)
192
  processed_data = process_predictions_for_visualization(
193
+ predictions, views, high_level_config, filter_black_bg, filter_white_bg
194
  )
195
 
196
  # Clean up
 
232
  if view_data is None or view_data["depth"] is None:
233
  return None
234
 
235
+ return colorize_depth(view_data["depth"], mask=view_data.get("mask"))
 
 
236
 
237
 
238
  def update_normal_view(processed_data, view_index):
 
241
  if view_data is None or view_data["normal"] is None:
242
  return None
243
 
244
+ return colorize_normal(view_data["normal"], mask=view_data.get("mask"))
 
 
245
 
246
 
247
  def update_measure_view(processed_data, view_index):
 
457
  target_dir,
458
  frame_filter="All",
459
  show_cam=True,
 
460
  filter_black_bg=False,
461
  filter_white_bg=False,
462
  apply_mask=True,
463
  mask_edges=True,
464
+ show_mesh=True,
465
  ):
466
  """
467
  Perform reconstruction using the already-created target_dir/images.
 
473
  gc.collect()
474
  torch.cuda.empty_cache()
475
 
 
 
 
476
  # Prepare frame_filter dropdown
477
  target_dir_images = os.path.join(target_dir, "images")
478
  all_files = (
 
498
  # Build a GLB file name
499
  glbfile = os.path.join(
500
  target_dir,
501
+ f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}_black{filter_black_bg}_white{filter_white_bg}.glb",
502
  )
503
 
504
  # Convert predictions to GLB
 
506
  predictions,
507
  filter_by_frames=frame_filter,
508
  show_cam=show_cam,
 
 
 
509
  mask_black_bg=filter_black_bg,
510
  mask_white_bg=filter_white_bg,
511
+ as_mesh=show_mesh, # Use the show_mesh parameter
512
  )
513
  glbscene.export(file_obj=glbfile)
514
 
 
551
  # -------------------------------------------------------------------------
552
  # 5) Helper functions for UI resets + re-visualization
553
  # -------------------------------------------------------------------------
554
+ def colorize_depth(depth_map, mask=None):
555
+ """Convert depth map to colorized visualization with optional mask"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  if depth_map is None:
557
  return None
558
 
 
 
 
 
559
  # Normalize depth to 0-1 range
560
  depth_normalized = depth_map.copy()
561
  valid_mask = depth_normalized > 0
562
+
563
+ # Apply additional mask if provided (for background filtering)
564
+ if mask is not None:
565
+ valid_mask = valid_mask & mask
566
+
567
  if valid_mask.sum() > 0:
568
  valid_depths = depth_normalized[valid_mask]
569
  p5 = np.percentile(valid_depths, 5)
 
575
  import matplotlib.pyplot as plt
576
 
577
  colormap = plt.cm.turbo_r
 
 
578
  colored = colormap(depth_normalized)
579
  colored = (colored[:, :, :3] * 255).astype(np.uint8)
580
 
 
584
  return colored
585
 
586
 
587
+ def colorize_normal(normal_map, mask=None):
588
+ """Convert normal map to colorized visualization with optional mask"""
589
  if normal_map is None:
590
  return None
591
 
592
+ # Create a copy for modification
593
+ normal_vis = normal_map.copy()
594
+
595
+ # Apply mask if provided (set masked areas to [0, 0, 0] which becomes grey after normalization)
596
+ if mask is not None:
597
+ invalid_mask = ~mask
598
+ normal_vis[invalid_mask] = [0, 0, 0] # Set invalid areas to zero
599
 
600
  # Normalize normals to [0, 1] range for visualization
601
+ normal_vis = (normal_vis + 1.0) / 2.0
602
  normal_vis = (normal_vis * 255).astype(np.uint8)
603
 
604
  return normal_vis
605
 
606
 
607
+ def process_predictions_for_visualization(
608
+ predictions, views, high_level_config, filter_black_bg=False, filter_white_bg=False
609
+ ):
610
  """Extract depth, normal, and 3D points from predictions for visualization"""
611
  processed_data = {}
612
 
 
 
 
613
  # Process each view
614
  for view_idx, view in enumerate(views):
615
  # Get image
616
  image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
 
617
 
618
  # Get predicted points
619
  pred_pts3d = predictions["world_points"][view_idx]
 
625
  "depth": None,
626
  "normal": None,
627
  "mask": None,
 
 
628
  }
629
 
630
+ # Start with the final mask from predictions
631
+ mask = predictions["final_mask"][view_idx].copy()
632
+
633
+ # Apply black background filtering if enabled
634
+ if filter_black_bg:
635
+ # Get the image colors (ensure they're in 0-255 range)
636
+ view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
637
+ # Filter out black background pixels (sum of RGB < 16)
638
+ black_bg_mask = view_colors.sum(axis=2) >= 16
639
+ mask = mask & black_bg_mask
640
+
641
+ # Apply white background filtering if enabled
642
+ if filter_white_bg:
643
+ # Get the image colors (ensure they're in 0-255 range)
644
+ view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
645
+ # Filter out white background pixels (all RGB > 240)
646
+ white_bg_mask = ~(
647
+ (view_colors[:, :, 0] > 240)
648
+ & (view_colors[:, :, 1] > 240)
649
+ & (view_colors[:, :, 2] > 240)
650
+ )
651
+ mask = mask & white_bg_mask
652
 
653
+ view_data["mask"] = mask
654
  view_data["depth"] = predictions["depth"][view_idx].squeeze()
655
 
656
  normals, _ = points_to_normals(pred_pts3d, mask=view_data["mask"])
 
845
  frame_filter,
846
  show_cam,
847
  is_example,
 
848
  filter_black_bg=False,
849
  filter_white_bg=False,
850
  show_mesh=True,
 
877
  loaded = np.load(predictions_path, allow_pickle=True)
878
  predictions = {key: loaded[key] for key in loaded.keys()}
879
 
 
 
 
880
  glbfile = os.path.join(
881
  target_dir,
882
+ f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}_black{filter_black_bg}_white{filter_white_bg}.glb",
883
  )
884
 
885
  if not os.path.exists(glbfile):
 
887
  predictions,
888
  filter_by_frames=frame_filter,
889
  show_cam=show_cam,
 
 
 
890
  mask_black_bg=filter_black_bg,
891
  mask_white_bg=filter_white_bg,
892
  as_mesh=show_mesh,
 
899
  )
900
 
901
 
902
+ def update_all_views_on_filter_change(
903
+ target_dir,
904
+ filter_black_bg,
905
+ filter_white_bg,
906
+ processed_data,
907
+ depth_view_selector,
908
+ normal_view_selector,
909
+ measure_view_selector,
910
+ ):
911
+ """
912
+ Update all individual view tabs when background filtering checkboxes change.
913
+ This regenerates the processed data with new filtering and updates all views.
914
+ """
915
+ # Check if we have a valid target directory and predictions
916
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
917
+ return processed_data, None, None, None, []
918
+
919
+ predictions_path = os.path.join(target_dir, "predictions.npz")
920
+ if not os.path.exists(predictions_path):
921
+ return processed_data, None, None, None, []
922
+
923
+ try:
924
+ # Load the original predictions and views
925
+ loaded = np.load(predictions_path, allow_pickle=True)
926
+ predictions = {key: loaded[key] for key in loaded.keys()}
927
+
928
+ # Load images using MapAnything's load_images function
929
+ image_folder_path = os.path.join(target_dir, "images")
930
+ views = load_images(image_folder_path)
931
+
932
+ # Regenerate processed data with new filtering settings
933
+ new_processed_data = process_predictions_for_visualization(
934
+ predictions, views, high_level_config, filter_black_bg, filter_white_bg
935
+ )
936
+
937
+ # Get current view indices
938
+ try:
939
+ depth_view_idx = (
940
+ int(depth_view_selector.split()[1]) - 1 if depth_view_selector else 0
941
+ )
942
+ except:
943
+ depth_view_idx = 0
944
+
945
+ try:
946
+ normal_view_idx = (
947
+ int(normal_view_selector.split()[1]) - 1 if normal_view_selector else 0
948
+ )
949
+ except:
950
+ normal_view_idx = 0
951
+
952
+ try:
953
+ measure_view_idx = (
954
+ int(measure_view_selector.split()[1]) - 1
955
+ if measure_view_selector
956
+ else 0
957
+ )
958
+ except:
959
+ measure_view_idx = 0
960
+
961
+ # Update all views with new filtered data
962
+ depth_vis = update_depth_view(new_processed_data, depth_view_idx)
963
+ normal_vis = update_normal_view(new_processed_data, normal_view_idx)
964
+ measure_img, _ = update_measure_view(new_processed_data, measure_view_idx)
965
+
966
+ return new_processed_data, depth_vis, normal_vis, measure_img, []
967
+
968
+ except Exception as e:
969
+ print(f"Error updating views on filter change: {e}")
970
+ return processed_data, None, None, None, []
971
+
972
+
973
  # -------------------------------------------------------------------------
974
  # Example scene functions
975
  # -------------------------------------------------------------------------
 
1184
  gr.Markdown("### Pointcloud options (live updates)")
1185
  show_cam = gr.Checkbox(label="Show Camera", value=True)
1186
  show_mesh = gr.Checkbox(label="Show mesh", value=True)
 
 
 
1187
  filter_black_bg = gr.Checkbox(
1188
  label="Filter Black Background", value=False
1189
  )
 
1194
  apply_mask_checkbox = gr.Checkbox(
1195
  label="Apply non-ambiguous mask", value=True
1196
  )
 
1197
  # ---------------------- Example Scenes Section ----------------------
1198
  gr.Markdown("## Example Scenes (lists all scenes in the examples folder)")
1199
  gr.Markdown("Click any thumbnail to load the scene for reconstruction.")
 
1256
  target_dir_output,
1257
  frame_filter,
1258
  show_cam,
 
1259
  filter_black_bg,
1260
  filter_white_bg,
1261
  apply_mask_checkbox,
1262
+ show_mesh,
1263
  ],
1264
  outputs=[
1265
  reconstruction_output,
 
1290
  frame_filter,
1291
  show_cam,
1292
  is_example,
1293
+ filter_black_bg,
1294
+ filter_white_bg,
1295
+ show_mesh,
1296
  ],
1297
  [reconstruction_output, log_output],
1298
  )
 
1306
  ],
1307
  [reconstruction_output, log_output],
1308
  )
1309
+ filter_black_bg.change(
1310
  update_visualization,
1311
  [
1312
  target_dir_output,
1313
  frame_filter,
1314
  show_cam,
1315
  is_example,
 
1316
  filter_black_bg,
1317
  filter_white_bg,
1318
  ],
1319
  [reconstruction_output, log_output],
1320
+ ).then(
1321
+ fn=update_all_views_on_filter_change,
1322
+ inputs=[
 
1323
  target_dir_output,
 
 
 
 
1324
  filter_black_bg,
1325
  filter_white_bg,
1326
+ processed_data_state,
1327
+ depth_view_selector,
1328
+ normal_view_selector,
1329
+ measure_view_selector,
1330
+ ],
1331
+ outputs=[
1332
+ processed_data_state,
1333
+ depth_map,
1334
+ normal_map,
1335
+ measure_image,
1336
+ measure_points_state,
1337
  ],
 
1338
  )
1339
  filter_white_bg.change(
1340
  update_visualization,
 
1343
  frame_filter,
1344
  show_cam,
1345
  is_example,
 
1346
  filter_black_bg,
1347
  filter_white_bg,
1348
  show_mesh,
1349
  ],
1350
  [reconstruction_output, log_output],
1351
+ ).then(
1352
+ fn=update_all_views_on_filter_change,
1353
+ inputs=[
1354
+ target_dir_output,
1355
+ filter_black_bg,
1356
+ filter_white_bg,
1357
+ processed_data_state,
1358
+ depth_view_selector,
1359
+ normal_view_selector,
1360
+ measure_view_selector,
1361
+ ],
1362
+ outputs=[
1363
+ processed_data_state,
1364
+ depth_map,
1365
+ normal_map,
1366
+ measure_image,
1367
+ measure_points_state,
1368
+ ],
1369
  )
1370
 
1371
  show_mesh.change(
 
1375
  frame_filter,
1376
  show_cam,
1377
  is_example,
 
1378
  filter_black_bg,
1379
  filter_white_bg,
1380
  show_mesh,
mapanything/utils/hf_utils/visual_util.py CHANGED
@@ -107,13 +107,13 @@ def image_mesh(
107
  *vertex_attrs (np.ndarray): vertex attributes in corresponding order with input image_attrs
108
  indices (np.ndarray, optional): indices of vertices in the original mesh
109
  """
110
- assert (len(image_attrs) > 0) or (mask is not None), (
111
- "At least one of image_attrs or mask should be provided"
112
- )
113
  height, width = next(image_attrs).shape[:2] if mask is None else mask.shape
114
- assert all(img.shape[:2] == (height, width) for img in image_attrs), (
115
- "All image_attrs should have the same shape"
116
- )
117
 
118
  row_faces = np.stack(
119
  [
@@ -151,14 +151,10 @@ def image_mesh(
151
 
152
  def predictions_to_glb(
153
  predictions,
154
- conf_thres=50.0,
155
  filter_by_frames="all",
156
  mask_black_bg=False,
157
  mask_white_bg=False,
158
  show_cam=True,
159
- mask_sky=False,
160
- target_dir=None,
161
- prediction_mode="Predicted Pointmap",
162
  mask_ambiguous=False,
163
  as_mesh=True,
164
  ) -> trimesh.Scene:
@@ -168,17 +164,12 @@ def predictions_to_glb(
168
  Args:
169
  predictions (dict): Dictionary containing model predictions with keys:
170
  - world_points: 3D point coordinates (S, H, W, 3)
171
- - world_points_conf: Confidence scores (S, H, W)
172
  - images: Input images (S, H, W, 3)
173
  - extrinsic: Camera extrinsic matrices (S, 3, 4)
174
- conf_thres (float): Percentage of low-confidence points to filter out (default: 50.0)
175
  filter_by_frames (str): Frame filter specification (default: "all")
176
  mask_black_bg (bool): Mask out black background pixels (default: False)
177
  mask_white_bg (bool): Mask out white background pixels (default: False)
178
  show_cam (bool): Include camera visualization (default: True)
179
- mask_sky (bool): Apply sky segmentation mask (default: False)
180
- target_dir (str): Output directory for intermediate files (default: None)
181
- prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
182
  mask_ambiguous (bool): Apply final mask to filter ambiguous predictions (default: False)
183
  as_mesh (bool): Represent the data as a mesh instead of point cloud (default: False)
184
 
@@ -191,9 +182,6 @@ def predictions_to_glb(
191
  if not isinstance(predictions, dict):
192
  raise ValueError("predictions must be a dictionary")
193
 
194
- if conf_thres is None:
195
- conf_thres = 10.0
196
-
197
  print("Building GLB scene")
198
  selected_frame_idx = None
199
  if filter_by_frames != "all" and filter_by_frames != "All":
@@ -203,95 +191,23 @@ def predictions_to_glb(
203
  except (ValueError, IndexError):
204
  pass
205
 
206
- if "Pointmap" in prediction_mode:
207
- print("Using Pointmap Branch")
208
- if "world_points" in predictions:
209
- # import ipdb
210
-
211
- # ipdb.set_trace()
212
-
213
- pred_world_points = predictions[
214
- "world_points"
215
- ] # No batch dimension to remove
216
- pred_world_points_conf = predictions.get(
217
- "confidence", np.ones_like(pred_world_points[..., 0])
218
- )
219
- else:
220
- print(
221
- "Warning: world_points not found in predictions, falling back to depth-based points"
222
- )
223
- pred_world_points = predictions["world_points_from_depth"]
224
- pred_world_points_conf = predictions.get(
225
- "depth_conf", np.ones_like(pred_world_points[..., 0])
226
- )
227
- else:
228
- print("Using Depthmap and Camera Branch")
229
- pred_world_points = predictions["world_points_from_depth"]
230
- pred_world_points_conf = predictions.get(
231
- "depth_conf", np.ones_like(pred_world_points[..., 0])
232
  )
233
 
 
 
234
  # Get images from predictions
235
  images = predictions["images"]
236
  # Use extrinsic matrices instead of pred_extrinsic_list
237
  camera_matrices = predictions["extrinsic"]
238
 
239
- if mask_sky:
240
- if target_dir is not None:
241
- import onnxruntime
242
-
243
- skyseg_session = None
244
- target_dir_images = target_dir + "/images"
245
- image_list = sorted(os.listdir(target_dir_images))
246
- sky_mask_list = []
247
-
248
- # Get the shape of pred_world_points_conf to match
249
- S, H, W = (
250
- pred_world_points_conf.shape
251
- if hasattr(pred_world_points_conf, "shape")
252
- else (len(images), images.shape[1], images.shape[2])
253
- )
254
-
255
- # Download skyseg.onnx if it doesn't exist
256
- if not os.path.exists("skyseg.onnx"):
257
- print("Downloading skyseg.onnx...")
258
- download_file_from_url(
259
- "https://huggingface.co/JianyuanWang/skyseg/resolve/main/skyseg.onnx",
260
- "skyseg.onnx",
261
- )
262
-
263
- for i, image_name in enumerate(image_list):
264
- image_filepath = os.path.join(target_dir_images, image_name)
265
- mask_filepath = os.path.join(target_dir, "sky_masks", image_name)
266
-
267
- # Check if mask already exists
268
- if os.path.exists(mask_filepath):
269
- # Load existing mask
270
- sky_mask = cv2.imread(mask_filepath, cv2.IMREAD_GRAYSCALE)
271
- else:
272
- # Generate new mask
273
- if skyseg_session is None:
274
- skyseg_session = onnxruntime.InferenceSession("skyseg.onnx")
275
- sky_mask = segment_sky(
276
- image_filepath, skyseg_session, mask_filepath
277
- )
278
-
279
- # Resize mask to match H×W if needed
280
- if sky_mask.shape[0] != H or sky_mask.shape[1] != W:
281
- sky_mask = cv2.resize(sky_mask, (W, H))
282
-
283
- sky_mask_list.append(sky_mask)
284
-
285
- # Convert list to numpy array with shape S×H×W
286
- sky_mask_array = np.array(sky_mask_list)
287
-
288
- # Apply sky mask to confidence scores
289
- sky_mask_binary = (sky_mask_array > 0.1).astype(np.float32)
290
- pred_world_points_conf = pred_world_points_conf * sky_mask_binary
291
-
292
  if selected_frame_idx is not None:
293
  pred_world_points = pred_world_points[selected_frame_idx][None]
294
- pred_world_points_conf = pred_world_points_conf[selected_frame_idx][None]
295
  images = images[selected_frame_idx][None]
296
  camera_matrices = camera_matrices[selected_frame_idx][None]
297
 
@@ -303,36 +219,30 @@ def predictions_to_glb(
303
  colors_rgb = images
304
  colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
305
 
306
- conf = pred_world_points_conf.reshape(-1)
307
- # Convert percentage threshold to actual confidence value
308
- if conf_thres == 0.0:
309
- conf_threshold = 0.0
310
- else:
311
- conf_threshold = np.percentile(conf, conf_thres)
312
-
313
- conf_mask = (conf >= conf_threshold) & (conf > 1e-5)
314
  final_mask = predictions["final_mask"].reshape(-1)
315
 
316
  if mask_black_bg:
317
- black_bg_mask = colors_rgb.sum(axis=1) >= 16 / 255.0
318
- conf_mask = conf_mask & black_bg_mask
319
 
320
  if mask_white_bg:
321
  # Filter out white background pixels (RGB values close to white)
322
  # Consider pixels white if all RGB values are above 240
323
  white_bg_mask = (
324
- (colors_rgb[:, 0] > 240 / 255.0)
325
- & (colors_rgb[:, 1] > 240 / 255.0)
326
- & (colors_rgb[:, 2] > 240 / 255.0)
327
  )
328
- conf_mask = conf_mask & white_bg_mask
329
 
330
- # Use final_mask with conf_mask when mask_ambiguous is checked
331
  if mask_ambiguous:
332
- conf_mask = conf_mask & final_mask
333
 
334
- vertices_3d = vertices_3d[conf_mask].copy()
335
- colors_rgb = colors_rgb[conf_mask].copy()
336
 
337
  if vertices_3d is None or np.asarray(vertices_3d).size == 0:
338
  vertices_3d = np.array([[1, 0, 0]])
@@ -368,16 +278,13 @@ def predictions_to_glb(
368
  else: # Assume already in HWC format
369
  original_image_colors = images[0]
370
  original_image_colors *= 255
371
- # Create mask from confidence and other filters
372
- original_conf = pred_world_points_conf.reshape(H, W)
373
  original_final_mask = predictions["final_mask"][selected_frame_idx].reshape(
374
  H, W
375
  )
376
 
377
- # Apply thresholds to create mask
378
- mask = (original_conf >= conf_threshold) & (original_conf > 1e-5)
379
- if mask_ambiguous:
380
- mask = mask & original_final_mask
381
 
382
  # Additional background masks if needed
383
  if mask_black_bg:
@@ -407,29 +314,28 @@ def predictions_to_glb(
407
  original_points * np.array([1, -1, 1], dtype=np.float32),
408
  original_image_colors / 255.0,
409
  frame_normals * np.array([1, -1, 1], dtype=np.float32),
410
- mask=original_final_mask,
411
  tri=True,
412
  return_indices=False,
413
  )
414
 
415
  # Apply coordinate transformations to normals
416
  vertex_normals = vertex_normals * np.array([1, -1, 1], dtype=np.float32)
417
- # frame_normals = frame_normals * np.array([1, -1, 1], dtype=np.float32)
418
  else:
419
  # Create faces and vertices using image_mesh without normals
420
  faces, vertices, vertex_colors = image_mesh(
421
  original_points * np.array([1, -1, 1], dtype=np.float32),
422
  original_image_colors / 255.0,
423
- mask=original_final_mask,
424
  tri=True,
425
  return_indices=False,
426
  )
427
 
428
- vertices = vertices * np.array([1, -1, 1], dtype=np.float32)
429
 
430
  # Create trimesh object with optional normals
431
  mesh_data = trimesh.Trimesh(
432
- vertices=vertices,
433
  faces=faces,
434
  vertex_colors=(vertex_colors * 255).astype(np.uint8),
435
  vertex_normals=(vertex_normals if vertex_normals is not None else None),
@@ -446,7 +352,6 @@ def predictions_to_glb(
446
 
447
  # Get data for this frame
448
  frame_points = pred_world_points[frame_idx]
449
- frame_conf = pred_world_points_conf[frame_idx]
450
  frame_final_mask = predictions["final_mask"][frame_idx]
451
 
452
  # Get frame image
@@ -455,16 +360,27 @@ def predictions_to_glb(
455
  else: # Assume already in HWC format
456
  frame_image = images[frame_idx]
457
  frame_image *= 255
458
- # Create mask for this frame
459
- mask = (frame_conf >= conf_threshold) & (frame_conf > 1e-5)
460
- if mask_ambiguous:
461
- mask = mask | frame_final_mask
 
 
 
 
 
 
 
 
 
 
 
462
 
463
  # Create mesh for this frame
464
  faces, vertices, vertex_colors = image_mesh(
465
  frame_points * np.array([1, -1, 1], dtype=np.float32),
466
  frame_image / 255.0,
467
- mask=frame_final_mask,
468
  tri=True,
469
  return_indices=False,
470
  )
@@ -484,9 +400,6 @@ def predictions_to_glb(
484
 
485
  # Prepare 4x4 matrices for camera extrinsics
486
  num_cameras = len(camera_matrices)
487
- # extrinsics_matrices = np.zeros((num_cameras, 4, 4))
488
- # extrinsics_matrices[:, :3, :4] = camera_matrices
489
- # extrinsics_matrices[:, 3, 3] = 1
490
 
491
  if show_cam:
492
  # Add camera models to the scene
@@ -497,7 +410,7 @@ def predictions_to_glb(
497
  current_color = tuple(int(255 * x) for x in rgba_color[:3])
498
 
499
  integrate_camera_into_scene(
500
- scene_3d, camera_to_world, current_color, scene_scale
501
  )
502
 
503
  # Align scene to the observation of the first camera
 
107
  *vertex_attrs (np.ndarray): vertex attributes in corresponding order with input image_attrs
108
  indices (np.ndarray, optional): indices of vertices in the original mesh
109
  """
110
+ assert (len(image_attrs) > 0) or (
111
+ mask is not None
112
+ ), "At least one of image_attrs or mask should be provided"
113
  height, width = next(image_attrs).shape[:2] if mask is None else mask.shape
114
+ assert all(
115
+ img.shape[:2] == (height, width) for img in image_attrs
116
+ ), "All image_attrs should have the same shape"
117
 
118
  row_faces = np.stack(
119
  [
 
151
 
152
  def predictions_to_glb(
153
  predictions,
 
154
  filter_by_frames="all",
155
  mask_black_bg=False,
156
  mask_white_bg=False,
157
  show_cam=True,
 
 
 
158
  mask_ambiguous=False,
159
  as_mesh=True,
160
  ) -> trimesh.Scene:
 
164
  Args:
165
  predictions (dict): Dictionary containing model predictions with keys:
166
  - world_points: 3D point coordinates (S, H, W, 3)
 
167
  - images: Input images (S, H, W, 3)
168
  - extrinsic: Camera extrinsic matrices (S, 3, 4)
 
169
  filter_by_frames (str): Frame filter specification (default: "all")
170
  mask_black_bg (bool): Mask out black background pixels (default: False)
171
  mask_white_bg (bool): Mask out white background pixels (default: False)
172
  show_cam (bool): Include camera visualization (default: True)
 
 
 
173
  mask_ambiguous (bool): Apply final mask to filter ambiguous predictions (default: False)
174
  as_mesh (bool): Represent the data as a mesh instead of point cloud (default: False)
175
 
 
182
  if not isinstance(predictions, dict):
183
  raise ValueError("predictions must be a dictionary")
184
 
 
 
 
185
  print("Building GLB scene")
186
  selected_frame_idx = None
187
  if filter_by_frames != "all" and filter_by_frames != "All":
 
191
  except (ValueError, IndexError):
192
  pass
193
 
194
+ # Always use Pointmap Branch
195
+ print("Using Pointmap Branch")
196
+ if "world_points" not in predictions:
197
+ raise ValueError(
198
+ "world_points not found in predictions. Pointmap Branch requires 'world_points' key. "
199
+ "Depthmap and Camera branches have been removed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  )
201
 
202
+ pred_world_points = predictions["world_points"]
203
+
204
  # Get images from predictions
205
  images = predictions["images"]
206
  # Use extrinsic matrices instead of pred_extrinsic_list
207
  camera_matrices = predictions["extrinsic"]
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  if selected_frame_idx is not None:
210
  pred_world_points = pred_world_points[selected_frame_idx][None]
 
211
  images = images[selected_frame_idx][None]
212
  camera_matrices = camera_matrices[selected_frame_idx][None]
213
 
 
219
  colors_rgb = images
220
  colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
221
 
222
+ # Create mask for filtering
223
+ mask = np.ones(len(vertices_3d), dtype=bool)
 
 
 
 
 
 
224
  final_mask = predictions["final_mask"].reshape(-1)
225
 
226
  if mask_black_bg:
227
+ black_bg_mask = colors_rgb.sum(axis=1) >= 16
228
+ mask = mask & black_bg_mask
229
 
230
  if mask_white_bg:
231
  # Filter out white background pixels (RGB values close to white)
232
  # Consider pixels white if all RGB values are above 240
233
  white_bg_mask = (
234
+ (colors_rgb[:, 0] > 240)
235
+ & (colors_rgb[:, 1] > 240)
236
+ & (colors_rgb[:, 2] > 240)
237
  )
238
+ mask = mask & ~white_bg_mask
239
 
240
+ # Use final_mask when mask_ambiguous is checked
241
  if mask_ambiguous:
242
+ mask = mask & final_mask
243
 
244
+ vertices_3d = vertices_3d[mask].copy()
245
+ colors_rgb = colors_rgb[mask].copy()
246
 
247
  if vertices_3d is None or np.asarray(vertices_3d).size == 0:
248
  vertices_3d = np.array([[1, 0, 0]])
 
278
  else: # Assume already in HWC format
279
  original_image_colors = images[0]
280
  original_image_colors *= 255
281
+ # Get original final mask
 
282
  original_final_mask = predictions["final_mask"][selected_frame_idx].reshape(
283
  H, W
284
  )
285
 
286
+ # Create mask based on final mask
287
+ mask = original_final_mask
 
 
288
 
289
  # Additional background masks if needed
290
  if mask_black_bg:
 
314
  original_points * np.array([1, -1, 1], dtype=np.float32),
315
  original_image_colors / 255.0,
316
  frame_normals * np.array([1, -1, 1], dtype=np.float32),
317
+ mask=mask,
318
  tri=True,
319
  return_indices=False,
320
  )
321
 
322
  # Apply coordinate transformations to normals
323
  vertex_normals = vertex_normals * np.array([1, -1, 1], dtype=np.float32)
 
324
  else:
325
  # Create faces and vertices using image_mesh without normals
326
  faces, vertices, vertex_colors = image_mesh(
327
  original_points * np.array([1, -1, 1], dtype=np.float32),
328
  original_image_colors / 255.0,
329
+ mask=mask,
330
  tri=True,
331
  return_indices=False,
332
  )
333
 
334
+ # vertices = vertices * np.array([1, -1, 1], dtype=np.float32)
335
 
336
  # Create trimesh object with optional normals
337
  mesh_data = trimesh.Trimesh(
338
+ vertices=vertices * np.array([1, -1, 1], dtype=np.float32),
339
  faces=faces,
340
  vertex_colors=(vertex_colors * 255).astype(np.uint8),
341
  vertex_normals=(vertex_normals if vertex_normals is not None else None),
 
352
 
353
  # Get data for this frame
354
  frame_points = pred_world_points[frame_idx]
 
355
  frame_final_mask = predictions["final_mask"][frame_idx]
356
 
357
  # Get frame image
 
360
  else: # Assume already in HWC format
361
  frame_image = images[frame_idx]
362
  frame_image *= 255
363
+ # Create mask for this frame using final_mask
364
+ mask = frame_final_mask
365
+
366
+ # Additional background masks if needed
367
+ if mask_black_bg:
368
+ black_bg_mask = frame_image.sum(axis=2) >= 16
369
+ mask = mask & black_bg_mask
370
+
371
+ if mask_white_bg:
372
+ white_bg_mask = ~(
373
+ (frame_image[:, :, 0] > 240)
374
+ & (frame_image[:, :, 1] > 240)
375
+ & (frame_image[:, :, 2] > 240)
376
+ )
377
+ mask = mask & white_bg_mask
378
 
379
  # Create mesh for this frame
380
  faces, vertices, vertex_colors = image_mesh(
381
  frame_points * np.array([1, -1, 1], dtype=np.float32),
382
  frame_image / 255.0,
383
+ mask=mask,
384
  tri=True,
385
  return_indices=False,
386
  )
 
400
 
401
  # Prepare 4x4 matrices for camera extrinsics
402
  num_cameras = len(camera_matrices)
 
 
 
403
 
404
  if show_cam:
405
  # Add camera models to the scene
 
410
  current_color = tuple(int(255 * x) for x in rgba_color[:3])
411
 
412
  integrate_camera_into_scene(
413
+ scene_3d, world_to_camera, current_color, scene_scale
414
  )
415
 
416
  # Align scene to the observation of the first camera