aknapitsch committed · Commit 37de32d · 1 Parent(s): 8c1e404

simpler inference and refactoring

Files changed (47)
  1. app.py +165 -445
  2. hf_utils/vgg_geometry.py +0 -166
  3. hf_utils/visual_util.py +5 -5
  4. mapanything/__init__.py +0 -0
  5. mapanything/datasets/wai/ase.py +1 -1
  6. mapanything/datasets/wai/bedlam.py +1 -1
  7. mapanything/datasets/wai/blendedmvs.py +2 -9
  8. mapanything/datasets/wai/dl3dv.py +7 -28
  9. mapanything/datasets/wai/dtu.py +1 -1
  10. mapanything/datasets/wai/dynamicreplica.py +1 -1
  11. mapanything/datasets/wai/eth3d.py +1 -1
  12. mapanything/datasets/wai/gta_sfm.py +1 -1
  13. mapanything/datasets/wai/matrixcity.py +1 -1
  14. mapanything/datasets/wai/megadepth.py +2 -9
  15. mapanything/datasets/wai/mpsd.py +2 -9
  16. mapanything/datasets/wai/mvs_synth.py +1 -1
  17. mapanything/datasets/wai/paralleldomain4d.py +1 -1
  18. mapanything/datasets/wai/sailvos3d.py +1 -1
  19. mapanything/datasets/wai/scannetpp.py +1 -1
  20. mapanything/datasets/wai/spring.py +2 -9
  21. mapanything/datasets/wai/structured3d.py +1 -1
  22. mapanything/datasets/wai/tav2_wb.py +2 -9
  23. mapanything/datasets/wai/unrealstereo4k.py +1 -1
  24. mapanything/datasets/wai/xrooms.py +1 -1
  25. mapanything/models/external/README.md +5 -0
  26. mapanything/models/external/moge/models/v1.py +1 -1
  27. mapanything/models/external/moge/models/v2.py +1 -1
  28. mapanything/models/mapanything/ablations.py +4 -2
  29. mapanything/models/mapanything/model.py +220 -4
  30. mapanything/models/mapanything/modular_dust3r.py +4 -2
  31. mapanything/train/losses.py +283 -9
  32. mapanything/utils/geometry.py +91 -0
  33. mapanything/utils/image.py +11 -10
  34. mapanything/utils/inference.py +389 -0
  35. mapanything/utils/viz.py +2 -2
  36. mapanything/utils/wai/__init__.py +3 -0
  37. mapanything/utils/wai/basic_dataset.py +131 -0
  38. mapanything/utils/wai/camera.py +263 -0
  39. mapanything/utils/wai/colormaps/colors_fps_5k.npz +3 -0
  40. mapanything/utils/wai/core.py +492 -0
  41. mapanything/utils/wai/intersection_check.py +462 -0
  42. mapanything/utils/wai/io.py +1373 -0
  43. mapanything/utils/wai/m_ops.py +346 -0
  44. mapanything/utils/wai/ops.py +368 -0
  45. mapanything/utils/wai/scene_frame.py +431 -0
  46. mapanything/utils/wai/semantics.py +40 -0
  47. requirements.txt +1 -0
app.py CHANGED
@@ -18,24 +18,21 @@ import gradio as gr
18
  import numpy as np
19
  import spaces
20
  import torch
21
- from huggingface_hub import hf_hub_download
22
 
23
  sys.path.append("mapanything/")
24
 
25
  from hf_utils.css_and_html import (
 
 
26
  get_acknowledgements_html,
27
  get_description_html,
28
  get_gradio_theme,
29
  get_header_html,
30
- GRADIO_CSS,
31
- MEASURE_INSTRUCTIONS_HTML,
32
  )
33
- from hf_utils.vgg_geometry import unproject_depth_map_to_point_map
34
  from hf_utils.visual_util import predictions_to_glb
35
- from mapanything.models import init_model
36
- from mapanything.utils.geometry import depth_edge, normals_edge, points_to_normals
37
  from mapanything.utils.image import load_images, rgb
38
- from mapanything.utils.inference import loss_of_one_batch_multi_view
39
 
40
 
41
  def get_logo_base64():
@@ -103,69 +100,16 @@ def init_hydra_config(config_path, overrides=None):
103
  return cfg
104
 
105
 
106
- def init_inference_model(config, ckpt_path, device):
107
- "Initialize the model for inference"
108
- if isinstance(config, dict):
109
- config_path = config["path"]
110
- overrrides = config["config_overrides"]
111
- model_args = init_hydra_config(config_path, overrides=overrrides)
112
- model = init_model(model_args.model.model_str, model_args.model.model_config)
113
- else:
114
- config_path = config
115
- model_args = init_hydra_config(config_path)
116
- model = init_model(model_args.model_str, model_args.model_config)
117
- model.to(device)
118
- if ckpt_path is not None:
119
- print("Loading model from: ", ckpt_path)
120
-
121
- # Load HuggingFace token for private repositories
122
- hf_token = load_hf_token()
123
-
124
- # Try to download from HuggingFace Hub first if it's a HF URL
125
- if "huggingface.co" in ckpt_path:
126
- try:
127
- # Extract repo_id and filename from URL
128
- # URL format: https://huggingface.co/facebook/MapAnything/resolve/main/mapa_curri_24v_13d_48ipg_64g.pth
129
- parts = ckpt_path.replace("https://huggingface.co/", "").split("/")
130
- repo_id = f"{parts[0]}/{parts[1]}" # e.g., "facebook/MapAnything"
131
- filename = "/".join(
132
- parts[4:]
133
- ) # e.g., "mapa_curri_24v_13d_48ipg_64g.pth"
134
-
135
- print(f"Downloading from HuggingFace Hub: {repo_id}/{filename}")
136
- local_file = hf_hub_download(
137
- repo_id=repo_id,
138
- filename=filename,
139
- token=hf_token,
140
- cache_dir=None, # Use default cache
141
- )
142
- ckpt = torch.load(local_file, map_location=device, weights_only=False)
143
- except Exception as e:
144
- print(f"HuggingFace Hub download failed: {e}")
145
- print("Falling back to torch.hub.load_state_dict_from_url...")
146
- # Fallback to original method
147
- ckpt = torch.hub.load_state_dict_from_url(
148
- ckpt_path, map_location=device
149
- )
150
- else:
151
- # Use original method for non-HF URLs
152
- ckpt = torch.hub.load_state_dict_from_url(ckpt_path, map_location=device)
153
-
154
- print(model.load_state_dict(ckpt["model"], strict=False))
155
- model.eval()
156
- return model
157
-
158
-
159
  # MapAnything Configuration
160
  high_level_config = {
161
  "path": "configs/train.yaml",
 
162
  "config_overrides": [
163
  "machine=aws",
164
  "model=mapanything",
165
  "model/task=images_only",
166
  "model.encoder.uses_torch_hub=false",
167
  ],
168
- "checkpoint_path": "https://huggingface.co/facebook/MapAnything/resolve/main/mapa_curri_24v_13d_48ipg_64g.pth",
169
  "trained_with_amp": True,
170
  "trained_with_amp_dtype": "fp16",
171
  "data_norm_type": "dinov2",
@@ -181,7 +125,7 @@ model = None
181
  # 1) Core model inference
182
  # -------------------------------------------------------------------------
183
  @spaces.GPU(duration=120)
184
- def run_model(target_dir, model_placeholder):
185
  """
186
  Run the MapAnything model on images in the 'target_dir/images' folder and return predictions.
187
  """
@@ -191,15 +135,16 @@ def run_model(target_dir, model_placeholder):
191
  # Device check
192
  device = "cuda" if torch.cuda.is_available() else "cpu"
193
  device = torch.device(device)
194
- # if not torch.cuda.is_available():
195
- # raise ValueError("CUDA is not available. Check your environment.")
196
 
197
  # Initialize model if not already done
198
  if model is None:
199
  print("Initializing MapAnything model...")
200
- model = init_inference_model(
201
- high_level_config, high_level_config["checkpoint_path"], device
 
 
202
  )
 
203
  else:
204
  model = model.to(device)
205
 
@@ -208,30 +153,18 @@ def run_model(target_dir, model_placeholder):
208
  # Load images using MapAnything's load_images function
209
  print("Loading images...")
210
  image_folder_path = os.path.join(target_dir, "images")
211
- views = load_images(
212
- image_folder_path,
213
- resolution_set=high_level_config["resolution"],
214
- verbose=False,
215
- norm_type=high_level_config["data_norm_type"],
216
- patch_size=high_level_config["patch_size"],
217
- stride=1,
218
- )
219
 
220
  print(f"Loaded {len(views)} images")
221
  if len(views) == 0:
222
  raise ValueError("No images found. Check your upload.")
223
 
224
- # Run inference using MapAnything's inference function
225
- print("Running MapAnything inference...")
226
- with torch.no_grad():
227
- pred_result = loss_of_one_batch_multi_view(
228
- views,
229
- model,
230
- None,
231
- device,
232
- use_amp=high_level_config["trained_with_amp"],
233
- amp_dtype=high_level_config["trained_with_amp_dtype"],
234
- )
235
 
236
  # Convert predictions to format expected by visualization
237
  predictions = {}
@@ -242,167 +175,40 @@ def run_model(target_dir, model_placeholder):
242
  world_points_list = []
243
  depth_maps_list = []
244
  images_list = []
245
- confidence_list = []
246
  final_mask_list = []
247
 
248
- # Check if confidence data is available
249
- has_confidence = False
250
- for view_idx, view in enumerate(views):
251
- view_key = f"pred{view_idx + 1}"
252
- if view_key in pred_result and "conf" in pred_result[view_key]:
253
- has_confidence = True
254
- break
255
-
256
- # Extract predictions for each view
257
- for view_idx, view in enumerate(views):
258
- # Get image for colors
259
- image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
260
 
261
- view_key = f"pred{view_idx + 1}"
262
- if view_key in pred_result:
263
- pred_pts3d = pred_result[view_key]["pts3d"][0].cpu().numpy()
264
-
265
- # Get confidence data if available
266
- confidence_map = None
267
- if "conf" in pred_result[view_key]:
268
- confidence_map = pred_result[view_key]["conf"][0].cpu().numpy()
269
-
270
- # Compute final_mask just like in visualize_raw_inference_output function
271
- # Create the prediction mask based on parameters
272
- pred_mask = None
273
- use_gt_mask_on_pred = False # Set based on your requirements
274
- use_pred_mask = True # Set based on your requirements
275
- use_non_ambi_mask = True # Set based on your requirements
276
- use_conf_mask = False # Set based on your requirements
277
- conf_percentile = 10 # Set based on your requirements
278
- use_edge_mask = True # Set based on your requirements
279
- pts_edge_tol = 5 # Set based on your requirements
280
- depth_edge_rtol = 0.03 # Set based on your requirements
281
-
282
- if use_pred_mask:
283
- # Get non ambiguous mask if available and requested
284
- has_non_ambiguous_mask = (
285
- "non_ambiguous_mask" in pred_result[view_key] and use_non_ambi_mask
286
- )
287
- if has_non_ambiguous_mask:
288
- non_ambiguous_mask = (
289
- pred_result[view_key]["non_ambiguous_mask"][0].cpu().numpy()
290
- )
291
- pred_mask = non_ambiguous_mask
292
-
293
- # Get confidence mask if available and requested
294
- has_conf = "conf" in pred_result[view_key] and use_conf_mask
295
- if has_conf:
296
- confidences = pred_result[view_key]["conf"][0].cpu()
297
- percentile_threshold = torch.quantile(
298
- confidences, conf_percentile / 100.0
299
- )
300
- conf_mask = confidences > percentile_threshold
301
- conf_mask = conf_mask.numpy()
302
- if pred_mask is not None:
303
- pred_mask = pred_mask & conf_mask
304
- else:
305
- pred_mask = conf_mask
306
-
307
- # Apply edge mask if requested
308
- if use_edge_mask and pred_mask is not None:
309
- if "cam_quats" not in pred_result[view_key]:
310
- # For direct point prediction
311
- # Compute normals and edge mask
312
- normals, normals_mask = points_to_normals(
313
- pred_pts3d, mask=pred_mask
314
- )
315
- edge_mask = ~(
316
- normals_edge(normals, tol=pts_edge_tol, mask=normals_mask)
317
- )
318
- else:
319
- # For ray-based prediction
320
- ray_depth = pred_result[view_key]["depth_along_ray"][0].cpu()
321
- local_pts3d = (
322
- pred_result[view_key]["ray_directions"][0].cpu() * ray_depth
323
- )
324
- depth_z = local_pts3d[..., 2].numpy()
325
 
326
- # Compute normals and edge mask
327
- normals, normals_mask = points_to_normals(
328
- pred_pts3d, mask=pred_mask
329
- )
330
- edge_mask = ~(
331
- depth_edge(depth_z, rtol=depth_edge_rtol, mask=pred_mask)
332
- & normals_edge(normals, tol=pts_edge_tol, mask=normals_mask)
333
- )
334
- if pred_mask is not None:
335
- pred_mask = pred_mask & edge_mask
336
-
337
- # Determine final mask to use (like in visualize_raw_inference_output)
338
- final_mask = None
339
- valid_mask = np.ones_like(
340
- pred_pts3d[..., 0], dtype=bool
341
- ) # Create dummy valid_mask for app.py context
342
-
343
- if use_gt_mask_on_pred:
344
- final_mask = valid_mask
345
- if use_pred_mask and pred_mask is not None:
346
- final_mask = final_mask & pred_mask
347
- elif use_pred_mask and pred_mask is not None:
348
- final_mask = pred_mask
349
- else:
350
- final_mask = np.ones_like(valid_mask, dtype=bool)
351
-
352
- # Check if we have camera pose and intrinsics data
353
- if "cam_quats" in pred_result[view_key]:
354
- # Get decoupled quantities (like in visualize_raw_custom_data_inference_output)
355
- cam_quats = pred_result[view_key]["cam_quats"][0].cpu()
356
- cam_trans = pred_result[view_key]["cam_trans"][0].cpu()
357
- ray_directions = pred_result[view_key]["ray_directions"][0].cpu()
358
- ray_depth = pred_result[view_key]["depth_along_ray"][0].cpu()
359
-
360
- # Convert the quantities
361
- from mapanything.utils.geometry import (
362
- quaternion_to_rotation_matrix,
363
- recover_pinhole_intrinsics_from_ray_directions,
364
- )
365
 
366
- cam_rot = quaternion_to_rotation_matrix(cam_quats)
367
- cam_pose = torch.eye(4)
368
- cam_pose[:3, :3] = cam_rot
369
- cam_pose[:3, 3] = cam_trans
370
- cam_pose = np.linalg.inv(cam_pose)
371
- cam_intrinsics = recover_pinhole_intrinsics_from_ray_directions(
372
- ray_directions, use_geometric_calculation=True
373
- )
374
 
375
- # Compute depth as in app_map.py
376
- local_pts3d = ray_directions * ray_depth
377
- depth_z = local_pts3d[..., 2]
378
 
379
- # Convert to numpy and extract 3x4 extrinsic (remove bottom row)
380
- extrinsic = cam_pose[:3, :4].numpy() # Shape: (3, 4)
381
- intrinsic = cam_intrinsics.numpy() # Shape: (3, 3)
382
- depth_z = depth_z.numpy() # Shape: (H, W)
383
- else:
384
- # Use dummy values if camera info not available
385
- # extrinsic: (3, 4) - [R|t] matrix
386
- extrinsic = np.eye(3, 4) # Identity rotation, zero translation
387
- # intrinsic: (3, 3) - camera intrinsic matrix
388
- intrinsic = np.eye(3)
389
- # depth_z: (H, W) - dummy depth values
390
- depth_z = np.zeros_like(pred_pts3d[..., 0])
391
-
392
- # Append to lists
393
- extrinsic_list.append(extrinsic)
394
- intrinsic_list.append(intrinsic)
395
- world_points_list.append(pred_pts3d)
396
- depth_maps_list.append(depth_z)
397
- images_list.append(image[0]) # Add image to list
398
- final_mask_list.append(final_mask) # Add final_mask to list
399
-
400
- # Add confidence data (or None if not available)
401
- if confidence_map is not None:
402
- confidence_list.append(confidence_map)
403
- elif has_confidence:
404
- # If some views have confidence but this one doesn't, add dummy confidence
405
- confidence_list.append(np.ones_like(depth_z))
406
 
407
  # Convert lists to numpy arrays with required shapes
408
  # extrinsic: (S, 3, 4) - batch of camera extrinsic matrices
@@ -419,26 +225,18 @@ def run_model(target_dir, model_placeholder):
419
  # Add channel dimension if needed to match (S, H, W, 1) format
420
  if len(depth_maps.shape) == 3:
421
  depth_maps = depth_maps[..., np.newaxis]
 
422
  predictions["depth"] = depth_maps
423
 
424
  # images: (S, H, W, 3) - batch of input images
425
  predictions["images"] = np.stack(images_list, axis=0)
426
 
427
- # confidence: (S, H, W) - batch of confidence maps (only if available)
428
- if confidence_list:
429
- predictions["confidence"] = np.stack(confidence_list, axis=0)
430
-
431
  # final_mask: (S, H, W) - batch of final masks for filtering
432
  predictions["final_mask"] = np.stack(final_mask_list, axis=0)
433
 
434
- world_points = unproject_depth_map_to_point_map(
435
- depth_maps, predictions["extrinsic"], predictions["intrinsic"]
436
- )
437
- predictions["world_points_from_depth"] = world_points
438
-
439
  # Process data for visualization tabs (depth, normal, measure)
440
  processed_data = process_predictions_for_visualization(
441
- pred_result, views, high_level_config
442
  )
443
 
444
  # Clean up
@@ -474,43 +272,69 @@ def get_view_data_by_index(processed_data, view_index):
474
  return processed_data[view_keys[view_index]]
475
 
476
 
477
- def update_depth_view(processed_data, view_index, conf_thres=None):
478
- """Update depth view for a specific view index with optional confidence filtering"""
479
  view_data = get_view_data_by_index(processed_data, view_index)
480
  if view_data is None or view_data["depth"] is None:
481
  return None
482
 
483
  # Use confidence filtering if available
484
  confidence = view_data.get("confidence")
485
- return colorize_depth(
486
- view_data["depth"], confidence=confidence, conf_thres=conf_thres
487
- )
488
 
489
 
490
- def update_normal_view(processed_data, view_index, conf_thres=None):
491
- """Update normal view for a specific view index with optional confidence filtering"""
492
  view_data = get_view_data_by_index(processed_data, view_index)
493
  if view_data is None or view_data["normal"] is None:
494
  return None
495
 
496
  # Use confidence filtering if available
497
  confidence = view_data.get("confidence")
498
- return colorize_normal(
499
- view_data["normal"], confidence=confidence, conf_thres=conf_thres
500
- )
501
 
502
 
503
  def update_measure_view(processed_data, view_index):
504
- """Update measure view for a specific view index"""
505
  view_data = get_view_data_by_index(processed_data, view_index)
506
  if view_data is None:
507
  return None, [] # image, measure_points
508
- return view_data["image"], []
509
 
 
 
510
 
511
- def navigate_depth_view(
512
- processed_data, current_selector_value, direction, conf_thres=None
513
- ):
 
 
 
 
514
  """Navigate depth view (direction: -1 for previous, +1 for next)"""
515
  if processed_data is None or len(processed_data) == 0:
516
  return "View 1", None
@@ -525,14 +349,12 @@ def navigate_depth_view(
525
  new_view = (current_view + direction) % num_views
526
 
527
  new_selector_value = f"View {new_view + 1}"
528
- depth_vis = update_depth_view(processed_data, new_view, conf_thres=conf_thres)
529
 
530
  return new_selector_value, depth_vis
531
 
532
 
533
- def navigate_normal_view(
534
- processed_data, current_selector_value, direction, conf_thres=None
535
- ):
536
  """Navigate normal view (direction: -1 for previous, +1 for next)"""
537
  if processed_data is None or len(processed_data) == 0:
538
  return "View 1", None
@@ -547,7 +369,7 @@ def navigate_normal_view(
547
  new_view = (current_view + direction) % num_views
548
 
549
  new_selector_value = f"View {new_view + 1}"
550
- normal_vis = update_normal_view(processed_data, new_view, conf_thres=conf_thres)
551
 
552
  return new_selector_value, normal_vis
553
 
@@ -572,14 +394,14 @@ def navigate_measure_view(processed_data, current_selector_value, direction):
572
  return new_selector_value, measure_image, measure_points
573
 
574
 
575
- def populate_visualization_tabs(processed_data, conf_thres=None):
576
  """Populate the depth, normal, and measure tabs with processed data"""
577
  if processed_data is None or len(processed_data) == 0:
578
  return None, None, None, []
579
 
580
  # Use update functions to ensure confidence filtering is applied from the start
581
- depth_vis = update_depth_view(processed_data, 0, conf_thres=conf_thres)
582
- normal_vis = update_normal_view(processed_data, 0, conf_thres=conf_thres)
583
  measure_img, _ = update_measure_view(processed_data, 0)
584
 
585
  return depth_vis, normal_vis, measure_img, []
@@ -683,13 +505,13 @@ def update_gallery_on_upload(input_video, input_images, s_time_interval=1.0):
683
  @spaces.GPU(duration=120)
684
  def gradio_demo(
685
  target_dir,
686
- conf_thres=3.0,
687
  frame_filter="All",
688
  show_cam=True,
689
  filter_sky=False,
690
  filter_black_bg=False,
691
  filter_white_bg=False,
692
- mask_ambiguous=False,
 
693
  ):
694
  """
695
  Perform reconstruction using the already-created target_dir/images.
@@ -716,7 +538,9 @@ def gradio_demo(
716
 
717
  print("Running MapAnything model...")
718
  with torch.no_grad():
719
- predictions, processed_data = run_model(target_dir, None)
 
 
720
 
721
  # Save predictions
722
  prediction_save_path = os.path.join(target_dir, "predictions.npz")
@@ -729,13 +553,12 @@ def gradio_demo(
729
  # Build a GLB file name
730
  glbfile = os.path.join(
731
  target_dir,
732
- f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_sky{filter_sky}_black{filter_black_bg}_white{filter_white_bg}_mask{mask_ambiguous}_pred{prediction_mode.replace(' ', '_')}.glb",
733
  )
734
 
735
  # Convert predictions to GLB
736
  glbscene = predictions_to_glb(
737
  predictions,
738
- conf_thres=conf_thres,
739
  filter_by_frames=frame_filter,
740
  show_cam=show_cam,
741
  target_dir=target_dir,
@@ -743,7 +566,6 @@ def gradio_demo(
743
  mask_sky=filter_sky,
744
  mask_black_bg=filter_black_bg,
745
  mask_white_bg=filter_white_bg,
746
- mask_ambiguous=mask_ambiguous,
747
  )
748
  glbscene.export(file_obj=glbfile)
749
 
@@ -760,7 +582,7 @@ def gradio_demo(
760
 
761
  # Populate visualization tabs with processed data
762
  depth_vis, normal_vis, measure_img, measure_pts = populate_visualization_tabs(
763
- processed_data, conf_thres=conf_thres
764
  )
765
 
766
  # Update view selectors based on available views
@@ -860,29 +682,30 @@ def colorize_normal(normal_map, confidence=None, conf_thres=None):
860
  return normal_vis
861
 
862
 
863
- def process_predictions_for_visualization(pred_result, views, high_level_config):
864
  """Extract depth, normal, and 3D points from predictions for visualization"""
865
  processed_data = {}
866
 
867
  # Check if confidence data is available in any view
868
  has_confidence_data = False
869
- for view_idx, view in enumerate(views):
870
- view_key = f"pred{view_idx + 1}"
871
- if view_key in pred_result and "conf" in pred_result[view_key]:
872
- has_confidence_data = True
873
- break
874
 
875
  # Process each view
876
  for view_idx, view in enumerate(views):
877
- view_key = f"pred{view_idx + 1}"
878
- if view_key not in pred_result:
879
- continue
880
 
881
  # Get image
882
  image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
 
883
 
884
  # Get predicted points
885
- pred_pts3d = pred_result[view_key]["pts3d"][0].cpu().numpy()
886
 
887
  # Initialize data for this view
888
  view_data = {
@@ -895,36 +718,12 @@ def process_predictions_for_visualization(pred_result, views, high_level_config)
895
  "has_confidence": has_confidence_data,
896
  }
897
 
898
- # Get confidence data if available
899
- if "conf" in pred_result[view_key]:
900
- confidence = pred_result[view_key]["conf"][0].cpu().numpy()
901
- view_data["confidence"] = confidence
902
 
903
- # Get masks if available
904
- has_non_ambiguous_mask = "non_ambiguous_mask" in pred_result[view_key]
905
- if has_non_ambiguous_mask:
906
- view_data["mask"] = (
907
- pred_result[view_key]["non_ambiguous_mask"][0].cpu().numpy()
908
- )
909
 
910
- # Extract depth and camera info if available
911
- if "cam_quats" in pred_result[view_key]:
912
- ray_directions = pred_result[view_key]["ray_directions"][0].cpu()
913
- ray_depth = pred_result[view_key]["depth_along_ray"][0].cpu()
914
-
915
- # Compute depth
916
- local_pts3d = ray_directions * ray_depth
917
- depth_z = local_pts3d[..., 2].numpy()
918
- view_data["depth"] = depth_z
919
-
920
- # Compute normals if we have valid points
921
- if has_non_ambiguous_mask:
922
- try:
923
- normals, _ = points_to_normals(pred_pts3d, mask=view_data["mask"])
924
- view_data["normal"] = normals
925
- except:
926
- # If normal computation fails, skip it
927
- pass
928
 
929
  processed_data[view_idx] = view_data
930
 
@@ -972,10 +771,29 @@ def measure(
972
  point2d = event.index[0], event.index[1]
973
  print(f"Clicked point: {point2d}")
974
 
 
 
 
 
975
  measure_points.append(point2d)
976
 
977
- # Get image and ensure it's valid
978
- image = current_view["image"]
979
  if image is None:
980
  return None, [], "No image available"
981
 
@@ -1093,14 +911,12 @@ def update_log():
1093
 
1094
  def update_visualization(
1095
  target_dir,
1096
- conf_thres,
1097
  frame_filter,
1098
  show_cam,
1099
  is_example,
1100
  filter_sky=False,
1101
  filter_black_bg=False,
1102
  filter_white_bg=False,
1103
- mask_ambiguous=False,
1104
  ):
1105
  """
1106
  Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
@@ -1135,13 +951,12 @@ def update_visualization(
1135
 
1136
  glbfile = os.path.join(
1137
  target_dir,
1138
- f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_sky{filter_sky}_black{filter_black_bg}_white{filter_white_bg}_pred{prediction_mode.replace(' ', '_')}.glb",
1139
  )
1140
 
1141
  if not os.path.exists(glbfile):
1142
  glbscene = predictions_to_glb(
1143
  predictions,
1144
- conf_thres=conf_thres,
1145
  filter_by_frames=frame_filter,
1146
  show_cam=show_cam,
1147
  target_dir=target_dir,
@@ -1149,7 +964,6 @@ def update_visualization(
1149
  mask_sky=filter_sky,
1150
  mask_black_bg=filter_black_bg,
1151
  mask_white_bg=filter_white_bg,
1152
- mask_ambiguous=mask_ambiguous,
1153
  )
1154
  glbscene.export(file_obj=glbfile)
1155
 
@@ -1346,6 +1160,9 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1346
  interactive=False,
1347
  sources=[],
1348
  )
 
 
 
1349
  measure_text = gr.Markdown("")
1350
 
1351
  with gr.Row():
@@ -1363,17 +1180,11 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1363
  )
1364
 
1365
  with gr.Row():
1366
- conf_thres = gr.Slider(
1367
- minimum=0,
1368
- maximum=100,
1369
- value=0,
1370
- step=0.1,
1371
- label="Confidence Threshold (%), only shown in depth and normals",
1372
- )
1373
  frame_filter = gr.Dropdown(
1374
  choices=["All"], value="All", label="Show Points from Frame"
1375
  )
1376
  with gr.Column():
 
1377
  show_cam = gr.Checkbox(label="Show Camera", value=True)
1378
  filter_sky = gr.Checkbox(
1379
  label="Filter Sky (using skyseg.onnx)", value=False
@@ -1384,8 +1195,11 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1384
  filter_white_bg = gr.Checkbox(
1385
  label="Filter White Background", value=False
1386
  )
1387
- mask_ambiguous = gr.Checkbox(label="Mask Ambiguous", value=True)
1388
-
 
 
 
1389
  # ---------------------- Example Scenes Section ----------------------
1390
  gr.Markdown("## Example Scenes")
1391
  gr.Markdown("Click any thumbnail to load the scene for reconstruction.")
@@ -1446,13 +1260,13 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1446
  fn=gradio_demo,
1447
  inputs=[
1448
  target_dir_output,
1449
- conf_thres,
1450
  frame_filter,
1451
  show_cam,
1452
  filter_sky,
1453
  filter_black_bg,
1454
  filter_white_bg,
1455
- mask_ambiguous,
 
1456
  ],
1457
  outputs=[
1458
  reconstruction_output,
@@ -1476,76 +1290,10 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1476
  # -------------------------------------------------------------------------
1477
  # Real-time Visualization Updates
1478
  # -------------------------------------------------------------------------
1479
- def update_all_visualizations_on_conf_change(
1480
- processed_data,
1481
- depth_selector,
1482
- normal_selector,
1483
- conf_thres_val,
1484
- target_dir,
1485
- frame_filter,
1486
- show_cam,
1487
- is_example,
1488
- ):
1489
- """Update 3D view and all tabs when confidence threshold changes"""
1490
-
1491
- # Update 3D pointcloud visualization
1492
- glb_file, log_msg = update_visualization(
1493
- target_dir,
1494
- conf_thres_val,
1495
- frame_filter,
1496
- show_cam,
1497
- is_example,
1498
- )
1499
-
1500
- # Update depth and normal tabs with new confidence threshold
1501
- depth_vis = None
1502
- normal_vis = None
1503
-
1504
- if processed_data is not None:
1505
- # Get current view indices from selectors
1506
- try:
1507
- depth_view_idx = (
1508
- int(depth_selector.split()[1]) - 1 if depth_selector else 0
1509
- )
1510
- except:
1511
- depth_view_idx = 0
1512
-
1513
- try:
1514
- normal_view_idx = (
1515
- int(normal_selector.split()[1]) - 1 if normal_selector else 0
1516
- )
1517
- except:
1518
- normal_view_idx = 0
1519
-
1520
- # Update visualizations with new confidence threshold
1521
- depth_vis = update_depth_view(
1522
- processed_data, depth_view_idx, conf_thres=conf_thres_val
1523
- )
1524
- normal_vis = update_normal_view(
1525
- processed_data, normal_view_idx, conf_thres=conf_thres_val
1526
- )
1527
-
1528
- return glb_file, log_msg, depth_vis, normal_vis
1529
-
1530
- conf_thres.change(
1531
- fn=update_all_visualizations_on_conf_change,
1532
- inputs=[
1533
- processed_data_state,
1534
- depth_view_selector,
1535
- normal_view_selector,
1536
- conf_thres,
1537
- target_dir_output,
1538
- frame_filter,
1539
- show_cam,
1540
- is_example,
1541
- ],
1542
- outputs=[reconstruction_output, log_output, depth_map, normal_map],
1543
- )
1544
  frame_filter.change(
1545
  update_visualization,
1546
  [
1547
  target_dir_output,
1548
- conf_thres,
1549
  frame_filter,
1550
  show_cam,
1551
  is_example,
@@ -1556,7 +1304,6 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1556
  update_visualization,
1557
  [
1558
  target_dir_output,
1559
- conf_thres,
1560
  frame_filter,
1561
  show_cam,
1562
  is_example,
@@ -1567,14 +1314,12 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1567
  update_visualization,
1568
  [
1569
  target_dir_output,
1570
- conf_thres,
1571
  frame_filter,
1572
  show_cam,
1573
  is_example,
1574
  filter_sky,
1575
  filter_black_bg,
1576
  filter_white_bg,
1577
- mask_ambiguous,
1578
  ],
1579
  [reconstruction_output, log_output],
1580
  )
@@ -1582,14 +1327,12 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1582
  update_visualization,
1583
  [
1584
  target_dir_output,
1585
- conf_thres,
1586
  frame_filter,
1587
  show_cam,
1588
  is_example,
1589
  filter_sky,
1590
  filter_black_bg,
1591
  filter_white_bg,
1592
- mask_ambiguous,
1593
  ],
1594
  [reconstruction_output, log_output],
1595
  )
@@ -1597,29 +1340,12 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1597
  update_visualization,
1598
  [
1599
  target_dir_output,
1600
- conf_thres,
1601
  frame_filter,
1602
  show_cam,
1603
  is_example,
1604
  filter_sky,
1605
  filter_black_bg,
1606
  filter_white_bg,
1607
- mask_ambiguous,
1608
- ],
1609
- [reconstruction_output, log_output],
1610
- )
1611
- mask_ambiguous.change(
1612
- update_visualization,
1613
- [
1614
- target_dir_output,
1615
- conf_thres,
1616
- frame_filter,
1617
- show_cam,
1618
- is_example,
1619
- filter_sky,
1620
- filter_black_bg,
1621
- filter_white_bg,
1622
- mask_ambiguous,
1623
  ],
1624
  [reconstruction_output, log_output],
1625
  )
@@ -1653,67 +1379,61 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
1653
 
1654
  # Depth tab navigation
1655
  prev_depth_btn.click(
1656
- fn=lambda processed_data, current_selector, conf_thres_val: navigate_depth_view(
1657
- processed_data, current_selector, -1, conf_thres=conf_thres_val
1658
  ),
1659
- inputs=[processed_data_state, depth_view_selector, conf_thres],
1660
  outputs=[depth_view_selector, depth_map],
1661
  )
1662
 
1663
  next_depth_btn.click(
1664
- fn=lambda processed_data, current_selector, conf_thres_val: navigate_depth_view(
1665
- processed_data, current_selector, 1, conf_thres=conf_thres_val
1666
  ),
1667
- inputs=[processed_data_state, depth_view_selector, conf_thres],
1668
  outputs=[depth_view_selector, depth_map],
1669
  )
1670
 
1671
  depth_view_selector.change(
1672
- fn=lambda processed_data, selector_value, conf_thres_val: (
1673
  update_depth_view(
1674
  processed_data,
1675
  int(selector_value.split()[1]) - 1,
1676
- conf_thres=conf_thres_val,
1677
  )
1678
  if selector_value
1679
  else None
1680
  ),
1681
- inputs=[processed_data_state, depth_view_selector, conf_thres],
1682
  outputs=[depth_map],
1683
  )
1684
 
1685
  # Normal tab navigation
1686
  prev_normal_btn.click(
1687
- fn=lambda processed_data,
1688
- current_selector,
1689
- conf_thres_val: navigate_normal_view(
1690
- processed_data, current_selector, -1, conf_thres=conf_thres_val
1691
  ),
1692
- inputs=[processed_data_state, normal_view_selector, conf_thres],
1693
  outputs=[normal_view_selector, normal_map],
1694
  )
1695
 
1696
  next_normal_btn.click(
1697
- fn=lambda processed_data,
1698
- current_selector,
1699
- conf_thres_val: navigate_normal_view(
1700
- processed_data, current_selector, 1, conf_thres=conf_thres_val
1701
  ),
1702
- inputs=[processed_data_state, normal_view_selector, conf_thres],
1703
  outputs=[normal_view_selector, normal_map],
1704
  )
1705
 
1706
  normal_view_selector.change(
1707
- fn=lambda processed_data, selector_value, conf_thres_val: (
1708
  update_normal_view(
1709
  processed_data,
1710
  int(selector_value.split()[1]) - 1,
1711
- conf_thres=conf_thres_val,
1712
  )
1713
  if selector_value
1714
  else None
1715
  ),
1716
- inputs=[processed_data_state, normal_view_selector, conf_thres],
1717
  outputs=[normal_map],
1718
  )
1719
 
 
18
  import numpy as np
19
  import spaces
20
  import torch
 
21
 
22
  sys.path.append("mapanything/")
23
 
24
  from hf_utils.css_and_html import (
25
+ GRADIO_CSS,
26
+ MEASURE_INSTRUCTIONS_HTML,
27
  get_acknowledgements_html,
28
  get_description_html,
29
  get_gradio_theme,
30
  get_header_html,
 
 
31
  )
 
32
  from hf_utils.visual_util import predictions_to_glb
33
+ from mapanything.models import MapAnything
34
+ from mapanything.utils.geometry import depthmap_to_world_frame, points_to_normals
35
  from mapanything.utils.image import load_images, rgb
 
36
 
37
 
38
  def get_logo_base64():
 
100
  return cfg
101
 
102
 
 
 
 
 
103
  # MapAnything Configuration
104
  high_level_config = {
105
  "path": "configs/train.yaml",
106
+ "hf_model_name": "facebook/map-anything",
107
  "config_overrides": [
108
  "machine=aws",
109
  "model=mapanything",
110
  "model/task=images_only",
111
  "model.encoder.uses_torch_hub=false",
112
  ],
 
113
  "trained_with_amp": True,
114
  "trained_with_amp_dtype": "fp16",
115
  "data_norm_type": "dinov2",
 
125
  # 1) Core model inference
126
  # -------------------------------------------------------------------------
127
  @spaces.GPU(duration=120)
128
+ def run_model(target_dir, model_placeholder, apply_mask=True, mask_edges=True):
129
  """
130
  Run the MapAnything model on images in the 'target_dir/images' folder and return predictions.
131
  """
 
135
  # Device check
136
  device = "cuda" if torch.cuda.is_available() else "cpu"
137
  device = torch.device(device)
 
 
138
 
139
  # Initialize model if not already done
140
  if model is None:
141
  print("Initializing MapAnything model...")
142
+
143
+ print("Loading CC-BY-NC 4.0 licensed MapAnything model...")
144
+ model = MapAnything.from_pretrained(high_level_config["hf_model_name"]).to(
145
+ device
146
  )
147
+
148
  else:
149
  model = model.to(device)
150
 
 
153
  # Load images using MapAnything's load_images function
154
  print("Loading images...")
155
  image_folder_path = os.path.join(target_dir, "images")
156
+ views = load_images(image_folder_path)
 
 
 
 
 
 
 
157
 
158
  print(f"Loaded {len(views)} images")
159
  if len(views) == 0:
160
  raise ValueError("No images found. Check your upload.")
161
 
162
+ # Run model inference
163
+ print("Running inference...")
164
+ # apply_mask: Whether to apply the non-ambiguous mask to the output. Defaults to True.
165
+ # mask_edges: Whether to compute an edge mask based on normals and depth and apply it to the output. Defaults to True.
166
+ # Use checkbox values
167
+ outputs = model.infer(views, apply_mask=apply_mask, mask_edges=mask_edges)
 
 
 
 
 
168
 
169
  # Convert predictions to format expected by visualization
170
  predictions = {}
 
175
  world_points_list = []
176
  depth_maps_list = []
177
  images_list = []
 
178
  final_mask_list = []
179
 
180
+ # Loop through the outputs
181
+ for pred in outputs:
182
+ # Extract data from predictions
183
+ depthmap_torch = pred["depth_z"][0].squeeze(-1) # (H, W)
184
+ intrinsics_torch = pred["intrinsics"][0] # (3, 3)
185
+ camera_pose_torch = pred["camera_poses"][0] # (4, 4)
 
 
 
 
 
 
186
 
187
+ # Compute new pts3d using depth, intrinsics, and camera pose
188
+ pts3d_computed, valid_mask = depthmap_to_world_frame(
189
+ depthmap_torch, intrinsics_torch, camera_pose_torch
190
+ )
 
 
 
 
191
 
192
+ # Convert to numpy arrays for visualization
193
+ # Check if mask key exists in pred, if not, fill with boolean trues in the size of depthmap_torch
194
+ if "mask" in pred:
195
+ mask = pred["mask"][0].squeeze(-1).cpu().numpy().astype(bool)
196
+ else:
197
+ # Fill with boolean trues in the size of depthmap_torch
198
+ mask = np.ones_like(depthmap_torch.cpu().numpy(), dtype=bool)
 
 
 
 
199
 
200
+ # Combine with valid depth mask
201
+ mask = mask & valid_mask.cpu().numpy()
 
 
 
 
 
 
202
 
203
+ image = pred["img_no_norm"][0].cpu().numpy()
 
 
204
 
205
+ # Append to lists
206
+ extrinsic_list.append(camera_pose_torch.cpu().numpy())
207
+ intrinsic_list.append(intrinsics_torch.cpu().numpy())
208
+ world_points_list.append(pts3d_computed.cpu().numpy())
209
+ depth_maps_list.append(depthmap_torch.cpu().numpy())
210
+ images_list.append(image) # Add image to list
211
+ final_mask_list.append(mask) # Add final_mask to list
 
 
 
 
212
 
213
  # Convert lists to numpy arrays with required shapes
214
  # extrinsic: (S, 3, 4) - batch of camera extrinsic matrices
 
225
  # Add channel dimension if needed to match (S, H, W, 1) format
226
  if len(depth_maps.shape) == 3:
227
  depth_maps = depth_maps[..., np.newaxis]
228
+
229
  predictions["depth"] = depth_maps
230
 
231
  # images: (S, H, W, 3) - batch of input images
232
  predictions["images"] = np.stack(images_list, axis=0)
233
 
 
 
 
 
234
  # final_mask: (S, H, W) - batch of final masks for filtering
235
  predictions["final_mask"] = np.stack(final_mask_list, axis=0)
236
 
 
 
 
 
 
237
  # Process data for visualization tabs (depth, normal, measure)
238
  processed_data = process_predictions_for_visualization(
239
+ predictions, views, high_level_config
240
  )
241
 
242
  # Clean up
 
272
  return processed_data[view_keys[view_index]]
273
 
274
 
275
+ def update_depth_view(processed_data, view_index):
276
+ """Update depth view for a specific view index"""
277
  view_data = get_view_data_by_index(processed_data, view_index)
278
  if view_data is None or view_data["depth"] is None:
279
  return None
280
 
281
  # Use confidence filtering if available
282
  confidence = view_data.get("confidence")
283
+ return colorize_depth(view_data["depth"], confidence=confidence)
 
 
284
 
285
 
286
+ def update_normal_view(processed_data, view_index):
287
+ """Update normal view for a specific view index"""
288
  view_data = get_view_data_by_index(processed_data, view_index)
289
  if view_data is None or view_data["normal"] is None:
290
  return None
291
 
292
  # Use confidence filtering if available
293
  confidence = view_data.get("confidence")
294
+ return colorize_normal(view_data["normal"], confidence=confidence)
 
 
295
 
296
 
297
  def update_measure_view(processed_data, view_index):
298
+ """Update measure view for a specific view index with mask overlay"""
299
  view_data = get_view_data_by_index(processed_data, view_index)
300
  if view_data is None:
301
  return None, [] # image, measure_points
 
302
 
303
+ # Get the base image
304
+ image = view_data["image"].copy()
305
 
306
+ # Ensure image is in uint8 format
307
+ if image.dtype != np.uint8:
308
+ if image.max() <= 1.0:
309
+ image = (image * 255).astype(np.uint8)
310
+ else:
311
+ image = image.astype(np.uint8)
312
+
313
+ # Apply mask overlay if mask is available
314
+ if view_data["mask"] is not None:
315
+ mask = view_data["mask"]
316
+
317
+ # Create light grey overlay for masked areas
318
+ # Masked areas (False values) will be overlaid with light grey
319
+ invalid_mask = ~mask # Areas where mask is False
320
+
321
+ if invalid_mask.any():
322
+ # Create a light grey overlay (RGB: 192, 192, 192)
323
+ overlay_color = np.array([192, 192, 192], dtype=np.uint8)
324
+
325
+ # Apply overlay with some transparency
326
+ alpha = 0.5 # Transparency level
327
+ for c in range(3): # RGB channels
328
+ image[:, :, c] = np.where(
329
+ invalid_mask,
330
+ (1 - alpha) * image[:, :, c] + alpha * overlay_color[c],
331
+ image[:, :, c],
332
+ ).astype(np.uint8)
333
+
334
+ return image, []
335
+
336
+
337
+ def navigate_depth_view(processed_data, current_selector_value, direction):
338
  """Navigate depth view (direction: -1 for previous, +1 for next)"""
339
  if processed_data is None or len(processed_data) == 0:
340
  return "View 1", None
 
349
  new_view = (current_view + direction) % num_views
350
 
351
  new_selector_value = f"View {new_view + 1}"
352
+ depth_vis = update_depth_view(processed_data, new_view)
353
 
354
  return new_selector_value, depth_vis
355
 
356
 
357
+ def navigate_normal_view(processed_data, current_selector_value, direction):
 
 
358
  """Navigate normal view (direction: -1 for previous, +1 for next)"""
359
  if processed_data is None or len(processed_data) == 0:
360
  return "View 1", None
 
369
  new_view = (current_view + direction) % num_views
370
 
371
  new_selector_value = f"View {new_view + 1}"
372
+ normal_vis = update_normal_view(processed_data, new_view)
373
 
374
  return new_selector_value, normal_vis
375
 
 
394
  return new_selector_value, measure_image, measure_points
395
 
396
 
397
+ def populate_visualization_tabs(processed_data):
398
  """Populate the depth, normal, and measure tabs with processed data"""
399
  if processed_data is None or len(processed_data) == 0:
400
  return None, None, None, []
401
 
402
  # Use update functions to ensure confidence filtering is applied from the start
403
+ depth_vis = update_depth_view(processed_data, 0)
404
+ normal_vis = update_normal_view(processed_data, 0)
405
  measure_img, _ = update_measure_view(processed_data, 0)
406
 
407
  return depth_vis, normal_vis, measure_img, []
 
505
  @spaces.GPU(duration=120)
506
  def gradio_demo(
507
  target_dir,
 
508
  frame_filter="All",
509
  show_cam=True,
510
  filter_sky=False,
511
  filter_black_bg=False,
512
  filter_white_bg=False,
513
+ apply_mask=True,
514
+ mask_edges=True,
515
  ):
516
  """
517
  Perform reconstruction using the already-created target_dir/images.
 
538
 
539
  print("Running MapAnything model...")
540
  with torch.no_grad():
541
+ predictions, processed_data = run_model(
542
+ target_dir, None, apply_mask, mask_edges
543
+ )
544
 
545
  # Save predictions
546
  prediction_save_path = os.path.join(target_dir, "predictions.npz")
 
553
  # Build a GLB file name
554
  glbfile = os.path.join(
555
  target_dir,
556
+ f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_sky{filter_sky}_black{filter_black_bg}_white{filter_white_bg}_pred{prediction_mode.replace(' ', '_')}.glb",
557
  )
558
 
559
  # Convert predictions to GLB
560
  glbscene = predictions_to_glb(
561
  predictions,
 
562
  filter_by_frames=frame_filter,
563
  show_cam=show_cam,
564
  target_dir=target_dir,
 
566
  mask_sky=filter_sky,
567
  mask_black_bg=filter_black_bg,
568
  mask_white_bg=filter_white_bg,
 
569
  )
570
  glbscene.export(file_obj=glbfile)
571
 
 
582
 
583
  # Populate visualization tabs with processed data
584
  depth_vis, normal_vis, measure_img, measure_pts = populate_visualization_tabs(
585
+ processed_data
586
  )
587
 
588
  # Update view selectors based on available views
 
682
  return normal_vis
683
 
684
 
685
+ def process_predictions_for_visualization(predictions, views, high_level_config):
686
  """Extract depth, normal, and 3D points from predictions for visualization"""
687
  processed_data = {}
688
 
689
  # Check if confidence data is available in any view
690
  has_confidence_data = False
691
+ # for view_idx, view in enumerate(views):
692
+ # view_key = f"pred{view_idx + 1}"
693
+ # if view_key in pred_result and "conf" in pred_result[view_key]:
694
+ # has_confidence_data = True
695
+ # break
696
 
697
  # Process each view
698
  for view_idx, view in enumerate(views):
699
+ # view_key = f"pred{view_idx + 1}"
700
+ # if view_key not in pred_result:
701
+ # continue
702
 
703
  # Get image
704
  image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
705
+ # image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
706
 
707
  # Get predicted points
708
+ pred_pts3d = predictions["world_points"][view_idx]
709
 
710
  # Initialize data for this view
711
  view_data = {
 
718
  "has_confidence": has_confidence_data,
719
  }
720
 
721
+ view_data["mask"] = predictions["final_mask"][view_idx]
 
 
 
722
 
723
+ view_data["depth"] = predictions["depth"][view_idx].squeeze()
 
 
 
 
 
724
 
725
+ normals, _ = points_to_normals(pred_pts3d, mask=view_data["mask"])
726
+ view_data["normal"] = normals
 
 
 
 
727
 
728
  processed_data[view_idx] = view_data
729
 
 
771
  point2d = event.index[0], event.index[1]
772
  print(f"Clicked point: {point2d}")
773
 
774
+ # Check if the clicked point is in a masked area (prevent interaction)
775
+ if (
776
+ current_view["mask"] is not None
777
+ and 0 <= point2d[1] < current_view["mask"].shape[0]
778
+ and 0 <= point2d[0] < current_view["mask"].shape[1]
779
+ ):
780
+ # Check if the point is in a masked (invalid) area
781
+ if not current_view["mask"][point2d[1], point2d[0]]:
782
+ print(f"Clicked point {point2d} is in masked area, ignoring click")
783
+ # Always return image with mask overlay
784
+ masked_image, _ = update_measure_view(
785
+ processed_data, current_view_index
786
+ )
787
+ return (
788
+ masked_image,
789
+ measure_points,
790
+ '<span style="color: red; font-weight: bold;">Cannot measure on masked areas (shown in grey)</span>',
791
+ )
792
+
793
  measure_points.append(point2d)
794
 
795
+ # Get image with mask overlay and ensure it's valid
796
+ image, _ = update_measure_view(processed_data, current_view_index)
797
  if image is None:
798
  return None, [], "No image available"
799
 
 
911
 
912
  def update_visualization(
913
  target_dir,
 
914
  frame_filter,
915
  show_cam,
916
  is_example,
917
  filter_sky=False,
918
  filter_black_bg=False,
919
  filter_white_bg=False,
 
920
  ):
921
  """
922
  Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
 
951
 
952
  glbfile = os.path.join(
953
  target_dir,
954
+ f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_sky{filter_sky}_black{filter_black_bg}_white{filter_white_bg}_pred{prediction_mode.replace(' ', '_')}.glb",
955
  )
956
 
957
  if not os.path.exists(glbfile):
958
  glbscene = predictions_to_glb(
959
  predictions,
 
960
  filter_by_frames=frame_filter,
961
  show_cam=show_cam,
962
  target_dir=target_dir,
 
964
  mask_sky=filter_sky,
965
  mask_black_bg=filter_black_bg,
966
  mask_white_bg=filter_white_bg,
 
967
  )
968
  glbscene.export(file_obj=glbfile)
969
 
 
1160
  interactive=False,
1161
  sources=[],
1162
  )
1163
+ gr.Markdown(
1164
+ "**Note:** Gray areas indicate regions with no depth information where measurements cannot be taken."
1165
+ )
1166
  measure_text = gr.Markdown("")
1167
 
1168
  with gr.Row():
 
1180
  )
1181
 
1182
  with gr.Row():
 
 
 
 
 
 
 
1183
  frame_filter = gr.Dropdown(
1184
  choices=["All"], value="All", label="Show Points from Frame"
1185
  )
1186
  with gr.Column():
1187
+ gr.Markdown("### Pointcloud options (live updates)")
1188
  show_cam = gr.Checkbox(label="Show Camera", value=True)
1189
  filter_sky = gr.Checkbox(
1190
  label="Filter Sky (using skyseg.onnx)", value=False
 
1195
  filter_white_bg = gr.Checkbox(
1196
  label="Filter White Background", value=False
1197
  )
1198
+ gr.Markdown("### Reconstruction options: (updated on next run)")
1199
+ apply_mask_checkbox = gr.Checkbox(
1200
+ label="Apply non-ambiguous mask", value=True
1201
+ )
1202
+ mask_edges_checkbox = apply_mask_checkbox
1203
  # ---------------------- Example Scenes Section ----------------------
1204
  gr.Markdown("## Example Scenes")
1205
  gr.Markdown("Click any thumbnail to load the scene for reconstruction.")
 
1260
  fn=gradio_demo,
1261
  inputs=[
1262
  target_dir_output,
 
1263
  frame_filter,
1264
  show_cam,
1265
  filter_sky,
1266
  filter_black_bg,
1267
  filter_white_bg,
1268
+ apply_mask_checkbox,
1269
+ mask_edges_checkbox,
1270
  ],
1271
  outputs=[
1272
  reconstruction_output,
 
1290
  # -------------------------------------------------------------------------
1291
  # Real-time Visualization Updates
1292
  # -------------------------------------------------------------------------
 
 
 
 
1293
  frame_filter.change(
1294
  update_visualization,
1295
  [
1296
  target_dir_output,
 
1297
  frame_filter,
1298
  show_cam,
1299
  is_example,
 
1304
  update_visualization,
1305
  [
1306
  target_dir_output,
 
1307
  frame_filter,
1308
  show_cam,
1309
  is_example,
 
1314
  update_visualization,
1315
  [
1316
  target_dir_output,
 
1317
  frame_filter,
1318
  show_cam,
1319
  is_example,
1320
  filter_sky,
1321
  filter_black_bg,
1322
  filter_white_bg,
 
1323
  ],
1324
  [reconstruction_output, log_output],
1325
  )
 
1327
  update_visualization,
1328
  [
1329
  target_dir_output,
 
1330
  frame_filter,
1331
  show_cam,
1332
  is_example,
1333
  filter_sky,
1334
  filter_black_bg,
1335
  filter_white_bg,
 
1336
  ],
1337
  [reconstruction_output, log_output],
1338
  )
 
1340
  update_visualization,
1341
  [
1342
  target_dir_output,
 
1343
  frame_filter,
1344
  show_cam,
1345
  is_example,
1346
  filter_sky,
1347
  filter_black_bg,
1348
  filter_white_bg,
 
 
 
 
1349
  ],
1350
  [reconstruction_output, log_output],
1351
  )
 
1379
 
1380
  # Depth tab navigation
1381
  prev_depth_btn.click(
1382
+ fn=lambda processed_data, current_selector: navigate_depth_view(
1383
+ processed_data, current_selector, -1
1384
  ),
1385
+ inputs=[processed_data_state, depth_view_selector],
1386
  outputs=[depth_view_selector, depth_map],
1387
  )
1388
 
1389
  next_depth_btn.click(
1390
+ fn=lambda processed_data, current_selector: navigate_depth_view(
1391
+ processed_data, current_selector, 1
1392
  ),
1393
+ inputs=[processed_data_state, depth_view_selector],
1394
  outputs=[depth_view_selector, depth_map],
1395
  )
1396
 
1397
  depth_view_selector.change(
1398
+ fn=lambda processed_data, selector_value: (
1399
  update_depth_view(
1400
  processed_data,
1401
  int(selector_value.split()[1]) - 1,
 
1402
  )
1403
  if selector_value
1404
  else None
1405
  ),
1406
+ inputs=[processed_data_state, depth_view_selector],
1407
  outputs=[depth_map],
1408
  )
1409
 
1410
  # Normal tab navigation
1411
  prev_normal_btn.click(
1412
+ fn=lambda processed_data, current_selector: navigate_normal_view(
1413
+ processed_data, current_selector, -1
 
 
1414
  ),
1415
+ inputs=[processed_data_state, normal_view_selector],
1416
  outputs=[normal_view_selector, normal_map],
1417
  )
1418
 
1419
  next_normal_btn.click(
1420
+ fn=lambda processed_data, current_selector: navigate_normal_view(
1421
+ processed_data, current_selector, 1
 
 
1422
  ),
1423
+ inputs=[processed_data_state, normal_view_selector],
1424
  outputs=[normal_view_selector, normal_map],
1425
  )
1426
 
1427
  normal_view_selector.change(
1428
+ fn=lambda processed_data, selector_value: (
1429
  update_normal_view(
1430
  processed_data,
1431
  int(selector_value.split()[1]) - 1,
 
1432
  )
1433
  if selector_value
1434
  else None
1435
  ),
1436
+ inputs=[processed_data_state, normal_view_selector],
1437
  outputs=[normal_map],
1438
  )
1439
 
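For readers skimming the app.py diff above, the new inference path can be condensed into a short sketch. It only restates calls that appear in the diff (MapAnything.from_pretrained, load_images, model.infer, depthmap_to_world_frame); the folder path is a placeholder, and the output keys and shapes are taken from the comments in the new run_model, so treat anything beyond that as an assumption rather than documented API.

import torch

from mapanything.models import MapAnything
from mapanything.utils.geometry import depthmap_to_world_frame
from mapanything.utils.image import load_images

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the CC-BY-NC 4.0 licensed weights from the Hub and the input views
model = MapAnything.from_pretrained("facebook/map-anything").to(device)
views = load_images("path/to/images")  # placeholder folder of input frames

# apply_mask / mask_edges mirror the new checkboxes wired into gradio_demo
outputs = model.infer(views, apply_mask=True, mask_edges=True)

for pred in outputs:
    depth_z = pred["depth_z"][0].squeeze(-1)   # (H, W)
    intrinsics = pred["intrinsics"][0]         # (3, 3)
    camera_pose = pred["camera_poses"][0]      # (4, 4)
    # Lift the depth map to world-frame points, as the new run_model does
    pts3d, valid_mask = depthmap_to_world_frame(depth_z, intrinsics, camera_pose)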
hf_utils/vgg_geometry.py DELETED
@@ -1,166 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- import os
8
- import torch
9
- import numpy as np
10
-
11
-
12
- def unproject_depth_map_to_point_map(
13
- depth_map: np.ndarray, extrinsics_cam: np.ndarray, intrinsics_cam: np.ndarray
14
- ) -> np.ndarray:
15
- """
16
- Unproject a batch of depth maps to 3D world coordinates.
17
-
18
- Args:
19
- depth_map (np.ndarray): Batch of depth maps of shape (S, H, W, 1) or (S, H, W)
20
- extrinsics_cam (np.ndarray): Batch of camera extrinsic matrices of shape (S, 3, 4)
21
- intrinsics_cam (np.ndarray): Batch of camera intrinsic matrices of shape (S, 3, 3)
22
-
23
- Returns:
24
- np.ndarray: Batch of 3D world coordinates of shape (S, H, W, 3)
25
- """
26
- if isinstance(depth_map, torch.Tensor):
27
- depth_map = depth_map.cpu().numpy()
28
- if isinstance(extrinsics_cam, torch.Tensor):
29
- extrinsics_cam = extrinsics_cam.cpu().numpy()
30
- if isinstance(intrinsics_cam, torch.Tensor):
31
- intrinsics_cam = intrinsics_cam.cpu().numpy()
32
-
33
- world_points_list = []
34
- for frame_idx in range(depth_map.shape[0]):
35
- cur_world_points, _, _ = depth_to_world_coords_points(
36
-            depth_map[frame_idx].squeeze(-1), extrinsics_cam[frame_idx], intrinsics_cam[frame_idx]
-        )
-        world_points_list.append(cur_world_points)
-    world_points_array = np.stack(world_points_list, axis=0)
-
-    return world_points_array
-
-
-def depth_to_world_coords_points(
-    depth_map: np.ndarray,
-    extrinsic: np.ndarray,
-    intrinsic: np.ndarray,
-    eps=1e-8,
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """
-    Convert a depth map to world coordinates.
-
-    Args:
-        depth_map (np.ndarray): Depth map of shape (H, W).
-        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).
-        extrinsic (np.ndarray): Camera extrinsic matrix of shape (3, 4). OpenCV camera coordinate convention, cam from world.
-
-    Returns:
-        tuple[np.ndarray, np.ndarray]: World coordinates (H, W, 3) and valid depth mask (H, W).
-    """
-    if depth_map is None:
-        return None, None, None
-
-    # Valid depth mask
-    point_mask = depth_map > eps
-
-    # Convert depth map to camera coordinates
-    cam_coords_points = depth_to_cam_coords_points(depth_map, intrinsic)
-
-    # Multiply with the inverse of extrinsic matrix to transform to world coordinates
-    # extrinsic_inv is 4x4 (note closed_form_inverse_OpenCV is batched, the output is (N, 4, 4))
-    cam_to_world_extrinsic = closed_form_inverse_se3(extrinsic[None])[0]
-
-    R_cam_to_world = cam_to_world_extrinsic[:3, :3]
-    t_cam_to_world = cam_to_world_extrinsic[:3, 3]
-
-    # Apply the rotation and translation to the camera coordinates
-    world_coords_points = np.dot(cam_coords_points, R_cam_to_world.T) + t_cam_to_world  # HxWx3, 3x3 -> HxWx3
-    # world_coords_points = np.einsum("ij,hwj->hwi", R_cam_to_world, cam_coords_points) + t_cam_to_world
-
-    return world_coords_points, cam_coords_points, point_mask
-
-
-def depth_to_cam_coords_points(depth_map: np.ndarray, intrinsic: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
-    """
-    Convert a depth map to camera coordinates.
-
-    Args:
-        depth_map (np.ndarray): Depth map of shape (H, W).
-        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).
-
-    Returns:
-        tuple[np.ndarray, np.ndarray]: Camera coordinates (H, W, 3)
-    """
-    H, W = depth_map.shape
-    assert intrinsic.shape == (3, 3), "Intrinsic matrix must be 3x3"
-    assert intrinsic[0, 1] == 0 and intrinsic[1, 0] == 0, "Intrinsic matrix must have zero skew"
-
-    # Intrinsic parameters
-    fu, fv = intrinsic[0, 0], intrinsic[1, 1]
-    cu, cv = intrinsic[0, 2], intrinsic[1, 2]
-
-    # Generate grid of pixel coordinates
-    u, v = np.meshgrid(np.arange(W), np.arange(H))
-
-    # Unproject to camera coordinates
-    x_cam = (u - cu) * depth_map / fu
-    y_cam = (v - cv) * depth_map / fv
-    z_cam = depth_map
-
-    # Stack to form camera coordinates
-    cam_coords = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)
-
-    return cam_coords
-
-
-def closed_form_inverse_se3(se3, R=None, T=None):
-    """
-    Compute the inverse of each 4x4 (or 3x4) SE3 matrix in a batch.
-
-    If `R` and `T` are provided, they must correspond to the rotation and translation
-    components of `se3`. Otherwise, they will be extracted from `se3`.
-
-    Args:
-        se3: Nx4x4 or Nx3x4 array or tensor of SE3 matrices.
-        R (optional): Nx3x3 array or tensor of rotation matrices.
-        T (optional): Nx3x1 array or tensor of translation vectors.
-
-    Returns:
-        Inverted SE3 matrices with the same type and device as `se3`.
-
-    Shapes:
-        se3: (N, 4, 4)
-        R: (N, 3, 3)
-        T: (N, 3, 1)
-    """
-    # Check if se3 is a numpy array or a torch tensor
-    is_numpy = isinstance(se3, np.ndarray)
-
-    # Validate shapes
-    if se3.shape[-2:] != (4, 4) and se3.shape[-2:] != (3, 4):
-        raise ValueError(f"se3 must be of shape (N,4,4), got {se3.shape}.")
-
-    # Extract R and T if not provided
-    if R is None:
-        R = se3[:, :3, :3]  # (N,3,3)
-    if T is None:
-        T = se3[:, :3, 3:]  # (N,3,1)
-
-    # Transpose R
-    if is_numpy:
-        # Compute the transpose of the rotation for NumPy
-        R_transposed = np.transpose(R, (0, 2, 1))
-        # -R^T t for NumPy
-        top_right = -np.matmul(R_transposed, T)
-        inverted_matrix = np.tile(np.eye(4), (len(R), 1, 1))
-    else:
-        R_transposed = R.transpose(1, 2)  # (N,3,3)
-        top_right = -torch.bmm(R_transposed, T)  # (N,3,1)
-        inverted_matrix = torch.eye(4, 4)[None].repeat(len(R), 1, 1)
-        inverted_matrix = inverted_matrix.to(R.dtype).to(R.device)
-
-    inverted_matrix[:, :3, :3] = R_transposed
-    inverted_matrix[:, :3, 3:] = top_right
-
-    return inverted_matrix
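For reference, the two identities that the removed helpers implement: `depth_to_cam_coords_points` unprojects a pixel $(u, v)$ with depth $z$ through a zero-skew pinhole intrinsic, and `closed_form_inverse_se3` applies the closed-form inverse of a rigid transform rather than a generic matrix inverse:

$$
x_\text{cam} = \frac{(u - c_u)\,z}{f_u}, \qquad
y_\text{cam} = \frac{(v - c_v)\,z}{f_v}, \qquad
\begin{bmatrix} R & t \\ \mathbf{0}^\top & 1 \end{bmatrix}^{-1}
= \begin{bmatrix} R^\top & -R^\top t \\ \mathbf{0}^\top & 1 \end{bmatrix}.
$$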
hf_utils/visual_util.py CHANGED
@@ -221,14 +221,14 @@ def predictions_to_glb(
 
     # Prepare 4x4 matrices for camera extrinsics
     num_cameras = len(camera_matrices)
-    extrinsics_matrices = np.zeros((num_cameras, 4, 4))
-    extrinsics_matrices[:, :3, :4] = camera_matrices
-    extrinsics_matrices[:, 3, 3] = 1
+    # extrinsics_matrices = np.zeros((num_cameras, 4, 4))
+    # extrinsics_matrices[:, :3, :4] = camera_matrices
+    # extrinsics_matrices[:, 3, 3] = 1
 
     if show_cam:
         # Add camera models to the scene
         for i in range(num_cameras):
-            world_to_camera = extrinsics_matrices[i]
+            world_to_camera = camera_matrices[i]
             camera_to_world = np.linalg.inv(world_to_camera)
             rgba_color = colormap(i / num_cameras)
             current_color = tuple(int(255 * x) for x in rgba_color[:3])
@@ -238,7 +238,7 @@ def predictions_to_glb(
             )
 
     # Align scene to the observation of the first camera
-    scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
+    scene_3d = apply_scene_alignment(scene_3d, camera_matrices)
 
     print("GLB Scene built")
     return scene_3d
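The change above drops the 3x4-to-4x4 padding and treats each entry of `camera_matrices` as a full 4x4 world-to-camera matrix that can be inverted and passed to `apply_scene_alignment` directly. A minimal caller-side sketch, assuming a pipeline that still produces (N, 3, 4) matrices and needs to pad them itself (the helper name is hypothetical):

```python
import numpy as np

def pad_w2c_to_4x4(w2c_3x4: np.ndarray) -> np.ndarray:
    # (N, 3, 4) OpenCV world-to-camera -> (N, 4, 4) homogeneous matrices
    w2c = np.tile(np.eye(4), (len(w2c_3x4), 1, 1))
    w2c[:, :3, :4] = w2c_3x4
    return w2c
```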
mapanything/__init__.py ADDED
File without changes
mapanything/datasets/wai/ase.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class ASEWAI(BaseDataset):
mapanything/datasets/wai/bedlam.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class BedlamWAI(BaseDataset):
mapanything/datasets/wai/blendedmvs.py CHANGED
@@ -8,7 +8,7 @@ import cv2
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class BlendedMVSWAI(BaseDataset):
@@ -108,16 +108,9 @@ class BlendedMVSWAI(BaseDataset):
         view_data = load_frame(
             scene_root,
             view_file_name,
-            modalities=["image", "depth"],
-            # modalities=["image", "depth", "pred_mask/moge2"],
+            modalities=["image", "depth", "pred_mask/moge2"],
             scene_meta=scene_meta,
         )
-        ### HOTFIX: Load required additional masks manually
-        ### Remove once stability issue with scene_meta is fixed
-        mask_path = os.path.join(
-            scene_root, "moge", "v0", "mask", "moge2", f"{view_file_name}.png"
-        )
-        view_data["pred_mask/moge2"] = load_data(mask_path, "binary")
 
         # Convert necessary data to numpy
         image = view_data["image"].permute(1, 2, 0).numpy()
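A small sketch of the loading pattern the WAI dataset classes now share, with the extra masks requested as `load_frame` modalities instead of the per-file hotfix; the paths below are placeholders and `scene_meta` is loaded elsewhere in each dataset class:

```python
from mapanything.utils.wai.core import load_frame

scene_root = "/path/to/wai/scene"   # placeholder
view_file_name = "frame_000000"     # placeholder
scene_meta = ...                    # loaded once per scene by the dataset class (not shown in this diff)

view_data = load_frame(
    scene_root,
    view_file_name,
    modalities=["image", "depth", "pred_mask/moge2"],  # masks now arrive directly from load_frame
    scene_meta=scene_meta,
)
image = view_data["image"].permute(1, 2, 0).numpy()
mask = view_data["pred_mask/moge2"]
```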
mapanything/datasets/wai/dl3dv.py CHANGED
@@ -12,7 +12,7 @@ from mapanything.utils.cropping import (
     rescale_image_and_other_optional_info,
     resize_with_nearest_interpolation_to_match_aspect_ratio,
 )
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class DL3DVWAI(BaseDataset):
@@ -115,35 +115,14 @@ class DL3DVWAI(BaseDataset):
         view_data = load_frame(
             scene_root,
             view_file_name,
-            modalities=["image"],
-            # modalities=[
-            #     "image",
-            #     "pred_depth/mvsanywhere",
-            #     "pred_mask/moge2",
-            #     "depth_confidence/mvsanywhere",
-            # ],
+            modalities=[
+                "image",
+                "pred_depth/mvsanywhere",
+                "pred_mask/moge2",
+                "depth_confidence/mvsanywhere",
+            ],
             scene_meta=scene_meta,
         )
-        ### HOTFIX: Load required additional modalities manually
-        ### Remove once stability issue with scene_meta is fixed
-        mvs_depth_path = os.path.join(
-            scene_root, "mvsanywhere", "v0", "depth", f"{view_file_name}.exr"
-        )
-        mvs_conf_path = os.path.join(
-            scene_root,
-            "mvsanywhere",
-            "v0",
-            "depth_confidence",
-            f"{view_file_name}.exr",
-        )
-        mask_path = os.path.join(
-            scene_root, "moge", "v0", "mask", "moge2", f"{view_file_name}.png"
-        )
-        view_data["pred_depth/mvsanywhere"] = load_data(mvs_depth_path, "depth")
-        view_data["depth_confidence/mvsanywhere"] = load_data(
-            mvs_conf_path, "scalar"
-        )
-        view_data["pred_mask/moge2"] = load_data(mask_path, "binary")
 
         # Convert necessary data to numpy
         image = view_data["image"].permute(1, 2, 0).numpy()
mapanything/datasets/wai/dtu.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class DTUWAI(BaseDataset):
mapanything/datasets/wai/dynamicreplica.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class DynamicReplicaWAI(BaseDataset):
mapanything/datasets/wai/eth3d.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class ETH3DWAI(BaseDataset):
mapanything/datasets/wai/gta_sfm.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class GTASfMWAI(BaseDataset):
mapanything/datasets/wai/matrixcity.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class MatrixCityWAI(BaseDataset):
mapanything/datasets/wai/megadepth.py CHANGED
@@ -8,7 +8,7 @@ import cv2
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class MegaDepthWAI(BaseDataset):
@@ -109,16 +109,9 @@ class MegaDepthWAI(BaseDataset):
         view_data = load_frame(
             scene_root,
             view_file_name,
-            modalities=["image", "depth"],
-            # modalities=["image", "depth", "pred_mask/moge2"],
+            modalities=["image", "depth", "pred_mask/moge2"],
             scene_meta=scene_meta,
         )
-        ### HOTFIX: Load required additional masks manually
-        ### Remove once stability issue with scene_meta is fixed
-        mask_path = os.path.join(
-            scene_root, "moge", "v0", "mask", "moge2", f"{view_file_name}.png"
-        )
-        view_data["pred_mask/moge2"] = load_data(mask_path, "binary")
 
         # Convert necessary data to numpy
         image = view_data["image"].permute(1, 2, 0).numpy()
mapanything/datasets/wai/mpsd.py CHANGED
@@ -8,7 +8,7 @@ import cv2
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class MPSDWAI(BaseDataset):
@@ -108,16 +108,9 @@ class MPSDWAI(BaseDataset):
         view_data = load_frame(
             scene_root,
             view_file_name,
-            modalities=["image", "depth"],
-            # modalities=["image", "depth", "pred_mask/moge2"],
+            modalities=["image", "depth", "pred_mask/moge2"],
            scene_meta=scene_meta,
         )
-        ### HOTFIX: Load required additional masks manually
-        ### Remove once stability issue with scene_meta is fixed
-        mask_path = os.path.join(
-            scene_root, "moge", "v0", "mask", "moge2", f"{view_file_name}.png"
-        )
-        view_data["pred_mask/moge2"] = load_data(mask_path, "binary")
 
         # Convert necessary data to numpy
         image = view_data["image"].permute(1, 2, 0).numpy()
mapanything/datasets/wai/mvs_synth.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class MVSSynthWAI(BaseDataset):
mapanything/datasets/wai/paralleldomain4d.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class ParallelDomain4DWAI(BaseDataset):
mapanything/datasets/wai/sailvos3d.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class SAILVOS3DWAI(BaseDataset):
mapanything/datasets/wai/scannetpp.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class ScanNetPPWAI(BaseDataset):
mapanything/datasets/wai/spring.py CHANGED
@@ -8,7 +8,7 @@ import cv2
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class SpringWAI(BaseDataset):
@@ -107,16 +107,9 @@ class SpringWAI(BaseDataset):
         view_data = load_frame(
             scene_root,
             view_file_name,
-            modalities=["image", "depth", "skymask"],
-            # modalities=["image", "depth", "skymask", "pred_mask/moge2"],
+            modalities=["image", "depth", "skymask", "pred_mask/moge2"],
             scene_meta=scene_meta,
         )
-        ### HOTFIX: Load required additional masks manually
-        ### Remove once stability issue with scene_meta is fixed
-        mask_path = os.path.join(
-            scene_root, "moge", "v0", "mask", "moge2", f"{view_file_name}.png"
-        )
-        view_data["pred_mask/moge2"] = load_data(mask_path, "binary")
 
         # Convert necessary data to numpy
         image = view_data["image"].permute(1, 2, 0).numpy()
mapanything/datasets/wai/structured3d.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class Structured3DWAI(BaseDataset):
mapanything/datasets/wai/tav2_wb.py CHANGED
@@ -8,7 +8,7 @@ import cv2
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class TartanAirV2WBWAI(BaseDataset):
@@ -108,16 +108,9 @@ class TartanAirV2WBWAI(BaseDataset):
         view_data = load_frame(
             scene_root,
             view_file_name,
-            modalities=["image", "depth"],
-            # modalities=["image", "depth", "pred_mask/moge2"],
+            modalities=["image", "depth", "pred_mask/moge2"],
             scene_meta=scene_meta,
         )
-        ### HOTFIX: Load required additional masks manually
-        ### Remove once stability issue with scene_meta is fixed
-        mask_path = os.path.join(
-            scene_root, "moge", "v0", "mask", "moge2", f"{view_file_name}.png"
-        )
-        view_data["pred_mask/moge2"] = load_data(mask_path, "binary")
 
         # Convert necessary data to numpy
         image = view_data["image"].permute(1, 2, 0).numpy()
mapanything/datasets/wai/unrealstereo4k.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class UnrealStereo4KWAI(BaseDataset):
mapanything/datasets/wai/xrooms.py CHANGED
@@ -7,7 +7,7 @@ import os
 import numpy as np
 
 from mapanything.datasets.base.base_dataset import BaseDataset
-from wai import load_data, load_frame
+from mapanything.utils.wai.core import load_data, load_frame
 
 
 class XRoomsWAI(BaseDataset):
mapanything/models/external/README.md ADDED
@@ -0,0 +1,5 @@
+# External Model Code for Benchmarking & Re-Training
+
+This directory contains external model code that we use to train and benchmark external models fairly. These libraries are not part of the core MapAnything codebase and are included only for benchmarking and re-training purposes. The code in this directory is licensed under the same license as the source code from which it was derived, unless otherwise specified.
+
+The open-source Apache 2.0 License of MapAnything does not apply to these libraries.
mapanything/models/external/moge/models/v1.py CHANGED
@@ -475,7 +475,7 @@ class MoGeModel(nn.Module):
         return_dict = {"points": points, "mask": mask}
         return return_dict
 
-    @torch.inference_mode()
+    # @torch.inference_mode()
     def infer(
         self,
         image: torch.Tensor,
mapanything/models/external/moge/models/v2.py CHANGED
@@ -227,7 +227,7 @@ class MoGeModel(nn.Module):
 
         return return_dict
 
-    @torch.inference_mode()
+    # @torch.inference_mode()
     def infer(
         self,
         image: torch.Tensor,
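Both MoGe wrappers leave `@torch.inference_mode()` commented out rather than deleted, which suggests the surrounding code (benchmarking or re-training loops) is now expected to manage the autograd context itself. A minimal call-site sketch, with `moge_model` and `image` assumed to exist:

```python
import torch

with torch.inference_mode():        # explicit at the call site instead of baked into infer()
    output = moge_model.infer(image)
```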
mapanything/models/mapanything/ablations.py CHANGED
@@ -134,8 +134,10 @@ class MapAnythingAblations(nn.Module):
         # Initialize image encoder
         if self.encoder_config["uses_torch_hub"]:
             self.encoder_config["torch_hub_force_reload"] = torch_hub_force_reload
-        del self.encoder_config["uses_torch_hub"]
-        self.encoder = encoder_factory(**self.encoder_config)
+        # Create a copy of the config before deleting the key to preserve it for serialization
+        encoder_config_copy = self.encoder_config.copy()
+        del encoder_config_copy["uses_torch_hub"]
+        self.encoder = encoder_factory(**encoder_config_copy)
 
         # Initialize the encoder for ray directions
         ray_dirs_encoder_config = self.geometric_input_config["ray_dirs_encoder_config"]
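The same copy-before-delete pattern is applied in `model.py` and `modular_dust3r.py` below. A minimal standalone illustration of why the copy matters (the config values are made up):

```python
encoder_config = {"uses_torch_hub": True, "encoder_str": "dinov2_large"}  # made-up example values

encoder_config_copy = encoder_config.copy()   # shallow copy is enough: only a top-level key is removed
del encoder_config_copy["uses_torch_hub"]     # the encoder factory does not accept this key
# self.encoder = encoder_factory(**encoder_config_copy)

assert "uses_torch_hub" in encoder_config     # the original config is preserved for serialization
```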
mapanything/models/mapanything/model.py CHANGED
@@ -2,11 +2,13 @@
2
  MapAnything model class defined using UniCeption modules.
3
  """
4
 
 
5
  from functools import partial
6
- from typing import Callable, Dict, Type, Union
7
 
8
  import torch
9
  import torch.nn as nn
 
10
 
11
  from mapanything.utils.geometry import (
12
  apply_log_to_norm,
@@ -15,6 +17,11 @@ from mapanything.utils.geometry import (
15
  normalize_pose_translations,
16
  transform_pose_using_quats_and_trans_2_to_1,
17
  )
 
 
 
 
 
18
  from uniception.models.encoders import (
19
  encoder_factory,
20
  EncoderGlobalRepInput,
@@ -72,7 +79,7 @@ if hasattr(torch.backends.cuda, "matmul") and hasattr(
72
  torch.backends.cuda.matmul.allow_tf32 = True
73
 
74
 
75
- class MapAnything(nn.Module):
76
  "Modular MapAnything model class that supports input of images & optional geometric modalities (multiple reconstruction tasks)."
77
 
78
  def __init__(
@@ -139,8 +146,10 @@ class MapAnything(nn.Module):
139
  # Initialize image encoder
140
  if self.encoder_config["uses_torch_hub"]:
141
  self.encoder_config["torch_hub_force_reload"] = torch_hub_force_reload
142
- del self.encoder_config["uses_torch_hub"]
143
- self.encoder = encoder_factory(**self.encoder_config)
 
 
144
 
145
  # Initialize the encoder for ray directions
146
  ray_dirs_encoder_config = self.geometric_input_config["ray_dirs_encoder_config"]
@@ -199,6 +208,14 @@ class MapAnything(nn.Module):
199
  # Load pretrained weights
200
  self._load_pretrained_weights()
201
 
 
 
 
 
 
 
 
 
202
  def _initialize_info_sharing(self, info_sharing_config):
203
  """
204
  Initialize the information sharing module based on the configuration.
@@ -1717,3 +1734,202 @@ class MapAnything(nn.Module):
1717
  res[i]["non_ambiguous_mask_logits"] = output_mask_logits_per_view[i]
1718
 
1719
  return res
2
  MapAnything model class defined using UniCeption modules.
3
  """
4
 
5
+ import warnings
6
  from functools import partial
7
+ from typing import Any, Callable, Dict, List, Type, Union
8
 
9
  import torch
10
  import torch.nn as nn
11
+ from huggingface_hub import PyTorchModelHubMixin
12
 
13
  from mapanything.utils.geometry import (
14
  apply_log_to_norm,
 
17
  normalize_pose_translations,
18
  transform_pose_using_quats_and_trans_2_to_1,
19
  )
20
+ from mapanything.utils.inference import (
21
+ postprocess_model_outputs_for_inference,
22
+ preprocess_input_views_for_inference,
23
+ validate_input_views_for_inference,
24
+ )
25
  from uniception.models.encoders import (
26
  encoder_factory,
27
  EncoderGlobalRepInput,
 
79
  torch.backends.cuda.matmul.allow_tf32 = True
80
 
81
 
82
+ class MapAnything(nn.Module, PyTorchModelHubMixin):
83
  "Modular MapAnything model class that supports input of images & optional geometric modalities (multiple reconstruction tasks)."
84
 
85
  def __init__(
 
146
  # Initialize image encoder
147
  if self.encoder_config["uses_torch_hub"]:
148
  self.encoder_config["torch_hub_force_reload"] = torch_hub_force_reload
149
+ # Create a copy of the config before deleting the key to preserve it for serialization
150
+ encoder_config_copy = self.encoder_config.copy()
151
+ del encoder_config_copy["uses_torch_hub"]
152
+ self.encoder = encoder_factory(**encoder_config_copy)
153
 
154
  # Initialize the encoder for ray directions
155
  ray_dirs_encoder_config = self.geometric_input_config["ray_dirs_encoder_config"]
 
208
  # Load pretrained weights
209
  self._load_pretrained_weights()
210
 
211
+ @property
212
+ def device(self) -> torch.device:
213
+ return next(self.parameters()).device
214
+
215
+ @property
216
+ def dtype(self) -> torch.dtype:
217
+ return next(self.parameters()).dtype
218
+
219
  def _initialize_info_sharing(self, info_sharing_config):
220
  """
221
  Initialize the information sharing module based on the configuration.
 
1734
  res[i]["non_ambiguous_mask_logits"] = output_mask_logits_per_view[i]
1735
 
1736
  return res
1737
+
1738
+ def _configure_geometric_input_config(
1739
+ self,
1740
+ use_calibration: bool,
1741
+ use_depth: bool,
1742
+ use_pose: bool,
1743
+ use_depth_scale: bool,
1744
+ use_pose_scale: bool,
1745
+ ):
1746
+ """
1747
+ Configure the geometric input configuration
1748
+ """
1749
+ # Store original config for restoration
1750
+ if not hasattr(self, "_original_geometric_config"):
1751
+ self._original_geometric_config = dict(self.geometric_input_config)
1752
+
1753
+ # Set the geometric input configuration
1754
+ if not (use_calibration or use_depth or use_pose):
1755
+ # No geometric inputs (images-only mode)
1756
+ self.geometric_input_config.update(
1757
+ {
1758
+ "overall_prob": 0.0,
1759
+ "dropout_prob": 1.0,
1760
+ "ray_dirs_prob": 0.0,
1761
+ "depth_prob": 0.0,
1762
+ "cam_prob": 0.0,
1763
+ "sparse_depth_prob": 0.0,
1764
+ "depth_scale_norm_all_prob": 0.0,
1765
+ "pose_scale_norm_all_prob": 0.0,
1766
+ }
1767
+ )
1768
+ else:
1769
+ # Enable geometric inputs with deterministic behavior
1770
+ self.geometric_input_config.update(
1771
+ {
1772
+ "overall_prob": 1.0,
1773
+ "dropout_prob": 0.0,
1774
+ "ray_dirs_prob": 1.0 if use_calibration else 0.0,
1775
+ "depth_prob": 1.0 if use_depth else 0.0,
1776
+ "cam_prob": 1.0 if use_pose else 0.0,
1777
+ "sparse_depth_prob": 0.0, # No sparsification during inference
1778
+ "depth_scale_norm_all_prob": 0.0 if use_depth_scale else 1.0,
1779
+ "pose_scale_norm_all_prob": 0.0 if use_pose_scale else 1.0,
1780
+ }
1781
+ )
1782
+
1783
+ def _restore_original_geometric_input_config(self):
1784
+ """
1785
+ Restore original geometric input configuration
1786
+ """
1787
+ if hasattr(self, "_original_geometric_config"):
1788
+ self.geometric_input_config.update(self._original_geometric_config)
1789
+
1790
+ @torch.inference_mode()
1791
+ def infer(
1792
+ self,
1793
+ views: List[Dict[str, Any]],
1794
+ use_amp: bool = True,
1795
+ amp_dtype: str = "bf16",
1796
+ apply_mask: bool = True,
1797
+ mask_edges: bool = True,
1798
+ edge_normal_threshold: float = 5.0,
1799
+ edge_depth_threshold: float = 0.03,
1800
+ apply_confidence_mask: bool = False,
1801
+ confidence_percentile: float = 10,
1802
+ ignore_calibration_inputs: bool = False,
1803
+ ignore_depth_inputs: bool = False,
1804
+ ignore_pose_inputs: bool = False,
1805
+ ignore_depth_scale_inputs: bool = False,
1806
+ ignore_pose_scale_inputs: bool = False,
1807
+ ) -> List[Dict[str, torch.Tensor]]:
1808
+ """
1809
+ User-friendly inference with strict input validation and automatic conversion.
1810
+
1811
+ Args:
1812
+ views: List of view dictionaries. Each dict can contain:
1813
+ Required:
1814
+ - 'img': torch.Tensor of shape (B, 3, H, W) - normalized RGB images
1815
+ - 'data_norm_type': str - normalization type used to normalize the images (must be equal to self.model.encoder.data_norm_type)
1816
+
1817
+ Optional Geometric Inputs (only one of intrinsics OR ray_directions):
1818
+ - 'intrinsics': torch.Tensor of shape (B, 3, 3) - will be converted to ray directions
1819
+ - 'ray_directions': torch.Tensor of shape (B, H, W, 3) - ray directions in camera frame
1820
+ - 'depth_z': torch.Tensor of shape (B, H, W, 1) - Z depth in camera frame (intrinsics or ray_directions must be provided)
1821
+ - 'camera_poses': torch.Tensor of shape (B, 4, 4) or tuple of (quats - (B, 4), trans - (B, 3)) - can be any world frame
1822
+ - 'is_metric_scale': bool or torch.Tensor of shape (B,) - if not provided, defaults to True
1823
+
1824
+ Optional Additional Info:
1825
+ - 'instance': List[str] where length of list is B - instance info for each view
1826
+ - 'idx': List[int] where length of list is B - index info for each view
1827
+ - 'true_shape': List[tuple] where length of list is B - true shape info (H, W) for each view
1828
+
1829
+ use_amp: Whether to use automatic mixed precision for faster inference. Defaults to True.
1830
+ amp_dtype: The dtype to use for mixed precision. Defaults to "bf16" (bfloat16). Options: "fp16", "bf16", "fp32".
1831
+ apply_mask: Whether to apply the non-ambiguous mask to the output. Defaults to True.
1832
+ mask_edges: Whether to compute an edge mask based on normals and depth and apply it to the output. Defaults to True.
1833
+ edge_normal_threshold: Tolerance threshold for normals-based edge detection. Defaults to 5.0.
1834
+ edge_depth_threshold: Relative tolerance threshold for depth-based edge detection. Defaults to 0.03.
1835
+ apply_confidence_mask: Whether to apply the confidence mask to the output. Defaults to False.
1836
+ confidence_percentile: The percentile to use for the confidence threshold. Defaults to 10.
1837
+ ignore_calibration_inputs: Whether to ignore the calibration inputs (intrinsics and ray_directions). Defaults to False.
1838
+ ignore_depth_inputs: Whether to ignore the depth inputs. Defaults to False.
1839
+ ignore_pose_inputs: Whether to ignore the pose inputs. Defaults to False.
1840
+ ignore_depth_scale_inputs: Whether to ignore the depth scale inputs. Defaults to False.
1841
+ ignore_pose_scale_inputs: Whether to ignore the pose scale inputs. Defaults to False.
1842
+
1843
+ IMPORTANT CONSTRAINTS:
1844
+ - Cannot provide both 'intrinsics' and 'ray_directions' (they represent the same information)
1845
+ - If 'depth' is provided, then 'intrinsics' or 'ray_directions' must also be provided
1846
+ - If ANY view has 'camera_poses', then view 0 (first view) MUST also have 'camera_poses'
1847
+
1848
+ Returns:
1849
+ List of prediction dictionaries, one per view. Each dict contains:
1850
+ - 'img_no_norm': torch.Tensor of shape (B, H, W, 3) - denormalized rgb images
1851
+ - 'pts3d': torch.Tensor of shape (B, H, W, 3) - predicted points in world frame
1852
+ - 'pts3d_cam': torch.Tensor of shape (B, H, W, 3) - predicted points in camera frame
1853
+ - 'ray_directions': torch.Tensor of shape (B, H, W, 3) - ray directions in camera frame
1854
+ - 'intrinsics': torch.Tensor of shape (B, 3, 3) - pinhole camera intrinsics recovered from ray directions
1855
+ - 'depth_along_ray': torch.Tensor of shape (B, H, W, 1) - depth along ray in camera frame
1856
+ - 'depth_z': torch.Tensor of shape (B, H, W, 1) - Z depth in camera frame
1857
+ - 'cam_trans': torch.Tensor of shape (B, 3) - camera translation in world frame
1858
+ - 'cam_quats': torch.Tensor of shape (B, 4) - camera quaternion in world frame
1859
+ - 'camera_poses': torch.Tensor of shape (B, 4, 4) - camera pose in world frame
1860
+ - 'metric_scaling_factor': torch.Tensor of shape (B,) - applied metric scaling factor
1861
+ - 'mask': torch.Tensor of shape (B, H, W, 1) - combo of non-ambiguous mask, edge mask and confidence-based mask if used
1862
+ - 'non_ambiguous_mask': torch.Tensor of shape (B, H, W) - non-ambiguous mask
1863
+ - 'non_ambiguous_mask_logits': torch.Tensor of shape (B, H, W) - non-ambiguous mask logits
1864
+ - 'conf': torch.Tensor of shape (B, H, W) - confidence
1865
+
1866
+ Raises:
1867
+ ValueError: For invalid inputs, missing required keys, conflicting modalities, or constraint violations
1868
+ """
1869
+ # Determine the mixed precision floating point type
1870
+ if use_amp:
1871
+ if amp_dtype == "fp16":
1872
+ amp_dtype = torch.float16
1873
+ elif amp_dtype == "bf16":
1874
+ if torch.cuda.is_bf16_supported():
1875
+ amp_dtype = torch.bfloat16
1876
+ else:
1877
+ warnings.warn(
1878
+ "bf16 is not supported on this device. Using fp16 instead."
1879
+ )
1880
+ amp_dtype = torch.float16
1881
+ elif amp_dtype == "fp32":
1882
+ amp_dtype = torch.float32
1883
+ else:
1884
+ amp_dtype = torch.float32
1885
+
1886
+ # Validate the input views
1887
+ validated_views = validate_input_views_for_inference(views)
1888
+
1889
+ # Transfer the views to the same device as the model
1890
+ ignore_keys = set(
1891
+ [
1892
+ "instance",
1893
+ "idx",
1894
+ "true_shape",
1895
+ "data_norm_type",
1896
+ ]
1897
+ )
1898
+ for view in validated_views:
1899
+ for name in view.keys():
1900
+ if name in ignore_keys:
1901
+ continue
1902
+ view[name] = view[name].to(self.device, non_blocking=True)
1903
+
1904
+ # Pre-process the input views
1905
+ processed_views = preprocess_input_views_for_inference(validated_views)
1906
+
1907
+ # Set the model input probabilities based on input args for ignoring inputs
1908
+ self._configure_geometric_input_config(
1909
+ use_calibration=not ignore_calibration_inputs,
1910
+ use_depth=not ignore_depth_inputs,
1911
+ use_pose=not ignore_pose_inputs,
1912
+ use_depth_scale=not ignore_depth_scale_inputs,
1913
+ use_pose_scale=not ignore_pose_scale_inputs,
1914
+ )
1915
+
1916
+ # Run the model
1917
+ with torch.autocast("cuda", enabled=bool(use_amp), dtype=amp_dtype):
1918
+ preds = self.forward(processed_views)
1919
+
1920
+ # Post-process the model outputs
1921
+ preds = postprocess_model_outputs_for_inference(
1922
+ raw_outputs=preds,
1923
+ input_views=processed_views,
1924
+ apply_mask=apply_mask,
1925
+ mask_edges=mask_edges,
1926
+ edge_normal_threshold=edge_normal_threshold,
1927
+ edge_depth_threshold=edge_depth_threshold,
1928
+ apply_confidence_mask=apply_confidence_mask,
1929
+ confidence_percentile=confidence_percentile,
1930
+ )
1931
+
1932
+ # Restore the original configuration
1933
+ self._restore_original_geometric_input_config()
1934
+
1935
+ return preds
mapanything/models/mapanything/modular_dust3r.py CHANGED
@@ -99,8 +99,10 @@ class ModularDUSt3R(nn.Module):
99
  # Initialize Encoder
100
  if self.encoder_config["uses_torch_hub"]:
101
  self.encoder_config["torch_hub_force_reload"] = torch_hub_force_reload
102
- del self.encoder_config["uses_torch_hub"]
103
- self.encoder = encoder_factory(**self.encoder_config)
 
 
104
 
105
  # Initialize Custom Positional Encoding if required
106
  if custom_positional_encoding is not None:
 
99
  # Initialize Encoder
100
  if self.encoder_config["uses_torch_hub"]:
101
  self.encoder_config["torch_hub_force_reload"] = torch_hub_force_reload
102
+ # Create a copy of the config before deleting the key to preserve it for serialization
103
+ encoder_config_copy = self.encoder_config.copy()
104
+ del encoder_config_copy["uses_torch_hub"]
105
+ self.encoder = encoder_factory(**encoder_config_copy)
106
 
107
  # Initialize Custom Positional Encoding if required
108
  if custom_positional_encoding is not None:
mapanything/train/losses.py CHANGED
@@ -1766,6 +1766,202 @@ class PointsPlusScaleRegr3D(Criterion, MultiLoss):
1766
  return losses, (details | {})
1767
 
1768
 
1769
  class FactoredGeometryRegr3D(Criterion, MultiLoss):
1770
  """
1771
  Regression Loss for Factored Geometry.
@@ -1787,6 +1983,7 @@ class FactoredGeometryRegr3D(Criterion, MultiLoss):
1787
  pose_quats_loss_weight=1,
1788
  pose_trans_loss_weight=1,
1789
  compute_pairwise_relative_pose_loss=False,
 
1790
  compute_world_frame_points_loss=True,
1791
  world_frame_points_loss_weight=1,
1792
  ):
@@ -1821,6 +2018,8 @@ class FactoredGeometryRegr3D(Criterion, MultiLoss):
1821
  pose_trans_loss_weight (float): Weight to use for the pose trans loss. Default: 1.
1822
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
1823
  exhaustive pairwise relative poses. Default: False.
 
 
1824
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
1825
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
1826
  """
@@ -1847,6 +2046,7 @@ class FactoredGeometryRegr3D(Criterion, MultiLoss):
1847
  self.pose_quats_loss_weight = pose_quats_loss_weight
1848
  self.pose_trans_loss_weight = pose_trans_loss_weight
1849
  self.compute_pairwise_relative_pose_loss = compute_pairwise_relative_pose_loss
 
1850
  self.compute_world_frame_points_loss = compute_world_frame_points_loss
1851
  self.world_frame_points_loss_weight = world_frame_points_loss_weight
1852
 
@@ -1869,6 +2069,19 @@ class FactoredGeometryRegr3D(Criterion, MultiLoss):
1869
  gt_ray_directions = []
1870
  gt_pose_quats = []
1871
  # Predicted quantities
 
 
 
 
 
 
 
 
 
 
 
 
 
1872
  no_norm_pr_pts = []
1873
  no_norm_pr_pts_cam = []
1874
  no_norm_pr_depth = []
@@ -1922,16 +2135,34 @@ class FactoredGeometryRegr3D(Criterion, MultiLoss):
1922
  gt_pose_quats.append(gt_pose_quats_in_view0)
1923
  no_norm_gt_pose_trans.append(no_norm_gt_pose_trans_in_view0)
1924
 
1925
- # Get predictions
1926
- no_norm_pr_pts.append(preds[i]["pts3d"])
1927
  no_norm_pr_pts_cam.append(preds[i]["pts3d_cam"])
1928
  pr_ray_directions.append(preds[i]["ray_directions"])
1929
  if self.depth_type_for_loss == "depth_along_ray":
1930
  no_norm_pr_depth.append(preds[i]["depth_along_ray"])
1931
  elif self.depth_type_for_loss == "depth_z":
1932
  no_norm_pr_depth.append(preds[i]["pts3d_cam"][..., 2:])
1933
- no_norm_pr_pose_trans.append(preds[i]["cam_trans"])
1934
- pr_pose_quats.append(preds[i]["cam_quats"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1935
 
1936
  if dist_clip is not None:
1937
  # Points that are too far-away == invalid
@@ -2443,6 +2674,7 @@ class FactoredGeometryRegr3DPlusNormalGMLoss(FactoredGeometryRegr3D):
2443
  pose_quats_loss_weight=1,
2444
  pose_trans_loss_weight=1,
2445
  compute_pairwise_relative_pose_loss=False,
 
2446
  compute_world_frame_points_loss=True,
2447
  world_frame_points_loss_weight=1,
2448
  apply_normal_and_gm_loss_to_synthetic_data_only=True,
@@ -2478,6 +2710,8 @@ class FactoredGeometryRegr3DPlusNormalGMLoss(FactoredGeometryRegr3D):
2478
  pose_trans_loss_weight (float): Weight to use for the pose trans loss. Default: 1.
2479
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
2480
  exhaustive pairwise relative poses. Default: False.
 
 
2481
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
2482
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
2483
  apply_normal_and_gm_loss_to_synthetic_data_only (bool): If True, apply the normal and gm loss only to synthetic data.
@@ -2500,6 +2734,7 @@ class FactoredGeometryRegr3DPlusNormalGMLoss(FactoredGeometryRegr3D):
2500
  pose_quats_loss_weight=pose_quats_loss_weight,
2501
  pose_trans_loss_weight=pose_trans_loss_weight,
2502
  compute_pairwise_relative_pose_loss=compute_pairwise_relative_pose_loss,
 
2503
  compute_world_frame_points_loss=compute_world_frame_points_loss,
2504
  world_frame_points_loss_weight=world_frame_points_loss_weight,
2505
  )
@@ -2895,6 +3130,7 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
2895
  pose_trans_loss_weight=1,
2896
  scale_loss_weight=1,
2897
  compute_pairwise_relative_pose_loss=False,
 
2898
  compute_world_frame_points_loss=True,
2899
  world_frame_points_loss_weight=1,
2900
  ):
@@ -2928,6 +3164,8 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
2928
  scale_loss_weight (float): Weight to use for the scale loss. Default: 1.
2929
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
2930
  exhaustive pairwise relative poses. Default: False.
 
 
2931
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
2932
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
2933
  """
@@ -2948,6 +3186,7 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
2948
  self.pose_trans_loss_weight = pose_trans_loss_weight
2949
  self.scale_loss_weight = scale_loss_weight
2950
  self.compute_pairwise_relative_pose_loss = compute_pairwise_relative_pose_loss
 
2951
  self.compute_world_frame_points_loss = compute_world_frame_points_loss
2952
  self.world_frame_points_loss_weight = world_frame_points_loss_weight
2953
 
@@ -2970,6 +3209,19 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
2970
  gt_ray_directions = []
2971
  gt_pose_quats = []
2972
  # Predicted quantities
 
 
 
 
 
 
 
 
 
 
 
 
 
2973
  no_norm_pr_pts = []
2974
  no_norm_pr_pts_cam = []
2975
  no_norm_pr_depth = []
@@ -3024,6 +3276,24 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
3024
  gt_pose_quats.append(gt_pose_quats_in_view0)
3025
  no_norm_gt_pose_trans.append(no_norm_gt_pose_trans_in_view0)
3026
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3027
  # Get predictions for normalized loss
3028
  if self.depth_type_for_loss == "depth_along_ray":
3029
  curr_view_no_norm_depth = preds[i]["depth_along_ray"]
@@ -3032,7 +3302,7 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
3032
  if "metric_scaling_factor" in preds[i].keys():
3033
  # Divide by the predicted metric scaling factor to get the raw predicted points, depth_along_ray, and pose_trans
3034
  # This detaches the predicted metric scaling factor from the geometry based loss
3035
- curr_view_no_norm_pr_pts = preds[i]["pts3d"] / preds[i][
3036
  "metric_scaling_factor"
3037
  ].unsqueeze(-1).unsqueeze(-1)
3038
  curr_view_no_norm_pr_pts_cam = preds[i]["pts3d_cam"] / preds[i][
@@ -3042,19 +3312,19 @@ class FactoredGeometryScaleRegr3D(Criterion, MultiLoss):
3042
  "metric_scaling_factor"
3043
  ].unsqueeze(-1).unsqueeze(-1)
3044
  curr_view_no_norm_pr_pose_trans = (
3045
- preds[i]["cam_trans"] / preds[i]["metric_scaling_factor"]
3046
  )
3047
  else:
3048
- curr_view_no_norm_pr_pts = preds[i]["pts3d"]
3049
  curr_view_no_norm_pr_pts_cam = preds[i]["pts3d_cam"]
3050
  curr_view_no_norm_depth = curr_view_no_norm_depth
3051
- curr_view_no_norm_pr_pose_trans = preds[i]["cam_trans"]
3052
  no_norm_pr_pts.append(curr_view_no_norm_pr_pts)
3053
  no_norm_pr_pts_cam.append(curr_view_no_norm_pr_pts_cam)
3054
  no_norm_pr_depth.append(curr_view_no_norm_depth)
3055
  no_norm_pr_pose_trans.append(curr_view_no_norm_pr_pose_trans)
3056
  pr_ray_directions.append(preds[i]["ray_directions"])
3057
- pr_pose_quats.append(preds[i]["cam_quats"])
3058
 
3059
  # Get the predicted metric scale points
3060
  if "metric_scaling_factor" in preds[i].keys():
@@ -3553,6 +3823,7 @@ class FactoredGeometryScaleRegr3DPlusNormalGMLoss(FactoredGeometryScaleRegr3D):
3553
  pose_trans_loss_weight=1,
3554
  scale_loss_weight=1,
3555
  compute_pairwise_relative_pose_loss=False,
 
3556
  compute_world_frame_points_loss=True,
3557
  world_frame_points_loss_weight=1,
3558
  apply_normal_and_gm_loss_to_synthetic_data_only=True,
@@ -3585,6 +3856,8 @@ class FactoredGeometryScaleRegr3DPlusNormalGMLoss(FactoredGeometryScaleRegr3D):
3585
  scale_loss_weight (float): Weight to use for the scale loss. Default: 1.
3586
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
3587
  exhaustive pairwise relative poses. Default: False.
 
 
3588
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
3589
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
3590
  apply_normal_and_gm_loss_to_synthetic_data_only (bool): If True, apply the normal and gm loss only to synthetic data.
@@ -3607,6 +3880,7 @@ class FactoredGeometryScaleRegr3DPlusNormalGMLoss(FactoredGeometryScaleRegr3D):
3607
  pose_trans_loss_weight=pose_trans_loss_weight,
3608
  scale_loss_weight=scale_loss_weight,
3609
  compute_pairwise_relative_pose_loss=compute_pairwise_relative_pose_loss,
 
3610
  compute_world_frame_points_loss=compute_world_frame_points_loss,
3611
  world_frame_points_loss_weight=world_frame_points_loss_weight,
3612
  )
 
1766
  return losses, (details | {})
1767
 
1768
 
1769
+ class NormalGMLoss(MultiLoss):
1770
+ """
1771
+ Normal & Gradient Matching Loss for Monocular Depth Training.
1772
+ """
1773
+
1774
+ def __init__(
1775
+ self,
1776
+ norm_predictions=True,
1777
+ norm_mode="avg_dis",
1778
+ apply_normal_and_gm_loss_to_synthetic_data_only=True,
1779
+ ):
1780
+ """
1781
+ Initialize the loss criterion for Normal & Gradient Matching Loss (currently only valid for 1 view).
1782
+ Computes:
1783
+ (1) Normal Loss over the PointMap (naturally will be in local frame) in euclidean coordinates,
1784
+ (2) Gradient Matching (GM) Loss over the Depth Z in log space. (MiDAS applied GM loss in disparity space)
1785
+
1786
+ Args:
1787
+ norm_predictions (bool): If True, normalize the predictions before computing the loss.
1788
+ norm_mode (str): Normalization mode for the gt and predicted (optional) scene representation. Default: "avg_dis".
1789
+ apply_normal_and_gm_loss_to_synthetic_data_only (bool): If True, apply the normal and gm loss only to synthetic data.
1790
+ If False, apply the normal and gm loss to all data. Default: True.
1791
+ """
1792
+ super().__init__()
1793
+ self.norm_predictions = norm_predictions
1794
+ self.norm_mode = norm_mode
1795
+ self.apply_normal_and_gm_loss_to_synthetic_data_only = (
1796
+ apply_normal_and_gm_loss_to_synthetic_data_only
1797
+ )
1798
+
1799
+ def get_all_info(self, batch, preds, dist_clip=None):
1800
+ """
1801
+ Function to get all the information needed to compute the loss.
1802
+ Returns all quantities normalized.
1803
+ """
1804
+ n_views = len(batch)
1805
+ assert n_views == 1, (
1806
+ "Normal & Gradient Matching Loss Class only supports 1 view"
1807
+ )
1808
+
1809
+ # Everything is normalized w.r.t. camera of view1
1810
+ in_camera1 = closed_form_pose_inverse(batch[0]["camera_pose"])
1811
+
1812
+ # Initialize lists to store data for all views
1813
+ no_norm_gt_pts = []
1814
+ valid_masks = []
1815
+ no_norm_pr_pts = []
1816
+
1817
+ # Get ground truth & prediction info for all views
1818
+ for i in range(n_views):
1819
+ # Get ground truth
1820
+ no_norm_gt_pts.append(geotrf(in_camera1, batch[i]["pts3d"]))
1821
+ valid_masks.append(batch[i]["valid_mask"].clone())
1822
+
1823
+ # Get predictions for normalized loss
1824
+ if "metric_scaling_factor" in preds[i].keys():
1825
+ # Divide by the predicted metric scaling factor to get the raw predicted points
1826
+ # This detaches the predicted metric scaling factor from the geometry based loss
1827
+ curr_view_no_norm_pr_pts = preds[i]["pts3d"] / preds[i][
1828
+ "metric_scaling_factor"
1829
+ ].unsqueeze(-1).unsqueeze(-1)
1830
+ else:
1831
+ curr_view_no_norm_pr_pts = preds[i]["pts3d"]
1832
+ no_norm_pr_pts.append(curr_view_no_norm_pr_pts)
1833
+
1834
+ if dist_clip is not None:
1835
+ # Points that are too far-away == invalid
1836
+ for i in range(n_views):
1837
+ dis = no_norm_gt_pts[i].norm(dim=-1)
1838
+ valid_masks[i] = valid_masks[i] & (dis <= dist_clip)
1839
+
1840
+ # Initialize normalized tensors
1841
+ gt_pts = [torch.zeros_like(pts) for pts in no_norm_gt_pts]
1842
+ pr_pts = [torch.zeros_like(pts) for pts in no_norm_pr_pts]
1843
+
1844
+ # Normalize the predicted points if specified
1845
+ if self.norm_predictions:
1846
+ pr_normalization_output = normalize_multiple_pointclouds(
1847
+ no_norm_pr_pts,
1848
+ valid_masks,
1849
+ self.norm_mode,
1850
+ ret_factor=True,
1851
+ )
1852
+ pr_pts_norm = pr_normalization_output[:-1]
1853
+
1854
+ # Normalize the ground truth points
1855
+ gt_normalization_output = normalize_multiple_pointclouds(
1856
+ no_norm_gt_pts, valid_masks, self.norm_mode, ret_factor=True
1857
+ )
1858
+ gt_pts_norm = gt_normalization_output[:-1]
1859
+
1860
+ for i in range(n_views):
1861
+ if self.norm_predictions:
1862
+ # Assign the normalized predictions
1863
+ pr_pts[i] = pr_pts_norm[i]
1864
+ else:
1865
+ # Assign the raw predicted points
1866
+ pr_pts[i] = no_norm_pr_pts[i]
1867
+ # Assign the normalized ground truth
1868
+ gt_pts[i] = gt_pts_norm[i]
1869
+
1870
+ return gt_pts, pr_pts, valid_masks
1871
+
1872
+ def compute_loss(self, batch, preds, **kw):
1873
+ gt_pts, pred_pts, valid_masks = self.get_all_info(batch, preds, **kw)
1874
+ n_views = len(batch)
1875
+ assert n_views == 1, (
1876
+ "Normal & Gradient Matching Loss Class only supports 1 view"
1877
+ )
1878
+
1879
+ normal_losses = []
1880
+ gradient_matching_losses = []
1881
+ details = {}
1882
+ running_avg_dict = {}
1883
+ self_name = type(self).__name__
1884
+
1885
+ for i in range(n_views):
1886
+ # Get the local frame points, log space depth_z & valid masks
1887
+ pred_local_pts3d = pred_pts[i]
1888
+ pred_depth_z = pred_local_pts3d[..., 2:]
1889
+ pred_depth_z = apply_log_to_norm(pred_depth_z)
1890
+ gt_local_pts3d = gt_pts[i]
1891
+ gt_depth_z = gt_local_pts3d[..., 2:]
1892
+ gt_depth_z = apply_log_to_norm(gt_depth_z)
1893
+ valid_mask_for_normal_gm_loss = valid_masks[i].clone()
1894
+
1895
+ # Update the validity mask for normal & gm loss based on the synthetic data mask if required
1896
+ if self.apply_normal_and_gm_loss_to_synthetic_data_only:
1897
+ synthetic_mask = batch[i]["is_synthetic"] # (B, )
1898
+ synthetic_mask = synthetic_mask.unsqueeze(-1).unsqueeze(-1) # (B, 1, 1)
1899
+ synthetic_mask = synthetic_mask.expand(
1900
+ -1, pred_depth_z.shape[1], pred_depth_z.shape[2]
1901
+ ) # (B, H, W)
1902
+ valid_mask_for_normal_gm_loss = (
1903
+ valid_mask_for_normal_gm_loss & synthetic_mask
1904
+ )
1905
+
1906
+ # Compute the normal loss
1907
+ normal_loss = compute_normal_loss(
1908
+ pred_local_pts3d, gt_local_pts3d, valid_mask_for_normal_gm_loss.clone()
1909
+ )
1910
+ normal_losses.append(normal_loss)
1911
+
1912
+ # Compute the gradient matching loss
1913
+ gradient_matching_loss = compute_gradient_matching_loss(
1914
+ pred_depth_z, gt_depth_z, valid_mask_for_normal_gm_loss.clone()
1915
+ )
1916
+ gradient_matching_losses.append(gradient_matching_loss)
1917
+
1918
+ # Add loss details if only valid values are present
1919
+ # Initialize or update running average directly
1920
+ # Normal loss details
1921
+ if float(normal_loss) > 0:
1922
+ details[f"{self_name}_normal_view{i + 1}"] = float(normal_loss)
1923
+ normal_avg_key = f"{self_name}_normal_avg"
1924
+ if normal_avg_key not in details:
1925
+ details[normal_avg_key] = float(normal_losses[i])
1926
+ running_avg_dict[f"{self_name}_normal_valid_views"] = 1
1927
+ else:
1928
+ normal_valid_views = (
1929
+ running_avg_dict[f"{self_name}_normal_valid_views"] + 1
1930
+ )
1931
+ running_avg_dict[f"{self_name}_normal_valid_views"] = (
1932
+ normal_valid_views
1933
+ )
1934
+ details[normal_avg_key] += (
1935
+ float(normal_losses[i]) - details[normal_avg_key]
1936
+ ) / normal_valid_views
1937
+
1938
+ # Gradient Matching loss details
1939
+ if float(gradient_matching_loss) > 0:
1940
+ details[f"{self_name}_gradient_matching_view{i + 1}"] = float(
1941
+ gradient_matching_loss
1942
+ )
1943
+ # For gradient matching loss
1944
+ gm_avg_key = f"{self_name}_gradient_matching_avg"
1945
+ if gm_avg_key not in details:
1946
+ details[gm_avg_key] = float(gradient_matching_losses[i])
1947
+ running_avg_dict[f"{self_name}_gm_valid_views"] = 1
1948
+ else:
1949
+ gm_valid_views = running_avg_dict[f"{self_name}_gm_valid_views"] + 1
1950
+ running_avg_dict[f"{self_name}_gm_valid_views"] = gm_valid_views
1951
+ details[gm_avg_key] += (
1952
+ float(gradient_matching_losses[i]) - details[gm_avg_key]
1953
+ ) / gm_valid_views
1954
+
1955
+ # Put the losses together
1956
+ loss_terms = []
1957
+ for i in range(n_views):
1958
+ loss_terms.append((normal_losses[i], None, "normal"))
1959
+ loss_terms.append((gradient_matching_losses[i], None, "gradient_matching"))
1960
+ losses = Sum(*loss_terms)
1961
+
1962
+ return losses, details
1963
+
1964
+
1965
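For context on the loss terms added in this class: the gradient-matching term follows the MiDaS recipe but is applied to log depth rather than disparity (per the docstring above). The exact expression implemented by `compute_gradient_matching_loss` is not shown in this diff; a reference multi-scale formulation is

$$
\mathcal{L}_{\mathrm{GM}} = \frac{1}{M} \sum_{s=1}^{S} \sum_{i} \left( \left| \nabla_x R^{s}_{i} \right| + \left| \nabla_y R^{s}_{i} \right| \right),
\qquad R_i = \log \hat{z}_i - \log z_i,
$$

where $R^{s}$ is the residual downsampled to scale $s$, the inner sum runs over valid pixels, and $M$ is the number of valid pixels.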
  class FactoredGeometryRegr3D(Criterion, MultiLoss):
1966
  """
1967
  Regression Loss for Factored Geometry.
 
1983
  pose_quats_loss_weight=1,
1984
  pose_trans_loss_weight=1,
1985
  compute_pairwise_relative_pose_loss=False,
1986
+ convert_predictions_to_view0_frame=False,
1987
  compute_world_frame_points_loss=True,
1988
  world_frame_points_loss_weight=1,
1989
  ):
 
2018
  pose_trans_loss_weight (float): Weight to use for the pose trans loss. Default: 1.
2019
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
2020
  exhaustive pairwise relative poses. Default: False.
2021
+ convert_predictions_to_view0_frame (bool): If True, convert predictions to view0 frame.
2022
+ Use this if the predictions are not already in the view0 frame. Default: False.
2023
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
2024
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
2025
  """
 
2046
  self.pose_quats_loss_weight = pose_quats_loss_weight
2047
  self.pose_trans_loss_weight = pose_trans_loss_weight
2048
  self.compute_pairwise_relative_pose_loss = compute_pairwise_relative_pose_loss
2049
+ self.convert_predictions_to_view0_frame = convert_predictions_to_view0_frame
2050
  self.compute_world_frame_points_loss = compute_world_frame_points_loss
2051
  self.world_frame_points_loss_weight = world_frame_points_loss_weight
2052
 
 
2069
  gt_ray_directions = []
2070
  gt_pose_quats = []
2071
  # Predicted quantities
2072
+ if self.convert_predictions_to_view0_frame:
2073
+ # Get the camera transform to convert quantities to view0 frame
2074
+ pred_camera0 = torch.eye(4, device=preds[0]["cam_quats"].device).unsqueeze(
2075
+ 0
2076
+ )
2077
+ batch_size = preds[0]["cam_quats"].shape[0]
2078
+ pred_camera0 = pred_camera0.repeat(batch_size, 1, 1)
2079
+ pred_camera0_rot = quaternion_to_rotation_matrix(
2080
+ preds[0]["cam_quats"].clone()
2081
+ )
2082
+ pred_camera0[..., :3, :3] = pred_camera0_rot
2083
+ pred_camera0[..., :3, 3] = preds[0]["cam_trans"].clone()
2084
+ pred_in_camera0 = closed_form_pose_inverse(pred_camera0)
2085
  no_norm_pr_pts = []
2086
  no_norm_pr_pts_cam = []
2087
  no_norm_pr_depth = []
 
2135
  gt_pose_quats.append(gt_pose_quats_in_view0)
2136
  no_norm_gt_pose_trans.append(no_norm_gt_pose_trans_in_view0)
2137
 
2138
+ # Get the local predictions
 
2139
  no_norm_pr_pts_cam.append(preds[i]["pts3d_cam"])
2140
  pr_ray_directions.append(preds[i]["ray_directions"])
2141
  if self.depth_type_for_loss == "depth_along_ray":
2142
  no_norm_pr_depth.append(preds[i]["depth_along_ray"])
2143
  elif self.depth_type_for_loss == "depth_z":
2144
  no_norm_pr_depth.append(preds[i]["pts3d_cam"][..., 2:])
2145
+
2146
+ # Get the predicted global predictions in view0's frame
2147
+ if self.convert_predictions_to_view0_frame:
2148
+ # Convert predictions to view0 frame
2149
+ pr_pts3d_in_view0 = geotrf(pred_in_camera0, preds[i]["pts3d"])
2150
+ pr_pose_quats_in_view0, pr_pose_trans_in_view0 = (
2151
+ transform_pose_using_quats_and_trans_2_to_1(
2152
+ preds[0]["cam_quats"],
2153
+ preds[0]["cam_trans"],
2154
+ preds[i]["cam_quats"],
2155
+ preds[i]["cam_trans"],
2156
+ )
2157
+ )
2158
+ no_norm_pr_pts.append(pr_pts3d_in_view0)
2159
+ no_norm_pr_pose_trans.append(pr_pose_trans_in_view0)
2160
+ pr_pose_quats.append(pr_pose_quats_in_view0)
2161
+ else:
2162
+ # Predictions are already in view0 frame
2163
+ no_norm_pr_pts.append(preds[i]["pts3d"])
2164
+ no_norm_pr_pose_trans.append(preds[i]["cam_trans"])
2165
+ pr_pose_quats.append(preds[i]["cam_quats"])
2166
 
2167
  if dist_clip is not None:
2168
  # Points that are too far-away == invalid
 
2674
  pose_quats_loss_weight=1,
2675
  pose_trans_loss_weight=1,
2676
  compute_pairwise_relative_pose_loss=False,
2677
+ convert_predictions_to_view0_frame=False,
2678
  compute_world_frame_points_loss=True,
2679
  world_frame_points_loss_weight=1,
2680
  apply_normal_and_gm_loss_to_synthetic_data_only=True,
 
2710
  pose_trans_loss_weight (float): Weight to use for the pose trans loss. Default: 1.
2711
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
2712
  exhaustive pairwise relative poses. Default: False.
2713
+ convert_predictions_to_view0_frame (bool): If True, convert predictions to view0 frame.
2714
+ Use this if the predictions are not already in the view0 frame. Default: False.
2715
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
2716
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
2717
  apply_normal_and_gm_loss_to_synthetic_data_only (bool): If True, apply the normal and gm loss only to synthetic data.
 
2734
  pose_quats_loss_weight=pose_quats_loss_weight,
2735
  pose_trans_loss_weight=pose_trans_loss_weight,
2736
  compute_pairwise_relative_pose_loss=compute_pairwise_relative_pose_loss,
2737
+ convert_predictions_to_view0_frame=convert_predictions_to_view0_frame,
2738
  compute_world_frame_points_loss=compute_world_frame_points_loss,
2739
  world_frame_points_loss_weight=world_frame_points_loss_weight,
2740
  )
 
3130
  pose_trans_loss_weight=1,
3131
  scale_loss_weight=1,
3132
  compute_pairwise_relative_pose_loss=False,
3133
+ convert_predictions_to_view0_frame=False,
3134
  compute_world_frame_points_loss=True,
3135
  world_frame_points_loss_weight=1,
3136
  ):
 
3164
  scale_loss_weight (float): Weight to use for the scale loss. Default: 1.
3165
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
3166
  exhaustive pairwise relative poses. Default: False.
3167
+ convert_predictions_to_view0_frame (bool): If True, convert predictions to view0 frame.
3168
+ Use this if the predictions are not already in the view0 frame. Default: False.
3169
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
3170
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
3171
  """
 
3186
  self.pose_trans_loss_weight = pose_trans_loss_weight
3187
  self.scale_loss_weight = scale_loss_weight
3188
  self.compute_pairwise_relative_pose_loss = compute_pairwise_relative_pose_loss
3189
+ self.convert_predictions_to_view0_frame = convert_predictions_to_view0_frame
3190
  self.compute_world_frame_points_loss = compute_world_frame_points_loss
3191
  self.world_frame_points_loss_weight = world_frame_points_loss_weight
3192
 
 
3209
  gt_ray_directions = []
3210
  gt_pose_quats = []
3211
  # Predicted quantities
3212
+ if self.convert_predictions_to_view0_frame:
3213
+ # Get the camera transform to convert quantities to view0 frame
3214
+ pred_camera0 = torch.eye(4, device=preds[0]["cam_quats"].device).unsqueeze(
3215
+ 0
3216
+ )
3217
+ batch_size = preds[0]["cam_quats"].shape[0]
3218
+ pred_camera0 = pred_camera0.repeat(batch_size, 1, 1)
3219
+ pred_camera0_rot = quaternion_to_rotation_matrix(
3220
+ preds[0]["cam_quats"].clone()
3221
+ )
3222
+ pred_camera0[..., :3, :3] = pred_camera0_rot
3223
+ pred_camera0[..., :3, 3] = preds[0]["cam_trans"].clone()
3224
+ pred_in_camera0 = closed_form_pose_inverse(pred_camera0)
3225
  no_norm_pr_pts = []
3226
  no_norm_pr_pts_cam = []
3227
  no_norm_pr_depth = []
 
3276
  gt_pose_quats.append(gt_pose_quats_in_view0)
3277
  no_norm_gt_pose_trans.append(no_norm_gt_pose_trans_in_view0)
3278
 
3279
+ # Get the global predictions in view0's frame
3280
+ if self.convert_predictions_to_view0_frame:
3281
+ # Convert predictions to view0 frame
3282
+ pr_pts3d_in_view0 = geotrf(pred_in_camera0, preds[i]["pts3d"])
3283
+ pr_pose_quats_in_view0, pr_pose_trans_in_view0 = (
3284
+ transform_pose_using_quats_and_trans_2_to_1(
3285
+ preds[0]["cam_quats"],
3286
+ preds[0]["cam_trans"],
3287
+ preds[i]["cam_quats"],
3288
+ preds[i]["cam_trans"],
3289
+ )
3290
+ )
3291
+ else:
3292
+ # Predictions are already in view0 frame
3293
+ pr_pts3d_in_view0 = preds[i]["pts3d"]
3294
+ pr_pose_trans_in_view0 = preds[i]["cam_trans"]
3295
+ pr_pose_quats_in_view0 = preds[i]["cam_quats"]
3296
+
3297
  # Get predictions for normalized loss
3298
  if self.depth_type_for_loss == "depth_along_ray":
3299
  curr_view_no_norm_depth = preds[i]["depth_along_ray"]
 
3302
  if "metric_scaling_factor" in preds[i].keys():
3303
  # Divide by the predicted metric scaling factor to get the raw predicted points, depth_along_ray, and pose_trans
3304
  # This detaches the predicted metric scaling factor from the geometry based loss
3305
+ curr_view_no_norm_pr_pts = pr_pts3d_in_view0 / preds[i][
3306
  "metric_scaling_factor"
3307
  ].unsqueeze(-1).unsqueeze(-1)
3308
  curr_view_no_norm_pr_pts_cam = preds[i]["pts3d_cam"] / preds[i][
 
3312
  "metric_scaling_factor"
3313
  ].unsqueeze(-1).unsqueeze(-1)
3314
  curr_view_no_norm_pr_pose_trans = (
3315
+ pr_pose_trans_in_view0 / preds[i]["metric_scaling_factor"]
3316
  )
3317
  else:
3318
+ curr_view_no_norm_pr_pts = pr_pts3d_in_view0
3319
  curr_view_no_norm_pr_pts_cam = preds[i]["pts3d_cam"]
3320
  curr_view_no_norm_depth = curr_view_no_norm_depth
3321
+ curr_view_no_norm_pr_pose_trans = pr_pose_trans_in_view0
3322
  no_norm_pr_pts.append(curr_view_no_norm_pr_pts)
3323
  no_norm_pr_pts_cam.append(curr_view_no_norm_pr_pts_cam)
3324
  no_norm_pr_depth.append(curr_view_no_norm_depth)
3325
  no_norm_pr_pose_trans.append(curr_view_no_norm_pr_pose_trans)
3326
  pr_ray_directions.append(preds[i]["ray_directions"])
3327
+ pr_pose_quats.append(pr_pose_quats_in_view0)
3328
 
3329
  # Get the predicted metric scale points
3330
  if "metric_scaling_factor" in preds[i].keys():
 
3823
  pose_trans_loss_weight=1,
3824
  scale_loss_weight=1,
3825
  compute_pairwise_relative_pose_loss=False,
3826
+ convert_predictions_to_view0_frame=False,
3827
  compute_world_frame_points_loss=True,
3828
  world_frame_points_loss_weight=1,
3829
  apply_normal_and_gm_loss_to_synthetic_data_only=True,
 
3856
  scale_loss_weight (float): Weight to use for the scale loss. Default: 1.
3857
  compute_pairwise_relative_pose_loss (bool): If True, the pose loss is computed on the
3858
  exhaustive pairwise relative poses. Default: False.
3859
+ convert_predictions_to_view0_frame (bool): If True, convert predictions to view0 frame.
3860
+ Use this if the predictions are not already in the view0 frame. Default: False.
3861
  compute_world_frame_points_loss (bool): If True, compute the world frame pointmap loss. Default: True.
3862
  world_frame_points_loss_weight (float): Weight to use for the world frame pointmap loss. Default: 1.
3863
  apply_normal_and_gm_loss_to_synthetic_data_only (bool): If True, apply the normal and gm loss only to synthetic data.
 
3880
  pose_trans_loss_weight=pose_trans_loss_weight,
3881
  scale_loss_weight=scale_loss_weight,
3882
  compute_pairwise_relative_pose_loss=compute_pairwise_relative_pose_loss,
3883
+ convert_predictions_to_view0_frame=convert_predictions_to_view0_frame,
3884
  compute_world_frame_points_loss=compute_world_frame_points_loss,
3885
  world_frame_points_loss_weight=world_frame_points_loss_weight,
3886
  )
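For reference, a minimal standalone sketch of the view0-frame conversion that the new convert_predictions_to_view0_frame flag performs, mirroring the loss code above. The helper signatures are taken from how they are called in this diff, and the import location of closed_form_pose_inverse / geotrf is assumed to be mapanything.utils.geometry (quaternion_to_rotation_matrix is defined there):

import torch

from mapanything.utils.geometry import (  # assumed import location for the pose helpers
    closed_form_pose_inverse,
    geotrf,
    quaternion_to_rotation_matrix,
)


def world_points_in_view0_frame(pred_view0, pred_view_i):
    # Build the 4x4 world-from-camera0 pose from view 0's predicted quaternion and translation
    batch_size = pred_view0["cam_quats"].shape[0]
    camera0 = torch.eye(4, device=pred_view0["cam_quats"].device).unsqueeze(0)
    camera0 = camera0.repeat(batch_size, 1, 1)
    camera0[..., :3, :3] = quaternion_to_rotation_matrix(pred_view0["cam_quats"])
    camera0[..., :3, 3] = pred_view0["cam_trans"]
    # Invert it and re-express view i's predicted world-frame points in view 0's frame
    world_in_camera0 = closed_form_pose_inverse(camera0)
    return geotrf(world_in_camera0, pred_view_i["pts3d"])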
mapanything/utils/geometry.py CHANGED
@@ -10,6 +10,7 @@ from typing import Tuple, Union
10
  import einops as ein
11
  import numpy as np
12
  import torch
 
13
 
14
  from mapanything.utils.misc import invalid_to_zeros
15
  from mapanything.utils.warnings import no_warnings
@@ -646,6 +647,96 @@ def quaternion_to_rotation_matrix(quat):
646
  return rot_matrix
647
 
648
 
 
649
  def quaternion_inverse(quat):
650
  """
651
  Compute the inverse of a quaternion.
 
10
  import einops as ein
11
  import numpy as np
12
  import torch
13
+ import torch.nn.functional as F
14
 
15
  from mapanything.utils.misc import invalid_to_zeros
16
  from mapanything.utils.warnings import no_warnings
 
647
  return rot_matrix
648
 
649
 
650
+ def rotation_matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
651
+ """
652
+ Convert rotations given as rotation matrices to quaternions.
653
+
654
+ Args:
655
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
656
+
657
+ Returns:
658
+ quaternions with real part last, as tensor of shape (..., 4).
659
+ Quaternion Order: XYZW or say ijkr, scalar-last
660
+ """
661
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
662
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
663
+
664
+ batch_dim = matrix.shape[:-2]
665
+ m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
666
+ matrix.reshape(batch_dim + (9,)), dim=-1
667
+ )
668
+
669
+ q_abs = _sqrt_positive_part(
670
+ torch.stack(
671
+ [
672
+ 1.0 + m00 + m11 + m22,
673
+ 1.0 + m00 - m11 - m22,
674
+ 1.0 - m00 + m11 - m22,
675
+ 1.0 - m00 - m11 + m22,
676
+ ],
677
+ dim=-1,
678
+ )
679
+ )
680
+
681
+ # we produce the desired quaternion multiplied by each of r, i, j, k
682
+ quat_by_rijk = torch.stack(
683
+ [
684
+ torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
685
+ torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
686
+ torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
687
+ torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
688
+ ],
689
+ dim=-2,
690
+ )
691
+
692
+ # We floor here at 0.1 but the exact level is not important; if q_abs is small,
693
+ # the candidate won't be picked.
694
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
695
+ quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
696
+
697
+ # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
698
+ # forall i; we pick the best-conditioned one (with the largest denominator)
699
+ out = quat_candidates[
700
+ F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
701
+ ].reshape(batch_dim + (4,))
702
+
703
+ # Convert from rijk to ijkr
704
+ out = out[..., [1, 2, 3, 0]]
705
+
706
+ out = standardize_quaternion(out)
707
+
708
+ return out
709
+
710
+
711
+ def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
712
+ """
713
+ Returns torch.sqrt(torch.max(0, x))
714
+ but with a zero subgradient where x is 0.
715
+ """
716
+ ret = torch.zeros_like(x)
717
+ positive_mask = x > 0
718
+ if torch.is_grad_enabled():
719
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
720
+ else:
721
+ ret = torch.where(positive_mask, torch.sqrt(x), ret)
722
+ return ret
723
+
724
+
725
+ def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
726
+ """
727
+ Convert a unit quaternion to a standard form: one in which the real
728
+ part is non-negative.
729
+
730
+ Args:
731
+ quaternions: Quaternions with real part last,
732
+ as tensor of shape (..., 4).
733
+
734
+ Returns:
735
+ Standardized quaternions as tensor of shape (..., 4).
736
+ """
737
+ return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)
738
+
739
+
740
  def quaternion_inverse(quat):
741
  """
742
  Compute the inverse of a quaternion.
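A quick round-trip check for the new rotation_matrix_to_quaternion helper. This is a sketch; it assumes quaternion_to_rotation_matrix uses the same XYZW, scalar-last convention, which is what the conversion above targets:

import torch
import torch.nn.functional as F

from mapanything.utils.geometry import (
    quaternion_to_rotation_matrix,
    rotation_matrix_to_quaternion,
    standardize_quaternion,
)

# Random unit quaternions in XYZW (scalar-last) order, standardized to a non-negative real part
quats = standardize_quaternion(F.normalize(torch.randn(8, 4), dim=-1))

# quat -> rotation matrix -> quat should recover the input
# (the sign ambiguity is removed by the standardization on both sides)
recovered = rotation_matrix_to_quaternion(quaternion_to_rotation_matrix(quats))
assert torch.allclose(recovered, quats, atol=1e-5)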
mapanything/utils/image.py CHANGED
@@ -287,6 +287,17 @@ def load_images(
287
  f"Using target resolution {target_size[0]}x{target_size[1]} (W x H) for all images"
288
  )
289
 
 
 
290
  # Second pass: Resize all images to the same target size
291
  imgs = []
292
  for path, img, W1, H1 in loaded_images:
@@ -298,16 +309,6 @@ def load_images(
298
  if verbose:
299
  print(f" - Adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
300
 
301
- if norm_type in IMAGE_NORMALIZATION_DICT.keys():
302
- img_norm = IMAGE_NORMALIZATION_DICT[norm_type]
303
- ImgNorm = tvf.Compose(
304
- [tvf.ToTensor(), tvf.Normalize(mean=img_norm.mean, std=img_norm.std)]
305
- )
306
- else:
307
- raise ValueError(
308
- f"Unknown image normalization type: {norm_type}. Available options: {list(IMAGE_NORMALIZATION_DICT.keys())}"
309
- )
310
-
311
  imgs.append(
312
  dict(
313
  img=ImgNorm(img)[None],
 
287
  f"Using target resolution {target_size[0]}x{target_size[1]} (W x H) for all images"
288
  )
289
 
290
+ # Get the image normalization function based on the norm_type
291
+ if norm_type in IMAGE_NORMALIZATION_DICT.keys():
292
+ img_norm = IMAGE_NORMALIZATION_DICT[norm_type]
293
+ ImgNorm = tvf.Compose(
294
+ [tvf.ToTensor(), tvf.Normalize(mean=img_norm.mean, std=img_norm.std)]
295
+ )
296
+ else:
297
+ raise ValueError(
298
+ f"Unknown image normalization type: {norm_type}. Available options: {list(IMAGE_NORMALIZATION_DICT.keys())}"
299
+ )
300
+
301
  # Second pass: Resize all images to the same target size
302
  imgs = []
303
  for path, img, W1, H1 in loaded_images:
 
309
  if verbose:
310
  print(f" - Adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
311
 
 
 
312
  imgs.append(
313
  dict(
314
  img=ImgNorm(img)[None],
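The change above simply hoists the normalization transform out of the per-image resize loop so it is built once. The same pattern in isolation (the mean/std values below are the standard ImageNet statistics, used purely for illustration; load_images takes the actual values from IMAGE_NORMALIZATION_DICT[norm_type]):

import torchvision.transforms as tvf
from PIL import Image

# Build the normalization transform once, outside the loop
img_norm = tvf.Compose(
    [
        tvf.ToTensor(),
        tvf.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

# Reuse it for every image instead of re-creating it per iteration
paths = ["view0.png", "view1.png"]  # hypothetical image paths
tensors = [img_norm(Image.open(p).convert("RGB"))[None] for p in paths]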
mapanything/utils/inference.py CHANGED
@@ -3,9 +3,43 @@ Inference utilities.
3
  """
4
 
5
  import warnings
 
6
 
 
7
  import torch
8
 
 
 
9
 
10
  def loss_of_one_batch_multi_view(
11
  batch,
@@ -84,3 +118,358 @@ def loss_of_one_batch_multi_view(
84
  result["loss"] = loss
85
 
86
  return result[ret] if ret else result
 
 
3
  """
4
 
5
  import warnings
6
+ from typing import Any, Dict, List
7
 
8
+ import numpy as np
9
  import torch
10
 
11
+ from mapanything.utils.geometry import (
12
+ depth_edge,
13
+ get_rays_in_camera_frame,
14
+ normals_edge,
15
+ points_to_normals,
16
+ quaternion_to_rotation_matrix,
17
+ recover_pinhole_intrinsics_from_ray_directions,
18
+ rotation_matrix_to_quaternion,
19
+ )
20
+ from mapanything.utils.image import rgb
21
+
22
+ # Hard constraints - exactly what users can provide
23
+ ALLOWED_VIEW_KEYS = {
24
+ "img", # Required - input images
25
+ "data_norm_type", # Required - normalization type of the input images
26
+ "depth_z", # Optional - Z depth maps
27
+ "ray_directions", # Optional - ray directions in camera frame
28
+ "intrinsics", # Optional - pinhole camera intrinsics (conflicts with ray_directions)
29
+ "camera_poses", # Optional - camera poses
30
+ "is_metric_scale", # Optional - whether inputs are metric scale
31
+ "true_shape", # Optional - original image shape
32
+ "idx", # Optional - index of the view
33
+ "instance", # Optional - instance info of the view
34
+ }
35
+
36
+ REQUIRED_KEYS = {"img", "data_norm_type"}
37
+
38
+ # Define conflicting keys that cannot be used together
39
+ CONFLICTING_KEYS = [
40
+ ("intrinsics", "ray_directions") # Both represent camera projection
41
+ ]
42
+
43
 
44
  def loss_of_one_batch_multi_view(
45
  batch,
 
118
  result["loss"] = loss
119
 
120
  return result[ret] if ret else result
121
+
122
+
123
+ def validate_input_views_for_inference(
124
+ views: List[Dict[str, Any]],
125
+ ) -> List[Dict[str, Any]]:
126
+ """
127
+ Strict validation of input views prior to inference.
128
+
129
+ Args:
130
+ views: List of view dictionaries
131
+
132
+ Returns:
133
+ The validated views (returned unchanged when all checks pass)
134
+
135
+ Raises:
136
+ ValueError: For invalid keys, missing required keys, conflicting inputs, or invalid camera pose constraints
137
+ """
138
+ # Ensure input is not empty
139
+ if not views:
140
+ raise ValueError("At least one view must be provided")
141
+
142
+ # Track which views have camera poses
143
+ views_with_poses = []
144
+
145
+ # Validate each view
146
+ for view_idx, view in enumerate(views):
147
+ # Check for invalid keys
148
+ provided_keys = set(view.keys())
149
+ invalid_keys = provided_keys - ALLOWED_VIEW_KEYS
150
+ if invalid_keys:
151
+ raise ValueError(
152
+ f"View {view_idx} contains invalid keys: {invalid_keys}. "
153
+ f"Allowed keys are: {sorted(ALLOWED_VIEW_KEYS)}"
154
+ )
155
+
156
+ # Check for missing required keys
157
+ missing_keys = REQUIRED_KEYS - provided_keys
158
+ if missing_keys:
159
+ raise ValueError(f"View {view_idx} missing required keys: {missing_keys}")
160
+
161
+ # Check for conflicting keys
162
+ for conflict_set in CONFLICTING_KEYS:
163
+ present_conflicts = [key for key in conflict_set if key in provided_keys]
164
+ if len(present_conflicts) > 1:
165
+ raise ValueError(
166
+ f"View {view_idx} contains conflicting keys: {present_conflicts}. "
167
+ f"Only one of {conflict_set} can be provided at a time."
168
+ )
169
+
170
+ # Check depth constraint: If depth is provided, intrinsics or ray_directions must also be provided
171
+ if "depth_z" in provided_keys:
172
+ if (
173
+ "intrinsics" not in provided_keys
174
+ and "ray_directions" not in provided_keys
175
+ ):
176
+ raise ValueError(
177
+ f"View {view_idx} depth constraint violation: If 'depth_z' is provided, "
178
+ f"then 'intrinsics' or 'ray_directions' must also be provided. "
179
+ f"Z Depth values require camera calibration information to be meaningful for an image."
180
+ )
181
+
182
+ # Track views with camera poses
183
+ if "camera_poses" in provided_keys:
184
+ views_with_poses.append(view_idx)
185
+
186
+ # Cross-view constraint: If any view has camera_poses, view 0 must have them too
187
+ if views_with_poses and 0 not in views_with_poses:
188
+ raise ValueError(
189
+ f"Camera pose constraint violation: Views {views_with_poses} have camera_poses, "
190
+ f"but view 0 (reference view) does not. When using camera_poses, the first view "
191
+ f"must also provide camera_poses to serve as the reference frame."
192
+ )
193
+
194
+ return views
195
+
196
+
197
+ def preprocess_input_views_for_inference(
198
+ views: List[Dict[str, Any]],
199
+ ) -> List[Dict[str, Any]]:
200
+ """
201
+ Pre-process input views to match the expected internal input format.
202
+
203
+ The following steps are performed:
204
+ 1. Convert intrinsics to ray directions when required. If ray directions are already provided, unit normalize them.
205
+ 2. Convert depth_z to depth_along_ray
206
+ 3. Convert camera_poses to the expected input keys (camera_pose_quats and camera_pose_trans)
207
+ 4. Default is_metric_scale to True when not provided
208
+
209
+ Args:
210
+ views: List of view dictionaries
211
+
212
+ Returns:
213
+ Preprocessed views with consistent internal format
214
+ """
215
+ processed_views = []
216
+
217
+ for view_idx, view in enumerate(views):
218
+ # Copy the view dictionary to avoid modifying the original input
219
+ processed_view = dict(view)
220
+
221
+ # Step 1: Convert intrinsics to ray_directions when required. If ray_directions are provided, unit normalize them.
222
+ if "intrinsics" in view:
223
+ images = view["img"]
224
+ height, width = images.shape[-2:]
225
+ intrinsics = view["intrinsics"]
226
+ _, ray_directions = get_rays_in_camera_frame(
227
+ intrinsics=intrinsics,
228
+ height=height,
229
+ width=width,
230
+ normalize_to_unit_sphere=True,
231
+ )
232
+ processed_view["ray_directions"] = ray_directions
233
+ del processed_view["intrinsics"]
234
+ elif "ray_directions" in view:
235
+ ray_directions = view["ray_directions"]
236
+ ray_norm = torch.norm(ray_directions, dim=-1, keepdim=True)
237
+ processed_view["ray_directions"] = ray_directions / (ray_norm + 1e-8)
238
+
239
+ # Step 2: Convert depth_z to depth_along_ray
240
+ if "depth_z" in view:
241
+ depth_z = view["depth_z"]
242
+ ray_directions = processed_view["ray_directions"]
243
+ ray_directions_unit_plane = ray_directions / ray_directions[..., 2:3]
244
+ pts3d_cam = depth_z * ray_directions_unit_plane
245
+ depth_along_ray = torch.norm(pts3d_cam, dim=-1, keepdim=True)
246
+ processed_view["depth_along_ray"] = depth_along_ray
247
+ del processed_view["depth_z"]
248
+
249
+ # Step 3: Convert camera_poses to expected input keys
250
+ if "camera_poses" in view:
251
+ camera_poses = view["camera_poses"]
252
+ if isinstance(camera_poses, tuple) and len(camera_poses) == 2:
253
+ quats, trans = camera_poses
254
+ processed_view["camera_pose_quats"] = quats
255
+ processed_view["camera_pose_trans"] = trans
256
+ elif torch.is_tensor(camera_poses) and camera_poses.shape[-2:] == (4, 4):
257
+ rotation_matrices = camera_poses[:, :3, :3]
258
+ translation_vectors = camera_poses[:, :3, 3]
259
+ quats = rotation_matrix_to_quaternion(rotation_matrices)
260
+ processed_view["camera_pose_quats"] = quats
261
+ processed_view["camera_pose_trans"] = translation_vectors
262
+ else:
263
+ raise ValueError(
264
+ f"View {view_idx}: camera_poses must be either a tuple of (quats, trans) "
265
+ f"or a tensor of (B, 4, 4) transformation matrices."
266
+ )
267
+ del processed_view["camera_poses"]
268
+
269
+ # Step 4: Default is_metric_scale to True when not provided
270
+ if "is_metric_scale" not in processed_view:
271
+ # Get batch size from the image tensor
272
+ batch_size = view["img"].shape[0]
273
+ # Default to True for all samples in the batch
274
+ processed_view["is_metric_scale"] = torch.ones(
275
+ batch_size, dtype=torch.bool, device=view["img"].device
276
+ )
277
+
278
+ # Rename keys to match expected model input format
279
+ if "ray_directions" in processed_view:
280
+ processed_view["ray_directions_cam"] = processed_view["ray_directions"]
281
+ del processed_view["ray_directions"]
282
+
283
+ # Append the processed view to the list
284
+ processed_views.append(processed_view)
285
+
286
+ return processed_views
287
+
288
+
289
+ def postprocess_model_outputs_for_inference(
290
+ raw_outputs: List[Dict[str, torch.Tensor]],
291
+ input_views: List[Dict[str, Any]],
292
+ apply_mask: bool = True,
293
+ mask_edges: bool = True,
294
+ edge_normal_threshold: float = 5.0,
295
+ edge_depth_threshold: float = 0.03,
296
+ apply_confidence_mask: bool = False,
297
+ confidence_percentile: float = 10,
298
+ ) -> List[Dict[str, torch.Tensor]]:
299
+ """
300
+ Post-process raw model outputs by copying raw outputs and adding essential derived fields.
301
+
302
+ This function simplifies the raw model outputs by:
303
+ 1. Copying all raw outputs as-is
304
+ 2. Adding denormalized images (img_no_norm)
305
+ 3. Adding Z depth (depth_z) from camera frame points
306
+ 4. Recovering pinhole camera intrinsics from ray directions
307
+ 5. Adding camera pose matrices (camera_poses) if pose data is available
308
+ 6. Applying mask to dense geometry outputs if requested (supports edge masking and confidence masking)
309
+
310
+ Args:
311
+ raw_outputs: List of raw model output dictionaries, one per view
312
+ input_views: List of original input view dictionaries, one per view
313
+ apply_mask: Whether to apply non-ambiguous mask to dense outputs. Defaults to True.
314
+ mask_edges: Whether to compute an edge mask based on normals and depth and apply it to the output. Defaults to True.
315
+ apply_confidence_mask: Whether to apply the confidence mask to the output. Defaults to False.
316
+ confidence_percentile: The percentile to use for the confidence threshold. Defaults to 10.
317
+
318
+ Returns:
319
+ List of processed output dictionaries containing:
320
+ - All original raw outputs (after masking dense geometry outputs if requested)
321
+ - 'img_no_norm': Denormalized RGB images (B, H, W, 3)
322
+ - 'depth_z': Z depth from camera frame (B, H, W, 1) if points in camera frame available
323
+ - 'intrinsics': Recovered pinhole camera intrinsics (B, 3, 3) if ray directions available
324
+ - 'camera_poses': 4x4 pose matrices (B, 4, 4) if pose data available
325
+ - 'mask': comprehensive mask for dense geometry outputs (B, H, W, 1) if requested
326
+
327
+ """
328
+ processed_outputs = []
329
+
330
+ for view_idx, (raw_output, original_view) in enumerate(
331
+ zip(raw_outputs, input_views)
332
+ ):
333
+ # Start by copying all raw outputs
334
+ processed_output = dict(raw_output)
335
+
336
+ # 1. Add denormalized images
337
+ img = original_view["img"] # Shape: (B, 3, H, W)
338
+ data_norm_type = original_view["data_norm_type"][0]
339
+ img_hwc = rgb(img, data_norm_type)
340
+
341
+ # Convert numpy back to torch if needed (rgb returns numpy)
342
+ if isinstance(img_hwc, np.ndarray):
343
+ img_hwc = torch.from_numpy(img_hwc).to(img.device)
344
+
345
+ processed_output["img_no_norm"] = img_hwc
346
+
347
+ # 2. Add Z depth if we have camera frame points
348
+ if "pts3d_cam" in processed_output:
349
+ processed_output["depth_z"] = processed_output["pts3d_cam"][..., 2:3]
350
+
351
+ # 3. Recover pinhole camera intrinsics from ray directions if available
352
+ if "ray_directions" in processed_output:
353
+ intrinsics = recover_pinhole_intrinsics_from_ray_directions(
354
+ processed_output["ray_directions"]
355
+ )
356
+ processed_output["intrinsics"] = intrinsics
357
+
358
+ # 4. Add camera pose matrices if both translation and quaternions are available
359
+ if "cam_trans" in processed_output and "cam_quats" in processed_output:
360
+ cam_trans = processed_output["cam_trans"] # (B, 3)
361
+ cam_quats = processed_output["cam_quats"] # (B, 4)
362
+ batch_size = cam_trans.shape[0]
363
+
364
+ # Convert quaternions to rotation matrices
365
+ rotation_matrices = quaternion_to_rotation_matrix(cam_quats) # (B, 3, 3)
366
+
367
+ # Create 4x4 pose matrices
368
+ pose_matrices = (
369
+ torch.eye(4, device=img.device).unsqueeze(0).repeat(batch_size, 1, 1)
370
+ )
371
+ pose_matrices[:, :3, :3] = rotation_matrices
372
+ pose_matrices[:, :3, 3] = cam_trans
373
+
374
+ processed_output["camera_poses"] = pose_matrices # (B, 4, 4)
375
+
376
+ # 5. Apply comprehensive mask to dense geometry outputs if requested
377
+ if apply_mask:
378
+ final_mask = None
379
+
380
+ # Start with non-ambiguous mask if available
381
+ if "non_ambiguous_mask" in processed_output:
382
+ non_ambiguous_mask = (
383
+ processed_output["non_ambiguous_mask"].cpu().numpy()
384
+ ) # (B, H, W)
385
+ final_mask = non_ambiguous_mask
386
+
387
+ # Apply confidence mask if requested and available
388
+ if apply_confidence_mask and "conf" in processed_output:
389
+ confidences = processed_output["conf"].cpu() # (B, H, W)
390
+ # Compute percentile threshold for each batch element
391
+ batch_size = confidences.shape[0]
392
+ conf_mask = torch.zeros_like(confidences, dtype=torch.bool)
393
+ percentile_threshold = (
394
+ torch.quantile(
395
+ confidences.reshape(batch_size, -1),
396
+ confidence_percentile / 100.0,
397
+ dim=1,
398
+ )
399
+ .unsqueeze(-1)
400
+ .unsqueeze(-1)
401
+ ) # Shape: (B, 1, 1)
402
+
403
+ # Compute mask for each batch element
404
+ conf_mask = confidences > percentile_threshold
405
+ conf_mask = conf_mask.numpy()
406
+
407
+ if final_mask is not None:
408
+ final_mask = final_mask & conf_mask
409
+ else:
410
+ final_mask = conf_mask
411
+
412
+ # Apply edge mask if requested and we have the required data
413
+ if mask_edges and final_mask is not None and "pts3d" in processed_output:
414
+ # Get 3D points for edge computation
415
+ pred_pts3d = processed_output["pts3d"].cpu().numpy() # (B, H, W, 3)
416
+ batch_size, height, width = final_mask.shape
417
+
418
+ edge_masks = []
419
+ for b in range(batch_size):
420
+ batch_final_mask = final_mask[b] # (H, W)
421
+ batch_pts3d = pred_pts3d[b] # (H, W, 3)
422
+
423
+ if batch_final_mask.any(): # Only compute if we have valid points
424
+ # Compute normals and normal-based edge mask
425
+ normals, normals_mask = points_to_normals(
426
+ batch_pts3d, mask=batch_final_mask
427
+ )
428
+ normal_edges = normals_edge(
429
+ normals, tol=edge_normal_threshold, mask=normals_mask
430
+ )
431
+
432
+ # Compute depth-based edge mask
433
+ depth_z = (
434
+ processed_output["depth_z"][b].squeeze(-1).cpu().numpy()
435
+ )
436
+ depth_edges = depth_edge(
437
+ depth_z, rtol=edge_depth_threshold, mask=batch_final_mask
438
+ )
439
+
440
+ # Combine both edge types
441
+ edge_mask = ~(depth_edges & normal_edges)
442
+ edge_masks.append(edge_mask)
443
+ else:
444
+ # No valid points, keep all as invalid
445
+ edge_masks.append(np.zeros_like(batch_final_mask, dtype=bool))
446
+
447
+ # Stack batch edge masks and combine with final mask
448
+ edge_mask = np.stack(edge_masks, axis=0) # (B, H, W)
449
+ final_mask = final_mask & edge_mask
450
+
451
+ # Apply final mask to dense geometry outputs if we have a mask
452
+ if final_mask is not None:
453
+ # Convert mask to torch tensor
454
+ final_mask_torch = torch.from_numpy(final_mask).to(
455
+ processed_output["pts3d"].device
456
+ )
457
+ final_mask_torch = final_mask_torch.unsqueeze(-1) # (B, H, W, 1)
458
+
459
+ # Apply mask to dense geometry outputs (zero out invalid regions)
460
+ dense_geometry_keys = [
461
+ "pts3d",
462
+ "pts3d_cam",
463
+ "depth_along_ray",
464
+ "depth_z",
465
+ ]
466
+ for key in dense_geometry_keys:
467
+ if key in processed_output:
468
+ processed_output[key] = processed_output[key] * final_mask_torch
469
+
470
+ # Add mask to processed output
471
+ processed_output["mask"] = final_mask_torch
472
+
473
+ processed_outputs.append(processed_output)
474
+
475
+ return processed_outputs
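Taken together, the new helpers support a simple validate -> preprocess -> model -> postprocess flow. A usage sketch follows; only the three helpers are taken from this file, while the model call, its output format, and the assumption that load_images returns view dicts carrying "img" and "data_norm_type" are not specified here and may differ:

import torch

from mapanything.utils.image import load_images
from mapanything.utils.inference import (
    postprocess_model_outputs_for_inference,
    preprocess_input_views_for_inference,
    validate_input_views_for_inference,
)

# One dict per image; "img" and "data_norm_type" are required,
# calibration/pose keys are optional (see ALLOWED_VIEW_KEYS above)
views = load_images(["view0.png", "view1.png"])  # hypothetical paths
views = validate_input_views_for_inference(views)
views = preprocess_input_views_for_inference(views)

model = ...  # placeholder: an initialized MapAnything model (call signature assumed)
with torch.no_grad():
    raw_outputs = model(views)  # assumed: one raw output dict per view

outputs = postprocess_model_outputs_for_inference(
    raw_outputs,
    views,
    apply_mask=True,
    mask_edges=True,
)
pts3d = outputs[0]["pts3d"]  # masked world-frame pointmap of the first view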
mapanything/utils/viz.py CHANGED
@@ -110,7 +110,7 @@ def script_add_rerun_args(parser: ArgumentParser) -> None:
110
  parser.add_argument(
111
  "--url",
112
  type=str,
113
- default="rerun+http://127.0.0.1:9081/proxy",
114
  help="Connect to this HTTP(S) URL",
115
  )
116
  parser.add_argument(
@@ -129,7 +129,7 @@ def init_rerun_args(
129
  headless=True,
130
  connect=True,
131
  serve=False,
132
- url="rerun+http://127.0.0.1:9081/proxy",
133
  save=None,
134
  stdout=False,
135
  ) -> Namespace:
 
110
  parser.add_argument(
111
  "--url",
112
  type=str,
113
+ default="rerun+http://127.0.0.1:2004/proxy",
114
  help="Connect to this HTTP(S) URL",
115
  )
116
  parser.add_argument(
 
129
  headless=True,
130
  connect=True,
131
  serve=False,
132
+ url="rerun+http://127.0.0.1:2004/proxy",
133
  save=None,
134
  stdout=False,
135
  ) -> Namespace:
mapanything/utils/wai/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ This utils module contains a port of wai-core scripts/methods for MapAnything.
3
+ """
mapanything/utils/wai/basic_dataset.py ADDED
@@ -0,0 +1,131 @@
 
 
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ import torch
5
+ from box import Box
6
+
7
+ from mapanything.utils.wai.core import get_frame_index, load_data, load_frame
8
+ from mapanything.utils.wai.ops import stack
9
+ from mapanything.utils.wai.scene_frame import get_scene_frame_names
10
+
11
+
12
+ class BasicSceneframeDataset(torch.utils.data.Dataset):
13
+ """Basic wai dataset to iterative over frames of scenes"""
14
+
15
+ @staticmethod
16
+ def collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
17
+ return stack(batch)
18
+
19
+ def __init__(
20
+ self,
21
+ cfg: Box,
22
+ ):
23
+ """
24
+ Initialize the BasicSceneframeDataset.
25
+
26
+ Args:
27
+ cfg (Box): Configuration object containing dataset parameters including:
28
+ - root: Root directory containing scene data
29
+ - frame_modalities: List of modalities to load for each frame
30
+ - key_remap: Optional dictionary mapping original keys to new keys
31
+ """
32
+ super().__init__()
33
+ self.cfg = cfg
34
+ self.root = cfg.root
35
+ keyframes = cfg.get("use_keyframes", True)
36
+ self.scene_frame_names = get_scene_frame_names(cfg, keyframes=keyframes)
37
+ self.scene_frame_list = [
38
+ (scene_name, frame_name)
39
+ for scene_name, frame_names in self.scene_frame_names.items()
40
+ for frame_name in frame_names
41
+ ]
42
+ self._scene_cache = {}
43
+
44
+ def __len__(self):
45
+ """
46
+ Get the total number of scene-frame pairs in the dataset.
47
+
48
+ Returns:
49
+ int: The number of scene-frame pairs.
50
+ """
51
+ return len(self.scene_frame_list)
52
+
53
+ def _load_scene(self, scene_name: str) -> dict[str, Any]:
54
+ """
55
+ Load scene data for a given scene name.
56
+
57
+ Args:
58
+ scene_name (str): The name of the scene to load.
59
+
60
+ Returns:
61
+ dict: A dictionary containing scene data, including scene metadata.
62
+ """
63
+ # load scene data
64
+ scene_data = {}
65
+ scene_data["meta"] = load_data(
66
+ Path(
67
+ self.root,
68
+ scene_name,
69
+ self.cfg.get("scene_meta_path", "scene_meta.json"),
70
+ ),
71
+ "scene_meta",
72
+ )
73
+
74
+ return scene_data
75
+
76
+ def _load_scene_frame(
77
+ self, scene_name: str, frame_name: str | float
78
+ ) -> dict[str, Any]:
79
+ """
80
+ Load data for a specific frame from a specific scene.
81
+
82
+ This method loads scene data if not already cached, then loads the specified frame
83
+ from that scene with the modalities specified in the configuration.
84
+
85
+ Args:
86
+ scene_name (str): The name of the scene containing the frame.
87
+ frame_name (str or float): The name/timestamp of the frame to load.
88
+
89
+ Returns:
90
+ dict: A dictionary containing the loaded frame data with requested modalities.
91
+ """
92
+ scene_frame_data = {}
93
+ if not (scene_data := self._scene_cache.get(scene_name)):
94
+ scene_data = self._load_scene(scene_name)
95
+ # for now only cache the last scene
96
+ self._scene_cache = {}
97
+ self._scene_cache[scene_name] = scene_data
98
+
99
+ frame_idx = get_frame_index(scene_data["meta"], frame_name)
100
+
101
+ scene_frame_data["scene_name"] = scene_name
102
+ scene_frame_data["frame_name"] = frame_name
103
+ scene_frame_data["scene_path"] = str(Path(self.root, scene_name))
104
+ scene_frame_data["frame_idx"] = frame_idx
105
+ scene_frame_data.update(
106
+ load_frame(
107
+ Path(self.root, scene_name),
108
+ frame_name,
109
+ modalities=self.cfg.frame_modalities,
110
+ scene_meta=scene_data["meta"],
111
+ )
112
+ )
113
+ # Remap key names
114
+ for key, new_key in self.cfg.get("key_remap", {}).items():
115
+ if key in scene_frame_data:
116
+ scene_frame_data[new_key] = scene_frame_data.pop(key)
117
+
118
+ return scene_frame_data
119
+
120
+ def __getitem__(self, index: int) -> dict[str, Any]:
121
+ """
122
+ Get a specific scene-frame pair by index.
123
+
124
+ Args:
125
+ index (int): The index of the scene-frame pair to retrieve.
126
+
127
+ Returns:
128
+ dict: A dictionary containing the loaded frame data with requested modalities.
129
+ """
130
+ scene_frame = self._load_scene_frame(*self.scene_frame_list[index])
131
+ return scene_frame
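A usage sketch for the new dataset. The config keys below are illustrative; the exact keys consumed by get_scene_frame_names (e.g. scene selection filters) depend on the wai scene layout:

import torch
from box import Box

from mapanything.utils.wai.basic_dataset import BasicSceneframeDataset

cfg = Box(
    {
        "root": "/path/to/wai/dataset",          # hypothetical dataset root
        "frame_modalities": ["image", "depth"],  # hypothetical modality names
    }
)

dataset = BasicSceneframeDataset(cfg)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=4,
    collate_fn=BasicSceneframeDataset.collate_fn,  # stacks per-frame dicts into a batch
)
batch = next(iter(loader))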
mapanything/utils/wai/camera.py ADDED
@@ -0,0 +1,263 @@
 
 
1
+ """
2
+ This utils script contains a port of wai-core camera methods for MapAnything.
3
+ """
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import torch
9
+ from scipy.spatial.transform import Rotation, Slerp
10
+
11
+ from mapanything.utils.wai.ops import get_dtype_device
12
+
13
+ # constants regarding camera models
14
+ PINHOLE_CAM_KEYS = ["fl_x", "fl_y", "cx", "cy", "h", "w"]
15
+ DISTORTION_PARAM_KEYS = [
16
+ "k1",
17
+ "k2",
18
+ "k3",
19
+ "k4",
20
+ "p1",
21
+ "p2",
22
+ ] # order corresponds to the OpenCV convention
23
+ CAMERA_KEYS = PINHOLE_CAM_KEYS + DISTORTION_PARAM_KEYS
24
+
25
+
26
+ def interpolate_intrinsics(
27
+ frame1: dict[str, Any],
28
+ frame2: dict[str, Any],
29
+ alpha: float,
30
+ ) -> dict[str, Any]:
31
+ """
32
+ Interpolate camera intrinsics linearly.
33
+ Args:
34
+ frame1: The first frame dictionary.
35
+ frame2: The second frame dictionary.
36
+ alpha: Interpolation parameter. alpha = 0 for frame1, alpha = 1 for frame2.
37
+ Returns:
38
+ frame_inter: dictionary with new intrinsics.
39
+ """
40
+ frame_inter = {}
41
+ for key in CAMERA_KEYS:
42
+ if key in frame1 and key in frame2:
43
+ p1 = frame1[key]
44
+ p2 = frame2[key]
45
+ frame_inter[key] = (1 - alpha) * p1 + alpha * p2
46
+ return frame_inter
47
+
48
+
49
+ def interpolate_extrinsics(
50
+ matrix1: list | np.ndarray | torch.Tensor,
51
+ matrix2: list | np.ndarray | torch.Tensor,
52
+ alpha: float,
53
+ ) -> list | np.ndarray | torch.Tensor:
54
+ """
55
+ Interpolate camera extrinsics 4x4 matrices using SLERP.
56
+ Args:
57
+ matrix1: The first matrix.
58
+ matrix2: The second matrix.
59
+ alpha: Interpolation parameter. alpha = 0 for matrix1, alpha = 1 for matrix2.
60
+ Returns:
61
+ matrix: 4x4 interpolated matrix, same type.
62
+ Raises:
63
+ ValueError: If the two matrices are not of the same type.
64
+ """
65
+ if not isinstance(matrix1, type(matrix2)):
66
+ raise ValueError("Both matrices should have the same type.")
67
+
68
+ dtype, device = get_dtype_device(matrix1)
69
+ if isinstance(matrix1, list):
70
+ mtype = "list"
71
+ matrix1 = np.array(matrix1)
72
+ matrix2 = np.array(matrix2)
73
+ elif isinstance(matrix1, np.ndarray):
74
+ mtype = "numpy"
75
+ elif isinstance(matrix1, torch.Tensor):
76
+ mtype = "torch"
77
+ matrix1 = matrix1.numpy()
78
+ matrix2 = matrix2.numpy()
79
+ else:
80
+ raise ValueError(
81
+ "Only list, numpy array and torch tensors are supported as inputs."
82
+ )
83
+
84
+ R1 = matrix1[:3, :3]
85
+ t1 = matrix1[:3, 3]
86
+ R2 = matrix2[:3, :3]
87
+ t2 = matrix2[:3, 3]
88
+
89
+ # interpolate translation
90
+ t = (1 - alpha) * t1 + alpha * t2
91
+
92
+ # interpolate rotations with SLERP
93
+ R1_quat = Rotation.from_matrix(R1).as_quat()
94
+ R2_quat = Rotation.from_matrix(R2).as_quat()
95
+ rotation_slerp = Slerp([0, 1], Rotation(np.stack([R1_quat, R2_quat])))
96
+ R = rotation_slerp(alpha).as_matrix()
97
+ matrix_inter = np.eye(4)
98
+
99
+ # combine together
100
+ matrix_inter[:3, :3] = R
101
+ matrix_inter[:3, 3] = t
102
+
103
+ if mtype == "list":
104
+ matrix_inter = matrix_inter.tolist()
105
+ elif mtype == "torch":
106
+ matrix_inter = torch.from_numpy(matrix_inter).to(dtype).to(device)
107
+ elif mtype == "numpy":
108
+ matrix_inter = matrix_inter.astype(dtype)
109
+
110
+ return matrix_inter
111
+
112
+
113
+ def convert_camera_coeffs_to_pinhole_matrix(
114
+ scene_meta, frame, fmt="torch"
115
+ ) -> torch.Tensor | np.ndarray | list:
116
+ """
117
+ Convert camera intrinsics from NeRFStudio format to a 3x3 intrinsics matrix.
118
+
119
+ Args:
120
+ scene_meta: Scene metadata containing camera parameters
121
+ frame: Frame-specific camera parameters that override scene_meta
122
+
123
+ Returns:
124
+ torch.Tensor: 3x3 camera intrinsics matrix
125
+
126
+ Raises:
127
+ ValueError: If camera model is not PINHOLE or if distortion coefficients are present
128
+ """
129
+ # Check if camera model is supported
130
+ camera_model = frame.get("camera_model", scene_meta.get("camera_model"))
131
+ if camera_model != "PINHOLE":
132
+ raise ValueError("Only PINHOLE camera model supported")
133
+
134
+ # Check for unsupported distortion coefficients
135
+ if any(
136
+ (frame.get(coeff, 0) != 0) or (scene_meta.get(coeff, 0) != 0)
137
+ for coeff in DISTORTION_PARAM_KEYS
138
+ ):
139
+ raise ValueError(
140
+ "Pinhole camera does not support radial/tangential distortion -> Undistort first"
141
+ )
142
+
143
+ # Extract camera intrinsic parameters
144
+ camera_coeffs = {}
145
+ for coeff in ["fl_x", "fl_y", "cx", "cy"]:
146
+ camera_coeffs[coeff] = frame.get(coeff, scene_meta.get(coeff))
147
+ if camera_coeffs[coeff] is None:
148
+ raise ValueError(f"Missing required camera parameter: {coeff}")
149
+
150
+ # Create intrinsics matrix
151
+ intrinsics = [
152
+ [camera_coeffs["fl_x"], 0.0, camera_coeffs["cx"]],
153
+ [0.0, camera_coeffs["fl_y"], camera_coeffs["cy"]],
154
+ [0.0, 0.0, 1.0],
155
+ ]
156
+ if fmt == "torch":
157
+ intrinsics = torch.tensor(intrinsics)
158
+ elif fmt == "np":
159
+ intrinsics = np.array(intrinsics)
160
+
161
+ return intrinsics
162
+
163
+
164
+ def rotate_pinhole_90degcw(
165
+ W: int, H: int, fx: float, fy: float, cx: float, cy: float
166
+ ) -> tuple[int, int, float, float, float, float]:
167
+ """Rotates the intrinsics of a pinhole camera model by 90 degrees clockwise."""
168
+ W_new = H
169
+ H_new = W
170
+ fx_new = fy
171
+ fy_new = fx
172
+ cy_new = cx
173
+ cx_new = H - 1 - cy
174
+ return W_new, H_new, fx_new, fy_new, cx_new, cy_new
175
+
176
+
177
+ def _gl_cv_cmat() -> np.ndarray:
178
+ cmat = np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])
179
+ return cmat
180
+
181
+
182
+ def _apply_transformation(
183
+ c2ws: torch.Tensor | np.ndarray, cmat: np.ndarray
184
+ ) -> torch.Tensor | np.ndarray:
185
+ """
186
+ Convert camera poses using a provided conversion matrix.
187
+
188
+ Args:
189
+ c2ws (torch.Tensor or np.ndarray): Camera poses (batch_size, 4, 4) or (4, 4)
190
+ cmat (torch.Tensor or np.ndarray): Conversion matrix (4, 4)
191
+
192
+ Returns:
193
+ torch.Tensor or np.ndarray: Transformed camera poses (batch_size, 4, 4) or (4, 4)
194
+ """
195
+ if isinstance(c2ws, torch.Tensor):
196
+ # Clone the input tensor to avoid modifying it in-place
197
+ c2ws_transformed = c2ws.clone()
198
+ # Apply the conversion matrix to the rotation part of the camera poses
199
+ if len(c2ws.shape) == 3:
200
+ c2ws_transformed[:, :3, :3] = c2ws_transformed[
201
+ :, :3, :3
202
+ ] @ torch.from_numpy(cmat[:3, :3]).to(c2ws).unsqueeze(0)
203
+ else:
204
+ c2ws_transformed[:3, :3] = c2ws_transformed[:3, :3] @ torch.from_numpy(
205
+ cmat[:3, :3]
206
+ ).to(c2ws)
207
+
208
+ elif isinstance(c2ws, np.ndarray):
209
+ # Clone the input array to avoid modifying it in-place
210
+ c2ws_transformed = c2ws.copy()
211
+ if len(c2ws.shape) == 3: # batched
212
+ # Apply the conversion matrix to the rotation part of the camera poses
213
+ c2ws_transformed[:, :3, :3] = np.einsum(
214
+ "ijk,lk->ijl", c2ws_transformed[:, :3, :3], cmat[:3, :3]
215
+ )
216
+ else: # single 4x4 matrix
217
+ # Apply the conversion matrix to the rotation part of the camera pose
218
+ c2ws_transformed[:3, :3] = np.dot(c2ws_transformed[:3, :3], cmat[:3, :3])
219
+
220
+ else:
221
+ raise ValueError("Input data type not supported.")
222
+
223
+ return c2ws_transformed
224
+
225
+
226
+ def gl2cv(
227
+ c2ws: torch.Tensor | np.ndarray,
228
+ return_cmat: bool = False,
229
+ ) -> torch.Tensor | np.ndarray | tuple[torch.Tensor | np.ndarray, np.ndarray]:
230
+ """
231
+ Convert camera poses from OpenGL to OpenCV coordinate system.
232
+
233
+ Args:
234
+ c2ws (torch.Tensor or np.ndarray): Camera poses (batch_size, 4, 4) or (4, 4)
235
+ return_cmat (bool): If True, return the conversion matrix along with the transformed poses
236
+
237
+ Returns:
238
+ torch.Tensor or np.ndarray: Transformed camera poses (batch_size, 4, 4) or (4, 4)
239
+ np.ndarray (optional): Conversion matrix if return_cmat is True
240
+ """
241
+ cmat = _gl_cv_cmat()
242
+ if return_cmat:
243
+ return _apply_transformation(c2ws, cmat), cmat
244
+ return _apply_transformation(c2ws, cmat)
245
+
246
+
247
+ def intrinsics_to_fov(
248
+ fx: torch.Tensor, fy: torch.Tensor, h: torch.Tensor, w: torch.Tensor
249
+ ) -> tuple[torch.Tensor, torch.Tensor]:
250
+ """
251
+ Compute the horizontal and vertical fields of view in radians from camera intrinsics.
252
+
253
+ Args:
254
+ fx (torch.Tensor): focal x
255
+ fy (torch.Tensor): focal y
256
+ h (torch.Tensor): Image height(s) with shape (B,).
257
+ w (torch.Tensor): Image width(s) with shape (B,).
258
+
259
+ Returns:
260
+ tuple[torch.Tensor, torch.Tensor]: A tuple containing the horizontal and vertical fields
261
+ of view in radians, both with shape (B,).
262
+ """
263
+ return 2 * torch.atan((w / 2) / fx), 2 * torch.atan((h / 2) / fy)
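A short sketch of two commonly needed helpers from this port, gl2cv and intrinsics_to_fov (the numeric values are illustrative only):

import numpy as np
import torch

from mapanything.utils.wai.camera import gl2cv, intrinsics_to_fov

# OpenGL -> OpenCV convention for a camera-to-world pose
c2w_gl = np.eye(4)
c2w_cv, cmat = gl2cv(c2w_gl, return_cmat=True)

# Horizontal / vertical field of view (radians) from pinhole intrinsics
fx = torch.tensor([500.0])
fy = torch.tensor([500.0])
h = torch.tensor([480.0])
w = torch.tensor([640.0])
fov_x, fov_y = intrinsics_to_fov(fx, fy, h, w)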
mapanything/utils/wai/colormaps/colors_fps_5k.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae94fe5fb565ff40d1c556ae2640d00fc068e732cb4af5bb64eef034790e07c
3
+ size 9478
mapanything/utils/wai/core.py ADDED
@@ -0,0 +1,492 @@
 
 
1
+ """
2
+ This utils script contains a port of wai-core core methods for MapAnything.
3
+ """
4
+
5
+ import logging
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from mapanything.utils.wai.camera import (
14
+ CAMERA_KEYS,
15
+ convert_camera_coeffs_to_pinhole_matrix,
16
+ interpolate_extrinsics,
17
+ interpolate_intrinsics,
18
+ )
19
+ from mapanything.utils.wai.io import _get_method, _load_scene_meta
20
+ from mapanything.utils.wai.ops import crop
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ WAI_COLORMAP_PATH = Path(__file__).parent / "colormaps"
25
+
26
+
27
+ def load_data(fname: str | Path, format_type: str | None = None, **kwargs) -> Any:
28
+ """
29
+ Loads data from a file using the appropriate method based on the file format.
30
+
31
+ Args:
32
+ fname (str or Path): The filename or path to load data from.
33
+ format_type (str, optional): The format type of the data. If None, it will be inferred from the file extension if possible.
34
+ Supported formats include: 'readable', 'scalar', 'image', 'binary', 'depth', 'normals',
35
+ 'numpy', 'ptz', 'mmap', 'scene_meta', 'labeled_image', 'mesh', 'labeled_mesh', 'caption', "latents".
36
+ **kwargs: Additional keyword arguments to pass to the loading method.
37
+
38
+ Returns:
39
+ The loaded data in the format returned by the specific loading method.
40
+
41
+ Raises:
42
+ ValueError: If the format cannot be inferred from the file extension.
43
+ NotImplementedError: If the specified format is not supported.
44
+ FileExistsError: If the file does not exist.
45
+ """
46
+ load_method = _get_method(fname, format_type, load=True)
47
+ return load_method(fname, **kwargs)
48
+
49
+
50
+ def store_data(
51
+ fname: str | Path,
52
+ data: Any,
53
+ format_type: str | None = None,
54
+ **kwargs,
55
+ ) -> Any:
56
+ """
57
+ Stores data to a file using the appropriate method based on the file format.
58
+
59
+ Args:
60
+ fname (str or Path): The filename or path to store data to.
61
+ data: The data to be stored.
62
+ format_type (str, optional): The format type of the data. If None, it will be inferred from the file extension.
63
+ **kwargs: Additional keyword arguments to pass to the storing method.
64
+
65
+ Returns:
66
+ The result of the storing method, which may vary depending on the method used.
67
+ """
68
+ store_method = _get_method(fname, format_type, load=False)
69
+ Path(fname).parent.mkdir(parents=True, exist_ok=True)
70
+ return store_method(fname, data, **kwargs)
71
+
72
+
73
+ def get_frame(
74
+ scene_meta: dict[str, Any],
75
+ frame_key: int | str | float,
76
+ ) -> dict[str, Any]:
77
+ """
78
+ Get a frame from scene_meta based on name or index.
79
+
80
+ Args:
81
+ scene_meta: Dictionary containing scene metadata
82
+ frame_key: Either a string (frame name) or integer (frame index) or float (video timestamp)
83
+
84
+ Returns:
85
+ The frame data (dict)
86
+ """
87
+ frame_idx = get_frame_index(scene_meta, frame_key)
88
+ if isinstance(frame_idx, int):
89
+ frame = scene_meta["frames"][frame_idx]
90
+ frame["_is_interpolated"] = False
91
+ else:
92
+ frame = {}
93
+ frame["frame_name"] = frame_key
94
+ left = int(frame_idx) # it's floor operation
95
+ assert left >= 0 and left < (len(scene_meta["frames"]) - 1), "Wrong index"
96
+ frame_left = scene_meta["frames"][left]
97
+ frame_right = scene_meta["frames"][left + 1]
98
+ # Interpolate intrinsics and extrinsics
99
+ frame["transform_matrix"] = interpolate_extrinsics(
100
+ frame_left["transform_matrix"],
101
+ frame_right["transform_matrix"],
102
+ frame_idx - left,
103
+ )
104
+ frame.update(
105
+ interpolate_intrinsics(
106
+ frame_left,
107
+ frame_right,
108
+ frame_idx - left,
109
+ )
110
+ )
111
+ frame["_is_interpolated"] = True
112
+ return frame
113
+
114
+
115
+ def get_intrinsics(
116
+ scene_meta,
117
+ frame_key,
118
+ fmt: str = "torch",
119
+ ) -> torch.Tensor | np.ndarray | list:
120
+ frame = get_frame(scene_meta, frame_key)
121
+ return convert_camera_coeffs_to_pinhole_matrix(scene_meta, frame, fmt=fmt)
122
+
123
+
124
+ def get_extrinsics(
125
+ scene_meta,
126
+ frame_key,
127
+ fmt: str = "torch",
128
+ ) -> torch.Tensor | np.ndarray | list | None:
129
+ frame = get_frame(scene_meta, frame_key)
130
+ if "transform_matrix" in frame:
131
+ if fmt == "torch":
132
+ return torch.tensor(frame["transform_matrix"]).reshape(4, 4).float()
133
+ elif fmt == "np":
134
+ return np.array(frame["transform_matrix"]).reshape(4, 4)
135
+ return frame["transform_matrix"]
136
+ else:
137
+ # TODO: should not happen if we enable interpolation
138
+ return None
139
+
140
+
141
+ def get_frame_index(
142
+ scene_meta: dict[str, Any],
143
+ frame_key: int | str | float,
144
+ frame_index_threshold_sec: float = 1e-4,
145
+ distance_threshold_sec: float = 2.0,
146
+ ) -> int | float:
147
+ """
148
+ Returns the frame index from scene_meta based on name (str) or index (int) or sub-frame index (float).
149
+
150
+ Args:
151
+ scene_meta: Dictionary containing scene metadata
152
+ frame_key: Either a string (frame name) or integer (frame index) or float (sub-frame index)
153
+ frame_index_threshold_sec: A threshold for nearest neighbor clipping for indexes (in seconds).
154
+ Default is 1e-4, which is 10000 fps.
155
+ distance_threshold_sec: A threshold for the maximum distance between interpolated frames (in seconds).
156
+
157
+ Returns:
158
+ Frame index (int)
159
+
160
+ Raises:
161
+ ValueError: If frame_key is not a string or integer or float
162
+ """
163
+ if isinstance(frame_key, str):
164
+ try:
165
+ return scene_meta["frame_names"][frame_key]
166
+ except KeyError as err:
167
+ error_message = (
168
+ f"Frame name not found: {frame_key} - "
169
+ f"Please verify scene_meta.json of scene: {scene_meta['dataset_name']}/{scene_meta['scene_name']}"
170
+ )
171
+ logger.error(error_message)
172
+ raise KeyError(error_message) from err
173
+
174
+ if isinstance(frame_key, int):
175
+ return frame_key
176
+
177
+ if isinstance(frame_key, float):
178
+ # If exact hit
179
+ if frame_key in scene_meta["frame_names"]:
180
+ return scene_meta["frame_names"][frame_key]
181
+
182
+ frame_names = sorted(list(scene_meta["frame_names"].keys()))
183
+ distances = np.array([frm - frame_key for frm in frame_names])
184
+ left = int(np.nonzero(distances <= 0)[0][-1])
185
+ right = left + 1
186
+
187
+ # The last frame or rounding errors
188
+ if (
189
+ left == distances.shape[0] - 1
190
+ or abs(distances[left]) < frame_index_threshold_sec
191
+ ):
192
+ return scene_meta["frame_names"][frame_names[int(left)]]
193
+ if abs(distances[right]) < frame_index_threshold_sec:
194
+ return scene_meta["frame_names"][frame_names[int(right)]]
195
+
196
+ interpolation_distance = distances[right] - distances[left]
197
+ if interpolation_distance > distance_threshold_sec:
198
+ raise ValueError(
199
+ f"Frame interpolation is forbidden for distances larger than {distance_threshold_sec}."
200
+ )
201
+ alpha = -distances[left] / interpolation_distance
202
+
203
+ return scene_meta["frame_names"][frame_names[int(left)]] + alpha
204
+
205
+ raise ValueError(f"Frame key type not supported: {frame_key} ({type(frame_key)}).")
206
+
207
+
208
+ def load_modality_data(
209
+ scene_root: Path | str,
210
+ results: dict[str, Any],
211
+ modality_dict: dict[str, Any],
212
+ modality: str,
213
+ frame: dict[str, Any] | None = None,
214
+ fmt: str = "torch",
215
+ ) -> dict[str, Any]:
216
+ """
217
+ Processes a modality by loading data from a specified path and updating the results dictionary.
218
+ This function extracts the format and path from the given modality dictionary, loads the data
219
+ from the specified path, and updates the results dictionary with the loaded data.
220
+
221
+ Args:
222
+ scene_root (str or Path): The root directory of the scene where the data is located.
223
+ results (dict): A dictionary to store the loaded modality data and optional frame path.
224
+ modality_dict (dict): A dictionary containing the modality information, including 'format'
225
+ and the path to the data.
226
+ modality (str): The key under which the loaded modality data will be stored in the results.
227
+ frame (dict, optional): A dictionary containing frame information. If provided, that means we are loading
228
+ frame modalities, otherwise it is scene modalities.
229
+
230
+ Returns:
231
+ dict: The updated results dictionary containing the loaded modality data.
232
+ """
233
+ modality_format = modality_dict["format"]
234
+
235
+ # The modality is stored as a video
236
+ if "video" in modality_format:
237
+ assert isinstance(frame["frame_name"], float), "frame_name should be float"
238
+ video_file = None
239
+ if "chunks" in modality_dict:
240
+ video_list = modality_dict["chunks"]
241
+ # Get the correct chunk of the video
242
+ for video_chunk in video_list:
243
+ if video_chunk["start"] <= frame["frame_name"] <= video_chunk["end"]:
244
+ video_file = video_chunk
245
+ break
246
+ else:
247
+ # There is only one video (no chunks)
248
+ video_file = modality_dict
249
+ if "start" not in video_file:
250
+ video_file["start"] = 0
251
+ if "end" not in video_file:
252
+ video_file["end"] = float("inf")
253
+ if not (video_file["start"] <= frame["frame_name"] <= video_file["end"]):
254
+ video_file = None
255
+
256
+ # This timestamp is not available in any of the chunks
257
+ if video_file is None:
258
+ frame_name = frame["frame_name"]
259
+ logger.warning(
260
+ f"Modality {modality} ({modality_format}) is not available at time {frame_name}"
261
+ )
262
+ return results
263
+
264
+ # Load the modality from the video
265
+ loaded_modality = load_data(
266
+ Path(scene_root, video_file["file"]),
267
+ modality_format,
268
+ frame_key=frame["frame_name"] - video_file["start"],
269
+ )
270
+
271
+ if "bbox" in video_file:
272
+ loaded_modality = crop(loaded_modality, video_file["bbox"])
273
+
274
+ if loaded_modality is not None:
275
+ results[modality] = loaded_modality
276
+
277
+ if frame:
278
+ results[f"{modality}_fname"] = video_file["file"]
279
+ else:
280
+ modality_path = [v for k, v in modality_dict.items() if k != "format"][0]
281
+ if frame:
282
+ if modality_path in frame:
283
+ fname = frame[modality_path]
284
+ else:
285
+ fname = None
286
+ else:
287
+ fname = modality_path
288
+ if fname is not None:
289
+ loaded_modality = load_data(
290
+ Path(scene_root, fname),
291
+ modality_format,
292
+ frame_key=frame["frame_name"] if frame else None,
293
+ fmt=fmt,
294
+ )
295
+ results[modality] = loaded_modality
296
+ if frame:
297
+ results[f"{modality}_fname"] = frame[modality_path]
298
+ return results
299
+
300
+
301
+ def load_modality(
302
+ scene_root: Path | str,
303
+ modality_meta: dict[str, Any],
304
+ modality: str,
305
+ frame: dict[str, Any] | None = None,
306
+ fmt: str = "torch",
307
+ ) -> dict[str, Any]:
308
+ """
309
+ Loads modality data based on the provided metadata and updates the results dictionary.
310
+ This function navigates through the modality metadata to find the specified modality,
311
+ then loads the data for each modality found.
312
+
313
+ Args:
314
+ scene_root (str or Path): The root directory of the scene where the data is located.
315
+ modality_meta (dict): A nested dictionary containing metadata for various modalities.
316
+ modality (str): A string representing the path to the desired modality within the metadata,
317
+ using '/' as a separator for nested keys.
318
+ frame (dict, optional): A dictionary containing frame information. If provided, we are operating
319
+ on frame modalities, otherwise it is scene modalities.
320
+
321
+ Returns:
322
+ dict: A dictionary containing the loaded modality data.
323
+ """
324
+ results = {}
325
+ # support for nested modalities like "pred_depth/metric3dv2"
326
+ modality_keys = modality.split("/")
327
+ current_modality = modality_meta
328
+ for key in modality_keys:
329
+ try:
330
+ current_modality = current_modality[key]
331
+ except KeyError as err:
332
+ error_message = (
333
+ f"Modality '{err.args[0]}' not found in modalities metadata. "
334
+ f"Please verify the scene_meta.json and the provided modalities in {scene_root}."
335
+ )
336
+ logger.error(error_message)
337
+ raise KeyError(error_message) from err
338
+ if "format" in current_modality:
339
+ results = load_modality_data(
340
+ scene_root, results, current_modality, modality, frame, fmt=fmt
341
+ )
342
+ else:
343
+ # nested modality, return last by default
344
+ logger.warning("Nested modality, returning last by default")
345
+ key = next(reversed(current_modality.keys()))
346
+ results = load_modality_data(
347
+ scene_root, results, current_modality[key], modality, frame, fmt=fmt
348
+ )
349
+ return results
350
+
351
+
352
+ def load_frame(
353
+ scene_root: Path | str,
354
+ frame_key: int | str | float,
355
+ modalities: str | list[str] | None = None,
356
+ scene_meta: dict[str, Any] | None = None,
357
+ load_intrinsics: bool = True,
358
+ load_extrinsics: bool = True,
359
+ fmt: str = "torch",
360
+ interpolate: bool = False,
361
+ ) -> dict[str, Any]:
362
+ """
363
+ Load a single frame from a scene with specified modalities.
364
+
365
+ Args:
366
+ scene_root (str or Path): The root directory of the scene where the data is located.
367
+ frame_key (int or str or float): Either a string (frame name) or integer (frame index) or float (video timestamp).
368
+ modalities (str or list[str], optional): The modality or list of modalities to load.
369
+ If None, only basic frame information is loaded.
370
+ scene_meta (dict, optional): Dictionary containing scene metadata. If None, it will be loaded
371
+ from scene_meta.json in the scene_root.
372
+ interpolate (bool, optional): Whether to allow interpolating between frames. Defaults to False.
373
+
374
+ Returns:
375
+ dict: A dictionary containing the loaded frame data with the requested modalities.
376
+ """
377
+ scene_root = Path(scene_root)
378
+ if scene_meta is None:
379
+ scene_meta = _load_scene_meta(scene_root / "scene_meta.json")
380
+ frame = get_frame(scene_meta, frame_key)
381
+ # compact, standardized frame representation
382
+ wai_frame = {}
383
+ if load_extrinsics:
384
+ extrinsics = get_extrinsics(
385
+ scene_meta,
386
+ frame_key,
387
+ fmt=fmt,
388
+ )
389
+ if extrinsics is not None:
390
+ wai_frame["extrinsics"] = extrinsics
391
+ if load_intrinsics:
392
+ camera_model = frame.get("camera_model", scene_meta.get("camera_model"))
393
+ wai_frame["camera_model"] = camera_model
394
+ if camera_model == "PINHOLE":
395
+ wai_frame["intrinsics"] = get_intrinsics(scene_meta, frame_key, fmt=fmt)
396
+ elif camera_model in ["OPENCV", "OPENCV_FISHEYE"]:
397
+ # optional per-frame intrinsics
398
+ for camera_key in CAMERA_KEYS:
399
+ if camera_key in frame:
400
+ wai_frame[camera_key] = float(frame[camera_key])
401
+ elif camera_key in scene_meta:
402
+ wai_frame[camera_key] = float(scene_meta[camera_key])
403
+ else:
404
+ error_message = (
405
+ f"Camera model not supported: {camera_model} - "
406
+ f"Please verify scene_meta.json of scene: {scene_meta['dataset_name']}/{scene_meta['scene_name']}"
407
+ )
408
+ logger.error(error_message)
409
+ raise NotImplementedError(error_message)
410
+ wai_frame["w"] = frame.get("w", scene_meta["w"] if "w" in scene_meta else None)
411
+ wai_frame["h"] = frame.get("h", scene_meta["h"] if "h" in scene_meta else None)
412
+ wai_frame["frame_name"] = frame["frame_name"]
413
+ wai_frame["frame_idx"] = get_frame_index(scene_meta, frame_key)
414
+ wai_frame["_is_interpolated"] = frame["_is_interpolated"]
415
+
416
+ if modalities is not None:
417
+ if isinstance(modalities, str):
418
+ modalities = [modalities]
419
+ for modality in modalities:
420
+ # Handle regex patterns in modality
421
+ if any(char in modality for char in ".|*+?()[]{}^$\\"):
422
+ # This is a regex pattern
423
+ pattern = re.compile(modality)
424
+ matching_modalities = [
425
+ m for m in scene_meta["frame_modalities"] if pattern.match(m)
426
+ ]
427
+ if not matching_modalities:
428
+ raise ValueError(
429
+ f"No modalities match the pattern: {modality} in scene: {scene_root}"
430
+ )
431
+ # Use the first matching modality
432
+ modality = matching_modalities[0]
433
+ current_modalities = load_modality(
434
+ scene_root, scene_meta["frame_modalities"], modality, frame, fmt=fmt
435
+ )
436
+ wai_frame.update(current_modalities)
437
+
438
+ return wai_frame
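A usage sketch for `load_frame` follows; the scene path and modality names are hypothetical, and the import assumes this module is exposed as `mapanything.utils.wai.core`, as added in this commit.

from pathlib import Path

from mapanything.utils.wai.core import load_frame  # assumed module path (this file)

scene_root = Path("/data/wai/some_dataset/scene_0001")  # hypothetical scene directory

frame = load_frame(
    scene_root,
    frame_key=0,                    # frame index; a frame name or float timestamp also works
    modalities=["image", "depth"],  # must be listed under scene_meta["frame_modalities"]
    fmt="torch",
)

print(frame["frame_name"], frame["h"], frame["w"])
print(frame["extrinsics"].shape)                   # pose as returned by get_extrinsics
print(frame["image"].shape, frame["depth"].shape)  # tensors loaded via the modality handlers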
439
+
440
+
441
+ def set_frame(
442
+ scene_meta: dict[str, Any],
443
+ frame_key: int | str,
444
+ new_frame: dict[str, Any],
445
+ sort: bool = False,
446
+ ) -> dict[str, Any]:
447
+ """
448
+ Replace a frame in scene_meta with a new frame.
449
+
450
+ Args:
451
+ scene_meta: Dictionary containing scene metadata.
452
+ frame_key: Either a string (frame name) or integer (frame index).
453
+ new_frame: New frame data to replace the existing frame.
454
+ sort: If True, sort the keys in the new_frame dictionary.
455
+
456
+ Returns:
457
+ Updated scene_meta dictionary.
458
+ """
459
+ frame_idx = get_frame_index(scene_meta, frame_key)
460
+ if isinstance(frame_idx, float):
461
+ raise ValueError(
462
+ f"Setting frame for sub-frame frame_key is not supported: {frame_key} ({type(frame_key)})."
463
+ )
464
+ if sort:
465
+ new_frame = {k: new_frame[k] for k in sorted(new_frame)}
466
+ scene_meta["frames"][frame_idx] = new_frame
467
+ return scene_meta
468
+
469
+
470
+ def nest_modality(
471
+ frame_modalities: dict[str, Any],
472
+ modality_name: str,
473
+ ) -> dict[str, Any]:
474
+ """
475
+ Converts a flat modality structure into a nested one based on the modality name.
476
+
477
+ Args:
478
+ frame_modalities (dict): Dictionary containing frame modalities.
479
+ modality_name (str): The name of the modality to nest.
480
+
481
+ Returns:
482
+ dict: A dictionary with the nested modality structure.
483
+ """
484
+ frame_modality = {}
485
+ if modality_name in frame_modalities:
486
+ frame_modality = frame_modalities[modality_name]
487
+ if "frame_key" in frame_modality:
488
+ # required for backwards compatibility
489
+ # converting non-nested format into nested one based on name
490
+ modality_name = frame_modality["frame_key"].split("_")[0]
491
+ frame_modality = {modality_name: frame_modality}
492
+ return frame_modality
mapanything/utils/wai/intersection_check.py ADDED
@@ -0,0 +1,462 @@
1
+ import torch
2
+ from einops import rearrange, repeat
3
+ from tqdm import tqdm
4
+
5
+
6
+ def create_frustum_from_intrinsics(
7
+ intrinsics: torch.Tensor,
8
+ near: torch.Tensor | float,
9
+ far: torch.Tensor | float,
10
+ ) -> torch.Tensor:
11
+ r"""
12
+ Create a frustum from camera intrinsics.
13
+
14
+ Args:
15
+ intrinsics (torch.Tensor): Bx3x3 Intrinsics of cameras.
16
+ near (torch.Tensor or float): [B] Near plane distance.
17
+ far (torch.Tensor or float): [B] Far plane distance.
18
+
19
+ Returns:
20
+ frustum (torch.Tensor): Bx8x3 batch of frustum points following the order:
21
+ 5 ---------- 4
22
+ |\ /|
23
+ 6 \ / 7
24
+ \ 1 ---- 0 /
25
+ \| |/
26
+ 2 ---- 3
27
+ """
28
+
29
+ fx, fy = intrinsics[:, 0, 0], intrinsics[:, 1, 1]
30
+ cx, cy = intrinsics[:, 0, 2], intrinsics[:, 1, 2]
31
+
32
+ # Calculate the frustum half-extents at the near and far planes
33
+ near_x = near * (cx / fx)
34
+ near_y = near * (cy / fy)
35
+ far_x = far * (cx / fx)
36
+ far_y = far * (cy / fy)
37
+
38
+ # Define frustum vertices in camera space
39
+ near_plane = torch.stack(
40
+ [
41
+ torch.stack([near_x, near_y, near * torch.ones_like(near_x)], dim=-1),
42
+ torch.stack([-near_x, near_y, near * torch.ones_like(near_x)], dim=-1),
43
+ torch.stack([-near_x, -near_y, near * torch.ones_like(near_x)], dim=-1),
44
+ torch.stack([near_x, -near_y, near * torch.ones_like(near_x)], dim=-1),
45
+ ],
46
+ dim=1,
47
+ )
48
+
49
+ far_plane = torch.stack(
50
+ [
51
+ torch.stack([far_x, far_y, far * torch.ones_like(far_x)], dim=-1),
52
+ torch.stack([-far_x, far_y, far * torch.ones_like(far_x)], dim=-1),
53
+ torch.stack([-far_x, -far_y, far * torch.ones_like(far_x)], dim=-1),
54
+ torch.stack([far_x, -far_y, far * torch.ones_like(far_x)], dim=-1),
55
+ ],
56
+ dim=1,
57
+ )
58
+
59
+ return torch.cat([near_plane, far_plane], dim=1)
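As a quick numerical sanity check of the construction above (intrinsics chosen by hand, not from any dataset): with fx = fy = cx = cy = 100, the near-plane corners sit at near * (±cx/fx, ±cy/fy, 1) = (±1, ±1, 1) for near = 1, and the far-plane corners are the same directions scaled to far = 10.

import torch

# Hypothetical pinhole intrinsics (roughly a 90-degree FOV for a 200x200 image).
K = torch.tensor([[[100.0, 0.0, 100.0],
                   [0.0, 100.0, 100.0],
                   [0.0, 0.0, 1.0]]])

frustum = create_frustum_from_intrinsics(K, near=1.0, far=10.0)
print(frustum.shape)  # torch.Size([1, 8, 3])
print(frustum[0, 0])  # tensor([1., 1., 1.])    -> first near-plane corner
print(frustum[0, 4])  # tensor([10., 10., 10.]) -> corresponding far-plane corner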
60
+
61
+
62
+ def _frustum_to_triangles(frustum: torch.Tensor) -> torch.Tensor:
63
+ """
64
+ Convert frustum to triangles.
65
+
66
+ Args:
67
+ frustum (torch.Tensor): Bx8x3 batch of frustum points.
68
+
69
+ Returns:
70
+ frustum_triangles (torch.Tensor): Bx12x3x3 batch of frustum triangles (12 triangles per frustum).
71
+ """
72
+
73
+ triangle_inds = torch.tensor(
74
+ [
75
+ [0, 1, 2],
76
+ [0, 2, 3],
77
+ [0, 3, 7],
78
+ [0, 7, 4],
79
+ [1, 2, 6],
80
+ [1, 6, 5],
81
+ [1, 4, 5],
82
+ [1, 0, 4],
83
+ [2, 6, 7],
84
+ [2, 3, 7],
85
+ [6, 7, 4],
86
+ [6, 5, 4],
87
+ ]
88
+ )
89
+ frustum_triangles = frustum[:, triangle_inds]
90
+ return frustum_triangles
91
+
92
+
93
+ def segment_triangle_intersection_check(
94
+ start_points: torch.Tensor,
95
+ end_points: torch.Tensor,
96
+ triangles: torch.Tensor,
97
+ ) -> torch.Tensor:
98
+ """
99
+ Check if segments (lines with starting and end point) intersect triangles in 3D using the
100
+ Moller-Trumbore algorithm.
101
+
102
+ Args:
103
+ start_points (torch.Tensor): Bx3 Starting points of the segment.
104
+ end_points (torch.Tensor): Bx3 End points of the segment.
105
+ triangles (torch.Tensor): Bx3x3 Vertices of the triangles.
106
+
107
+ Returns:
108
+ intersects (torch.Tensor): B Boolean tensor indicating if each ray intersects its
109
+ corresponding triangle.
110
+ """
111
+ vertex0 = triangles[:, 0, :]
112
+ vertex1 = triangles[:, 1, :]
113
+ vertex2 = triangles[:, 2, :]
114
+ edge1 = vertex1 - vertex0
115
+ edge2 = vertex2 - vertex0
116
+ ray_vectors = end_points - start_points
117
+ max_lengths = torch.norm(ray_vectors, dim=1)
118
+ ray_vectors = ray_vectors / max_lengths[:, None]
119
+ h = torch.cross(ray_vectors, edge2, dim=1)
120
+ a = (edge1 * h).sum(dim=1)
121
+
122
+ epsilon = 1e-6
123
+ mask = torch.abs(a) > epsilon
124
+ f = torch.zeros_like(a)
125
+ f[mask] = 1.0 / a[mask]
126
+
127
+ s = start_points - vertex0
128
+ u = f * (s * h).sum(dim=1)
129
+ q = torch.cross(s, edge1, dim=1)
130
+ v = f * (ray_vectors * q).sum(dim=1)
131
+
132
+ t = f * (edge2 * q).sum(dim=1)
133
+
134
+ # Check conditions
135
+ intersects = (
136
+ (u >= 0)
137
+ & (u <= 1)
138
+ & (v >= 0)
139
+ & (u + v <= 1)
140
+ & (t >= epsilon)
141
+ & (t <= max_lengths)
142
+ )
143
+
144
+ return intersects
145
+
146
+
147
+ def triangle_intersection_check(
148
+ triangles1: torch.Tensor,
149
+ triangles2: torch.Tensor,
150
+ ) -> torch.Tensor:
151
+ """
152
+ Check if two triangles intersect.
153
+
154
+ Args:
155
+ triangles1 (torch.Tensor): Bx3x3 Vertices of the first batch of triangles.
156
+ triangles2 (torch.Tensor): Bx3x3 Vertices of the second batch of triangles.
157
+
158
+ Returns:
159
+ triangle_intersection (torch.Tensor): B Boolean tensor indicating if triangles intersect.
160
+ """
161
+ n = triangles1.shape[1]
162
+ start_points1 = rearrange(triangles1, "B N C -> (B N) C")
163
+ end_points1 = rearrange(
164
+ triangles1[:, torch.arange(1, n + 1) % n], "B N C -> (B N) C"
165
+ )
166
+
167
+ start_points2 = rearrange(triangles2, "B N C -> (B N) C")
168
+ end_points2 = rearrange(
169
+ triangles2[:, torch.arange(1, n + 1) % n], "B N C -> (B N) C"
170
+ )
171
+ intersection_1_2 = segment_triangle_intersection_check(
172
+ start_points1, end_points1, repeat(triangles2, "B N C -> (B N2) N C", N2=3)
173
+ )
174
+ intersection_2_1 = segment_triangle_intersection_check(
175
+ start_points2, end_points2, repeat(triangles1, "B N C -> (B N2) N C", N2=3)
176
+ )
177
+ triangle_intersection = torch.any(
178
+ rearrange(intersection_1_2, "(B N N2) -> B (N N2)", B=triangles1.shape[0], N=n),
179
+ dim=1,
180
+ ) | torch.any(
181
+ rearrange(intersection_2_1, "(B N N2) -> B (N N2)", B=triangles1.shape[0], N=n),
182
+ dim=1,
183
+ )
184
+ return triangle_intersection
185
+
186
+
187
+ def frustum_intersection_check(
188
+ frustums: torch.Tensor,
189
+ check_inside: bool = True,
190
+ chunk_size: int = 500,
191
+ device: str | None = None,
192
+ ) -> torch.Tensor:
193
+ """
194
+ Check if any pair of the frustums intersect with each other.
195
+
196
+ Args:
197
+ frustums (torch.Tensor): Bx8x3 batch of frustum points.
198
+ check_inside (bool): If True, also checks if one frustum is inside another.
199
+ Defaults to True.
200
+ chunk_size (int): Number of frustums processed per chunk.
201
+ Defaults to 500.
202
+ device (Optional[str]): Device to store the exhaustive frustum intersection matrix on.
203
+ Defaults to None.
204
+
205
+ Returns:
206
+ frustum_intersection (torch.Tensor): BxB tensor of Booleans indicating if any pair
207
+ of frustums intersect with each other.
208
+ """
209
+ B = frustums.shape[0]
210
+ if device is None:
211
+ device = frustums.device
212
+ frustum_triangles = _frustum_to_triangles(frustums)
213
+ T = frustum_triangles.shape[1]
214
+
215
+ # Perform frustum in frustum check if required
216
+ if check_inside:
217
+ frustum_intersection = frustums_in_frustum_check(
218
+ frustums=frustums, chunk_size=chunk_size, device=device
219
+ )
220
+ else:
221
+ frustum_intersection = torch.zeros((B, B), dtype=torch.bool, device=device)
222
+
223
+ # Check triangle intersections in chunks
224
+ for i in tqdm(range(0, B, chunk_size), desc="Checking triangle intersections"):
225
+ i_end = min(i + chunk_size, B)
226
+ chunk_i_size = i_end - i
227
+
228
+ for j in range(0, B, chunk_size):
229
+ j_end = min(j + chunk_size, B)
230
+ chunk_j_size = j_end - j
231
+
232
+ # Process all triangle pairs between the two chunks in a vectorized way
233
+ triangles_i = frustum_triangles[i:i_end] # [chunk_i, T, 3, 3]
234
+ triangles_j = frustum_triangles[j:j_end] # [chunk_j, T, 3, 3]
235
+
236
+ # Reshape to process all triangle pairs at once
237
+ tri_i = triangles_i.reshape(chunk_i_size * T, 3, 3)
238
+ tri_j = triangles_j.reshape(chunk_j_size * T, 3, 3)
239
+
240
+ # Expand for all pairs - explicitly specify dimensions instead of using ...
241
+ tri_i_exp = repeat(tri_i, "bt i j -> (bt bj_t) i j", bj_t=chunk_j_size * T)
242
+ tri_j_exp = repeat(tri_j, "bt i j -> (bi_t bt) i j", bi_t=chunk_i_size * T)
243
+
244
+ # Check intersection
245
+ batch_intersect = triangle_intersection_check(tri_i_exp, tri_j_exp)
246
+
247
+ # Reshape and check if any triangle pair intersects
248
+ batch_intersect = batch_intersect.reshape(chunk_i_size, T, chunk_j_size, T)
249
+ batch_intersect = batch_intersect.any(dim=(1, 3))
250
+
251
+ # Update result
252
+ frustum_intersection[i:i_end, j:j_end] |= batch_intersect.to(device)
253
+
254
+ return frustum_intersection
255
+
256
+
257
+ def ray_triangle_intersection_check(
258
+ ray_origins: torch.Tensor,
259
+ ray_vectors: torch.Tensor,
260
+ triangles: torch.Tensor,
261
+ max_lengths: torch.Tensor | None = None,
262
+ ) -> torch.Tensor:
263
+ """
264
+ Check if rays intersect triangles in 3D using the Moller-Trumbore algorithm, considering the
265
+ finite length of rays.
266
+
267
+ Args:
268
+ ray_origins (torch.Tensor): Bx3 Origins of the rays.
269
+ ray_vectors (torch.Tensor): Bx3 Direction vectors of the rays.
270
+ triangles (torch.Tensor): Bx3x3 Vertices of the triangles.
271
+ max_lengths (Optional[torch.Tensor]): B Maximum lengths of the rays.
272
+
273
+ Returns:
274
+ intersects (torch.Tensor): B Boolean tensor indicating if each ray intersects its
275
+ corresponding triangle.
276
+ """
277
+ vertex0 = triangles[:, 0, :]
278
+ vertex1 = triangles[:, 1, :]
279
+ vertex2 = triangles[:, 2, :]
280
+ edge1 = vertex1 - vertex0
281
+ edge2 = vertex2 - vertex0
282
+ h = torch.cross(ray_vectors, edge2, dim=1)
283
+ a = (edge1 * h).sum(dim=1)
284
+
285
+ epsilon = 1e-6
286
+ mask = torch.abs(a) > epsilon
287
+ f = torch.zeros_like(a)
288
+ f[mask] = 1.0 / a[mask]
289
+
290
+ s = ray_origins - vertex0
291
+ u = f * (s * h).sum(dim=1)
292
+ q = torch.cross(s, edge1, dim=1)
293
+ v = f * (ray_vectors * q).sum(dim=1)
294
+
295
+ t = f * (edge2 * q).sum(dim=1)
296
+
297
+ # Check conditions
298
+ intersects = (u >= 0) & (u <= 1) & (v >= 0) & (u + v <= 1) & (t >= epsilon)
299
+ if max_lengths is not None:
300
+ intersects &= t <= max_lengths
301
+
302
+ return intersects
303
+
304
+
305
+ #### Checks for frustums
306
+ def _frustum_to_planes(frustums: torch.Tensor) -> torch.Tensor:
307
+ r"""
308
+ Converts frustum parameters to plane representation.
309
+
310
+ Args:
311
+ frustums (torch.Tensor): Bx8x3 batch of frustum points following the order:
312
+ 5 ---------- 4
313
+ |\ /|
314
+ 6 \ / 7
315
+ \ 1 ---- 0 /
316
+ \| |/
317
+ 2 ---- 3
318
+
319
+ Returns:
320
+ planes (torch.Tensor): Bx6x4 where 6 represents the six frustum planes and
321
+ 4 represents plane parameters [a, b, c, d].
322
+ """
323
+ planes = []
324
+ for inds in [[0, 1, 3], [1, 6, 2], [0, 3, 7], [2, 6, 3], [0, 5, 1], [6, 5, 4]]:
325
+ normal = torch.cross(
326
+ frustums[:, inds[1]] - frustums[:, inds[0]],
327
+ frustums[:, inds[2]] - frustums[:, inds[0]],
328
+ dim=1,
329
+ )
330
+ normal = normal / torch.norm(normal, dim=1, keepdim=True)
331
+ d = -torch.sum(normal * frustums[:, inds[0]], dim=1, keepdim=True)
332
+ planes.append(torch.cat([normal, d], -1))
333
+ return torch.stack(planes, 1)
334
+
335
+
336
+ def points_in_frustum_check(
337
+ frustums: torch.Tensor,
338
+ points: torch.Tensor,
339
+ chunk_size: int | None = None,
340
+ device: str | None = None,
341
+ ):
342
+ """
343
+ Check if points are inside frustums.
344
+
345
+ Args:
346
+ frustums (torch.Tensor): Bx8x3 batch of frustum points.
347
+ points (torch.Tensor): BxNx3 batch of points.
348
+ chunk_size (Optional[int]): If set, process frustums in chunks of this size to limit memory usage. Defaults to None.
349
+ device (Optional[str]): Device to perform computation on. Defaults to None.
350
+
351
+ Returns:
352
+ inside (torch.Tensor): BxN batch of Booleans indicating if points are inside frustums.
353
+ """
354
+ if device is None:
355
+ device = frustums.device
356
+
357
+ if chunk_size is not None:
358
+ # Split computation into chunks to avoid OOM errors for large batch sizes
359
+ point_plane_direction = []
360
+ for chunk_idx in range(0, frustums.shape[0], chunk_size):
361
+ chunk_frustum_planes = _frustum_to_planes(
362
+ frustums[chunk_idx : chunk_idx + chunk_size]
363
+ )
364
+ # Bx6x4 tensor of plane parameters [a, b, c, d]
365
+ chunk_points = points[chunk_idx : chunk_idx + chunk_size]
366
+ chunk_point_plane_direction = torch.einsum(
367
+ "bij,bnj->bni", (chunk_frustum_planes[:, :, :-1], chunk_points)
368
+ ) + repeat(
369
+ chunk_frustum_planes[:, :, -1], "B P -> B N P", N=chunk_points.shape[1]
370
+ ) # BxNx6 tensor of signed point-to-plane distances
371
+ point_plane_direction.append(chunk_point_plane_direction.to(device))
372
+ point_plane_direction = torch.cat(point_plane_direction)
373
+ else:
374
+ # Convert frustums to planes
375
+ frustum_planes = _frustum_to_planes(
376
+ frustums
377
+ ) # Bx6x4 tensor of plane parameters [a, b, c, d]
378
+ # Compute dot product between each point and each plane
379
+ point_plane_direction = torch.einsum(
380
+ "bij,bnj->bni", (frustum_planes[:, :, :-1], points)
381
+ ) + repeat(frustum_planes[:, :, -1], "B P -> B N P", N=points.shape[1]).to(
382
+ device
383
+ ) # BxNx6 tensor of signed point-to-plane distances
384
+
385
+ inside = (point_plane_direction >= 0).all(-1)
386
+ return inside
387
+
388
+
389
+ def frustums_in_frustum_check(
390
+ frustums: torch.Tensor,
391
+ chunk_size: int,
392
+ device: str | None = None,
393
+ use_double_chunking: bool = True,
394
+ ):
395
+ """
396
+ Check if frustums are contained within other frustums.
397
+
398
+ Args:
399
+ frustums (torch.Tensor): Bx8x3 batch of frustum points.
400
+ chunk_size (int): Number of frustums processed per chunk,
401
+ used to limit peak memory usage.
402
+ device (Optional[str]): Device to store the exhaustive frustum containment matrix on.
403
+ Defaults to None.
404
+ use_double_chunking (bool): If True, use double chunking to avoid OOM errors.
405
+ Defaults to True.
406
+
407
+ Returns:
408
+ frustum_contained (torch.Tensor): BxB batch of Booleans indicating if frustums are inside
409
+ other frustums.
410
+ """
411
+ B = frustums.shape[0]
412
+ if device is None:
413
+ device = frustums.device
414
+
415
+ if use_double_chunking:
416
+ frustum_contained = torch.zeros((B, B), dtype=torch.bool, device=device)
417
+ # Check if frustums are containing each other by processing in chunks
418
+ for i in tqdm(range(0, B, chunk_size), desc="Checking frustum containment"):
419
+ i_end = min(i + chunk_size, B)
420
+ chunk_i_size = i_end - i
421
+
422
+ for j in range(0, B, chunk_size):
423
+ j_end = min(j + chunk_size, B)
424
+ chunk_j_size = j_end - j
425
+
426
+ # Process a chunk of frustums against another chunk
427
+ frustums_i = frustums[i:i_end]
428
+ frustums_j_vertices = frustums[
429
+ j:j_end, :1
430
+ ] # Just need one vertex to check containment
431
+
432
+ # Perform points in frustum check
433
+ contained = rearrange(
434
+ points_in_frustum_check(
435
+ repeat(frustums_i, "B ... -> (B B2) ...", B2=chunk_j_size),
436
+ repeat(
437
+ frustums_j_vertices, "B ... -> (B2 B) ...", B2=chunk_i_size
438
+ ),
439
+ )[:, 0],
440
+ "(B B2) -> B B2",
441
+ B=chunk_i_size,
442
+ ).to(device)
443
+
444
+ # Map results back to the full matrix
445
+ frustum_contained[i:i_end, j:j_end] |= contained
446
+ frustum_contained[j:j_end, i:i_end] |= contained.transpose(
447
+ 0, 1
448
+ ) # Symmetric relation
449
+ else:
450
+ # Perform points in frustum check with a single chunked loop
451
+ frustum_contained = rearrange(
452
+ points_in_frustum_check(
453
+ repeat(frustums, "B ... -> (B B2) ...", B2=B),
454
+ repeat(frustums[:, :1], "B ... -> (B2 B) ...", B2=B),
455
+ chunk_size=chunk_size,
456
+ )[:, 0],
457
+ "(B B2) -> B B2",
458
+ B=B,
459
+ ).to(device)
460
+ frustum_contained = frustum_contained | frustum_contained.T
461
+
462
+ return frustum_contained
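Putting the pieces of this module together, a pairwise-overlap sketch (batch size, intrinsics, and near/far values are arbitrary; in a real pipeline the frustum corners would first be transformed into a shared world frame using each camera's pose):

import torch

B = 4
K = torch.eye(3).repeat(B, 1, 1)
K[:, 0, 0] = 500.0  # fx
K[:, 1, 1] = 500.0  # fy
K[:, 0, 2] = 320.0  # cx
K[:, 1, 2] = 240.0  # cy

frustums = create_frustum_from_intrinsics(K, near=0.1, far=5.0)  # B x 8 x 3, camera space

overlap = frustum_intersection_check(frustums, check_inside=True, chunk_size=2)
print(overlap.shape)       # torch.Size([4, 4]) Boolean overlap matrix
print(overlap.diagonal())  # identical, untransformed frustums trivially overlap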
mapanything/utils/wai/io.py ADDED
@@ -0,0 +1,1373 @@
1
+ """
2
+ This utils script contains a port of the wai-core io methods for MapAnything.
3
+ """
4
+
5
+ import gzip
6
+ import io
7
+ import json
8
+ import logging
9
+ import os
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Any, Callable, cast, IO, Literal, overload
13
+
14
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
15
+ import cv2
16
+ import numpy as np
17
+ import torch
18
+ import trimesh
19
+ import yaml
20
+ from PIL import Image, PngImagePlugin
21
+ from plyfile import PlyData, PlyElement
22
+ from safetensors.torch import load_file as load_sft, save_file as save_sft
23
+ from torchvision.io import decode_image
24
+ from yaml import CLoader
25
+
26
+ from mapanything.utils.wai.ops import (
27
+ to_numpy,
28
+ )
29
+ from mapanything.utils.wai.semantics import (
30
+ apply_id_to_color_mapping,
31
+ INVALID_ID,
32
+ load_semantic_color_mapping,
33
+ )
34
+
35
+ # Try to use orjson for faster JSON processing
36
+ try:
37
+ import orjson
38
+ except ImportError:
39
+ orjson = None
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
+ @overload
45
+ def _load_readable(
46
+ fname: Path | str, load_as_string: Literal[True], **kwargs
47
+ ) -> str: ...
48
+ @overload
49
+ def _load_readable(
50
+ fname: Path | str, load_as_string: Literal[False] = False, **kwargs
51
+ ) -> dict: ...
52
+
53
+
54
+ def _load_readable(
55
+ fname: Path | str,
56
+ load_as_string: bool = False,
57
+ **kwargs,
58
+ ) -> Any | str:
59
+ """
60
+ Loads data from a human-readable file and will try to parse JSON or YAML files as a dict, list,
61
+ int, float, str, bool, or None object. Can optionally return the file contents as a string.
62
+
63
+ Args:
64
+ fname (str or Path): The filename to load data from.
65
+ load_as_string (bool, optional): Whether to return the loaded data as a string.
66
+ Defaults to False.
67
+
68
+ Returns:
69
+ The loaded data, which can be any type of object that can be represented in JSON or YAML.
70
+
71
+ Raises:
72
+ NotImplementedError: If the file suffix is not supported (i.e., not .json, .yaml, or .yml).
73
+ """
74
+ if load_as_string:
75
+ return _load_readable_string(fname, **kwargs)
76
+ else:
77
+ return _load_readable_structured(fname, **kwargs)
78
+
79
+
80
+ def _load_readable_structured(
81
+ fname: Path | str,
82
+ **kwargs,
83
+ ) -> Any:
84
+ """
85
+ Loads data from a human-readable file and will try to parse JSON or YAML files as a dict, list,
86
+ int, float, str, bool, or None object.
87
+
88
+ Args:
89
+ fname (str or Path): The filename to load data from.
90
+
91
+ Returns:
92
+ The loaded data, which can be any type of object that can be represented in JSON or YAML.
93
+
94
+ Raises:
95
+ NotImplementedError: If the file suffix is not supported (i.e., not .json, .yaml, or .yml).
96
+ """
97
+ fname = Path(fname)
98
+ if not fname.exists():
99
+ raise FileNotFoundError(f"File does not exist: {fname}")
100
+
101
+ if fname.suffix == ".json":
102
+ # Use binary mode for JSON files
103
+ with open(fname, mode="rb") as f:
104
+ # Use orjson if available, otherwise use standard JSON
105
+ if orjson:
106
+ return orjson.loads(f.read())
107
+ return json.load(f)
108
+
109
+ if fname.suffix in [".yaml", ".yml"]:
110
+ # Use text mode with UTF-8 encoding for YAML files
111
+ with open(fname, mode="r", encoding="utf-8") as f:
112
+ return yaml.load(f, Loader=CLoader)
113
+
114
+ raise NotImplementedError(f"Readable format not supported: {fname.suffix}")
115
+
116
+
117
+ def _load_readable_string(
118
+ fname: Path | str,
119
+ **kwargs,
120
+ ) -> str:
121
+ """
122
+ Loads data from a human-readable file as a string.
123
+
124
+ Args:
125
+ fname (str or Path): The filename to load data from.
126
+
127
+ Returns:
128
+ The file's contents, as a string.
129
+ """
130
+ fname = Path(fname)
131
+ if not fname.exists():
132
+ raise FileNotFoundError(f"File does not exist: {fname}")
133
+
134
+ with open(fname, mode="r", encoding="utf-8") as f:
135
+ contents = f.read()
136
+
137
+ return contents
138
+
139
+
140
+ def _store_readable(
141
+ fname: Path | str,
142
+ data: Any,
143
+ **kwargs,
144
+ ) -> int:
145
+ """
146
+ Stores data in a human-readable file (JSON or YAML).
147
+
148
+ Args:
149
+ fname (str or Path): The filename to store data in.
150
+ data: The data to store, which can be any type of object that can be represented in JSON or YAML.
151
+
152
+ Returns:
153
+ The number of bytes written to the file.
154
+
155
+ Raises:
156
+ NotImplementedError: If the file suffix is not supported (i.e., not .json, .yaml, or .yml).
157
+ """
158
+ fname = Path(fname)
159
+
160
+ # Create parent directory if it doesn't exist
161
+ os.makedirs(fname.parent, exist_ok=True)
162
+
163
+ if fname.suffix == ".json":
164
+ if orjson:
165
+ # Define the operation for orjson
166
+ with open(fname, mode="wb") as f:
167
+ return f.write(orjson.dumps(data, option=orjson.OPT_INDENT_2))
168
+ else:
169
+ # Define the operation for standard json
170
+ with open(fname, mode="w", encoding="utf-8") as f:
171
+ json.dump(data, f, indent=2)
172
+ return f.tell()
173
+
174
+ elif fname.suffix in [".yaml", ".yml"]:
175
+ # Define the operation for YAML files
176
+ with open(fname, mode="w", encoding="utf-8") as f:
177
+ yaml.dump(data, f)
178
+ return f.tell()
179
+ else:
180
+ raise NotImplementedError(f"Writable format not supported: {fname.suffix}")
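A minimal round-trip sketch for the readable-format helpers above (the temporary directory and contents are purely illustrative):

import tempfile
from pathlib import Path

meta = {"dataset_name": "demo", "scale": 0.5, "frames": [{"frame_name": "000000"}]}

with tempfile.TemporaryDirectory() as tmp:
    json_path = Path(tmp) / "scene_meta.json"
    n_bytes = _store_readable(json_path, meta)            # JSON via orjson if available
    loaded = _load_readable(json_path)                    # parsed back into a dict
    raw = _load_readable(json_path, load_as_string=True)  # same file as a raw string
    assert loaded == meta and isinstance(raw, str) and n_bytes > 0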
181
+
182
+
183
+ def get_processing_state(scene_root: Path | str) -> dict:
184
+ """
185
+ Retrieves the processing state of a scene.
186
+
187
+ Args:
188
+ scene_root (Path or str): The root directory of the scene.
189
+
190
+ Returns:
191
+ dict: A dictionary containing the processing state of the scene.
192
+ If no processing log exists, or reading it fails, an empty
193
+ dictionary is returned.
194
+ """
195
+ process_log_path = Path(scene_root) / "_process_log.json"
196
+
197
+ try:
198
+ return _load_readable_structured(process_log_path)
199
+ except FileNotFoundError:
200
+ logger.debug(f"Log file not found, returning empty dict: {process_log_path}")
201
+ return {}
202
+ except Exception:
203
+ logger.error(
204
+ f"Could not parse, returning empty dict: {process_log_path}", exc_info=True
205
+ )
206
+ return {}
207
+
208
+
209
+ def _write_exr(
210
+ fname: str | Path,
211
+ data: np.ndarray | torch.Tensor,
212
+ params: list | None = None,
213
+ **kwargs,
214
+ ) -> bool:
215
+ """
216
+ Writes an image as an EXR file using OpenCV.
217
+
218
+ Args:
219
+ fname (str or Path): The filename to save the image to.
220
+ data (numpy.ndarray, torch.Tensor): The image data to save. Must be a 2D or 3D array.
221
+ params (list, optional): A list of parameters to pass to OpenCV's imwrite function.
222
+ Defaults to None, which uses 32-bit with zip compression.
223
+
224
+ Returns:
225
+ bool: True if the image was saved successfully, False otherwise.
226
+
227
+ Raises:
228
+ ValueError: If the input data has less than two or more than three dimensions.
229
+
230
+ Notes:
231
+ Only 32-bit float (CV_32F) images can be saved.
232
+ For comparison of different compression methods, see P1732924327.
233
+ """
234
+ if Path(fname).suffix != ".exr":
235
+ raise ValueError(
236
+ f"Only filenames with suffix .exr allowed but received: {fname}"
237
+ )
238
+
239
+ ## Note: only 32-bit float (CV_32F) images can be saved
240
+ data_np = to_numpy(data, dtype=np.float32)
241
+ if (data_np.ndim > 3) or (data_np.ndim < 2):
242
+ raise ValueError(
243
+ f"Image needs to contain two or three dims but received: {data_np.shape}"
244
+ )
245
+
246
+ return cv2.imwrite(str(fname), data_np, params if params else [])
247
+
248
+
249
+ @overload
250
+ def _read_exr(fname: str | Path, fmt: Literal["np"], **kwargs) -> np.ndarray: ...
251
+ @overload
252
+ def _read_exr(fname: str | Path, fmt: Literal["PIL"], **kwargs) -> Image.Image: ...
253
+ @overload
254
+ def _read_exr(
255
+ fname: str | Path, fmt: Literal["torch"] = "torch", **kwargs
256
+ ) -> torch.Tensor: ...
257
+
258
+
259
+ def _read_exr(
260
+ fname: str | Path, fmt: Literal["np", "PIL", "torch"] = "torch", **kwargs
261
+ ) -> np.ndarray | torch.Tensor | Image.Image:
262
+ """
263
+ Reads an EXR image file using OpenCV.
264
+
265
+ Args:
266
+ fname (str or Path): The filename of the EXR image to read.
267
+ fmt (str): The format of the output data. Can be one of:
268
+ - "torch": Returns a PyTorch tensor.
269
+ - "np": Returns a NumPy array.
270
+ - "PIL": Returns a PIL Image object.
271
+ Defaults to "torch".
272
+
273
+ Returns:
274
+ The EXR image data in the specified output format.
275
+
276
+ Raises:
277
+ NotImplementedError: If the specified output format is not supported.
278
+ ValueError: If data shape is not supported, e.g. multi-channel PIL float images.
279
+
280
+ Notes:
281
+ The EXR image is read in its original format, without any conversion or rescaling.
282
+ """
283
+ data = cv2.imread(str(fname), cv2.IMREAD_UNCHANGED)
284
+ if data is None:
285
+ raise FileNotFoundError(f"Failed to read EXR file: {fname}")
286
+ if fmt == "torch":
287
+ # Convert to PyTorch tensor with float32 dtype
288
+ data = torch.from_numpy(data).float()
289
+ elif fmt == "np":
290
+ # Convert to NumPy array with float32 dtype
291
+ data = np.array(data, dtype=np.float32)
292
+ elif fmt == "PIL":
293
+ if data.ndim != 2:
294
+ raise ValueError("PIL does not support multi-channel EXR images")
295
+
296
+ # Convert to PIL Image object
297
+ data = Image.fromarray(data)
298
+ else:
299
+ raise NotImplementedError(f"fmt not supported: {fmt}")
300
+ return data
301
+
302
+
303
+ @overload
304
+ def _load_image(
305
+ fname: str | Path,
306
+ fmt: Literal["np"],
307
+ resize: tuple[int, int] | None = None,
308
+ **kwargs,
309
+ ) -> np.ndarray: ...
310
+ @overload
311
+ def _load_image(
312
+ fname: str | Path,
313
+ fmt: Literal["pil"],
314
+ resize: tuple[int, int] | None = None,
315
+ **kwargs,
316
+ ) -> Image.Image: ...
317
+ @overload
318
+ def _load_image(
319
+ fname: str | Path,
320
+ fmt: Literal["torch"] = "torch",
321
+ resize: tuple[int, int] | None = None,
322
+ **kwargs,
323
+ ) -> torch.Tensor: ...
324
+
325
+
326
+ def _load_image(
327
+ fname: str | Path,
328
+ fmt: Literal["np", "pil", "torch"] = "torch",
329
+ resize: tuple[int, int] | None = None,
330
+ **kwargs,
331
+ ) -> np.ndarray | torch.Tensor | Image.Image:
332
+ """
333
+ Loads an image from a file.
334
+
335
+ Args:
336
+ fname (str or Path): The filename to load the image from.
337
+ fmt (str): The format of the output data. Can be one of:
338
+ - "torch": Returns a PyTorch tensor with shape (C, H, W).
339
+ - "np": Returns a NumPy array with shape (H, W, C).
340
+ - "pil": Returns a PIL Image object.
341
+ Defaults to "torch".
342
+ resize (tuple, optional): A tuple of two integers representing the desired width and height of the image.
343
+ If None, the image is not resized. Defaults to None.
344
+
345
+ Returns:
346
+ The loaded image in the specified output format.
347
+
348
+ Raises:
349
+ NotImplementedError: If the specified output format is not supported.
350
+
351
+ Notes:
352
+ This function loads non-binary images in RGB mode and normalizes pixel values to the range [0, 1].
353
+ """
354
+
355
+ # Fastest way to load into torch tensor
356
+ if resize is None and fmt == "torch":
357
+ return decode_image(str(fname)).float() / 255.0
358
+
359
+ # Load using PIL
360
+ with open(fname, "rb") as f:
361
+ pil_image = Image.open(f)
362
+ pil_image.load()
363
+
364
+ if pil_image.mode not in ["RGB", "RGBA"]:
365
+ raise OSError(
366
+ f"Expected a RGB or RGBA image in {fname}, but instead found an image with mode {pil_image.mode}"
367
+ )
368
+
369
+ if resize is not None:
370
+ pil_image = pil_image.resize(resize)
371
+
372
+ if fmt == "torch":
373
+ return (
374
+ torch.from_numpy(np.array(pil_image)).permute(2, 0, 1).float() / 255.0
375
+ )
376
+ elif fmt == "np":
377
+ return np.array(pil_image, dtype=np.float32) / 255.0
378
+ elif fmt == "pil":
379
+ return pil_image
380
+ else:
381
+ raise NotImplementedError(f"Image format not supported: {fmt}")
382
+
383
+
384
+ def _store_image(
385
+ fname: str | Path, img_data: np.ndarray | torch.Tensor | Image.Image, **kwargs
386
+ ) -> None:
387
+ """
388
+ Stores an image in a file.
389
+
390
+ Args:
391
+ fname (str or Path): The filename to store the image in.
392
+ img_data (numpy.ndarray, torch.tensor or PIL.Image.Image): The image data to store.
393
+
394
+ Notes (for numpy.ndarray or torch.tensor inputs):
395
+ This function assumes that the input image data is in the range [0, 1], and has shape
396
+ (H, W, C), or (C, H, W) for PyTorch tensors, with C being 3 or 4.
397
+ It converts the image data to uint8 format and saves it as a compressed image file.
398
+ """
399
+ if isinstance(img_data, torch.Tensor):
400
+ if img_data.ndim != 3:
401
+ raise ValueError(f"Tensor needs to be 3D but received: {img_data.shape=}")
402
+
403
+ if img_data.shape[0] in [3, 4]:
404
+ # Convert to HWC format expected by pillow `Image.save` below
405
+ img_data = img_data.permute(1, 2, 0)
406
+
407
+ img_data = img_data.contiguous()
408
+
409
+ if isinstance(img_data, (np.ndarray, torch.Tensor)):
410
+ if img_data.shape[-1] not in [3, 4]:
411
+ raise ValueError(
412
+ f"Image must have 3 or 4 channels, but received: {img_data.shape=}"
413
+ )
414
+
415
+ img_data_np = to_numpy(img_data, dtype=np.float32)
416
+ img_data = Image.fromarray((255 * img_data_np).round().astype(np.uint8))
417
+
418
+ with open(fname, "wb") as f:
419
+ pil_kwargs = {
420
+ # Make PNGs faster to save using minimal compression
421
+ "optimize": False,
422
+ "compress_level": 1,
423
+ # Higher JPEG image quality
424
+ "quality": "high",
425
+ }
426
+ pil_kwargs.update(kwargs)
427
+ img_data.save(cast(IO[bytes], f), **pil_kwargs)
428
+
429
+
430
+ def _load_binary_mask(
431
+ fname: str | Path,
432
+ fmt: str = "torch",
433
+ resize: tuple[int, int] | None = None,
434
+ **kwargs,
435
+ ) -> np.ndarray | torch.Tensor | Image.Image:
436
+ """
437
+ Loads a binary image from a file.
438
+
439
+ Args:
440
+ fname (str or Path): The filename to load the binary image from.
441
+ fmt (str): The format of the output data. Can be one of:
442
+ - "torch": Returns a PyTorch Boolean tensor with shape H x W.
443
+ - "np": Returns a NumPy Boolean array with shape H x W.
444
+ - "pil": Returns a PIL Image object.
445
+ Defaults to "torch".
446
+ resize (tuple, optional): A tuple of two integers representing the desired width and height of the binary image.
447
+ If None, the image is not resized. Defaults to None.
448
+
449
+ Returns:
450
+ The loaded binary image in the specified output format.
451
+
452
+ Raises:
453
+ NotImplementedError: If the specified output format is not supported.
454
+ """
455
+ if fmt not in ["pil", "np", "torch"]:
456
+ raise NotImplementedError(f"Image format not supported: {fmt}")
457
+
458
+ with open(fname, "rb") as f:
459
+ pil_image = Image.open(f)
460
+ pil_image.load()
461
+
462
+ if pil_image.mode == "L":
463
+ pil_image = pil_image.convert("1")
464
+
465
+ elif pil_image.mode != "1":
466
+ raise OSError(
467
+ f"Expected a binary or grayscale image in {fname}, but instead found an image with mode {pil_image.mode}"
468
+ )
469
+
470
+ if resize is not None:
471
+ pil_image = pil_image.resize(resize)
472
+
473
+ if fmt == "pil":
474
+ return pil_image
475
+
476
+ mask = np.array(pil_image, copy=True)
477
+ return mask if fmt == "np" else torch.from_numpy(mask)
478
+
479
+
480
+ def _store_binary_mask(
481
+ fname: str | Path, img_data: np.ndarray | torch.Tensor | Image.Image, **kwargs
482
+ ) -> None:
483
+ """
484
+ Stores a binary image in a compressed image file.
485
+
486
+ Args:
487
+ fname (str or Path): The filename to store the binary image in.
488
+ img_data (numpy.ndarray, torch.tensor or PIL.Image.Image): The binary image data to store.
489
+ """
490
+ if isinstance(img_data, Image.Image):
491
+ if img_data.mode not in ["1", "L"]:
492
+ raise RuntimeError(
493
+ f'Expected a PIL image with mode "1" or "L", but instead got a PIL image with mode {img_data.mode}'
494
+ )
495
+ elif isinstance(img_data, np.ndarray) or isinstance(img_data, torch.Tensor):
496
+ if len(img_data.squeeze().shape) != 2:
497
+ raise RuntimeError(
498
+ f"Expected a PyTorch tensor or NumPy array with shape (H, W, 1), (1, H, W) or (H, W), but the shape is {img_data.shape}"
499
+ )
500
+ img_data = img_data.squeeze()
501
+ else:
502
+ raise NotImplementedError(f"Input format not supported: {type(img_data)}")
503
+
504
+ if not isinstance(img_data, Image.Image):
505
+ img_data = to_numpy(img_data, dtype=bool)
506
+ img_data = Image.fromarray(img_data)
507
+
508
+ img_data = img_data.convert("1")
509
+ with open(fname, "wb") as f:
510
+ img_data.save(f, compress_level=1, optimize=False)
511
+
512
+
513
+ def _load_sft(
514
+ fname: str | Path,
515
+ fmt: str = "torch",
516
+ **kwargs,
517
+ ) -> torch.Tensor:
518
+ """
519
+ Loads a tensor from a safetensor file.
520
+
521
+ Args:
522
+ fname (str | Path): The filename of the safetensor file to load.
523
+ fmt (str, optional): The format of the output data. Currently only "torch" is supported.
524
+ **kwargs: Additional keyword arguments (unused).
525
+
526
+ Returns:
527
+ torch.Tensor: The loaded tensor.
528
+
529
+ Raises:
530
+ AssertionError: If the file extension is not .sft or if fmt is not "torch".
531
+ """
532
+ assert Path(fname).suffix == ".sft", "Only .sft (safetensor) is supported"
533
+ assert fmt == "torch", "Only torch format is supported for latent"
534
+ out = load_sft(str(fname))
535
+ return out["latent"]
536
+
537
+
538
+ def _store_sft(fname: str | Path, data: torch.Tensor, **kwargs) -> None:
539
+ """
540
+ Stores a tensor to a safetensor file.
541
+
542
+ Args:
543
+ fname (str | Path): The filename to store the latent in.
544
+ data (torch.Tensor): The latent tensor to store.
545
+ **kwargs: Additional keyword arguments (unused).
546
+
547
+ Raises:
548
+ AssertionError: If the file extension is not .sft or if data is not a torch.Tensor.
549
+ """
550
+ assert Path(fname).suffix == ".sft", "Only .sft (safetensor) is supported"
551
+ assert isinstance(data, torch.Tensor)
552
+ save_sft(tensors={"latent": data}, filename=str(fname))
553
+
554
+
555
+ def _store_depth(fname: str | Path, data: np.ndarray | torch.Tensor, **kwargs) -> bool:
556
+ """
557
+ Stores a depth map in an EXR file.
558
+
559
+ Args:
560
+ fname (str or Path): The filename to save the depth map to.
561
+ data (numpy.ndarray, torch.tensor): The depth map to save.
562
+
563
+ Returns:
564
+ bool: True if the depth map was saved successfully, False otherwise.
565
+
566
+ Raises:
567
+ ValueError: If the input data does not have two dimensions after removing singleton dimensions.
568
+ """
569
+ data_np = to_numpy(data, dtype=np.float32)
570
+ data_np = data_np.squeeze() # remove all 1-dim entries
571
+ if data_np.ndim != 2:
572
+ raise ValueError(f"Depth image needs to be 2d, but received: {data_np.shape}")
573
+
574
+ if "params" in kwargs:
575
+ params = kwargs["params"]
576
+ else:
577
+ # use 16-bit with zip compression for depth maps
578
+ params = [
579
+ cv2.IMWRITE_EXR_TYPE,
580
+ cv2.IMWRITE_EXR_TYPE_HALF,
581
+ cv2.IMWRITE_EXR_COMPRESSION,
582
+ cv2.IMWRITE_EXR_COMPRESSION_ZIP,
583
+ ]
584
+
585
+ return _write_exr(fname, data_np, params=params)
586
+
587
+
588
+ def _load_depth(
589
+ fname: str | Path, fmt: str = "torch", **kwargs
590
+ ) -> np.ndarray | torch.Tensor | Image.Image:
591
+ """
592
+ Loads a depth image from an EXR file.
593
+
594
+ Args:
595
+ fname (str or Path): The filename of the EXR file to load.
596
+ fmt (str): The format of the output data. Can be one of:
597
+ - "torch": Returns a PyTorch tensor.
598
+ - "np": Returns a NumPy array.
599
+ - "PIL": Returns a PIL Image object.
600
+ Defaults to "torch".
601
+
602
+ Returns:
603
+ The loaded depth image in the specified output format.
604
+
605
+ Raises:
606
+ ValueError: If the loaded depth image does not have two dimensions.
607
+
608
+ Notes:
609
+ This function assumes that the EXR file contains a single-channel depth image.
610
+ """
611
+ data = _read_exr(fname, fmt)
612
+ if (fmt != "PIL") and (data.ndim != 2):
613
+ raise ValueError(f"Depth image needs to be 2D, but loaded: {data.shape}")
614
+ return data
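A small round-trip sketch for the depth helpers (values are synthetic; note that the default half-float EXR encoding only preserves depth approximately):

import tempfile
from pathlib import Path

import torch

depth = torch.rand(240, 320) * 10.0  # hypothetical metric depth map

with tempfile.TemporaryDirectory() as tmp:
    depth_path = Path(tmp) / "depth_000000.exr"
    assert _store_depth(depth_path, depth)          # half-float, zip-compressed EXR
    reloaded = _load_depth(depth_path, fmt="torch")
    assert reloaded.shape == depth.shape
    # Half precision keeps roughly 3 significant digits, so compare loosely.
    assert torch.allclose(reloaded, depth, rtol=1e-2, atol=1e-2)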
615
+
616
+
617
+ def _store_normals(
618
+ fname: str | Path, data: np.ndarray | torch.Tensor, **kwargs
619
+ ) -> bool:
620
+ """
621
+ Stores a normals image in an EXR file.
622
+
623
+ Args:
624
+ fname (str or Path): The filename to save the normals image to.
625
+ data (numpy.ndarray): The normals image data to save. Will be converted to a 32-bit float array.
626
+
627
+ Returns:
628
+ bool: True if the normals image was saved successfully, False otherwise.
629
+
630
+ Raises:
631
+ ValueError: If the input data has more than three dimensions after removing singleton dimensions.
632
+ ValueError: If the input data does not have exactly three channels.
633
+ ValueError: If the input data is not normalized (i.e., maximum absolute value exceeds 1).
634
+
635
+ Notes:
636
+ This function assumes that the input data is in HWC (height, width, channels) format.
637
+ If the input data is in CHW (channels, height, width) format, it will be automatically transposed to HWC.
638
+ """
639
+ data_np = to_numpy(data, dtype=np.float32)
640
+ data_np = data_np.squeeze() # remove all singleton dimensions
641
+
642
+ if data_np.ndim != 3:
643
+ raise ValueError(
644
+ f"Normals image needs to be 3-dim but received: {data_np.shape}"
645
+ )
646
+
647
+ if (data_np.shape[0] == 3) and (data_np.shape[2] != 3):
648
+ # ensure HWC format
649
+ data_np = data_np.transpose(1, 2, 0)
650
+
651
+ if data_np.shape[2] != 3:
652
+ raise ValueError(
653
+ f"Normals image needs have 3 channels but received: {data_np.shape}"
654
+ )
655
+
656
+ # We want to check that the norm values are either 1 (valid) or 0 (invalid values are 0s)
657
+ norm = np.linalg.norm(data_np, axis=-1)
658
+ is_one = np.isclose(norm, 1.0, atol=1e-3)
659
+ is_zero = np.isclose(norm, 0.0)
660
+ if not np.all([is_one | is_zero]):
661
+ raise ValueError("Normals image must be normalized")
662
+
663
+ return _write_exr(fname, data_np)
664
+
665
+
666
+ def _load_normals(
667
+ fname: str | Path, fmt: str = "torch", **kwargs
668
+ ) -> np.ndarray | torch.Tensor | Image.Image:
669
+ """
670
+ Loads a normals image from an EXR file.
671
+
672
+ Args:
673
+ fname (str or Path): The filename of the EXR file to load.
674
+ fmt (str): The format of the output data. Can be one of:
675
+ - "torch": Returns a PyTorch tensor.
676
+ - "np": Returns a NumPy array.
677
+ - "PIL": Returns a PIL Image object.
678
+ Defaults to "torch".
679
+
680
+ Returns:
681
+ The loaded normals image in the specified output format.
682
+
683
+ Raises:
684
+ ValueError: If the loaded normals image is not a 3-dimensional, 3-channel array.
685
+
686
+ Notes:
687
+ This function assumes that the EXR file contains a 3-channel normals image.
688
+ """
689
+ data = _read_exr(fname, fmt)
690
+
691
+ if data.ndim != 3:
692
+ raise ValueError(f"Normals image needs to be 3-dim but received: {data.shape}")
693
+
694
+ if data.shape[2] != 3:
695
+ raise ValueError(
696
+ f"Normals image needs have 3 channels but received: {data.shape}"
697
+ )
698
+
699
+ return data
700
+
701
+
702
+ def _load_numpy(fname: str | Path, allow_pickle: bool = False, **kwargs) -> np.ndarray:
703
+ """
704
+ Loads a NumPy array from a file.
705
+
706
+ Args:
707
+ fname (str or Path): The filename to load the NumPy array from.
708
+ allow_pickle (bool, optional): Whether to allow pickled objects in the NumPy file.
709
+ Defaults to False.
710
+
711
+ Returns:
712
+ numpy.ndarray: The loaded NumPy array.
713
+
714
+ Raises:
715
+ NotImplementedError: If the file suffix is not supported (i.e., not .npy or .npz).
716
+
717
+ Notes:
718
+ This function supports loading NumPy arrays from .npy and .npz files.
719
+ For .npz files, it assumes that the array is stored under the key "arr_0".
720
+ """
721
+ fname = Path(fname)
722
+ with open(fname, "rb") as fid:
723
+ if fname.suffix == ".npy":
724
+ return np.load(fid, allow_pickle=allow_pickle)
725
+ elif fname.suffix == ".npz":
726
+ return np.load(fid, allow_pickle=allow_pickle).get("arr_0")
727
+ else:
728
+ raise NotImplementedError(f"Numpy format not supported: {fname.suffix}")
729
+
730
+
731
+ def _store_numpy(fname: str | Path, data: np.ndarray, **kwargs) -> None:
732
+ """
733
+ Stores a NumPy array in a file.
734
+
735
+ Args:
736
+ fname (str or Path): The filename to store the NumPy array in.
737
+ data (numpy.ndarray): The NumPy array to store.
738
+
739
+ Raises:
740
+ NotImplementedError: If the file suffix is not supported (i.e., not .npy or .npz).
741
+
742
+ Notes:
743
+ This function supports storing NumPy arrays in .npy and .npz files.
744
+ For .npz files, it uses compression to reduce the file size.
745
+ """
746
+ fname = Path(fname)
747
+ with open(fname, "wb") as fid:
748
+ if fname.suffix == ".npy":
749
+ np.save(fid, data)
750
+ elif fname.suffix == ".npz":
751
+ np.savez_compressed(fid, arr_0=data)
752
+ else:
753
+ raise NotImplementedError(f"Numpy format not supported: {fname.suffix}")
754
+
755
+
756
+ def _load_ptz(fname: str | Path, **kwargs) -> torch.Tensor:
757
+ """
758
+ Loads a PyTorch tensor from a PTZ file.
759
+
760
+ Args:
761
+ fname (str or Path): The filename to load the tensor from.
762
+
763
+ Returns:
764
+ torch.Tensor: The loaded PyTorch tensor.
765
+
766
+ Notes:
767
+ This function assumes that the PTZ file contains a PyTorch tensor saved using `torch.save`.
768
+ If the tensor was saved in a different format, this function may fail.
769
+ """
770
+ with open(fname, "rb") as fid:
771
+ data = gzip.decompress(fid.read())
772
+ ## Note: if the following line fails, save PyTorch tensors in PTZ instead of NumPy
773
+ return torch.load(io.BytesIO(data), map_location="cpu", weights_only=True)
774
+
775
+
776
+ def _store_ptz(fname: str | Path, data: torch.Tensor, **kwargs) -> None:
777
+ """
778
+ Stores a PyTorch tensor in a PTZ file.
779
+
780
+ Args:
781
+ fname (str or Path): The filename to store the tensor in.
782
+ data (torch.Tensor): The PyTorch tensor to store.
783
+
784
+ Notes:
785
+ This function saves the tensor using `torch.save` and compresses it using gzip.
786
+ """
787
+ with open(fname, "wb") as fid:
788
+ with gzip.open(fid, "wb") as gfid:
789
+ torch.save(data, gfid)
790
+
791
+
792
+ def _store_mmap(fname: str | Path, data: np.ndarray | torch.Tensor, **kwargs) -> str:
793
+ """
794
+ Stores matrix-shaped data in a memory-mapped file.
795
+
796
+ Args:
797
+ fname (str or Path): The filename to store the data in.
798
+ data (numpy.ndarray): The matrix-shaped data to store.
799
+
800
+ Returns:
801
+ str: The name of the stored memory-mapped file.
802
+
803
+ Notes:
804
+ This function stores the data in a .npy file with a modified filename that includes the shape of the data.
805
+ The data is converted to float32 format before storing.
806
+ """
807
+ fname = Path(fname)
808
+ # add dimensions to the file name for loading
809
+ data_np = to_numpy(data, dtype=np.float32)
810
+ shape_string = "x".join([str(dim) for dim in data_np.shape])
811
+ mmap_name = f"{fname.stem}--{shape_string}.npy"
812
+ with open(fname.parent / mmap_name, "wb") as fid:
813
+ np.save(fid, data_np)
814
+ return mmap_name
815
+
816
+
817
+ def _load_mmap(fname: str | Path, **kwargs) -> np.memmap:
818
+ """
819
+ Loads matrix-shaped data from a memory-mapped file.
820
+
821
+ Args:
822
+ fname (str or Path): The filename of the memory-mapped file to load.
823
+
824
+ Returns:
825
+ numpy.memmap: A memory-mapped array containing the loaded data.
826
+
827
+ Notes:
828
+ This function assumes that the filename contains the shape of the data, separated by 'x' or ','.
829
+ It uses this information to create a memory-mapped array with the correct shape.
830
+ """
831
+ shape_string = Path(Path(fname).name.split("--")[1]).stem
832
+ shape = [int(dim) for dim in shape_string.replace(",", "x").split("x")]
833
+ with open(fname, "rb") as fid:
834
+ return np.memmap(fid, dtype=np.float32, mode="r", shape=shape, offset=128)
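The memory-mapped helpers encode the array shape in the file name itself, so a round trip looks like this (paths and data are illustrative):

import tempfile
from pathlib import Path

import numpy as np

points = np.random.rand(1000, 3).astype(np.float32)  # hypothetical point data

with tempfile.TemporaryDirectory() as tmp:
    # _store_mmap appends the shape to the stem, e.g. "points--1000x3.npy"
    mmap_name = _store_mmap(Path(tmp) / "points.npy", points)
    reloaded = _load_mmap(Path(tmp) / mmap_name)      # read-only np.memmap
    assert reloaded.shape == (1000, 3)
    assert np.allclose(np.asarray(reloaded), points)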
835
+
836
+
837
+ def _store_scene_meta(fname: Path | str, scene_meta: dict[str, Any], **kwargs) -> None:
838
+ """
839
+ Stores scene metadata in a readable file.
840
+
841
+ Args:
842
+ fname (str or Path): The filename to store the scene metadata in.
843
+ scene_meta (dict): The scene metadata to store.
844
+
845
+ Notes:
846
+ This function updates the "last_modified" field of the scene metadata to the current date and time before storing it.
847
+ It also removes the "frame_names" field from the scene metadata, as it is not necessary to store this information.
848
+ Creates a backup of the existing file before overwriting it.
849
+ """
850
+ # update the modified date
851
+ scene_meta["last_modified"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
852
+ if "frame_names" in scene_meta:
853
+ del scene_meta["frame_names"]
854
+
855
+ # create/overwrite backup
856
+ fname_path = Path(fname)
857
+ if fname_path.exists():
858
+ backup_fname = fname_path.parent / f"_{fname_path.stem}_backup.json"
859
+ if backup_fname.exists():
860
+ backup_fname.unlink()
861
+ fname_path.rename(backup_fname)
862
+
863
+ _store_readable(fname, scene_meta)
864
+
865
+
866
+ def _load_scene_meta(fname: Path | str, **kwargs) -> dict[str, Any]:
867
+ """
868
+ Loads scene metadata from a readable file.
869
+
870
+ Args:
871
+ fname (str or Path): The filename to load the scene metadata from.
872
+
873
+ Returns:
874
+ dict: The loaded scene metadata, including an additional "frame_names" field that maps frame names to their indices.
875
+
876
+ Notes:
877
+ This function creates the "frame_names" field in the scene metadata for efficient lookup of frame indices by name.
878
+ """
879
+ scene_meta = _load_readable_structured(fname)
880
+ # create the frame_name -> frame_idx for efficiency
881
+ scene_meta["frame_names"] = {
882
+ frame["frame_name"]: frame_idx
883
+ for frame_idx, frame in enumerate(scene_meta["frames"])
884
+ }
885
+ return scene_meta
886
+
887
+
888
+ def _load_labeled_image(
889
+ fname: str | Path,
890
+ fmt: str = "torch",
891
+ resize: tuple[int, int] | None = None,
892
+ **kwargs,
893
+ ) -> np.ndarray | torch.Tensor | Image.Image:
894
+ """
895
+ Loads a labeled image from a PNG file.
896
+
897
+ Args:
898
+ fname (str or Path): The filename to load the image from.
899
+ fmt (str): The format of the output data. Can be one of:
900
+ - "torch": Returns a PyTorch int32 tensor with shape (H, W).
901
+ - "np": Returns a NumPy int32 array with shape (H, W).
902
+ - "pil": Returns a PIL Image object.
903
+ Defaults to "torch".
904
+ resize (tuple, optional): A tuple of two integers representing the desired width and height of the image.
905
+ If None, the image is not resized. Defaults to None.
906
+
907
+ Returns:
908
+ The loaded image in the specified output format.
909
+
910
+ Raises:
911
+ NotImplementedError: If the specified output format is not supported.
912
+ RuntimeError: If the 'id_to_color_mapping' is missing in the PNG metadata.
913
+
914
+ Notes:
915
+ The function expects the PNG file to contain metadata with a key 'id_to_color_mapping',
916
+ which maps from label ids to tuples of RGB values.
917
+ """
918
+ with open(fname, "rb") as f:
919
+ pil_image = Image.open(f)
920
+ pil_image.load()
921
+ if pil_image.mode != "RGB":
922
+ raise OSError(
923
+ f"Expected an RGB image in {fname}, but instead found an image with mode {pil_image.mode}"
924
+ )
925
+
926
+ # Load id to RGB mapping
927
+ color_palette_json = pil_image.info.get("id_to_color_mapping", None)
928
+ if color_palette_json is None:
929
+ raise RuntimeError("'id_to_color_mapping' is missing in the PNG metadata.")
930
+ color_palette = json.loads(color_palette_json)
931
+ color_to_id_mapping = {
932
+ tuple(color): int(id) for id, color in color_palette.items()
933
+ }
934
+
935
+ if resize is not None:
936
+ pil_image = pil_image.resize(resize, Image.NEAREST)
937
+
938
+ if fmt == "pil":
939
+ return pil_image
940
+
941
+ # Reverse the color mapping: map from RGB colors to ids
942
+ img_data = np.array(pil_image)
943
+
944
+ # Create a lookup table for fast mapping
945
+ max_color_value = 256 # Assuming 8-bit per channel
946
+ lookup_table = np.full(
947
+ (max_color_value, max_color_value, max_color_value),
948
+ INVALID_ID,
949
+ dtype=np.int32,
950
+ )
951
+ for color, index in color_to_id_mapping.items():
952
+ lookup_table[color] = index
953
+ # Map colors to ids using the lookup table
954
+ img_data = lookup_table[img_data[..., 0], img_data[..., 1], img_data[..., 2]]
955
+
956
+ if fmt == "np":
957
+ return img_data
958
+ elif fmt == "torch":
959
+ return torch.from_numpy(img_data)
960
+ else:
961
+ raise NotImplementedError(f"Image format not supported: {fmt}")
962
+
963
+
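The color-to-id decoding above avoids a per-pixel dictionary lookup by building a dense 256x256x256 lookup table indexed by RGB value. A standalone sketch of that trick with a made-up two-color palette (the value -1 stands in for INVALID_ID):

    import numpy as np

    color_to_id = {(255, 0, 0): 1, (0, 255, 0): 2}          # hypothetical palette
    lut = np.full((256, 256, 256), -1, dtype=np.int32)      # -1 marks unknown colors
    for color, label_id in color_to_id.items():
        lut[color] = label_id

    rgb = np.array([[[255, 0, 0], [0, 255, 0], [7, 7, 7]]], dtype=np.uint8)
    labels = lut[rgb[..., 0], rgb[..., 1], rgb[..., 2]]      # -> [[1, 2, -1]]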
964
+ def _store_labeled_image(
965
+ fname: str | Path,
966
+ img_data: np.ndarray | torch.Tensor | Image.Image,
967
+ semantic_color_mapping: np.ndarray | None = None,
968
+ **kwargs,
969
+ ) -> None:
970
+ """
971
+ Stores a labeled image as a uint8 RGB PNG file.
972
+
973
+ Args:
974
+ fname (str or Path): The filename to store the image in.
975
+ img_data (numpy.ndarray, torch.Tensor or PIL.Image.Image): The per-pixel label ids to store.
976
+ semantic_color_mapping (np.ndarray): Optional, preloaded NumPy array of semantic colors.
977
+
978
+ Raises:
979
+ ValueError: If the file suffix is not supported (i.e., not .png).
980
+ RuntimeError: If the type of the image data is different from uint16, int16 or int32.
981
+
982
+ Notes:
983
+ The function takes an image with per-pixel label ids and converts it into an RGB image
984
+ using a specified mapping from label ids to RGB colors. The resulting image is saved as
985
+ a PNG file, with the mapping stored as metadata.
986
+ """
987
+ if Path(fname).suffix != ".png":
988
+ raise ValueError(
989
+ f"Only filenames with suffix .png allowed but received: {fname}"
990
+ )
991
+
992
+ if isinstance(img_data, Image.Image) and img_data.mode != "I;16":
993
+ raise RuntimeError(
994
+ f"The provided image does not seem to be a labeled image. The provided PIL image has mode {img_data.mode}."
995
+ )
996
+
997
+ if isinstance(img_data, np.ndarray) and img_data.dtype not in [
998
+ np.uint16,
999
+ np.int16,
1000
+ np.int32,
1001
+ ]:
1002
+ raise RuntimeError(
1003
+ f"The provided NumPy array has type {img_data.dtype} but the expected type is np.uint16, np.int16 or np.int32."
1004
+ )
1005
+
1006
+ if isinstance(img_data, torch.Tensor):
1007
+ if img_data.dtype not in [torch.uint16, torch.int16, torch.int32]:
1008
+ raise RuntimeError(
1009
+ f"The provided PyTorch tensor has type {img_data.dtype} but the expected type is torch.uint16, torch.int16 or torch.int32."
1010
+ )
1011
+ img_data = img_data.numpy()
1012
+
1013
+ if semantic_color_mapping is None:
1014
+ # Mapping from ids to colors not provided, load it now
1015
+ semantic_color_mapping = load_semantic_color_mapping()
1016
+
1017
+ img_data, color_palette = apply_id_to_color_mapping(
1018
+ img_data, semantic_color_mapping
1019
+ )
1020
+ pil_image = Image.fromarray(img_data, "RGB")
1021
+
1022
+ # Create a PngInfo object to store metadata
1023
+ meta = PngImagePlugin.PngInfo()
1024
+ meta.add_text("id_to_color_mapping", json.dumps(color_palette))
1025
+
1026
+ pil_image.save(fname, pnginfo=meta)
1027
+
1028
+
1029
+ def _load_generic_mesh(mesh_path: str | Path, **kwargs) -> trimesh.Trimesh:
1030
+ """Load mesh with the trimesh library.
1031
+
1032
+ Args:
1033
+ mesh_path (str): Path to the mesh file
1034
+
1035
+ Returns:
1036
+ The trimesh object from trimesh.load().
1037
+
1038
+ Raises:
1039
+ ValueError: If the file format is not supported.
1040
+ """
1041
+
1042
+ # needed to load big texture files
1043
+ Image.MAX_IMAGE_PIXELS = None
1044
+
1045
+ # load mesh with trimesh
1046
+ mesh_data = trimesh.load(mesh_path, process=False)
1047
+
1048
+ return mesh_data
1049
+
1050
+
1051
+ def _store_generic_mesh(
1052
+ file_path: str | Path, mesh_data: dict | trimesh.Trimesh, **kwargs
1053
+ ) -> None:
1054
+ """
1055
+ Dummy function for storing generic mesh data.
1056
+
1057
+ Args:
1058
+ file_path (str): The filename to store the mesh in.
1059
+ mesh_data (dict): Dictionary containing mesh data.
1060
+ **kwargs: Additional keyword arguments.
1061
+
1062
+ Raises:
1063
+ NotImplementedError: This function is not implemented yet.
1064
+ """
1065
+ raise NotImplementedError("Storing generic meshes is not implemented yet.")
1066
+
1067
+
1068
+ def _load_labeled_mesh(
1069
+ file_path: str | Path,
1070
+ fmt: str = "torch",
1071
+ palette: str = "rgb",
1072
+ **kwargs,
1073
+ ) -> dict | trimesh.Trimesh:
1074
+ """
1075
+ Loads a mesh from a labeled mesh file (PLY binary format).
1076
+
1077
+ Args:
1078
+ file_path (str): The path to the labeled mesh file (.ply).
1079
+ fmt (str): Output format of the mesh data. Can be one of:
1080
+ - "torch": Returns a dict of PyTorch tensors containing mesh data.
1081
+ - "np": Returns a dict of NumPy arrays containing mesh data.
1082
+ - "trimesh": Returns a trimesh mesh object.
1083
+ Defaults to "torch".
1084
+ palette (str): Output color of the trimesh mesh data. Can be one of:
1085
+ - "rgb": Colors the mesh with original rgb colors
1086
+ - "semantic_class": Colors the mesh with semantic class colors
1087
+ - "instance": Colors the mesh with semantic instance colors
1088
+ Applied only when fmt is "trimesh".
1089
+
1090
+ Returns:
1091
+ The loaded mesh in the specified output format.
1092
+
1093
+ Raises:
1094
+ NotImplementedError: If the specified output format is not supported.
1095
+
1096
+ Notes:
1097
+ This function reads a binary PLY file with vertex position, color, and optional
1098
+ semantic class and instance IDs. The faces are stored as lists of vertex indices.
1099
+ """
1100
+ # load data (NOTE: define known_list_len to enable faster read)
1101
+ ply_data = PlyData.read(file_path, known_list_len={"face": {"vertex_indices": 3}})
1102
+
1103
+ # get vertices
1104
+ vertex_data = ply_data["vertex"].data
1105
+ vertices = np.column_stack(
1106
+ (vertex_data["x"], vertex_data["y"], vertex_data["z"])
1107
+ ).astype(np.float32)
1108
+
1109
+ # initialize output data
1110
+ mesh_data = {}
1111
+ mesh_data["is_labeled_mesh"] = True
1112
+ mesh_data["vertices"] = vertices
1113
+
1114
+ # get faces if available
1115
+ if "face" in ply_data:
1116
+ faces = np.asarray(ply_data["face"].data["vertex_indices"]).astype(np.int32)
1117
+ mesh_data["faces"] = faces
1118
+
1119
+ # get rgb colors if available
1120
+ if all(color in vertex_data.dtype.names for color in ["red", "green", "blue"]):
1121
+ vertices_color = np.column_stack(
1122
+ (vertex_data["red"], vertex_data["green"], vertex_data["blue"])
1123
+ ).astype(np.uint8)
1124
+ mesh_data["vertices_color"] = vertices_color
1125
+
1126
+ # get vertices class and instance if available
1127
+ if "semantic_class_id" in vertex_data.dtype.names:
1128
+ vertices_class = vertex_data["semantic_class_id"].astype(np.int32)
1129
+ mesh_data["vertices_semantic_class_id"] = vertices_class
1130
+
1131
+ if "instance_id" in vertex_data.dtype.names:
1132
+ vertices_instance = vertex_data["instance_id"].astype(np.int32)
1133
+ mesh_data["vertices_instance_id"] = vertices_instance
1134
+
1135
+ # get class colors if available
1136
+ if all(
1137
+ color in vertex_data.dtype.names
1138
+ for color in [
1139
+ "semantic_class_red",
1140
+ "semantic_class_green",
1141
+ "semantic_class_blue",
1142
+ ]
1143
+ ):
1144
+ vertices_semantic_class_color = np.column_stack(
1145
+ (
1146
+ vertex_data["semantic_class_red"],
1147
+ vertex_data["semantic_class_green"],
1148
+ vertex_data["semantic_class_blue"],
1149
+ )
1150
+ ).astype(np.uint8)
1151
+ mesh_data["vertices_semantic_class_color"] = vertices_semantic_class_color
1152
+
1153
+ # get instance colors if available
1154
+ if all(
1155
+ color in vertex_data.dtype.names
1156
+ for color in ["instance_red", "instance_green", "instance_blue"]
1157
+ ):
1158
+ vertices_instance_color = np.column_stack(
1159
+ (
1160
+ vertex_data["instance_red"],
1161
+ vertex_data["instance_green"],
1162
+ vertex_data["instance_blue"],
1163
+ )
1164
+ ).astype(np.uint8)
1165
+ mesh_data["vertices_instance_color"] = vertices_instance_color
1166
+
1167
+ # convert data into output format (if needed)
1168
+ if fmt == "np":
1169
+ return mesh_data
1170
+ elif fmt == "torch":
1171
+ return {k: torch.tensor(v) for k, v in mesh_data.items()}
1172
+ elif fmt == "trimesh":
1173
+ trimesh_mesh = trimesh.Trimesh(
1174
+ vertices=mesh_data["vertices"], faces=mesh_data["faces"]
1175
+ )
1176
+ # color the mesh according to the palette
1177
+ if palette == "rgb":
1178
+ # original rgb colors
1179
+ if "vertices_color" in mesh_data:
1180
+ trimesh_mesh.visual.vertex_colors = mesh_data["vertices_color"]
1181
+ else:
1182
+ raise ValueError(
1183
+ f"Palette {palette} could not be applied. Missing vertices_color in mesh data."
1184
+ )
1185
+ elif palette == "semantic_class":
1186
+ # semantic class colors
1187
+ if "vertices_semantic_class_color" in mesh_data:
1188
+ trimesh_mesh.visual.vertex_colors = mesh_data[
1189
+ "vertices_semantic_class_color"
1190
+ ]
1191
+ else:
1192
+ raise ValueError(
1193
+ f"Palette {palette} could not be applied. Missing vertices_semantic_class_color in mesh data."
1194
+ )
1195
+ elif palette == "instance":
1196
+ # semantic instance colors
1197
+ if "vertices_instance_color" in mesh_data:
1198
+ trimesh_mesh.visual.vertex_colors = mesh_data["vertices_instance_color"]
1199
+ else:
1200
+ raise ValueError(
1201
+ f"Palette {palette} could not be applied. Missing vertices_instance_color in mesh data."
1202
+ )
1203
+ else:
1204
+ raise ValueError(f"Invalid palette: {palette}.")
1205
+ return trimesh_mesh
1206
+ else:
1207
+ raise NotImplementedError(f"Labeled mesh format not supported: {fmt}")
1208
+
1209
+
1210
+ def _store_labeled_mesh(file_path: str | Path, mesh_data: dict, **kwargs) -> None:
1211
+ """
1212
+ Stores a mesh in WAI format (PLY binary format).
1213
+
1214
+ Args:
1215
+ file_path (str): The filename to store the mesh in.
1216
+ mesh_data (dict): Dictionary containing mesh data with keys:
1217
+ - 'vertices' (numpy.ndarray): Array of vertex coordinates with shape (N, 3).
1218
+ - 'faces' (numpy.ndarray, optional): Array of face indices.
1219
+ - 'vertices_color' (numpy.ndarray, optional): Array of vertex colors with shape (N, 3).
1220
+ - 'vertices_semantic_class_id' (numpy.ndarray, optional): Array of semantic classes for each vertex with shape (N).
1221
+ - 'vertices_instance_id' (numpy.ndarray, optional): Array of instance IDs for each vertex with shape (N).
1222
+ - 'vertices_semantic_class_color' (numpy.ndarray, optional): Array of vertex semantic class colors with shape (N, 3).
1223
+ - 'vertices_instance_color' (numpy.ndarray, optional): Array of vertex instance colors with shape (N, 3).
1224
+
1225
+ Notes:
1226
+ This function writes a binary PLY file with vertex position, color, and optional
1227
+ semantic class and instance IDs. The faces are stored as lists of vertex indices.
1228
+ """
1229
+ # Validate input data
1230
+ if "vertices" not in mesh_data:
1231
+ raise ValueError("Mesh data must contain 'vertices'")
1232
+
1233
+ # create vertex data with properties
1234
+ vertex_dtype = [("x", "f4"), ("y", "f4"), ("z", "f4")]
1235
+ if "vertices_color" in mesh_data:
1236
+ vertex_dtype.extend([("red", "u1"), ("green", "u1"), ("blue", "u1")])
1237
+ if "vertices_semantic_class_id" in mesh_data:
1238
+ vertex_dtype.append(("semantic_class_id", "i4"))
1239
+ if "vertices_instance_id" in mesh_data:
1240
+ vertex_dtype.append(("instance_id", "i4"))
1241
+ if "vertices_semantic_class_color" in mesh_data:
1242
+ vertex_dtype.extend(
1243
+ [
1244
+ ("semantic_class_red", "u1"),
1245
+ ("semantic_class_green", "u1"),
1246
+ ("semantic_class_blue", "u1"),
1247
+ ]
1248
+ )
1249
+ if "vertices_instance_color" in mesh_data:
1250
+ vertex_dtype.extend(
1251
+ [("instance_red", "u1"), ("instance_green", "u1"), ("instance_blue", "u1")]
1252
+ )
1253
+ vertex_count = len(mesh_data["vertices"])
1254
+ vertex_data = np.zeros(vertex_count, dtype=vertex_dtype)
1255
+
1256
+ # vertex positions
1257
+ vertex_data["x"] = mesh_data["vertices"][:, 0]
1258
+ vertex_data["y"] = mesh_data["vertices"][:, 1]
1259
+ vertex_data["z"] = mesh_data["vertices"][:, 2]
1260
+
1261
+ # vertex colors
1262
+ if "vertices_color" in mesh_data:
1263
+ vertex_data["red"] = mesh_data["vertices_color"][:, 0]
1264
+ vertex_data["green"] = mesh_data["vertices_color"][:, 1]
1265
+ vertex_data["blue"] = mesh_data["vertices_color"][:, 2]
1266
+
1267
+ # vertex class
1268
+ if "vertices_semantic_class_id" in mesh_data:
1269
+ vertex_data["semantic_class_id"] = mesh_data["vertices_semantic_class_id"]
1270
+
1271
+ # vertex instance
1272
+ if "vertices_instance_id" in mesh_data:
1273
+ vertex_data["instance_id"] = mesh_data["vertices_instance_id"]
1274
+
1275
+ # vertex class colors
1276
+ if "vertices_semantic_class_color" in mesh_data:
1277
+ vertex_data["semantic_class_red"] = mesh_data["vertices_semantic_class_color"][
1278
+ :, 0
1279
+ ]
1280
+ vertex_data["semantic_class_green"] = mesh_data[
1281
+ "vertices_semantic_class_color"
1282
+ ][:, 1]
1283
+ vertex_data["semantic_class_blue"] = mesh_data["vertices_semantic_class_color"][
1284
+ :, 2
1285
+ ]
1286
+
1287
+ # vertex instance colors
1288
+ if "vertices_instance_color" in mesh_data:
1289
+ vertex_data["instance_red"] = mesh_data["vertices_instance_color"][:, 0]
1290
+ vertex_data["instance_green"] = mesh_data["vertices_instance_color"][:, 1]
1291
+ vertex_data["instance_blue"] = mesh_data["vertices_instance_color"][:, 2]
1292
+
1293
+ # initialize data to save
1294
+ vertex_element = PlyElement.describe(vertex_data, "vertex")
1295
+ data_to_save = [vertex_element]
1296
+
1297
+ # faces data
1298
+ if "faces" in mesh_data:
1299
+ face_dtype = [("vertex_indices", "i4", (3,))]
1300
+ face_data = np.zeros(len(mesh_data["faces"]), dtype=face_dtype)
1301
+ face_data["vertex_indices"] = mesh_data["faces"]
1302
+ face_element = PlyElement.describe(face_data, "face")
1303
+ data_to_save.append(face_element)
1304
+
1305
+ # Create and write a binary PLY file
1306
+ ply_data = PlyData(data_to_save, text=False)
1307
+ ply_data.write(file_path)
1308
+
1309
+
1310
+ def _get_method(
1311
+ fname: Path | str, format_type: str | None = None, load: bool = True
1312
+ ) -> Callable:
1313
+ """
1314
+ Returns a method for loading or storing data in a specific format.
1315
+
1316
+ Args:
1317
+ fname (str or Path): The filename to load or store data from/to.
1318
+ format_type (str, optional): The format of the data. If None, it will be inferred from the file extension.
1319
+ Defaults to None.
1320
+ load (bool, optional): Whether to return a method for loading or storing data.
1321
+ Defaults to True.
1322
+
1323
+ Returns:
1324
+ callable: A method for loading or storing data in the specified format.
1325
+
1326
+ Raises:
1327
+ ValueError: If the format cannot be inferred from the file extension.
1328
+ NotImplementedError: If the specified format is not supported.
1329
+
1330
+ Notes:
1331
+ This function supports various formats, including readable files (JSON, YAML), images, NumPy arrays,
1332
+ PyTorch tensors, memory-mapped files, and scene metadata.
1333
+ """
1334
+ fname = Path(fname)
1335
+ if format_type is None:
1336
+ # use default formats
1337
+ if fname.suffix in [".json", ".yaml", ".yml"]:
1338
+ format_type = "readable"
1339
+ elif fname.suffix in [".jpg", ".jpeg", ".png", ".webp"]:
1340
+ format_type = "image"
1341
+ elif fname.suffix in [".npy", ".npz"]:
1342
+ format_type = "numpy"
1343
+ elif fname.suffix == ".ptz":
1344
+ format_type = "ptz"
1345
+ elif fname.suffix == ".sft":
1346
+ format_type = "sft"
1347
+ elif fname.suffix == ".exr":
1348
+ format_type = "scalar"
1349
+ elif fname.suffix in [".glb", ".obj", ".ply"]:
1350
+ format_type = "mesh"
1351
+ else:
1352
+ raise ValueError(f"Cannot infer format for {fname}")
1353
+ methods = {
1354
+ "readable": (_load_readable, _store_readable),
1355
+ "scalar": (_read_exr, _write_exr),
1356
+ "image": (_load_image, _store_image),
1357
+ "binary": (_load_binary_mask, _store_binary_mask),
1358
+ "latent": (_load_sft, _store_sft),
1359
+ "depth": (_load_depth, _store_depth),
1360
+ "normals": (_load_normals, _store_normals),
1361
+ "numpy": (_load_numpy, _store_numpy),
1362
+ "ptz": (_load_ptz, _store_ptz),
1363
+ "sft": (_load_sft, _store_sft),
1364
+ "mmap": (_load_mmap, _store_mmap),
1365
+ "scene_meta": (_load_scene_meta, _store_scene_meta),
1366
+ "labeled_image": (_load_labeled_image, _store_labeled_image),
1367
+ "mesh": (_load_generic_mesh, _store_generic_mesh),
1368
+ "labeled_mesh": (_load_labeled_mesh, _store_labeled_mesh),
1369
+ }
1370
+ try:
1371
+ return methods[format_type][0 if load else 1]
1372
+ except KeyError as e:
1373
+ raise NotImplementedError(f"Format not supported: {format_type}") from e
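A short usage sketch for the dispatcher above: with format_type left as None, the loader or storer is picked from the file suffix; otherwise the explicit format key wins. The file paths and the depth_map tensor below are hypothetical.

    from pathlib import Path

    load_meta = _get_method("scene_meta.json", format_type="scene_meta", load=True)
    scene_meta = load_meta("scene_meta.json")              # dict with a frame_names index

    store_depth = _get_method(Path("depth/000001.exr"), "depth", load=False)
    store_depth(Path("depth/000001.exr"), depth_map)       # depth_map: hypothetical HxW tensor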
mapanything/utils/wai/m_ops.py ADDED
@@ -0,0 +1,346 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ def m_dot(
6
+ transform: torch.Tensor,
7
+ points: torch.Tensor | list,
8
+ maintain_shape: bool = False,
9
+ ) -> torch.Tensor | list:
10
+ """
11
+ Apply batch matrix multiplication between transform matrices and points.
12
+
13
+ Args:
14
+ transform: Batch of transformation matrices [..., 3/4, 3/4]
15
+ points: Batch of points [..., N, 3] or a list of points
16
+ maintain_shape: If True, preserves the original shape of points
17
+
18
+ Returns:
19
+ Transformed points with shape [..., N, 3] or a list of transformed points
20
+ """
21
+ if isinstance(points, list):
22
+ return [m_dot(t, p, maintain_shape) for t, p in zip(transform, points)]
23
+
24
+ # Store original shape and flatten batch dimensions
25
+ orig_shape = points.shape
26
+ batch_dims = points.shape[:-3]
27
+
28
+ # Reshape to standard batch format
29
+ transform_flat = transform.reshape(-1, transform.shape[-2], transform.shape[-1])
30
+ points_flat = points.reshape(transform_flat.shape[0], -1, points.shape[-1])
31
+
32
+ # Apply transformation
33
+ pts = torch.bmm(
34
+ transform_flat[:, :3, :3],
35
+ points_flat[..., :3].permute(0, 2, 1).to(transform_flat.dtype),
36
+ ).permute(0, 2, 1)
37
+
38
+ if transform.shape[-1] == 4:
39
+ pts = pts + transform_flat[:, :3, 3].unsqueeze(1)
40
+
41
+ # Restore original shape
42
+ if maintain_shape:
43
+ return pts.reshape(orig_shape)
44
+ else:
45
+ return pts.reshape(*batch_dims, -1, 3)
46
+
47
+
48
+ def m_unproject(
49
+ depth: torch.Tensor,
50
+ intrinsic: torch.Tensor,
51
+ cam2world: torch.Tensor = None,
52
+ img_grid: torch.Tensor = None,
53
+ valid: torch.Tensor = None,
54
+ H: int | None = None,
55
+ W: int | None = None,
56
+ img_feats: torch.Tensor = None,
57
+ maintain_shape: bool = False,
58
+ ) -> torch.Tensor:
59
+ """
60
+ Unproject 2D image points with depth values to 3D points in camera or world space.
61
+
62
+ Args:
63
+ depth: Depth values, either a tensor of shape ...xHxW or a float value
64
+ intrinsic: Camera intrinsic matrix of shape ...x3x3
65
+ cam2world: Optional camera-to-world transformation matrix of shape ...x4x4
66
+ img_grid: Optional pre-computed image grid. If None, will be created
67
+ valid: Optional mask for valid depth values or minimum depth threshold
68
+ H: Image height (required if depth is a scalar)
69
+ W: Image width (required if depth is a scalar)
70
+ img_feats: Optional image features to append to 3D points
71
+ maintain_shape: If True, preserves the original shape of points
72
+
73
+ Returns:
74
+ 3D points in camera or world space, with optional features appended
75
+ """
76
+ # Get device and shape information from intrinsic matrix
77
+ device = intrinsic.device
78
+ pre_shape = intrinsic.shape[:-2] # Batch dimensions
79
+
80
+ # Validate inputs
81
+ if isinstance(depth, (int, float)) and H is None:
82
+ raise ValueError("H must be provided if depth is a scalar")
83
+
84
+ # Determine image dimensions from depth if not provided
85
+ if isinstance(depth, torch.Tensor) and H is None:
86
+ H, W = depth.shape[-2:]
87
+
88
+ # Create image grid if not provided
89
+ if img_grid is None:
90
+ # Create coordinate grid with shape HxWx3 (last dimension is homogeneous)
91
+ img_grid = _create_image_grid(H, W, device)
92
+ # Add homogeneous coordinate
93
+ img_grid = torch.cat([img_grid, torch.ones_like(img_grid[..., :1])], -1)
94
+
95
+ # Expand img_grid to match batch dimensions of intrinsic
96
+ if img_grid.dim() <= intrinsic.dim():
97
+ img_grid = img_grid.unsqueeze(0)
98
+ img_grid = img_grid.expand(*pre_shape, *img_grid.shape[-3:])
99
+
100
+ # Handle valid mask or minimum depth threshold
101
+ depth_mask = None
102
+ if valid is not None:
103
+ if isinstance(valid, float):
104
+ # Create mask for minimum depth value
105
+ depth_mask = depth > valid
106
+ elif isinstance(valid, torch.Tensor):
107
+ depth_mask = valid
108
+
109
+ # Apply mask to image grid and other inputs
110
+ img_grid = masking(img_grid, depth_mask, dim=intrinsic.dim())
111
+ if not isinstance(depth, (int, float)):
112
+ depth = masking(depth, depth_mask, dim=intrinsic.dim() - 1)
113
+ if img_feats is not None:
114
+ img_feats = masking(img_feats, depth_mask, dim=intrinsic.dim() - 1)
115
+
116
+ # Unproject 2D points to 3D camera space
117
+ cam_pts: torch.Tensor = m_dot(
118
+ m_inverse_intrinsics(intrinsic),
119
+ img_grid[..., [1, 0, 2]],
120
+ maintain_shape=True,
121
+ )
122
+ # Scale by depth values
123
+ cam_pts = mult(cam_pts, depth.unsqueeze(-1))
124
+
125
+ # Transform to world space if cam2world is provided
126
+ if cam2world is not None:
127
+ cam_pts = m_dot(cam2world, cam_pts, maintain_shape=True)
128
+
129
+ # Append image features if provided
130
+ if img_feats is not None:
131
+ if isinstance(cam_pts, list):
132
+ if isinstance(cam_pts[0], list):
133
+ # Handle nested list case
134
+ result = []
135
+ for batch_idx, batch in enumerate(cam_pts):
136
+ batch_result = []
137
+ for view_idx, view in enumerate(batch):
138
+ batch_result.append(
139
+ torch.cat([view, img_feats[batch_idx][view_idx]], -1)
140
+ )
141
+ result.append(batch_result)
142
+ cam_pts = result
143
+ else:
144
+ # Handle single list case
145
+ cam_pts = [
146
+ torch.cat([pts, feats], -1)
147
+ for pts, feats in zip(cam_pts, img_feats)
148
+ ]
149
+ else:
150
+ # Handle tensor case
151
+ cam_pts = torch.cat([cam_pts, img_feats], -1)
152
+
153
+ if maintain_shape:
154
+ return cam_pts
155
+
156
+ # Flatten last dimension
157
+ return cam_pts.reshape(*pre_shape, -1, 3)
158
+
159
+
160
+ def m_project(
161
+ world_pts: torch.Tensor,
162
+ intrinsic: torch.Tensor,
163
+ world2cam: torch.Tensor | None = None,
164
+ maintain_shape: bool = False,
165
+ ) -> torch.Tensor:
166
+ """
167
+ Project 3D world points to 2D image coordinates.
168
+
169
+ Args:
170
+ world_pts: 3D points in world coordinates
171
+ intrinsic: Camera intrinsic matrix
172
+ world2cam: Optional transformation from world to camera coordinates
173
+ maintain_shape: If True, preserves the original shape of points
174
+
175
+ Returns:
176
+ Image points with coordinates in img_y,img_x,z order
177
+ """
178
+ # Transform points from world to camera space if world2cam is provided
179
+ cam_pts: torch.Tensor = world_pts
180
+ if world2cam is not None:
181
+ cam_pts = m_dot(world2cam, world_pts, maintain_shape=maintain_shape)
182
+
183
+ # Get shapes to properly expand intrinsics
184
+ shared_dims = intrinsic.shape[:-2]
185
+ extra_dims = cam_pts.shape[len(shared_dims) : -1]
186
+
187
+ # Expand intrinsics to match cam_pts shape
188
+ expanded_intrinsic = intrinsic.view(*shared_dims, *([1] * len(extra_dims)), 3, 3)
189
+ expanded_intrinsic = expanded_intrinsic.expand(*shared_dims, *extra_dims, 3, 3)
190
+
191
+ # Project points from camera space to image space
192
+ depth_abs = cam_pts[..., 2].abs().clamp(min=1e-5)
193
+ return torch.stack(
194
+ [
195
+ expanded_intrinsic[..., 1, 1] * cam_pts[..., 1] / depth_abs
196
+ + expanded_intrinsic[..., 1, 2],
197
+ expanded_intrinsic[..., 0, 0] * cam_pts[..., 0] / depth_abs
198
+ + expanded_intrinsic[..., 0, 2],
199
+ cam_pts[..., 2],
200
+ ],
201
+ -1,
202
+ )
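A toy round trip that makes the (img_y, img_x, z) convention above concrete: unprojecting a constant-depth map and reprojecting it should return the original pixel grid. The intrinsics and depth values are made up.

    import torch

    H, W = 4, 6
    K = torch.tensor([[100.0, 0.0, W / 2], [0.0, 100.0, H / 2], [0.0, 0.0, 1.0]])
    depth = torch.full((H, W), 2.0)

    cam_pts = m_unproject(depth, K)          # (H*W, 3) points in camera space
    img_pts = m_project(cam_pts, K)          # (H*W, 3) in (img_y, img_x, depth) order

    yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    expected = torch.stack([yy, xx], -1).reshape(-1, 2).float()
    assert torch.allclose(img_pts[..., :2], expected, atol=1e-4)
    assert torch.allclose(img_pts[..., 2], torch.full((H * W,), 2.0))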
203
+
204
+
205
+ def in_image(
206
+ image_pts: torch.Tensor | list,
207
+ H: int,
208
+ W: int,
209
+ min_depth: float = 0.0,
210
+ ) -> torch.Tensor | list:
211
+ """
212
+ Check if image points are within the image boundaries.
213
+
214
+ Args:
215
+ image_pts: Image points in pixel coordinates
216
+ H: Image height
217
+ W: Image width
218
+ min_depth: Minimum valid depth
219
+
220
+ Returns:
221
+ Boolean mask indicating which points are within the image
222
+ """
223
+ is_list = isinstance(image_pts, list)
224
+ if is_list:
225
+ return [in_image(pts, H, W, min_depth=min_depth) for pts in image_pts]
226
+
227
+ in_image_mask = (
228
+ torch.all(image_pts >= 0, -1)
229
+ & (image_pts[..., 0] < H)
230
+ & (image_pts[..., 1] < W)
231
+ )
232
+ if (min_depth is not None) and image_pts.shape[-1] == 3:
233
+ in_image_mask &= image_pts[..., 2] > min_depth
234
+ return in_image_mask
235
+
236
+
237
+ def _create_image_grid(H: int, W: int, device: torch.device) -> torch.Tensor:
238
+ """
239
+ Create a coordinate grid for image pixels.
240
+
241
+ Args:
242
+ H: Image height
243
+ W: Image width
244
+ device: Computation device
245
+
246
+ Returns:
247
+ Image grid with shape HxWx3 (last dimension is homogeneous)
248
+ """
249
+ y_coords = torch.arange(H, device=device)
250
+ x_coords = torch.arange(W, device=device)
251
+
252
+ # Use meshgrid with indexing="ij" for correct orientation
253
+ y_grid, x_grid = torch.meshgrid(y_coords, x_coords, indexing="ij")
254
+
255
+ # Stack coordinates and add homogeneous coordinate
256
+ img_grid = torch.stack([y_grid, x_grid, torch.ones_like(y_grid)], dim=-1)
257
+
258
+ return img_grid
259
+
260
+
261
+ def masking(
262
+ X: torch.Tensor | list,
263
+ mask: torch.Tensor | list,
264
+ dim: int = 3,
265
+ ) -> torch.Tensor | list:
266
+ """
267
+ Apply a Boolean mask to tensor or list elements.
268
+ Handles nested structures by recursively applying the mask.
269
+
270
+ Args:
271
+ X: Input tensor or list to be masked
272
+ mask: Boolean mask to apply
273
+ dim: Dimension threshold for recursive processing
274
+
275
+ Returns:
276
+ Masked tensor or list with the same structure as input
277
+ """
278
+ if isinstance(X, list) or (isinstance(X, torch.Tensor) and X.dim() >= dim):
279
+ return [masking(x, m, dim) for x, m in zip(X, mask)]
280
+ return X[mask]
281
+
282
+
283
+ def m_inverse_intrinsics(intrinsics: torch.Tensor) -> torch.Tensor:
284
+ """
285
+ Compute the inverse of camera intrinsics matrices analytically.
286
+ This is much faster than using torch.inverse() for intrinsics matrices.
287
+
288
+ The intrinsics matrix has the form:
289
+ K = [fx s cx]
290
+ [0 fy cy]
291
+ [0 0 1]
292
+
293
+ And its inverse is:
294
+ K^-1 = [1/fx -s/(fx*fy) (s*cy-cx*fy)/(fx*fy)]
295
+ [0 1/fy -cy/fy ]
296
+ [0 0 1 ]
297
+
298
+ Args:
299
+ intrinsics: Camera intrinsics matrices of shape [..., 3, 3]
300
+
301
+ Returns:
302
+ Inverse intrinsics matrices of shape [..., 3, 3]
303
+ """
304
+ # Extract the components of the intrinsics matrix
305
+ fx = intrinsics[..., 0, 0]
306
+ s = intrinsics[..., 0, 1] # skew, usually 0
307
+ cx = intrinsics[..., 0, 2]
308
+ fy = intrinsics[..., 1, 1]
309
+ cy = intrinsics[..., 1, 2]
310
+
311
+ # Create output tensor with same shape and device
312
+ inv_intrinsics = torch.zeros_like(intrinsics)
313
+
314
+ # Compute the inverse analytically
315
+ inv_intrinsics[..., 0, 0] = 1.0 / fx
316
+ inv_intrinsics[..., 0, 1] = -s / (fx * fy)
317
+ inv_intrinsics[..., 0, 2] = (s * cy - cx * fy) / (fx * fy)
318
+ inv_intrinsics[..., 1, 1] = 1.0 / fy
319
+ inv_intrinsics[..., 1, 2] = -cy / fy
320
+ inv_intrinsics[..., 2, 2] = 1.0
321
+
322
+ return inv_intrinsics
323
+
324
+
325
+ def mult(
326
+ A: torch.Tensor | np.ndarray | list | float | int,
327
+ B: torch.Tensor | np.ndarray | list | float | int,
328
+ ) -> torch.Tensor | np.ndarray | list | float | int:
329
+ """
330
+ Multiply two objects with support for lists, tensors, arrays, and scalars.
331
+ Handles nested structures by recursively applying multiplication.
332
+
333
+ Args:
334
+ A: First operand (tensor, array, list, or scalar)
335
+ B: Second operand (tensor, array, list, or scalar)
336
+
337
+ Returns:
338
+ Result of multiplication with the same structure as inputs
339
+ """
340
+ if isinstance(A, list) and isinstance(B, (int, float)):
341
+ return [mult(a, B) for a in A]
342
+ if isinstance(B, list) and isinstance(A, (int, float)):
343
+ return [mult(A, b) for b in B]
344
+ if isinstance(A, list) and isinstance(B, list):
345
+ return [mult(a, b) for a, b in zip(A, B)]
346
+ return A * B
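As a quick sanity check of the closed-form intrinsics inverse derived above, a sketch with toy camera parameters (the numbers are made up):

    import torch

    K = torch.tensor([[500.0, 0.0, 320.0],
                      [0.0, 480.0, 240.0],
                      [0.0, 0.0, 1.0]]).repeat(2, 1, 1)     # toy batch of two cameras

    K_inv = m_inverse_intrinsics(K)
    assert torch.allclose(K_inv, torch.inverse(K), atol=1e-5)
    assert torch.allclose(K @ K_inv, torch.eye(3).expand(2, 3, 3), atol=1e-5)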
mapanything/utils/wai/ops.py ADDED
@@ -0,0 +1,368 @@
1
+ """
2
+ This utility module contains a port of wai-core ops methods for MapAnything.
3
+ """
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from PIL import Image
9
+
10
+
11
+ def to_numpy(
12
+ data: torch.Tensor | np.ndarray | int | float,
13
+ dtype: np.dtype | str | type = np.float32,
14
+ ) -> np.ndarray:
15
+ """
16
+ Convert data to a NumPy array with the specified dtype (default: float32).
17
+
18
+ This function handles conversion from NumPy arrays and PyTorch tensors to a NumPy array.
19
+
20
+ Args:
21
+ data: Input data (torch.Tensor, np.ndarray, or scalar)
22
+ dtype: Target data type (NumPy dtype, str, or type). Default: np.float32.
23
+
24
+ Returns:
25
+ Converted data as NumPy array with specified dtype.
26
+ """
27
+ # Set default dtype if not defined
28
+ assert dtype is not None, "dtype cannot be None"
29
+ dtype = np.dtype(dtype)
30
+
31
+ # Handle torch.Tensor
32
+ if isinstance(data, torch.Tensor):
33
+ return data.detach().cpu().numpy().astype(dtype)
34
+
35
+ # Handle numpy.ndarray
36
+ if isinstance(data, np.ndarray):
37
+ return data.astype(dtype)
38
+
39
+ # Handle scalar values
40
+ if isinstance(data, (int, float)):
41
+ return np.array(data, dtype=dtype)
42
+
43
+ raise NotImplementedError(f"Unsupported data type: {type(data)}")
44
+
45
+
46
+ def get_dtype_device(
47
+ data: torch.Tensor | np.ndarray | dict | list,
48
+ ) -> tuple[torch.dtype | np.dtype | None, torch.device | str | type | None]:
49
+ """
50
+ Determine the data type and device of the input data.
51
+
52
+ This function recursively inspects the input data and determines its data type
53
+ and device. It handles PyTorch tensors, NumPy arrays, dictionaries, and lists.
54
+
55
+ Args:
56
+ data: Input data (torch.Tensor, np.ndarray, dict, list, or other)
57
+
58
+ Returns:
59
+ tuple: (dtype, device) where:
60
+ - dtype: The data type (torch.dtype or np.dtype)
61
+ - device: The device (torch.device, 'cpu', 'cuda:X', or np.ndarray)
62
+
63
+ Raises:
64
+ ValueError: If tensors in a dictionary are on different CUDA devices
65
+ """
66
+ if isinstance(data, torch.Tensor):
67
+ return data.dtype, data.device
68
+
69
+ if isinstance(data, np.ndarray):
70
+ return data.dtype, np.ndarray
71
+
72
+ if isinstance(data, dict):
73
+ dtypes = {get_dtype_device(v)[0] for v in data.values()}
74
+ devices = {get_dtype_device(v)[1] for v in data.values()}
75
+ cuda_devices = {device for device in devices if str(device).startswith("cuda")}
76
+ cpu_devices = {device for device in devices if str(device).startswith("cpu")}
77
+ if (len(cuda_devices) > 0) or (len(cpu_devices) > 0):
78
+ # torch.tensor
79
+ dtype = torch.float
80
+ if all(dtype == torch.half for dtype in dtypes):
81
+ dtype = torch.half
82
+ device = None
83
+ if len(cuda_devices) > 1:
84
+ raise ValueError("All tensors must be on the same device")
85
+ if len(cuda_devices) == 1:
86
+ device = list(cuda_devices)[0]
87
+ if (device is None) and (len(cpu_devices) == 1):
88
+ device = list(cpu_devices)[0]
89
+ else:
90
+ dtype = np.float32
91
+ # Use float16 only if every value is float16
92
+ if all(dtype == np.float16 for dtype in dtypes):
93
+ dtype = np.float16
94
+ device = np.ndarray
95
+
96
+ elif isinstance(data, list):
97
+ if not data: # Handle empty list case
98
+ return None, None
99
+ dtype, device = get_dtype_device(data[0])
100
+
101
+ else:
102
+ return np.float32, np.ndarray
103
+
104
+ return dtype, device
105
+
106
+
107
+ def crop(
108
+ data: np.ndarray | torch.Tensor | Image.Image,
109
+ bbox: tuple[int, int, int, int] | tuple[int, int],
110
+ ) -> np.ndarray | torch.Tensor | Image.Image:
111
+ """
112
+ Crop data of different formats (numpy arrays, PyTorch tensors, PIL Images) to a target bounding box.
113
+
114
+ Args:
115
+ data: Input data to crop (numpy.ndarray, torch.Tensor, or PIL.Image.Image)
116
+ bbox: Bounding box as tuple (offset_height, offset_width, height, width) or tuple (height, width)
117
+
118
+ Returns:
119
+ Cropped data in the same format as the input
120
+ """
121
+ if len(bbox) == 4:
122
+ offset_height, offset_width, target_height, target_width = bbox
123
+ elif len(bbox) == 2:
124
+ target_height, target_width = bbox
125
+ offset_height, offset_width = 0, 0
126
+ else:
127
+ raise ValueError(f"Unsupported size length {len(bbox)}.")
128
+
129
+ end_height = offset_height + target_height
130
+ end_width = offset_width + target_width
131
+
132
+ if any([sz < 0 for sz in bbox]):
133
+ raise ValueError("Bounding box can't have negative values.")
134
+
135
+ if isinstance(data, np.ndarray):
136
+ if (
137
+ max(offset_height, end_height) > data.shape[0]
138
+ or max(offset_width, end_width) > data.shape[1]
139
+ ):
140
+ raise ValueError("Invalid bounding box.")
141
+ cropped_data = data[offset_height:end_height, offset_width:end_width, ...]
142
+ return cropped_data
143
+
144
+ # Handle PIL images
145
+ elif isinstance(data, Image.Image):
146
+ if (
147
+ max(offset_height, end_height) > data.size[1]
148
+ or max(offset_width, end_width) > data.size[0]
149
+ ):
150
+ raise ValueError("Invalid bounding box.")
151
+ return data.crop((offset_width, offset_height, end_width, end_height))
152
+
153
+ # Handle PyTorch tensors
154
+ elif isinstance(data, torch.Tensor):
155
+ if data.is_nested:
156
+ # special handling for nested tensors
157
+ return torch.stack([crop(nested_tensor, bbox) for nested_tensor in data])
158
+ if (
159
+ max(offset_height, end_height) > data.shape[-2]
160
+ or max(offset_width, end_width) > data.shape[-1]
161
+ ):
162
+ raise ValueError("Invalid bounding box.")
163
+ cropped_data = data[..., offset_height:end_height, offset_width:end_width]
164
+ return cropped_data
165
+ else:
166
+ raise TypeError(f"Unsupported data type '{type(data)}'.")
167
+
168
+
169
+ def stack(
170
+ data: list[
171
+ dict[str, torch.Tensor | np.ndarray]
172
+ | list[torch.Tensor | np.ndarray]
173
+ | tuple[torch.Tensor | np.ndarray]
174
+ ],
175
+ ) -> dict[str, torch.Tensor | np.ndarray] | list[torch.Tensor | np.ndarray]:
176
+ """
177
+ Stack a list of dictionaries into a single dictionary with stacked values.
178
+ Or when given a list of sublists, stack the sublists using torch or numpy stack
179
+ if the items are of equal size, or nested tensors if the items are PyTorch tensors
180
+ of different size.
181
+
182
+ This utility function is similar to PyTorch's collate function, but specifically
183
+ designed for stacking dictionaries of numpy arrays or PyTorch tensors.
184
+
185
+ Args:
186
+ data (list): A list of dictionaries with the same keys, where values are
187
+ either numpy arrays or PyTorch tensors.
188
+ OR
189
+ A list of sublists, where the values of the sublists are PyTorch tensors
190
+ or np arrays.
191
+
192
+ Returns:
193
+ dict: A dictionary with the same keys as input dictionaries, but with values
194
+ stacked along a new first dimension.
195
+ OR
196
+ list: If the input was a list with sublists, it returns a list with a stacked
197
+ output for each original input sublist.
198
+
199
+ Raises:
200
+ ValueError: If dictionaries in the list have inconsistent keys.
201
+ NotImplementedError: If input is not a list or contains non-dictionary elements.
202
+ """
203
+ if not isinstance(data, list):
204
+ raise NotImplementedError(f"Stack: Data type not supported: {data}")
205
+
206
+ if len(data) == 0:
207
+ return data
208
+
209
+ if all(isinstance(entry, dict) for entry in data):
210
+ stacked_data = {}
211
+ keys = list(data[0].keys())
212
+ if any(set(entry.keys()) != set(keys) for entry in data):
213
+ raise ValueError("Data not consistent for stacking")
214
+
215
+ for key in keys:
216
+ stacked_data[key] = []
217
+ for entry in data:
218
+ stacked_data[key].append(entry[key])
219
+
220
+ # stack it according to data format
221
+ if all(isinstance(v, np.ndarray) for v in stacked_data[key]):
222
+ stacked_data[key] = np.stack(stacked_data[key])
223
+ elif all(isinstance(v, torch.Tensor) for v in stacked_data[key]):
224
+ # Check if all tensors have the same shape
225
+ first_shape = stacked_data[key][0].shape
226
+ if all(tensor.shape == first_shape for tensor in stacked_data[key]):
227
+ stacked_data[key] = torch.stack(stacked_data[key])
228
+ else:
229
+ # Use nested tensors if shapes are not consistent
230
+ stacked_data[key] = torch.nested.nested_tensor(stacked_data[key])
231
+ return stacked_data
232
+
233
+ if all(isinstance(entry, list) for entry in data):
234
+ # new stacked data will be a list with one stacked output per sublist
235
+ stacked_data = []
236
+ for sublist in data:
237
+ # stack it according to data format
238
+ if all(isinstance(v, np.ndarray) for v in sublist):
239
+ stacked_data.append(np.stack(sublist))
240
+ elif all(isinstance(v, torch.Tensor) for v in sublist):
241
+ # Check if all tensors have the same shape
242
+ first_shape = sublist[0].shape
243
+ if all(tensor.shape == first_shape for tensor in sublist):
244
+ stacked_data.append(torch.stack(sublist))
245
+ else:
246
+ # Use nested tensors if shapes are not consistent
247
+ stacked_data.append(torch.nested.nested_tensor(sublist))
248
+ return stacked_data
249
+
250
+ raise NotImplementedError(f"Stack: Data type not supported: {data}")
251
+
252
+
253
+ def resize(
254
+ data: np.ndarray | torch.Tensor | Image.Image,
255
+ size: tuple[int, int] | int | None = None,
256
+ scale: float | None = None,
257
+ modality_format: str | None = None,
258
+ ) -> np.ndarray | torch.Tensor | Image.Image:
259
+ """
260
+ Resize data of different formats (numpy arrays, PyTorch tensors, PIL Images) to a target size.
261
+
262
+ Args:
263
+ data: Input data to resize (numpy.ndarray, torch.Tensor, or PIL.Image.Image)
264
+ size: Target size as tuple (height, width) or single int for long-side scaling
265
+ scale: Scale factor to apply to the original dimensions
266
+ modality_format: Type of data being resized ('depth', 'normals', or None)
267
+ Affects interpolation method used
268
+
269
+ Returns:
270
+ Resized data in the same format as the input
271
+
272
+ Raises:
273
+ ValueError: If neither size nor scale is provided, or if both are provided
274
+ TypeError: If data is not a supported type
275
+ """
276
+ # Validate input parameters
277
+ if size is not None and scale is not None:
278
+ raise ValueError("Only one of size or scale should be provided.")
279
+
280
+ # Calculate size from scale if needed
281
+ if size is None:
282
+ if scale is None:
283
+ raise ValueError("Either size or scale must be provided.")
284
+
285
+ size = (1, 1)
286
+ if isinstance(data, (np.ndarray, torch.Tensor)):
287
+ size = (int(data.shape[-2] * scale), int(data.shape[-1] * scale))
288
+ elif isinstance(data, Image.Image):
289
+ size = (int(data.size[1] * scale), int(data.size[0] * scale))
290
+ else:
291
+ raise TypeError(f"Unsupported data type '{type(data)}'.")
292
+
293
+ # Handle long-side scaling when size is a single integer
294
+ elif isinstance(size, int):
295
+ long_side = size
296
+ if isinstance(data, (np.ndarray, torch.Tensor)):
297
+ if isinstance(data, torch.Tensor) and data.is_nested:
298
+ raise ValueError(
299
+ "Long-side scaling is not supported for nested tensors, use a fixed size instead."
300
+ )
301
+ h, w = data.shape[-2], data.shape[-1]
302
+ elif isinstance(data, Image.Image):
303
+ w, h = data.size
304
+ else:
305
+ raise TypeError(f"Unsupported data type '{type(data)}'.")
306
+ if h > w:
307
+ size = (long_side, int(w * long_side / h))
308
+ else:
309
+ size = (int(h * long_side / w), long_side)
310
+
311
+ target_height, target_width = size
312
+
313
+ # Set interpolation method based on modality
314
+ if modality_format in ["depth", "normals"]:
315
+ interpolation = Image.Resampling.NEAREST
316
+ torch_interpolation = "nearest"
317
+ else:
318
+ interpolation = Image.Resampling.LANCZOS
319
+ torch_interpolation = "bilinear"
320
+
321
+ # Handle numpy arrays
322
+ if isinstance(data, np.ndarray):
323
+ pil_image = Image.fromarray(data)
324
+ resized_image = pil_image.resize((target_width, target_height), interpolation)
325
+ return np.array(resized_image)
326
+
327
+ # Handle PIL images
328
+ elif isinstance(data, Image.Image):
329
+ return data.resize((target_width, target_height), interpolation)
330
+
331
+ # Handle PyTorch tensors
332
+ elif isinstance(data, torch.Tensor):
333
+ if data.is_nested:
334
+ # special handling for nested tensors
335
+ return torch.stack(
336
+ [
337
+ resize(nested_tensor, size, scale, modality_format)
338
+ for nested_tensor in data
339
+ ]
340
+ )
341
+ original_dim = data.ndim
342
+ if original_dim == 2: # (H, W)
343
+ data = data.unsqueeze(0).unsqueeze(0) # Add channel and batch dimensions
344
+ elif original_dim == 3: # (C/B, H, W)
345
+ if modality_format == "depth":
346
+ data = data.unsqueeze(1) # add a channel dimension
347
+ else:
348
+ data = data.unsqueeze(0) # Add batch dimension
349
+ resized_tensor = F.interpolate(
350
+ data,
351
+ size=(target_height, target_width),
352
+ mode=torch_interpolation,
353
+ align_corners=False if torch_interpolation != "nearest" else None,
354
+ )
355
+ if original_dim == 2:
356
+ return resized_tensor.squeeze(0).squeeze(
357
+ 0
358
+ ) # Remove batch and channel dimensions
359
+ elif original_dim == 3:
360
+ if modality_format == "depth":
361
+ return resized_tensor.squeeze(1) # Remove channel dimension
362
+
363
+ return resized_tensor.squeeze(0) # Remove batch dimension
364
+ else:
365
+ return resized_tensor
366
+
367
+ else:
368
+ raise TypeError(f"Unsupported data type '{type(data)}'.")
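A small sketch of how stack() above behaves on a list of per-frame dicts: equal-shaped values are stacked densely, while mismatched shapes fall back to nested tensors. The shapes below are made up.

    import torch

    frames = [
        {"image": torch.rand(3, 128, 160), "intrinsics": torch.eye(3)},
        {"image": torch.rand(3, 96, 160), "intrinsics": torch.eye(3)},
    ]
    batch = stack(frames)
    assert batch["intrinsics"].shape == (2, 3, 3)   # same shape -> torch.stack
    assert batch["image"].is_nested                 # different H -> torch.nested.nested_tensor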
mapanything/utils/wai/scene_frame.py ADDED
@@ -0,0 +1,431 @@
1
+ import logging
2
+ import os
3
+ import random
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+
10
+ from mapanything.utils.wai.io import (
11
+ _load_readable,
12
+ _load_scene_meta,
13
+ get_processing_state,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def get_scene_frame_names(
20
+ cfg: dict | object,
21
+ root: Path | str | None = None,
22
+ scene_frames_fn: str | None = None,
23
+ keyframes: bool = True,
24
+ ) -> dict[str, list[str | float]] | None:
25
+ """
26
+ Retrieve scene frame names based on configuration and optional parameters.
27
+
28
+ This function determines the scene frame names by resolving the scene frame file
29
+ and applying any necessary filters based on the provided configuration.
30
+
31
+ Args:
32
+ cfg: Configuration object containing settings and parameters.
33
+ root: Optional root directory path. If not provided, it will be fetched from cfg.
34
+ scene_frames_fn: Optional scene frames file name. If not provided, it will be fetched from cfg.
35
+ keyframes: Optional, used only for a video. If True (default), return only keyframes (with camera poses).
36
+
37
+ Returns:
38
+ A dictionary mapping scene names to their respective frame names.
39
+ """
40
+ scene_frames_fn = (
41
+ cfg.get("scene_frames_fn") if scene_frames_fn is None else scene_frames_fn
42
+ )
43
+ scene_frame_names = None
44
+ if scene_frames_fn is not None:
45
+ # load scene_frames based on scene_frame file
46
+ scene_frame_names = _resolve_scene_frames_fn(scene_frames_fn)
47
+
48
+ scene_names = get_scene_names(
49
+ cfg,
50
+ root=root,
51
+ scene_names=(
52
+ list(scene_frame_names.keys()) if scene_frame_names is not None else None
53
+ ),
54
+ )
55
+ scene_frame_names = _resolve_scene_frame_names(
56
+ cfg,
57
+ scene_names,
58
+ root=root,
59
+ scene_frame_names=scene_frame_names,
60
+ keyframes=keyframes,
61
+ )
62
+ return scene_frame_names
63
+
64
+
65
+ def get_scene_names(
66
+ cfg: dict | object,
67
+ root: Path | str | None = None,
68
+ scene_names: list[str] | None = None,
69
+ shuffle: bool = False,
70
+ ) -> list[str]:
71
+ """
72
+ Retrieve scene names based on the provided configuration and optional parameters.
73
+
74
+ This function determines the scene names by checking the root directory for subdirectories
75
+ and applying any necessary filters based on the provided configuration.
76
+
77
+ Args:
78
+ cfg: Configuration object containing settings and parameters.
79
+ root: Optional root directory path. If not provided, it will be fetched from cfg.
80
+ scene_names: Optional list of scene names. If not provided, it will be determined from the root directory.
81
+ shuffle: Optional bool. Defaults to False. If True, the list of scene names is returned in random order.
82
+
83
+ Returns:
84
+ A list of scene names after applying any filters specified in the configuration.
85
+ """
86
+ root = cfg.get("root") if root is None else root
87
+ if root is not None:
88
+ # Check if the root exists
89
+ if not Path(root).exists():
90
+ raise IOError(f"Root directory does not exist: {root}")
91
+
92
+ # Check if the root is a directory
93
+ if not Path(root).is_dir():
94
+ raise IOError(f"Root directory is not a directory: {root}")
95
+
96
+ if scene_names is None:
97
+ scene_filters = cfg.get("scene_filters")
98
+ if (
99
+ scene_filters
100
+ and len(scene_filters) == 1
101
+ and isinstance(scene_filters[0], list)
102
+ and all(isinstance(entry, str) for entry in scene_filters[0])
103
+ ):
104
+ # Shortcut the scene_names if the scene_filters is only a list of scene names
105
+ scene_names = scene_filters[0]
106
+ else:
107
+ # List all subdirectories in the root as scenes
108
+ scene_names = sorted(
109
+ [entry.name for entry in os.scandir(root) if entry.is_dir()]
110
+ )
111
+ # Filter scenes based on scene_filters
112
+ scene_names = _filter_scenes(root, scene_names, cfg.get("scene_filters"))
113
+
114
+ # shuffle the list if needed (in place)
115
+ if shuffle:
116
+ random.shuffle(scene_names)
117
+
118
+ return scene_names
119
+
120
+
121
+ def _filter_scenes(
122
+ root: Path | str,
123
+ scene_names: list[str],
124
+ scene_filters: tuple | list | None,
125
+ ) -> list[str]:
126
+ if scene_filters is None:
127
+ return scene_names
128
+
129
+ if not isinstance(scene_filters, (tuple, list)):
130
+ raise ValueError("scene_filters must be a list or tuple")
131
+
132
+ for scene_filter in scene_filters:
133
+ if scene_filter in [None, "all"]:
134
+ pass
135
+
136
+ elif isinstance(scene_filter, (tuple, list)):
137
+ if len(scene_filter) == 0:
138
+ raise ValueError("scene_filter cannot be empty")
139
+
140
+ elif all(isinstance(x, int) for x in scene_filter):
141
+ if len(scene_filter) == 2:
142
+ # start/end index
143
+ scene_names = scene_names[scene_filter[0] : scene_filter[1]]
144
+ elif len(scene_filter) == 3:
145
+ # start/end/step
146
+ scene_names = scene_names[
147
+ scene_filter[0] : scene_filter[1] : scene_filter[2]
148
+ ]
149
+ else:
150
+ # omegaconf conversion issue (converts strings to integers whenever possible)
151
+ if str(scene_filter[0]) in scene_names:
152
+ scene_names = [str(s) for s in scene_filter]
153
+ else:
154
+ raise ValueError(
155
+ "scene_filter format [start_idx, end_idx] or [start_idx, end_idx, step_size] or [scene_name1, scene_name2, ...]"
156
+ )
157
+
158
+ elif all(isinstance(x, str) for x in scene_filter):
159
+ # explicit scene names
160
+ if set(scene_filter).issubset(set(scene_names)):
161
+ scene_names = list(scene_filter)
162
+ else:
163
+ logger.warning(
164
+ f"Scene(s) not available: {set(scene_filter) - set(scene_names)}"
165
+ )
166
+ scene_names = list(set(scene_names) & set(scene_filter))
167
+ else:
168
+ raise TypeError(
169
+ f"Scene filter type not supported: {type(scene_filter)}"
170
+ )
171
+
172
+ elif isinstance(scene_filter, dict):
173
+ # reserved key words
174
+ if modality := scene_filter.get("exists"):
175
+ scene_names = [
176
+ scene_name
177
+ for scene_name in scene_names
178
+ if Path(root, scene_name, modality).exists()
179
+ ]
180
+
181
+ elif modality := scene_filter.get("exists_not"):
182
+ scene_names = [
183
+ scene_name
184
+ for scene_name in scene_names
185
+ if not Path(root, scene_name, modality).exists()
186
+ ]
187
+
188
+ elif process_filter := scene_filter.get("process_state"):
189
+ # filter for where <process_key> has <process_state>
190
+ (process_key, process_state) = process_filter
191
+ filtered_scene_names = []
192
+ for scene_name in scene_names:
193
+ # load processing state and check for
194
+ processing_state = get_processing_state(Path(root, scene_name))
195
+ if "*" in process_key: # regex matching
196
+ for process_name in processing_state:
197
+ if re.match(process_key, process_name):
198
+ process_key = process_name
199
+ break
200
+ if process_key not in processing_state:
201
+ continue
202
+ if processing_state[process_key]["state"] == process_state:
203
+ filtered_scene_names.append(scene_name)
204
+ scene_names = filtered_scene_names
205
+
206
+ elif process_filter := scene_filter.get("process_state_not"):
207
+ # filter for where <process_key> does not have <process_state>
208
+ (process_key, process_state) = process_filter
209
+ filtered_scene_names = []
210
+ for scene_name in scene_names:
211
+ # load processing state and check for
212
+ try:
213
+ processing_state = get_processing_state(Path(root, scene_name))
214
+ except Exception:
215
+ filtered_scene_names.append(scene_name)
216
+ continue
217
+ if "*" in process_key: # regex matching
218
+ for process_name in processing_state:
219
+ if re.match(process_key, process_name):
220
+ process_key = process_name
221
+ break
222
+ if (process_key not in processing_state) or (
223
+ processing_state[process_key]["state"] != process_state
224
+ ):
225
+ filtered_scene_names.append(scene_name)
226
+ scene_names = filtered_scene_names
227
+
228
+ else:
229
+ raise ValueError(f"Scene filter not supported: {scene_filter}")
230
+
231
+ elif isinstance(scene_filter, str):
232
+ # regex
233
+ scene_names = [
234
+ scene_name
235
+ for scene_name in scene_names
236
+ if re.fullmatch(scene_filter, scene_name)
237
+ ]
238
+ else:
239
+ raise ValueError(f"Scene filter not supported: {scene_filter}")
240
+
241
+ return scene_names
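The filters above compose left to right; a sketch of a configuration that get_scene_names would accept (the root path and scene naming pattern are hypothetical):

    cfg = {
        "root": "/data/wai/some_dataset",      # hypothetical dataset root
        "scene_filters": [
            [0, 100, 2],                       # keep every 2nd of the first 100 scenes
            {"exists": "depth"},               # keep scenes that contain a depth/ modality
            r"scene_\d{4}",                    # then a full-match regex on the names
        ],
    }
    scene_names = get_scene_names(cfg)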
242
+
243
+
244
+ def _resolve_scene_frames_fn(scene_frames_fn: str) -> dict[str, list[str] | None]:
+     # support for file list in forms of lists or dicts
+     # containing scene_names [-> frames]
+     scene_frames_list = _load_readable(scene_frames_fn)
+     scene_frame_names = {}
+
+     # TODO: The following code seems unreachable as scene_frames_list is always a dict
+     if isinstance(scene_frames_list, (list, tuple)):
+         for entry in scene_frames_list:
+             if isinstance(entry, (tuple, list)):
+                 if (
+                     (len(entry) != 2)
+                     or (not isinstance(entry[0], str))
+                     or (not isinstance(entry[1], list))
+                 ):
+                     raise NotImplementedError(
+                         "Only supports lists of [<scene_name>, [frame_names]]"
+                     )
+                 scene_frame_names[entry[0]] = entry[1]
+             elif isinstance(entry, str):
+                 scene_frame_names[entry] = None
+             elif isinstance(entry, dict):
+                 # scene_name -> frames
+                 raise NotImplementedError("Dict entry not supported yet")
+             else:
+                 raise IOError(f"File list contains an entry of wrong format: {entry}")
+
+     elif isinstance(scene_frames_list, dict):
+         # scene_name -> frames
+         for scene_name, frame in scene_frames_list.items():
+             if isinstance(frame, (tuple, list)):
+                 scene_frame_names[scene_name] = frame
+             elif isinstance(frame, dict):
+                 if "frame_names" in frame:
+                     scene_frame_names[scene_name] = frame["frame_names"]
+                 else:
+                     raise IOError(f"Scene frames format not supported: {frame}")
+             elif frame is None:
+                 scene_frame_names[scene_name] = frame
+             else:
+                 raise IOError(f"Scene frames format not supported: {frame}")
+
+     else:
+         raise IOError(f"Scene frames format not supported: {scene_frames_list}")
+
+     return scene_frame_names
+
+
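For reference, `_resolve_scene_frames_fn` accepts the scene/frame listing either as a list (plain scene names or `[scene_name, [frame_names]]` pairs) or as a dict mapping scene names to a frame list, a `{"frame_names": [...]}` dict, or null. Hypothetical file contents, in whatever format `_load_readable` can parse (e.g. JSON); the names are placeholders:

    ["scene_0001", ["scene_0002", ["frame_0000", "frame_0010"]]]
    {"scene_0001": null, "scene_0002": ["frame_0000", "frame_0010"], "scene_0003": {"frame_names": ["frame_0005"]}}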
+ def _resolve_scene_frame_names(
+     cfg: dict | object,
+     scene_names: list[str],
+     root: Path | str | None = None,
+     scene_frame_names: dict[str, list[str | float] | None] | None = None,
+     keyframes: bool = True,
+ ) -> dict[str, list[str]]:
+     root = cfg.get("root") if root is None else root
+     if scene_frame_names is not None:
+         # restrict to the additional scene-level prefiltering
+         scene_frame_names = {
+             scene_name: scene_frame_names[scene_name] for scene_name in scene_names
+         }
+         # dict already loaded, apply additional filters
+         for scene_name, frame_names in scene_frame_names.items():
+             if frame_names is None:
+                 scene_meta = _load_scene_meta(
+                     Path(
+                         root, scene_name, cfg.get("scene_meta_path", "scene_meta.json")
+                     )
+                 )
+                 frame_names = [frame["frame_name"] for frame in scene_meta["frames"]]
+                 # TODO: add some logic for video keyframes
+
+             scene_frame_names[scene_name] = _filter_frame_names(
+                 root, frame_names, scene_name, cfg.get("frame_filters")
+             )
+     else:
+         scene_frame_names = {}
+         for scene_name in scene_names:
+             scene_meta = _load_scene_meta(
+                 Path(root, scene_name, cfg.get("scene_meta_path", "scene_meta.json"))
+             )
+             if not keyframes:
+                 frame_names = get_video_frames(scene_meta)
+                 if frame_names is None:
+                     keyframes = True
+             if keyframes:
+                 frame_names = [frame["frame_name"] for frame in scene_meta["frames"]]
+             frame_names = _filter_frame_names(
+                 root, frame_names, scene_name, cfg.get("frame_filters")
+             )
+             scene_frame_names[scene_name] = frame_names
+     return scene_frame_names
+
+
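`_resolve_scene_frame_names` reads only a few keys from the dataset config: `root`, an optional `scene_meta_path` (defaulting to `scene_meta.json`), and optional `frame_filters` (see the examples after `_filter_frame_names` below). A hypothetical config fragment; the path is a placeholder:

    cfg = {
        "root": "/data/wai/scannetpp",          # placeholder dataset root
        "scene_meta_path": "scene_meta.json",
        "frame_filters": [[0, 1000, 5]],        # first 1000 keyframes, every 5th
    }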
+ def _filter_frame_names(
+     root: Path | str,
+     frame_names: list[str],
+     scene_name: str,
+     frame_filters: list | tuple | None,
+ ) -> list[str]:
+     if frame_filters is None:
+         return frame_names
+
+     if not isinstance(frame_filters, (tuple, list)):
+         raise ValueError("frame_filters must be a list or tuple")
+
+     for frame_filter in frame_filters:
+         if frame_filter in [None, "all"]:
+             pass
+
+         elif isinstance(frame_filter, (tuple, list)):
+             if len(frame_filter) == 0:
+                 raise ValueError("frame_filter cannot be empty")
+
+             if isinstance(frame_filter[0], int):
+                 if len(frame_filter) == 2:
+                     # start/end index
+                     frame_names = frame_names[frame_filter[0] : frame_filter[1]]
+
+                 elif len(frame_filter) == 3:
+                     # start/end/step
+                     frame_names = frame_names[
+                         frame_filter[0] : frame_filter[1] : frame_filter[2]
+                     ]
+
+                 else:
+                     raise ValueError(
+                         "frame_filter format must be [start_idx, end_idx] or [start_idx, end_idx, step_size]"
+                     )
+             else:
+                 raise TypeError(
+                     f"frame_filter[0] type not supported: {type(frame_filter[0])}"
+                 )
+
+         elif isinstance(frame_filter, str):
+             # reserved key words
+             if match := re.match("exists: (.+)", frame_filter):
+                 modality = match.group(1)
+                 frame_names = [
+                     frame_name
+                     for frame_name in frame_names
+                     if any(Path(root, scene_name, modality).glob(f"{frame_name}.*"))
+                 ]
+
+             elif match := re.match("!exists: (.+)", frame_filter):
+                 modality = match.group(1)
+                 frame_names = [
+                     frame_name
+                     for frame_name in frame_names
+                     if not any(Path(root, scene_name, modality).glob(f"{frame_name}.*"))
+                 ]
+
+             else:  # general regex
+                 frame_names = [
+                     frame_name
+                     for frame_name in frame_names
+                     if re.match(frame_filter, frame_name)
+                 ]
+
+         else:
+             raise ValueError(f"frame_filter type not supported: {type(frame_filter)}")
+
+     return frame_names
+
+
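The frame filters above are applied left to right: `None` / `"all"` is a no-op, integer lists slice the ordered frame list, `exists:` / `!exists:` keep frames with (or without) a file under the given modality folder, and any other string is matched as a regex against frame names. Illustrative values only; the modality and name pattern are placeholders:

    frame_filters = [
        [0, 1000, 10],       # frames 0..999, every 10th
        "exists: depth",     # keep frames that have <scene>/depth/<frame_name>.*
        r"frame_\d+",        # keep frames whose name matches the regex
    ]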
+ def get_video_frames(scene_meta: dict[str, Any]):
+     """
+     Return the names of video frames.
+
+     Args:
+         scene_meta: dictionary with scene_meta data.
+
+     Returns:
+         A list of video frame names, or None if the scene has no image modality.
+     """
+     image_modality = [mod for mod in scene_meta["frame_modalities"] if "image" in mod]
+     if len(image_modality) > 0:
+         image_modality = scene_meta["frame_modalities"][image_modality[0]]
+         if "chunks" in image_modality:
+             file_list = image_modality["chunks"]
+         else:
+             file_list = [image_modality]
+         frame_names = []
+         for chunk in file_list:
+             start, end, fps = chunk["start"], chunk["end"], chunk["fps"]
+             chunk_frame_names = np.arange(start, end, 1.0 / fps).tolist()
+             frame_names += chunk_frame_names
+         return frame_names
+     return None
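`get_video_frames` expands the image modality's chunk metadata into timestamps sampled at each chunk's fps. A minimal sketch with a made-up scene_meta containing one 1-second chunk at 10 fps:

    scene_meta = {"frame_modalities": {"image": {"chunks": [{"start": 0.0, "end": 1.0, "fps": 10}]}}}
    get_video_frames(scene_meta)  # -> [0.0, 0.1, ..., 0.9], i.e. np.arange(0.0, 1.0, 0.1).tolist()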
mapanything/utils/wai/semantics.py ADDED
@@ -0,0 +1,40 @@
+ """
+ This utils script contains a port of wai-core semantics methods for MapAnything.
+ """
+
+ import numpy as np
+ from PIL import Image
+
+ INVALID_ID = 0
+ INVALID_COLOR = (0, 0, 0)
+
+
+ def load_semantic_color_mapping(filename: str = "colors_fps_5k.npz") -> np.ndarray:
+     """Loads a precomputed colormap."""
+     from mapanything.utils.wai.core import WAI_COLORMAP_PATH
+
+     return np.load(WAI_COLORMAP_PATH / filename).get("arr_0")
+
+
+ def apply_id_to_color_mapping(
+     data_id: np.ndarray | Image.Image,
+     semantic_color_mapping: np.ndarray,
+ ) -> tuple[np.ndarray, dict[int, tuple[int, int, int]]]:
+     """Maps semantic class/instance IDs to RGB colors."""
+     if isinstance(data_id, Image.Image):
+         data_id = np.array(data_id)
+
+     max_color_id = semantic_color_mapping.shape[0] - 1
+     max_data_id = data_id.max()
+     if max_data_id > max_color_id:
+         raise ValueError("The provided color palette does not have enough colors!")
+
+     # Create a palette containing the id -> color mappings of the input data IDs
+     unique_indices = np.unique(data_id).tolist()
+     color_palette = {
+         index: semantic_color_mapping[index, :].tolist() for index in unique_indices
+     }
+
+     data_colors = semantic_color_mapping[data_id]
+
+     return data_colors, color_palette
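A minimal usage sketch of the two helpers above; the instance-ID map is made up, while the colormap file ships with this commit as colors_fps_5k.npz:

    import numpy as np
    from mapanything.utils.wai.semantics import (
        apply_id_to_color_mapping,
        load_semantic_color_mapping,
    )

    colormap = load_semantic_color_mapping()              # (N, 3) palette loaded from colors_fps_5k.npz
    instance_ids = np.zeros((480, 640), dtype=np.int64)   # hypothetical instance-ID map; 0 is INVALID_ID
    instance_ids[100:200, 100:200] = 7
    colors, palette = apply_id_to_color_mapping(instance_ids, colormap)
    # colors: (480, 640, 3) RGB array; palette: {0: [r, g, b], 7: [r, g, b]}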
requirements.txt CHANGED
@@ -18,4 +18,5 @@ einops
  requests
  psutil
  tqdm
+ safetensors
  uniception==0.1.4