Spaces: Running on Zero

nkeetha committed
Commit · 19d7794
1 Parent(s): 75d19ab

Update Model & Examples
app.py CHANGED

@@ -124,7 +124,9 @@ def run_model(
     # apply_mask: Whether to apply the non-ambiguous mask to the output. Defaults to True.
     # mask_edges: Whether to compute an edge mask based on normals and depth and apply it to the output. Defaults to True.
     # Use checkbox values - mask_edges is set to True by default since there's no UI control for it
-    outputs = model.infer(
+    outputs = model.infer(
+        views, apply_mask=apply_mask, mask_edges=True, memory_efficient_inference=False
+    )
 
     # Convert predictions to format expected by visualization
     predictions = {}
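For orientation, a minimal sketch of how application code such as app.py can now call infer with the new flag. The checkpoint id, the "img" view key, and the input resolution are illustrative assumptions, not something this commit specifies.

import torch

from mapanything.models.mapanything.model import MapAnything

# Hypothetical checkpoint id; MapAnything inherits from_pretrained via PyTorchModelHubMixin.
model = MapAnything.from_pretrained("org/map-anything-checkpoint").eval().to("cuda")

# Assumed minimal views format: one dict per view with an image tensor under "img".
views = [
    {"img": torch.rand(1, 3, 518, 518, device="cuda")},
    {"img": torch.rand(1, 3, 518, 518, device="cuda")},
]

with torch.no_grad():
    outputs = model.infer(
        views,
        apply_mask=True,                    # mask out ambiguous regions
        mask_edges=True,                    # edge mask from normals and depth
        memory_efficient_inference=False,   # set True to mini-batch the dense head
    )
# outputs is a list with one prediction dict per input view.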
examples/Cat-Girl/Cat_Girl.png
ADDED · Git LFS Details

examples/Downtown/Downtown.jpg
ADDED · Git LFS Details

examples/Office/Office.jpg
ADDED · Git LFS Details

examples/Safari-Car/Safari_Car.jpg
ADDED · Git LFS Details
mapanything/models/mapanything/model.py CHANGED

@@ -4,7 +4,7 @@ MapAnything model class defined using UniCeption modules.
 
 import warnings
 from functools import partial
-from typing import Any, Callable, Dict, List, Type, Union
+from typing import Any, Callable, Dict, List, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
@@ -1255,7 +1255,221 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
 
         return fused_all_encoder_features_across_views
 
-    def forward(self, views):
+    def _compute_adaptive_minibatch_size(
+        self,
+        memory_safety_factor: float = 0.95,
+    ) -> int:
+        """
+        Compute adaptive minibatch size based on available PyTorch memory.
+
+        Args:
+            memory_safety_factor: Safety factor to avoid OOM (0.95 = use 95% of available memory)
+
+        Returns:
+            Computed minibatch size
+        """
+        device = self.device
+
+        if device.type == "cuda":
+            # Get available GPU memory
+            torch.cuda.empty_cache()
+            available_memory = torch.cuda.mem_get_info()[0]  # Free memory in bytes
+            usable_memory = (
+                available_memory * memory_safety_factor
+            )  # Use safety factor to avoid OOM
+        else:
+            # For non-CUDA devices, use conservative default
+            print(
+                "Non-CUDA device detected. Using conservative default minibatch size of 1 for memory efficient dense prediction head inference."
+            )
+            return 1
+
+        # Determine minibatch size based on available memory
+        max_estimated_memory_per_sample = (
+            680 * 1024 * 1024
+        )  # 680 MB per sample (upper bound profiling using a 518 x 518 input)
+        computed_minibatch_size = int(usable_memory / max_estimated_memory_per_sample)
+        if computed_minibatch_size < 1:
+            computed_minibatch_size = 1
+
+        return computed_minibatch_size
+
+    def downstream_dense_head(
+        self,
+        dense_head_inputs: Union[torch.Tensor, List[torch.Tensor]],
+        img_shape: Tuple[int, int],
+    ):
+        """
+        Run the downstream dense prediction head
+        """
+        if self.pred_head_type == "linear":
+            dense_head_outputs = self.dense_head(
+                PredictionHeadInput(last_feature=dense_head_inputs)
+            )
+            dense_final_outputs = self.dense_adaptor(
+                AdaptorInput(
+                    adaptor_feature=dense_head_outputs.decoded_channels,
+                    output_shape_hw=img_shape,
+                )
+            )
+        elif self.pred_head_type in ["dpt", "dpt+pose"]:
+            dense_head_outputs = self.dense_head(
+                PredictionHeadLayeredInput(
+                    list_features=dense_head_inputs,
+                    target_output_shape=img_shape,
+                )
+            )
+            dense_final_outputs = self.dense_adaptor(
+                AdaptorInput(
+                    adaptor_feature=dense_head_outputs.decoded_channels,
+                    output_shape_hw=img_shape,
+                )
+            )
+        else:
+            raise ValueError(
+                f"Invalid pred_head_type: {self.pred_head_type}. Valid options: ['linear', 'dpt', 'dpt+pose']"
+            )
+
+        return dense_final_outputs
+
+    def downstream_head(
+        self,
+        dense_head_inputs: Union[torch.Tensor, List[torch.Tensor]],
+        scale_head_inputs: torch.Tensor,
+        img_shape: Tuple[int, int],
+        memory_efficient_inference: bool = False,
+    ):
+        """
+        Run Prediction Heads & Post-Process Outputs
+        """
+        # Get device
+        device = self.device
+
+        # Use mini-batch inference to run the dense prediction head (the memory bottleneck)
+        # This saves memory and is slower than running the dense prediction head in one go
+        if memory_efficient_inference:
+            # Obtain the batch size of the dense head inputs
+            if self.pred_head_type == "linear":
+                batch_size = dense_head_inputs.shape[0]
+            elif self.pred_head_type in ["dpt", "dpt+pose"]:
+                batch_size = dense_head_inputs[0].shape[0]
+            else:
+                raise ValueError(
+                    f"Invalid pred_head_type: {self.pred_head_type}. Valid options: ['linear', 'dpt', 'dpt+pose']"
+                )
+
+            # Compute the mini batch size and number of mini batches adaptively based on available memory
+            minibatch = self._compute_adaptive_minibatch_size()
+            num_batches = (batch_size + minibatch - 1) // minibatch
+
+            # Run prediction for each mini-batch
+            dense_final_outputs_list = []
+            pose_final_outputs_list = [] if self.pred_head_type == "dpt+pose" else None
+            for batch_idx in range(num_batches):
+                start_idx = batch_idx * minibatch
+                end_idx = min((batch_idx + 1) * minibatch, batch_size)
+
+                # Get the inputs for the current mini-batch
+                if self.pred_head_type == "linear":
+                    dense_head_inputs_batch = dense_head_inputs[start_idx:end_idx]
+                elif self.pred_head_type in ["dpt", "dpt+pose"]:
+                    dense_head_inputs_batch = [
+                        x[start_idx:end_idx] for x in dense_head_inputs
+                    ]
+                else:
+                    raise ValueError(
+                        f"Invalid pred_head_type: {self.pred_head_type}. Valid options: ['linear', 'dpt', 'dpt+pose']"
+                    )
+
+                # Dense prediction (mini-batched)
+                dense_final_outputs_batch = self.downstream_dense_head(
+                    dense_head_inputs_batch, img_shape
+                )
+                dense_final_outputs_list.append(dense_final_outputs_batch)
+
+                # Pose prediction (mini-batched)
+                if self.pred_head_type == "dpt+pose":
+                    pose_head_inputs_batch = dense_head_inputs[-1][start_idx:end_idx]
+                    pose_head_outputs_batch = self.pose_head(
+                        PredictionHeadInput(last_feature=pose_head_inputs_batch)
+                    )
+                    pose_final_outputs_batch = self.pose_adaptor(
+                        AdaptorInput(
+                            adaptor_feature=pose_head_outputs_batch.decoded_channels,
+                            output_shape_hw=img_shape,
+                        )
+                    )
+                    pose_final_outputs_list.append(pose_final_outputs_batch)
+
+            # Concatenate the dense prediction head outputs from all mini-batches
+            available_keys = dense_final_outputs_batch.__dict__.keys()
+            dense_pred_data_dict = {
+                key: torch.cat(
+                    [getattr(output, key) for output in dense_final_outputs_list], dim=0
+                )
+                for key in available_keys
+            }
+            dense_final_outputs = dense_final_outputs_batch.__class__(
+                **dense_pred_data_dict
+            )
+
+            # Concatenate the pose prediction head outputs from all mini-batches
+            pose_final_outputs = None
+            if self.pred_head_type == "dpt+pose":
+                available_keys = pose_final_outputs_batch.__dict__.keys()
+                pose_pred_data_dict = {
+                    key: torch.cat(
+                        [getattr(output, key) for output in pose_final_outputs_list],
+                        dim=0,
+                    )
+                    for key in available_keys
+                }
+                pose_final_outputs = pose_final_outputs_batch.__class__(
+                    **pose_pred_data_dict
+                )
+
+            # Clear CUDA cache for better memory efficiency
+            if device.type == "cuda":
+                torch.cuda.empty_cache()
+        else:
+            # Run prediction for all (batch_size * num_views) in one go
+            # Dense prediction
+            dense_final_outputs = self.downstream_dense_head(
+                dense_head_inputs, img_shape
+            )
+
+            # Pose prediction
+            pose_final_outputs = None
+            if self.pred_head_type == "dpt+pose":
+                pose_head_outputs = self.pose_head(
+                    PredictionHeadInput(last_feature=dense_head_inputs[-1])
+                )
+                pose_final_outputs = self.pose_adaptor(
+                    AdaptorInput(
+                        adaptor_feature=pose_head_outputs.decoded_channels,
+                        output_shape_hw=img_shape,
+                    )
+                )
+
+        # Scale prediction is lightweight, so we can run it in one go
+        scale_head_output = self.scale_head(
+            PredictionHeadTokenInput(last_feature=scale_head_inputs)
+        )
+        scale_final_output = self.scale_adaptor(
+            AdaptorInput(
+                adaptor_feature=scale_head_output.decoded_channels,
+                output_shape_hw=img_shape,
+            )
+        )
+        scale_final_output = scale_final_output.value.squeeze(-1)  # (B, 1, 1) -> (B, 1)
+
+        # Clear CUDA cache for better memory efficiency
+        if memory_efficient_inference and device.type == "cuda":
+            torch.cuda.empty_cache()
+
+        return dense_final_outputs, pose_final_outputs, scale_final_output
+
+    def forward(self, views, memory_efficient_inference=False):
         """
         Forward pass performing the following operations:
         1. Encodes the N input views (images).
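To make the sizing heuristic above concrete, here is a standalone sketch of the same arithmetic. The free-memory figure is invented for illustration; the 680 MB per-sample bound and the 0.95 safety factor come from the code above, and on CUDA the method reads free memory from torch.cuda.mem_get_info() instead.

# Standalone sketch of the adaptive minibatch heuristic.
free_bytes = 20 * 1024**3          # pretend 20 GiB of free GPU memory (illustrative)
safety_factor = 0.95               # default memory_safety_factor
per_sample_bytes = 680 * 1024**2   # profiled upper bound for a 518 x 518 input

usable = free_bytes * safety_factor
minibatch = max(1, int(usable / per_sample_bytes))
print(minibatch)  # -> 28, i.e. 28 view-samples per dense-head pass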
@@ -1279,6 +1493,7 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
                 "camera_pose_quats" (tensor): Camera pose quaternions. Tensor of shape (B, 4). Camera pose is opencv (RDF) cam2world transformation.
                 "camera_pose_trans" (tensor): Camera pose translations. Tensor of shape (B, 3). Camera pose is opencv (RDF) cam2world transformation.
                 "is_metric_scale" (tensor): Boolean tensor indicating whether the geometric inputs are in metric scale or not. Tensor of shape (B, 1).
+            memory_efficient_inference (bool): Whether to use memory efficient inference or not. This runs the dense prediction head (the memory bottleneck) in a memory efficient manner. Default is False.
 
         Returns:
             List[dict]: A list containing the final outputs for all N views.

@@ -1376,72 +1591,25 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
                 f"Invalid pred_head_type: {self.pred_head_type}. Valid options: ['linear', 'dpt', 'dpt+pose']"
             )
 
-        # Downstream task prediction
         with torch.autocast("cuda", enabled=False):
-            #
+            # Prepare inputs for the downstream heads
             if self.pred_head_type == "linear":
-                dense_head_outputs = self.dense_head(
-                    PredictionHeadInput(last_feature=dense_head_inputs)
-                )
-                dense_final_outputs = self.dense_adaptor(
-                    AdaptorInput(
-                        adaptor_feature=dense_head_outputs.decoded_channels,
-                        output_shape_hw=img_shape,
-                    )
-                )
-            elif self.pred_head_type == "dpt":
-                dense_head_outputs = self.dense_head(
-                    PredictionHeadLayeredInput(
-                        list_features=dense_head_inputs_list,
-                        target_output_shape=img_shape,
-                    )
-                )
-                dense_final_outputs = self.dense_adaptor(
-                    AdaptorInput(
-                        adaptor_feature=dense_head_outputs.decoded_channels,
-                        output_shape_hw=img_shape,
-                    )
-                )
-            elif self.pred_head_type == "dpt+pose":
-                dense_head_outputs = self.dense_head(
-                    PredictionHeadLayeredInput(
-                        list_features=dense_head_inputs_list,
-                        target_output_shape=img_shape,
-                    )
-                )
-                dense_final_outputs = self.dense_adaptor(
-                    AdaptorInput(
-                        adaptor_feature=dense_head_outputs.decoded_channels,
-                        output_shape_hw=img_shape,
-                    )
-                )
-                pose_head_outputs = self.pose_head(
-                    PredictionHeadInput(last_feature=dense_head_inputs_list[-1])
-                )
-                pose_final_outputs = self.pose_adaptor(
-                    AdaptorInput(
-                        adaptor_feature=pose_head_outputs.decoded_channels,
-                        output_shape_hw=img_shape,
-                    )
-                )
-            else:
-                raise ValueError(
-                    f"Invalid pred_head_type: {self.pred_head_type}. Valid options: ['linear', 'dpt', 'dpt+pose']"
-                )
-            scale_head_output = self.scale_head(
-                PredictionHeadTokenInput(
-                    last_feature=final_info_sharing_multi_view_feat.additional_token_features
-                )
+                dense_head_inputs = dense_head_inputs
+            elif self.pred_head_type in ["dpt", "dpt+pose"]:
+                dense_head_inputs = dense_head_inputs_list
+            scale_head_inputs = (
+                final_info_sharing_multi_view_feat.additional_token_features
             )
-            scale_final_output = self.scale_adaptor(
-                AdaptorInput(
-                    adaptor_feature=scale_head_output.decoded_channels,
-                    output_shape_hw=img_shape,
+
+            # Run the downstream heads
+            dense_final_outputs, pose_final_outputs, scale_final_output = (
+                self.downstream_head(
+                    dense_head_inputs=dense_head_inputs,
+                    scale_head_inputs=scale_head_inputs,
+                    img_shape=img_shape,
+                    memory_efficient_inference=memory_efficient_inference,
                 )
             )
-            scale_final_output = scale_final_output.value.squeeze(
-                -1
-            )  # (B, 1, 1) -> (B, 1)
 
         # Prepare the final scene representation for all views
         if self.scene_rep_type in [
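The memory-efficient path that forward now delegates to reassembles the per-mini-batch head outputs by concatenating every field and rebuilding the output container. A toy illustration of that __class__(**{... torch.cat ...}) pattern follows, using a stand-in dataclass rather than the real adaptor output types, which are not shown in this diff.

from dataclasses import dataclass

import torch


@dataclass
class ToyHeadOutput:  # stand-in for the real adaptor output container
    value: torch.Tensor
    confidence: torch.Tensor


chunks = [
    ToyHeadOutput(value=torch.rand(2, 3), confidence=torch.rand(2, 1)),
    ToyHeadOutput(value=torch.rand(1, 3), confidence=torch.rand(1, 1)),
]

# Same pattern as downstream_head: concatenate every field across mini-batch
# chunks, then rebuild an instance of the same output class.
merged = chunks[-1].__class__(
    **{
        key: torch.cat([getattr(c, key) for c in chunks], dim=0)
        for key in chunks[-1].__dict__.keys()
    }
)
print(merged.value.shape)  # torch.Size([3, 3])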
@@ -1774,7 +1942,7 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
             "ray_dirs_prob": 1.0 if use_calibration else 0.0,
             "depth_prob": 1.0 if use_depth else 0.0,
             "cam_prob": 1.0 if use_pose else 0.0,
-            "sparse_depth_prob": 0.0,
+            "sparse_depth_prob": 0.0,
             "depth_scale_norm_all_prob": 0.0 if use_depth_scale else 1.0,
             "pose_scale_norm_all_prob": 0.0 if use_pose_scale else 1.0,
         }

@@ -1791,6 +1959,7 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
     def infer(
         self,
         views: List[Dict[str, Any]],
+        memory_efficient_inference: bool = False,
         use_amp: bool = True,
         amp_dtype: str = "bf16",
         apply_mask: bool = True,

@@ -1826,6 +1995,7 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
                 - 'idx': List[int] where length of list is B - index info for each view
                 - 'true_shape': List[tuple] where length of list is B - true shape info (H, W) for each view
 
+            memory_efficient_inference: Whether to use memory-efficient inference for dense prediction heads (trades off speed). Defaults to False.
             use_amp: Whether to use automatic mixed precision for faster inference. Defaults to True.
             amp_dtype: The dtype to use for mixed precision. Defaults to "bf16" (bfloat16). Options: "fp16", "bf16", "fp32".
             apply_mask: Whether to apply the non-ambiguous mask to the output. Defaults to True.

@@ -1915,7 +2085,9 @@ class MapAnything(nn.Module, PyTorchModelHubMixin):
 
         # Run the model
         with torch.autocast("cuda", enabled=bool(use_amp), dtype=amp_dtype):
-            preds = self.forward(
+            preds = self.forward(
+                processed_views, memory_efficient_inference=memory_efficient_inference
+            )
 
         # Post-process the model outputs
         preds = postprocess_model_outputs_for_inference(
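One compatibility note on the updated infer signature: memory_efficient_inference now sits directly after views, so any caller that previously passed use_amp or amp_dtype positionally would silently bind those values to the new flag. Keyword arguments avoid this. A short sketch, reusing the model and views placeholders from the example after the app.py diff:

# Before this commit, a positional call like
#     model.infer(views, True, "bf16")
# bound True to use_amp; with the new signature it would bind True to
# memory_efficient_inference instead, so prefer keywords:
outputs = model.infer(
    views,
    memory_efficient_inference=True,  # mini-batch the dense head to save memory
    use_amp=True,
    amp_dtype="bf16",
    apply_mask=True,
)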
mapanything/utils/hf_utils/visual_util.py CHANGED

@@ -159,7 +159,7 @@ def predictions_to_glb(
     as_mesh=True,
 ) -> trimesh.Scene:
     """
-    Converts
+    Converts MapAnything predictions to a 3D scene represented as a GLB file.
 
     Args:
         predictions (dict): Dictionary containing model predictions with keys:
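For completeness, a hedged sketch of how the documented helper is typically driven. Only the predictions argument, the as_mesh flag, and the trimesh.Scene return type are visible in this diff; the predictions dict itself is a placeholder assembled elsewhere (for example in app.py), and the export step is a standard trimesh call rather than part of this commit.

import trimesh

from mapanything.utils.hf_utils.visual_util import predictions_to_glb

# `predictions` is the dict assembled in app.py from the model outputs
# (its exact keys are documented in the function's Args section).
scene: trimesh.Scene = predictions_to_glb(predictions, as_mesh=True)
scene.export("scene.glb")  # standard trimesh export to a GLB file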