# wzw
"""
Visual utilities for HuggingFace integration.

References: https://github.com/facebookresearch/vggt
"""
import copy
import os
from typing import Tuple

import cv2
import matplotlib
import numpy as np
import requests
import trimesh
from scipy.spatial.transform import Rotation

def segment_sky(image_path, onnx_session):
    """
    Segments sky from an image using an ONNX model.

    Thanks to https://github.com/xiongzhu666/Sky-Segmentation-and-Post-processing
    for providing the model.

    Args:
        image_path: Path to input image
        onnx_session: ONNX runtime session with loaded model

    Returns:
        np.ndarray: Binary mask where 255 indicates non-sky regions
    """
    image = cv2.imread(image_path)
    result_map = run_skyseg(onnx_session, [320, 320], image)
    # Resize the result map back to the original image size
    result_map_original = cv2.resize(result_map, (image.shape[1], image.shape[0]))

    # Threshold and invert so that 255 = non-sky and 0 = sky: pixels whose
    # normalized model response falls below 32 are marked as non-sky.
    output_mask = np.zeros_like(result_map_original)
    output_mask[result_map_original < 32] = 255
    return output_mask

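# Usage sketch (hedged): "skyseg.onnx" and the image path below are placeholder
# names for illustration, not files shipped with this module.
#
#   import onnxruntime
#   session = onnxruntime.InferenceSession("skyseg.onnx")
#   non_sky = segment_sky("frame_0001.png", session)  # uint8 mask, 255 = non-sky
#   cv2.imwrite("non_sky_mask.png", non_sky)
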
def run_skyseg(onnx_session, input_size, image):
    """
    Runs sky segmentation inference using an ONNX model.

    Args:
        onnx_session: ONNX runtime session
        input_size: Target size for model input (width, height)
        image: Input image in BGR format

    Returns:
        np.ndarray: Segmentation mask, min-max normalized to [0, 255] as uint8
    """
    # Pre-process: resize, BGR -> RGB, ImageNet standardization, HWC -> CHW
    # transpose, float32 cast
    temp_image = copy.deepcopy(image)
    resize_image = cv2.resize(temp_image, dsize=(input_size[0], input_size[1]))
    x = cv2.cvtColor(resize_image, cv2.COLOR_BGR2RGB)
    x = np.array(x, dtype=np.float32)
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    x = (x / 255 - mean) / std
    x = x.transpose(2, 0, 1)
    # input_size is (width, height), so the NCHW reshape takes height first
    x = x.reshape(-1, 3, input_size[1], input_size[0]).astype("float32")

    # Inference
    input_name = onnx_session.get_inputs()[0].name
    output_name = onnx_session.get_outputs()[0].name
    onnx_result = onnx_session.run([output_name], {input_name: x})

    # Post-process: squeeze, min-max normalize, scale to [0, 255], cast to uint8
    onnx_result = np.array(onnx_result).squeeze()
    min_value = np.min(onnx_result)
    max_value = np.max(onnx_result)
    onnx_result = (onnx_result - min_value) / (max_value - min_value)
    onnx_result *= 255
    onnx_result = onnx_result.astype("uint8")
    return onnx_result

def download_file_from_url(url, filename):
    """Downloads a file from a Hugging Face model repo, handling redirects."""
    try:
        # Get the redirect URL without following it automatically
        response = requests.get(url, allow_redirects=False)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        if response.status_code == 302:  # Expecting a redirect
            redirect_url = response.headers["Location"]
            response = requests.get(redirect_url, stream=True)
            response.raise_for_status()
        else:
            print(f"Unexpected status code: {response.status_code}")
            return

        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded {filename} successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")

def create_image_mesh(
    *image_data: np.ndarray,
    mask: np.ndarray = None,
    triangulate: bool = False,
    return_vertex_indices: bool = False,
) -> Tuple[np.ndarray, ...]:
    """
    Create a mesh from image data using pixel coordinates as vertices and grid
    connections as faces.

    Args:
        *image_data (np.ndarray): Image arrays with shape (height, width, [channels])
        mask (np.ndarray, optional): Boolean mask with shape (height, width). Defaults to None.
        triangulate (bool): Convert quad faces to triangular faces. Defaults to False.
        return_vertex_indices (bool): Include vertex indices in output. Defaults to False.

    Returns:
        faces (np.ndarray): Face connectivity array. Shape (N, 4) for quads or (N, 3) for triangles
        *vertex_data (np.ndarray): Vertex attributes corresponding to input image_data
        vertex_indices (np.ndarray, optional): Original vertex indices if return_vertex_indices=True
    """
    # Validate inputs
    assert (len(image_data) > 0) or (mask is not None), "Need at least one image or mask"
    if mask is None:
        height, width = image_data[0].shape[:2]
    else:
        height, width = mask.shape

    # Check that all images share the same spatial dimensions
    for img in image_data:
        assert img.shape[:2] == (height, width), "All images must have same height and width"

    # Create quad faces connecting neighboring pixels in the first row
    base_quad = np.stack([
        np.arange(0, width - 1, dtype=np.int32),           # bottom-left
        np.arange(width, 2 * width - 1, dtype=np.int32),   # top-left
        np.arange(1 + width, 2 * width, dtype=np.int32),   # top-right
        np.arange(1, width, dtype=np.int32),               # bottom-right
    ], axis=1)

    # Replicate the quad pattern for all rows
    row_offsets = np.arange(0, (height - 1) * width, width, dtype=np.int32)
    faces = (row_offsets[:, None, None] + base_quad[None, :, :]).reshape((-1, 4))

    if mask is None:
        # No masking - use all faces and vertices
        if triangulate:
            faces = _convert_quads_to_triangles(faces)
        output = [faces]
        for img in image_data:
            output.append(img.reshape(-1, *img.shape[2:]))
        if return_vertex_indices:
            output.append(np.arange(height * width, dtype=np.int32))
        return tuple(output)
    else:
        # Apply mask - only keep faces where all 4 corners are valid
        valid_quads = (
            mask[:-1, :-1] & mask[1:, :-1] &
            mask[1:, 1:] & mask[:-1, 1:]
        ).ravel()
        faces = faces[valid_quads]
        if triangulate:
            faces = _convert_quads_to_triangles(faces)

        # Remove unused vertices and remap face indices
        num_face_vertices = faces.shape[-1]
        unique_vertices, remapped_indices = np.unique(faces, return_inverse=True)
        faces = remapped_indices.astype(np.int32).reshape(-1, num_face_vertices)

        output = [faces]
        for img in image_data:
            flattened_img = img.reshape(-1, *img.shape[2:])
            output.append(flattened_img[unique_vertices])
        if return_vertex_indices:
            output.append(unique_vertices)
        return tuple(output)

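# Minimal sketch of create_image_mesh on synthetic data (shapes assumed for
# illustration): a 4x5 pixel grid yields (h-1)*(w-1) = 12 quads, i.e. 24
# triangles after triangulation.
#
#   h, w = 4, 5
#   ys, xs = np.mgrid[0:h, 0:w].astype(np.float32)
#   points = np.stack([xs, ys, np.ones_like(xs)], axis=-1)  # (h, w, 3) grid
#   colors = np.random.rand(h, w, 3).astype(np.float32)
#   keep = np.ones((h, w), dtype=bool)
#   faces, verts, cols = create_image_mesh(points, colors, mask=keep, triangulate=True)
#   # faces: (24, 3) triangles; verts/cols: one row per kept pixel
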
def _convert_quads_to_triangles(quad_faces: np.ndarray) -> np.ndarray:
    """Convert quadrilateral faces to triangular faces."""
    if quad_faces.shape[-1] == 3:
        return quad_faces  # Already triangular

    num_vertices_per_face = quad_faces.shape[-1]
    triangle_indices = np.stack([
        np.zeros(num_vertices_per_face - 2, dtype=int),      # Fan apex (first vertex)
        np.arange(1, num_vertices_per_face - 1, dtype=int),  # Sequential vertices
        np.arange(2, num_vertices_per_face, dtype=int),      # Next sequential vertices
    ], axis=1)
    return quad_faces[:, triangle_indices].reshape((-1, 3))

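# Fan-triangulation example: a single quad [0, 1, 2, 3] becomes the two
# triangles [0, 1, 2] and [0, 2, 3].
#
#   quads = np.array([[0, 1, 2, 3]])
#   _convert_quads_to_triangles(quads)  # -> [[0, 1, 2], [0, 2, 3]]
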
def convert_predictions_to_glb_scene(
    predictions,
    filter_by_frames="all",
    show_camera=True,
    mask_sky_bg=False,
    mask_ambiguous=False,
    as_mesh=True,
) -> trimesh.Scene:
    """
    Converts model predictions to a 3D scene represented as a GLB file.

    Args:
        predictions (dict): Dictionary containing model predictions with keys:
            - world_points: 3D point coordinates (S, H, W, 3)
            - images: Input images (S, H, W, 3)
            - camera_poses: Camera extrinsic matrices (S, 4, 4)
        filter_by_frames (str): Frame filter specification (default: "all")
        show_camera (bool): Include camera visualization (default: True)
        mask_sky_bg (bool): Mask out sky background pixels (default: False)
        mask_ambiguous (bool): Apply final mask to filter ambiguous predictions (default: False)
        as_mesh (bool): Represent the data as a mesh instead of a point cloud (default: True)

    Returns:
        trimesh.Scene: Processed 3D scene containing point cloud/mesh and cameras

    Raises:
        ValueError: If input predictions structure is invalid
    """
    if not isinstance(predictions, dict):
        raise ValueError("predictions must be a dictionary")
| print("Building GLB scene") | |
| # Parse frame selection from filter string | |
| target_frame_index = None | |
| if filter_by_frames not in ["all", "All"]: | |
| try: | |
| # Extract numeric index before colon separator | |
| target_frame_index = int(filter_by_frames.split(":")[0]) | |
| except (ValueError, IndexError): | |
| pass | |
| # Validate required data in predictions | |
| print("Using Pointmap Branch") | |
| if "world_points" not in predictions: | |
| raise ValueError( | |
| "world_points not found in predictions. Pointmap Branch requires 'world_points' key. " | |
| "Depthmap and Camera branches have been removed." | |
| ) | |
| # Extract prediction data | |
| point_cloud_3d = predictions["world_points"] | |
| input_images = predictions["images"] | |
| extrinsic_matrices = predictions["camera_poses"] | |
| ambiguity_mask = predictions["final_mask"] | |
| sky_region_mask = predictions["sky_mask"] | |
| # Filter to single frame if specified | |
| if target_frame_index is not None: | |
| point_cloud_3d = point_cloud_3d[target_frame_index][None] | |
| input_images = input_images[target_frame_index][None] | |
| extrinsic_matrices = extrinsic_matrices[target_frame_index][None] | |
| ambiguity_mask = ambiguity_mask[target_frame_index][None] | |
| sky_region_mask = sky_region_mask[target_frame_index][None] | |
| # Flatten 3D points to vertex array | |
| flattened_vertices = point_cloud_3d.reshape(-1, 3) | |
| # Convert images to RGB color array | |
| if input_images.ndim == 4 and input_images.shape[1] == 3: # NCHW format | |
| rgb_colors = np.transpose(input_images, (0, 2, 3, 1)) | |
| else: # Already in NHWC format | |
| rgb_colors = input_images | |
| rgb_colors = (rgb_colors.reshape(-1, 3) * 255).astype(np.uint8) | |
    # Build the composite filtering mask
    valid_points_mask = np.ones(len(flattened_vertices), dtype=bool)

    # Apply ambiguity filtering if requested
    if mask_ambiguous:
        flat_ambiguity_mask = ambiguity_mask.reshape(-1)
        valid_points_mask = valid_points_mask & flat_ambiguity_mask

    # Apply sky region filtering if requested
    if mask_sky_bg:
        flat_sky_mask = sky_region_mask.reshape(-1)
        valid_points_mask = valid_points_mask & flat_sky_mask

    # Apply the mask to filter vertices and colors
    filtered_vertices = flattened_vertices[valid_points_mask].copy()
    filtered_colors = rgb_colors[valid_points_mask].copy()

    # Handle the empty-geometry case with a single placeholder vertex
    if filtered_vertices is None or np.asarray(filtered_vertices).size == 0:
        filtered_vertices = np.array([[1, 0, 0]])
        filtered_colors = np.array([[255, 255, 255]])
        scene_scale_factor = 1
    else:
        # Compute scene scale from a percentile-based bounding box
        percentile_lower = np.percentile(filtered_vertices, 5, axis=0)
        percentile_upper = np.percentile(filtered_vertices, 95, axis=0)
        scene_scale_factor = np.linalg.norm(percentile_upper - percentile_lower)

    # Initialize color mapping for cameras
    color_palette = matplotlib.colormaps.get_cmap("gist_rainbow")

    # Create an empty 3D scene container
    output_scene = trimesh.Scene()
    # Add geometry to the scene based on the representation type
    if as_mesh:
        # Mesh representation
        if target_frame_index is not None:
            # Single-frame mesh generation
            frame_height, frame_width = point_cloud_3d.shape[1:3]

            # Prepare unfiltered data for mesh construction
            structured_points = point_cloud_3d.reshape(frame_height, frame_width, 3)

            # Convert image data to HWC format, scaled to [0, 255]
            if input_images.ndim == 4 and input_images.shape[1] == 3:  # NCHW format
                structured_colors = np.transpose(input_images[0], (1, 2, 0))
            else:  # Already in HWC format
                structured_colors = input_images[0]
            # Scale out-of-place to avoid mutating predictions["images"]
            structured_colors = structured_colors * 255

            # Get the structured mask for mesh creation
            structured_mask = predictions["final_mask"][target_frame_index].reshape(
                frame_height, frame_width
            )
            mesh_filter_mask = structured_mask

            # Check for normal data availability
            mesh_normals = None
            if "normal" in predictions and predictions["normal"] is not None:
                # Extract normals for the selected frame (target_frame_index is
                # guaranteed non-None in this branch)
                frame_normal_data = predictions["normal"][target_frame_index]

                # Generate the mesh with normal information
                mesh_faces, mesh_vertices, mesh_colors, mesh_normals = create_image_mesh(
                    structured_points * np.array([1, -1, 1], dtype=np.float32),
                    structured_colors / 255.0,
                    frame_normal_data * np.array([1, -1, 1], dtype=np.float32),
                    mask=mesh_filter_mask,
                    triangulate=True,
                    return_vertex_indices=False,
                )
                # Apply the coordinate-system transformation to the normals
                mesh_normals = mesh_normals * np.array([1, -1, 1], dtype=np.float32)
            else:
                # Generate the mesh without normal information
                mesh_faces, mesh_vertices, mesh_colors = create_image_mesh(
                    structured_points * np.array([1, -1, 1], dtype=np.float32),
                    structured_colors / 255.0,
                    mask=mesh_filter_mask,
                    triangulate=True,
                    return_vertex_indices=False,
                )

            # Construct the trimesh object with optional normals
            geometry_mesh = trimesh.Trimesh(
                vertices=mesh_vertices * np.array([1, -1, 1], dtype=np.float32),
                faces=mesh_faces,
                vertex_colors=(mesh_colors * 255).astype(np.uint8),
                vertex_normals=(mesh_normals if mesh_normals is not None else None),
                process=False,
            )
            output_scene.add_geometry(geometry_mesh)
        else:
            # Multi-frame mesh generation
            print("Creating mesh for multi-frame data...")
            for frame_idx in range(point_cloud_3d.shape[0]):
                frame_height, frame_width = point_cloud_3d.shape[1:3]

                # Extract per-frame data
                frame_point_data = point_cloud_3d[frame_idx]
                frame_ambiguity_mask = predictions["final_mask"][frame_idx]
                frame_sky_mask = predictions["sky_mask"][frame_idx]

                # Extract frame image data in HWC format, scaled to [0, 255]
                if input_images.ndim == 4 and input_images.shape[1] == 3:  # NCHW format
                    frame_image_data = np.transpose(input_images[frame_idx], (1, 2, 0))
                else:  # Already in HWC format
                    frame_image_data = input_images[frame_idx]
                # Scale out-of-place to avoid mutating predictions["images"]
                frame_image_data = frame_image_data * 255

                # Build the per-frame filtering mask
                frame_filter_mask = np.ones((frame_height, frame_width), dtype=bool)

                # Apply ambiguity filtering if enabled
                if mask_ambiguous:
                    frame_filter_mask = frame_filter_mask & frame_ambiguity_mask

                # Apply sky filtering if enabled
                if mask_sky_bg:
                    frame_filter_mask = frame_filter_mask & frame_sky_mask

                # Generate the mesh for the current frame
                frame_faces, frame_vertices, frame_colors = create_image_mesh(
                    frame_point_data * np.array([1, -1, 1], dtype=np.float32),
                    frame_image_data / 255.0,
                    mask=frame_filter_mask,
                    triangulate=True,
                    return_vertex_indices=False,
                )
                frame_vertices = frame_vertices * np.array([1, -1, 1], dtype=np.float32)

                # Create a trimesh object for the current frame
                frame_geometry = trimesh.Trimesh(
                    vertices=frame_vertices,
                    faces=frame_faces,
                    vertex_colors=(frame_colors * 255).astype(np.uint8),
                    process=False,
                )
                output_scene.add_geometry(frame_geometry)
    else:
        # Point cloud representation
        point_cloud_geometry = trimesh.PointCloud(vertices=filtered_vertices, colors=filtered_colors)
        output_scene.add_geometry(point_cloud_geometry)
    # Add camera visualizations if requested
    num_camera_views = len(extrinsic_matrices)
    if show_camera:
        # Iterate through all camera views, coloring each by its index
        for camera_idx in range(num_camera_views):
            camera_extrinsic = extrinsic_matrices[camera_idx]
            camera_color_rgba = color_palette(camera_idx / num_camera_views)
            camera_color_rgb = tuple(int(255 * x) for x in camera_color_rgba[:3])
            integrate_camera_into_scene(
                output_scene, camera_extrinsic, camera_color_rgb, scene_scale_factor
            )

    # Define coordinate-system transformation matrices (currently unused in the
    # final transform below)
    opengl_transform = np.eye(4)
    opengl_transform[1, 1] = -1  # Flip Y axis
    opengl_transform[2, 2] = -1  # Flip Z axis

    # Define the alignment rotation around the Y-axis (identity at 0 degrees)
    alignment_rotation = np.eye(4)
    alignment_rotation[:3, :3] = Rotation.from_euler("y", 0, degrees=True).as_matrix()

    # Align the scene to the first camera by applying the inverse of its pose
    scene_transformation = np.linalg.inv(extrinsic_matrices[0])
    output_scene.apply_transform(scene_transformation)

    print("GLB Scene built")
    return output_scene

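# Usage sketch (hedged): `predictions` must already hold the keys read above
# (world_points, images, camera_poses, final_mask, sky_mask); shapes assumed
# as documented in the docstring.
#
#   scene = convert_predictions_to_glb_scene(
#       predictions, filter_by_frames="all", show_camera=True, as_mesh=True
#   )
#   scene.export("scene.glb")  # trimesh infers the GLB format from the suffix
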
def integrate_camera_into_scene(
    scene: trimesh.Scene,
    camera_transform: np.ndarray,
    camera_color: tuple,
    scale_factor: float,
):
    """
    Adds a camera visualization mesh to the 3D scene.

    Args:
        scene (trimesh.Scene): The 3D scene to add the camera visualization to.
        camera_transform (np.ndarray): 4x4 transformation matrix for camera positioning.
        camera_color (tuple): RGB color tuple for the camera mesh.
        scale_factor (float): Scaling factor for the camera size relative to the scene.
    """
    # Define camera dimensions based on the scene scale
    camera_base_width = scale_factor * 0.05
    camera_cone_height = scale_factor * 0.1

    # Create the base cone geometry for the camera representation
    base_cone = trimesh.creation.cone(camera_base_width, camera_cone_height, sections=4)

    # Rotate 45 degrees around the z-axis and shift back by the cone height
    z_rotation_matrix = np.eye(4)
    z_rotation_matrix[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
    z_rotation_matrix[2, 3] = -camera_cone_height

    # Set up the OpenGL coordinate-system conversion
    opengl_coord_transform = np.eye(4)
    opengl_coord_transform[1, 1] = -1  # Flip Y axis
    opengl_coord_transform[2, 2] = -1  # Flip Z axis

    # Combine all transformations
    final_transform = camera_transform @ opengl_coord_transform @ z_rotation_matrix

    # Create a slight rotation for mesh variation
    minor_rotation = np.eye(4)
    minor_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()

    # Generate multiple vertex sets for the layered camera geometry
    original_vertices = base_cone.vertices
    scaled_vertices = 0.95 * original_vertices
    rotated_vertices = apply_transformation_to_points(minor_rotation, original_vertices)

    # Combine all vertex sets
    all_vertices = np.concatenate([
        original_vertices,
        scaled_vertices,
        rotated_vertices,
    ])

    # Transform vertices to their final position
    transformed_vertices = apply_transformation_to_points(final_transform, all_vertices)

    # Generate faces for the complete camera mesh
    camera_faces = generate_camera_mesh_faces(base_cone)

    # Create and color the camera mesh
    camera_mesh = trimesh.Trimesh(vertices=transformed_vertices, faces=camera_faces)
    camera_mesh.visual.face_colors[:, :3] = camera_color

    # Add the camera mesh to the scene
    scene.add_geometry(camera_mesh)

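# Usage sketch (hedged): draws one camera marker at the world origin with a
# unit scene scale.
#
#   scene = trimesh.Scene()
#   integrate_camera_into_scene(scene, np.eye(4), (255, 0, 0), scale_factor=1.0)
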
def apply_transformation_to_points(
    transform_matrix: np.ndarray, point_array: np.ndarray, output_dim: int = None
) -> np.ndarray:
    """
    Applies a 4x4 transformation matrix to a collection of 3D points.

    Args:
        transform_matrix (np.ndarray): 4x4 transformation matrix to apply.
        point_array (np.ndarray): Array of points to transform.
        output_dim (int, optional): Target dimension for the output points.

    Returns:
        np.ndarray: Array of transformed points.
    """
    point_array = np.asarray(point_array)
    original_shape = point_array.shape[:-1]
    target_dim = output_dim or point_array.shape[-1]

    # Transpose the transformation matrix for right-multiplication
    transposed_transform = transform_matrix.swapaxes(-1, -2)

    # Apply the rotation/scaling block, then add the translation row
    transformed_points = (
        point_array @ transposed_transform[..., :-1, :] +
        transposed_transform[..., -1:, :]
    )

    # Extract the desired dimensions and restore the original shape
    final_result = transformed_points[..., :target_dim].reshape(*original_shape, target_dim)
    return final_result

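# Worked example: a pure translation by (1, 2, 3) applied to the origin.
#
#   T = np.eye(4)
#   T[:3, 3] = [1.0, 2.0, 3.0]
#   apply_transformation_to_points(T, np.zeros((1, 3)))  # -> [[1., 2., 3.]]
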
def generate_camera_mesh_faces(base_cone_mesh: trimesh.Trimesh) -> np.ndarray:
    """
    Generates face indices for a complex camera mesh composed of multiple cone layers.

    Args:
        base_cone_mesh (trimesh.Trimesh): Base cone geometry used as a template.

    Returns:
        np.ndarray: Array of face indices defining the camera mesh topology.
    """
    face_indices = []
    vertex_count_per_cone = len(base_cone_mesh.vertices)

    # Process each face of the base cone
    for triangle_face in base_cone_mesh.faces:
        # Skip faces that include the cone tip (vertex 0)
        if 0 in triangle_face:
            continue

        # Get vertex indices for the current triangle
        vertex_a, vertex_b, vertex_c = triangle_face

        # Calculate the corresponding vertices in the second and third cone layers
        vertex_a_layer2, vertex_b_layer2, vertex_c_layer2 = triangle_face + vertex_count_per_cone
        vertex_a_layer3, vertex_b_layer3, vertex_c_layer3 = triangle_face + 2 * vertex_count_per_cone

        # Create connecting faces between the cone layers
        connecting_faces = [
            (vertex_a, vertex_b, vertex_b_layer2),
            (vertex_a, vertex_a_layer2, vertex_c),
            (vertex_c_layer2, vertex_b, vertex_c),
            (vertex_a, vertex_b, vertex_b_layer3),
            (vertex_a, vertex_a_layer3, vertex_c),
            (vertex_c_layer3, vertex_b, vertex_c),
        ]
        face_indices.extend(connecting_faces)

    # Add reverse-winding faces for proper mesh closure
    reversed_faces = [(vertex_c, vertex_b, vertex_a) for vertex_a, vertex_b, vertex_c in face_indices]
    face_indices.extend(reversed_faces)

    return np.array(face_indices)
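# Sanity-check sketch (hedged: exact counts depend on trimesh's cone
# tessellation and on vertex 0 being the tip, as assumed above). Every emitted
# index must stay within the three stacked vertex layers.
#
#   cone = trimesh.creation.cone(0.05, 0.1, sections=4)
#   faces = generate_camera_mesh_faces(cone)
#   assert faces.max() < 3 * len(cone.vertices)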