# HunyuanWorld-Mirror/src/utils/visual_util.py
# wzw
""" Visual utilities for HuggingFace integration.
References: https://github.com/facebookresearch/vggt
"""
import copy
from typing import Tuple
import cv2
import matplotlib
import numpy as np
import requests
import trimesh
from scipy.spatial.transform import Rotation
def segment_sky(image_path, onnx_session):
"""
Segments sky from an image using an ONNX model.
Thanks for the great model provided by https://github.com/xiongzhu666/Sky-Segmentation-and-Post-processing
Args:
image_path: Path to input image
onnx_session: ONNX runtime session with loaded model
Returns:
np.ndarray: Binary mask where 255 indicates non-sky regions
"""
image = cv2.imread(image_path)
result_map = run_skyseg(onnx_session, [320, 320], image)
# resize the result_map to the original image size
result_map_original = cv2.resize(result_map, (image.shape[1], image.shape[0]))
    # Invert the mask so that 255 = non-sky, 0 = sky
    # The model scores sky regions high, so values below the threshold are non-sky
output_mask = np.zeros_like(result_map_original)
output_mask[result_map_original < 32] = 255 # Use threshold of 32
return output_mask
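

# Usage sketch (illustrative, not called anywhere in this module): how
# segment_sky is typically driven. "skyseg.onnx" and "photo.jpg" are
# hypothetical paths, and onnxruntime is assumed to be installed.
def _example_segment_sky():
    import onnxruntime

    session = onnxruntime.InferenceSession("skyseg.onnx")
    non_sky_mask = segment_sky("photo.jpg", session)
    # non_sky_mask matches the input resolution: uint8, 255 = non-sky, 0 = sky
    cv2.imwrite("non_sky_mask.png", non_sky_mask)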
def run_skyseg(onnx_session, input_size, image):
"""
Runs sky segmentation inference using ONNX model.
Args:
onnx_session: ONNX runtime session
input_size: Target size for model input (width, height)
image: Input image in BGR format
Returns:
np.ndarray: Segmentation mask
"""
    # Preprocess: resize, BGR->RGB, HWC->CHW transpose, ImageNet mean/std normalization, float32 cast
temp_image = copy.deepcopy(image)
resize_image = cv2.resize(temp_image, dsize=(input_size[0], input_size[1]))
x = cv2.cvtColor(resize_image, cv2.COLOR_BGR2RGB)
x = np.array(x, dtype=np.float32)
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
x = (x / 255 - mean) / std
x = x.transpose(2, 0, 1)
    # Reshape to (N, C, H, W); input_size is (width, height), so height comes first
    x = x.reshape(-1, 3, input_size[1], input_size[0]).astype("float32")
# Inference
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
onnx_result = onnx_session.run([output_name], {input_name: x})
# Post process
onnx_result = np.array(onnx_result).squeeze()
min_value = np.min(onnx_result)
max_value = np.max(onnx_result)
onnx_result = (onnx_result - min_value) / (max_value - min_value)
onnx_result *= 255
onnx_result = onnx_result.astype("uint8")
return onnx_result
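

# Direct-call sketch (illustrative): run_skyseg on a BGR frame with a session
# created elsewhere (see _example_segment_sky above). "photo.jpg" is a
# hypothetical path. The output is a per-image min-max normalized uint8 map at
# model resolution; segment_sky resizes it back and thresholds it.
def _example_run_skyseg(onnx_session):
    frame = cv2.imread("photo.jpg")  # BGR, as loaded by OpenCV
    raw_mask = run_skyseg(onnx_session, [320, 320], frame)
    print(raw_mask.shape, raw_mask.dtype)  # (320, 320) uint8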
def download_file_from_url(url, filename):
"""Downloads a file from a Hugging Face model repo, handling redirects."""
try:
# Get the redirect URL
response = requests.get(url, allow_redirects=False)
response.raise_for_status() # Raise HTTPError for bad requests (4xx or 5xx)
        if response.status_code == 302:  # Expecting a redirect to the CDN
            redirect_url = response.headers["Location"]
            response = requests.get(redirect_url, stream=True)
            response.raise_for_status()
        elif response.status_code != 200:
            # A 200 means the file was served directly and can be written as-is
            print(f"Unexpected status code: {response.status_code}")
            return
with open(filename, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"Downloaded {filename} successfully.")
except requests.exceptions.RequestException as e:
print(f"Error downloading file: {e}")
def create_image_mesh(
*image_data: np.ndarray,
mask: np.ndarray = None,
triangulate: bool = False,
return_vertex_indices: bool = False,
) -> Tuple[np.ndarray, ...]:
"""
Create a mesh from image data using pixel coordinates as vertices and grid connections as faces.
Args:
*image_data (np.ndarray): Image arrays with shape (height, width, [channels])
mask (np.ndarray, optional): Boolean mask with shape (height, width). Defaults to None.
triangulate (bool): Convert quad faces to triangular faces. Defaults to False.
return_vertex_indices (bool): Include vertex indices in output. Defaults to False.
Returns:
faces (np.ndarray): Face connectivity array. Shape (N, 4) for quads or (N, 3) for triangles
*vertex_data (np.ndarray): Vertex attributes corresponding to input image_data
vertex_indices (np.ndarray, optional): Original vertex indices if return_vertex_indices=True
"""
# Validate inputs
assert (len(image_data) > 0) or (mask is not None), "Need at least one image or mask"
if mask is None:
height, width = image_data[0].shape[:2]
else:
height, width = mask.shape
# Check all images have same dimensions
for img in image_data:
assert img.shape[:2] == (height, width), "All images must have same height and width"
# Create quad faces connecting neighboring pixels
base_quad = np.stack([
np.arange(0, width - 1, dtype=np.int32), # bottom-left
np.arange(width, 2 * width - 1, dtype=np.int32), # top-left
np.arange(1 + width, 2 * width, dtype=np.int32), # top-right
np.arange(1, width, dtype=np.int32), # bottom-right
], axis=1)
# Replicate quad pattern for all rows
row_offsets = np.arange(0, (height - 1) * width, width, dtype=np.int32)
faces = (row_offsets[:, None, None] + base_quad[None, :, :]).reshape((-1, 4))
if mask is None:
# No masking - use all faces and vertices
if triangulate:
faces = _convert_quads_to_triangles(faces)
output = [faces]
for img in image_data:
output.append(img.reshape(-1, *img.shape[2:]))
if return_vertex_indices:
output.append(np.arange(height * width, dtype=np.int32))
return tuple(output)
else:
# Apply mask - only keep faces where all 4 corners are valid
valid_quads = (
mask[:-1, :-1] & mask[1:, :-1] &
mask[1:, 1:] & mask[:-1, 1:]
).ravel()
faces = faces[valid_quads]
if triangulate:
faces = _convert_quads_to_triangles(faces)
# Remove unused vertices and remap face indices
num_face_vertices = faces.shape[-1]
unique_vertices, remapped_indices = np.unique(faces, return_inverse=True)
faces = remapped_indices.astype(np.int32).reshape(-1, num_face_vertices)
output = [faces]
for img in image_data:
flattened_img = img.reshape(-1, *img.shape[2:])
output.append(flattened_img[unique_vertices])
if return_vertex_indices:
output.append(unique_vertices)
return tuple(output)
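

# Shape sketch (illustrative): create_image_mesh on a tiny synthetic grid. An
# H x W image yields (H-1)*(W-1) quads, twice as many triangles when
# triangulate=True, and masking a pixel drops every face that touches it.
def _example_create_image_mesh():
    h, w = 4, 5
    ys, xs = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
    points = np.stack([xs, ys, np.zeros_like(xs)], axis=-1).astype(np.float32)
    colors = np.random.rand(h, w, 3).astype(np.float32)
    mask = np.ones((h, w), dtype=bool)
    mask[0, 0] = False  # remove one corner pixel and its single incident quad
    faces, verts, cols = create_image_mesh(points, colors, mask=mask, triangulate=True)
    # 12 quads unmasked -> 11 after masking -> 22 triangles over 19 used vertices
    print(faces.shape, verts.shape, cols.shape)  # (22, 3) (19, 3) (19, 3)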
def _convert_quads_to_triangles(quad_faces: np.ndarray) -> np.ndarray:
"""Convert quadrilateral faces to triangular faces."""
if quad_faces.shape[-1] == 3:
return quad_faces # Already triangular
num_vertices_per_face = quad_faces.shape[-1]
triangle_indices = np.stack([
np.zeros(num_vertices_per_face - 2, dtype=int), # First vertex
np.arange(1, num_vertices_per_face - 1, dtype=int), # Sequential vertices
np.arange(2, num_vertices_per_face, dtype=int), # Next sequential vertices
], axis=1)
return quad_faces[:, triangle_indices].reshape((-1, 3))
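

# Worked example (illustrative): the fan triangulation above splits the quad
# (a, b, c, d) into triangles (a, b, c) and (a, c, d).
def _example_quad_triangulation():
    quads = np.array([[0, 1, 2, 3]], dtype=np.int32)
    print(_convert_quads_to_triangles(quads))  # [[0 1 2] [0 2 3]]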
def convert_predictions_to_glb_scene(
predictions,
filter_by_frames="all",
show_camera=True,
mask_sky_bg=False,
mask_ambiguous=False,
as_mesh=True,
) -> trimesh.Scene:
"""
Converts model predictions to a 3D scene represented as a GLB file.
Args:
predictions (dict): Dictionary containing model predictions with keys:
- world_points: 3D point coordinates (S, H, W, 3)
- images: Input images (S, H, W, 3)
            - camera_poses: Camera-to-world matrices (S, 4, 4); pose 0 is inverted
              to re-center the exported scene, so square 4x4 matrices are expected
            - final_mask: Boolean confidence mask (S, H, W)
            - sky_mask: Boolean mask, True for non-sky pixels (S, H, W)
filter_by_frames (str): Frame filter specification (default: "all")
show_camera (bool): Include camera visualization (default: True)
mask_sky_bg (bool): Mask out sky background pixels (default: False)
mask_ambiguous (bool): Apply final mask to filter ambiguous predictions (default: False)
        as_mesh (bool): Represent the data as a mesh instead of point cloud (default: True)
Returns:
trimesh.Scene: Processed 3D scene containing point cloud/mesh and cameras
Raises:
ValueError: If input predictions structure is invalid
"""
if not isinstance(predictions, dict):
raise ValueError("predictions must be a dictionary")
print("Building GLB scene")
# Parse frame selection from filter string
target_frame_index = None
if filter_by_frames not in ["all", "All"]:
try:
# Extract numeric index before colon separator
target_frame_index = int(filter_by_frames.split(":")[0])
except (ValueError, IndexError):
pass
# Validate required data in predictions
print("Using Pointmap Branch")
if "world_points" not in predictions:
raise ValueError(
"world_points not found in predictions. Pointmap Branch requires 'world_points' key. "
"Depthmap and Camera branches have been removed."
)
# Extract prediction data
point_cloud_3d = predictions["world_points"]
input_images = predictions["images"]
extrinsic_matrices = predictions["camera_poses"]
ambiguity_mask = predictions["final_mask"]
sky_region_mask = predictions["sky_mask"]
# Filter to single frame if specified
if target_frame_index is not None:
point_cloud_3d = point_cloud_3d[target_frame_index][None]
input_images = input_images[target_frame_index][None]
extrinsic_matrices = extrinsic_matrices[target_frame_index][None]
ambiguity_mask = ambiguity_mask[target_frame_index][None]
sky_region_mask = sky_region_mask[target_frame_index][None]
# Flatten 3D points to vertex array
flattened_vertices = point_cloud_3d.reshape(-1, 3)
# Convert images to RGB color array
if input_images.ndim == 4 and input_images.shape[1] == 3: # NCHW format
rgb_colors = np.transpose(input_images, (0, 2, 3, 1))
else: # Already in NHWC format
rgb_colors = input_images
rgb_colors = (rgb_colors.reshape(-1, 3) * 255).astype(np.uint8)
# Build composite filtering mask
valid_points_mask = np.ones(len(flattened_vertices), dtype=bool)
# Apply ambiguity filtering if requested
if mask_ambiguous:
flat_ambiguity_mask = ambiguity_mask.reshape(-1)
valid_points_mask = valid_points_mask & flat_ambiguity_mask
# Apply sky region filtering if requested
if mask_sky_bg:
flat_sky_mask = sky_region_mask.reshape(-1)
valid_points_mask = valid_points_mask & flat_sky_mask
# Apply mask to filter vertices and colors
filtered_vertices = flattened_vertices[valid_points_mask].copy()
filtered_colors = rgb_colors[valid_points_mask].copy()
# Handle empty geometry case
    if filtered_vertices.size == 0:
filtered_vertices = np.array([[1, 0, 0]])
filtered_colors = np.array([[255, 255, 255]])
scene_scale_factor = 1
else:
# Compute scene scale from percentile-based bounding box
percentile_lower = np.percentile(filtered_vertices, 5, axis=0)
percentile_upper = np.percentile(filtered_vertices, 95, axis=0)
scene_scale_factor = np.linalg.norm(percentile_upper - percentile_lower)
# Initialize color mapping for cameras
color_palette = matplotlib.colormaps.get_cmap("gist_rainbow")
# Create empty 3D scene container
output_scene = trimesh.Scene()
# Add geometry to scene based on representation type
if as_mesh:
# Mesh representation
if target_frame_index is not None:
# Single frame mesh generation
frame_height, frame_width = point_cloud_3d.shape[1:3]
# Prepare unfiltered data for mesh construction
structured_points = point_cloud_3d.reshape(frame_height, frame_width, 3)
# Convert image data to proper format
if input_images.ndim == 4 and input_images.shape[1] == 3: # NCHW format
structured_colors = np.transpose(input_images[0], (1, 2, 0))
else: # Already in HWC format
structured_colors = input_images[0]
            # Scale to [0, 255] out of place: the branches above can return views
            # into predictions["images"], so an in-place *= would mutate the input
            structured_colors = structured_colors * 255
# Get structured mask for mesh creation
structured_mask = predictions["final_mask"][target_frame_index].reshape(
frame_height, frame_width
)
            # Build filtering mask (this single-frame mesh path always applies
            # final_mask; the mask_ambiguous / mask_sky_bg flags only affect the
            # multi-frame mesh and point-cloud paths)
            mesh_filter_mask = structured_mask
# Check for normal data availability
mesh_normals = None
if "normal" in predictions and predictions["normal"] is not None:
# Extract normals for selected frame
                # target_frame_index is guaranteed non-None in this branch
                frame_normal_data = predictions["normal"][target_frame_index]
# Generate mesh with normal information
mesh_faces, mesh_vertices, mesh_colors, mesh_normals = create_image_mesh(
structured_points * np.array([1, -1, 1], dtype=np.float32),
structured_colors / 255.0,
frame_normal_data * np.array([1, -1, 1], dtype=np.float32),
mask=mesh_filter_mask,
triangulate=True,
return_vertex_indices=False,
)
# Apply coordinate system transformation to normals
mesh_normals = mesh_normals * np.array([1, -1, 1], dtype=np.float32)
else:
# Generate mesh without normal information
mesh_faces, mesh_vertices, mesh_colors = create_image_mesh(
structured_points * np.array([1, -1, 1], dtype=np.float32),
structured_colors / 255.0,
mask=mesh_filter_mask,
triangulate=True,
return_vertex_indices=False,
)
# Construct trimesh object with optional normals
geometry_mesh = trimesh.Trimesh(
vertices=mesh_vertices * np.array([1, -1, 1], dtype=np.float32),
faces=mesh_faces,
vertex_colors=(mesh_colors * 255).astype(np.uint8),
                vertex_normals=mesh_normals,  # None when no normals were predicted
process=False,
)
output_scene.add_geometry(geometry_mesh)
else:
# Multi-frame mesh generation
print("Creating mesh for multi-frame data...")
for frame_idx in range(point_cloud_3d.shape[0]):
frame_height, frame_width = point_cloud_3d.shape[1:3]
# Extract per-frame data
frame_point_data = point_cloud_3d[frame_idx]
frame_ambiguity_mask = predictions["final_mask"][frame_idx]
frame_sky_mask = predictions["sky_mask"][frame_idx]
# Extract frame image data
if input_images.ndim == 4 and input_images.shape[1] == 3: # NCHW format
frame_image_data = np.transpose(input_images[frame_idx], (1, 2, 0))
else: # Already in HWC format
frame_image_data = input_images[frame_idx]
                # Avoid in-place *=: frame_image_data may be a view into predictions["images"]
                frame_image_data = frame_image_data * 255
# Build per-frame filtering mask
frame_filter_mask = np.ones((frame_height, frame_width), dtype=bool)
# Apply ambiguity filtering if enabled
if mask_ambiguous:
frame_filter_mask = frame_filter_mask & frame_ambiguity_mask
# Apply sky filtering if enabled
if mask_sky_bg:
frame_filter_mask = frame_filter_mask & frame_sky_mask
# Generate mesh for current frame
frame_faces, frame_vertices, frame_colors = create_image_mesh(
frame_point_data * np.array([1, -1, 1], dtype=np.float32),
frame_image_data / 255.0,
mask=frame_filter_mask,
triangulate=True,
return_vertex_indices=False,
)
frame_vertices = frame_vertices * np.array([1, -1, 1], dtype=np.float32)
# Create trimesh object for current frame
frame_geometry = trimesh.Trimesh(
vertices=frame_vertices,
faces=frame_faces,
vertex_colors=(frame_colors * 255).astype(np.uint8),
process=False,
)
output_scene.add_geometry(frame_geometry)
else:
# Point cloud representation
point_cloud_geometry = trimesh.PointCloud(vertices=filtered_vertices, colors=filtered_colors)
output_scene.add_geometry(point_cloud_geometry)
# Add camera visualizations if requested
num_camera_views = len(extrinsic_matrices)
if show_camera:
# Iterate through all camera views
for camera_idx in range(num_camera_views):
camera_extrinsic = extrinsic_matrices[camera_idx]
camera_color_rgba = color_palette(camera_idx / num_camera_views)
camera_color_rgb = tuple(int(255 * x) for x in camera_color_rgba[:3])
integrate_camera_into_scene(
output_scene, camera_extrinsic, camera_color_rgb, scene_scale_factor
)
# Define coordinate system transformation matrices
opengl_transform = np.eye(4)
opengl_transform[1, 1] = -1 # Flip Y axis
opengl_transform[2, 2] = -1 # Flip Z axis
    # Define alignment rotation about the Y axis (0 degrees here, i.e. identity;
    # kept as a hook for re-orienting the exported scene)
alignment_rotation = np.eye(4)
alignment_rotation[:3, :3] = Rotation.from_euler("y", 0, degrees=True).as_matrix()
# Compute and apply final transformation
scene_transformation = (
np.linalg.inv(extrinsic_matrices[0])
@ opengl_transform
@ alignment_rotation
)
output_scene.apply_transform(scene_transformation)
print("GLB Scene built")
return output_scene
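

# End-to-end sketch (illustrative): drive the converter with a synthetic
# predictions dict and export a GLB. Shapes follow the docstring above; values
# are random, so this is only a smoke test, not a meaningful scene.
def _example_export_glb():
    s, h, w = 2, 32, 48
    predictions = {
        "world_points": np.random.randn(s, h, w, 3).astype(np.float32),
        "images": np.random.rand(s, h, w, 3).astype(np.float32),  # NHWC in [0, 1]
        "camera_poses": np.tile(np.eye(4, dtype=np.float32), (s, 1, 1)),
        "final_mask": np.ones((s, h, w), dtype=bool),
        "sky_mask": np.ones((s, h, w), dtype=bool),  # True = non-sky
    }
    scene = convert_predictions_to_glb_scene(predictions, as_mesh=False)
    scene.export("scene.glb")  # hypothetical output path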
def integrate_camera_into_scene(
scene: trimesh.Scene,
camera_transform: np.ndarray,
camera_color: tuple,
scale_factor: float,
):
"""
Adds a camera visualization mesh to the 3D scene.
Args:
scene (trimesh.Scene): The 3D scene to add the camera visualization.
camera_transform (np.ndarray): 4x4 transformation matrix for camera positioning.
camera_color (tuple): RGB color tuple for the camera mesh.
scale_factor (float): Scaling factor for the camera size relative to scene.
"""
# Define camera dimensions based on scene scale
camera_base_width = scale_factor * 0.05
camera_cone_height = scale_factor * 0.1
# Create base cone geometry for camera representation
base_cone = trimesh.creation.cone(camera_base_width, camera_cone_height, sections=4)
    # Rotate 45 degrees around the z-axis and shift down by the cone height
    # so the apex sits at the camera center
z_rotation_matrix = np.eye(4)
z_rotation_matrix[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
z_rotation_matrix[2, 3] = -camera_cone_height
# Setup OpenGL coordinate system conversion
opengl_coord_transform = np.eye(4)
opengl_coord_transform[1, 1] = -1 # Flip Y axis
opengl_coord_transform[2, 2] = -1 # Flip Z axis
# Combine all transformations
final_transform = camera_transform @ opengl_coord_transform @ z_rotation_matrix
# Create slight rotation for mesh variation
minor_rotation = np.eye(4)
minor_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()
# Generate multiple vertex sets for complex camera geometry
original_vertices = base_cone.vertices
scaled_vertices = 0.95 * original_vertices
rotated_vertices = apply_transformation_to_points(minor_rotation, original_vertices)
# Combine all vertex sets
all_vertices = np.concatenate([
original_vertices,
scaled_vertices,
rotated_vertices
])
# Transform vertices to final position
transformed_vertices = apply_transformation_to_points(final_transform, all_vertices)
# Generate faces for the complete camera mesh
camera_faces = generate_camera_mesh_faces(base_cone)
# Create and configure the camera mesh
camera_mesh = trimesh.Trimesh(
vertices=transformed_vertices,
faces=camera_faces
)
camera_mesh.visual.face_colors[:, :3] = camera_color
# Add the camera mesh to the scene
scene.add_geometry(camera_mesh)
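

# Usage sketch (illustrative): place a single camera marker at the origin of an
# otherwise empty scene. The identity pose and scale of 1.0 are arbitrary.
def _example_camera_marker():
    scene = trimesh.Scene()
    integrate_camera_into_scene(scene, np.eye(4), (255, 0, 0), 1.0)
    scene.export("camera_only.glb")  # hypothetical output path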
def apply_transformation_to_points(
transform_matrix: np.ndarray, point_array: np.ndarray, output_dim: int = None
) -> np.ndarray:
"""
Applies a 4x4 transformation matrix to a collection of 3D points.
Args:
transform_matrix (np.ndarray): 4x4 transformation matrix to apply.
point_array (np.ndarray): Array of points to transform.
output_dim (int, optional): Target dimension for output points.
Returns:
np.ndarray: Array of transformed points.
"""
point_array = np.asarray(point_array)
original_shape = point_array.shape[:-1]
target_dim = output_dim or point_array.shape[-1]
# Transpose transformation matrix for matrix multiplication
transposed_transform = transform_matrix.swapaxes(-1, -2)
# Apply rotation/scaling and translation components
transformed_points = (
point_array @ transposed_transform[..., :-1, :] +
transposed_transform[..., -1:, :]
)
# Extract desired dimensions and restore original shape
final_result = transformed_points[..., :target_dim].reshape(*original_shape, target_dim)
return final_result
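

# Worked example (illustrative): a pure translation by (1, 2, 3). Because the
# helper right-multiplies points by the transposed matrix, the result equals
# (transform @ [p, 1])[:3] for each point p.
def _example_apply_transformation():
    translation = np.eye(4)
    translation[:3, 3] = [1.0, 2.0, 3.0]
    points = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
    print(apply_transformation_to_points(translation, points))
    # -> [[1. 2. 3.]
    #     [2. 3. 4.]]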
def generate_camera_mesh_faces(base_cone_mesh: trimesh.Trimesh) -> np.ndarray:
"""
Generates face indices for a complex camera mesh composed of multiple cone layers.
Args:
base_cone_mesh (trimesh.Trimesh): Base cone geometry used as template.
Returns:
np.ndarray: Array of face indices defining the camera mesh topology.
"""
face_indices = []
vertex_count_per_cone = len(base_cone_mesh.vertices)
# Process each face of the base cone
for triangle_face in base_cone_mesh.faces:
# Skip faces that include the cone tip (vertex 0)
if 0 in triangle_face:
continue
# Get vertex indices for current triangle
vertex_a, vertex_b, vertex_c = triangle_face
# Calculate corresponding vertices in second and third cone layers
vertex_a_layer2, vertex_b_layer2, vertex_c_layer2 = triangle_face + vertex_count_per_cone
vertex_a_layer3, vertex_b_layer3, vertex_c_layer3 = triangle_face + 2 * vertex_count_per_cone
# Create connecting faces between cone layers
connecting_faces = [
(vertex_a, vertex_b, vertex_b_layer2),
(vertex_a, vertex_a_layer2, vertex_c),
(vertex_c_layer2, vertex_b, vertex_c),
(vertex_a, vertex_b, vertex_b_layer3),
(vertex_a, vertex_a_layer3, vertex_c),
(vertex_c_layer3, vertex_b, vertex_c),
]
face_indices.extend(connecting_faces)
# Add reverse-winding faces for proper mesh closure
reversed_faces = [(vertex_c, vertex_b, vertex_a) for vertex_a, vertex_b, vertex_c in face_indices]
face_indices.extend(reversed_faces)
return np.array(face_indices)
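

# Sanity sketch (illustrative): the face table indexes into three stacked copies
# of the cone's vertices (original, scaled, slightly rotated), so every index
# stays below 3 * len(base_cone.vertices). Dimensions mirror those used in
# integrate_camera_into_scene.
def _example_camera_faces():
    base_cone = trimesh.creation.cone(0.05, 0.1, sections=4)
    faces = generate_camera_mesh_faces(base_cone)
    assert faces.max() < 3 * len(base_cone.vertices)
    print(faces.shape)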