# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch

from .rotation import quat_to_mat, mat_to_quat


def extri_intri_to_pose_encoding(
    extrinsics,
    intrinsics,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
):
    """Convert camera extrinsics and intrinsics to a compact pose encoding.

    This function transforms camera parameters into a unified pose encoding format,
    which can be used for various downstream tasks like pose prediction or representation.

    Args:
        extrinsics (torch.Tensor): Camera extrinsic parameters with shape BxSx3x4,
            where B is batch size and S is sequence length.
            In OpenCV coordinate system (x-right, y-down, z-forward), representing camera from world transformation.
            The format is [R|t] where R is a 3x3 rotation matrix and t is a 3x1 translation vector.
        intrinsics (torch.Tensor): Camera intrinsic parameters with shape BxSx3x3.
            Defined in pixels, with format:
            [[fx, 0, cx],
             [0, fy, cy],
             [0, 0, 1]]
            where fx, fy are focal lengths and (cx, cy) is the principal point.
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for computing field of view values. For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding to use. Currently only
            supports "absT_quaR_FoV" (absolute translation, quaternion rotation, field of view).

    Returns:
        torch.Tensor: Encoded camera pose parameters with shape BxSx9.
            For "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as quaternion quat (4D)
            - [7:] = field of view (2D)
    """
    # extrinsics: BxSx3x4
    # intrinsics: BxSx3x3
    if pose_encoding_type == "absT_quaR_FoV":
        R = extrinsics[:, :, :3, :3]  # BxSx3x3
        T = extrinsics[:, :, :3, 3]  # BxSx3

        quat = mat_to_quat(R)
        # Note the order of h and w here
        H, W = image_size_hw
        fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
        fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
        pose_encoding = torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
    else:
        raise NotImplementedError

    return pose_encoding
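
# A minimal usage sketch (illustration only, not part of the original module):
# given batched OpenCV-convention cameras, the 9-D encoding is obtained as
#
#     pose_enc = extri_intri_to_pose_encoding(
#         extrinsics,                # BxSx3x4 [R|t], camera-from-world
#         intrinsics,                # BxSx3x3 pixel-space K
#         image_size_hw=(256, 512),  # (H, W) used to turn focal lengths into FoV
#     )
#     # pose_enc: BxSx9 = [translation (3), quaternion (4), fov_h, fov_w]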


def pose_encoding_to_extri_intri(
    pose_encoding,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
    build_intrinsics=True,
):
    """Convert a pose encoding back to camera extrinsics and intrinsics.

    This function performs the inverse operation of extri_intri_to_pose_encoding,
    reconstructing the full camera parameters from the compact encoding.

    Args:
        pose_encoding (torch.Tensor): Encoded camera pose parameters with shape BxSx9,
            where B is batch size and S is sequence length.
            For "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as quaternion quat (4D)
            - [7:] = field of view (2D)
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for reconstructing intrinsics from field of view values.
            For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding used. Currently only
            supports "absT_quaR_FoV" (absolute translation, quaternion rotation, field of view).
        build_intrinsics (bool): Whether to reconstruct the intrinsics matrix.
            If False, only extrinsics are returned and intrinsics will be None.

    Returns:
        tuple: (extrinsics, intrinsics)
            - extrinsics (torch.Tensor): Camera extrinsic parameters with shape BxSx3x4.
              In OpenCV coordinate system (x-right, y-down, z-forward), representing camera from world
              transformation. The format is [R|t] where R is a 3x3 rotation matrix and t is
              a 3x1 translation vector.
            - intrinsics (torch.Tensor or None): Camera intrinsic parameters with shape BxSx3x3,
              or None if build_intrinsics is False. Defined in pixels, with format:
              [[fx, 0, cx],
               [0, fy, cy],
               [0, 0, 1]]
              where fx, fy are focal lengths and (cx, cy) is the principal point,
              assumed to be at the center of the image (W/2, H/2).
    """
    intrinsics = None

    if pose_encoding_type == "absT_quaR_FoV":
        T = pose_encoding[..., :3]
        quat = pose_encoding[..., 3:7]
        fov_h = pose_encoding[..., 7]
        fov_w = pose_encoding[..., 8]

        R = quat_to_mat(quat)
        extrinsics = torch.cat([R, T[..., None]], dim=-1)

        if build_intrinsics:
            H, W = image_size_hw
            fy = (H / 2.0) / torch.tan(fov_h / 2.0)
            fx = (W / 2.0) / torch.tan(fov_w / 2.0)
            intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device)
            intrinsics[..., 0, 0] = fx
            intrinsics[..., 1, 1] = fy
            intrinsics[..., 0, 2] = W / 2
            intrinsics[..., 1, 2] = H / 2
            intrinsics[..., 2, 2] = 1.0  # Set the homogeneous coordinate to 1
    else:
        raise NotImplementedError

    return extrinsics, intrinsics
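

# A minimal round-trip sanity check (a sketch, not part of the original module).
# It encodes random cameras and decodes them back; translations, focal lengths,
# and rotation matrices should match up to float precision. Because of the
# relative import above, run it as part of its package (e.g. `python -m ...`).
if __name__ == "__main__":
    B, S, H, W = 2, 4, 256, 512

    # Build random valid rotations from normalized random quaternions.
    quat = torch.nn.functional.normalize(torch.randn(B, S, 4), dim=-1)
    R = quat_to_mat(quat)
    T = torch.randn(B, S, 3)
    extrinsics = torch.cat([R, T[..., None]], dim=-1)  # BxSx3x4 [R|t]

    # Simple pinhole intrinsics with the principal point at the image center.
    intrinsics = torch.zeros(B, S, 3, 3)
    intrinsics[..., 0, 0] = 400.0  # fx (arbitrary example value)
    intrinsics[..., 1, 1] = 300.0  # fy (arbitrary example value)
    intrinsics[..., 0, 2] = W / 2
    intrinsics[..., 1, 2] = H / 2
    intrinsics[..., 2, 2] = 1.0

    pose_enc = extri_intri_to_pose_encoding(extrinsics, intrinsics, image_size_hw=(H, W))
    extri_rec, intri_rec = pose_encoding_to_extri_intri(pose_enc, image_size_hw=(H, W))

    print("max |T - T_rec|:", (extrinsics[..., :3, 3] - extri_rec[..., :3, 3]).abs().max().item())
    print("max |R - R_rec|:", (extrinsics[..., :3, :3] - extri_rec[..., :3, :3]).abs().max().item())
    print("max |K - K_rec|:", (intrinsics - intri_rec).abs().max().item())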