Spaces:

lch01
/

StreamVGGT

Running on Zero

App Files Files Community

StreamVGGT / croco /models /croco_downstream.py

lch01

add dependencies

f4ba42f 4 months ago

raw

history blame

5.06 kB

	# Copyright (C) 2022-present Naver Corporation. All rights reserved.
	# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

	# --------------------------------------------------------
	# CroCo model for downstream tasks
	# --------------------------------------------------------

	import torch

	from .croco import CroCoNet


	def croco_args_from_ckpt(ckpt):
	    """Recover the CroCoNet constructor keyword arguments stored in a checkpoint.

	    Three checkpoint layouts are handled, in this order:

	    * ``ckpt["croco_kwargs"]`` exists (CroCo v2 released models): it is
	      returned as-is.
	    * ``ckpt["args"].model`` exists (pretrained using the official code
	      release): it is a string such as
	      ``"CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"`` that
	      is turned into the equivalent ``dict(...)`` expression and evaluated.
	    * otherwise (CroCo v1 released models): an empty dict is returned.

	    Args:
	        ckpt: mapping loaded from a checkpoint file.

	    Returns:
	        A dict of keyword arguments for the CroCoNet constructor.

	    Raises:
	        AssertionError: if ``ckpt["args"].model`` does not start with
	            ``"CroCoNet("``.
	    """
	    if "croco_kwargs" in ckpt:  # CroCo v2 released models
	        return ckpt["croco_kwargs"]

	    if "args" in ckpt and hasattr(ckpt["args"], "model"):
	        # pretrained using the official code release
	        # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"
	        model_spec = ckpt["args"].model
	        prefix = "CroCoNet"
	        # Explicit raise rather than `assert`: asserts are stripped under
	        # `python -O`, and this check is the only thing constraining what
	        # gets handed to eval() below.
	        if not model_spec.startswith(prefix + "("):
	            raise AssertionError(
	                "unexpected model description in checkpoint: %r" % (model_spec,)
	            )
	        # NOTE(security): eval() executes arbitrary expressions found in the
	        # checkpoint, so only load checkpoints from trusted sources. It is
	        # kept because stored argument values are not guaranteed to be plain
	        # literals; if they always are, parse the call with `ast` and
	        # `ast.literal_eval` each keyword value instead.
	        # "CroCoNet(a=1)" -> "dict(a=1)" -> {"a": 1}
	        return eval("dict" + model_spec[len(prefix):])

	    # CroCo v1 released models
	    return dict()


	class CroCoDownstreamMonocularEncoder(CroCoNet):
	    """CroCoNet variant for monocular downstream tasks, using only the encoder.

	    The mask generator, mask token, decoder and prediction head setters are
	    overridden with no-ops, and a user-supplied ``head`` turns the encoder
	    features into the task output.
	    """

	    def __init__(self, head, **kwargs):
	        """Build network for monocular downstream task, only using the encoder.

	        It takes an extra argument head, that is called with the features
	        and a dictionary img_info containing 'width' and 'height' keys.
	        The head is setup with the croconet arguments in this init function.

	        NOTE: It works by calling super().__init__() but with redefined
	        setters (the ``_set_*`` methods below).

	        Args:
	            head: object exposing ``setup(model)`` and callable as
	                ``head(features, img_info)``.
	            **kwargs: forwarded unchanged to ``CroCoNet.__init__``.
	        """
	        super().__init__(**kwargs)
	        # The head configures itself from the constructed network before
	        # being attached to it.
	        head.setup(self)
	        self.head = head

	    # The overrides below accept any positional/keyword arguments so that
	    # they stay call-compatible with whatever the base class passes to them.

	    def _set_mask_generator(self, *args, **kwargs):
	        """No mask generator"""
	        return

	    def _set_mask_token(self, *args, **kwargs):
	        """No mask token"""
	        self.mask_token = None
	        return

	    def _set_decoder(self, *args, **kwargs):
	        """No decoder"""
	        return

	    def _set_prediction_head(self, *args, **kwargs):
	        """No 'prediction head' for downstream tasks."""
	        return

	    def forward(self, img):
	        """Encode ``img`` without masking and apply the head.

	        img is of size batch_size x 3 x h x w

	        Returns:
	            Whatever ``self.head(features, img_info)`` returns, where
	            ``img_info`` is ``{"height": h, "width": w}``.
	        """
	        # Four-way unpacking also enforces a 4-D input.
	        _, _, height, width = img.size()
	        img_info = {"height": height, "width": width}
	        # A head may opt in to receiving the output of every block by
	        # exposing a truthy `return_all_blocks` attribute.
	        need_all_layers = getattr(self.head, "return_all_blocks", False)
	        out, _, _ = self._encode_image(
	            img, do_mask=False, return_all_blocks=need_all_layers
	        )
	        return self.head(out, img_info)


	class CroCoDownstreamBinocular(CroCoNet):
	    """CroCoNet variant for binocular (two-image) downstream tasks.

	    The mask generator, mask token and prediction head setters are overridden
	    with no-ops; the decoder setter is not overridden, and ``forward`` runs
	    ``self._decoder`` before the user-supplied ``head``.
	    """

	    def __init__(self, head, **kwargs):
	        """Build network for binocular downstream task.

	        It takes an extra argument head, that is called with the features
	        and a dictionary img_info containing 'width' and 'height' keys.
	        The head is setup with the croconet arguments in this init function.

	        Args:
	            head: object exposing ``setup(model)`` and callable as
	                ``head(features, img_info)``.
	            **kwargs: forwarded unchanged to ``CroCoNet.__init__``.
	        """
	        super().__init__(**kwargs)
	        # The head configures itself from the constructed network before
	        # being attached to it.
	        head.setup(self)
	        self.head = head

	    # The overrides below accept any positional/keyword arguments so that
	    # they stay call-compatible with whatever the base class passes to them.

	    def _set_mask_generator(self, *args, **kwargs):
	        """No mask generator"""
	        return

	    def _set_mask_token(self, *args, **kwargs):
	        """No mask token"""
	        self.mask_token = None
	        return

	    def _set_prediction_head(self, *args, **kwargs):
	        """No prediction head for downstream tasks, define your own head"""
	        return

	    def encode_image_pairs(self, img1, img2, return_all_blocks=False):
	        """Run the encoder for a pair of images.

	        It is actually ~5% faster to concatenate the images along the batch
	        dimension than to encode them separately (the naive alternative
	        would be two ``_encode_image`` calls, one per image).

	        Args:
	            img1: first image batch.
	            img2: second image batch, concatenated after ``img1`` on dim 0.
	            return_all_blocks: forwarded to ``_encode_image``; when true the
	                encoder output is treated as a sequence with one entry per
	                block.

	        Returns:
	            ``(out, out2, pos, pos2)``. ``out`` is the first-image features
	            (a list with one entry per block when ``return_all_blocks`` is
	            true); ``out2`` is the second-image features (last block only in
	            that case); ``pos`` and ``pos2`` are the two halves of the
	            positions returned by the encoder.
	        """
	        stacked = torch.cat((img1, img2), dim=0)
	        out, pos, _ = self._encode_image(
	            stacked,
	            do_mask=False,
	            return_all_blocks=return_all_blocks,
	        )
	        if return_all_blocks:
	            # Split every block's output back into its two per-image halves.
	            halves = [block_out.chunk(2, dim=0) for block_out in out]
	            out = [first for first, _ in halves]
	            # Only the last block is kept for the second image.
	            out2 = halves[-1][1]
	        else:
	            out, out2 = out.chunk(2, dim=0)
	        pos, pos2 = pos.chunk(2, dim=0)
	        return out, out2, pos, pos2

	    def forward(self, img1, img2):
	        """Encode both images, decode the first against the second, apply the head.

	        Args:
	            img1: first image batch; its last two dimensions give the
	                'height' and 'width' passed to the head.
	            img2: second image batch.

	        Returns:
	            Whatever ``self.head(decout, img_info)`` returns.
	        """
	        # Four-way unpacking also enforces a 4-D input.
	        _, _, height, width = img1.size()
	        img_info = {"height": height, "width": width}
	        # A head may opt in to receiving the output of every block by
	        # exposing a truthy `return_all_blocks` attribute.
	        return_all_blocks = getattr(self.head, "return_all_blocks", False)
	        out, out2, pos, pos2 = self.encode_image_pairs(
	            img1, img2, return_all_blocks=return_all_blocks
	        )
	        # The third decoder argument is passed as None in both branches
	        # (presumably the masks, unused here -- confirm against CroCoNet).
	        if return_all_blocks:
	            # Decode from the last encoder block only ...
	            decout = self._decoder(
	                out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks
	            )
	            # ... then hand the head the encoder outputs followed by the
	            # decoder outputs; `out` is a list, so this `+` concatenates
	            # (assumes the decoder also returns a list here).
	            decout = out + decout
	        else:
	            decout = self._decoder(
	                out, pos, None, out2, pos2, return_all_blocks=return_all_blocks
	            )
	        return self.head(decout, img_info)