Spaces: Running on Zero
Commit: minor adjustments

Files changed:
- app.py: +7 -15
- vggt_to_colmap.py: +0 -593
app.py CHANGED

@@ -1,21 +1,10 @@
+# https://huggingface.co/lch01/StreamVGGT
+
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-import shutil
-import os
-
-cache_dirs = [
-    os.path.expanduser("~/.cache/huggingface/hub"),
-    os.path.expanduser("~/.cache/torch/hub"),
-]
-
-for cache_dir in cache_dirs:
-    if os.path.exists(cache_dir):
-        print(f"Removing cache directory: {cache_dir}")
-        shutil.rmtree(cache_dir)
-
 import os
 import cv2
 import torch

@@ -447,12 +436,15 @@ with gr.Blocks(
 <p>
     <a href="https://github.com/wzzheng/StreamVGGT">GitHub Repository</a> |
    <a href="https://wzzheng.net/StreamVGGT/">Project Page</a> |
-    <a href="https://arxiv.org/abs/2507.11539">Paper</a>
+    <a href="https://arxiv.org/abs/2507.11539">Paper</a> |
+    <a href="https://huggingface.co/lch01/StreamVGGT">Hugging Face Model</a>
 </p>

+<div style="font-size: 20px; line-height: 1.5;">
+<p>Big thanks to the <a href="https://github.com/facebookresearch/vggt">VGGT</a> team for sharing your awesome code! We built this demo based on it.</p>
+
 <div style="font-size: 16px; line-height: 1.5;">
 <p>Upload a video or a set of images to create a 3D reconstruction of a scene or object. StreamVGGT takes these images and generates a 3D point cloud, along with estimated camera poses.</p>
-<p>Big thanks to the <a href="https://github.com/facebookresearch/vggt">VGGT</a> team for sharing your awesome code! We built this demo based on it.</p>

 <h3>Getting Started:</h3>
 <ol>
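For context on the first hunk: the removed startup block deleted the entire Hugging Face and torch hub caches with shutil.rmtree at import time, which would force every cached model download to repeat on the next start. A minimal sketch of inspecting that cache instead of wiping it, assuming the huggingface_hub package is installed (this snippet is illustrative and not part of the commit):

# Illustrative sketch (not part of the commit): list what lives in the
# Hugging Face hub cache that the removed block wiped wholesale.
# Assumes huggingface_hub is installed.
from huggingface_hub import scan_cache_dir

cache_info = scan_cache_dir()  # scans ~/.cache/huggingface/hub by default
print(f"Total cache size: {cache_info.size_on_disk / 1e9:.2f} GB")
for repo in cache_info.repos:
    print(f"  {repo.repo_id} ({repo.repo_type}): {repo.size_on_disk / 1e6:.1f} MB")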
vggt_to_colmap.py DELETED

@@ -1,593 +0,0 @@
-import os
-import argparse
-import numpy as np
-import torch
-import glob
-import struct
-from scipy.spatial.transform import Rotation
-import sys
-from PIL import Image
-import cv2
-import requests
-import tempfile
-
-from vggt.models.vggt import VGGT
-from vggt.utils.load_fn import load_and_preprocess_images
-from vggt.utils.pose_enc import pose_encoding_to_extri_intri
-from vggt.utils.geometry import unproject_depth_map_to_point_map
-
-def load_model(device=None):
-    """Load and initialize the VGGT model."""
-    if device is None:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"Using device: {device}")
-
-    model = VGGT.from_pretrained("facebook/VGGT-1B")
-
-    # model = VGGT()
-    # _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
-    # model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
-
-    model.eval()
-    model = model.to(device)
-    return model, device
-
-def process_images(image_dir, model, device):
-    """Process images with VGGT and return predictions."""
-    image_names = glob.glob(os.path.join(image_dir, "*"))
-    image_names = sorted([f for f in image_names if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
-    print(f"Found {len(image_names)} images")
-
-    if len(image_names) == 0:
-        raise ValueError(f"No images found in {image_dir}")
-
-    original_images = []
-    for img_path in image_names:
-        img = Image.open(img_path).convert('RGB')
-        original_images.append(np.array(img))
-
-    images = load_and_preprocess_images(image_names).to(device)
-    print(f"Preprocessed images shape: {images.shape}")
-
-    print("Running inference...")
-    dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
-
-    with torch.no_grad():
-        with torch.cuda.amp.autocast(dtype=dtype):
-            predictions = model(images)
-
-    print("Converting pose encoding to camera parameters...")
-    extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])
-    predictions["extrinsic"] = extrinsic
-    predictions["intrinsic"] = intrinsic
-
-    for key in predictions.keys():
-        if isinstance(predictions[key], torch.Tensor):
-            predictions[key] = predictions[key].cpu().numpy().squeeze(0)  # remove batch dimension
-
-    print("Computing 3D points from depth maps...")
-    depth_map = predictions["depth"]  # (S, H, W, 1)
-    world_points = unproject_depth_map_to_point_map(depth_map, predictions["extrinsic"], predictions["intrinsic"])
-    predictions["world_points_from_depth"] = world_points
-
-    predictions["original_images"] = original_images
-
-    S, H, W = world_points.shape[:3]
-    normalized_images = np.zeros((S, H, W, 3), dtype=np.float32)
-
-    for i, img in enumerate(original_images):
-        resized_img = cv2.resize(img, (W, H))
-        normalized_images[i] = resized_img / 255.0
-
-    predictions["images"] = normalized_images
-
-    return predictions, image_names
-
-def extrinsic_to_colmap_format(extrinsics):
-    """Convert extrinsic matrices to COLMAP format (quaternion + translation)."""
-    num_cameras = extrinsics.shape[0]
-    quaternions = []
-    translations = []
-
-    for i in range(num_cameras):
-        # VGGT's extrinsic is camera-to-world (R|t) format
-        R = extrinsics[i, :3, :3]
-        t = extrinsics[i, :3, 3]
-
-        # Convert rotation matrix to quaternion
-        # COLMAP quaternion format is [qw, qx, qy, qz]
-        rot = Rotation.from_matrix(R)
-        quat = rot.as_quat()  # scipy returns [x, y, z, w]
-        quat = np.array([quat[3], quat[0], quat[1], quat[2]])  # Convert to [w, x, y, z]
-
-        quaternions.append(quat)
-        translations.append(t)
-
-    return np.array(quaternions), np.array(translations)
-
-def download_file_from_url(url, filename):
-    """Downloads a file from a URL, handling redirects."""
-    try:
-        response = requests.get(url, allow_redirects=False)
-        response.raise_for_status()
-
-        if response.status_code == 302:
-            redirect_url = response.headers["Location"]
-            response = requests.get(redirect_url, stream=True)
-            response.raise_for_status()
-        else:
-            response = requests.get(url, stream=True)
-            response.raise_for_status()
-
-        with open(filename, "wb") as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
-        print(f"Downloaded {filename} successfully.")
-        return True
-
-    except requests.exceptions.RequestException as e:
-        print(f"Error downloading file: {e}")
-        return False
-
-def segment_sky(image_path, onnx_session, mask_filename=None):
-    """
-    Segments sky from an image using an ONNX model.
-    """
-    image = cv2.imread(image_path)
-
-    result_map = run_skyseg(onnx_session, [320, 320], image)
-    result_map_original = cv2.resize(result_map, (image.shape[1], image.shape[0]))
-
-    # Fix: Invert the mask so that 255 = non-sky, 0 = sky
-    # The model outputs low values for sky, high values for non-sky
-    output_mask = np.zeros_like(result_map_original)
-    output_mask[result_map_original < 32] = 255  # Use threshold of 32
-
-    if mask_filename is not None:
-        os.makedirs(os.path.dirname(mask_filename), exist_ok=True)
-        cv2.imwrite(mask_filename, output_mask)
-
-    return output_mask
-
-def run_skyseg(onnx_session, input_size, image):
-    """
-    Runs sky segmentation inference using ONNX model.
-    """
-    import copy
-
-    temp_image = copy.deepcopy(image)
-    resize_image = cv2.resize(temp_image, dsize=(input_size[0], input_size[1]))
-    x = cv2.cvtColor(resize_image, cv2.COLOR_BGR2RGB)
-    x = np.array(x, dtype=np.float32)
-    mean = [0.485, 0.456, 0.406]
-    std = [0.229, 0.224, 0.225]
-    x = (x / 255 - mean) / std
-    x = x.transpose(2, 0, 1)
-    x = x.reshape(-1, 3, input_size[0], input_size[1]).astype("float32")
-
-    input_name = onnx_session.get_inputs()[0].name
-    output_name = onnx_session.get_outputs()[0].name
-    onnx_result = onnx_session.run([output_name], {input_name: x})
-
-    onnx_result = np.array(onnx_result).squeeze()
-    min_value = np.min(onnx_result)
-    max_value = np.max(onnx_result)
-    onnx_result = (onnx_result - min_value) / (max_value - min_value)
-    onnx_result *= 255
-    onnx_result = onnx_result.astype("uint8")
-
-    return onnx_result
-
-def filter_and_prepare_points(predictions, conf_threshold, mask_sky=False, mask_black_bg=False,
-                              mask_white_bg=False, stride=1, prediction_mode="Depthmap and Camera Branch"):
-    """
-    Filter points based on confidence and prepare for COLMAP format.
-    Implementation matches the conventions in the original VGGT code.
-    """
-
-    if "Pointmap" in prediction_mode:
-        print("Using Pointmap Branch")
-        if "world_points" in predictions:
-            pred_world_points = predictions["world_points"]
-            pred_world_points_conf = predictions.get("world_points_conf", np.ones_like(pred_world_points[..., 0]))
-        else:
-            print("Warning: world_points not found in predictions, falling back to depth-based points")
-            pred_world_points = predictions["world_points_from_depth"]
-            pred_world_points_conf = predictions.get("depth_conf", np.ones_like(pred_world_points[..., 0]))
-    else:
-        print("Using Depthmap and Camera Branch")
-        pred_world_points = predictions["world_points_from_depth"]
-        pred_world_points_conf = predictions.get("depth_conf", np.ones_like(pred_world_points[..., 0]))
-
-    colors_rgb = predictions["images"]
-
-    S, H, W = pred_world_points.shape[:3]
-    if colors_rgb.shape[:3] != (S, H, W):
-        print(f"Reshaping colors_rgb from {colors_rgb.shape} to match {(S, H, W, 3)}")
-        reshaped_colors = np.zeros((S, H, W, 3), dtype=np.float32)
-        for i in range(S):
-            if i < len(colors_rgb):
-                reshaped_colors[i] = cv2.resize(colors_rgb[i], (W, H))
-        colors_rgb = reshaped_colors
-
-    colors_rgb = (colors_rgb * 255).astype(np.uint8)
-
-    if mask_sky:
-        print("Applying sky segmentation mask")
-        try:
-            import onnxruntime
-
-            with tempfile.TemporaryDirectory() as temp_dir:
-                print(f"Created temporary directory for sky segmentation: {temp_dir}")
-                temp_images_dir = os.path.join(temp_dir, "images")
-                sky_masks_dir = os.path.join(temp_dir, "sky_masks")
-                os.makedirs(temp_images_dir, exist_ok=True)
-                os.makedirs(sky_masks_dir, exist_ok=True)
-
-                image_list = []
-                for i, img in enumerate(colors_rgb):
-                    img_path = os.path.join(temp_images_dir, f"image_{i:04d}.png")
-                    image_list.append(img_path)
-                    cv2.imwrite(img_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
-
-                skyseg_path = os.path.join(temp_dir, "skyseg.onnx")
-                if not os.path.exists("skyseg.onnx"):
-                    print("Downloading skyseg.onnx...")
-                    download_success = download_file_from_url(
-                        "https://huggingface.co/JianyuanWang/skyseg/resolve/main/skyseg.onnx",
-                        skyseg_path
-                    )
-                    if not download_success:
-                        print("Failed to download skyseg model, skipping sky filtering")
-                        mask_sky = False
-                else:
-                    import shutil
-                    shutil.copy("skyseg.onnx", skyseg_path)
-
-                if mask_sky:
-                    skyseg_session = onnxruntime.InferenceSession(skyseg_path)
-                    sky_mask_list = []
-
-                    for img_path in image_list:
-                        mask_path = os.path.join(sky_masks_dir, os.path.basename(img_path))
-                        sky_mask = segment_sky(img_path, skyseg_session, mask_path)
-
-                        if sky_mask.shape[0] != H or sky_mask.shape[1] != W:
-                            sky_mask = cv2.resize(sky_mask, (W, H))
-
-                        sky_mask_list.append(sky_mask)
-
-                    sky_mask_array = np.array(sky_mask_list)
-
-                    sky_mask_binary = (sky_mask_array > 0.1).astype(np.float32)
-                    pred_world_points_conf = pred_world_points_conf * sky_mask_binary
-                    print(f"Applied sky mask, shape: {sky_mask_binary.shape}")
-
-        except (ImportError, Exception) as e:
-            print(f"Error in sky segmentation: {e}")
-            mask_sky = False
-
-    vertices_3d = pred_world_points.reshape(-1, 3)
-    conf = pred_world_points_conf.reshape(-1)
-    colors_rgb_flat = colors_rgb.reshape(-1, 3)
-
-    if len(conf) != len(colors_rgb_flat):
-        print(f"WARNING: Shape mismatch between confidence ({len(conf)}) and colors ({len(colors_rgb_flat)})")
-        min_size = min(len(conf), len(colors_rgb_flat))
-        conf = conf[:min_size]
-        vertices_3d = vertices_3d[:min_size]
-        colors_rgb_flat = colors_rgb_flat[:min_size]
-
-    if conf_threshold == 0.0:
-        conf_thres_value = 0.0
-    else:
-        conf_thres_value = np.percentile(conf, conf_threshold)
-
-    print(f"Using confidence threshold: {conf_threshold}% (value: {conf_thres_value:.4f})")
-    conf_mask = (conf >= conf_thres_value) & (conf > 1e-5)
-
-    if mask_black_bg:
-        print("Filtering black background")
-        black_bg_mask = colors_rgb_flat.sum(axis=1) >= 16
-        conf_mask = conf_mask & black_bg_mask
-
-    if mask_white_bg:
-        print("Filtering white background")
-        white_bg_mask = ~((colors_rgb_flat[:, 0] > 240) & (colors_rgb_flat[:, 1] > 240) & (colors_rgb_flat[:, 2] > 240))
-        conf_mask = conf_mask & white_bg_mask
-
-    filtered_vertices = vertices_3d[conf_mask]
-    filtered_colors = colors_rgb_flat[conf_mask]
-
-    if len(filtered_vertices) == 0:
-        print("Warning: No points remaining after filtering. Using default point.")
-        filtered_vertices = np.array([[0, 0, 0]])
-        filtered_colors = np.array([[200, 200, 200]])
-
-    print(f"Filtered to {len(filtered_vertices)} points")
-
-    points3D = []
-    point_indices = {}
-    image_points2D = [[] for _ in range(len(pred_world_points))]
-
-    print(f"Preparing points for COLMAP format with stride {stride}...")
-
-    total_points = 0
-    for img_idx in range(S):
-        for y in range(0, H, stride):
-            for x in range(0, W, stride):
-                flat_idx = img_idx * H * W + y * W + x
-
-                if flat_idx >= len(conf):
-                    continue
-
-                if conf[flat_idx] < conf_thres_value or conf[flat_idx] <= 1e-5:
-                    continue
-
-                if mask_black_bg and colors_rgb_flat[flat_idx].sum() < 16:
-                    continue
-
-                if mask_white_bg and all(colors_rgb_flat[flat_idx] > 240):
-                    continue
-
-                point3D = vertices_3d[flat_idx]
-                rgb = colors_rgb_flat[flat_idx]
-
-                if not np.all(np.isfinite(point3D)):
-                    continue
-
-                point_hash = hash_point(point3D, scale=100)
-
-                if point_hash not in point_indices:
-                    point_idx = len(points3D)
-                    point_indices[point_hash] = point_idx
-
-                    point_entry = {
-                        "id": point_idx,
-                        "xyz": point3D,
-                        "rgb": rgb,
-                        "error": 1.0,
-                        "track": [(img_idx, len(image_points2D[img_idx]))]
-                    }
-                    points3D.append(point_entry)
-                    total_points += 1
-                else:
-                    point_idx = point_indices[point_hash]
-                    points3D[point_idx]["track"].append((img_idx, len(image_points2D[img_idx])))
-
-                image_points2D[img_idx].append((x, y, point_indices[point_hash]))
-
-    print(f"Prepared {len(points3D)} 3D points with {sum(len(pts) for pts in image_points2D)} observations for COLMAP")
-    return points3D, image_points2D
-
-def hash_point(point, scale=100):
-    """Create a hash for a 3D point by quantizing coordinates."""
-    quantized = tuple(np.round(point * scale).astype(int))
-    return hash(quantized)
-
-def write_colmap_cameras_txt(file_path, intrinsics, image_width, image_height):
-    """Write camera intrinsics to COLMAP cameras.txt format."""
-    with open(file_path, 'w') as f:
-        f.write("# Camera list with one line of data per camera:\n")
-        f.write("# CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n")
-        f.write(f"# Number of cameras: {len(intrinsics)}\n")
-
-        for i, intrinsic in enumerate(intrinsics):
-            camera_id = i + 1  # COLMAP uses 1-indexed camera IDs
-            model = "PINHOLE"
-
-            fx = intrinsic[0, 0]
-            fy = intrinsic[1, 1]
-            cx = intrinsic[0, 2]
-            cy = intrinsic[1, 2]
-
-            f.write(f"{camera_id} {model} {image_width} {image_height} {fx} {fy} {cx} {cy}\n")
-
-def write_colmap_images_txt(file_path, quaternions, translations, image_points2D, image_names):
-    """Write camera poses and keypoints to COLMAP images.txt format."""
-    with open(file_path, 'w') as f:
-        f.write("# Image list with two lines of data per image:\n")
-        f.write("# IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n")
-        f.write("# POINTS2D[] as (X, Y, POINT3D_ID)\n")
-
-        num_points = sum(len(points) for points in image_points2D)
-        avg_points = num_points / len(image_points2D) if image_points2D else 0
-        f.write(f"# Number of images: {len(quaternions)}, mean observations per image: {avg_points:.1f}\n")
-
-        for i in range(len(quaternions)):
-            image_id = i + 1
-            camera_id = i + 1
-
-            qw, qx, qy, qz = quaternions[i]
-            tx, ty, tz = translations[i]
-
-            f.write(f"{image_id} {qw} {qx} {qy} {qz} {tx} {ty} {tz} {camera_id} {os.path.basename(image_names[i])}\n")
-
-            points_line = " ".join([f"{x} {y} {point3d_id+1}" for x, y, point3d_id in image_points2D[i]])
-            f.write(f"{points_line}\n")
-
-def write_colmap_points3D_txt(file_path, points3D):
-    """Write 3D points and tracks to COLMAP points3D.txt format."""
-    with open(file_path, 'w') as f:
-        f.write("# 3D point list with one line of data per point:\n")
-        f.write("# POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n")
-
-        avg_track_length = sum(len(point["track"]) for point in points3D) / len(points3D) if points3D else 0
-        f.write(f"# Number of points: {len(points3D)}, mean track length: {avg_track_length:.4f}\n")
-
-        for point in points3D:
-            point_id = point["id"] + 1
-            x, y, z = point["xyz"]
-            r, g, b = point["rgb"]
-            error = point["error"]
-
-            track = " ".join([f"{img_id+1} {point2d_idx}" for img_id, point2d_idx in point["track"]])
-
-            f.write(f"{point_id} {x} {y} {z} {int(r)} {int(g)} {int(b)} {error} {track}\n")
-
-def write_colmap_cameras_bin(file_path, intrinsics, image_width, image_height):
-    """Write camera intrinsics to COLMAP cameras.bin format."""
-    with open(file_path, 'wb') as fid:
-        # Write number of cameras (uint64)
-        fid.write(struct.pack('<Q', len(intrinsics)))
-
-        for i, intrinsic in enumerate(intrinsics):
-            camera_id = i + 1
-            model_id = 1
-
-            fx = float(intrinsic[0, 0])
-            fy = float(intrinsic[1, 1])
-            cx = float(intrinsic[0, 2])
-            cy = float(intrinsic[1, 2])
-
-            # Camera ID (uint32)
-            fid.write(struct.pack('<I', camera_id))
-            # Model ID (uint32)
-            fid.write(struct.pack('<I', model_id))
-            # Width (uint64)
-            fid.write(struct.pack('<Q', image_width))
-            # Height (uint64)
-            fid.write(struct.pack('<Q', image_height))
-
-            # Parameters (double)
-            fid.write(struct.pack('<dddd', fx, fy, cx, cy))
-
-def write_colmap_images_bin(file_path, quaternions, translations, image_points2D, image_names):
-    """Write camera poses and keypoints to COLMAP images.bin format."""
-    with open(file_path, 'wb') as fid:
-        # Write number of images (uint64)
-        fid.write(struct.pack('<Q', len(quaternions)))
-
-        for i in range(len(quaternions)):
-            image_id = i + 1
-            camera_id = i + 1
-
-            qw, qx, qy, qz = quaternions[i].astype(float)
-            tx, ty, tz = translations[i].astype(float)
-
-            image_name = os.path.basename(image_names[i]).encode()
-            points = image_points2D[i]
-
-            # Image ID (uint32)
-            fid.write(struct.pack('<I', image_id))
-            # Quaternion (double): qw, qx, qy, qz
-            fid.write(struct.pack('<dddd', qw, qx, qy, qz))
-            # Translation (double): tx, ty, tz
-            fid.write(struct.pack('<ddd', tx, ty, tz))
-            # Camera ID (uint32)
-            fid.write(struct.pack('<I', camera_id))
-            # Image name
-            fid.write(struct.pack('<I', len(image_name)))
-            fid.write(image_name)
-
-            # Write number of 2D points (uint64)
-            fid.write(struct.pack('<Q', len(points)))
-
-            # Write 2D points: x, y, point3D_id
-            for x, y, point3d_id in points:
-                fid.write(struct.pack('<dd', float(x), float(y)))
-                fid.write(struct.pack('<Q', point3d_id + 1))
-
-def write_colmap_points3D_bin(file_path, points3D):
-    """Write 3D points and tracks to COLMAP points3D.bin format."""
-    with open(file_path, 'wb') as fid:
-        # Write number of points (uint64)
-        fid.write(struct.pack('<Q', len(points3D)))
-
-        for point in points3D:
-            point_id = point["id"] + 1
-            x, y, z = point["xyz"].astype(float)
-            r, g, b = point["rgb"].astype(np.uint8)
-            error = float(point["error"])
-            track = point["track"]
-
-            # Point ID (uint64)
-            fid.write(struct.pack('<Q', point_id))
-            # Position (double): x, y, z
-            fid.write(struct.pack('<ddd', x, y, z))
-            # Color (uint8): r, g, b
-            fid.write(struct.pack('<BBB', int(r), int(g), int(b)))
-            # Error (double)
-            fid.write(struct.pack('<d', error))
-
-            # Track: list of (image_id, point2D_idx)
-            fid.write(struct.pack('<Q', len(track)))
-            for img_id, point2d_idx in track:
-                fid.write(struct.pack('<II', img_id + 1, point2d_idx))
-
-def main():
-    parser = argparse.ArgumentParser(description="Convert images to COLMAP format using VGGT")
-    parser.add_argument("--image_dir", type=str, required=True,
-                        help="Directory containing input images")
-    parser.add_argument("--output_dir", type=str, default="colmap_output",
-                        help="Directory to save COLMAP files")
-    parser.add_argument("--conf_threshold", type=float, default=50.0,
-                        help="Confidence threshold (0-100%) for including points")
-    parser.add_argument("--mask_sky", action="store_true",
-                        help="Filter out points likely to be sky")
-    parser.add_argument("--mask_black_bg", action="store_true",
-                        help="Filter out points with very dark/black color")
-    parser.add_argument("--mask_white_bg", action="store_true",
-                        help="Filter out points with very bright/white color")
-    parser.add_argument("--binary", action="store_true",
-                        help="Output binary COLMAP files instead of text")
-    parser.add_argument("--stride", type=int, default=1,
-                        help="Stride for point sampling (higher = fewer points)")
-    parser.add_argument("--prediction_mode", type=str, default="Depthmap and Camera Branch",
-                        choices=["Depthmap and Camera Branch", "Pointmap Branch"],
-                        help="Which prediction branch to use")
-
-    args = parser.parse_args()
-
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    model, device = load_model()
-
-    predictions, image_names = process_images(args.image_dir, model, device)
-
-    print("Converting camera parameters to COLMAP format...")
-    quaternions, translations = extrinsic_to_colmap_format(predictions["extrinsic"])
-
-    print(f"Filtering points with confidence threshold {args.conf_threshold}% and stride {args.stride}...")
-    points3D, image_points2D = filter_and_prepare_points(
-        predictions,
-        args.conf_threshold,
-        mask_sky=args.mask_sky,
-        mask_black_bg=args.mask_black_bg,
-        mask_white_bg=args.mask_white_bg,
-        stride=args.stride,
-        prediction_mode=args.prediction_mode
-    )
-
-    height, width = predictions["depth"].shape[1:3]
-
-    print(f"Writing {'binary' if args.binary else 'text'} COLMAP files to {args.output_dir}...")
-    if args.binary:
-        write_colmap_cameras_bin(
-            os.path.join(args.output_dir, "cameras.bin"),
-            predictions["intrinsic"], width, height)
-        write_colmap_images_bin(
-            os.path.join(args.output_dir, "images.bin"),
-            quaternions, translations, image_points2D, image_names)
-        write_colmap_points3D_bin(
-            os.path.join(args.output_dir, "points3D.bin"),
-            points3D)
-    else:
-        write_colmap_cameras_txt(
-            os.path.join(args.output_dir, "cameras.txt"),
-            predictions["intrinsic"], width, height)
-        write_colmap_images_txt(
-            os.path.join(args.output_dir, "images.txt"),
-            quaternions, translations, image_points2D, image_names)
-        write_colmap_points3D_txt(
-            os.path.join(args.output_dir, "points3D.txt"),
-            points3D)
-
-    print(f"COLMAP files successfully written to {args.output_dir}")
-
-if __name__ == "__main__":
-    main()
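The deleted script was a self-contained CLI. Going by its argparse definition above, a typical invocation (directory paths here are illustrative) would have been:

python vggt_to_colmap.py --image_dir ./frames --output_dir ./colmap_output --conf_threshold 50 --stride 2 --binary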
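extrinsic_to_colmap_format above hinges on a quaternion-convention swap: SciPy's Rotation.as_quat() returns [x, y, z, w], while COLMAP stores [qw, qx, qy, qz]. A small self-check of that reordering on the identity rotation:

# Self-check of the scipy -> COLMAP quaternion reordering used above.
import numpy as np
from scipy.spatial.transform import Rotation

quat_xyzw = Rotation.from_matrix(np.eye(3)).as_quat()  # scipy order: [x, y, z, w]
quat_wxyz = np.array([quat_xyzw[3], *quat_xyzw[:3]])   # COLMAP order: [w, x, y, z]
assert np.allclose(quat_wxyz, [1.0, 0.0, 0.0, 0.0])    # identity -> [1, 0, 0, 0]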
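The binary writers pack little-endian fields with struct. A minimal matching reader for cameras.bin, mirroring the layout written by write_colmap_cameras_bin (read_colmap_cameras_bin is a hypothetical helper sketched here, not part of the deleted file):

# Hypothetical reader mirroring write_colmap_cameras_bin above:
# <Q num_cameras, then per camera: <I camera_id, <I model_id,
# <Q width, <Q height, <dddd fx fy cx cy (PINHOLE, model_id 1).
import struct

def read_colmap_cameras_bin(path):
    cameras = {}
    with open(path, "rb") as fid:
        (num_cameras,) = struct.unpack("<Q", fid.read(8))
        for _ in range(num_cameras):
            camera_id, model_id = struct.unpack("<II", fid.read(8))
            width, height = struct.unpack("<QQ", fid.read(16))
            fx, fy, cx, cy = struct.unpack("<dddd", fid.read(32))
            cameras[camera_id] = {
                "model_id": model_id,
                "width": width,
                "height": height,
                "params": (fx, fy, cx, cy),
            }
    return cameras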