aknapitsch committed
Commit c226bc0 · 1 Parent(s): eb74057

pointcloud as mesh
app.py CHANGED
@@ -22,12 +22,12 @@ sys.path.append("mapanything/")
 
 from mapanything.utils.geometry import depthmap_to_world_frame, points_to_normals
 from mapanything.utils.hf_utils.css_and_html import (
+    GRADIO_CSS,
+    MEASURE_INSTRUCTIONS_HTML,
     get_acknowledgements_html,
     get_description_html,
     get_gradio_theme,
     get_header_html,
-    GRADIO_CSS,
-    MEASURE_INSTRUCTIONS_HTML,
 )
 from mapanything.utils.hf_utils.hf_helpers import initialize_mapanything_model
 from mapanything.utils.hf_utils.visual_util import predictions_to_glb
@@ -37,7 +37,7 @@ from mapanything.utils.image import load_images, rgb
 def get_logo_base64():
     """Convert WAI logo to base64 for embedding in HTML"""
     import base64
-
+
     logo_path = "examples/wai_logo/wai_logo.png"
     try:
         with open(logo_path, "rb") as img_file:
@@ -506,9 +506,7 @@ def gradio_demo(
 
     print("Running MapAnything model...")
     with torch.no_grad():
-        predictions, processed_data = run_model(
-            target_dir, apply_mask, mask_edges
-        )
+        predictions, processed_data = run_model(target_dir, apply_mask, mask_edges)
 
     # Save predictions
     prediction_save_path = os.path.join(target_dir, "predictions.npz")
@@ -534,6 +532,7 @@
         mask_sky=filter_sky,
         mask_black_bg=filter_black_bg,
        mask_white_bg=filter_white_bg,
+        as_mesh=True,  # Default to True for reconstruction
     )
     glbscene.export(file_obj=glbfile)
 
@@ -876,6 +875,7 @@ def update_visualization(
     filter_sky=False,
     filter_black_bg=False,
     filter_white_bg=False,
+    show_mesh=True,
 ):
     """
     Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
@@ -923,6 +923,7 @@
         mask_sky=filter_sky,
         mask_black_bg=filter_black_bg,
         mask_white_bg=filter_white_bg,
+        as_mesh=show_mesh,
     )
     glbscene.export(file_obj=glbfile)
 
@@ -1145,6 +1146,7 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
         with gr.Column():
             gr.Markdown("### Pointcloud options (live updates)")
             show_cam = gr.Checkbox(label="Show Camera", value=True)
+            show_mesh = gr.Checkbox(label="Show mesh", value=True)
             filter_sky = gr.Checkbox(
                 label="Filter Sky (using skyseg.onnx)", value=False
             )
@@ -1160,7 +1162,7 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
            )
            mask_edges_checkbox = apply_mask_checkbox
    # ---------------------- Example Scenes Section ----------------------
-    gr.Markdown("## Example Scenes")
+    gr.Markdown("## Example Scenes (lists all scenes in the examples folder)")
    gr.Markdown("Click any thumbnail to load the scene for reconstruction.")
 
    # Get scene information
@@ -1305,6 +1307,22 @@ with gr.Blocks(theme=theme, css=GRADIO_CSS) as demo:
            filter_sky,
            filter_black_bg,
            filter_white_bg,
+            show_mesh,
+        ],
+        [reconstruction_output, log_output],
+    )
+
+    show_mesh.change(
+        update_visualization,
+        [
+            target_dir_output,
+            frame_filter,
+            show_cam,
+            is_example,
+            filter_sky,
+            filter_black_bg,
+            filter_white_bg,
+            show_mesh,
        ],
        [reconstruction_output, log_output],
    )
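
The new `as_mesh` flag can also be exercised outside the Gradio app. A minimal sketch, not part of the commit, assuming a `predictions.npz` previously saved by `gradio_demo` in the target directory:

    import numpy as np
    from mapanything.utils.hf_utils.visual_util import predictions_to_glb

    predictions = dict(np.load("predictions.npz", allow_pickle=True))
    scene = predictions_to_glb(predictions, as_mesh=True)  # mesh instead of point cloud
    scene.export(file_obj="scene_mesh.glb")
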
mapanything/utils/hf_utils/moge_utils.py ADDED
@@ -0,0 +1,639 @@
+import numpy as np
+from typing import *
+from numbers import Number
+import warnings
+import functools
+
+from ._helpers import batched
+from . import transforms
+from . import mesh
+
+__all__ = [
+    'sliding_window_1d',
+    'sliding_window_nd',
+    'sliding_window_2d',
+    'max_pool_1d',
+    'max_pool_2d',
+    'max_pool_nd',
+    'depth_edge',
+    'normals_edge',
+    'depth_aliasing',
+    'interpolate',
+    'image_scrcoord',
+    'image_uv',
+    'image_pixel_center',
+    'image_pixel',
+    'image_mesh',
+    'image_mesh_from_depth',
+    'points_to_normals',
+    'depth_to_normals',
+    'chessboard',
+    'cube',
+    'icosahedron',
+    'square',
+    'camera_frustum',
+    'to4x4'
+]
+
+
+def no_runtime_warnings(fn):
+    """
+    Disable runtime warnings in numpy.
+    """
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            return fn(*args, **kwargs)
+    return wrapper
+
+
+def sliding_window_1d(x: np.ndarray, window_size: int, stride: int, axis: int = -1):
+    """
+    Return a view of the input array with a sliding window of the given window size and stride.
+    The sliding window is performed over the given axis, and the window dimension is appended to the end of the output array's shape.
+
+    Args:
+        x (np.ndarray): input array with shape (..., axis_size, ...)
+        window_size (int): size of the sliding window
+        stride (int): stride of the sliding window
+        axis (int): axis to perform sliding window over
+
+    Returns:
+        x_sliding (np.ndarray): view of the input array with shape (..., n_windows, ..., window_size), where n_windows = (axis_size - window_size + 1) // stride
+    """
+    assert x.shape[axis] >= window_size, f"window_size ({window_size}) is larger than axis_size ({x.shape[axis]})"
+    axis = axis % x.ndim
+    shape = (*x.shape[:axis], (x.shape[axis] - window_size + 1) // stride, *x.shape[axis + 1:], window_size)
+    strides = (*x.strides[:axis], stride * x.strides[axis], *x.strides[axis + 1:], x.strides[axis])
+    x_sliding = np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
+    return x_sliding
+
+
+def sliding_window_nd(x: np.ndarray, window_size: Tuple[int, ...], stride: Tuple[int, ...], axis: Tuple[int, ...]) -> np.ndarray:
+    axis = [axis[i] % x.ndim for i in range(len(axis))]
+    for i in range(len(axis)):
+        x = sliding_window_1d(x, window_size[i], stride[i], axis[i])
+    return x
+
+
+def sliding_window_2d(x: np.ndarray, window_size: Union[int, Tuple[int, int]], stride: Union[int, Tuple[int, int]], axis: Tuple[int, int] = (-2, -1)) -> np.ndarray:
+    if isinstance(window_size, int):
+        window_size = (window_size, window_size)
+    if isinstance(stride, int):
+        stride = (stride, stride)
+    return sliding_window_nd(x, window_size, stride, axis)
+
+
+def max_pool_1d(x: np.ndarray, kernel_size: int, stride: int, padding: int = 0, axis: int = -1):
+    axis = axis % x.ndim
+    if padding > 0:
+        fill_value = np.nan if x.dtype.kind == 'f' else np.iinfo(x.dtype).min
+        padding_arr = np.full((*x.shape[:axis], padding, *x.shape[axis + 1:]), fill_value=fill_value, dtype=x.dtype)
+        x = np.concatenate([padding_arr, x, padding_arr], axis=axis)
+    a_sliding = sliding_window_1d(x, kernel_size, stride, axis)
+    max_pool = np.nanmax(a_sliding, axis=-1)
+    return max_pool
+
+
+def max_pool_nd(x: np.ndarray, kernel_size: Tuple[int, ...], stride: Tuple[int, ...], padding: Tuple[int, ...], axis: Tuple[int, ...]) -> np.ndarray:
+    for i in range(len(axis)):
+        x = max_pool_1d(x, kernel_size[i], stride[i], padding[i], axis[i])
+    return x
+
+
+def max_pool_2d(x: np.ndarray, kernel_size: Union[int, Tuple[int, int]], stride: Union[int, Tuple[int, int]], padding: Union[int, Tuple[int, int]], axis: Tuple[int, int] = (-2, -1)):
+    if isinstance(kernel_size, Number):
+        kernel_size = (kernel_size, kernel_size)
+    if isinstance(stride, Number):
+        stride = (stride, stride)
+    if isinstance(padding, Number):
+        padding = (padding, padding)
+    axis = tuple(axis)
+    return max_pool_nd(x, kernel_size, stride, padding, axis)
+
+
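A standalone sketch of the trick `max_pool_1d` builds on: pad with NaN, take strided windows, then `nanmax` so the padding never wins (`sliding_window_view` stands in for the `as_strided` call above):

    import numpy as np

    x = np.array([3., 1., 4., 1., 5.], dtype=np.float32)
    padded = np.concatenate([[np.nan], x, [np.nan]])               # padding=1, NaN fill
    windows = np.lib.stride_tricks.sliding_window_view(padded, 3)  # kernel_size=3, stride=1
    print(np.nanmax(windows, axis=-1))                             # [3. 4. 4. 5. 5.]
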
+@no_runtime_warnings
+def depth_edge(depth: np.ndarray, atol: float = None, rtol: float = None, kernel_size: int = 3, mask: np.ndarray = None) -> np.ndarray:
+    """
+    Compute the edge mask of a depth map. The edge is defined as pixels whose neighbors have a large difference in depth.
+
+    Args:
+        depth (np.ndarray): shape (..., height, width), linear depth map
+        atol (float): absolute tolerance
+        rtol (float): relative tolerance
+
+    Returns:
+        edge (np.ndarray): shape (..., height, width) of dtype bool
+    """
+    if mask is None:
+        diff = (max_pool_2d(depth, kernel_size, stride=1, padding=kernel_size // 2) + max_pool_2d(-depth, kernel_size, stride=1, padding=kernel_size // 2))
+    else:
+        diff = (max_pool_2d(np.where(mask, depth, -np.inf), kernel_size, stride=1, padding=kernel_size // 2) + max_pool_2d(np.where(mask, -depth, -np.inf), kernel_size, stride=1, padding=kernel_size // 2))
+
+    edge = np.zeros_like(depth, dtype=bool)
+    if atol is not None:
+        edge |= diff > atol
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        if rtol is not None:
+            edge |= diff / depth > rtol
+    return edge
+
+
+@no_runtime_warnings
+def depth_aliasing(depth: np.ndarray, atol: float = None, rtol: float = None, kernel_size: int = 3, mask: np.ndarray = None) -> np.ndarray:
+    """
+    Compute a map that indicates aliasing in a depth map. Aliasing is defined as pixels that are close to neither the maximum nor the minimum of their neighbors.
+    Args:
+        depth (np.ndarray): shape (..., height, width), linear depth map
+        atol (float): absolute tolerance
+        rtol (float): relative tolerance
+
+    Returns:
+        edge (np.ndarray): shape (..., height, width) of dtype bool
+    """
+    if mask is None:
+        diff_max = max_pool_2d(depth, kernel_size, stride=1, padding=kernel_size // 2) - depth
+        diff_min = max_pool_2d(-depth, kernel_size, stride=1, padding=kernel_size // 2) + depth
+    else:
+        diff_max = max_pool_2d(np.where(mask, depth, -np.inf), kernel_size, stride=1, padding=kernel_size // 2) - depth
+        diff_min = max_pool_2d(np.where(mask, -depth, -np.inf), kernel_size, stride=1, padding=kernel_size // 2) + depth
+    diff = np.minimum(diff_max, diff_min)
+
+    edge = np.zeros_like(depth, dtype=bool)
+    if atol is not None:
+        edge |= diff > atol
+    if rtol is not None:
+        edge |= diff / depth > rtol
+    return edge
+
+
+@no_runtime_warnings
+def normals_edge(normals: np.ndarray, tol: float, kernel_size: int = 3, mask: np.ndarray = None) -> np.ndarray:
+    """
+    Compute the edge mask of a normal map.
+
+    Args:
+        normals (np.ndarray): shape (..., height, width, 3), normal map
+        tol (float): tolerance in degrees
+
+    Returns:
+        edge (np.ndarray): shape (..., height, width) of dtype bool
+    """
+    assert normals.ndim >= 3 and normals.shape[-1] == 3, "normals should be of shape (..., height, width, 3)"
+    normals = normals / (np.linalg.norm(normals, axis=-1, keepdims=True) + 1e-12)
+
+    padding = kernel_size // 2
+    normals_window = sliding_window_2d(
+        np.pad(normals, (*([(0, 0)] * (normals.ndim - 3)), (padding, padding), (padding, padding), (0, 0)), mode='edge'),
+        window_size=kernel_size,
+        stride=1,
+        axis=(-3, -2)
+    )
+    if mask is None:
+        angle_diff = np.arccos((normals[..., None, None] * normals_window).sum(axis=-3)).max(axis=(-2, -1))
+    else:
+        mask_window = sliding_window_2d(
+            np.pad(mask, (*([(0, 0)] * (mask.ndim - 3)), (padding, padding), (padding, padding)), mode='edge'),
+            window_size=kernel_size,
+            stride=1,
+            axis=(-3, -2)
+        )
+        angle_diff = np.where(mask_window, np.arccos((normals[..., None, None] * normals_window).sum(axis=-3)), 0).max(axis=(-2, -1))
+
+    angle_diff = max_pool_2d(angle_diff, kernel_size, stride=1, padding=kernel_size // 2)
+    edge = angle_diff > np.deg2rad(tol)
+    return edge
+
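A standalone sketch of the `depth_edge` computation (max minus min over each 3x3 window, compared against `rtol`), on a synthetic two-plane depth map with a step discontinuity:

    import numpy as np

    depth = np.ones((6, 6), dtype=np.float32)
    depth[:, 3:] = 5.0                                   # depth jump down the middle
    pad = np.pad(depth, 1, constant_values=np.nan)
    win = np.lib.stride_tricks.sliding_window_view(pad, (3, 3))
    diff = np.nanmax(win, axis=(-2, -1)) - np.nanmin(win, axis=(-2, -1))
    edge = diff / depth > 0.1                            # True in the columns flanking the jump
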
+@no_runtime_warnings
+def points_to_normals(point: np.ndarray, mask: np.ndarray = None) -> np.ndarray:
+    """
+    Calculate a normal map from a point map. Value range is [-1, 1]. Normal direction is in the OpenGL identity camera's coordinate system.
+
+    Args:
+        point (np.ndarray): shape (height, width, 3), point map
+        mask (np.ndarray, optional): shape (height, width), mask of valid points
+    Returns:
+        normal (np.ndarray): shape (height, width, 3), normal map.
+        normal_mask (np.ndarray): shape (height, width), returned only when a mask is given.
+    """
+    height, width = point.shape[-3:-1]
+    has_mask = mask is not None
+
+    if mask is None:
+        mask = np.ones_like(point[..., 0], dtype=bool)
+    mask_pad = np.zeros((height + 2, width + 2), dtype=bool)
+    mask_pad[1:-1, 1:-1] = mask
+    mask = mask_pad
+
+    pts = np.zeros((height + 2, width + 2, 3), dtype=point.dtype)
+    pts[1:-1, 1:-1, :] = point
+    up = pts[:-2, 1:-1, :] - pts[1:-1, 1:-1, :]
+    left = pts[1:-1, :-2, :] - pts[1:-1, 1:-1, :]
+    down = pts[2:, 1:-1, :] - pts[1:-1, 1:-1, :]
+    right = pts[1:-1, 2:, :] - pts[1:-1, 1:-1, :]
+    normal = np.stack([
+        np.cross(up, left, axis=-1),
+        np.cross(left, down, axis=-1),
+        np.cross(down, right, axis=-1),
+        np.cross(right, up, axis=-1),
+    ])
+    normal = normal / (np.linalg.norm(normal, axis=-1, keepdims=True) + 1e-12)
+    valid = np.stack([
+        mask[:-2, 1:-1] & mask[1:-1, :-2],
+        mask[1:-1, :-2] & mask[2:, 1:-1],
+        mask[2:, 1:-1] & mask[1:-1, 2:],
+        mask[1:-1, 2:] & mask[:-2, 1:-1],
+    ]) & mask[None, 1:-1, 1:-1]
+    normal = (normal * valid[..., None]).sum(axis=0)
+    normal = normal / (np.linalg.norm(normal, axis=-1, keepdims=True) + 1e-12)
+
+    if has_mask:
+        normal_mask = valid.any(axis=0)
+        normal = np.where(normal_mask[..., None], normal, 0)
+        return normal, normal_mask
+    else:
+        return normal
+
+
+def depth_to_normals(depth: np.ndarray, intrinsics: np.ndarray, mask: np.ndarray = None) -> np.ndarray:
+    """
+    Calculate a normal map from a depth map. Value range is [-1, 1]. Normal direction is in the OpenGL identity camera's coordinate system.
+
+    Args:
+        depth (np.ndarray): shape (height, width), linear depth map
+        intrinsics (np.ndarray): shape (3, 3), intrinsics matrix
+    Returns:
+        normal (np.ndarray): shape (height, width, 3), normal map.
+    """
+    has_mask = mask is not None
+
+    height, width = depth.shape[-2:]
+    if mask is None:
+        mask = np.ones_like(depth, dtype=bool)
+
+    uv = image_uv(width=width, height=height, dtype=np.float32)
+    pts = transforms.unproject_cv(uv, depth, intrinsics=intrinsics, extrinsics=None)
+
+    return points_to_normals(pts, mask)
+
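A standalone sketch of the finite-difference normal estimate used by `points_to_normals`: offsets to neighboring points, cross products, then renormalization (here a single up/left pair on a flat plane, so every normal comes out along the z axis):

    import numpy as np

    xs, ys = np.meshgrid(np.arange(4.0), np.arange(4.0), indexing="xy")
    pts = np.stack([xs, ys, np.zeros_like(xs)], axis=-1)   # flat plane z = 0
    up = np.roll(pts, 1, axis=0) - pts
    left = np.roll(pts, 1, axis=1) - pts
    n = np.cross(up, left)
    n /= np.linalg.norm(n, axis=-1, keepdims=True) + 1e-12
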
+def interpolate(bary: np.ndarray, tri_id: np.ndarray, attr: np.ndarray, faces: np.ndarray) -> np.ndarray:
+    """Interpolate with given barycentric coordinates and triangle indices
+
+    Args:
+        bary (np.ndarray): shape (..., 3), barycentric coordinates
+        tri_id (np.ndarray): int array of shape (...), triangle indices
+        attr (np.ndarray): shape (N, M), vertices attributes
+        faces (np.ndarray): int array of shape (T, 3), face vertex indices
+
+    Returns:
+        np.ndarray: shape (..., M) interpolated result
+    """
+    faces_ = np.concatenate([np.zeros((1, 3), dtype=faces.dtype), faces + 1], axis=0)
+    attr_ = np.concatenate([np.zeros((1, attr.shape[1]), dtype=attr.dtype), attr], axis=0)
+    return np.sum(bary[..., None] * attr_[faces_[tri_id + 1]], axis=-2)
+
+
+def image_scrcoord(
+    width: int,
+    height: int,
+) -> np.ndarray:
+    """
+    Get OpenGL's screen space coordinates, ranging in [0, 1].
+    [0, 0] is the bottom-left corner of the image.
+
+    Args:
+        width (int): image width
+        height (int): image height
+
+    Returns:
+        (np.ndarray): shape (height, width, 2)
+    """
+    x, y = np.meshgrid(
+        np.linspace(0.5 / width, 1 - 0.5 / width, width, dtype=np.float32),
+        np.linspace(1 - 0.5 / height, 0.5 / height, height, dtype=np.float32),
+        indexing='xy'
+    )
+    return np.stack([x, y], axis=2)
+
+
+def image_uv(
+    height: int,
+    width: int,
+    left: int = None,
+    top: int = None,
+    right: int = None,
+    bottom: int = None,
+    dtype: np.dtype = np.float32
+) -> np.ndarray:
+    """
+    Get image space UV grid, ranging in [0, 1].
+
+    >>> image_uv(10, 10):
+    [[[0.05, 0.05], [0.15, 0.05], ..., [0.95, 0.05]],
+     [[0.05, 0.15], [0.15, 0.15], ..., [0.95, 0.15]],
+      ...             ...                  ...
+     [[0.05, 0.95], [0.15, 0.95], ..., [0.95, 0.95]]]
+
+    Args:
+        width (int): image width
+        height (int): image height
+
+    Returns:
+        np.ndarray: shape (height, width, 2)
+    """
+    if left is None: left = 0
+    if top is None: top = 0
+    if right is None: right = width
+    if bottom is None: bottom = height
+    u = np.linspace((left + 0.5) / width, (right - 0.5) / width, right - left, dtype=dtype)
+    v = np.linspace((top + 0.5) / height, (bottom - 0.5) / height, bottom - top, dtype=dtype)
+    u, v = np.meshgrid(u, v, indexing='xy')
+    return np.stack([u, v], axis=2)
+
+
+def image_pixel_center(
+    height: int,
+    width: int,
+    left: int = None,
+    top: int = None,
+    right: int = None,
+    bottom: int = None,
+    dtype: np.dtype = np.float32
+) -> np.ndarray:
+    """
+    Get image pixel center coordinates, ranging in [0, width] and [0, height].
+    `image[i, j]` has pixel center coordinates `(j + 0.5, i + 0.5)`.
+
+    >>> image_pixel_center(10, 10):
+    [[[0.5, 0.5], [1.5, 0.5], ..., [9.5, 0.5]],
+     [[0.5, 1.5], [1.5, 1.5], ..., [9.5, 1.5]],
+      ...            ...                ...
+     [[0.5, 9.5], [1.5, 9.5], ..., [9.5, 9.5]]]
+
+    Args:
+        width (int): image width
+        height (int): image height
+
+    Returns:
+        np.ndarray: shape (height, width, 2)
+    """
+    if left is None: left = 0
+    if top is None: top = 0
+    if right is None: right = width
+    if bottom is None: bottom = height
+    u = np.linspace(left + 0.5, right - 0.5, right - left, dtype=dtype)
+    v = np.linspace(top + 0.5, bottom - 0.5, bottom - top, dtype=dtype)
+    u, v = np.meshgrid(u, v, indexing='xy')
+    return np.stack([u, v], axis=2)
+
+def image_pixel(
+    height: int,
+    width: int,
+    left: int = None,
+    top: int = None,
+    right: int = None,
+    bottom: int = None,
+    dtype: np.dtype = np.int32
+) -> np.ndarray:
+    """
+    Get image pixel coordinates grid, ranging in [0, width - 1] and [0, height - 1].
+    `image[i, j]` has pixel coordinates `(j, i)`.
+
+    >>> image_pixel(10, 10):
+    [[[0, 0], [1, 0], ..., [9, 0]],
+     [[0, 1], [1, 1], ..., [9, 1]],
+      ...       ...           ...
+     [[0, 9], [1, 9], ..., [9, 9]]]
+
+    Args:
+        width (int): image width
+        height (int): image height
+
+    Returns:
+        np.ndarray: shape (height, width, 2)
+    """
+    if left is None: left = 0
+    if top is None: top = 0
+    if right is None: right = width
+    if bottom is None: bottom = height
+    u = np.arange(left, right, dtype=dtype)
+    v = np.arange(top, bottom, dtype=dtype)
+    u, v = np.meshgrid(u, v, indexing='xy')
+    return np.stack([u, v], axis=2)
+
+
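A standalone check of the half-pixel convention shared by `image_uv` and `image_pixel_center`: uv is simply the pixel center divided by the image size:

    import numpy as np

    width = 4
    u = (np.arange(width) + 0.5) / width
    print(u)   # [0.125 0.375 0.625 0.875], matching image_uv's first row for width=4
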
+def image_mesh(
+    *image_attrs: np.ndarray,
+    mask: np.ndarray = None,
+    tri: bool = False,
+    return_indices: bool = False
+) -> Tuple[np.ndarray, ...]:
+    """
+    Get a mesh regarding image pixel uv coordinates as vertices and image grid as faces.
+
+    Args:
+        *image_attrs (np.ndarray): image attributes in shape (height, width, [channels])
+        mask (np.ndarray, optional): binary mask of shape (height, width), dtype=bool. Defaults to None.
+
+    Returns:
+        faces (np.ndarray): faces connecting neighboring pixels. shape (T, 4) if tri is False, else (T, 3)
+        *vertex_attrs (np.ndarray): vertex attributes in corresponding order with input image_attrs
+        indices (np.ndarray, optional): indices of vertices in the original mesh
+    """
+    assert (len(image_attrs) > 0) or (mask is not None), "At least one of image_attrs or mask should be provided"
+    height, width = image_attrs[0].shape[:2] if mask is None else mask.shape
+    assert all(img.shape[:2] == (height, width) for img in image_attrs), "All image_attrs should have the same shape"
+
+    row_faces = np.stack([np.arange(0, width - 1, dtype=np.int32), np.arange(width, 2 * width - 1, dtype=np.int32), np.arange(1 + width, 2 * width, dtype=np.int32), np.arange(1, width, dtype=np.int32)], axis=1)
+    faces = (np.arange(0, (height - 1) * width, width, dtype=np.int32)[:, None, None] + row_faces[None, :, :]).reshape((-1, 4))
+    if mask is None:
+        if tri:
+            faces = mesh.triangulate(faces)
+        ret = [faces, *(img.reshape(-1, *img.shape[2:]) for img in image_attrs)]
+        if return_indices:
+            ret.append(np.arange(height * width, dtype=np.int32))
+        return tuple(ret)
+    else:
+        quad_mask = (mask[:-1, :-1] & mask[1:, :-1] & mask[1:, 1:] & mask[:-1, 1:]).ravel()
+        faces = faces[quad_mask]
+        if tri:
+            faces = mesh.triangulate(faces)
+        return mesh.remove_unreferenced_vertices(
+            faces,
+            *(x.reshape(-1, *x.shape[2:]) for x in image_attrs),
+            return_indices=return_indices
+        )
+
+def image_mesh_from_depth(
+    depth: np.ndarray,
+    extrinsics: np.ndarray = None,
+    intrinsics: np.ndarray = None,
+    *vertice_attrs: np.ndarray,
+    atol: float = None,
+    rtol: float = None,
+    remove_by_depth: bool = False,
+    return_uv: bool = False,
+    return_indices: bool = False
+) -> Tuple[np.ndarray, ...]:
+    """
+    Get a triangle mesh by lifting a depth map to 3D.
+
+    Args:
+        depth (np.ndarray): [H, W] depth map
+        extrinsics (np.ndarray, optional): [4, 4] extrinsics matrix. Defaults to None.
+        intrinsics (np.ndarray, optional): [3, 3] intrinsics matrix. Defaults to None.
+        *vertice_attrs (np.ndarray): [H, W, C] vertex attributes. Defaults to None.
+        atol (float, optional): absolute tolerance. Defaults to None.
+        rtol (float, optional): relative tolerance. Defaults to None.
+            triangles with vertices having depth difference larger than atol + rtol * depth will be marked.
+        remove_by_depth (bool, optional): whether to remove triangles with large depth difference. Defaults to False.
+        return_uv (bool, optional): whether to return uv coordinates. Defaults to False.
+        return_indices (bool, optional): whether to return indices of vertices in the original mesh. Defaults to False.
+
+    Returns:
+        vertices (np.ndarray): [N, 3] vertices
+        faces (np.ndarray): [T, 3] faces
+        *vertice_attrs (np.ndarray): [N, C] vertex attributes
+        image_uv (np.ndarray, optional): [N, 2] uv coordinates
+        ref_indices (np.ndarray, optional): [N] indices of vertices in the original mesh
+    """
+    height, width = depth.shape
+    # NOTE: as committed; image_mesh() above takes image attribute arrays, not (height, width)
+    image_uv, image_face = image_mesh(height, width)
+    depth = depth.reshape(-1)
+    pts = transforms.unproject_cv(image_uv, depth, extrinsics, intrinsics)
+    image_face = mesh.triangulate(image_face, vertices=pts)
+    ref_indices = None
+    ret = []
+    if atol is not None or rtol is not None:
+        atol = 0 if atol is None else atol
+        rtol = 0 if rtol is None else rtol
+        mean = depth[image_face].mean(axis=1)
+        diff = np.max(np.abs(depth[image_face] - depth[image_face[:, [1, 2, 0]]]), axis=1)
+        mask = (diff <= atol + rtol * mean)
+        image_face_ = image_face[mask]
+        image_face_, ref_indices = mesh.remove_unreferenced_vertices(image_face_, return_indices=True)
+
+    remove = remove_by_depth and ref_indices is not None
+    if remove:
+        pts = pts[ref_indices]
+        image_face = image_face_
+    ret += [pts, image_face]
+    for attr in vertice_attrs:
+        ret.append(attr.reshape(-1, attr.shape[-1]) if not remove else attr.reshape(-1, attr.shape[-1])[ref_indices])
+    if return_uv:
+        ret.append(image_uv if not remove else image_uv[ref_indices])
+    if return_indices and ref_indices is not None:
+        ret.append(ref_indices)
+    return tuple(ret)
+
+
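A standalone sketch of the grid-face construction inside `image_mesh`, for a 2x3 image: each quad joins a pixel to its right and lower neighbors in row-major vertex order:

    import numpy as np

    H, W = 2, 3
    row = np.stack([np.arange(0, W - 1), np.arange(W, 2 * W - 1),
                    np.arange(1 + W, 2 * W), np.arange(1, W)], axis=1)
    faces = (np.arange(0, (H - 1) * W, W)[:, None, None] + row[None]).reshape(-1, 4)
    print(faces)   # [[0 3 4 1] [1 4 5 2]]
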
+def chessboard(width: int, height: int, grid_size: int, color_a: np.ndarray, color_b: np.ndarray) -> np.ndarray:
+    """Get a chessboard image
+
+    Args:
+        width (int): image width
+        height (int): image height
+        grid_size (int): size of chessboard grid
+        color_a (np.ndarray): color of the grid at the top-left corner
+        color_b (np.ndarray): color in complementary grid cells
+
+    Returns:
+        image (np.ndarray): shape (height, width, channels), chessboard image
+    """
+    x = np.arange(width) // grid_size
+    y = np.arange(height) // grid_size
+    mask = (x[None, :] + y[:, None]) % 2
+    image = (1 - mask[..., None]) * color_a + mask[..., None] * color_b
+    return image
+
+
+def square(tri: bool = False) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Get a square mesh of area 1 centered at origin in the xy-plane.
+
+    ### Returns
+        vertices (np.ndarray): shape (4, 3)
+        faces (np.ndarray): shape (1, 4)
+    """
+    vertices = np.array([
+        [-0.5, 0.5, 0], [0.5, 0.5, 0], [0.5, -0.5, 0], [-0.5, -0.5, 0]  # v0-v1-v2-v3
+    ], dtype=np.float32)
+    if tri:
+        faces = np.array([[0, 1, 2], [0, 2, 3]], dtype=np.int32)
+    else:
+        faces = np.array([[0, 1, 2, 3]], dtype=np.int32)
+    return vertices, faces
+
+
+def cube(tri: bool = False) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Get a cube mesh of size 1 centered at origin.
+
+    ### Parameters
+        tri (bool, optional): return triangulated mesh. Defaults to False, which returns quad mesh.
+
+    ### Returns
+        vertices (np.ndarray): shape (8, 3)
+        faces (np.ndarray): shape (6, 4) quad faces, or (12, 3) if tri is True
+    """
+    vertices = np.array([
+        [-0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [-0.5, -0.5, 0.5],  # v0-v1-v2-v3
+        [-0.5, 0.5, -0.5], [0.5, 0.5, -0.5], [0.5, -0.5, -0.5], [-0.5, -0.5, -0.5]  # v4-v5-v6-v7
+    ], dtype=np.float32).reshape((-1, 3))
+
+    faces = np.array([
+        [0, 1, 2, 3],  # v0-v1-v2-v3 (front)
+        [4, 5, 1, 0],  # v4-v5-v1-v0 (top)
+        [3, 2, 6, 7],  # v3-v2-v6-v7 (bottom)
+        [5, 4, 7, 6],  # v5-v4-v7-v6 (back)
+        [1, 5, 6, 2],  # v1-v5-v6-v2 (right)
+        [4, 0, 3, 7]   # v4-v0-v3-v7 (left)
+    ], dtype=np.int32)
+
+    if tri:
+        faces = mesh.triangulate(faces, vertices=vertices)
+
+    return vertices, faces
+
+
+def camera_frustum(extrinsics: np.ndarray, intrinsics: np.ndarray, depth: float = 1.0) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Get a triangle mesh of a camera frustum.
+    """
+    assert extrinsics.shape == (4, 4) and intrinsics.shape == (3, 3)
+    vertices = transforms.unproject_cv(
+        np.array([[0, 0], [0, 0], [0, 1], [1, 1], [1, 0]], dtype=np.float32),
+        np.array([0] + [depth] * 4, dtype=np.float32),
+        extrinsics,
+        intrinsics
+    ).astype(np.float32)
+    edges = np.array([
+        [0, 1], [0, 2], [0, 3], [0, 4],
+        [1, 2], [2, 3], [3, 4], [4, 1]
+    ], dtype=np.int32)
+    faces = np.array([
+        [0, 1, 2],
+        [0, 2, 3],
+        [0, 3, 4],
+        [0, 4, 1],
+        [1, 2, 3],
+        [1, 3, 4]
+    ], dtype=np.int32)
+    return vertices, edges, faces
+
+
+def icosahedron():
+    A = (1 + 5 ** 0.5) / 2
+    vertices = np.array([
+        [0, 1, A], [0, -1, A], [0, 1, -A], [0, -1, -A],
+        [1, A, 0], [-1, A, 0], [1, -A, 0], [-1, -A, 0],
+        [A, 0, 1], [A, 0, -1], [-A, 0, 1], [-A, 0, -1]
+    ], dtype=np.float32)
+    faces = np.array([
+        [0, 1, 8], [0, 8, 4], [0, 4, 5], [0, 5, 10], [0, 10, 1],
+        [3, 2, 9], [3, 9, 6], [3, 6, 7], [3, 7, 11], [3, 11, 2],
+        [1, 6, 8], [8, 9, 4], [4, 2, 5], [5, 11, 10], [10, 7, 1],
+        [2, 4, 9], [9, 8, 6], [6, 1, 7], [7, 10, 11], [11, 5, 2]
+    ], dtype=np.int32)
+    return vertices, faces
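
The primitive builders above return plain (vertices, faces) arrays, so they drop straight into trimesh. A standalone sketch using the `square(tri=True)` arrays reproduced inline (this module's relative imports `_helpers`, `transforms`, and `mesh` are not added by this commit, so importing it directly may fail):

    import numpy as np
    import trimesh

    vertices = np.array([[-0.5, 0.5, 0], [0.5, 0.5, 0], [0.5, -0.5, 0], [-0.5, -0.5, 0]], dtype=np.float32)
    faces = np.array([[0, 1, 2], [0, 2, 3]], dtype=np.int32)
    quad = trimesh.Trimesh(vertices=vertices, faces=faces, process=False)
    print(quad.area)   # 1.0, as the square() docstring promises
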
mapanything/utils/hf_utils/visual_util.py CHANGED
@@ -6,6 +6,7 @@
 
 import copy
 import os
+from typing import Tuple
 
 import cv2
 import matplotlib
@@ -15,6 +16,139 @@ import trimesh
 from scipy.spatial.transform import Rotation
 
 
+def remove_unreferenced_vertices(
+    faces: np.ndarray, *vertice_attrs, return_indices: bool = False
+) -> Tuple[np.ndarray, ...]:
+    """
+    Remove unreferenced vertices of a mesh.
+    Unreferenced vertices are removed, and the face indices are updated accordingly.
+
+    Args:
+        faces (np.ndarray): [T, P] face indices
+        *vertice_attrs: vertex attributes
+
+    Returns:
+        faces (np.ndarray): [T, P] face indices
+        *vertice_attrs: vertex attributes
+        indices (np.ndarray, optional): [N] indices of vertices that are kept. Defaults to None.
+    """
+    P = faces.shape[-1]
+    fewer_indices, inv_map = np.unique(faces, return_inverse=True)
+    faces = inv_map.astype(np.int32).reshape(-1, P)
+    ret = [faces]
+    for attr in vertice_attrs:
+        ret.append(attr[fewer_indices])
+    if return_indices:
+        ret.append(fewer_indices)
+    return tuple(ret)
+
+
+def triangulate(
+    faces: np.ndarray, vertices: np.ndarray = None, backslash: np.ndarray = None
+) -> np.ndarray:
+    """
+    Triangulate a polygonal mesh.
+
+    Args:
+        faces (np.ndarray): [L, P] polygonal faces
+        vertices (np.ndarray, optional): [N, 3] 3-dimensional vertices.
+            If given, the triangulation is performed according to the distance
+            between vertices. Defaults to None.
+        backslash (np.ndarray, optional): [L] boolean array indicating
+            how to triangulate the quad faces. Defaults to None.
+
+    Returns:
+        (np.ndarray): [L * (P - 2), 3] triangular faces
+    """
+    if faces.shape[-1] == 3:
+        return faces
+    P = faces.shape[-1]
+    if vertices is not None:
+        assert faces.shape[-1] == 4, "now only support quad mesh"
+        if backslash is None:
+            backslash = np.linalg.norm(
+                vertices[faces[:, 0]] - vertices[faces[:, 2]], axis=-1
+            ) < np.linalg.norm(vertices[faces[:, 1]] - vertices[faces[:, 3]], axis=-1)
+    if backslash is None:
+        loop_indice = np.stack(
+            [
+                np.zeros(P - 2, dtype=int),
+                np.arange(1, P - 1, 1, dtype=int),
+                np.arange(2, P, 1, dtype=int),
+            ],
+            axis=1,
+        )
+        return faces[:, loop_indice].reshape((-1, 3))
+    else:
+        assert faces.shape[-1] == 4, "now only support quad mesh"
+        faces = np.where(
+            backslash[:, None],
+            faces[:, [0, 1, 2, 0, 2, 3]],
+            faces[:, [0, 1, 3, 3, 1, 2]],
+        ).reshape((-1, 3))
+        return faces
+
+
+def image_mesh(
+    *image_attrs: np.ndarray,
+    mask: np.ndarray = None,
+    tri: bool = False,
+    return_indices: bool = False,
+) -> Tuple[np.ndarray, ...]:
+    """
+    Get a mesh regarding image pixel uv coordinates as vertices and image grid as faces.
+
+    Args:
+        *image_attrs (np.ndarray): image attributes in shape (height, width, [channels])
+        mask (np.ndarray, optional): binary mask of shape (height, width), dtype=bool. Defaults to None.
+
+    Returns:
+        faces (np.ndarray): faces connecting neighboring pixels. shape (T, 4) if tri is False, else (T, 3)
+        *vertex_attrs (np.ndarray): vertex attributes in corresponding order with input image_attrs
+        indices (np.ndarray, optional): indices of vertices in the original mesh
+    """
+    assert (len(image_attrs) > 0) or (mask is not None), (
+        "At least one of image_attrs or mask should be provided"
+    )
+    height, width = image_attrs[0].shape[:2] if mask is None else mask.shape
+    assert all(img.shape[:2] == (height, width) for img in image_attrs), (
+        "All image_attrs should have the same shape"
+    )
+
+    row_faces = np.stack(
+        [
+            np.arange(0, width - 1, dtype=np.int32),
+            np.arange(width, 2 * width - 1, dtype=np.int32),
+            np.arange(1 + width, 2 * width, dtype=np.int32),
+            np.arange(1, width, dtype=np.int32),
+        ],
+        axis=1,
+    )
+    faces = (
+        np.arange(0, (height - 1) * width, width, dtype=np.int32)[:, None, None]
+        + row_faces[None, :, :]
+    ).reshape((-1, 4))
+    if mask is None:
+        if tri:
+            faces = triangulate(faces)
+        ret = [faces, *(img.reshape(-1, *img.shape[2:]) for img in image_attrs)]
+        if return_indices:
+            ret.append(np.arange(height * width, dtype=np.int32))
+        return tuple(ret)
+    else:
+        quad_mask = (
+            mask[:-1, :-1] & mask[1:, :-1] & mask[1:, 1:] & mask[:-1, 1:]
+        ).ravel()
+        faces = faces[quad_mask]
+        if tri:
+            faces = triangulate(faces)
+        return remove_unreferenced_vertices(
+            faces,
+            *(x.reshape(-1, *x.shape[2:]) for x in image_attrs),
+            return_indices=return_indices,
+        )
+
+
 def predictions_to_glb(
     predictions,
     conf_thres=50.0,
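A minimal sketch (not in the commit) of how the three helpers above combine, which is what the `as_mesh` branch below does with real predictions: grid-connect a point map, triangulate, and wrap in trimesh:

    import numpy as np
    import trimesh
    from mapanything.utils.hf_utils.visual_util import image_mesh

    H, W = 4, 5
    ys, xs = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")
    points = np.stack([xs, ys, np.zeros_like(xs)], axis=-1).astype(np.float32)
    colors = np.full((H, W, 3), 0.5, dtype=np.float32)
    mask = np.ones((H, W), dtype=bool)
    faces, verts, vcols = image_mesh(points, colors, mask=mask, tri=True)
    mesh = trimesh.Trimesh(vertices=verts, faces=faces,
                           vertex_colors=(vcols * 255).astype(np.uint8), process=False)
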
@@ -26,6 +160,7 @@ def predictions_to_glb(
     target_dir=None,
     prediction_mode="Predicted Pointmap",
     mask_ambiguous=False,
+    as_mesh=True,
 ) -> trimesh.Scene:
     """
     Converts VGGT predictions to a 3D scene represented as a GLB file.
@@ -44,9 +179,11 @@
         mask_sky (bool): Apply sky segmentation mask (default: False)
         target_dir (str): Output directory for intermediate files (default: None)
         prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
+        mask_ambiguous (bool): Apply final mask to filter ambiguous predictions (default: False)
+        as_mesh (bool): Represent the data as a mesh instead of a point cloud (default: True)
 
     Returns:
-        trimesh.Scene: Processed 3D scene containing point cloud and cameras
+        trimesh.Scene: Processed 3D scene containing point cloud/mesh and cameras
 
     Raises:
         ValueError: If input predictions structure is invalid
@@ -215,9 +352,135 @@
     scene_3d = trimesh.Scene()
 
     # Add point cloud data to the scene
-    point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
-
-    scene_3d.add_geometry(point_cloud_data)
+    if as_mesh:
+        # Create mesh from pointcloud
+        if selected_frame_idx is not None:
+            # Single frame case - we can create a proper mesh
+            H, W = pred_world_points.shape[1:3]
+
+            # Get original unfiltered data for mesh creation
+            original_points = pred_world_points.reshape(H, W, 3)
+
+            # Reshape original image data properly
+            if images.ndim == 4 and images.shape[1] == 3:  # NCHW format
+                original_image_colors = np.transpose(images[0], (1, 2, 0))
+            else:  # Assume already in HWC format
+                original_image_colors = images[0]
+            original_image_colors *= 255
+            # Create mask from confidence and other filters
+            original_conf = pred_world_points_conf.reshape(H, W)
+            original_final_mask = predictions["final_mask"][selected_frame_idx].reshape(
+                H, W
+            )
+
+            # Apply thresholds to create mask
+            mask = (original_conf >= conf_threshold) & (original_conf > 1e-5)
+            if mask_ambiguous:
+                mask = mask & original_final_mask
+
+            # Additional background masks if needed
+            if mask_black_bg:
+                black_bg_mask = original_image_colors.sum(axis=2) >= 16
+                mask = mask & black_bg_mask
+
+            if mask_white_bg:
+                white_bg_mask = ~(
+                    (original_image_colors[:, :, 0] > 240)
+                    & (original_image_colors[:, :, 1] > 240)
+                    & (original_image_colors[:, :, 2] > 240)
+                )
+                mask = mask & white_bg_mask
+
+            # NOTE: the combined mask above is computed but, as committed, only
+            # original_final_mask is passed to image_mesh below
+            # Check if normals are available in predictions
+            vertex_normals = None
+            if "normal" in predictions and predictions["normal"] is not None:
+                # Get normals for the selected frame
+                frame_normals = (
+                    predictions["normal"][selected_frame_idx]
+                    if selected_frame_idx is not None
+                    else predictions["normal"][0]
+                )
+
+                # Create faces and vertices using image_mesh with normals support
+                faces, vertices, vertex_colors, vertex_normals = image_mesh(
+                    original_points * np.array([1, -1, 1], dtype=np.float32),
+                    original_image_colors / 255.0,
+                    frame_normals * np.array([1, -1, 1], dtype=np.float32),
+                    mask=original_final_mask,
+                    tri=True,
+                    return_indices=False,
+                )
+
+                # Apply coordinate transformations to normals
+                vertex_normals = vertex_normals * np.array([1, -1, 1], dtype=np.float32)
+            else:
+                # Create faces and vertices using image_mesh without normals
+                faces, vertices, vertex_colors = image_mesh(
+                    original_points * np.array([1, -1, 1], dtype=np.float32),
+                    original_image_colors / 255.0,
+                    mask=original_final_mask,
+                    tri=True,
+                    return_indices=False,
+                )
+
+            vertices = vertices * np.array([1, -1, 1], dtype=np.float32)
+
+            # Create trimesh object with optional normals
+            mesh_data = trimesh.Trimesh(
+                vertices=vertices,
+                faces=faces,
+                vertex_colors=(vertex_colors * 255).astype(np.uint8),
+                vertex_normals=(vertex_normals if vertex_normals is not None else None),
+                process=False,
+            )
+            scene_3d.add_geometry(mesh_data)
+
+        else:
+            # Multi-frame case - create separate meshes for each frame
+            print("Creating mesh for multi-frame data...")
+
+            for frame_idx in range(pred_world_points.shape[0]):
+                H, W = pred_world_points.shape[1:3]
+
+                # Get data for this frame
+                frame_points = pred_world_points[frame_idx]
+                frame_conf = pred_world_points_conf[frame_idx]
+                frame_final_mask = predictions["final_mask"][frame_idx]
+
+                # Get frame image
+                if images.ndim == 4 and images.shape[1] == 3:  # NCHW format
+                    frame_image = np.transpose(images[frame_idx], (1, 2, 0))
+                else:  # Assume already in HWC format
+                    frame_image = images[frame_idx]
+                frame_image *= 255
+                # Create mask for this frame (note: `|` here, vs `&` in the single-frame path)
+                mask = (frame_conf >= conf_threshold) & (frame_conf > 1e-5)
+                if mask_ambiguous:
+                    mask = mask | frame_final_mask
+
+                # Create mesh for this frame
+                faces, vertices, vertex_colors = image_mesh(
+                    frame_points * np.array([1, -1, 1], dtype=np.float32),
+                    frame_image / 255.0,
+                    mask=frame_final_mask,
+                    tri=True,
+                    return_indices=False,
+                )
+
+                vertices = vertices * np.array([1, -1, 1], dtype=np.float32)
+                # Create trimesh object for this frame
+                frame_mesh = trimesh.Trimesh(
+                    vertices=vertices,
+                    faces=faces,
+                    vertex_colors=(vertex_colors * 255).astype(np.uint8),
+                    process=False,
+                )
+                scene_3d.add_geometry(frame_mesh)
+    else:
+        point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
+        scene_3d.add_geometry(point_cloud_data)
 
     # Prepare 4x4 matrices for camera extrinsics
     num_cameras = len(camera_matrices)
@@ -259,9 +522,11 @@ def integrate_camera_into_scene(
         face_colors (tuple): Color of the camera face.
         scene_scale (float): Scale of the scene.
     """
-
+    scene_scale = 12  # NOTE: hard-coded override of the scene_scale argument
     cam_width = scene_scale * 0.05
     cam_height = scene_scale * 0.1
+    # cam_width = scene_scale * 0.05
+    # cam_height = scene_scale * 0.1
 
     # Create cone shape for camera
     rot_45_degree = np.eye(4)