Source code for viser.extras._record3d

from __future__ import annotations

import dataclasses
import json
from pathlib import Path
from typing import Tuple, cast

import imageio.v3 as iio
import liblzfse
import numpy as np
import numpy.typing as npt
import skimage.transform
from scipy.spatial.transform import Rotation


[docs] class Record3dLoader: """Helper for loading frames for Record3D captures.""" # NOTE(hangg): Consider moving this module into # `examples/7_record3d_visualizer.py` since it is usecase-specific. def __init__(self, data_dir: Path): metadata_path = data_dir / "metadata" # Read metadata. metadata = json.loads(metadata_path.read_text()) K: np.ndarray = np.array(metadata["K"], np.float32).reshape(3, 3).T fps = metadata["fps"] T_world_cameras: np.ndarray = np.array(metadata["poses"], np.float32) T_world_cameras = np.concatenate( [ Rotation.from_quat(T_world_cameras[:, :4]).as_matrix(), T_world_cameras[:, 4:, None], ], -1, ) T_world_cameras = (T_world_cameras @ np.diag([1, -1, -1, 1])).astype(np.float32) self.K = K self.fps = fps self.T_world_cameras = T_world_cameras rgbd_dir = data_dir / "rgbd" self.rgb_paths = sorted(rgbd_dir.glob("*.jpg"), key=lambda p: int(p.stem)) self.depth_paths = [ rgb_path.with_suffix(".depth") for rgb_path in self.rgb_paths ] self.conf_paths = [rgb_path.with_suffix(".conf") for rgb_path in self.rgb_paths]
[docs] def num_frames(self) -> int: return len(self.rgb_paths)
[docs] def get_frame(self, index: int) -> Record3dFrame: # Read conf. conf: np.ndarray = np.frombuffer( liblzfse.decompress(self.conf_paths[index].read_bytes()), dtype=np.uint8 ) if conf.shape[0] == 640 * 480: conf = conf.reshape((640, 480)) # For a FaceID camera 3D Video elif conf.shape[0] == 256 * 192: conf = conf.reshape((256, 192)) # For a LiDAR 3D Video else: assert False, f"Unexpected conf shape {conf.shape}" # Read depth. depth: np.ndarray = np.frombuffer( liblzfse.decompress(self.depth_paths[index].read_bytes()), dtype=np.float32 ).copy() if depth.shape[0] == 640 * 480: depth = depth.reshape((640, 480)) # For a FaceID camera 3D Video elif depth.shape[0] == 256 * 192: depth = depth.reshape((256, 192)) # For a LiDAR 3D Video else: assert False, f"Unexpected depth shape {depth.shape}" # Read RGB. rgb = iio.imread(self.rgb_paths[index]) return Record3dFrame( K=self.K, rgb=rgb, depth=depth, mask=conf == 2, T_world_camera=self.T_world_cameras[index], )
[docs] @dataclasses.dataclass class Record3dFrame: """A single frame from a Record3D capture.""" K: npt.NDArray[np.float32] rgb: npt.NDArray[np.uint8] depth: npt.NDArray[np.float32] mask: npt.NDArray[np.bool_] T_world_camera: npt.NDArray[np.float32]
[docs] def get_point_cloud( self, downsample_factor: int = 1 ) -> Tuple[npt.NDArray[np.float32], npt.NDArray[np.uint8]]: rgb = self.rgb[::downsample_factor, ::downsample_factor] depth = skimage.transform.resize(self.depth, rgb.shape[:2], order=0) mask = cast( npt.NDArray[np.bool_], skimage.transform.resize(self.mask, rgb.shape[:2], order=0), ) assert depth.shape == rgb.shape[:2] K = self.K T_world_camera = self.T_world_camera img_wh = rgb.shape[:2][::-1] grid = ( np.stack(np.meshgrid(np.arange(img_wh[0]), np.arange(img_wh[1])), 2) + 0.5 ) grid = grid * downsample_factor homo_grid = np.pad(grid[mask], np.array([[0, 0], [0, 1]]), constant_values=1) local_dirs = np.einsum("ij,bj->bi", np.linalg.inv(K), homo_grid) dirs = np.einsum("ij,bj->bi", T_world_camera[:3, :3], local_dirs) points = (T_world_camera[:, -1] + dirs * depth[mask, None]).astype(np.float32) point_colors = rgb[mask] return points, point_colors