SpatialVID/utils/pack_clip_assets.py at main · NJU-3DV/SpatialVID · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
pack_clip_assets.py
------------------
This script unifies depth, RGB frames, intrinsics, extrinsics, etc. of a specified video clip into a single npz file for downstream 3D reconstruction or analysis.

Usage example:
    python pack_clip_assets.py --base_dir /path/to/HQ --group_id 1 --clip_id xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --height 328 --width 584

"""

import argparse
import numpy as np
import torch
from lietorch import SE3
import cv2
from read_depth import read_depth

def load_video(clip_path, indexes_path, height=720, width=1280):
    """
    Read video frames at specified indexes and resize to (height, width).

    The index file is read line by line; only lines with exactly two
    whitespace-separated fields are used, and the second field is parsed as
    the frame index. All other lines are skipped.

    Args:
        clip_path (str): Path to video file
        indexes_path (str): Path to frame indexes txt
        height (int): Output frame height
        width (int): Output frame width
    Returns:
        np.ndarray: (N, height, width, 3) RGB frames
    Raises:
        ValueError: If the video cannot be opened or a requested frame
            cannot be read.
    """
    with open(indexes_path, 'r') as f:
        indexes = [
            int(fields[1])
            for fields in (line.split() for line in f)
            if len(fields) == 2
        ]
    print(f"Frame indexes: {indexes}")

    cap = cv2.VideoCapture(clip_path)
    try:
        # Fail early with a clear message instead of a misleading
        # "frame could not be read" error on the first index.
        if not cap.isOpened():
            raise ValueError(f"Video {clip_path} could not be opened.")
        frames = []
        for idx in indexes:
            # Seek to the requested frame before decoding it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                raise ValueError(f"Frame at index {idx} could not be read.")
            # cv2.resize takes the target size as (width, height).
            frame = cv2.resize(frame, (width, height))
            # OpenCV decodes to BGR; convert to RGB.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always release the capture handle, even when a read fails.
        cap.release()
    return np.array(frames)

def load_intrinsics(intrinsics_path, tgt_width=1024, tgt_height=576):
    """
    Read normalized intrinsics (n,4), convert to 3x3 matrix and scale to target resolution.

    Each row is interpreted as (fx, fy, cx, cy). fx and cx are multiplied by
    tgt_width, fy and cy by tgt_height. A single 1-D row of four values is
    treated as one camera, and an empty file yields an empty (0, 3, 3) array.

    Args:
        intrinsics_path (str): Path to intrinsics npy
        tgt_width (int): Target width
        tgt_height (int): Target height
    Returns:
        np.ndarray: (N, 3, 3) float32 intrinsics matrices
    Raises:
        ValueError: If the loaded array is not of shape (N, 4) or (4,).
    """
    params = np.asarray(np.load(intrinsics_path), dtype=np.float32)
    if params.size == 0:
        return np.zeros((0, 3, 3), dtype=np.float32)
    if params.ndim == 1:
        params = params[np.newaxis, :]
    if params.ndim != 2 or params.shape[1] != 4:
        raise ValueError(
            f"Expected intrinsics of shape (N, 4), got {params.shape}."
        )

    # Assemble all K matrices at once instead of looping row by row.
    intrinsics_3x3 = np.zeros((params.shape[0], 3, 3), dtype=np.float32)
    intrinsics_3x3[:, 0, 0] = params[:, 0]  # fx
    intrinsics_3x3[:, 1, 1] = params[:, 1]  # fy
    intrinsics_3x3[:, 0, 2] = params[:, 2]  # cx
    intrinsics_3x3[:, 1, 2] = params[:, 3]  # cy
    intrinsics_3x3[:, 2, 2] = 1.0

    # Scale in place so the result stays float32.
    intrinsics_3x3[:, 0, 0] *= tgt_width
    intrinsics_3x3[:, 1, 1] *= tgt_height
    intrinsics_3x3[:, 0, 2] *= tgt_width
    intrinsics_3x3[:, 1, 2] *= tgt_height
    return intrinsics_3x3

def _resolve_group_dir(group_id, clip_id):
    """Return (group_dir, clip_name), or (None, clip_id) if the group is unknown."""
    if group_id is not None:
        return f'group_{group_id:04d}', clip_id
    # Without --group_id, accept a clip id of the form "<group_dir>/<clip>".
    group_dir, sep, clip_name = clip_id.rpartition('/')
    if sep and group_dir and clip_name:
        return group_dir, clip_name
    return None, clip_id


def main():
    """
    Main pipeline: load depth, RGB frames, intrinsics, extrinsics, and save as npz.

    The group directory is taken from --group_id (formatted as group_XXXX).
    When --group_id is omitted, --clip_id must carry the group directory as a
    prefix (e.g. group_0001/<clip>); otherwise the parser reports a usage
    error instead of crashing while formatting the paths.

    The saved npz contains:
        images: RGB frames resized to (height, width)
        depths: depth maps clipped to [1e-3, 1e2] and resized to (height, width)
        intrinsic: 3x3 intrinsics matrix of the first frame
        cam_c2w: inverse of the stored poses, as matrices
    """
    parser = argparse.ArgumentParser(description="Pack clip assets into a single npz file.")
    parser.add_argument('--base_dir', type=str, required=True, help='Root directory of HQ data')
    parser.add_argument('--group_id', type=int, required=False,
                        help='Numeric group ID, e.g. 1 for group_0001; may be omitted when '
                             '--clip_id is given as group_xxxx/<clip>')
    parser.add_argument('--clip_id', type=str, required=True,
                        help='Clip ID, e.g. xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx '
                             '(optionally prefixed with group_xxxx/)')
    parser.add_argument('--height', type=int, default=328, help='Output image height')
    parser.add_argument('--width', type=int, default=584, help='Output image width')
    parser.add_argument('--output', type=str, default='sgd_cvd_hr.npz', help='Output npz filename')
    args = parser.parse_args()

    group_dir, clip_name = _resolve_group_dir(args.group_id, args.clip_id)
    if group_dir is None:
        parser.error('--group_id is required unless --clip_id has the form group_xxxx/<clip>')

    # Path construction
    annotation_dir = f'{args.base_dir}/annotations/{group_dir}/{clip_name}'
    depth_path = f'{args.base_dir}/depths/{group_dir}/{clip_name}.zip'
    clip_path = f'{args.base_dir}/videos/{group_dir}/{clip_name}.mp4'
    intrinsics_path = f'{annotation_dir}/intrinsics.npy'
    extrinsics_path = f'{annotation_dir}/poses.npy'
    indexes_path = f'{annotation_dir}/indexes.txt'

    # Load intrinsics and extrinsics
    intrinsics = load_intrinsics(intrinsics_path, tgt_width=args.width, tgt_height=args.height)
    extrinsics = np.load(extrinsics_path)

    # Load and resize depth; values are clipped to [1e-3, 1e2] first.
    depth = np.clip(read_depth(depth_path), 1e-3, 1e2)  # (N, H, W)
    resized_depth = np.zeros((depth.shape[0], args.height, args.width), dtype=depth.dtype)
    for i, depth_map in enumerate(depth):
        # cv2.resize takes the target size as (width, height).
        resized_depth[i] = cv2.resize(depth_map, (args.width, args.height), interpolation=cv2.INTER_LINEAR)

    # Load RGB frames
    frames = load_video(clip_path, indexes_path, args.height, args.width)

    # Compute camera poses by inverting the stored SE3 poses.
    # NOTE(review): presumably the stored poses are world-to-camera, so the
    # inverse is camera-to-world -- confirm against the annotation format.
    poses_th = torch.as_tensor(extrinsics, device="cpu").float()
    cam_c2w = SE3(poses_th).inv().matrix()

    # Only the first frame's intrinsics matrix is stored.
    K = intrinsics[0].astype(np.float32)

    # Save as npz
    np.savez(
        args.output,
        images=frames,
        depths=resized_depth,
        intrinsic=K,
        cam_c2w=cam_c2w.detach().cpu().numpy(),
    )
    print(f"Saved to {args.output}")

# Run the packing pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()