-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathvisualize_bboxes_cameraonly.py
More file actions
185 lines (150 loc) · 6.46 KB
/
visualize_bboxes_cameraonly.py
File metadata and controls
185 lines (150 loc) · 6.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import argparse
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from mmengine.config import Config
from mmengine.registry import RUNNERS
from mmengine.runner import Runner, load_checkpoint
from tqdm import tqdm
def parse_args():
parser = argparse.ArgumentParser(
description="Visualize the results of camera-only 3D detection models as a video in a 2x3 grid of images. "
"The script projects 3D bounding boxes onto camera images and creates a visualization video."
)
parser.add_argument("config", help="Path to config file")
parser.add_argument("checkpoint", help="Path to checkpoint file")
parser.add_argument("--threshold", type=float, default=0.3, help="Score threshold for predictions")
parser.add_argument("--step", type=int, default=120, help="Number of steps to visualize")
parser.add_argument("--cam_order", type=list, default=[2, 0, 4, 3, 1, 5], help="Camera order")
return parser.parse_args()
def denormalize_image(img, mean, std, to_rgb=False):
"""
Denormalize an image normalized with mean and std.
Args:
img (np.ndarray): Image array with shape (C, H, W) and dtype float32.
mean (list): Per-channel mean.
std (list): Per-channel std.
to_rgb (bool): Whether to convert BGR to RGB after denormalization.
Returns:
np.ndarray: Denormalized image with shape (H, W, C) and dtype uint8.
"""
assert img.shape[0] == 3, "Expected 3 channels (C, H, W)"
denorm = img.copy()
for i in range(3):
denorm[i] = denorm[i] * std[i] + mean[i]
denorm = denorm.transpose(1, 2, 0) # (H, W, C)
if to_rgb:
denorm = denorm[..., ::-1] # BGR to RGB
return np.clip(denorm, 0, 255).astype(np.uint8)
def project_to_image(points, lidar2cam, cam2img):
"""Project 3D points to image plane."""
points_hom = np.hstack((points, np.ones((points.shape[0], 1))))
points_cam = np.dot(lidar2cam, points_hom.T).T
valid_mask = points_cam[:, 2] > 0
points_img = np.dot(cam2img, points_cam[:, :3].T).T
points_img /= points_img[:, 2:3]
return points_img[:, :2], valid_mask
def draw_projected_3d_bboxes(ax, bboxes, lidar2cam, cam2img, img_shape):
"""
Projects 3D bounding boxes to 2D and draws them on the given axis.
Clips the lines at image boundaries.
"""
H, W = img_shape[:2]
def clip_line(pt1, pt2, W, H):
"""Clip a line segment to the image bounds."""
x1, y1 = pt1
x2, y2 = pt2
# Both points are inside
if (0 <= x1 < W and 0 <= y1 < H) and (0 <= x2 < W and 0 <= y2 < H):
return [pt1, pt2]
# If both are out of bounds, skip
if (x1 < 0 and x2 < 0) or (x1 >= W and x2 >= W) or (y1 < 0 and y2 < 0) or (y1 >= H and y2 >= H):
return None
# You can implement Liang-Barsky or use simple clamping for one-side clip
pt1_clipped = [np.clip(x1, 0, W - 1), np.clip(y1, 0, H - 1)]
pt2_clipped = [np.clip(x2, 0, W - 1), np.clip(y2, 0, H - 1)]
return [pt1_clipped, pt2_clipped]
for bbox in bboxes:
x, y, z, dx, dy, dz, yaw = bbox.cpu().numpy()[:7]
# Create 3D corners
corners = np.array(
[
[dx / 2, dy / 2, -dz / 2],
[dx / 2, -dy / 2, -dz / 2],
[-dx / 2, -dy / 2, -dz / 2],
[-dx / 2, dy / 2, -dz / 2],
[dx / 2, dy / 2, dz / 2],
[dx / 2, -dy / 2, dz / 2],
[-dx / 2, -dy / 2, dz / 2],
[-dx / 2, dy / 2, dz / 2],
]
)
# Apply rotation and translation
R = np.array(
[
[np.cos(yaw), -np.sin(yaw), 0],
[np.sin(yaw), np.cos(yaw), 0],
[0, 0, 1],
]
)
rotated = corners @ R.T
translated = rotated + np.array([x, y, z])
# Project to image
projected, valid_mask = project_to_image(translated, lidar2cam, cam2img)
if not valid_mask.all():
continue
# Define lines
lines = [(0, 1), (1, 2), (2, 3), (3, 0), (4, 5), (5, 6), (6, 7), (7, 4), (0, 4), (1, 5), (2, 6), (3, 7)]
for i, j in lines:
pt1, pt2 = projected[i], projected[j]
clipped = clip_line(pt1, pt2, W, H)
if clipped:
ax.plot([clipped[0][0], clipped[1][0]], [clipped[0][1], clipped[1][1]], color="lime", linewidth=1)
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
cfg.work_dir = cfg.get("work_dir", "./work_dirs/default")
runner = RUNNERS.build(cfg)
load_checkpoint(runner.model, args.checkpoint)
runner.model.eval()
dloader = iter(runner.test_dataloader)
os.makedirs(os.path.join(cfg.work_dir, "visualization"), exist_ok=True)
# Create video writer
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
video_path = os.path.join(cfg.work_dir, "visualization", "visualization.mp4")
video_writer = cv2.VideoWriter(video_path, fourcc, 1.0, (1500, 1000))
steps = args.step if args.step != -1 else len(dloader)
for step in tqdm(range(steps)): # change to desired number of samples
data = next(dloader)
with torch.no_grad():
results = runner.model.test_step(data)
result = results[0]
pred_mask = result["pred_instances_3d"]["scores_3d"] > args.threshold
pred_bboxes = result["pred_instances_3d"]["bboxes_3d"].tensor[pred_mask]
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.flatten()
for i, cam_id in enumerate(args.cam_order):
img = data["img"][0][0][cam_id].cpu().numpy()
img = denormalize_image(img, cfg.img_norm_cfg.mean, cfg.img_norm_cfg.std, True).astype(np.uint8)
lidar2cam = data["extrinsics"][0][0][cam_id].cpu().numpy()
cam2img = data["intrinsics"][0][0][cam_id].cpu().numpy()
ax = axs[i]
ax.imshow(img)
draw_projected_3d_bboxes(ax, pred_bboxes, lidar2cam, cam2img, img.shape)
ax.axis("off")
ax.set_title(f"Camera {cam_id}")
plt.tight_layout()
# Convert matplotlib figure to image
fig.canvas.draw()
frame = np.array(fig.canvas.renderer.buffer_rgba())
frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
# Write frame to video
video_writer.write(frame)
plt.close()
# Release video writer
video_writer.release()
print(f"Saved video to: {video_path}")
if __name__ == "__main__":
main()