Skip to content

Commit 142ca6e

Browse files
author
Józef Daniecki
committed
Tracker Service v0.4.0: scene config dynamic loading
1 parent 2005584 commit 142ca6e

92 files changed

Lines changed: 2706 additions & 6105 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/resources/.prettierignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,4 @@
1313
**/CMakePresets.json
1414
**/CMakeFiles/
1515

16-
**/tests/api/README.md
17-
**/tests/system/metric/dataset/*.json
16+
**/tests/api/README.md

controller/src/robot_vision/src/rv/tracking/ObjectMatching.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
#include "rv/apollo/secure_matrix.hpp"
1212
#include "rv/tracking/Classification.hpp"
1313

14-
#include <iostream>
15-
1614
namespace rv {
1715
namespace tracking {
1816

controller/src/robot_vision/src/rv/tracking/TrackManager.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
#include "rv/Utils.hpp"
55
#include "rv/tracking/TrackManager.hpp"
6-
#include <iostream>
76
#include <omp.h>
87

98
namespace rv {
@@ -354,10 +353,6 @@ void TrackManager::updateTrackerConfig(int camera_frame_rate)
354353
mConfig.mMaxNumberOfUnreliableFrames = std::ceil(camera_frame_rate*mConfig.mMaxUnreliableTime);
355354
mConfig.mNonMeasurementFramesDynamic = std::ceil(camera_frame_rate*mConfig.mNonMeasurementTimeDynamic);
356355
mConfig.mNonMeasurementFramesStatic = std::ceil(camera_frame_rate*mConfig.mNonMeasurementTimeStatic);
357-
std::cout << "Updated parameters for reference camera frame rate = " << camera_frame_rate << "fps" << std::endl;
358-
std::cout << "max_unreliable_frames = " << mConfig.mMaxNumberOfUnreliableFrames << std::endl;
359-
std::cout << "non_measurement_frames_dynamic = " << mConfig.mNonMeasurementFramesDynamic << std::endl;
360-
std::cout << "non_measurement_frames_static = " << mConfig.mNonMeasurementFramesStatic << std::endl;
361356
}
362357

363358
} // namespace tracking

docs/design/tracker-evaluation-pipeline.md

Lines changed: 0 additions & 337 deletions
This file was deleted.

manager/src/static/js/scenescape3d.js

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,13 +394,12 @@ function main() {
394394
sensorManager.sceneSensors[sensorId]
395395
) {
396396
const sensor = sensorManager.sceneSensors[sensorId];
397-
const sensorArea = sensor.region && sensor.region.area;
398397

399398
// Only control lighting for sensors with area set to "scene"
400399
// Don't control lighting for localized sensors ("circle", "poly") or any other value
401-
if (sensorArea !== "scene") {
400+
if (sensor.area !== "scene") {
402401
console.log(
403-
`Light sensor (${sensorId}): area="${sensorArea}" - not controlling scene lighting (only "scene" area sensors affect ambient light)`,
402+
`Light sensor (${sensorId}): area="${sensor.area}" - not controlling scene lighting (only "scene" area sensors affect ambient light)`,
404403
);
405404
return;
406405
}

manager/src/static/js/sscape.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1542,7 +1542,7 @@ function setupGenerateMesh() {
15421542
}
15431543

15441544
async function pollMeshStatus(sceneId, requestId) {
1545-
const timeout = 15 * 60 * 1000; // 15 minutes
1545+
const timeout = 10 * 60 * 1000; // 10 minutes
15461546
const start = Date.now();
15471547

15481548
while (true) {

mapping/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ RUN pip install --no-cache-dir -r $SCENESCAPE_HOME/requirements_api.txt && \
110110
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r $SCENESCAPE_HOME/requirements_vggt.txt && \
111111
cd /workspace/vggt && \
112112
pip install --no-cache-dir -e . && \
113-
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt -r requirements_demo.txt; \
113+
pip install --no-cache-dir -r requirements.txt -r requirements_demo.txt; \
114114
fi && \
115115
rm -rf /usr/local/lib/python3.11/site-packages/torch/test \
116116
/usr/local/lib/python3.11/site-packages/torch/share \

mapping/src/mapanything_model.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
"""
1212

1313
import base64
14+
import math
15+
import os
16+
import subprocess
1417
import sys
18+
import tempfile
1519
from typing import Dict, Any, List, Optional, Tuple
1620

1721
import numpy as np
@@ -116,6 +120,119 @@ def runInference(self, frames: List[Dict[str, Any]]) -> Dict[str, Any]:
116120
log.error(f"MapAnything inference (frames) failed: {e}")
117121
raise RuntimeError(f"MapAnything inference (frames) failed: {e}")
118122

123+
def _maxFramesForTimeBudget(
124+
self,
125+
time_budget_seconds: float,
126+
overhead: float,
127+
) -> int:
128+
129+
cpu_sec_per_frame = float(os.getenv("MAPANYTHING_CPU_SEC_PER_FRAME", "10"))
130+
cuda_sec_per_frame = float(os.getenv("MAPANYTHING_CUDA_SEC_PER_FRAME", "0.8"))
131+
sec_per_frame = cpu_sec_per_frame
132+
if self.device.startswith("cuda") and cuda_sec_per_frame:
133+
sec_per_frame = cuda_sec_per_frame
134+
135+
usable = max(0.0, time_budget_seconds - overhead)
136+
if usable <= 0:
137+
return 0
138+
139+
# conservative: floor
140+
max_frames = int(math.floor(usable / max(1e-6, sec_per_frame)))
141+
return max_frames
142+
143+
# Put in ReconstructionModel base class
144+
def _framesFromVideoAsBase64Dicts(
145+
self,
146+
video_path: str,
147+
max_frames: int,
148+
use_keyframes: bool = True,
149+
sample_every_n: int = 10,
150+
jpeg_quality: int = 85,
151+
max_side: Optional[int] = 960,
152+
) -> List[Dict[str, Any]]:
153+
"""
154+
Extract frames using ffmpeg and return:
155+
[{"data": "<base64-encoded-jpeg>"}, ...]
156+
157+
Modes:
158+
- use_keyframes=True: extract TRUE keyframes (I-frames)
159+
- use_keyframes=False: sample every N frames using select filter
160+
"""
161+
if max_frames < 1:
162+
return []
163+
164+
if not os.path.isfile(video_path):
165+
raise ValueError(f"Video file not found: {video_path}")
166+
167+
if sample_every_n < 1:
168+
sample_every_n = 1
169+
170+
# Map jpeg_quality (1..100) -> ffmpeg mjpeg qscale (2..31), where 2 is best quality
171+
qscale = int(round(31 - (np.clip(jpeg_quality, 1, 100) / 100.0) * 29))
172+
qscale = int(np.clip(qscale, 2, 31))
173+
174+
vf_parts: List[str] = []
175+
176+
# If not keyframes, use select filter to sample frames
177+
if not use_keyframes:
178+
# keep frames where n % sample_every_n == 0
179+
vf_parts.append(f"select='not(mod(n\\,{sample_every_n}))'")
180+
else:
181+
log.info("Using key frames")
182+
183+
# Optional downscale: keep aspect ratio, cap longest side
184+
if max_side is not None and max_side > 0:
185+
vf_parts.append(
186+
f"scale='if(gte(iw,ih),min(iw,{max_side}),-2)':'if(lt(iw,ih),min(ih,{max_side}),-2)'"
187+
)
188+
189+
vf = ",".join(vf_parts) if vf_parts else None
190+
191+
frames: List[Dict[str, Any]] = []
192+
193+
with tempfile.TemporaryDirectory(prefix="frames_") as tmpdir:
194+
out_pattern = os.path.join(tmpdir, "frame_%06d.jpg")
195+
196+
cmd = [
197+
"ffmpeg",
198+
"-hide_banner",
199+
"-loglevel", "error",
200+
]
201+
202+
# Keyframes mode: only decode keyframes
203+
if use_keyframes:
204+
cmd += ["-skip_frame", "nokey"]
205+
206+
cmd += ["-i", video_path]
207+
208+
if vf:
209+
cmd += ["-vf", vf]
210+
211+
cmd += [
212+
"-vsync", "vfr",
213+
"-frames:v", str(max_frames),
214+
"-q:v", str(qscale),
215+
out_pattern,
216+
]
217+
218+
try:
219+
subprocess.run(cmd, check=True)
220+
except FileNotFoundError:
221+
raise RuntimeError("ffmpeg not found. Install ffmpeg in the container/host.")
222+
except subprocess.CalledProcessError as e:
223+
mode = "keyframes" if use_keyframes else f"sample_every_n={sample_every_n}"
224+
raise RuntimeError(f"ffmpeg failed extracting frames ({mode}): {e}")
225+
226+
# Read extracted frames back into base64
227+
for i in range(1, max_frames + 1):
228+
fpath = os.path.join(tmpdir, f"frame_{i:06d}.jpg")
229+
if not os.path.exists(fpath):
230+
break
231+
with open(fpath, "rb") as f:
232+
frames.append({"data": base64.b64encode(f.read()).decode("utf-8")})
233+
234+
return frames
235+
119236
def getSupportedOutputs(self) -> List[str]:
120237
"""Get supported output formats."""
121238
return ["mesh", "pointcloud"]

mapping/src/model_interface.py

Lines changed: 1 addition & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,10 @@
1212
"""
1313

1414
from abc import ABC, abstractmethod
15-
from typing import Dict, Any, List, Optional
15+
from typing import Dict, Any, List
1616

17-
import base64
1817
import cv2
1918
import numpy as np
20-
import os
21-
import subprocess
22-
import math
23-
import tempfile
2419

2520
from scene_common import log
2621

@@ -292,115 +287,3 @@ def rotationMatrixToQuaternion(self, R: np.ndarray) -> np.ndarray:
292287
z = 0.25 * s
293288

294289
return np.array([x, y, z, w])
295-
296-
def _maxFramesForTimeBudget(
297-
self,
298-
time_budget_seconds: float,
299-
overhead: float,
300-
) -> int:
301-
302-
cpu_sec_per_frame = float(os.getenv("MAPPING_CPU_SEC_PER_FRAME", "10"))
303-
cuda_sec_per_frame = float(os.getenv("MAPPING_CUDA_SEC_PER_FRAME", "0.8"))
304-
sec_per_frame = cpu_sec_per_frame
305-
if self.device.startswith("cuda") and cuda_sec_per_frame:
306-
sec_per_frame = cuda_sec_per_frame
307-
308-
usable = max(0.0, time_budget_seconds - overhead)
309-
if usable <= 0:
310-
return 0
311-
312-
# conservative: floor
313-
max_frames = int(math.floor(usable / max(1e-6, sec_per_frame)))
314-
return max_frames
315-
316-
def _framesFromVideoAsBase64Dicts(
317-
self,
318-
video_path: str,
319-
max_frames: int,
320-
use_keyframes: bool = True,
321-
sample_every_n: int = 10,
322-
jpeg_quality: int = 85,
323-
max_side: Optional[int] = 960,
324-
) -> List[Dict[str, Any]]:
325-
"""
326-
Extract frames using ffmpeg and return:
327-
[{"data": "<base64-encoded-jpeg>"}, ...]
328-
329-
Modes:
330-
- use_keyframes=True: extract TRUE keyframes (I-frames)
331-
- use_keyframes=False: sample every N frames using select filter
332-
"""
333-
if max_frames < 1:
334-
return []
335-
336-
if not os.path.isfile(video_path):
337-
raise ValueError(f"Video file not found: {video_path}")
338-
339-
if sample_every_n < 1:
340-
sample_every_n = 1
341-
342-
# Map jpeg_quality (1..100) -> ffmpeg mjpeg qscale (2..31), where 2 is best quality
343-
qscale = int(round(31 - (np.clip(jpeg_quality, 1, 100) / 100.0) * 29))
344-
qscale = int(np.clip(qscale, 2, 31))
345-
346-
vf_parts: List[str] = []
347-
348-
# If not keyframes, use select filter to sample frames
349-
if not use_keyframes:
350-
# keep frames where n % sample_every_n == 0
351-
vf_parts.append(f"select='not(mod(n\\,{sample_every_n}))'")
352-
else:
353-
log.info("Using key frames")
354-
355-
# Optional downscale: keep aspect ratio, cap longest side
356-
if max_side is not None and max_side > 0:
357-
vf_parts.append(
358-
f"scale='if(gte(iw,ih),min(iw,{max_side}),-2)':'if(lt(iw,ih),min(ih,{max_side}),-2)'"
359-
)
360-
361-
vf = ",".join(vf_parts) if vf_parts else None
362-
363-
frames: List[Dict[str, Any]] = []
364-
365-
with tempfile.TemporaryDirectory(prefix="frames_") as tmpdir:
366-
out_pattern = os.path.join(tmpdir, "frame_%06d.jpg")
367-
368-
cmd = [
369-
"ffmpeg",
370-
"-hide_banner",
371-
"-loglevel", "error",
372-
]
373-
374-
# Keyframes mode: only decode keyframes
375-
if use_keyframes:
376-
cmd += ["-skip_frame", "nokey"]
377-
378-
cmd += ["-i", video_path]
379-
380-
if vf:
381-
cmd += ["-vf", vf]
382-
383-
cmd += [
384-
"-vsync", "vfr",
385-
"-frames:v", str(max_frames),
386-
"-q:v", str(qscale),
387-
out_pattern,
388-
]
389-
390-
try:
391-
subprocess.run(cmd, check=True)
392-
except FileNotFoundError:
393-
raise RuntimeError("ffmpeg not found. Install ffmpeg in the container/host.")
394-
except subprocess.CalledProcessError as e:
395-
mode = "keyframes" if use_keyframes else f"sample_every_n={sample_every_n}"
396-
raise RuntimeError(f"ffmpeg failed extracting frames ({mode}): {e}")
397-
398-
# Read extracted frames back into base64
399-
for i in range(1, max_frames + 1):
400-
fpath = os.path.join(tmpdir, f"frame_{i:06d}.jpg")
401-
if not os.path.exists(fpath):
402-
break
403-
with open(fpath, "rb") as f:
404-
frames.append({"data": base64.b64encode(f.read()).decode("utf-8")})
405-
406-
return frames

mapping/src/vggt_model.py

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -406,33 +406,35 @@ def _preprocessImages(self, pil_images: List[Image.Image]) -> tuple:
406406
Returns:
407407
Tuple of (processed_tensor, model_size)
408408
"""
409-
target = 518
410-
n = len(pil_images)
409+
processed_images = []
410+
target_size = 518
411411

412-
# Preallocate on CPU, then move once
413-
batch = torch.empty((n, 3, target, target), dtype=torch.float32)
412+
for pil_image in pil_images:
413+
# Apply VGGT preprocessing (similar to load_and_preprocess_images)
414+
width, height = pil_image.size
414415

415-
for i, im in enumerate(pil_images):
416-
w, h = im.size
417-
new_w = target
418-
new_h = round(h * (new_w / w) / 14) * 14
419-
new_h = max(14, new_h)
416+
# Set width to target_size, calculate height maintaining aspect ratio
417+
new_width = target_size
418+
new_height = round(height * (new_width / width) / 14) * 14 # Divisible by 14
420419

421-
im = im.resize((new_w, new_h), Image.Resampling.BICUBIC)
420+
# Resize image
421+
img_resized = pil_image.resize((new_width, new_height), Image.Resampling.BICUBIC)
422422

423-
if new_h > target:
424-
top = (new_h - target) // 2
425-
im = im.crop((0, top, target, top + target))
426-
elif new_h < target:
427-
pad_top = (target - new_h) // 2
428-
canvas = Image.new(im.mode, (target, target))
429-
canvas.paste(im, (0, pad_top))
430-
im = canvas
423+
# Convert to tensor
424+
img_tensor = tvf.ToTensor()(img_resized) # Shape: (3, H, W), values [0, 1]
431425

432-
batch[i] = tvf.ToTensor()(im)
426+
# Center crop height if larger than target_size
427+
if new_height > target_size:
428+
start_y = (new_height - target_size) // 2
429+
img_tensor = img_tensor[:, start_y:start_y + target_size, :]
433430

434-
images_tensor = batch.to(self.device, non_blocking=True)
435-
return images_tensor, (target, target)
431+
processed_images.append(img_tensor)
432+
433+
# Stack all images and move to device
434+
images_tensor = torch.stack(processed_images).to(self.device) # Shape: (N, 3, H, W)
435+
model_size = images_tensor.shape[-2:] # (height, width)
436+
437+
return images_tensor, model_size
436438

437439
def _runModelInference(self, images_tensor: torch.Tensor) -> Dict[str, Any]:
438440
"""

0 commit comments

Comments
 (0)