Skip to content
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
039c550
Set map scale during map creation during scene import
daddo-intel Jan 13, 2026
96d26c5
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 15, 2026
23f8d8d
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 21, 2026
0968474
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 22, 2026
491e473
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 26, 2026
154c85b
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 27, 2026
4254d2f
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 27, 2026
90d0488
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 27, 2026
74adad9
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Jan 29, 2026
73361cf
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 3, 2026
54fdfa8
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 3, 2026
70058d8
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 4, 2026
aab521e
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 5, 2026
89f7bf4
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 9, 2026
3a1e0a7
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 11, 2026
b449578
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 12, 2026
7797509
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 13, 2026
9af0ff3
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 17, 2026
39f818d
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 17, 2026
26321bd
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 18, 2026
a9417db
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 22, 2026
a41ff1c
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 25, 2026
6395dd2
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Feb 26, 2026
0636785
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Mar 2, 2026
2cb8972
Merge branch 'main' of https://github.com/open-edge-platform/scenescape
daddo-intel Mar 2, 2026
8730e38
Fix camera pose
daddo-intel Mar 2, 2026
d2af6a5
Fix camera & analytics pose for vggt mapping model
daddo-intel Mar 5, 2026
af4b6e6
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Mar 5, 2026
ecd1077
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Mar 5, 2026
726e3d4
Fix indentation
daddo-intel Mar 5, 2026
38d7560
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Mar 9, 2026
f57e1fc
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
dmytroye Mar 9, 2026
97fde32
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Mar 25, 2026
8a54ca8
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
saratpoluri Mar 26, 2026
5afe184
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Mar 30, 2026
8df6154
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 1, 2026
902208d
Update mapping/src/vggt_model.py
daddo-intel Apr 1, 2026
b6b228b
Define poll timeout, remove redundant if and add logging during exce…
daddo-intel Apr 1, 2026
42f94ce
Update mapping/src/vggt_model.py
daddo-intel Apr 1, 2026
665f083
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 1, 2026
7ee918a
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 2, 2026
8da53ce
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 3, 2026
94284ee
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 3, 2026
d3353da
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 20, 2026
5e3d4e5
Change baseline_m to baseline_metric
saratpoluri Apr 21, 2026
7753a57
use minimum pairwise distance as metric baseline instead of median
daddo-intel Apr 21, 2026
156c4e2
Merge branch 'main' into fix/ITEP-84336-incorrect-cam-pose-vggt
daddo-intel Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions manager/src/django/mesh_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from scene_common.mesh_util import mergeMesh
from scene_common.options import QUATERNION
from scene_common import log
from manager.serializers import CamSerializer

ALLOWED_VIDEO_MIME_TYPES = {
"video/mp4",
Expand Down Expand Up @@ -157,6 +158,7 @@ def startReconstructMesh(
self,
images: Dict[str, Dict],
camera_order: List[str],
camera_location_order: List,
mesh_type: str = "mesh",
uploaded_map=None,
):
Comment thread
daddo-intel marked this conversation as resolved.
Expand All @@ -178,6 +180,10 @@ def startReconstructMesh(
"mesh_type": mesh_type,
}

camera_loc_by_id = {
cam_id: cam_loc
for cam_id, cam_loc in zip(camera_order, camera_location_order)
}
log.info(f"Sending {len(images)} images to mapping service for reconstruction")

files = []
Expand All @@ -201,6 +207,16 @@ def startReconstructMesh(
)
)
files.append(("camera_ids", (None, camera_id)))
cam_loc = camera_loc_by_id.get(camera_id)
if cam_loc is not None:
cam_loc_clean = {
"translation": list(cam_loc["translation"]),
"rotation": list(cam_loc["rotation"]),
"scale": list(cam_loc.get("scale", [1.0, 1.0, 1.0])),
}
files.append(("camera_locations", (None, json.dumps(cam_loc_clean))))
else:
log.warning(f"No camera location for {camera_id}")
else:
log.warning(
f"Camera {camera_id} in camera_order but not in images dict"
Expand Down Expand Up @@ -414,9 +430,30 @@ def startMeshGeneration(self, scene, mesh_type='mesh', uploaded_map=None):
log.info(f"Collected {len(images)} images, calling mapping service")
# Call mapping service to generate mesh
# Pass camera IDs in order to ensure correct pose association
camera_order = [camera.sensor_id for camera in cameras]

camera_location_order = []
camera_order = []
serializer = CamSerializer()

for camera in cameras:
cam_id = camera.sensor_id
camera_order.append(cam_id)

t = serializer.get_translation(camera)
q = serializer.get_rotation(camera)
s = serializer.get_scale(camera) or [1.0, 1.0, 1.0]

if t is None or q is None:
raise ValueError(f"Missing pose for camera {cam_id}: t={t} q={q}")

camera_location_order.append({
"translation": list(t),
"rotation": list(q),
"scale": list(s),
})

started = self.mapping_client.startReconstructMesh(
images, camera_order, mesh_type, uploaded_map_path
images, camera_order, camera_location_order, mesh_type, uploaded_map_path
)
rid = started.get("request_id")
if not rid:
Expand Down
27 changes: 22 additions & 5 deletions mapping/src/api_service_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from werkzeug.utils import secure_filename
import uuid
import threading
import json

from flask import Flask, request, jsonify
from flask_cors import CORS
Expand Down Expand Up @@ -215,6 +216,7 @@ def reconstruct3D():
image_files = request.files.getlist("images")
video_file = request.files.get("video")
camera_ids = request.form.getlist("camera_ids")
camera_locations = request.form.getlist("camera_locations", None)
Comment thread
daddo-intel marked this conversation as resolved.

if (not image_files) and (video_file is None):
set_status(request_id, state="failed", updated_at=time.time(), error="Provide images and/or video")
Expand All @@ -228,17 +230,32 @@ def reconstruct3D():
images = None
if image_files:
images = []
pairs = zip(image_files, camera_ids) if camera_ids else [(f, None) for f in image_files]
for f, cam_id in pairs:

for idx, f in enumerate(image_files):
if not f or not f.filename:
continue

raw = f.read()
if not raw:
continue

cam_id = camera_ids[idx] if idx < len(camera_ids) else None

cam_loc = None
if camera_locations and idx < len(camera_locations):
try:
cam_loc = json.loads(camera_locations[idx])
Comment thread
daddo-intel marked this conversation as resolved.
except Exception as e:
log.warning(
f"Invalid JSON camera location for camera {idx} | error: {e}"
)
cam_loc = None

images.append({
"filename": secure_filename(f.filename),
"camera_id": cam_id,
"data": base64.b64encode(raw).decode("utf-8"),
"filename": secure_filename(f.filename),
"camera_id": cam_id,
"camera_location": cam_loc, # Only populated if provided
"data": base64.b64encode(raw).decode("utf-8"),
})

if not images:
Expand Down
191 changes: 150 additions & 41 deletions mapping/src/vggt_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from PIL import Image
import torchvision.transforms as tvf


import tempfile
import numpy as np
import trimesh
Expand Down Expand Up @@ -104,9 +105,13 @@ def runInference(self, images: List[Dict[str, Any]]) -> Dict[str, Any]:
# Decode images and get original sizes
pil_images = []
original_sizes = []
camera_ids = []
camera_locations = []

for img_data in images:
img_array = self.decodeBase64Image(img_data["data"])
camera_ids.append(img_data.get("camera_id"))
camera_locations.append(img_data.get("camera_location"))
# Apply CLAHE for improved contrast
img_array = self._applyCLAHE(img_array)
pil_image = Image.fromarray(img_array)
Expand All @@ -121,7 +126,7 @@ def runInference(self, images: List[Dict[str, Any]]) -> Dict[str, Any]:
predictions = self._runModelInference(images_tensor)

# Process outputs
result = self._processOutputs(predictions, original_sizes, model_size)
result = self._processOutputs(predictions, original_sizes, model_size, camera_ids=camera_ids, camera_locations=camera_locations)

return result

Expand All @@ -137,6 +142,14 @@ def getNativeOutput(self) -> str:
"""Get native output format."""
return "pointcloud"

def _camera_center_from_c2w(self, c2w: np.ndarray) -> np.ndarray:
return c2w[:3, 3]

def _baseline_units(self, c2w_a: np.ndarray, c2w_b: np.ndarray) -> float:
    """Return the Euclidean distance between the centers of two camera poses.

    Each pose is passed to ``_camera_center_from_c2w`` to obtain its center;
    the norm of the difference (second minus first) is returned as a plain
    Python float.
    """
    center_a, center_b = (
        self._camera_center_from_c2w(pose) for pose in (c2w_a, c2w_b)
    )
    offset = center_b - center_a
    return float(np.linalg.norm(offset))

def scaleIntrinsicsToOriginalSize(self, intrinsics: np.ndarray, model_size: tuple, original_sizes: list,
preprocessing_mode: str = "crop") -> list:
"""Scale intrinsics for VGGT preprocessing (simple resize + crop/pad)"""
Expand Down Expand Up @@ -398,41 +411,43 @@ def createOutput(

def _preprocessImages(self, pil_images: List[Image.Image]) -> tuple:
"""
Preprocess images using VGGT's logic.

Args:
pil_images: List of PIL images

Returns:
Tuple of (processed_tensor, model_size)
No-padding preprocess:
1) Resize so the SHORTER side becomes 518 (keeps aspect ratio).
2) (Optionally) round resized dims to multiples of 14 for VGGT.
3) Center-crop to 518x518.
"""
processed_images = []
target = 518
n = len(pil_images)

# Preallocate on CPU, then move once
batch = torch.empty((n, 3, target, target), dtype=torch.float32)
for pil_image in pil_images:
w, h = pil_image.size

# Scale so min side == 518
scale = target / float(min(w, h))
new_w = int(round((w * scale) / 14.0) * 14)
new_h = int(round((h * scale) / 14.0) * 14)

Comment thread
daddo-intel marked this conversation as resolved.
for i, im in enumerate(pil_images):
w, h = im.size
new_w = target
new_h = round(h * (new_w / w) / 14) * 14
new_h = max(14, new_h)
# Safety: ensure both dims >= 518 after rounding
new_w = max(target, new_w)
new_h = max(target, new_h)

im = im.resize((new_w, new_h), Image.Resampling.BICUBIC)
img_resized = pil_image.resize((new_w, new_h), Image.Resampling.BICUBIC)
img_tensor = tvf.ToTensor()(img_resized) # (3, H, W)

if new_h > target:
top = (new_h - target) // 2
im = im.crop((0, top, target, top + target))
elif new_h < target:
pad_top = (target - new_h) // 2
canvas = Image.new(im.mode, (target, target))
canvas.paste(im, (0, pad_top))
im = canvas
# Center crop to 518x518 (no padding)
H, W = img_tensor.shape[1], img_tensor.shape[2]
top = (H - target) // 2
left = (W - target) // 2
img_tensor = img_tensor[:, top:top + target, left:left + target]

batch[i] = tvf.ToTensor()(im)
if img_tensor.shape[1] != target or img_tensor.shape[2] != target:
raise RuntimeError(f"Preprocess produced {tuple(img_tensor.shape)}; expected (3,{target},{target})")

images_tensor = batch.to(self.device, non_blocking=True)
return images_tensor, (target, target)
processed_images.append(img_tensor)

images_tensor = torch.stack(processed_images, dim=0).to(self.device) # (N,3,518,518)
model_size = (target, target)
return images_tensor, model_size

def _runModelInference(self, images_tensor: torch.Tensor) -> Dict[str, Any]:
"""
Expand All @@ -454,8 +469,51 @@ def _runModelInference(self, images_tensor: torch.Tensor) -> Dict[str, Any]:

return predictions

def _baseline_m_from_camera_locations(self, camera_locations, camera_ids=None) -> float:
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
"""
Robustly compute a metric baseline (meters) from camera_locations.

camera_locations may contain dicts, None, and/or malformed entries.
Each valid dict must contain 'translation': [x,y,z] in meters.
"""
if not camera_locations or len(camera_locations) < 2:
return 0.0

try:
translations = []
for loc in camera_locations:
if not isinstance(loc, dict):
continue
t = loc.get("translation", None)
if t is None or len(t) != 3:
continue
t = np.asarray(t, dtype=np.float32)
if t.shape != (3,) or not np.isfinite(t).all():
continue
translations.append(t)

if len(translations) < 2:
return 0.0

distances = []
for i in range(len(translations)):
for j in range(i + 1, len(translations)):
d = float(np.linalg.norm(translations[j] - translations[i]))
if np.isfinite(d) and d > 1e-6:
distances.append(d)

if not distances:
return 0.0

# Median is robust to occasional bad pose entries
return float(np.median(distances))
Comment thread
saratpoluri marked this conversation as resolved.
Outdated

except Exception as e:
log.exception(f"Failed to compute baseline from camera_locations: {e}")
return 0.0

def _processOutputs(self, predictions: Dict[str, Any], original_sizes: List[tuple],
model_size: tuple) -> Dict[str, Any]:
model_size: tuple, camera_ids: List[Any] = None, camera_locations: List[Any] = None) -> Dict[str, Any]:
"""
Process VGGT outputs into standard format.

Expand Down Expand Up @@ -502,32 +560,83 @@ def _processOutputs(self, predictions: Dict[str, Any], original_sizes: List[tupl
intrinsics_list = []

extrinsic_matrices = predictions["extrinsic"] # Shape: (S, 4, 4) - world-to-camera
rotation_x_180 = np.array([
[1, 0, 0, 0],
[0, -1, 0, 0],
[0, 0, -1, 0],
[0, 0, 0, 1]
], dtype=np.float32)

# --- build camera_to_world for all frames first ---
camera_to_world_list = []

for i in range(extrinsic_matrices.shape[0]):
# VGGT outputs extrinsics (world-to-camera), but we want camera poses (camera-to-world)
# Convert by taking the inverse of the extrinsic matrix
world_to_camera = extrinsic_matrices[i] # 4x4 matrix
world_to_camera = extrinsic_matrices[i] # (4,4) or (3,4)

# Convert 3x4 to 4x4 if needed
if world_to_camera.shape == (3, 4):
world_to_camera_4x4 = np.eye(4)
world_to_camera_4x4[:3, :4] = world_to_camera
world_to_camera = world_to_camera_4x4
w2c = np.eye(4, dtype=np.float32)
w2c[:3, :4] = world_to_camera
world_to_camera = w2c

# Invert to get camera-to-world
c2w = np.linalg.inv(world_to_camera).astype(np.float32)

# Apply your orientation fix
c2w = rotation_x_180 @ c2w

camera_to_world_list.append(c2w)

# --- SCALE FIX: compute metric baseline from provided camera_locations ---
baseline_m = self._baseline_m_from_camera_locations(camera_locations, camera_ids=camera_ids)
Comment thread
saratpoluri marked this conversation as resolved.
Outdated

if baseline_m <= 0:
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
log.warning("VGGT: camera_locations missing/invalid; skipping metric scaling (scale will be arbitrary).")

if baseline_m > 0 and len(camera_to_world_list) >= 2:
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
b_units = self._baseline_units(camera_to_world_list[0], camera_to_world_list[1])
if b_units > 1e-6:
s = baseline_m / b_units
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
log.info(f"Scaling VGGT outputs by s={s:.6f} (baseline {baseline_m:.6f}m / {b_units:.6f} units)")
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
Comment thread
saratpoluri marked this conversation as resolved.
Outdated

# Invert to get camera-to-world (camera pose)
camera_to_world = np.linalg.inv(world_to_camera)
# scale camera translations
for k in range(len(camera_to_world_list)):
camera_to_world_list[k] = camera_to_world_list[k].copy()
camera_to_world_list[k][:3, 3] *= s
Comment thread
saratpoluri marked this conversation as resolved.
Outdated

intrinsic_matrix = original_intrinsics[i] # Use scaled intrinsics
# scale world points (affects glb_size -> pixels_per_meter)
if isinstance(predictions.get("world_points_from_depth"), np.ndarray):
predictions["world_points_from_depth"] *= s
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
if isinstance(predictions.get("world_points"), np.ndarray):
predictions["world_points"] *= s
Comment thread
saratpoluri marked this conversation as resolved.
Outdated

# optional: scale depth too (only if used elsewhere)
if isinstance(predictions.get("depth"), np.ndarray):
predictions["depth"] *= s
Comment thread
saratpoluri marked this conversation as resolved.
Outdated
else:
log.warning(f"VGGT: predicted baseline too small ({b_units}); skipping scaling.")

# --- now build camera_poses + intrinsics_list using the scaled camera_to_world_list ---
camera_poses = []
intrinsics_list = []

for i, camera_to_world in enumerate(camera_to_world_list):
cam_id = camera_ids[i] if camera_ids is not None and i < len(camera_ids) else None
K = original_intrinsics[i]

# Convert rotation matrix to quaternion
rotation_matrix = camera_to_world[:3, :3]
quaternion = self.rotationMatrixToQuaternion(rotation_matrix)

camera_poses.append({
"rotation": quaternion.tolist(), # [x, y, z, w]
"camera_id": cam_id,
"rotation": quaternion.tolist(), # [x, y, z, w]
"translation": camera_to_world[:3, 3].tolist()
})
intrinsics_list.append(intrinsic_matrix.tolist())

intrinsics_list.append({
"camera_id": cam_id,
"K": K.tolist()
})

return {
"predictions": predictions,
Expand Down
Loading
Loading