facebookresearch · xiaoya27 · Dec 12, 2025
diff --git a/demo_colmap.py b/demo_colmap.py
@@ -11,6 +11,9 @@
 import copy
 import torch
 import torch.nn.functional as F
+import math
+import torch.multiprocessing as mp
+from typing import List, Tuple
 
 # Configure CUDA settings
 torch.backends.cudnn.enabled = True
@@ -44,6 +47,15 @@ def parse_args():
     parser.add_argument("--scene_dir", type=str, required=True, help="Directory containing the scene images")
     parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
     parser.add_argument("--use_ba", action="store_true", default=False, help="Use BA for reconstruction")
+    ######### Multi-GPU parameters #########
+    parser.add_argument(
+        "--multi_gpu", action="store_true", default=False, 
+        help="Enable parallel multi-GPU mode for VGGT inference. All GPUs process simultaneously using torch.multiprocessing."
+    )
+    parser.add_argument(
+        "--gpu_ids", type=str, default=None,
+        help="Comma-separated list of GPU IDs to use (e.g., '0,1,2,3'). If not set, uses all available GPUs."
+    )
     ######### BA parameters #########
     parser.add_argument(
         "--max_reproj_error", type=float, default=8.0, help="Maximum reprojection error for reconstruction"
@@ -90,6 +102,195 @@ def run_VGGT(model, images, dtype, resolution=518):
     return extrinsic, intrinsic, depth_map, depth_conf
 
 
+def _worker_process(gpu_id: int, images_shard: torch.Tensor, model_url: str, 
+                    dtype_str: str, resolution: int, result_queue: mp.Queue, 
+                    shard_idx: int, start_idx: int):
+    """
+    Worker process for parallel multi-GPU inference.
+
+    This function runs in a separate process, loads the model, runs inference,
+    and puts results into a shared queue.
+
+    Args:
+        gpu_id: GPU device ID to use
+        images_shard: Tensor of images [N, 3, H, W] for this shard (shared memory)
+        model_url: URL or path to model weights
+        dtype_str: String representation of dtype ('bfloat16' or 'float16')
+        resolution: VGGT inference resolution
+        result_queue: Multiprocessing queue to put results
+        shard_idx: Index of this shard (for ordering results)
+        start_idx: Starting frame index in the original sequence
+    """
+    try:
+        # Set up GPU
+        device = f"cuda:{gpu_id}"
+        torch.cuda.set_device(gpu_id)
+
+        # Convert dtype string back to torch dtype
+        dtype = torch.bfloat16 if dtype_str == 'bfloat16' else torch.float16
+
+        print(f"[GPU {gpu_id}] Worker started for shard {shard_idx} ({images_shard.shape[0]} frames)")
+
+        # Load model
+        model = VGGT()
+        model.load_state_dict(torch.hub.load_state_dict_from_url(model_url, map_location=device))
+        model.eval()
+        model = model.to(device)
+
+        # Move images to GPU (they're in shared memory)
+        images_gpu = images_shard.to(device)
+
+        # Run inference
+        extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images_gpu, dtype, resolution)
+
+        print(f"[GPU {gpu_id}] Shard {shard_idx} inference complete. Shape: {extrinsic.shape}")
+
+        # Clean up GPU memory
+        del model, images_gpu
+        torch.cuda.empty_cache()
+
+        # Put results in queue
+        result_queue.put({
+            'shard_idx': shard_idx,
+            'start_idx': start_idx,
+            'extrinsic': extrinsic,
+            'intrinsic': intrinsic,
+            'depth_map': depth_map,
+            'depth_conf': depth_conf,
+            'error': None
+        })
+
+    except Exception as e:
+        import traceback
+        error_msg = f"[GPU {gpu_id}] Error in shard {shard_idx}: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        result_queue.put({
+            'shard_idx': shard_idx,
+            'start_idx': start_idx,
+            'error': error_msg
+        })
+
+
+def run_VGGT_multi_gpu(images: torch.Tensor, gpu_ids: List[int], 
+                        dtype: torch.dtype, 
+                        resolution: int = 518) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Run VGGT inference across multiple GPUs in PARALLEL using torch.multiprocessing.
+
+    This implementation provides TRUE parallel processing:
+    - Each GPU runs inference in a separate process simultaneously
+    - Uses shared memory for efficient tensor transfer
+    - Collects results via multiprocessing queue
+
+    Memory benefit: Each GPU only loads frames for its shard (e.g., 55 frames instead of 221)
+    Speed benefit: All GPUs work simultaneously (near-linear speedup for inference)
+
+    Args:
+        images: Tensor of all images [N, 3, H, W] on CPU
+        gpu_ids: List of GPU IDs to use
+        dtype: Data type for inference
+        resolution: VGGT inference resolution
+
+    Returns:
+        Tuple of (extrinsic, intrinsic, depth_map, depth_conf) as numpy arrays
+    """
+    num_images = images.shape[0]
+    num_gpus = len(gpu_ids)
+
+    # Evenly distribute frames across GPUs
+    shard_size = math.ceil(num_images / num_gpus)
+
+    # Create shards - assign to GPUs round-robin
+    shards = []
+    for i in range(0, num_images, shard_size):
+        end_idx = min(i + shard_size, num_images)
+        gpu_idx = len(shards) % num_gpus
+        shards.append({
+            'start_idx': i,
+            'end_idx': end_idx,
+            'gpu_id': gpu_ids[gpu_idx],
+            'shard_idx': len(shards)
+        })
+
+    print(f"\n{'='*60}")
+    print(f"PARALLEL Multi-GPU VGGT Inference")
+    print(f"{'='*60}")
+    print(f"Total frames: {num_images}")
+    print(f"Number of shards: {len(shards)}")
+    print(f"GPUs: {gpu_ids}")
+    print(f"Shard assignments:")
+    for s in shards:
+        print(f"  Shard {s['shard_idx']}: frames [{s['start_idx']}, {s['end_idx']}) -> GPU {s['gpu_id']}")
+    print(f"{'='*60}\n")
+
+    # Model URL
+    model_url = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
+
+    # Convert dtype to string for pickling
+    dtype_str = 'bfloat16' if dtype == torch.bfloat16 else 'float16'
+
+    # Move images to shared memory for efficient inter-process sharing
+    images = images.share_memory_()
+
+    # Create result queue
+    result_queue = mp.Queue()
+
+    # Start worker processes
+    processes = []
+    for shard in shards:
+        # Extract shard images
+        images_shard = images[shard['start_idx']:shard['end_idx']].clone().share_memory_()
+
+        p = mp.Process(
+            target=_worker_process,
+            args=(
+                shard['gpu_id'],
+                images_shard,
+                model_url,
+                dtype_str,
+                resolution,
+                result_queue,
+                shard['shard_idx'],
+                shard['start_idx']
+            )
+        )
+        p.start()
+        processes.append(p)
+        print(f"Started process for shard {shard['shard_idx']} on GPU {shard['gpu_id']}")
+
+    # Collect results
+    results = []
+    for _ in range(len(shards)):
+        result = result_queue.get()
+        if result['error'] is not None:
+            # Clean up processes on error
+            for p in processes:
+                p.terminate()
+            raise RuntimeError(f"Worker process failed: {result['error']}")
+        results.append(result)
+        print(f"Received results from shard {result['shard_idx']}")
+
+    # Wait for all processes to finish
+    for p in processes:
+        p.join()
+
+    # Sort results by start_idx to maintain original frame order
+    results.sort(key=lambda x: x['start_idx'])
+
+    # Concatenate results
+    extrinsic = np.concatenate([r['extrinsic'] for r in results], axis=0)
+    intrinsic = np.concatenate([r['intrinsic'] for r in results], axis=0)
+    depth_map = np.concatenate([r['depth_map'] for r in results], axis=0)
+    depth_conf = np.concatenate([r['depth_conf'] for r in results], axis=0)
+
+    print(f"\n{'='*60}")
+    print(f"Parallel multi-GPU inference COMPLETE")
+    print(f"Combined results: extrinsic={extrinsic.shape}, depth_map={depth_map.shape}")
+    print(f"{'='*60}\n")
+
+    return extrinsic, intrinsic, depth_map, depth_conf
+
+
 def demo_fn(args):
     # Print configuration
     print("Arguments:", vars(args))
@@ -109,13 +310,16 @@ def demo_fn(args):
     print(f"Using device: {device}")
     print(f"Using dtype: {dtype}")
 
-    # Run VGGT for camera and depth estimation
-    model = VGGT()
-    _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
-    model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
-    model.eval()
-    model = model.to(device)
-    print(f"Model loaded")
+    # Parse GPU IDs for multi-GPU mode
+    if args.multi_gpu:
+        if args.gpu_ids is not None:
+            gpu_ids = [int(x.strip()) for x in args.gpu_ids.split(",")]
+        else:
+            gpu_ids = list(range(torch.cuda.device_count()))
+        print(f"Multi-GPU mode enabled. Using GPUs: {gpu_ids}")
+        if len(gpu_ids) < 2:
+            print("Warning: Multi-GPU mode requested but only 1 GPU available. Using single-GPU mode.")
+            args.multi_gpu = False
 
     # Get image paths and preprocess them
     image_dir = os.path.join(args.scene_dir, "images")
@@ -130,13 +334,41 @@ def demo_fn(args):
     img_load_resolution = 1024
 
     images, original_coords = load_and_preprocess_images_square(image_path_list, img_load_resolution)
-    images = images.to(device)
-    original_coords = original_coords.to(device)
     print(f"Loaded {len(images)} images from {image_dir}")
 
-    # Run VGGT to estimate camera and depth
-    # Run with 518x518 images
-    extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images, dtype, vggt_fixed_resolution)
+    if args.multi_gpu:
+        # Multi-GPU mode: keep images on CPU, shards will be moved to respective GPUs
+        original_coords = original_coords.to(gpu_ids[0])  # Move coords to first GPU for later use
+
+        # Run parallel multi-GPU inference
+        extrinsic, intrinsic, depth_map, depth_conf = run_VGGT_multi_gpu(
+            images, gpu_ids, dtype, vggt_fixed_resolution
+        )
+
+        # Move images to first GPU for subsequent operations (tracking)
+        device = f"cuda:{gpu_ids[0]}"
+        images = images.to(device)
+    else:
+        # Single-GPU mode: original behavior
+        # Run VGGT for camera and depth estimation
+        model = VGGT()
+        _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
+        model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
+        model.eval()
+        model = model.to(device)
+        print(f"Model loaded")
+
+        images = images.to(device)
+        original_coords = original_coords.to(device)
+
+        # Run VGGT to estimate camera and depth
+        # Run with 518x518 images
+        extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images, dtype, vggt_fixed_resolution)
+
+        # Clean up model to free memory
+        del model
+        torch.cuda.empty_cache()
+
     points_3d = unproject_depth_map_to_point_map(depth_map, extrinsic, intrinsic)
 
     if args.use_ba:
@@ -293,13 +525,15 @@ def rename_colmap_recons_and_rescale_camera(
 
 
 if __name__ == "__main__":
+    # Set multiprocessing start method for CUDA compatibility
+    # 'spawn' is required for CUDA tensors in child processes
+    mp.set_start_method('spawn', force=True)
+
     args = parse_args()
     with torch.no_grad():
         demo_fn(args)
 
 
-# Work in Progress (WIP)
-
 """
 VGGT Runner Script
 =================
@@ -327,4 +561,29 @@ def rename_colmap_recons_and_rescale_camera(
 • Dual-mode Support: Run reconstructions using either VGGT or VGGT+BA
 • Resolution Preservation: Maintains original image resolution in camera parameters and tracks
 • COLMAP Compatibility: Exports results in standard COLMAP sparse reconstruction format
+• Multi-GPU Support: Shard frames across multiple GPUs to reduce per-GPU memory usage
+• Parallel Multi-GPU: True parallel processing with torch.multiprocessing for speedup
+
+Multi-GPU Usage
+--------------
+Enable multi-GPU mode for parallel processing across GPUs:
+
+    # Use all available GPUs
+    python demo_colmap.py --scene_dir=/path/to/scene --multi_gpu
+
+    # Specify which GPUs to use
+    python demo_colmap.py --scene_dir=/path/to/scene --multi_gpu --gpu_ids=0,1,2,3
+
+Multi-GPU Benefits
+------------------
+- All GPUs work simultaneously using torch.multiprocessing
+- Memory benefit: Each GPU only loads its shard (~55 frames instead of 221)
+- Speed benefit: Near-linear speedup (N GPUs ≈ N× faster for inference)
+
+Memory Profile (221 images on Nvidia GPUs):
+- Single GPU: ~77 GB peak
+- 2 GPUs (~110 frames each): ~42 GB per GPU
+- Speed: ~N× faster inference with N GPUs
+
+Note: Bundle Adjustment (BA) and tracking still run on a single GPU after multi-GPU inference.
 """