diff --git a/src/lerobot/policies/vqbet/modeling_vqbet.py b/src/lerobot/policies/vqbet/modeling_vqbet.py index 359b4fdb1d2..6d3976b7932 100644 --- a/src/lerobot/policies/vqbet/modeling_vqbet.py +++ b/src/lerobot/policies/vqbet/modeling_vqbet.py @@ -467,8 +467,8 @@ def discretize(self, n_vqvae_training_steps, actions): self.vqvae_model.optimized_steps += 1 # if we updated RVQ more than `n_vqvae_training_steps` steps, we freeze the RVQ part. if self.vqvae_model.optimized_steps >= n_vqvae_training_steps: - self.vqvae_model.discretized = torch.tensor(True) - self.vqvae_model.vq_layer.freeze_codebook = torch.tensor(True) + self.vqvae_model.discretized.fill_(True) + self.vqvae_model.vq_layer.freeze_codebook.fill_(True) print("Finished discretizing action data!") self.vqvae_model.eval() for param in self.vqvae_model.vq_layer.parameters(): diff --git a/tests/policies/test_vqbet_modeling.py b/tests/policies/test_vqbet_modeling.py new file mode 100644 index 00000000000..be40d982359 --- /dev/null +++ b/tests/policies/test_vqbet_modeling.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Regression tests for VQBeT modeling internals.""" + +import pytest +import torch + +from lerobot.configs.types import FeatureType, PolicyFeature +from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig +from lerobot.policies.vqbet.modeling_vqbet import VQBeTHead # noqa: E402 +from lerobot.utils.constants import ACTION, OBS_IMAGE, OBS_STATE + + +def _make_minimal_config(n_vqvae_training_steps: int = 3) -> VQBeTConfig: + """Return a small VQBeTConfig suitable for CPU unit tests.""" + config = VQBeTConfig() + config.input_features = { + OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)), + OBS_IMAGE: PolicyFeature(type=FeatureType.VISUAL, shape=(3, 96, 96)), + } + config.output_features = { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)), + } + # Use tiny sizes so the test is fast on CPU. + config.n_vqvae_training_steps = n_vqvae_training_steps + config.vqvae_n_embed = 8 + config.vqvae_embedding_dim = 32 + config.vqvae_enc_hidden_dim = 32 + config.action_chunk_size = 2 + config.crop_shape = (84, 84) + config.device = "cpu" + return config + + +def test_discretize_updates_buffers_in_place(): + """Regression test: discretize() must update registered buffers in-place. + + Replacing them with `self.vqvae_model.discretized = torch.tensor(True)` creates + a new CPU tensor and breaks DDP buffer synchronisation with NCCL when the model + is on GPU (RuntimeError: No backend type associated with device type cpu). + + This test verifies that after discretization the underlying tensor storage + (data_ptr) is unchanged, i.e. no new tensor was allocated. + """ + config = _make_minimal_config(n_vqvae_training_steps=3) + head = VQBeTHead(config) + head.eval() + + vqvae = head.vqvae_model + vq_layer = vqvae.vq_layer + + # Capture storage addresses of the two registered buffers before discretization. + discretized_ptr_before = vqvae.discretized.data_ptr() + freeze_codebook_ptr_before = vq_layer.freeze_codebook.data_ptr() + + # Both flags should be False at init. + assert not vqvae.discretized.item(), "discretized should be False before training" + assert not vq_layer.freeze_codebook.item(), "freeze_codebook should be False before training" + + # Run discretize() until the threshold is crossed (n_vqvae_training_steps calls). + batch_size = 4 + seq_len = config.action_chunk_size # minimum sequence length that produces at least one chunk + action_dim = config.action_feature.shape[0] + dummy_actions = torch.randn(batch_size, seq_len, action_dim) + + n_steps = config.n_vqvae_training_steps + for _ in range(n_steps): + head.discretize(n_steps, dummy_actions) + + # After discretization both flags must be True. + assert vqvae.discretized.item(), "discretized should be True after training" + assert vq_layer.freeze_codebook.item(), "freeze_codebook should be True after training" + + # Most importantly: the storage must be the *same* tensor objects (in-place update). + # If a new tensor was created the data_ptr would differ, and DDP would try to + # NCCL-broadcast a CPU tensor when the model lives on GPU. + assert vqvae.discretized.data_ptr() == discretized_ptr_before, ( + "vqvae_model.discretized was replaced with a new tensor instead of being updated in-place. " + "This breaks DDP GPU buffer synchronisation." + ) + assert vq_layer.freeze_codebook.data_ptr() == freeze_codebook_ptr_before, ( + "vq_layer.freeze_codebook was replaced with a new tensor instead of being updated in-place. " + "This breaks DDP GPU buffer synchronisation." + ) + + # The buffers must still appear in the module's named_buffers() after update. + buffer_names = {name for name, _ in vqvae.named_buffers()} + assert "discretized" in buffer_names, ( + "vqvae_model.discretized is no longer a registered buffer after discretize(). " + "Use .fill_() instead of direct assignment to preserve buffer registration." + ) + freeze_buffer_names = {name for name, _ in vq_layer.named_buffers()} + assert "freeze_codebook" in freeze_buffer_names, ( + "vq_layer.freeze_codebook is no longer a registered buffer after discretize(). " + "Use .fill_() instead of direct assignment to preserve buffer registration." + ) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_discretize_keeps_buffers_on_gpu(): + """Regression test: discretize() must not move registered buffers from GPU to CPU. + + This is the exact failure mode that caused the DDP crash: + RuntimeError: No backend type associated with device type cpu + + When `self.vqvae_model.discretized = torch.tensor(True)` is used (wrong), + torch.tensor() creates a CPU tensor by default, silently moving the buffer + off the GPU. DDP's _sync_buffers() then tries to NCCL-broadcast a CPU tensor, + which NCCL does not support. + + This test places the model on GPU and verifies that after discretize() both + buffers remain on CUDA, preventing the above regression. + """ + config = _make_minimal_config(n_vqvae_training_steps=3) + head = VQBeTHead(config) + device = torch.device("cuda:0") + head = head.to(device) + + vqvae = head.vqvae_model + vq_layer = vqvae.vq_layer + + # Confirm buffers start on GPU. + assert vqvae.discretized.device.type == "cuda", "discretized should start on CUDA" + assert vq_layer.freeze_codebook.device.type == "cuda", "freeze_codebook should start on CUDA" + + # Run discretize() until the threshold is crossed. + batch_size = 4 + seq_len = config.action_chunk_size + action_dim = config.action_feature.shape[0] + dummy_actions = torch.randn(batch_size, seq_len, action_dim, device=device) + + n_steps = config.n_vqvae_training_steps + for _ in range(n_steps): + head.discretize(n_steps, dummy_actions) + + # Flags must be True. + assert vqvae.discretized.item(), "discretized should be True after training" + assert vq_layer.freeze_codebook.item(), "freeze_codebook should be True after training" + + # Core assertion: buffers must still live on GPU after discretize(). + # A direct-assignment `= torch.tensor(True)` creates a CPU tensor and fails here. + assert vqvae.discretized.device.type == "cuda", ( + "vqvae_model.discretized was moved to CPU during discretize(). " + "This would cause 'RuntimeError: No backend type associated with device type cpu' " + "in DDP._sync_buffers(). Use .fill_(True) instead of = torch.tensor(True)." + ) + assert vq_layer.freeze_codebook.device.type == "cuda", ( + "vq_layer.freeze_codebook was moved to CPU during discretize(). " + "This would cause 'RuntimeError: No backend type associated with device type cpu' " + "in DDP._sync_buffers(). Use .fill_(True) instead of = torch.tensor(True)." + )