Skip to content

Commit 2a83dd5

Browse files
committed
add backward pass for non-DeepEP version
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent e7db783 commit 2a83dd5

File tree

5 files changed

+325
-43
lines changed

5 files changed

+325
-43
lines changed

bionemo-recipes/models/mixtral/modeling_mixtral_te.py

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,54 @@ class _AllToAllHandle:
949949
output_split_sizes: list[int] | None = None
950950

951951

952+
class _DifferentiableAllToAll(torch.autograd.Function):
953+
"""Differentiable wrapper around dist.all_to_all_single.
954+
955+
The forward pass performs the standard all-to-all communication.
956+
The backward pass reverses the communication direction (swapping
957+
input/output split sizes) so that gradients flow correctly.
958+
"""
959+
960+
@staticmethod
961+
def forward(
962+
ctx,
963+
input: torch.Tensor,
964+
output_split_sizes: list[int],
965+
input_split_sizes: list[int],
966+
group: dist.ProcessGroup,
967+
) -> torch.Tensor:
968+
"""Perform all-to-all forward and save sizes for backward."""
969+
ctx.input_split_sizes = input_split_sizes
970+
ctx.output_split_sizes = output_split_sizes
971+
ctx.group = group
972+
output = torch.empty(
973+
sum(output_split_sizes),
974+
input.shape[1],
975+
device=input.device,
976+
dtype=input.dtype,
977+
)
978+
dist.all_to_all_single(output, input.contiguous(), output_split_sizes, input_split_sizes, group=group)
979+
return output
980+
981+
@staticmethod
982+
def backward(ctx, grad_output: torch.Tensor) -> tuple[torch.Tensor, None, None, None]:
983+
"""Reverse all-to-all: swap input and output split sizes."""
984+
grad_input = torch.empty(
985+
sum(ctx.input_split_sizes),
986+
grad_output.shape[1],
987+
device=grad_output.device,
988+
dtype=grad_output.dtype,
989+
)
990+
dist.all_to_all_single(
991+
grad_input,
992+
grad_output.contiguous(),
993+
ctx.input_split_sizes,
994+
ctx.output_split_sizes,
995+
group=ctx.group,
996+
)
997+
return grad_input, None, None, None
998+
999+
9521000
class AllToAllTokenDispatcher:
9531001
"""TokenDispatcher using NCCL all-to-all for expert-parallel communication.
9541002
@@ -998,10 +1046,7 @@ def dispatch(
9981046
# Compute m_splits: number of tokens per expert
9991047
m_splits_tensor = torch.bincount(selected_experts.reshape(-1), minlength=self.num_experts).int()
10001048

1001-
if self.ep_size > 1:
1002-
assert self._ep_group is not None, (
1003-
"EP group must be set via set_ep_group() before dispatch when ep_size > 1"
1004-
)
1049+
if self._ep_group is not None:
10051050
ep_group = self._ep_group
10061051

10071052
# Token counts per expert, reshaped to [ep_size, num_local_experts]
@@ -1016,14 +1061,10 @@ def dispatch(
10161061
output_split_sizes = recv_counts.sum(dim=1).tolist()
10171062
local_m_splits = recv_counts.sum(dim=0).int().tolist()
10181063

1019-
# Dispatch tokens to expert-owning ranks
1020-
recv_tokens = torch.empty(
1021-
sum(output_split_sizes),
1022-
self.hidden_size,
1023-
device=permuted_hidden.device,
1024-
dtype=permuted_hidden.dtype,
1064+
# Dispatch tokens to expert-owning ranks (differentiable)
1065+
recv_tokens = _DifferentiableAllToAll.apply(
1066+
permuted_hidden, output_split_sizes, input_split_sizes, ep_group
10251067
)
1026-
dist.all_to_all_single(recv_tokens, permuted_hidden, output_split_sizes, input_split_sizes, group=ep_group)
10271068

10281069
# Sort received tokens by local expert index.
10291070
# After all_to_all layout is [src0_exp0..src0_expL, src1_exp0..src1_expL, ...].
@@ -1060,21 +1101,14 @@ def combine(self, expert_output: torch.Tensor, handle: _AllToAllHandle) -> torch
10601101
Returns:
10611102
Combined output tensor of shape ``[N, H]`` with routing weights applied.
10621103
"""
1063-
if self.ep_size > 1:
1104+
if self._ep_group is not None:
10641105
assert handle.unsort_indices is not None
1065-
# Unsort back to source-rank-grouped order and reverse all_to_all
1066-
combined = torch.empty(
1067-
sum(handle.input_split_sizes),
1068-
self.hidden_size,
1069-
device=expert_output.device,
1070-
dtype=expert_output.dtype,
1071-
)
1072-
dist.all_to_all_single(
1073-
combined,
1106+
# Unsort back to source-rank-grouped order and reverse all_to_all (differentiable)
1107+
combined = _DifferentiableAllToAll.apply(
10741108
expert_output[handle.unsort_indices],
10751109
handle.input_split_sizes,
10761110
handle.output_split_sizes,
1077-
group=self._ep_group,
1111+
self._ep_group,
10781112
)
10791113
else:
10801114
combined = expert_output

bionemo-recipes/models/mixtral/tests/test_ep.py

Lines changed: 118 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,15 @@
4040
)
4141

4242

43-
@requires_multi_gpu
44-
def test_ep2_matches_ep1(unused_tcp_port):
45-
"""Test that EP=2 produces the same logits as EP=1."""
43+
def _run_torchrun(test_name: str, port: int):
44+
"""Run a named test worker via torchrun with 2 GPUs."""
4645
cmd = [
4746
"torchrun",
4847
"--nproc_per_node=2",
4948
"--rdzv-backend=c10d",
50-
f"--rdzv-endpoint=localhost:{unused_tcp_port}",
49+
f"--rdzv-endpoint=localhost:{port}",
5150
str(Path(__file__).resolve()),
51+
test_name,
5252
]
5353
result = subprocess.run(
5454
cmd,
@@ -62,7 +62,19 @@ def test_ep2_matches_ep1(unused_tcp_port):
6262
if result.returncode != 0:
6363
print(f"STDOUT:\n{result.stdout}")
6464
print(f"STDERR:\n{result.stderr}")
65-
pytest.fail(f"EP equivalence test failed with exit code {result.returncode}")
65+
pytest.fail(f"EP {test_name} test failed with exit code {result.returncode}")
66+
67+
68+
@requires_multi_gpu
def test_ep2_matches_ep1(unused_tcp_port):
    """EP=2 forward pass must reproduce the EP=1 logits (run via torchrun)."""
    worker = "forward"
    _run_torchrun(worker, unused_tcp_port)
72+
73+
74+
@requires_multi_gpu
def test_ep2_backward_matches_ep1(unused_tcp_port):
    """EP=2 backward pass must reproduce the EP=1 gradients (run via torchrun)."""
    worker = "backward"
    _run_torchrun(worker, unused_tcp_port)
6678

6779

6880
# ---------------------------------------------------------------------------
@@ -85,18 +97,22 @@ def _distribute_state_dict(full_state_dict: dict, model: torch.nn.Module, device
8597
from torch.distributed.tensor import DTensor, distribute_tensor
8698

8799
distributed_state: dict = {}
88-
for key, value in model.state_dict().items():
89-
if key not in full_state_dict:
100+
# model.state_dict() filters _extra_state keys via the NVMixtralPreTrainedModel
101+
# override, so use nn.Module.state_dict to get the unfiltered dict that includes
102+
# TransformerEngine _extra_state entries required by load_state_dict(strict=True).
103+
for key, value in torch.nn.Module.state_dict(model).items():
104+
if key.endswith("_extra_state"):
105+
distributed_state[key] = value
106+
elif key not in full_state_dict:
90107
continue
91-
full_value = full_state_dict[key]
92-
if isinstance(value, DTensor):
108+
elif isinstance(value, DTensor):
93109
distributed_state[key] = distribute_tensor(
94-
full_value.to(device),
110+
full_state_dict[key].to(device),
95111
value.device_mesh,
96112
list(value.placements),
97113
)
98114
else:
99-
distributed_state[key] = full_value
115+
distributed_state[key] = full_state_dict[key]
100116
return distributed_state
101117

102118

@@ -183,5 +199,95 @@ def _run_ep_equivalence_test():
183199
torch.distributed.destroy_process_group()
184200

185201

202+
def _run_ep_backward_test() -> None:
    """Worker function for the EP backward equivalence test.

    1. Init distributed (one process per GPU; EP size = world size).
    2. Every rank creates an EP=1 model, runs forward+backward, saves gradients.
    3. Create EP=2 model with sharded weights, runs forward+backward, compares gradients.
    """
    from torch.distributed.tensor.device_mesh import DeviceMesh

    dist_config = DistributedConfig()
    device = torch.device(f"cuda:{dist_config.local_rank}")
    torch.cuda.set_device(device)
    torch.distributed.init_process_group(backend="nccl", device_id=device)
    ep_size = dist_config.world_size

    # --- Phase 1: EP=1 reference (every rank computes independently) ---
    config_ep1 = create_small_mixtral_config(expert_parallel_size=1)
    # Same seed on every rank so all ranks build identical reference weights.
    torch.manual_seed(0)
    model_ep1 = NVMixtralForCausalLM(config_ep1).to(dtype=torch.bfloat16, device=device)

    batch = get_dummy_batch(config_ep1.vocab_size, seq_len=32, batch_size=2, device=device)

    outputs_ep1 = model_ep1(**batch)
    outputs_ep1.loss.backward()

    # Snapshot gradients and loss on CPU so the model can be freed below.
    ref_grads = {name: p.grad.detach().clone().cpu() for name, p in model_ep1.named_parameters() if p.grad is not None}
    loss_ep1 = outputs_ep1.loss.detach().clone().cpu()

    full_state_dict = {k: v.clone().cpu() for k, v in model_ep1.state_dict().items()}
    # Free the reference model before building the EP model to limit peak GPU memory.
    del model_ep1, outputs_ep1
    torch.cuda.empty_cache()

    # --- Phase 2: EP=2 distributed run ---
    config_ep2 = create_small_mixtral_config(expert_parallel_size=ep_size)
    # Re-seed so any seed-dependent initialization matches the reference run.
    torch.manual_seed(0)
    model_ep2 = NVMixtralForCausalLM(config_ep2).to(dtype=torch.bfloat16, device=device)

    ep_mesh = DeviceMesh("cuda", list(range(ep_size)))
    ep_group = ep_mesh.get_group()
    model_ep2.model.set_ep_groups(ep_group, ep_mesh)

    # Shard the EP=1 weights onto the EP mesh and load them exactly (strict).
    distributed_state = _distribute_state_dict(full_state_dict, model_ep2, device)
    model_ep2.load_state_dict(distributed_state, strict=True)

    batch_cuda = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

    outputs_ep2 = model_ep2(**batch_cuda)
    outputs_ep2.loss.backward()

    test_grads = {}
    for name, p in model_ep2.named_parameters():
        if p.grad is not None:
            g = p.grad
            # Sharded (DTensor) gradients expose full_tensor(); gather them so
            # they are comparable against the unsharded reference gradients.
            if hasattr(g, "full_tensor"):
                g = g.full_tensor()
            test_grads[name] = g.detach().clone().cpu()
    loss_ep2 = outputs_ep2.loss.detach().clone().cpu()

    # --- Phase 3: Compare on rank 0 ---
    if dist_config.is_main_process():
        torch.testing.assert_close(
            loss_ep2,
            loss_ep1,
            atol=1e-3,
            rtol=1e-3,
            msg="EP=2 backward: loss does not match EP=1 loss",
        )

        # All EP=1 parameters should have gradients in EP=2 as well
        for name in ref_grads:
            assert name in test_grads, f"EP=2 model missing gradient for {name}"

        # Loose tolerances: presumably to absorb bfloat16 accumulation-order
        # differences between the EP=1 and EP=2 paths — TODO confirm.
        for name in ref_grads:
            torch.testing.assert_close(
                test_grads[name],
                ref_grads[name],
                atol=3e-2,
                rtol=3e-2,
                msg=f"EP=2 gradient mismatch for {name}",
            )

        print("EP backward test PASSED: EP=2 gradients match EP=1")

    torch.distributed.destroy_process_group()
286+
287+
186288
if __name__ == "__main__":
    # torchrun re-executes this file on each rank; the first CLI argument
    # selects which worker to run (defaulting to the forward-equivalence test).
    mode = sys.argv[1] if len(sys.argv) > 1 else "forward"
    if mode == "backward":
        _run_ep_backward_test()
    else:
        _run_ep_equivalence_test()

bionemo-recipes/models/mixtral/tests/test_fsdp_ep.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,18 +62,22 @@ def _distribute_state_dict(full_state_dict: dict, model: torch.nn.Module, device
6262
from torch.distributed.tensor import DTensor, distribute_tensor
6363

6464
distributed_state: dict = {}
65-
for key, value in model.state_dict().items():
66-
if key not in full_state_dict:
65+
# model.state_dict() filters _extra_state keys via the NVMixtralPreTrainedModel
66+
# override, so use nn.Module.state_dict to get the unfiltered dict that includes
67+
# TransformerEngine _extra_state entries required by load_state_dict(strict=True).
68+
for key, value in torch.nn.Module.state_dict(model).items():
69+
if key.endswith("_extra_state"):
70+
distributed_state[key] = value
71+
elif key not in full_state_dict:
6772
continue
68-
full_value = full_state_dict[key]
69-
if isinstance(value, DTensor):
73+
elif isinstance(value, DTensor):
7074
distributed_state[key] = distribute_tensor(
71-
full_value.to(device),
75+
full_state_dict[key].to(device),
7276
value.device_mesh,
7377
list(value.placements),
7478
)
7579
else:
76-
distributed_state[key] = full_value
80+
distributed_state[key] = full_state_dict[key]
7781
return distributed_state
7882

7983

bionemo-recipes/models/mixtral/tests/test_hybrid_ep.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -332,14 +332,16 @@ def _run_backward_test():
332332

333333
assert len(ref_grads) > 0, "AllToAll model produced no gradients"
334334
assert len(test_grads) > 0, "FusedTokenRouter model produced no gradients"
335+
336+
# Both AllToAll (via _DifferentiableAllToAll) and FusedTokenRouter provide
337+
# differentiable dispatch/combine, so all parameters should have gradients.
335338
assert ref_grads.keys() == test_grads.keys(), (
336339
f"Gradient key mismatch: "
337-
f"only in ref={ref_grads.keys() - test_grads.keys()}, "
338-
f"only in test={test_grads.keys() - ref_grads.keys()}"
340+
f"AllToAll-only={ref_grads.keys() - test_grads.keys()}, "
341+
f"Fused-only={test_grads.keys() - ref_grads.keys()}"
339342
)
340343

341344
for name in ref_grads:
342-
assert name in test_grads, f"FusedTokenRouter missing gradient for parameter: {name}"
343345
torch.testing.assert_close(
344346
test_grads[name],
345347
ref_grads[name],

0 commit comments

Comments
 (0)