# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 16 | +"""Tests for expert parallelism (EP) in the Mixtral MoE model. |
| 17 | +
|
| 18 | +Verifies that running with EP=2 (experts sharded across 2 GPUs) produces |
| 19 | +the same logits and loss as EP=1 (all experts on a single GPU). |
| 20 | +""" |

import os
import subprocess
import sys
from dataclasses import dataclass, field
from pathlib import Path


sys.path.insert(0, str(Path(__file__).parent.parent))

import pytest
import torch

from modeling_mixtral_te import NVMixtralConfig, NVMixtralForCausalLM


requires_multi_gpu = pytest.mark.skipif(
    not torch.cuda.is_available() or torch.cuda.device_count() < 2,
    reason="Test requires at least 2 GPUs",
)

def _create_small_mixtral_config(**overrides) -> NVMixtralConfig:
    """Create a small Mixtral config suitable for testing."""
    defaults = {
        "hidden_size": 128,
        "intermediate_size": 256,
        "num_hidden_layers": 2,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "num_local_experts": 4,
        "num_experts_per_tok": 2,
        "max_position_embeddings": 128,
        "vocab_size": 1000,
        "attn_input_format": "bshd",
        "self_attn_mask_type": "causal",
        "router_jitter_noise": 0.0,
    }
    defaults.update(overrides)
    return NVMixtralConfig(**defaults)

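# num_local_experts=4 divides evenly across the 2 EP ranks used below, so each rank
# hosts exactly 2 experts; router_jitter_noise=0.0 keeps routing deterministic so the
# EP=1 and EP=2 runs see identical expert assignments.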

def _get_dummy_batch(vocab_size: int, seq_len: int = 32, batch_size: int = 2, device: str = "cuda"):
    """Create a simple dummy batch for testing."""
    torch.manual_seed(42)
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
    attention_mask = torch.ones_like(input_ids)
    labels = input_ids.clone()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

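# The fixed seed means every rank builds an identical batch, which the EP=2 phase
# relies on: inputs are replicated across ranks while only the experts are sharded.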

@requires_multi_gpu
def test_ep2_matches_ep1(unused_tcp_port):
    """Test that EP=2 produces the same logits and loss as EP=1."""
    cmd = [
        "torchrun",
        "--nproc_per_node=2",
        "--rdzv-backend=c10d",
        f"--rdzv-endpoint=localhost:{unused_tcp_port}",
        os.path.relpath(__file__),
    ]
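    # The rendezvous port comes from the unused_tcp_port fixture (assumed to be
    # provided by the suite's conftest or pytest-asyncio), so concurrent test runs
    # don't collide on a fixed port.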
    result = subprocess.run(
        cmd,
        check=False,
        text=True,
        cwd=str(Path(__file__).parent.parent),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=300,
    )
    if result.returncode != 0:
        print(f"STDOUT:\n{result.stdout}")
        print(f"STDERR:\n{result.stderr}")
        pytest.fail(f"EP equivalence test failed with exit code {result.returncode}")


# ---------------------------------------------------------------------------
# Distributed worker executed via torchrun
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class DistributedConfig:
    """Distributed environment configuration."""

    rank: int = field(default_factory=lambda: int(os.environ.setdefault("RANK", "0")))
    local_rank: int = field(default_factory=lambda: int(os.environ.setdefault("LOCAL_RANK", "0")))
    world_size: int = field(default_factory=lambda: int(os.environ.setdefault("WORLD_SIZE", "1")))
    _master_addr: str = field(default_factory=lambda: os.environ.setdefault("MASTER_ADDR", "localhost"))
    _master_port: str = field(default_factory=lambda: os.environ.setdefault("MASTER_PORT", "12355"))
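    # torchrun injects RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT into
    # the environment; the setdefault fallbacks let this dataclass also be constructed
    # in a plain single-process run.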

    def is_main_process(self) -> bool:
        """Return True if this is the global rank 0 process."""
        return self.rank == 0


def _shard_expert_weights(full_state_dict: dict, ep_rank: int, ep_size: int, num_experts: int) -> dict:
    """Shard expert weights from a full (EP=1) state dict for a given EP rank.

    Expert weight keys follow the TE GroupedLinear naming convention:
    ``...experts_gate_up.weight{i}`` and ``...experts_down.weight{i}``
    where ``i`` is the global expert index.

    For EP, each rank keeps only its local slice of experts and renumbers
    the weight keys starting from 0.

    Args:
        full_state_dict: Complete state dict from an EP=1 model (without _extra_state keys).
        ep_rank: This rank's index in the EP group.
        ep_size: Total number of EP ranks.
        num_experts: Total number of experts (before sharding).
    """
    experts_per_rank = num_experts // ep_size
    start_expert = ep_rank * experts_per_rank
    end_expert = start_expert + experts_per_rank

    new_state_dict = {}
    for key, value in full_state_dict.items():
        if "experts_gate_up.weight" in key or "experts_down.weight" in key:
            # Extract global expert index from key like "...weight3"
            prefix, weight_part = key.rsplit("weight", 1)
            global_idx = int(weight_part)
            if start_expert <= global_idx < end_expert:
                local_idx = global_idx - start_expert
                new_key = f"{prefix}weight{local_idx}"
                new_state_dict[new_key] = value
        else:
            # Non-expert weights are replicated
            new_state_dict[key] = value

    return new_state_dict

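# Illustration of the renumbering above: with num_experts=4 and ep_size=2, rank 1
# keeps global experts 2 and 3, so (for a hypothetical key prefix)
#   "...experts_gate_up.weight2" -> "...experts_gate_up.weight0"
#   "...experts_down.weight3"    -> "...experts_down.weight1"
# while attention, router, and embedding weights are copied through unchanged.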

def _run_ep_equivalence_test():
    """Main worker function for the EP equivalence test.

    1. Set up each rank's device, init distributed.
    2. Every rank creates an EP=1 model on its own GPU, runs forward, saves reference.
    3. Create EP=2 model with sharded expert weights, run forward, compare.
    """
    # --- Setup distributed first so each rank uses its own GPU ---
    dist_config = DistributedConfig()
    device = torch.device(f"cuda:{dist_config.local_rank}")
    torch.cuda.set_device(device)
    torch.distributed.init_process_group(backend="nccl", device_id=device)
    ep_rank = dist_config.rank
    ep_size = dist_config.world_size

    # --- Phase 1: EP=1 reference (every rank computes independently) ---
    config_ep1 = _create_small_mixtral_config(expert_parallel_size=1)
    torch.manual_seed(0)
    model_ep1 = NVMixtralForCausalLM(config_ep1).to(dtype=torch.bfloat16, device=device)
    model_ep1.eval()

    batch = _get_dummy_batch(config_ep1.vocab_size, seq_len=32, batch_size=2, device=device)

    with torch.no_grad():
        outputs_ep1 = model_ep1(**batch)

    logits_ep1 = outputs_ep1.logits.detach().clone().cpu()
    loss_ep1 = outputs_ep1.loss.detach().clone().cpu()

    # Save EP=1 full state dict on CPU for sharding
    full_state_dict = {k: v.clone().cpu() for k, v in model_ep1.state_dict().items()}

    del model_ep1, outputs_ep1
    torch.cuda.empty_cache()

    # --- Phase 2: EP=2 distributed run ---
    config_ep2 = _create_small_mixtral_config(expert_parallel_size=ep_size)
    torch.manual_seed(0)
    model_ep2 = NVMixtralForCausalLM(config_ep2).to(dtype=torch.bfloat16, device=device)

    # Load sharded expert weights (strict=False to skip TE _extra_state keys)
    sharded_state_dict = _shard_expert_weights(full_state_dict, ep_rank, ep_size, config_ep1.num_local_experts)
    model_ep2.load_state_dict(sharded_state_dict, strict=False)
    model_ep2.eval()

    # Set EP process group on all MoE blocks
    ep_group = torch.distributed.group.WORLD
    model_ep2.model.set_ep_groups(ep_group)
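    # With world_size == ep_size == 2, the default WORLD group doubles as the EP group;
    # a larger run would presumably need dedicated EP subgroups here instead.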

    # Same batch on all ranks (EP dispatches tokens, input is replicated)
    batch_cuda = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

    with torch.no_grad():
        outputs_ep2 = model_ep2(**batch_cuda)

    logits_ep2 = outputs_ep2.logits.detach().cpu()
    loss_ep2 = outputs_ep2.loss.detach().cpu()

    # --- Phase 3: Compare on rank 0 ---
    if dist_config.is_main_process():
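        # Tolerances are fairly loose: the run is in bfloat16, and (presumably) the
        # EP=2 path accumulates expert outputs in a different order than EP=1.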
        torch.testing.assert_close(
            logits_ep2,
            logits_ep1,
            atol=1e-2,
            rtol=1e-2,
            msg="EP=2 logits do not match EP=1 logits",
        )

        torch.testing.assert_close(
            loss_ep2,
            loss_ep1,
            atol=1e-3,
            rtol=1e-3,
            msg="EP=2 loss does not match EP=1 loss",
        )

        print("EP equivalence test PASSED: EP=2 logits and loss match EP=1")

    torch.distributed.destroy_process_group()


if __name__ == "__main__":
    _run_ep_equivalence_test()