Commit 119be1f

Add CP annotation to sparse_matmul

Parent: e67180e

2 files changed: +56 −14 lines

MaxText/layers/moe.py

Lines changed: 5 additions & 5 deletions
@@ -571,7 +571,7 @@ def gmm(inputs, kernel, group_sizes, expert_assignments):
       output = output[: hs_shape[0]]
       return output
 
-    # Currently, we only support data and tensor parallelism with Megablox.
+    # Currently, we support data, tensor, and expert parallelism with Megablox.
     # We all gather the input activations over tensor parallelism to follow strategy
     # in https://parsa.epfl.ch/course-info/cs723/papers/Megatron.pdf.
 
@@ -589,10 +589,10 @@ def gmm(inputs, kernel, group_sizes, expert_assignments):
     else:
      batch_logical_axis = "activation_batch_no_exp"
 
-    input_partition_pspec = nn.logical_to_mesh_axes((batch_logical_axis, None, None))
-    gate_logits_pspec = nn.logical_to_mesh_axes((batch_logical_axis, None, None))
+    input_partition_pspec = nn.logical_to_mesh_axes((batch_logical_axis, "activation_length", None))
+    gate_logits_pspec = nn.logical_to_mesh_axes((batch_logical_axis, "activation_length", None))
     if self.config.model_name.startswith("deepseek3"):
-      pre_bias_logits_pspec = nn.logical_to_mesh_axes((batch_logical_axis, None, None))
+      pre_bias_logits_pspec = nn.logical_to_mesh_axes((batch_logical_axis, "activation_length", None))
     else:
      # pre_bias_logits is None for non-DeepSeek v3 models
      pre_bias_logits_pspec = None
@@ -610,7 +610,7 @@ def gmm(inputs, kernel, group_sizes, expert_assignments):
         shard_map.shard_map,
         mesh=self.mesh,
         in_specs=(input_partition_pspec, gate_logits_pspec, pre_bias_logits_pspec, w0_pspec, w1_pspec, wo_pspec),
-        out_specs=(nn.logical_to_mesh_axes((batch_logical_axis, None, "activation_embed"))),
+        out_specs=(nn.logical_to_mesh_axes((batch_logical_axis, "activation_length", "activation_embed"))),
         check_rep=False,
     )
     def wrapper(x, logits, pre_bias_logits, w0, w1, wo):
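
Note: the substance of the moe.py change is that the second (sequence-length) dimension of the input, gate-logit, and output partition specs, previously None (replicated), is now bound to the "activation_length" logical axis, so a context-parallel mesh axis (the "CP" in the commit title) can shard it. A minimal sketch of how such a logical spec resolves to a mesh PartitionSpec under Flax axis rules; the rule table below is illustrative, not MaxText's actual logical_axis_rules:

from flax import linen as nn
from flax.linen import partitioning as nn_partitioning

# Hypothetical logical-to-mesh rules for illustration only.
rules = (
    ("activation_batch_no_exp", "data"),  # batch dim -> data-parallel mesh axis
    ("activation_length", "context"),     # sequence dim -> context-parallel mesh axis
    ("activation_embed", "tensor"),       # embed dim -> tensor-parallel mesh axis
)

with nn_partitioning.axis_rules(rules):
  # Before this commit the second entry was None, so the sequence dim was replicated:
  old_spec = nn.logical_to_mesh_axes(("activation_batch_no_exp", None, None))
  # After: the sequence dimension is sharded over the context mesh axis.
  new_spec = nn.logical_to_mesh_axes(("activation_batch_no_exp", "activation_length", None))
  print(old_spec)  # PartitionSpec('data', None, None)
  print(new_spec)  # PartitionSpec('data', 'context', None)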

MaxText/tests/moe_test.py

Lines changed: 51 additions & 9 deletions
@@ -348,13 +348,16 @@ def test_megablox(self):
         dtype="bfloat16",
         megablox=True,
         sparse_matmul=True,
-        per_device_batch_size=4,
+        per_device_batch_size=1,
     )
 
     rng = jax.random.PRNGKey(1234)
     rng_model, rng_hidden_states = jax.random.split(rng)
+    device_count = jax.device_count()
     hidden_states = jax.random.uniform(
-        rng_hidden_states, (int(cfg.per_device_batch_size), cfg.max_target_length, cfg.base_emb_dim), dtype=cfg.dtype
+        rng_hidden_states,
+        (int(cfg.per_device_batch_size) * device_count, cfg.max_target_length, cfg.base_emb_dim),
+        dtype=cfg.dtype,
     )
 
     devices_array = maxtext_utils.create_device_mesh(cfg)
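
Note: the same fix repeats across the tests below. The hidden_states batch dimension is now the global batch, per_device_batch_size times the device count, so the mesh can shard it evenly across all devices rather than only on a single-device run. A small sketch of the invariant, with illustrative values:

import jax

# Illustrative only; names mirror the test code above.
per_device_batch_size = 1
device_count = jax.device_count()   # e.g. 4 on the 4-way meshes used here
global_batch = per_device_batch_size * device_count

# hidden_states is built with shape (global_batch, max_target_length, base_emb_dim),
# so sharding its leading dim leaves exactly per_device_batch_size rows per device.
assert global_batch % device_count == 0
print(global_batch // device_count)  # == per_device_batch_size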
@@ -373,13 +376,16 @@ def test_ragged_dot(self):
         dtype="bfloat16",
         megablox=False,
         sparse_matmul=True,
-        per_device_batch_size=4,
+        per_device_batch_size=1,
     )
 
     rng = jax.random.PRNGKey(1234)
     rng_model, rng_hidden_states = jax.random.split(rng)
+    device_count = jax.device_count()
     hidden_states = jax.random.uniform(
-        rng_hidden_states, (int(cfg.per_device_batch_size), cfg.max_target_length, cfg.base_emb_dim), dtype=cfg.dtype
+        rng_hidden_states,
+        (int(cfg.per_device_batch_size) * device_count, cfg.max_target_length, cfg.base_emb_dim),
+        dtype=cfg.dtype,
     )
 
     devices_array = maxtext_utils.create_device_mesh(cfg)
@@ -398,13 +404,16 @@ def test_dense(self):
         dtype="float32",
         megablox=False,
         sparse_matmul=False,
-        per_device_batch_size=4,
+        per_device_batch_size=1,
     )
 
     rng = jax.random.PRNGKey(2345)
     rng_model, rng_hidden_states = jax.random.split(rng)
+    device_count = jax.device_count()
     hidden_states = jax.random.uniform(
-        rng_hidden_states, (int(cfg.per_device_batch_size), cfg.max_target_length, cfg.base_emb_dim), dtype=cfg.dtype
+        rng_hidden_states,
+        (int(cfg.per_device_batch_size) * device_count, cfg.max_target_length, cfg.base_emb_dim),
+        dtype=cfg.dtype,
     )
 
     devices_array = maxtext_utils.create_device_mesh(cfg)
@@ -423,14 +432,47 @@ def test_megablox_expert_parallelism(self):
         dtype="bfloat16",
         megablox=True,
         sparse_matmul=True,
-        per_device_batch_size=4,
+        per_device_batch_size=1,
         ici_expert_parallelism=4,
     )
 
     rng = jax.random.PRNGKey(2345)
     rng_model, rng_hidden_states = jax.random.split(rng)
+    device_count = jax.device_count()
+    hidden_states = jax.random.uniform(
+        rng_hidden_states,
+        (int(cfg.per_device_batch_size) * device_count, cfg.max_target_length, cfg.base_emb_dim),
+        dtype=cfg.dtype,
+    )
+
+    devices_array = maxtext_utils.create_device_mesh(cfg)
+    mesh = Mesh(devices_array, cfg.mesh_axes)
+    with nn_partitioning.axis_rules(cfg.logical_axis_rules):
+      variables, expected_output = self.get_expected_output(rng_model, hidden_states, cfg)
+      actual_output, _ = self.get_moe_output(variables, hidden_states, cfg, mesh)
+      self.assertTrue(jax.numpy.allclose(expected_output, actual_output, rtol=1e-02, atol=1e-02, equal_nan=False))
+
+  @pytest.mark.tpu_only
+  def test_megablox_context_parallelism(self):
+    cfg = pyconfig.initialize(
+        [None, os.path.join(PKG_DIR, "configs", "base.yml")],
+        run_name="moe_block_megablox_cp_test",
+        enable_checkpointing=False,
+        model_name="mixtral-8x7b",
+        dtype="bfloat16",
+        megablox=True,
+        sparse_matmul=True,
+        per_device_batch_size=1,
+        ici_context_parallelism=4,
+    )
+
+    rng = jax.random.PRNGKey(2345)
+    rng_model, rng_hidden_states = jax.random.split(rng)
+    device_count = jax.device_count()
     hidden_states = jax.random.uniform(
-        rng_hidden_states, (int(cfg.per_device_batch_size), cfg.max_target_length, cfg.base_emb_dim), dtype=cfg.dtype
+        rng_hidden_states,
+        (int(cfg.per_device_batch_size) * device_count, cfg.max_target_length, cfg.base_emb_dim),
+        dtype=cfg.dtype,
+    )
 
     devices_array = maxtext_utils.create_device_mesh(cfg)
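
Note: the new test runs the same Mixtral 8x7B MoE block under ici_context_parallelism=4, i.e. with the sequence dimension split four ways. A toy shard_map, separate from MaxText's wrapper, showing what the in_specs/out_specs annotations mean for the per-device view (the mesh construction and mapped function here are hypothetical):

import functools
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, PartitionSpec as P
from jax.experimental import shard_map

# One-axis mesh over all local devices; "context" plays the role of the
# context-parallel axis. Illustrative setup, not MaxText's mesh.
mesh = Mesh(np.array(jax.devices()), ("context",))

@functools.partial(
    shard_map.shard_map,
    mesh=mesh,
    in_specs=P(None, "context", None),   # shard dim 1 (sequence) over "context"
    out_specs=P(None, "context", None),  # output stays sequence-sharded
)
def scale(x):
  # Per-device x has shape (batch, seq // jax.device_count(), embed).
  return x * 2.0

x = jnp.ones((2, 8 * jax.device_count(), 16))
y = scale(x)
assert y.shape == x.shape  # global shape is unchanged; work ran on sequence shards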
@@ -541,7 +583,7 @@ def test_local_permute_no_offset(self):
 
   def test_local_permute_offset(self):
     experts_per_group = 2
-    expert_groups = 4 # aka number of expert shards.
+    expert_groups = 4  # aka number of expert shards.
     num_experts = 8
 
     # Global group sizes for each of the 8 experts
