
Commit 62c70c7

fix: recompute locations in topkgating after the no-drop capacity update; add a matching drop_tokens=False case to test_moe

1 parent 1ca83a6 commit 62c70c7
File tree: 2 files changed (+11 −0)


deepspeed/moe/sharded_moe.py (+1)

@@ -429,6 +429,7 @@ def topkgating(
         tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu)
         new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype)
         capacity = new_capacity
+        locations = torch.cumsum(mask, dim=0) - 1
 
     # normalize gates
     gates_masked = gates * mask
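For context on the one-line fix: `torch.cumsum(mask, dim=0) - 1` converts a token-to-expert routing mask into per-expert slot indices, so recomputing it after the no-drop capacity update keeps each token's position consistent with the mask. A minimal standalone sketch of that location rule (the mask below is an illustrative top-1 example, not DeepSpeed's own data):

import torch

# Illustrative routing mask (assumed shape, not from DeepSpeed):
# mask[s, e] == 1 when token s is routed to expert e; 4 tokens, 3 experts.
mask = torch.tensor([[1, 0, 0],
                     [1, 0, 0],
                     [0, 1, 0],
                     [1, 0, 0]])

# Running per-expert count along the token dimension, minus one, gives each
# routed token a 0-based slot in that expert's capacity buffer.
locations = torch.cumsum(mask, dim=0) - 1
print(locations * mask)
# Expert 0 receives tokens 0, 1, 3 at slots 0, 1, 2; expert 1 gets token 2 at slot 0.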

tests/unit/moe/test_moe.py (+10)

@@ -242,6 +242,15 @@ def check_equal(logits, cap, sparse_truth, res):
         check_equal(logits2, 2, position_sec_sparse, position_dispatch_res)
 
 
+        #s=4 e=4 topk=2 drop_tokens=False
+        logits3 = torch.tensor([[0.95, 0.85, 0.90, 0.80], [0.70, 0.65, 0.75, 0.60], [0.50, 0.55, 0.45, 0.40],
+                                [0.35, 0.30, 0.25, 0.20]])
+        logits3 *= dist.get_rank() + 1
+        dispatch_res = topkgating(logits3, 2, 1, min_capacity=1, drop_tokens=False)[2]
+        sec_sparse = torch.tensor([[0, 0, 0], [0, 2, 0], [1, 0, 1], [1, 2, 1], [2, 0, 2], [2, 1, 0], [3, 0, 3],
+                                   [3, 1, 1]])
+        check_equal(logits3, 4, sec_sparse, dispatch_res)
+
 class TestExpertWeightGradWithZero(DistributedTest):
     world_size = 2

@@ -351,3 +360,4 @@ def _get_weight_bias(experts):
         assert len(expert_weight_grad_ep1) == len(expert_weight_grad_ep2)
         for grad_from_ep1, grad_from_ep2 in zip(expert_weight_grad_ep1, expert_weight_grad_ep2):
             assert torch.allclose(grad_from_ep1, grad_from_ep2, atol=0, rtol=1e-4)
+
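A quick cross-check of the new expectation, assuming each `sec_sparse` row is a (token, expert, slot) triple: multiplying `logits3` by `dist.get_rank() + 1` is positive and order-preserving, so the top-2 choices depend only on the base values, and the expected slots then follow from the cumulative-sum location rule added in this commit. A standalone sketch (plain `torch.topk`, not the test's own helpers) that reproduces the eight triples:

import torch

logits3 = torch.tensor([[0.95, 0.85, 0.90, 0.80],
                        [0.70, 0.65, 0.75, 0.60],
                        [0.50, 0.55, 0.45, 0.40],
                        [0.35, 0.30, 0.25, 0.20]])

# Top-2 expert indices per token: [[0, 2], [2, 0], [1, 0], [0, 1]].
top2 = torch.topk(logits3, k=2, dim=1).indices

# One-hot (tokens x experts) routing mask, then per-expert running slot counts.
mask = torch.zeros_like(logits3, dtype=torch.long).scatter_(1, top2, 1)
locations = torch.cumsum(mask, dim=0) - 1

for s in range(mask.size(0)):
    for e in range(mask.size(1)):
        if mask[s, e]:
            print(s, e, locations[s, e].item())
# -> (0,0,0) (0,2,0) (1,0,1) (1,2,1) (2,0,2) (2,1,0) (3,0,3) (3,1,1),
# matching sec_sparse; expert 0 fills four slots, which would explain the
# cap=4 passed to check_equal.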
