fix(moe): address Copilot review comments for stacked MoE fast path

Aegis-AI · Aegis-AI · commit 6a57d00da7df · 2026-04-26T19:49:14.000-07:00
- Compute and use a separate `_stackedDownBytesPerExpert` for the down
  projection so its pread offsets no longer reuse the gate/up stride.
- Fix docstring for `computeExpertsFused` to refer generically to
  outputDims/inputDims instead of intermediate/hidden.
- Add `StackedMoETests.swift` unit test to verify the fast path cleanly
  falls back without crashing when enabled on non-quantized models.
diff --git a/Libraries/MLXLMCommon/SwitchLayers.swift b/Libraries/MLXLMCommon/SwitchLayers.swift
@@ -102,6 +102,7 @@ public class SwitchGLU: Module, @unchecked Sendable {
     private var _tokenCounter: Int = 0
     // Bytes per expert slab in a stacked buffer; computed once on cold init.
     private var _stackedBytesPerExpert: Int = 0
+    private var _stackedDownBytesPerExpert: Int = 0
 
     // ── Fused gate+up SwiGLU mode (env-gated MLX_MOE_FUSE_GATEUP=1) ──
     // SwiGLU MLP is `silu(gate(x)) * up(x)`; gate and up are independent
@@ -204,7 +205,8 @@ public class SwitchGLU: Module, @unchecked Sendable {
                 if let cb = _combinedGateUpBiases { coldEvalList.append(cb) }
                 MLX.eval(coldEvalList)
                 _stackedGateUpBytesPerProj = _stackedGateUp!.nbytes / CACHE_SLOTS / 2
-                _stackedBytesPerExpert = _stackedGateUpBytesPerProj  // shared with down
+                _stackedBytesPerExpert = _stackedGateUpBytesPerProj
+                _stackedDownBytesPerExpert = _stackedDown!.nbytes / CACHE_SLOTS
             } else {
                 _stackedGate = MLXArray.zeros(
                     [CACHE_SLOTS, qGate.weight.dim(1), qGate.weight.dim(2)]
@@ -220,6 +222,7 @@ public class SwitchGLU: Module, @unchecked Sendable {
                 _tokenCounter = 0
                 MLX.eval([idx, _stackedGate!, _stackedUp!, _stackedDown!])
                 _stackedBytesPerExpert = _stackedGate!.nbytes / CACHE_SLOTS
+                _stackedDownBytesPerExpert = _stackedDown!.nbytes / CACHE_SLOTS
             }
         } else {
             // Warm path: kick off GPU work asynchronously while we
@@ -268,6 +271,7 @@ public class SwitchGLU: Module, @unchecked Sendable {
         }
         if !specTargets.isEmpty {
             let bpe = _stackedBytesPerExpert
+            let downBpe = _stackedDownBytesPerExpert
             DispatchQueue.concurrentPerform(iterations: specTargets.count * 3) { [specTargets] i in
                 let mIdx = i / 3
                 let proj = i % 3
@@ -295,7 +299,7 @@ public class SwitchGLU: Module, @unchecked Sendable {
                     }
                 default:
                     MLXFast.preadIntoOffset(self._stackedDown!, safetensorsPath: downSSD.path,
-                                            tensorName: downSSD.tensorName, expertIndex: UInt32(info.expertId), dstOffset: info.slot * bpe)
+                                            tensorName: downSSD.tensorName, expertIndex: UInt32(info.expertId), dstOffset: info.slot * downBpe)
                 }
             }
         }
@@ -367,6 +371,7 @@ public class SwitchGLU: Module, @unchecked Sendable {
         // ── Pread misses into stacked-buffer slots ──
         if !missesNeedingPread.isEmpty {
             let bpe = _stackedBytesPerExpert
+            let downBpe = _stackedDownBytesPerExpert
             DispatchQueue.concurrentPerform(iterations: missesNeedingPread.count * 3) { [missesNeedingPread] i in
                 let mIdx = i / 3
                 let proj = i % 3
@@ -392,7 +397,7 @@ public class SwitchGLU: Module, @unchecked Sendable {
                     }
                 default:
                     MLXFast.preadIntoOffset(self._stackedDown!, safetensorsPath: downSSD.path,
-                                            tensorName: downSSD.tensorName, expertIndex: UInt32(info.expertId), dstOffset: info.slot * bpe)
+                                            tensorName: downSSD.tensorName, expertIndex: UInt32(info.expertId), dstOffset: info.slot * downBpe)
                 }
             }
         }
@@ -1183,8 +1188,8 @@ public class QuantizedSwitchLinear: SwitchLinear, Quantized {
     /// single dispatch over the full stacked weight buffer.
     ///
     /// - Parameters:
-    ///   - x: input activations, shape `[totalTokens, ..., hidden]`.
-    ///   - stackedBuffer: weight buffer, shape `[CACHE_SLOTS, intermediate, hidden]`.
+    ///   - x: input activations, shape `[totalTokens, ..., inputDims]`.
+    ///   - stackedBuffer: weight buffer, shape `[CACHE_SLOTS, outputDims, inputDims]`.
     ///       Slots are populated externally via `MLXFast.preadIntoOffset`.
     ///   - slotPerToken: uint32 array mapping each token (along axis 0 of `x`)
     ///       to a slot index in `stackedBuffer`. Built from the routing.
@@ -1198,7 +1203,7 @@ public class QuantizedSwitchLinear: SwitchLinear, Quantized {
     ) -> MLXArray {
         let slotExpertsMLX = MLXArray(slotExperts).asType(.uint32)
         // Gather scales/biases for the experts currently in our slots.
-        // Result shape: [N_slots, intermediate, hidden / groupSize].
+        // Result shape: [N_slots, outputDims, inputDims / groupSize].
         let stackedScales = MLX.take(self.scales, slotExpertsMLX, axis: 0)
         var stackedBiases: MLXArray? = nil
         if let b = self.biases { stackedBiases = MLX.take(b, slotExpertsMLX, axis: 0) }
@@ -1214,8 +1219,8 @@ public class QuantizedSwitchLinear: SwitchLinear, Quantized {
 
         // Optional per-token bias add (gathered from per-slot bias).
         if let bias = self.bias {
-            let stackedBias = MLX.take(bias, slotExpertsMLX, axis: 0)             // [N_slots, intermediate]
-            let perTokenBias = MLX.take(stackedBias, slotPerToken, axis: 0)       // [tokens, intermediate]
+            let stackedBias = MLX.take(bias, slotExpertsMLX, axis: 0)             // [N_slots, outputDims]
+            let perTokenBias = MLX.take(stackedBias, slotPerToken, axis: 0)       // [tokens, outputDims]
             output = output + MLX.expandedDimensions(perTokenBias, axis: -2)
         }
 
diff --git a/Tests/MLXLMTests/StackedMoETests.swift b/Tests/MLXLMTests/StackedMoETests.swift
@@ -0,0 +1,71 @@
+import Foundation
+import MLX
+import MLXLLM
+import MLXLMCommon
+import MLXNN
+import Testing
+
+/// Tests for the env-gated stacked MoE fast path (`MLX_MOE_STACKED`, `MLX_MOE_FUSE_GATEUP`).
+@Suite
+struct StackedMoETests {
+    /// Returns UTF-8 JSON for a minimal Gemma 4 text MoE configuration (4 experts, top-2 routing).
+    private func makeTinyTextMoEConfigData() -> Data {
+        let json = """
+        {
+            "model_type": "gemma4_text",
+            "hidden_size": 64,
+            "num_hidden_layers": 2,
+            "intermediate_size": 128,
+            "num_attention_heads": 4,
+            "head_dim": 16,
+            "global_head_dim": 64,
+            "rms_norm_eps": 1e-6,
+            "vocab_size": 100,
+            "num_key_value_heads": 2,
+            "rope_traditional": false,
+            "sliding_window": 128,
+            "sliding_window_pattern": 1,
+            "max_position_embeddings": 512,
+            "num_kv_shared_layers": 0,
+            "use_double_wide_mlp": false,
+            "tie_word_embeddings": true,
+            "hidden_size_per_layer_input": 32,
+            "vocab_size_per_layer_input": 10,
+            "final_logit_softcapping": 30.0,
+            "enable_moe_block": true,
+            "num_experts": 4,
+            "top_k_experts": 2,
+            "moe_intermediate_size": 128,
+            "attention_k_eq_v": false
+        }
+        """
+        return Data(json.utf8)
+    }
+
+    /// Runs one forward pass with both fast-path flags set; unquantized weights must fall back cleanly.
+    @Test("Stacked MoE fast path falls back for non-quantized models")
+    func testStackedMoEFallback() throws {
+        // Env vars are process-wide and tests run concurrently, so other tests may see these flags if
+        // SwitchGLU is first initialized while they are set; that is fine since the fallback is safe.
+        let flags = ["MLX_MOE_STACKED", "MLX_MOE_FUSE_GATEUP"]
+        let saved = flags.map { name in getenv(name).map { String(cString: $0) } }  // restored on exit
+        for name in flags { setenv(name, "1", 1) }
+        defer {
+            for (name, old) in zip(flags, saved) {
+                if let old { setenv(name, old, 1) } else { unsetenv(name) }
+            }
+        }
+
+        let data = makeTinyTextMoEConfigData()
+        let config = try JSONDecoder().decode(Gemma4TextConfiguration.self, from: data)
+        let model = Gemma4TextModel(config)
+
+        // Weights are plain (non-quantized) MLXArrays, so the fast path must fall back cleanly.
+        let input = MLXArray(0..<8).reshaped(1, 8)
+        let output = model(input, cache: nil)
+        #expect(output.shape == [1, 8, model.vocabularySize])
+
+        let sum = output.sum().item(Float.self)
+        #expect(sum.isFinite)
+    }
+}