fix(mtp): Resolve speculative decoding memory collapse and expand MoE support

Aegis-AI · Aegis-AI · commit cadd98aa16c2 · 2026-05-06T22:57:51.000-07:00
- Force MLX evaluation of mtpLogits to prevent recursive compute graph explosion (OOM).
- Apply dynamic KV cache quantization to all MTP draft heads during rewinds.
- Optimize SwitchGLU SSD streaming with async pre-reads and fused buffers.
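- Add Qwen3.6-35B model definition and MTP speculation hooks.
- Dequantize FP8 block-wise checkpoints (weight + weight_scale_inv) eagerly in sanitize() and stack per-expert MoE weights.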
diff --git a/Libraries/MLXLLM/Models/Qwen35.swift b/Libraries/MLXLLM/Models/Qwen35.swift
@@ -832,6 +832,35 @@ public class Qwen35Model: Module, LLMModel, KVCacheDimensionProvider {
             sanitized[key] = value
         }
 
+        // FP8 block-wise dequantization for Qwen3.6-27B-FP8 (dense checkpoint).
+        // Official FP8 checkpoints ship each weight tensor alongside a
+        // "weight_scale_inv" tensor with shape [outFeatures/128, inFeatures/128].
+        // We dequantize eagerly here (dense model fits in 64 GB without lazy streaming).
+        var processed = [String: MLXArray]()
+        for (key, value) in sanitized {
+            if key.hasSuffix(".weight_scale_inv") {
+                let wKey = key.replacingOccurrences(of: "_scale_inv", with: "")
+                if let w = sanitized[wKey], processed[wKey] == nil {
+                    // Block-wise: scale_inv is [outBlocks, inBlocks], w is [outDim, inDim]
+                    // Swift MLX maps F8_E4M3 → uint8; fromFp8 gives the same signed
+                    // [-448,448] range that Python mx.load() produces automatically.
+                    let wFp: MLXArray = MLXFast.fromFp8(w, dtype: .bfloat16)
+                    let bs = 128
+                    let (m, n) = (wFp.dim(0), wFp.dim(1))
+                    let padBottom = (bs - m % bs) % bs
+                    let padSide   = (bs - n % bs) % bs
+                    var padded = MLX.padded(wFp, widths: [[0, padBottom], [0, padSide]])
+                    padded = padded.reshaped([(m + padBottom) / bs, bs, (n + padSide) / bs, bs])
+                    let scaled = padded * value[0..., .newAxis, 0..., .newAxis]
+                    let dequant = scaled.reshaped([m + padBottom, n + padSide])[0 ..< m, 0 ..< n]
+                    processed[wKey] = dequant.asType(.bfloat16)
+                }
+            } else if processed[key] == nil {
+                processed[key] = value
+            }
+        }
+        if !processed.isEmpty { sanitized = processed }
+
         return languageModel.sanitize(weights: sanitized)
     }
 }
diff --git a/Libraries/MLXLLM/Models/Qwen35MoE.swift b/Libraries/MLXLLM/Models/Qwen35MoE.swift
@@ -38,6 +38,10 @@ public struct Qwen35Configuration: Codable, Sendable {
 public class Qwen35MoEModel: Qwen35Model {
 
     override public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        // ── Step 1: FP8 handling overview (official Qwen3.6-35B-A3B-FP8 checkpoint) ──
+        // The FP8 release stores quantized weights alongside weight_scale_inv tensors.
+        // Expert weights are dequantized eagerly while being stacked (Steps 3–4); all other layers in Step 5.
+        // ── Step 2: Key remapping ──
         var newWeights = [String: MLXArray]()
         for (key, value) in weights {
             if key.hasPrefix("vision_tower") || key.hasPrefix("model.visual") {
@@ -53,45 +57,165 @@ public class Qwen35MoEModel: Qwen35Model {
             newWeights[key] = value
         }
 
+        // ── Step 3: MoE expert weight stacking (main layers) ──
+        // Format A: community 4-bit checkpoints ship a pre-stacked "gate_up_proj" → split into gate/up
+        // Format B: FP8/BF16 official checkpoints ship per-expert "experts.N.{gate,up,down}_proj" → stack
+        let nExperts = languageModel.configuration.numExperts
         for l in 0 ..< languageModel.configuration.hiddenLayers {
             let prefix = "language_model.model.layers.\(l).mlp"
+
+            // Format A
             let gateUpKey = "\(prefix).experts.gate_up_proj"
             if let gateUp = newWeights[gateUpKey] {
                 newWeights[gateUpKey] = nil
                 let mid = gateUp.dim(-2) / 2
-                newWeights["\(prefix).switch_mlp.gate_proj.weight"] =
-                    gateUp[.ellipsis, ..<mid, 0...]
-                newWeights["\(prefix).switch_mlp.up_proj.weight"] =
-                    gateUp[.ellipsis, mid..., 0...]
-                if let downProj = newWeights["\(prefix).experts.down_proj"] {
+                newWeights["\(prefix).switch_mlp.gate_proj.weight"] = gateUp[.ellipsis, ..<mid, 0...]
+                newWeights["\(prefix).switch_mlp.up_proj.weight"]   = gateUp[.ellipsis, mid..., 0...]
+                if let dp = newWeights["\(prefix).experts.down_proj"] {
                     newWeights["\(prefix).experts.down_proj"] = nil
-                    newWeights["\(prefix).switch_mlp.down_proj.weight"] = downProj
+                    newWeights["\(prefix).switch_mlp.down_proj.weight"] = dp
+                }
+            }
+
+            // Format B
+            if newWeights["\(prefix).experts.0.gate_proj.weight"] != nil {
+                for projName in ["gate_proj", "up_proj", "down_proj"] {
+                    let perExpert = (0 ..< nExperts).compactMap {
+                        newWeights["\(prefix).experts.\($0).\(projName).weight"]
+                    }
+                    let perExpertScale = (0 ..< nExperts).compactMap {
+                        newWeights["\(prefix).experts.\($0).\(projName).weight_scale_inv"]
+                    }
+
+                    if perExpert.count == nExperts {
+                        if perExpertScale.count == nExperts {
+                            // FP8 checkpoint: eager per-expert dequant at load time.
+                            // Avoids re-running fromFp8 + block-scale on the full [nExperts,outDim,inDim]
+                            // stacked tensor on every forward pass (would be prohibitively slow).
+                            let bs = 128
+                            let dequanted: [MLXArray] = zip(perExpert, perExpertScale).map { w, inv in
+                                let wFp = MLXFast.fromFp8(w, dtype: .bfloat16)
+                                let (m, n) = (wFp.dim(0), wFp.dim(1))
+                                let padB = (bs - m % bs) % bs
+                                let padS = (bs - n % bs) % bs
+                                var p = MLX.padded(wFp, widths: [[0, padB], [0, padS]])
+                                p = p.reshaped([(m + padB) / bs, bs, (n + padS) / bs, bs])
+                                let scaled = p * inv[0..., .newAxis, 0..., .newAxis]
+                                return scaled.reshaped([m + padB, n + padS])[0 ..< m, 0 ..< n].asType(.bfloat16)
+                            }
+                            let stacked = MLX.stacked(dequanted)
+                            // Eagerly eval to pay the dequant cost at load time, not during prefill.
+                            // Without this, the entire lazy graph materializes on first forward pass.
+                            MLX.eval(stacked)
+                            newWeights["\(prefix).switch_mlp.\(projName).weight"] = stacked
+                            // Scale tensors consumed — do NOT store weight_scale_inv
+                            for i in 0 ..< nExperts {
+                                newWeights.removeValue(forKey: "\(prefix).experts.\(i).\(projName).weight")
+                                newWeights.removeValue(forKey: "\(prefix).experts.\(i).\(projName).weight_scale_inv")
+                            }
+                        } else {
+                            // BF16 checkpoint: stack as-is
+                            newWeights["\(prefix).switch_mlp.\(projName).weight"] = MLX.stacked(perExpert)
+                            for i in 0 ..< nExperts {
+                                newWeights.removeValue(forKey: "\(prefix).experts.\(i).\(projName).weight")
+                            }
+                        }
+                    }
                 }
             }
         }
-        
+
+        // ── Step 4: MoE expert weight stacking (MTP heads) ──
         for l in 0 ..< languageModel.configuration.numNextnPredictLayers {
             let prefixes = [
                 "language_model.mtp.\(l).layers.0.mlp",
-                "language_model.mtp.layers.0.mlp"
+                "language_model.mtp.layers.0.mlp",
+                "language_model.mtp.layers.\(l).mlp"
             ]
             for prefix in prefixes {
+                // Format A
                 let gateUpKey = "\(prefix).experts.gate_up_proj"
                 if let gateUp = newWeights[gateUpKey] {
                     newWeights[gateUpKey] = nil
                     let mid = gateUp.dim(-2) / 2
-                    newWeights["\(prefix).switch_mlp.gate_proj.weight"] =
-                        gateUp[.ellipsis, ..<mid, 0...]
-                    newWeights["\(prefix).switch_mlp.up_proj.weight"] =
-                        gateUp[.ellipsis, mid..., 0...]
-                    if let downProj = newWeights["\(prefix).experts.down_proj"] {
+                    newWeights["\(prefix).switch_mlp.gate_proj.weight"] = gateUp[.ellipsis, ..<mid, 0...]
+                    newWeights["\(prefix).switch_mlp.up_proj.weight"]   = gateUp[.ellipsis, mid..., 0...]
+                    if let dp = newWeights["\(prefix).experts.down_proj"] {
                         newWeights["\(prefix).experts.down_proj"] = nil
-                        newWeights["\(prefix).switch_mlp.down_proj.weight"] = downProj
+                        newWeights["\(prefix).switch_mlp.down_proj.weight"] = dp
+                    }
+                }
+
+                // Format B
+                if newWeights["\(prefix).experts.0.gate_proj.weight"] != nil {
+                    for projName in ["gate_proj", "up_proj", "down_proj"] {
+                        let perExpert = (0 ..< nExperts).compactMap {
+                            newWeights["\(prefix).experts.\($0).\(projName).weight"]
+                        }
+                        let perExpertScale = (0 ..< nExperts).compactMap {
+                            newWeights["\(prefix).experts.\($0).\(projName).weight_scale_inv"]
+                        }
+                        if perExpert.count == nExperts {
+                            if perExpertScale.count == nExperts {
+                                let bs = 128
+                                let dequanted: [MLXArray] = zip(perExpert, perExpertScale).map { w, inv in
+                                    let wFp = MLXFast.fromFp8(w, dtype: .bfloat16)
+                                    let (m, n) = (wFp.dim(0), wFp.dim(1))
+                                    let padB = (bs - m % bs) % bs; let padS = (bs - n % bs) % bs
+                                    var p = MLX.padded(wFp, widths: [[0, padB], [0, padS]])
+                                    p = p.reshaped([(m + padB) / bs, bs, (n + padS) / bs, bs])
+                                    return (p * inv[0..., .newAxis, 0..., .newAxis]).reshaped([m + padB, n + padS])[0 ..< m, 0 ..< n].asType(.bfloat16)
+                                }
+                                let stacked = MLX.stacked(dequanted)
+                                MLX.eval(stacked)
+                                newWeights["\(prefix).switch_mlp.\(projName).weight"] = stacked
+                                for i in 0 ..< nExperts {
+                                    newWeights.removeValue(forKey: "\(prefix).experts.\(i).\(projName).weight")
+                                    newWeights.removeValue(forKey: "\(prefix).experts.\(i).\(projName).weight_scale_inv")
+                                }
+                            } else {
+                                newWeights["\(prefix).switch_mlp.\(projName).weight"] = MLX.stacked(perExpert)
+                                for i in 0 ..< nExperts {
+                                    newWeights.removeValue(forKey: "\(prefix).experts.\(i).\(projName).weight")
+                                }
+                            }
+                        }
                     }
                 }
             }
         }
 
+        // ── Step 5: Eager FP8 block-wise dequantization for remaining non-expert Linear layers ──
+        // After Steps 3+4, the scale tensors of every complete FP8 expert set have been consumed during stacking.
+        // Any remaining "weight_scale_inv" keys are expected to belong to regular Linear layers
+        // (attention projections, shared_expert, GatedDeltaNet, lm_head, etc.).
+        // Linear has no parameter slot for weight_scale_inv, so we dequantize eagerly here and drop the scales.
+        var processed = [String: MLXArray]()
+        for (key, value) in newWeights {
+            if key.hasSuffix(".weight_scale_inv") {
+                let wKey = key.replacingOccurrences(of: "_scale_inv", with: "")
+                if let w = newWeights[wKey], processed[wKey] == nil {
+                    // Swift MLX maps F8_E4M3 → uint8; fromFp8 gives proper signed floats.
+                    let wFp: MLXArray = MLXFast.fromFp8(w, dtype: .bfloat16)
+                    let bs = 128
+                    let (m, n) = (wFp.dim(0), wFp.dim(1))
+                    let padBottom = (bs - m % bs) % bs
+                    let padSide   = (bs - n % bs) % bs
+                    var padded = MLX.padded(wFp, widths: [[0, padBottom], [0, padSide]])
+                    padded = padded.reshaped([(m + padBottom) / bs, bs, (n + padSide) / bs, bs])
+                    let scaled = padded * value[0..., .newAxis, 0..., .newAxis]
+                    let dequant = scaled.reshaped([m + padBottom, n + padSide])[0 ..< m, 0 ..< n]
+                    processed[wKey] = dequant.asType(.bfloat16)
+                }
+                // Drop the scale tensor — Linear has no slot for it.
+            } else if processed[key] == nil {
+                processed[key] = value
+            }
+        }
+        if !processed.isEmpty { newWeights = processed }
+
+
         return languageModel.sanitize(weights: newWeights)
     }
+
 }
diff --git a/Libraries/MLXLMCommon/Evaluate.swift b/Libraries/MLXLMCommon/Evaluate.swift
@@ -1126,8 +1126,19 @@ public struct MTPTokenIterator: TokenIteratorProtocol {
 
             // Save future MTP logits for next iteration
             self.mtpLogits = mtpResult.count > 1 ? Array(mtpResult.dropFirst()) : nil
-            
+
+            // Force evaluation of the sampled token and saved MTP logits so the lazy compute graph cannot keep growing across iterations (OOM).
+            var evalArrays = [token]
+            if let mtpLogits = self.mtpLogits { evalArrays.append(contentsOf: mtpLogits) }
+            eval(evalArrays)
+
+            pendingTokens.append(token.item(Int.self))
+            y = .init(tokens: token)
+
             quantizeKVCache(&cache)
+            for i in mtpCaches.indices {
+                quantizeKVCache(&mtpCaches[i])
+            }
             return
         }
 
@@ -1163,8 +1174,7 @@ public struct MTPTokenIterator: TokenIteratorProtocol {
             mainTokens = sampler.sample(logits: verifyLogits)
         }
 
-        // Compare and accept proposed tokens
-        eval(mainTokens, draftTokens)
+        // Explicit eval() is deferred to the end of this function so it also covers mtpLogits. NOTE(review): asArray() below likely still forces these tokens — confirm.
         let mainTokensList = mainTokens.asArray(Int.self)
         let draftTokensList = concatenated(draftTokens).asArray(Int.self)
         var accepted = 0
@@ -1191,6 +1201,9 @@ public struct MTPTokenIterator: TokenIteratorProtocol {
 
         // Apply dynamic cache quantization after rewind
         quantizeKVCache(&cache)
+        for i in mtpCaches.indices {
+            quantizeKVCache(&mtpCaches[i])
+        }
 
         // Set y for the next round
         y = .init(tokens: finalToken)
@@ -1203,6 +1216,11 @@ public struct MTPTokenIterator: TokenIteratorProtocol {
         } else {
             self.mtpLogits = nil
         }
+
+        // Force evaluation of the verified/draft tokens and saved MTP logits so the lazy compute graph cannot keep growing across iterations (OOM).
+        var evalArrays = [mainTokens] + draftTokens
+        if let mtpLogits = self.mtpLogits { evalArrays.append(contentsOf: mtpLogits) }
+        eval(evalArrays)
     }
 
     mutating public func next() -> Int? {
diff --git a/Libraries/MLXLMCommon/Load.swift b/Libraries/MLXLMCommon/Load.swift
@@ -89,14 +89,41 @@ public func loadWeights(
         // and fall back to the bare path if none match.
         let knownPrefixes = ["language_model.", "model.language_model.", ""]
         for (path, module) in model.leafModules().flattened() {
-            if let qsl = module as? QuantizedSwitchLinear {
+            if let sl = module as? SwitchLinear {
                 let bareName = "\(path).weight"
-                // Find the original key that exists in the shard index
+                
+                // First, check for unstacked format (e.g. Qwen FP8: "experts.N.gate_proj")
+                if bareName.contains(".switch_mlp.") {
+                    let unstackedBaseName = bareName.replacingOccurrences(of: ".switch_mlp.", with: ".experts.")
+                    // Try to find expert 0 to confirm unstacked format
+                    let expert0Name = unstackedBaseName.replacingOccurrences(of: ".experts.", with: ".experts.0.")
+                    
+                    var foundUnstacked = false
+                    for prefix in knownPrefixes {
+                        if ExpertStreamerManager.shared?.getFile(for: prefix + expert0Name) != nil {
+                            foundUnstacked = true
+                            var map = [Int: (path: String, tensorName: String)]()
+                            for i in 0 ..< sl.numExperts {
+                                let expertName = unstackedBaseName.replacingOccurrences(of: ".experts.", with: ".experts.\(i).")
+                                let fullKey = prefix + expertName
+                                if let file = ExpertStreamerManager.shared?.getFile(for: fullKey),
+                                   let dir = ExpertStreamingConfig.shared.modelDirectory {
+                                    map[i] = (dir.appendingPathComponent(file).path, fullKey)
+                                }
+                            }
+                            sl.unstackedSSDMap = map
+                            break
+                        }
+                    }
+                    if foundUnstacked { continue }
+                }
+
+                // Normal stacked format
                 let originalKey = knownPrefixes.lazy
                     .map { $0 + bareName }
                     .first { ExpertStreamerManager.shared?.getFile(for: $0) != nil }
-                    ?? bareName  // fallback: use bare name (works when model has no VLM wrapper)
-                qsl.tensorName = originalKey
+                    ?? bareName  // fallback: use bare name
+                sl.tensorName = originalKey
             }
         }
     }
diff --git a/Libraries/MLXLMCommon/SwitchLayers.swift b/Libraries/MLXLMCommon/SwitchLayers.swift
diff --git a/Package.swift b/Package.swift