Merge pull request #38 from SharpAI/fix/ssd-streaming-crash-recovery

solderzzc · web-flow · commit 2c2cd9eed4ee · 2026-04-28T12:36:25.000-07:00
Recover from SSD streaming errors without crashing
diff --git a/Libraries/MLXLMCommon/ConcurrentError.swift b/Libraries/MLXLMCommon/ConcurrentError.swift
@@ -1,11 +1,87 @@
 import Foundation
 import MLX
 
+/// Error thrown when SSD expert streaming encounters a corrupted, truncated,
+/// or incomplete safetensors file during pread I/O.
+public struct SSDStreamingError: Error, LocalizedError {
+    public let underlyingError: Error
+
+    public init(underlyingError: Error) {
+        self.underlyingError = underlyingError
+    }
+
+    public var errorDescription: String? {
+        "MLX SSD Streaming Error: \(underlyingError.localizedDescription). The model safetensors file may be corrupted, truncated, or incomplete. Try re-downloading the model."
+    }
+}
+
+private enum SSDStreamingErrorLatchContext {
+    static let threadDictionaryKey = "MLXLMCommon.SSDStreamingErrorLatch.active"
+}
+
+/// Error latch for SSD streaming errors that occur inside non-throwing `callAsFunction`
+/// paths. Each generation installs its own latch with `withActive` so concurrent sessions
+/// do not cross-contaminate; the active latch is per-thread (stored in `threadDictionary`).
+public final class SSDStreamingErrorLatch: @unchecked Sendable {
+    public static let shared = SSDStreamingErrorLatch()
+    private let lock = NSLock()
+    private var _error: Error?
+
+    public init() {}
+
+    package static func withActive<T>(_ latch: SSDStreamingErrorLatch, _ body: () throws -> T) rethrows -> T {
+        let key = SSDStreamingErrorLatchContext.threadDictionaryKey as NSString
+        let threadDictionary = Thread.current.threadDictionary
+        let previous = threadDictionary[key]
+        threadDictionary[key] = latch
+        defer {
+            if let previous {
+                threadDictionary[key] = previous
+            } else {
+                threadDictionary.removeObject(forKey: key)
+            }
+        }
+        return try body()
+    }
+
+    package static var active: SSDStreamingErrorLatch? {
+        let key = SSDStreamingErrorLatchContext.threadDictionaryKey as NSString
+        return Thread.current.threadDictionary[key] as? SSDStreamingErrorLatch
+    }
+
+    /// Record an error (first-wins semantics).
+    public func set(_ error: Error) {
+        lock.withLock {
+            if _error == nil { _error = error }
+        }
+    }
+
+    /// Consume and return the recorded error, resetting the latch.
+    /// Returns nil if no error was recorded.
+    public func consume() -> Error? {
+        lock.withLock {
+            let e = _error
+            _error = nil
+            return e
+        }
+    }
+
+    /// Throw the recorded error if one exists, then clear it.
+    public func throwIfSet() throws {
+        if let error = consume() {
+            throw error
+        }
+    }
+}
+
 package final class ThreadSafeError: @unchecked Sendable {
     package let lock = NSLock()
     package var error: Swift.Error?
+    private let latch: SSDStreamingErrorLatch?
     
-    package init() {}
+    package init(latch: SSDStreamingErrorLatch? = SSDStreamingErrorLatch.active) {
+        self.latch = latch
+    }
 
     package func catchError(_ block: () throws -> Void) {
         do {
@@ -19,9 +95,20 @@ package final class ThreadSafeError: @unchecked Sendable {
         }
     }
     
-    package func check() {
+    /// Check whether an error was recorded during concurrent I/O.
+    ///
+    /// Instead of calling `fatalError` (which crashes the entire app), this wraps the error in
+    /// `SSDStreamingError`, posts it to the latch captured at init (if any) and to the shared
+    /// `SSDStreamingErrorLatch`, and returns it (`nil` if none), so the generation loop can detect
+    /// it after the current token and surface it gracefully in the UI (e.g., prompting a re-download).
+    @discardableResult
+    package func check() -> SSDStreamingError? {
         if let error = error {
-            fatalError("MLX SSD Streaming Error: \(error.localizedDescription). (The model safetensors file may be corrupted, truncated, or incomplete).")
+            let streamingError = SSDStreamingError(underlyingError: error)
+            latch?.set(streamingError)
+            SSDStreamingErrorLatch.shared.set(streamingError)
+            return streamingError
         }
+        return nil
     }
 }
diff --git a/Libraries/MLXLMCommon/Evaluate.swift b/Libraries/MLXLMCommon/Evaluate.swift
@@ -502,6 +502,7 @@ protocol TokenIteratorProtocol: Sequence, IteratorProtocol where Element == Int
     var maxTokens: Int? { get }
     var tokenCount: Int { get }
     var promptPrefillTime: TimeInterval { get }
+    var streamingError: SSDStreamingError? { get }
 }
 
 /// Generator of tokens.
@@ -546,6 +547,8 @@ public struct TokenIterator: TokenIteratorProtocol {
 
     // Internal metrics
     var promptPrefillTime: TimeInterval = 0.0
+    var streamingError: SSDStreamingError?
+    let ssdErrorLatch = SSDStreamingErrorLatch()
 
     /// Initialize a `TokenIterator` with the given tokens. Note: this has been
     /// replaced with ``init(input:model:cache:parameters:)``.
@@ -646,16 +649,25 @@ public struct TokenIterator: TokenIteratorProtocol {
     mutating func prepare(input: LMInput, windowSize: Int? = nil) throws {
         processor?.prompt(input.text.tokens)
 
-        switch try model.prepare(input, cache: cache, windowSize: windowSize) {
+        let preparation = try SSDStreamingErrorLatch.withActive(ssdErrorLatch) {
+            try model.prepare(input, cache: cache, windowSize: windowSize)
+        }
+
+        switch preparation {
         case .tokens(let tokens):
             y = tokens
 
+            try ssdErrorLatch.throwIfSet()
+
             // evaluate the remainder of the prompt -- this primes the pump
-            let token = step(previous: y)
+            let token = try step(previous: y)
+
             y = .init(tokens: token)
             asyncEval(y.tokens)
 
         case .logits(let result):
+            try ssdErrorLatch.throwIfSet()
+
             y = .init(tokens: convertToToken(logits: result.logits))
             asyncEval(y.tokens)
 
@@ -677,11 +689,14 @@ public struct TokenIterator: TokenIteratorProtocol {
     }
 
     /// Evaluate the next token and return the new token (y), updating cache state
-    mutating func step(previous: LMInput.Text) -> MLXArray {
-        let result = model(
-            previous[text: .newAxis], cache: cache.isEmpty ? nil : cache, state: state)
+    mutating func step(previous: LMInput.Text) throws -> MLXArray {
+        let result = SSDStreamingErrorLatch.withActive(ssdErrorLatch) {
+            model(previous[text: .newAxis], cache: cache.isEmpty ? nil : cache, state: state)
+        }
         self.state = result.state
 
+        try ssdErrorLatch.throwIfSet()
+
         // Apply dynamic cache quantization after each step
         maybeQuantizeKVCache(
             cache: &cache,
@@ -694,6 +709,10 @@ public struct TokenIterator: TokenIteratorProtocol {
     }
 
     mutating public func next() -> Int? {
+        if streamingError != nil {
+            return nil
+        }
+
         if let maxTokens, tokenCount >= maxTokens {
             return nil
         }
@@ -702,7 +721,17 @@ public struct TokenIterator: TokenIteratorProtocol {
         let previousY = y
 
         // compute the next state and async eval the next token
-        let token = step(previous: previousY)
+        let token: MLXArray
+        do {
+            token = try step(previous: previousY)
+        } catch let error as SSDStreamingError {
+            streamingError = error
+            return nil
+        } catch {
+            streamingError = SSDStreamingError(underlyingError: error)
+            return nil
+        }
+
         y = .init(tokens: token)
         asyncEval(token)
 
@@ -746,6 +775,7 @@ public struct SpeculativeTokenIterator: TokenIteratorProtocol {
     let draftModel: any LanguageModel
 
     var mainState: LMOutput.State?
+    public let streamingError: SSDStreamingError? = nil
     var mainCache: [KVCache]
     var draftCache: [KVCache]
     let quantizeKVCache: (inout [KVCache]) -> Void
@@ -1685,7 +1715,7 @@ private func generateLoopTask<Handler: TokenLoopHandler>(
     // Launch a Task to perform iteration asynchronously.
     let task = Task {
         let performIteration = {
-            let iterator = iterator.consume()
+            var iterator = iterator.consume()
             var handler = handler.consume()
 
             var start = Date.timeIntervalSinceReferenceDate
@@ -1698,7 +1728,7 @@ private func generateLoopTask<Handler: TokenLoopHandler>(
                 tokenizer: tokenizer
             )
 
-            for token in iterator {
+            while let token = iterator.next() {
                 // Check for cancellation on every loop iteration.
                 if Task.isCancelled {
                     stopReason = .cancelled
@@ -1732,7 +1762,7 @@ private func generateLoopTask<Handler: TokenLoopHandler>(
             }
 
             if stopReason == nil {
-                if Task.isCancelled {
+                if Task.isCancelled || iterator.streamingError != nil {
                     stopReason = .cancelled
                 } else if let maxTokens = iterator.maxTokens, tokenCount >= maxTokens {
                     stopReason = .length
diff --git a/Tests/MLXLMTests/CorruptSafetensorsTests.swift b/Tests/MLXLMTests/CorruptSafetensorsTests.swift
@@ -5,6 +5,32 @@ import Testing
 
 @Suite
 struct CorruptSafetensorsTests {
+    @Test
+    func testThreadSafeErrorCheckPublishesToActiveLatch() throws {
+        let latch = SSDStreamingErrorLatch()
+
+        SSDStreamingErrorLatch.withActive(latch) {
+            let errState = ThreadSafeError()
+            errState.catchError {
+                throw NSError(domain: "CorruptSafetensorsTests", code: 13, userInfo: [
+                    NSLocalizedDescriptionKey: "truncated shard"
+                ])
+            }
+
+            let latched = errState.check()
+            #expect(latched != nil)
+        }
+
+        do {
+            try latch.throwIfSet()
+            Issue.record("Expected latch.throwIfSet() to surface an SSDStreamingError")
+        } catch let error as SSDStreamingError {
+            #expect(error.localizedDescription.contains("truncated shard"))
+        } catch {
+            Issue.record("Unexpected error type: \(error)")
+        }
+    }
+
     @Test
     func testDeadlock() throws {
         let tempDir = FileManager.default.temporaryDirectory