project-chip
diff --git a/‎src/darwin/Framework/CHIP/MTRCommissioningOperation.mm‎
Lines changed: 309 additions & 3 deletions b/‎src/darwin/Framework/CHIP/MTRCommissioningOperation.mm‎
Lines changed: 309 additions & 3 deletions
@@ -16,10 +16,12 @@
 
 #import <Foundation/Foundation.h>
 #import <Matter/MTRDeviceAttestationDelegate.h>
+#import <os/lock.h>
 
 #import "MTRCommissioningDelegate_Internal.h"
 #import "MTRCommissioningOperation.h"
 #import "MTRCommissioningOperation_Internal.h"
+#import "MTRCommissioningOperation_Test.h"
 #import "MTRDefines_Internal.h"
 #import "MTRDeviceControllerDelegate_Internal.h"
 #import "MTRDeviceController_Concrete.h"
@@ -36,6 +38,19 @@
 using namespace chip;
 using namespace chip::Tracing::DarwinFramework;
 
+// Delay, in seconds, between arming the post-PASE watchdog and its firing
+// in production.  Five minutes is comfortably longer than the commissionee
+// fail-safe (60-180 seconds) but short enough that the user gets unblocked
+// well within a single sit-down troubleshooting session.  Exposed via
+// MTRCommissioningOperation_Test.h so tests can assert against the same
+// value production uses.
+extern const NSTimeInterval kMTRPostPASEWatchdogInterval = 5 * 60;
+
+// Test-only override, in seconds.  A value > 0 replaces the interval for
+// every watchdog armed afterwards in this process; 0 (the default) means
+// "use kMTRPostPASEWatchdogInterval".  Plain static: no synchronization.
+static NSTimeInterval sMTRPostPASEWatchdogIntervalForTesting = 0;
+
 @interface MTRCommissioningOperationDeviceAttestationDelegate : NSObject <MTRDeviceAttestationDelegate>
 @property (nonatomic, weak) MTRCommissioningOperation * commissioningOperation;
 @end
@@ -49,6 +64,67 @@ @implementation MTRCommissioningOperation {
     id<MTRCommissioningDelegate> __weak _delegate;
     dispatch_queue_t _delegateQueue;
     MTRDeviceController_Concrete * __weak _controller;
+    // Watchdog timer armed once PASE has been established to bound how long we
+    // hold the controller in the "in-progress commissioning" state if the
+    // client drops the flow without ever calling commissionNodeWithID: or
+    // cancelCommissioningForNodeID:.  Without this watchdog,
+    // _currentInternalCommissioning is never cleared and every subsequent
+    // commissioning attempt hits CHIP_ERROR_BUSY (0xDB) until the process
+    // restarts.
+    //
+    // Outside of dealloc, _postPASEWatchdog is read/written only on _delegateQueue.
+    dispatch_source_t _postPASEWatchdog;
+    // Backing ivar for the isWaitingAfterPASEEstablished property.  Reads and
+    // writes are serialized by _stateLock so callers from any queue see a
+    // consistent value immediately after the setter returns; the watchdog
+    // teardown that runs as a side effect of clearing the flag is bounced
+    // onto _delegateQueue (which is the only queue allowed to touch
+    // _postPASEWatchdog).
+    BOOL _isWaitingAfterPASEEstablished;
+    os_unfair_lock _stateLock;
+    // Test-only one-shot fault injection.  When YES, the next call to
+    // _armPostPASEWatchdog will simulate a dispatch_source_create failure
+    // (return NO without arming the timer) and clear the flag.  Read and
+    // written only on _delegateQueue, mirroring _postPASEWatchdog itself.
+    BOOL _forceNextArmFailureForTesting;
+}
+
+@synthesize commissioningID = _commissioningID;
+
+- (BOOL)isWaitingAfterPASEEstablished
+{
+    // Take a snapshot of the backing ivar while holding _stateLock so the
+    // read is serialized against writers on other queues.
+    os_unfair_lock_lock(&_stateLock);
+    BOOL snapshot = _isWaitingAfterPASEEstablished;
+    os_unfair_lock_unlock(&_stateLock);
+
+    return snapshot;
+}
+
+- (void)setIsWaitingAfterPASEEstablished:(BOOL)isWaitingAfterPASEEstablished
+{
+    // Swap the new value in under _stateLock, remembering what was there
+    // before.  The store is synchronous so a caller that reads the property
+    // back right after this returns sees the value it just wrote (callers
+    // like MTRDeviceController_Concrete depend on this ordering).
+    os_unfair_lock_lock(&_stateLock);
+    BOOL wasWaiting = _isWaitingAfterPASEEstablished;
+    _isWaitingAfterPASEEstablished = isWaitingAfterPASEEstablished;
+    os_unfair_lock_unlock(&_stateLock);
+
+    // Only a YES -> NO transition has a side effect; every other
+    // combination is just the store above.
+    if (!wasWaiting || isWaitingAfterPASEEstablished) {
+        return;
+    }
+
+    // The client has either started commissioning (via
+    // commissionNodeWithID:) or moved on, so the watchdog is no longer
+    // needed.  _postPASEWatchdog is owned by _delegateQueue, which is also
+    // where the timer's event_handler is delivered, so the teardown is
+    // bounced onto that queue rather than performed here.
+    dispatch_async(_delegateQueue, ^{
+        [self _cancelPostPASEWatchdog];
+    });
 }
 
 - (instancetype)initWithParameters:(MTRCommissioningParameters *)parameters
@@ -97,13 +173,158 @@ - (instancetype)initWithParameters:(MTRCommissioningParameters *)parameters
     _isInternallyCreated = isInternallyCreated;
     _delegate = delegate;
     _delegateQueue = queue;
+    _stateLock = OS_UNFAIR_LOCK_INIT;
 
     // Don't hold on to the provided attestation delegate, which we never use.
     _parameters.deviceAttestationDelegate = nil;
 
     return self;
 }
 
+- (void)dealloc
+{
+    // Defensive teardown so the watchdog timer cannot outlive this object.
+    // Everywhere else _postPASEWatchdog is touched only on _delegateQueue;
+    // here the timer's event_handler cannot be mid-flight (it holds a
+    // strong reference to self, via mtr_strongify, while it runs), so
+    // cancelling synchronously is safe on whichever queue runs dealloc.
+    dispatch_source_t pendingWatchdog = _postPASEWatchdog;
+    _postPASEWatchdog = nil;
+    if (pendingWatchdog != nil) {
+        dispatch_source_cancel(pendingWatchdog);
+    }
+}
+
+- (BOOL)_armPostPASEWatchdog
+{
+    // Creates and starts the one-shot post-PASE watchdog timer, targeting
+    // _delegateQueue.
+    //
+    // Returns YES if a watchdog is armed when this method returns (that
+    // includes the "already armed" case), NO if the timer source could not
+    // be created or a test forced the arm to fail.
+    //
+    // _postPASEWatchdog is owned by _delegateQueue; production callers
+    // invoke this from inside that queue.
+
+    // Test-only one-shot fault injection: simulate a
+    // dispatch_source_create failure so callers can exercise the
+    // "watchdog could not be armed -> bail out" path without having to
+    // actually exhaust dispatch sources.
+    if (_forceNextArmFailureForTesting) {
+        _forceNextArmFailureForTesting = NO;
+        MTR_LOG_ERROR("%@ post-PASE watchdog arm forced to fail (testing)", self);
+        return NO;
+    }
+
+    if (_postPASEWatchdog) {
+        // Already armed; should not happen but defend against double-arm.
+        return YES;
+    }
+
+    // Production interval lives in kMTRPostPASEWatchdogInterval; tests
+    // can shorten it via setPostPASEWatchdogIntervalForTesting:.  The
+    // "> 0" test also rejects NaN, so intervalSeconds is always positive.
+    NSTimeInterval intervalSeconds = (sMTRPostPASEWatchdogIntervalForTesting > 0)
+        ? sMTRPostPASEWatchdogIntervalForTesting
+        : kMTRPostPASEWatchdogInterval;
+
+    // Convert to nanoseconds, saturating instead of casting an
+    // out-of-range double to int64_t (undefined behavior) when a test
+    // passes an enormous or infinite override.
+    double requestedNanos = intervalSeconds * (double) NSEC_PER_SEC;
+    int64_t intervalNanos = (requestedNanos >= (double) INT64_MAX) ? INT64_MAX : (int64_t) requestedNanos;
+
+    // Leeway is how long the system may defer the timer.  Keep the
+    // production value of one second, but never allow more than a tenth of
+    // the interval: with a shortened test interval a fixed one-second
+    // leeway could delay the fire by several times the interval itself.
+    uint64_t leewayNanos = (uint64_t) (intervalNanos / 10);
+    if (leewayNanos > NSEC_PER_SEC) {
+        leewayNanos = NSEC_PER_SEC;
+    }
+
+    _postPASEWatchdog = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, _delegateQueue);
+    if (!_postPASEWatchdog) {
+        MTR_LOG_ERROR("%@ failed to create post-PASE watchdog timer", self);
+        return NO;
+    }
+
+    // One-shot: DISPATCH_TIME_FOREVER as the repeat interval means the
+    // timer fires once and never again.
+    dispatch_source_set_timer(_postPASEWatchdog,
+        dispatch_time(DISPATCH_TIME_NOW, intervalNanos),
+        DISPATCH_TIME_FOREVER,
+        leewayNanos);
+
+    // The source is retained by self, so the handler must only hold self
+    // weakly to avoid a retain cycle.
+    mtr_weakify(self);
+    dispatch_source_set_event_handler(_postPASEWatchdog, ^{
+        mtr_strongify(self);
+        if (!self) {
+            return;
+        }
+        // Late-fire guard: if the client has already advanced past the
+        // post-PASE waiting state (e.g. the client's commissionNodeWithID:
+        // succeeded and the controller flipped isWaitingAfterPASEEstablished
+        // back to NO) but the timer block had already been posted to the
+        // queue ahead of the cancel, we must NOT route a spurious
+        // CHIP_ERROR_TIMEOUT through the failure path -- doing so would
+        // tear down a commissioning that is now legitimately in flight.
+        if (!self.isWaitingAfterPASEEstablished) {
+            return;
+        }
+        MTR_LOG_ERROR("%@ post-PASE watchdog fired -- client never advanced past paseSessionEstablishmentComplete; cancelling commissioning to release controller", self);
+        [self _firePostPASEWatchdog];
+    });
+
+    dispatch_resume(_postPASEWatchdog);
+    return YES;
+}
+
+- (void)_cancelPostPASEWatchdog
+{
+    // Cancels and forgets the watchdog timer if one is armed; a no-op
+    // otherwise.  Production paths call this only on _delegateQueue, which
+    // serializes all access to _postPASEWatchdog.  Tests may call it
+    // directly from elsewhere, which is why there is no queue assertion.
+    dispatch_source_t armedWatchdog = _postPASEWatchdog;
+    if (armedWatchdog == nil) {
+        return;
+    }
+
+    dispatch_source_cancel(armedWatchdog);
+    _postPASEWatchdog = nil;
+}
+
+- (void)_firePostPASEWatchdog
+{
+    // Watchdog expiry path, delivered on _delegateQueue.  Tears down the
+    // timer, asks the controller (if it still exists) to stop the
+    // commissioning identified by our commissioningID, and then reports
+    // CHIP_ERROR_TIMEOUT through the standard error path so any UI / client
+    // state can settle back to "no commissioning in flight".
+
+    [self _cancelPostPASEWatchdog];
+
+    MTRDeviceController_Concrete * owningController = _controller;
+    if (owningController != nil) {
+        // stopCommissioning:forCommissioningID: reports NO for two very
+        // different situations, so its return value cannot tell them apart:
+        //
+        //   (1) "Replaced": a successor MTRCommissioningOperation already
+        //       owns the commissioning slot.  The wedge is resolved, and a
+        //       timeout raised now would disturb the unrelated successor's
+        //       delegate state, so nothing must be dispatched.
+        //
+        //   (2) "Genuine StopPairing failure": we still own the slot but the
+        //       C++ StopPairing returned an error (e.g.
+        //       CHIP_ERROR_INVALID_DEVICE_DESCRIPTOR when the commissionee's
+        //       60-180s fail-safe expired before the watchdog fired).  The
+        //       wedge persists, so the timeout MUST still reach the client.
+        //
+        // Hence the explicit ownership check against currentCommissioning
+        // before attempting the stop.
+        if (owningController.currentCommissioning != self) {
+            // The replacement path already cleared our delegate via
+            // commissioningDone:, so logging is all that is left to do.
+            MTR_LOG("%@ post-PASE watchdog fired but commissioning was already replaced; suppressing spurious timeout", self);
+            return;
+        }
+
+        // stopCommissioning:forCommissioningID: is used instead of
+        // cancelCommissioningForNodeID: because the latter only acts for
+        // the legacy isInternallyCreated:YES flow, whereas the watchdog is
+        // armed for every client implementing
+        // commissioning:paseSessionEstablishmentComplete:.  The former
+        // invokes StopPairing on the CHIP layer in both flows, releasing
+        // the controller's in-progress commissioning slot.  The result is
+        // deliberately discarded: even when StopPairing fails (e.g. the
+        // fail-safe already expired) the client still has to be told about
+        // the timeout instead of being left waiting forever.
+        (void) [owningController stopCommissioning:self forCommissioningID:_commissioningID];
+    }
+
+    NSError * timeoutError = [MTRError errorForCHIPErrorCode:CHIP_ERROR_TIMEOUT];
+    [self _dispatchCommissioningError:timeoutError];
+}
+
 static inline void emitMetricForSetupPayload(NSString * payload)
 {
     std::vector<SetupPayload> payloads;
@@ -170,12 +391,35 @@ - (void)startWithController:(MTRDeviceController *)controller
 - (BOOL)stop
 {
     MTRDeviceController_Concrete * strongController = _controller;
+    BOOL stopResult;
     if (!strongController) {
         // Nothing to do; controller is gone, so we are stopped no matter what.
-        return NO;
+        stopResult = NO;
+    } else {
+        stopResult = [strongController stopCommissioning:self forCommissioningID:_commissioningID];
     }
 
-    return [strongController stopCommissioning:self forCommissioningID:_commissioningID];
+    // Clear the waiting-after-PASE flag synchronously AFTER the controller
+    // transition (this also runs when the controller is already gone).  If we
+    // cleared it before stopCommissioning:, the watchdog's late-fire guard
+    // could observe (flag=NO, currentCommissioning=self) and suppress a fire
+    // that legitimately needed to drive cleanup.  Clearing afterwards, either:
+    //   - the watchdog fires before our clear: it sees flag=YES and runs its
+    //     normal fire path.  That path compares currentCommissioning to self
+    //     and suppresses the timeout when they differ -- presumably the case
+    //     once stopCommissioning: above has succeeded (TODO confirm), OR
+    //   - the watchdog fires after our clear: the late-fire guard suppresses
+    //     it, also fine, since we have already done the controller transition.
+    // Then bounce the cancel of _postPASEWatchdog itself onto _delegateQueue
+    // so we don't race with the timer's event_handler.
+    os_unfair_lock_lock(&_stateLock);
+    _isWaitingAfterPASEEstablished = NO;
+    os_unfair_lock_unlock(&_stateLock);
+    dispatch_async(_delegateQueue, ^{
+        [self _cancelPostPASEWatchdog];
+    });
+
+    return stopResult;
 }
 
 - (void)_earlyFailCommissioning:(CHIP_ERROR)error
@@ -208,6 +452,18 @@ - (void)_dispatchCommissioningError:(NSError *)error withMetrics:(MTRMetrics *)m
 
 - (void)_dispatchCommissioningError:(NSError *)error forCommissioningID:(NSNumber *)commissioningID withMetrics:(MTRMetrics *)metrics
 {
+    // Any terminal error path implies the watchdog (if armed) no longer needs
+    // to fire.  Clear the waiting-after-PASE flag synchronously so that any
+    // already-enqueued watchdog event_handler is suppressed by the late-fire
+    // guard, then bounce the dispatch_source cancel itself onto _delegateQueue
+    // (this method may be invoked on the Matter / CHIP thread).
+    os_unfair_lock_lock(&_stateLock);
+    _isWaitingAfterPASEEstablished = NO;
+    os_unfair_lock_unlock(&_stateLock);
+    dispatch_async(_delegateQueue, ^{
+        [self _cancelPostPASEWatchdog];
+    });
+
     MTRDeviceController_Concrete * strongController = _controller;
 
     MTR_LOG("%@ Device commissioning failed with controller %@ metrics %@", self, strongController, metrics);
@@ -263,7 +519,32 @@ - (void)controller:(MTRDeviceController *)controller commissioningSessionEstabli
     // commissioning ourselves if not.
     if ([strongDelegate respondsToSelector:@selector(commissioning:paseSessionEstablishmentComplete:)]) {
         dispatch_async(_delegateQueue, ^{
-            self.isWaitingAfterPASEEstablished = YES;
+            // Arm the watchdog inside this block so that the arm and the
+            // paired ivar write below are both serialized on _delegateQueue
+            // with all subsequent _postPASEWatchdog mutations.  A client
+            // that handles paseSessionEstablishmentComplete: but then
+            // loses interest -- without ever calling commissionNodeWithID:
+            // or cancelCommissioningForNodeID: -- would otherwise
+            // permanently wedge the controller's commissioning state.
+            BOOL armed = [self _armPostPASEWatchdog];
+            if (!armed) {
+                // Failing to arm the watchdog means we cannot bound the
+                // post-PASE wait at all -- that is the very wedge this
+                // logic exists to prevent.  Surface a no-memory error so
+                // the client tears down rather than going into the
+                // unbounded wait state.
+                MTR_LOG_ERROR("%@ failed to arm post-PASE watchdog; aborting commissioning to avoid an unbounded wait", self);
+                [self _dispatchCommissioningError:[MTRError errorForCHIPErrorCode:CHIP_ERROR_NO_MEMORY]];
+                return;
+            }
+            // We are already on _delegateQueue, so write the backing ivar
+            // directly under _stateLock rather than going through the
+            // setter.  The setter only schedules a watchdog cancel on a
+            // YES->NO transition, so it would behave the same here; taking
+            // the lock keeps the reader/writer contract uniform.
+            os_unfair_lock_lock(&self->_stateLock);
+            self->_isWaitingAfterPASEEstablished = YES;
+            os_unfair_lock_unlock(&self->_stateLock);
             [strongDelegate commissioning:self paseSessionEstablishmentComplete:error];
         });
 
@@ -502,3 +783,28 @@ - (void)deviceAttestationCompletedForController:(MTRDeviceController *)controlle
 }
 
 @end
+
+#pragma mark - PostPASEWatchdogTesting
+
+@implementation MTRCommissioningOperation (PostPASEWatchdogTesting)
+
++ (void)setPostPASEWatchdogIntervalForTesting:(NSTimeInterval)interval
+{
+    // Anything that is not strictly positive is stored as 0, which means
+    // "use the production interval".  The stored value is consulted each
+    // time a watchdog is armed, so tests are expected to call this before
+    // kicking off the commissioning flow they want to observe.
+    NSTimeInterval overrideSeconds = 0;
+    if (interval > 0) {
+        overrideSeconds = interval;
+    }
+    sMTRPostPASEWatchdogIntervalForTesting = overrideSeconds;
+}
+
+- (void)setForceNextArmFailureForTesting:(BOOL)force
+{
+    // The fault-injection flag shares _postPASEWatchdog's ownership rule:
+    // it is only touched on _delegateQueue.  Hop onto that queue here so
+    // the calling test does not need to care which queue it is running on.
+    dispatch_queue_t ownerQueue = _delegateQueue;
+    dispatch_async(ownerQueue, ^{
+        self->_forceNextArmFailureForTesting = force;
+    });
+}
+
+@end