project-chip
diff --git a/‎src/darwin/Framework/CHIP/MTRCommissioningOperation.mm‎
Lines changed: 273 additions & 1 deletion b/‎src/darwin/Framework/CHIP/MTRCommissioningOperation.mm‎
Lines changed: 273 additions & 1 deletion
diff --git a/‎src/darwin/Framework/CHIP/MTRCommissioningOperation_Internal.h‎
Lines changed: 18 additions & 2 deletions b/‎src/darwin/Framework/CHIP/MTRCommissioningOperation_Internal.h‎
Lines changed: 18 additions & 2 deletions
@@ -16,10 +16,12 @@
 
 #import <Foundation/Foundation.h>
 #import <Matter/MTRDeviceAttestationDelegate.h>
+#import <os/lock.h>
 
 #import "MTRCommissioningDelegate_Internal.h"
 #import "MTRCommissioningOperation.h"
 #import "MTRCommissioningOperation_Internal.h"
+#import "MTRCommissioningOperation_Test.h"
 #import "MTRDefines_Internal.h"
 #import "MTRDeviceControllerDelegate_Internal.h"
 #import "MTRDeviceController_Concrete.h"
@@ -36,6 +38,19 @@
 using namespace chip;
 using namespace chip::Tracing::DarwinFramework;
 
+// Production delay, in seconds, before the post-PASE watchdog fires.  Five
+// minutes is comfortably longer than the commissionee fail-safe (60-180
+// seconds) but short enough that the user gets unblocked well within a
+// single sit-down troubleshooting session.  Declared for tests in
+// MTRCommissioningOperation_Test.h (presumably; confirm against that
+// header) so they can assert against the value production uses.
+extern const NSTimeInterval kMTRPostPASEWatchdogInterval = 5 * 60;
+
+// Test-only override, in seconds; only values > 0 replace
+// kMTRPostPASEWatchdogInterval when _armPostPASEWatchdog arms a timer.
+// Accessed without synchronization, so set it before commissioning starts.
+static NSTimeInterval sMTRPostPASEWatchdogIntervalForTesting = 0;
+
 @interface MTRCommissioningOperationDeviceAttestationDelegate : NSObject <MTRDeviceAttestationDelegate>
 @property (nonatomic, weak) MTRCommissioningOperation * commissioningOperation;
 @end
@@ -49,6 +64,67 @@ @implementation MTRCommissioningOperation {
     id<MTRCommissioningDelegate> __weak _delegate;
     dispatch_queue_t _delegateQueue;
     MTRDeviceController_Concrete * __weak _controller;
+    // Watchdog timer armed once PASE has been established to bound how long we
+    // hold the controller in the "in-progress commissioning" state if the
+    // client (Home app, HomeSmartMatter appex, etc.) drops the flow without
+    // ever calling commissionNodeWithID: or cancelCommissioningForNodeID:.
+    // Without this watchdog, _currentInternalCommissioning is never cleared
+    // and every subsequent commissioning attempt hits CHIP_ERROR_BUSY (0xDB)
+    // until the process restarts.
+    //
+    // _postPASEWatchdog is read/written only on _delegateQueue.
+    dispatch_source_t _postPASEWatchdog;
+    // Backing ivar for the isWaitingAfterPASEEstablished property.  Reads and
+    // writes are serialized by _stateLock so callers from any queue see a
+    // consistent value immediately after the setter returns; the watchdog
+    // teardown that runs as a side effect of clearing the flag is bounced
+    // onto _delegateQueue (which is the only queue allowed to touch
+    // _postPASEWatchdog).
+    BOOL _isWaitingAfterPASEEstablished;
+    os_unfair_lock _stateLock;
+    // Test-only one-shot fault injection.  When YES, the next call to
+    // _armPostPASEWatchdog will simulate a dispatch_source_create failure
+    // (return NO without arming the timer) and clear the flag.  Read and
+    // written only on _delegateQueue, mirroring _postPASEWatchdog itself.
+    BOOL _forceNextArmFailureForTesting;
+}
+
+@synthesize commissioningID = _commissioningID;
+
+- (BOOL)isWaitingAfterPASEEstablished
+{
+    // Snapshot the flag while holding _stateLock so the read is serialized
+    // against every writer of _isWaitingAfterPASEEstablished, whichever
+    // queue the caller happens to be on.
+    os_unfair_lock_lock(&_stateLock);
+    BOOL const snapshot = _isWaitingAfterPASEEstablished;
+    os_unfair_lock_unlock(&_stateLock);
+
+    return snapshot;
+}
+
+- (void)setIsWaitingAfterPASEEstablished:(BOOL)isWaitingAfterPASEEstablished
+{
+    // The flag is updated synchronously under _stateLock, so a caller that
+    // reads isWaitingAfterPASEEstablished right after this setter returns
+    // sees the new value.  Only the watchdog teardown further down is
+    // deferred.
+    os_unfair_lock_lock(&_stateLock);
+    BOOL const valueChanged = (_isWaitingAfterPASEEstablished != isWaitingAfterPASEEstablished);
+    if (valueChanged) {
+        _isWaitingAfterPASEEstablished = isWaitingAfterPASEEstablished;
+    }
+    os_unfair_lock_unlock(&_stateLock);
+
+    // Nothing else to do unless this call actually flipped the flag to NO.
+    if (!valueChanged || isWaitingAfterPASEEstablished) {
+        return;
+    }
+
+    // The flag went YES -> NO: the client has either started commissioning
+    // or moved on, so the watchdog no longer needs to fire.  The cancel is
+    // posted to _delegateQueue because _postPASEWatchdog is only touched
+    // there, and that is also where the timer's event handler is delivered.
+    dispatch_async(_delegateQueue, ^{
+        [self _cancelPostPASEWatchdog];
+    });
 }
 
 - (instancetype)initWithParameters:(MTRCommissioningParameters *)parameters
@@ -97,13 +173,135 @@ - (instancetype)initWithParameters:(MTRCommissioningParameters *)parameters
     _isInternallyCreated = isInternallyCreated;
     _delegate = delegate;
     _delegateQueue = queue;
+    _stateLock = OS_UNFAIR_LOCK_INIT;
 
     // Don't hold on to the provided attestation delegate, which we never use.
     _parameters.deviceAttestationDelegate = nil;
 
     return self;
 }
 
+- (void)dealloc
+{
+    // Last-resort teardown so the watchdog timer cannot outlive this object.
+    // Everywhere else _postPASEWatchdog is touched only on _delegateQueue.
+    // Here the timer's event handler cannot be mid-flight, because it holds
+    // a strong reference to self (via mtr_strongify) for as long as it
+    // runs.  Cancelling synchronously is therefore fine on whichever queue
+    // ends up running dealloc.
+    dispatch_source_t pendingWatchdog = _postPASEWatchdog;
+    if (pendingWatchdog == nil) {
+        return;
+    }
+
+    dispatch_source_cancel(pendingWatchdog);
+    _postPASEWatchdog = nil;
+}
+
+- (BOOL)_armPostPASEWatchdog
+{
+    // Creates and starts the one-shot post-PASE watchdog timer.
+    // Returns YES if a watchdog is armed when this method returns (either
+    // freshly created or already present), NO if arming failed.
+    //
+    // Expected to run on _delegateQueue, which owns _postPASEWatchdog and
+    // _forceNextArmFailureForTesting.
+
+    // Test hook: consume the one-shot fault-injection flag and report
+    // failure exactly as a dispatch_source_create failure would, so the
+    // "could not arm -> bail out" path can be exercised without actually
+    // exhausting dispatch sources.
+    if (_forceNextArmFailureForTesting) {
+        _forceNextArmFailureForTesting = NO;
+        MTR_LOG_ERROR("%@ post-PASE watchdog arm forced to fail (testing)", self);
+        return NO;
+    }
+
+    // Start from the production interval; a positive test override
+    // (installed via setPostPASEWatchdogIntervalForTesting:) wins.
+    NSTimeInterval timeoutSeconds = kMTRPostPASEWatchdogInterval;
+    if (sMTRPostPASEWatchdogIntervalForTesting > 0) {
+        timeoutSeconds = sMTRPostPASEWatchdogIntervalForTesting;
+    }
+
+    // Double-arm is not expected, but if a watchdog already exists keep it
+    // rather than stacking a second timer.
+    if (_postPASEWatchdog != nil) {
+        return YES;
+    }
+
+    dispatch_source_t timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, _delegateQueue);
+    _postPASEWatchdog = timer;
+    if (timer == nil) {
+        MTR_LOG_ERROR("%@ failed to create post-PASE watchdog timer", self);
+        return NO;
+    }
+
+    // One-shot timer: fire once after timeoutSeconds, never repeat, with
+    // one second of leeway.
+    int64_t const delayNanoseconds = (int64_t) (timeoutSeconds * NSEC_PER_SEC);
+    dispatch_time_t const deadline = dispatch_time(DISPATCH_TIME_NOW, delayNanoseconds);
+    dispatch_source_set_timer(timer, deadline, DISPATCH_TIME_FOREVER, 1 * NSEC_PER_SEC);
+
+    mtr_weakify(self);
+    dispatch_source_set_event_handler(timer, ^{
+        mtr_strongify(self);
+        // Two reasons to do nothing: the operation is already gone, or the
+        // client has advanced past the post-PASE waiting state while this
+        // handler was queued ahead of the cancel.  In the second case,
+        // routing CHIP_ERROR_TIMEOUT through the failure path would tear
+        // down a commissioning that is now legitimately in flight.
+        if (self && self.isWaitingAfterPASEEstablished) {
+            MTR_LOG_ERROR("%@ post-PASE watchdog fired -- client never advanced past paseSessionEstablishmentComplete; cancelling commissioning to release controller", self);
+            [self _firePostPASEWatchdog];
+        }
+    });
+
+    dispatch_resume(timer);
+    return YES;
+}
+
+- (void)_cancelPostPASEWatchdog
+{
+    // Cancels and forgets the watchdog timer, if one is armed; otherwise a
+    // no-op.  Production callers run this on _delegateQueue, which
+    // serializes all access to _postPASEWatchdog.  Tests may call it
+    // directly, which is why there is no queue assertion here.
+    dispatch_source_t armedWatchdog = _postPASEWatchdog;
+    if (armedWatchdog == nil) {
+        return;
+    }
+
+    dispatch_source_cancel(armedWatchdog);
+    _postPASEWatchdog = nil;
+}
+
+- (void)_firePostPASEWatchdog
+{
+    // Invoked from the watchdog's event handler (delivered on
+    // _delegateQueue) once the post-PASE wait has timed out.  Drops the
+    // timer, asks the controller to stop this commissioning, and then
+    // reports CHIP_ERROR_TIMEOUT through the normal error path so client
+    // state can settle back to "no commissioning in flight".
+
+    [self _cancelPostPASEWatchdog];
+
+    // stopCommissioning:forCommissioningID: is used instead of
+    // cancelCommissioningForNodeID: because the watchdog is armed for any
+    // delegate implementing commissioning:paseSessionEstablishmentComplete:,
+    // not only the internally-created flow.  NOTE(review): the claim that
+    // it reaches StopPairing in both flows comes from the original author
+    // and cannot be verified from this file.
+    //
+    // A NO result is treated as "this commissioning was already replaced
+    // by a successor".  In that case no timeout is reported, since it would
+    // land on a now-unrelated commissioning.  If the controller is already
+    // gone, the timeout is reported without attempting a stop.
+    MTRDeviceController_Concrete * owningController = _controller;
+    if (owningController != nil && ![owningController stopCommissioning:self forCommissioningID:_commissioningID]) {
+        MTR_LOG("%@ post-PASE watchdog fired but commissioning was already replaced; suppressing spurious timeout", self);
+        return;
+    }
+
+    NSError * timeoutError = [MTRError errorForCHIPErrorCode:CHIP_ERROR_TIMEOUT];
+    [self _dispatchCommissioningError:timeoutError];
+}
+
 static inline void emitMetricForSetupPayload(NSString * payload)
 {
     std::vector<SetupPayload> payloads;
@@ -169,6 +367,18 @@ - (void)startWithController:(MTRDeviceController *)controller
 
 - (BOOL)stop
 {
+    // Clear the waiting-after-PASE flag synchronously so that any in-flight
+    // watchdog event_handler block already sitting on _delegateQueue is
+    // suppressed by the late-fire guard before its cancel arrives.  Then
+    // bounce the cancel of _postPASEWatchdog itself onto _delegateQueue so
+    // we don't race with the timer's event_handler.
+    os_unfair_lock_lock(&_stateLock);
+    _isWaitingAfterPASEEstablished = NO;
+    os_unfair_lock_unlock(&_stateLock);
+    dispatch_async(_delegateQueue, ^{
+        [self _cancelPostPASEWatchdog];
+    });
+
     MTRDeviceController_Concrete * strongController = _controller;
     if (!strongController) {
         // Nothing to do; controller is gone, so we are stopped no matter what.
@@ -208,6 +418,18 @@ - (void)_dispatchCommissioningError:(NSError *)error withMetrics:(MTRMetrics *)m
 
 - (void)_dispatchCommissioningError:(NSError *)error forCommissioningID:(NSNumber *)commissioningID withMetrics:(MTRMetrics *)metrics
 {
+    // Any terminal error path implies the watchdog (if armed) no longer needs
+    // to fire.  Clear the waiting-after-PASE flag synchronously so that any
+    // already-enqueued watchdog event_handler is suppressed by the late-fire
+    // guard, then bounce the dispatch_source cancel itself onto _delegateQueue
+    // (this method may be invoked on the Matter / CHIP thread).
+    os_unfair_lock_lock(&_stateLock);
+    _isWaitingAfterPASEEstablished = NO;
+    os_unfair_lock_unlock(&_stateLock);
+    dispatch_async(_delegateQueue, ^{
+        [self _cancelPostPASEWatchdog];
+    });
+
     MTRDeviceController_Concrete * strongController = _controller;
 
     MTR_LOG("%@ Device commissioning failed with controller %@ metrics %@", self, strongController, metrics);
@@ -263,7 +485,32 @@ - (void)controller:(MTRDeviceController *)controller commissioningSessionEstabli
     // commissioning ourselves if not.
     if ([strongDelegate respondsToSelector:@selector(commissioning:paseSessionEstablishmentComplete:)]) {
         dispatch_async(_delegateQueue, ^{
-            self.isWaitingAfterPASEEstablished = YES;
+            // Arm the watchdog inside this block so that the arm and the
+            // paired ivar write below are both serialized on _delegateQueue
+            // with all subsequent _postPASEWatchdog mutations.  A client
+            // that handles paseSessionEstablishmentComplete: but then
+            // loses interest -- without ever calling commissionNodeWithID:
+            // or cancelCommissioningForNodeID: -- would otherwise
+            // permanently wedge the controller's commissioning state.
+            BOOL armed = [self _armPostPASEWatchdog];
+            if (!armed) {
+                // Failing to arm the watchdog means we cannot bound the
+                // post-PASE wait at all -- that is the very wedge this
+                // logic exists to prevent.  Surface a no-memory error so
+                // the client tears down rather than going into the
+                // unbounded wait state.
+                MTR_LOG_ERROR("%@ failed to arm post-PASE watchdog; aborting commissioning to avoid an unbounded wait", self);
+                [self _dispatchCommissioningError:[MTRError errorForCHIPErrorCode:CHIP_ERROR_NO_MEMORY]];
+                return;
+            }
+            // We are already on _delegateQueue, so write the backing ivar
+            // directly under _stateLock rather than going through the
+            // setter.  The setter only schedules watchdog teardown on a
+            // transition to NO, so it would behave the same for this YES
+            // write; taking the lock here keeps the reader/writer contract
+            // uniform.
+            os_unfair_lock_lock(&self->_stateLock);
+            self->_isWaitingAfterPASEEstablished = YES;
+            os_unfair_lock_unlock(&self->_stateLock);
             [strongDelegate commissioning:self paseSessionEstablishmentComplete:error];
         });
 
@@ -502,3 +749,28 @@ - (void)deviceAttestationCompletedForController:(MTRDeviceController *)controlle
 }
 
 @end
+
+#pragma mark - PostPASEWatchdogTesting
+
+@implementation MTRCommissioningOperation (PostPASEWatchdogTesting)
+
++ (void)setPostPASEWatchdogIntervalForTesting:(NSTimeInterval)interval
+{
+    // Installs a process-wide override for the watchdog interval.  Only
+    // positive values are stored; anything else resets the override to 0,
+    // which _armPostPASEWatchdog treats as "use the production interval".
+    // The value is picked up when the next watchdog is armed, so tests
+    // should call this before starting the commissioning flow they want
+    // to observe.
+    if (interval > 0) {
+        sMTRPostPASEWatchdogIntervalForTesting = interval;
+    } else {
+        sMTRPostPASEWatchdogIntervalForTesting = 0;
+    }
+}
+
+- (void)setForceNextArmFailureForTesting:(BOOL)force
+{
+    // _forceNextArmFailureForTesting is only touched on _delegateQueue
+    // (like _postPASEWatchdog), so the write is posted there.  The caller
+    // may therefore be on any queue; the change takes effect once the
+    // block runs.
+    dispatch_block_t applyOverride = ^{
+        self->_forceNextArmFailureForTesting = force;
+    };
+    dispatch_async(_delegateQueue, applyOverride);
+}
+
+@end
@@ -38,14 +38,30 @@ NS_ASSUME_NONNULL_BEGIN
 
 @property (nonatomic, readonly, assign) BOOL isInternallyCreated;
 
+// The commissioning identifier this operation was constructed with (the
+// random "future node ID" we use to track the commissioning flow before the
+// commissionee has been assigned a real node ID).  Exposed so callers like
+// MTRDeviceController_Concrete that need to drive cancellation against this
+// commissioning have something to pass to StopPairing without reaching into
+// private ivars.
+@property (nonatomic, readonly, copy) NSNumber * commissioningID;
+
 // True if the commissioning is waiting to resume after PASE has been
 // established and the delegate chose to be notified about that.
 //
 // This is currently only true if isInternallyCreated, and is readwrite because
 // MTRDeviceController_Concrete helps maintain this state.
 //
-// This property should generally be written on client queues only, not on the
-// Matter queue.
+// Threading: reads and writes go through the property, which is internally
+// guarded by an os_unfair_lock so the value the setter just wrote is
+// observable to a synchronous reader on any queue.  The paired post-PASE
+// watchdog timer (an internal implementation detail) is owned by the
+// operation's _delegateQueue; the setter clears the flag synchronously and
+// bounces the watchdog teardown side effect onto _delegateQueue.  The
+// watchdog event_handler consults isWaitingAfterPASEEstablished as a
+// late-fire guard to avoid a spurious CHIP_ERROR_TIMEOUT when the client
+// has legitimately advanced past the post-PASE waiting state but a timer
+// block was already enqueued ahead of the cancel.
 @property (nonatomic, readwrite, assign) BOOL isWaitingAfterPASEEstablished;
 
 @end