Skip to content

Commit ead8174

Browse files
Fix: Null mContext crash when commissioning after background/foreground (project-chip#43127)
After backgrounding/foregrounding, commissioning crashes with null mContext. Root cause: - Background: ClearContext() sets mContext = null - Foreground: Register() fails with CHIP_ERROR_DUPLICATE_KEY_ID - SetContext() blocked, mContext stays null - Commission attempt accesses null mContext → crash Solution: - Make Register() and Shutdown() idempotent for app lifecycle - Unregister clusters in Shutdown() to clean state - SetContext() now succeeds, restoring mContext - Add 2-second delay in test framework after SIGTERM for socket cleanup Testing: Validated on iOS examples/tv-casting-app app and all unit tests pass. Test Fix: TC_TLSCERT_2_12 was failing because idempotent shutdown makes cleanup so fast that kernel doesn't finish releasing TCP sockets before next test starts. Added brief delay to allow proper socket cleanup. Applies to all platforms with app lifecycle (iOS, Android, tvOS, etc.) Apply restyle formatting (whitespace & clang-format)
1 parent 73c4dd0 commit ead8174

12 files changed

Lines changed: 256 additions & 34 deletions

File tree

examples/tv-casting-app/darwin/MatterTvCastingBridge/MatterTvCastingBridge/MCCastingApp.mm

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ @interface MCCastingApp ()
4646
// Client defiend data source used to initialize the MCCommissionableDataProvider and, if needed, update the MCCommissionableDataProvider post initialization. This is necessary for the Commissioner-Generated passcode commissioning feature.
4747
@property (nonatomic, strong) id<MCDataSource> dataSource;
4848

49+
// Track whether this is the first start (cold boot) or subsequent start (warm boot)
50+
@property (atomic) BOOL hasStartedBefore;
51+
4952
@end
5053

5154
@implementation MCCastingApp
@@ -129,32 +132,61 @@ - (NSError *)updateCommissionableDataProvider
129132

130133
- (void)startWithCompletionBlock:(void (^)(NSError *))completion
131134
{
132-
ChipLogProgress(AppServer, "MCCastingApp.startWithCompletionBlock called");
135+
ChipLogProgress(AppServer, "MCCastingApp.startWithCompletionBlock called (%s start)", self.hasStartedBefore ? "warm" : "cold");
136+
self.hasStartedBefore = YES;
137+
133138
VerifyOrReturn(_workQueue != nil && _clientQueue != nil, dispatch_async(self->_clientQueue, ^{
139+
ChipLogError(AppServer, "MCCastingApp.startWithCompletionBlock failed: work queue or client queue is nil");
134140
completion([MCErrorUtils NSErrorFromChipError:CHIP_ERROR_INCORRECT_STATE]);
135141
}));
136142

137-
dispatch_async(_workQueue, ^{
138-
__block CHIP_ERROR err = matter::casting::core::CastingApp::GetInstance()->Start();
143+
// Start event loop task FIRST (synchronously) to ensure proper state
144+
__block CHIP_ERROR err = chip::DeviceLayer::PlatformMgrImpl().StartEventLoopTask();
145+
if (err != CHIP_NO_ERROR) {
146+
ChipLogError(AppServer, "MCCastingApp.startWithCompletionBlock StartEventLoopTask failed: %s", err.AsString());
139147
dispatch_async(self->_clientQueue, ^{
140148
completion([MCErrorUtils NSErrorFromChipError:err]);
141149
});
150+
// Early return to prevent double completion call
151+
return;
152+
}
153+
154+
// Only then start the casting app (on work queue)
155+
dispatch_async(_workQueue, ^{
156+
CHIP_ERROR startErr = matter::casting::core::CastingApp::GetInstance()->Start();
157+
if (startErr != CHIP_NO_ERROR) {
158+
ChipLogError(AppServer, "MCCastingApp.startWithCompletionBlock CastingApp::Start failed: %s", startErr.AsString());
159+
}
160+
161+
dispatch_async(self->_clientQueue, ^{
162+
completion([MCErrorUtils NSErrorFromChipError:startErr]);
163+
});
142164
});
143-
__block CHIP_ERROR err = chip::DeviceLayer::PlatformMgrImpl().StartEventLoopTask();
144-
VerifyOrReturn(err == CHIP_NO_ERROR, dispatch_async(self->_clientQueue, ^{
145-
completion([MCErrorUtils NSErrorFromChipError:err]);
146-
}));
147165
}
148166

149167
- (void)stopWithCompletionBlock:(void (^)(NSError *))completion
150168
{
151169
ChipLogProgress(AppServer, "MCCastingApp.stopWithCompletionBlock called");
152170
VerifyOrReturn(_workQueue != nil && _clientQueue != nil, dispatch_async(self->_clientQueue, ^{
171+
ChipLogError(AppServer, "MCCastingApp.stopWithCompletionBlock failed: work queue or client queue is nil");
153172
completion([MCErrorUtils NSErrorFromChipError:CHIP_ERROR_INCORRECT_STATE]);
154173
}));
155174

156175
dispatch_async(_workQueue, ^{
157-
__block CHIP_ERROR err = matter::casting::core::CastingApp::GetInstance()->Stop();
176+
// Stop the casting app first
177+
CHIP_ERROR err = matter::casting::core::CastingApp::GetInstance()->Stop();
178+
if (err != CHIP_NO_ERROR) {
179+
ChipLogError(AppServer, "MCCastingApp.stopWithCompletionBlock CastingApp::Stop failed: %s", err.AsString());
180+
}
181+
182+
// Then stop the event loop task to transition WorkQueue to suspended state
183+
CHIP_ERROR stopEventLoopErr = chip::DeviceLayer::PlatformMgrImpl().StopEventLoopTask();
184+
if (stopEventLoopErr != CHIP_NO_ERROR) {
185+
ChipLogError(AppServer, "MCCastingApp.stopWithCompletionBlock StopEventLoopTask failed: %s", stopEventLoopErr.AsString());
186+
if (err == CHIP_NO_ERROR) {
187+
err = stopEventLoopErr;
188+
}
189+
}
158190

159191
dispatch_async(self->_clientQueue, ^{
160192
completion([MCErrorUtils NSErrorFromChipError:err]);

examples/tv-casting-app/tv-casting-common/core/CastingApp.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,13 @@
2424
#include "support/ChipDeviceEventHandler.h"
2525

2626
#include <DeviceInfoProviderImpl.h>
27+
#include <app/EventManagement.h>
2728
#include <app/InteractionModelEngine.h>
2829
#include <app/clusters/bindings/BindingManager.h>
2930
#include <app/server/Server.h>
3031
#include <credentials/DeviceAttestationCredsProvider.h>
3132
#include <credentials/attestation_verifier/DeviceAttestationVerifier.h>
33+
#include <data-model-providers/codegen/CodegenDataModelProvider.h>
3234

3335
namespace {
3436
chip::DeviceLayer::DeviceInfoProviderImpl gExampleDeviceInfoProvider;
@@ -123,7 +125,13 @@ CHIP_ERROR CastingApp::Start()
123125
// Start Matter server
124126
chip::ServerInitParams * serverInitParams = mAppParameters->GetServerInitParamsProvider()->Get();
125127
VerifyOrReturnError(serverInitParams != nullptr, CHIP_ERROR_INVALID_ARGUMENT);
126-
ReturnErrorOnFailure(chip::Server::GetInstance().Init(*serverInitParams));
128+
129+
CHIP_ERROR initError = chip::Server::GetInstance().Init(*serverInitParams);
130+
if (initError != CHIP_NO_ERROR)
131+
{
132+
ChipLogError(Discovery, "CastingApp::Start() Server::Init failed: %s", initError.AsString());
133+
return initError;
134+
}
127135

128136
// Perform post server startup registrations
129137
ReturnErrorOnFailure(PostStartRegistrations());
@@ -142,6 +150,7 @@ CHIP_ERROR CastingApp::Start()
142150
CastingPlayer::GetTargetCastingPlayer()->VerifyOrEstablishConnection(connectionCallbacks);
143151
}
144152

153+
ChipLogProgress(Discovery, "CastingApp::Start() completed");
145154
return CHIP_NO_ERROR;
146155
}
147156

@@ -188,10 +197,21 @@ CHIP_ERROR CastingApp::Stop()
188197
chip::Server::GetInstance().GetUserDirectedCommissioningClient()->SetCommissionerDeclarationHandler(nullptr);
189198
#endif // CHIP_DEVICE_CONFIG_ENABLE_COMMISSIONER_DISCOVERY_CLIENT
190199

191-
// Shutdown the Matter server
200+
// Shutdown the Matter server to clean up active sessions
192201
chip::Server::GetInstance().Shutdown();
193202

203+
// Shutdown the CodegenDataModelProvider to reset mContext
204+
CHIP_ERROR providerShutdownErr = chip::app::CodegenDataModelProvider::Instance().Shutdown();
205+
if (providerShutdownErr != CHIP_NO_ERROR)
206+
{
207+
ChipLogError(Discovery, "CastingApp::Stop() CodegenDataModelProvider::Shutdown failed: %s", providerShutdownErr.AsString());
208+
}
209+
210+
// Destroy EventManagement to reset its state
211+
chip::app::EventManagement::DestroyEventManagement();
212+
194213
mState = CASTING_APP_NOT_RUNNING; // CastingApp stopped successfully, set state to NOT_RUNNING
214+
ChipLogProgress(Discovery, "CastingApp::Stop() completed");
195215

196216
return CHIP_NO_ERROR;
197217
}

src/app/InteractionModelEngine.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,19 @@ void InteractionModelEngine::Shutdown()
240240

241241
mReadHandlers.ReleaseAll();
242242

243+
// Shut down the data model provider to clear cluster mContext pointers.
244+
// Required for proper Stop() → Start() lifecycle - ensures clusters are reinitialized.
245+
if (mDataModelProvider != nullptr)
246+
{
247+
CHIP_ERROR err = mDataModelProvider->Shutdown();
248+
if (err != CHIP_NO_ERROR)
249+
{
250+
ChipLogError(InteractionModel,
251+
"InteractionModelEngine::Shutdown() Data model provider shutdown failed: %" CHIP_ERROR_FORMAT,
252+
err.Format());
253+
}
254+
}
255+
243256
#if CHIP_CONFIG_ENABLE_READ_CLIENT
244257
// Shut down any subscription clients that are still around. They won't be
245258
// able to work after this point anyway, since we're about to drop our refs
@@ -1916,14 +1929,20 @@ DataModel::Provider * InteractionModelEngine::SetDataModelProvider(DataModel::Pr
19161929
// Altering data model should not be done while IM is actively handling requests.
19171930
VerifyOrDie(mReadHandlers.begin() == mReadHandlers.end());
19181931

1932+
// REMOVED: Early return when (model == mDataModelProvider) - breaks Stop() → Start()
1933+
// After Shutdown(), server clusters are uninitialized even though pointer is unchanged.
1934+
// Must call Startup() again to reinitialize cluster mContext pointers.
1935+
19191936
if (model == mDataModelProvider)
19201937
{
1921-
// no-op, just return
1922-
return model;
1938+
ChipLogDetail(DataManagement,
1939+
"InteractionModelEngine::SetDataModelProvider() re-initializing same provider (Stop/Start cycle)");
19231940
}
19241941

19251942
DataModel::Provider * oldModel = mDataModelProvider;
1926-
if (oldModel != nullptr)
1943+
1944+
// Only shutdown if changing to a different provider
1945+
if (oldModel != nullptr && oldModel != model)
19271946
{
19281947
CHIP_ERROR err = oldModel->Shutdown();
19291948
if (err != CHIP_NO_ERROR)

src/app/clusters/general-commissioning-server/GeneralCommissioningCluster.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,23 @@ GeneralCommissioningCluster::HandleCommissioningComplete(const DataModel::Invoke
526526
}
527527
#endif // CHIP_CONFIG_TERMS_AND_CONDITIONS_REQUIRED
528528

529+
// Check if mContext is valid before accessing it (can be null during app shutdown/restart)
530+
if (mContext == nullptr)
531+
{
532+
ChipLogError(FailSafe, "GeneralCommissioning: mContext is NULL, cluster not initialized");
533+
response.errorCode = CommissioningErrorEnum::kInvalidAuthentication;
534+
handler->AddResponse(request.path, response);
535+
return std::nullopt;
536+
}
537+
538+
if (!mContext->interactionContext.actionContext.CurrentExchange())
539+
{
540+
ChipLogError(FailSafe, "GeneralCommissioning: Invalid exchange during commissioning complete");
541+
response.errorCode = CommissioningErrorEnum::kInvalidAuthentication;
542+
handler->AddResponse(request.path, response);
543+
return std::nullopt;
544+
}
545+
529546
SessionHandle handle = mContext->interactionContext.actionContext.CurrentExchange()->GetSessionHandle();
530547

531548
// Ensure it's a valid CASE session

src/app/server-cluster/DefaultServerCluster.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,19 @@ CHIP_ERROR DefaultServerCluster::Attributes(const ConcreteClusterPath & path, Re
8181

8282
CHIP_ERROR DefaultServerCluster::Startup(ServerClusterContext & context)
8383
{
84-
VerifyOrReturnError(mContext == nullptr, CHIP_ERROR_ALREADY_INITIALIZED);
84+
// Reset shutdown state to allow restart after shutdown
85+
mIsShutdown = false;
86+
87+
// Make Startup() idempotent for Stop() → Start() lifecycle.
88+
// Shutdown() does not fully clean up cluster objects - cluster objects persist across Stop() → Start().
89+
// Only their state (mContext) is cleared.
90+
// If already initialized, update context and return success (preserves mDataVersion).
91+
if (mContext != nullptr)
92+
{
93+
ChipLogDetail(DataManagement, "DefaultServerCluster::Startup() already initialized, updating context (idempotent)");
94+
mContext = &context;
95+
return CHIP_NO_ERROR;
96+
}
8597

8698
mContext = &context;
8799

@@ -94,7 +106,14 @@ CHIP_ERROR DefaultServerCluster::Startup(ServerClusterContext & context)
94106

95107
void DefaultServerCluster::Shutdown(ClusterShutdownType)
96108
{
97-
mContext = nullptr;
109+
// Make shutdown idempotent - safe to call multiple times
110+
if (mIsShutdown)
111+
{
112+
return;
113+
}
114+
115+
mContext = nullptr;
116+
mIsShutdown = true;
98117
}
99118

100119
void DefaultServerCluster::NotifyAttributeChanged(AttributeId attributeId)

src/app/server-cluster/DefaultServerCluster.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ class DefaultServerCluster : public ServerClusterInterface
106106
protected:
107107
const ConcreteClusterPath mPath;
108108
ServerClusterContext * mContext = nullptr;
109+
// Tracks if shutdown has been called to make it idempotent
110+
bool mIsShutdown = false;
109111

110112
void IncreaseDataVersion() { mDataVersion++; }
111113

src/app/server-cluster/ServerClusterInterfaceRegistry.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,21 +43,58 @@ ServerClusterInterfaceRegistry::~ServerClusterInterfaceRegistry()
4343

4444
CHIP_ERROR ServerClusterInterfaceRegistry::Register(ServerClusterRegistration & entry)
4545
{
46-
// we have no strong way to check if entry is already registered somewhere else, so we use "next" as some
47-
// form of double-check
48-
VerifyOrReturnError(entry.next == nullptr, CHIP_ERROR_INVALID_ARGUMENT);
4946
VerifyOrReturnError(entry.serverClusterInterface != nullptr, CHIP_ERROR_INVALID_ARGUMENT);
5047

5148
Span<const ConcreteClusterPath> paths = entry.serverClusterInterface->GetPaths();
5249
VerifyOrReturnError(!paths.empty(), CHIP_ERROR_INVALID_ARGUMENT);
5350

51+
// Check early if this cluster is already registered (idempotent case)
52+
// This prevents unnecessary validation work and ensures we don't modify the linked list structure
53+
bool isAlreadyRegistered = false;
54+
for (const ConcreteClusterPath & path : paths)
55+
{
56+
ServerClusterInterface * existing = Get(path);
57+
if (existing == entry.serverClusterInterface)
58+
{
59+
isAlreadyRegistered = true;
60+
break;
61+
}
62+
}
63+
64+
// Validate entry.next is nullptr (unless already registered)
65+
// A non-null entry.next when not registered indicates the entry is
66+
// already part of another list or improperly initialized
67+
if (entry.next != nullptr && !isAlreadyRegistered)
68+
{
69+
return CHIP_ERROR_INVALID_ARGUMENT;
70+
}
71+
72+
// If already registered (idempotent case), return early
73+
if (isAlreadyRegistered)
74+
{
75+
#if CHIP_DETAIL_LOGGING
76+
const ConcreteClusterPath path = entry.serverClusterInterface->GetPaths().front();
77+
ChipLogDetail(DataManagement, "Cluster already registered for %u/" ChipLogFormatMEI ", skipping re-registration",
78+
path.mEndpointId, ChipLogValueMEI(path.mClusterId));
79+
#endif
80+
return CHIP_NO_ERROR;
81+
}
82+
83+
// Validate paths and check for duplicate registrations
84+
// Note: Same cluster re-registering is OK (handled above), but a DIFFERENT
85+
// cluster with the same endpoint/cluster ID is an error
86+
// Duplicate checking makes this O(n^2) on total registered items. We preserve this to ensure
87+
// cluster integrity during Stop/Start cycles, but this may be optimized in the future if needed.
5488
for (const ConcreteClusterPath & path : paths)
5589
{
5690
VerifyOrReturnError(path.HasValidIds(), CHIP_ERROR_INVALID_ARGUMENT);
5791

58-
// Double-checking for duplicates makes the checks O(n^2) on the total number of registered
59-
// items. We preserve this however we may want to make this optional at some point in time.
60-
VerifyOrReturnError(Get(path) == nullptr, CHIP_ERROR_DUPLICATE_KEY_ID);
92+
// A different cluster is already registered for this path
93+
ServerClusterInterface * existing = Get(path);
94+
if (existing != nullptr)
95+
{
96+
return CHIP_ERROR_DUPLICATE_KEY_ID;
97+
}
6198
}
6299

63100
if (mContext.has_value())

src/app/server-cluster/ServerClusterInterfaceRegistry.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <app/server-cluster/ServerClusterInterface.h>
2121
#include <lib/core/CHIPError.h>
2222
#include <lib/core/DataModelTypes.h>
23+
#include <lib/support/logging/CHIPLogging.h>
2324

2425
#include <cstdint>
2526
#include <new>
@@ -97,7 +98,14 @@ struct LazyRegisteredServerCluster
9798
template <typename... Args>
9899
void Create(Args &&... args)
99100
{
100-
VerifyOrDie(!IsConstructed());
101+
// Handle idempotent Create() - if already constructed, this is a no-op.
102+
// The cluster remains registered and functional. This supports the Stop() → Start() lifecycle
103+
// where clusters are shutdown but remain constructed.
104+
if (IsConstructed())
105+
{
106+
ChipLogDetail(DataManagement, "LazyRegisteredServerCluster::Create: Cluster already constructed, skipping re-creation");
107+
return;
108+
}
101109

102110
new (mCluster) SERVER_CLUSTER(std::forward<Args>(args)...);
103111
new (mRegistration) ServerClusterRegistration(Cluster());

0 commit comments

Comments
 (0)