@@ -185,14 +185,49 @@ void ConverseExit(int exitcode)
   // increment number of PEs ready for exit
   std::atomic_fetch_add_explicit(&numPEsReadyForExit, 1, std::memory_order_release);
   // we need everyone to spin unlike old converse to be able to exit threads
-  while (std::atomic_load_explicit(&numPEsReadyForExit, std::memory_order_acquire) != CmiMyNodeSize()) {}
-
-  // All threads call exitThread() for their own cleanup
-  comm_backend::exitThread();
-
-  // only rank 0 does cleanup and exits
+  while (std::atomic_load_explicit(&numPEsReadyForExit, std::memory_order_acquire) != CmiMyNodeSize()) {
+    // keep driving communication progress while we wait so pending network
+    // operations can complete
+    comm_backend::progress();
+  }
+
+  // At this point all threads on the node are ready to exit. We must perform
+  // the inter-node barrier while per-thread communication contexts are still
+  // alive so that progress() can drive completion. To coordinate that,
+  // rank 0 performs the inter-node barrier and then notifies the other threads
+  // (via an atomic) that the barrier is complete. After the notification,
+  // every thread performs its per-thread cleanup (exitThread). Finally,
+  // rank 0 waits for all threads to finish exitThread before tearing down
+  // the global comm backend and exiting the process.
+
+  static std::atomic<int> barrier_done{0};
+  static std::atomic<int> exitThread_done{0};
+
   if (CmiMyRank() == 0) {
+    // participate in the global barrier (blocks until other nodes arrive)
     comm_backend::barrier();
+    // let the other local threads know the barrier has completed
+    barrier_done.store(1, std::memory_order_release);
+  } else {
+    // other threads help make progress until rank 0 finishes the barrier
+    while (std::atomic_load_explicit(&barrier_done, std::memory_order_acquire) == 0) {
+      comm_backend::progress();
+    }
+  }
+
+  // Now every thread can clean up its thread-local comm state.
+  comm_backend::exitThread();
+
+  // signal that we have finished exitThread()
+  std::atomic_fetch_add_explicit(&exitThread_done, 1, std::memory_order_release);
+
+  if (CmiMyRank() == 0) {
+    // wait for all local threads to complete their per-thread cleanup; use
+    // progress() to avoid deadlock if any backend progress is needed
+    while (std::atomic_load_explicit(&exitThread_done, std::memory_order_acquire) != CmiMyNodeSize()) {
+      comm_backend::progress();
+    }
+
+    // safe to tear down the global comm backend and process-wide structures now
     comm_backend::exit();
     delete[] Cmi_queues;
     delete CmiNodeQueue;
@@ -202,8 +237,8 @@ void ConverseExit(int exitcode)
     CmiHandlerTable = nullptr;
     exit(exitcode);
   } else {
-    // Non-rank-0 threads block here indefinitely
-    // Rank 0's exit() call will terminate the entire process
+    // Non-rank-0 threads block here indefinitely; rank 0 will terminate the
+    // process once cleanup is done.
     while (true) {
       std::this_thread::sleep_for(std::chrono::milliseconds(100));
     }
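
To make the coordination easier to see in isolation, the following is a minimal standalone sketch of the same four-phase shutdown, assuming the semantics described in the comments above. The comm_backend calls are replaced by trivial stubs, and kNodeSize is a hypothetical stand-in for CmiMyNodeSize(); this illustrates the pattern, not the runtime's actual implementation.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Trivial stubs for the backend calls used in the diff (assumed semantics).
namespace comm_backend {
  void progress()   { std::this_thread::yield(); }        // drive the network
  void barrier()    { std::puts("inter-node barrier"); }  // blocks in real code
  void exitThread() { std::puts("per-thread cleanup"); }
  void exit()       { std::puts("global teardown"); }
}

constexpr int kNodeSize = 4;  // hypothetical stand-in for CmiMyNodeSize()

std::atomic<int> numPEsReadyForExit{0};
std::atomic<int> barrier_done{0};
std::atomic<int> exitThread_done{0};

void shutdown_sketch(int rank) {
  // Phase 1: count in, then spin while keeping the network progressing.
  numPEsReadyForExit.fetch_add(1, std::memory_order_release);
  while (numPEsReadyForExit.load(std::memory_order_acquire) != kNodeSize)
    comm_backend::progress();

  // Phase 2: rank 0 runs the inter-node barrier; the rest help make progress.
  if (rank == 0) {
    comm_backend::barrier();
    barrier_done.store(1, std::memory_order_release);
  } else {
    while (barrier_done.load(std::memory_order_acquire) == 0)
      comm_backend::progress();
  }

  // Phase 3: every thread cleans up its own comm state, then counts out.
  comm_backend::exitThread();
  exitThread_done.fetch_add(1, std::memory_order_release);

  // Phase 4: rank 0 waits for all cleanups before global teardown.
  if (rank == 0) {
    while (exitThread_done.load(std::memory_order_acquire) != kNodeSize)
      comm_backend::progress();
    comm_backend::exit();
    // the real code calls exit(exitcode) here; the other threads just park
  }
}

int main() {
  std::vector<std::thread> threads;
  for (int r = 0; r < kNodeSize; ++r) threads.emplace_back(shutdown_sketch, r);
  for (auto& t : threads) t.join();
  return 0;
}

The release/acquire pairs carry the correctness argument: each thread's fetch_add with memory_order_release, matched by the waiting thread's memory_order_acquire load, guarantees that everything a thread did before counting in (or out) is visible to rank 0 before it proceeds to the barrier or to teardown.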