Skip to content

Commit 11db82f

Browse files
committed
fix hang at exit with multiple devices
1 parent 33781d6 commit 11db82f

File tree

1 file changed

+43
-8
lines changed

1 file changed

+43
-8
lines changed

src/convcore.cpp

Lines changed: 43 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -185,14 +185,49 @@ void ConverseExit(int exitcode)
185185
// increment number of PEs ready for exit
186186
std::atomic_fetch_add_explicit(&numPEsReadyForExit, 1, std::memory_order_release);
187187
// we need everyone to spin unlike old converse to be able to exit threads
188-
while (std::atomic_load_explicit(&numPEsReadyForExit, std::memory_order_acquire) != CmiMyNodeSize()) {}
189-
190-
// All threads call exitThread() for their own cleanup
191-
comm_backend::exitThread();
192-
193-
// only rank 0 does cleanup and exits
188+
while (std::atomic_load_explicit(&numPEsReadyForExit, std::memory_order_acquire) != CmiMyNodeSize()) {
189+
// make progress while waiting so network progress can continue
190+
comm_backend::progress();
191+
}
192+
193+
// At this point all threads on the node are ready to exit. We must perform
194+
// the inter-node barrier while per-thread communication contexts are still
195+
// alive so that progress() can drive completion. To coordinate that,
196+
// rank 0 performs the inter-node barrier and then notifies other threads
197+
// (via an atomic) that the barrier is complete. After the notification,
198+
// every thread performs its per-thread cleanup (exitThread). Finally,
199+
// rank 0 waits for all threads to finish exitThread before tearing down
200+
// the global comm backend and exiting the process.
201+
202+
static std::atomic<int> barrier_done{0};
203+
static std::atomic<int> exitThread_done{0};
204+
194205
if (CmiMyRank() == 0) {
206+
// participate in the global barrier (blocks until other nodes arrive)
195207
comm_backend::barrier();
208+
// let other local threads know barrier has completed
209+
barrier_done.store(1, std::memory_order_release);
210+
} else {
211+
// other threads help make progress until rank 0 finishes the barrier
212+
while (std::atomic_load_explicit(&barrier_done, std::memory_order_acquire) == 0) {
213+
comm_backend::progress();
214+
}
215+
}
216+
217+
// Now every thread can clean up its thread-local comm state.
218+
comm_backend::exitThread();
219+
220+
// signal we've finished exitThread()
221+
std::atomic_fetch_add_explicit(&exitThread_done, 1, std::memory_order_release);
222+
223+
if (CmiMyRank() == 0) {
224+
// wait for all local threads to complete their per-thread cleanup. Use
225+
// progress() to avoid deadlock if any backend progress is needed.
226+
while (std::atomic_load_explicit(&exitThread_done, std::memory_order_acquire) != CmiMyNodeSize()) {
227+
comm_backend::progress();
228+
}
229+
230+
// safe to tear down global comm backend and process-wide structures now
196231
comm_backend::exit();
197232
delete[] Cmi_queues;
198233
delete CmiNodeQueue;
@@ -202,8 +237,8 @@ void ConverseExit(int exitcode)
202237
CmiHandlerTable = nullptr;
203238
exit(exitcode);
204239
} else {
205-
// Non-rank-0 threads block here indefinitely
206-
// Rank 0's exit() call will terminate the entire process
240+
// Non-rank-0 threads block here indefinitely; rank 0 will terminate the
241+
// process once cleanup is done.
207242
while (true) {
208243
std::this_thread::sleep_for(std::chrono::milliseconds(100));
209244
}

0 commit comments

Comments
 (0)