1- /*
2- * gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX shared memory.
3- *
4- * Multiple MPI ranks can safely target the same GPU:
5- * cf_hookDev() — acquires GPU via CAS 0->1 on an atomic flag array (platform-specific)
6- * cf_releaseDev() — releases the lock so other processes can attach
7- * cf_cleanupLock()— unmaps and unlinks shared memory on exit
8- */
1+ // gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX
2+ // shared memory.
3+ // Lets multiple MPI ranks target the same GPU by serializing access through per-device lock flags.
94
105#ifndef MODEM_GPU_LOCK_H
116#define MODEM_GPU_LOCK_H
1611#include < sys/mman.h>
1712#include < unistd.h>
1813
19- /* ------------------------------------------------------------------ */
20- /* Device state flags */
21- /* ------------------------------------------------------------------ */
14+ // Define device state flags
2215
2316#define DEVICE_FREE 0
2417#define DEVICE_IN_USE 1
2518
26- /* ------------------------------------------------------------------ */
27- /* Lock structure & global state */
28- /* ------------------------------------------------------------------ */
29-
19+ // Lock structure & global state
20+ // we probably don't have more than 64 devices on a single node
3021static constexpr int LOCK_MAX_DEVICES = 64 ;
3122
32- /*
33- * Raw byte flag at offset 0 — no C++ object lifetime requirements.
34- * Used *only* to coordinate who runs placement-new on the atomics.
35- */
23+ // cnstr is a plain int accessed only through the compiler's __atomic_* built-ins,
24+ // so it is usable before any object lifetime begins. All other members are
25+ // std::atomic<int>, constructed once by placement-new, then used via normal C++ atomic operations.
3626struct alignas (64 ) GpuLock {
37- unsigned char constructed; /* 0 = raw storage, !=0 = atomics live */
38- std::atomic<int > initialized; /* cross-process init guard (0->1 CAS) */
27+ int cnstr; // 0 = not constructed, 1 = atomics live
3928 std::atomic<int > occupied[LOCK_MAX_DEVICES ];
4029};
4130
4231static GpuLock* g_lock = nullptr ;
4332static bool g_lock_inited = false ;
4433
45- /* ------------------------------------------------------------------ */
46- /* Internal: create / map shared-memory segment and construct atomics */
47- /* ------------------------------------------------------------------ */
48-
34+ // Internal: create / map shared-memory segment and construct atomics
4935static inline int init_gpu_lock ()
5036{
5137 const char * name = " /ModEM_gpu_lock" ;
5238
53- /* Try to open existing shared memory segment first */
39+ // Try to open existing shared memory segment first
5440 int fd = shm_open (name, O_RDWR , 0600 );
5541 if (fd < 0 ) {
56- /* Doesn't exist -- create it */
42+ // Doesn't exist -- create it
5743 fd = shm_open (name, O_CREAT | O_RDWR , 0600 );
5844 if (fd < 0 ) return 1 ;
5945 if (ftruncate (fd, sizeof (GpuLock)) < 0 ) { close (fd); return 1 ; }
@@ -65,23 +51,30 @@ static inline int init_gpu_lock()
6551 close (fd);
6652 if (g_lock == MAP_FAILED ) { g_lock = nullptr ; return 1 ; }
6753
68- /* Begin object lifetimes once across all processes */
69- if (__atomic_test_and_set (&g_lock->constructed , __ATOMIC_ACQ_REL)) {
70- /* we are not the first -- just wait */
54+ // Coordinate exactly-once construction of the std::atomic members.
55+ // cnstr is accessed only via __atomic_* built-ins, which are safe on raw storage.
56+ // The process whose exchange reads 0 (the winner) placement-news every std::atomic<int>.
57+ // Every other process (a loser) spins with ACQUIRE loads until cnstr reads non-zero,
58+ // then proceeds to use the atomics.
59+
60+ if (__atomic_exchange_n (&g_lock->cnstr , 1 , __ATOMIC_ACQ_REL) != 0 ) {
61+ // Another process won the race — spin-wait until construction finishes.
62+ while (!__atomic_load_n (&g_lock->cnstr , __ATOMIC_ACQUIRE))
63+ ;
7164 } else {
72- /* we are the first -- construct the atomics in shared memory */
73- new (&g_lock->initialized ) std::atomic<int >(0 );
65+ // We are the winner — construct all std::atomic members.
7466 for (int i = 0 ; i < LOCK_MAX_DEVICES ; i++)
7567 new (&g_lock->occupied [i]) std::atomic<int >(DEVICE_FREE );
68+
69+ // NOTE(review): cnstr was set to 1 by the exchange above, before this loop ran, so a
70+ // loser's spin never waits for it; publish a separate "ready" value with RELEASE here.
7671 }
7772
7873 g_lock_inited = true ;
7974 return 0 ;
8075}
8176
82- /* ------------------------------------------------------------------ */
83- /* Public C-bindings (called from Fortran) */
84- /* ------------------------------------------------------------------ */
77+ // Public C-bindings (called from Fortran)
8578
8679extern " C" void cf_releaseDev (int dev_idx)
8780{
@@ -101,4 +94,4 @@ extern "C" void cf_cleanupLock()
10194 g_lock_inited = false ;
10295}
10396
104- #endif /* MODEM_GPU_LOCK_H */
97+ #endif // MODEM_GPU_LOCK_H
0 commit comments