Skip to content

Commit 36621bb

Browse files
committed
refactor: simplify construction coordination with single atomic field
f90/3D_MT/FWD_SP2/gpu_lock.h - Replace __atomic_test_and_set with __atomic_exchange_n on `cnstr`, combining winner-selection and flag-setting into one operation. - cnstr now serves as both the raw-storage constructed flag and the cross-process init guard.
1 parent e6ddc23 commit 36621bb

1 file changed

Lines changed: 29 additions & 36 deletions

File tree

f90/3D_MT/FWD_SP2/gpu_lock.h

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
1-
/*
2-
* gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX shared memory.
3-
*
4-
* Multiple MPI ranks can safely target the same GPU:
5-
* cf_hookDev() — acquires GPU via CAS 0->1 on an atomic flag array (platform-specific)
6-
* cf_releaseDev() — releases the lock so other processes can attach
7-
* cf_cleanupLock()— unmaps and unlinks shared memory on exit
8-
*/
1+
// gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX
2+
// shared memory.
3+
// Intended to let multiple MPI ranks safely target the same GPU.
94

105
#ifndef MODEM_GPU_LOCK_H
116
#define MODEM_GPU_LOCK_H
@@ -16,44 +11,35 @@
1611
#include <sys/mman.h>
1712
#include <unistd.h>
1813

19-
/* ------------------------------------------------------------------ */
20-
/* Device state flags */
21-
/* ------------------------------------------------------------------ */
14+
// Define device state flags
2215

2316
#define DEVICE_FREE 0
2417
#define DEVICE_IN_USE 1
2518

26-
/* ------------------------------------------------------------------ */
27-
/* Lock structure & global state */
28-
/* ------------------------------------------------------------------ */
29-
19+
// Lock structure & global state
20+
// we probably don't have more than 64 devices on a single node
3021
static constexpr int LOCK_MAX_DEVICES = 64;
3122

32-
/*
33-
* Raw byte flag at offset 0 — no C++ object lifetime requirements.
34-
* Used *only* to coordinate who runs placement-new on the atomics.
35-
*/
23+
// cnstr: plain int accessed only through the compiler's __atomic_* built-ins — safe before any object lifetime begins.
24+
// All other members are std::atomic<int>, constructed by placement-new once,
25+
// then used via normal C++ atomic operations.
3626
struct alignas(64) GpuLock {
37-
unsigned char constructed; /* 0 = raw storage, !=0 = atomics live */
38-
std::atomic<int> initialized; /* cross-process init guard (0->1 CAS) */
27+
int cnstr; // 0 = not constructed, 1 = atomics live
3928
std::atomic<int> occupied[LOCK_MAX_DEVICES];
4029
};
4130

4231
static GpuLock* g_lock = nullptr;
4332
static bool g_lock_inited = false;
4433

45-
/* ------------------------------------------------------------------ */
46-
/* Internal: create / map shared-memory segment and construct atomics */
47-
/* ------------------------------------------------------------------ */
48-
34+
// Internal: create / map shared-memory segment and construct atomics
4935
static inline int init_gpu_lock()
5036
{
5137
const char* name = "/ModEM_gpu_lock";
5238

53-
/* Try to open existing shared memory segment first */
39+
// Try to open existing shared memory segment first
5440
int fd = shm_open(name, O_RDWR, 0600);
5541
if (fd < 0) {
56-
/* Doesn't exist -- create it */
42+
// Doesn't exist -- create it
5743
fd = shm_open(name, O_CREAT | O_RDWR, 0600);
5844
if (fd < 0) return 1;
5945
if (ftruncate(fd, sizeof(GpuLock)) < 0) { close(fd); return 1; }
@@ -65,23 +51,30 @@ static inline int init_gpu_lock()
6551
close(fd);
6652
if (g_lock == MAP_FAILED) { g_lock = nullptr; return 1; }
6753

68-
/* Begin object lifetimes once across all processes */
69-
if (__atomic_test_and_set(&g_lock->constructed, __ATOMIC_ACQ_REL)) {
70-
/* we are not the first -- just wait */
54+
// Coordinate exactly-once construction of std::atomic members.
55+
// cnstr uses __atomic_* built-ins: safe on raw storage
56+
// the winner does placement-new on every std::atomic<int>
57+
// the others (losers) spin-wait with ACQUIRE on "cnstr" until it is non-zero,
58+
// then proceed to use the atomics.
59+
60+
if (__atomic_exchange_n(&g_lock->cnstr, 1, __ATOMIC_ACQ_REL) != 0) {
61+
// Another process won the race — spin-wait until construction finishes.
62+
while (!__atomic_load_n(&g_lock->cnstr, __ATOMIC_ACQUIRE))
63+
;
7164
} else {
72-
/* we are the first -- construct the atomics in shared memory */
73-
new (&g_lock->initialized) std::atomic<int>(0);
65+
// We are the winner — construct all std::atomic members.
7466
for (int i = 0; i < LOCK_MAX_DEVICES; i++)
7567
new (&g_lock->occupied[i]) std::atomic<int>(DEVICE_FREE);
68+
69+
// NOTE(review): cnstr is already 1 from the exchange above, i.e. it was set
70+
// BEFORE these constructors ran, so waiters are not held back; a RELEASE store of a distinct "done" value here seems needed — confirm.
7671
}
7772

7873
g_lock_inited = true;
7974
return 0;
8075
}
8176

82-
/* ------------------------------------------------------------------ */
83-
/* Public C-bindings (called from Fortran) */
84-
/* ------------------------------------------------------------------ */
77+
// Public C-bindings (called from Fortran)
8578

8679
extern "C" void cf_releaseDev(int dev_idx)
8780
{
@@ -101,4 +94,4 @@ extern "C" void cf_cleanupLock()
10194
g_lock_inited = false;
10295
}
10396

104-
#endif /* MODEM_GPU_LOCK_H */
97+
#endif // MODEM_GPU_LOCK_H

0 commit comments

Comments
 (0)