Skip to content

Commit 44b47c3

Browse files
authored
Merge pull request #54 from dong-hao/feature/add-atomic-gpu-lock
feat: GPU locking via atomic operation using POSIX shared memory It seems my (production) tests on the new gpu lock feature ran without problems - I will just go ahead and merge it (it is blocking as I plan to back port the new GPU parallel routines to the main).
2 parents 050fd6c + 36621bb commit 44b47c3

6 files changed

Lines changed: 250 additions & 221 deletions

File tree

f90/3D_MT/FWD_SP2/EMsolve3D.f90

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,15 @@ subroutine FWDSolve3D(bRHS,omega,eSol,device_id)
542542
Call deall(tvec)
543543
deallocate(KSSiter%rerr)
544544

545+
! Release GPU lock so other processes can hook on
546+
! NOTE: do NOT call cf_cleanupLock() here — the shared-memory lock
547+
! must persist across transmitter iterations
548+
#if defined(CUDA) || defined(HIP)
549+
if (device_id >= 0) then
550+
call cf_releaseDev(device_id)
551+
end if
552+
#endif
553+
545554
end subroutine FWDsolve3D
546555

547556
#if defined(MPI) && defined(FG)
@@ -1194,6 +1203,15 @@ subroutine FWDsolve3Dfg(bRHS,omega,eSol,device_id,comm_local)
11941203
& rank_local
11951204
end if
11961205
end if
1206+
1207+
! Release GPU lock so other processes can hook on
1208+
! NOTE: do NOT call cf_cleanupLock() here — similar to FWDsolve3D.
1209+
#if defined(CUDA) || defined(HIP)
1210+
if (device_id >= 0 .and. size_local > 1) then
1211+
call cf_releaseDev(device_id)
1212+
end if
1213+
#endif
1214+
11971215
call MPI_BARRIER(comm_local,ierr)
11981216
if (rank_local .eq. 0) then ! Leader
11991217
! deallocate local temporary arrays

f90/3D_MT/FWD_SP2/cudaFortMap.f90

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,6 +1698,21 @@ end function cf_hookDev
16981698
! ! get the number of GPU devices
16991699
! end function kernelc_getDevNum
17001700

1701+
! cf_releaseDev - release the GPU gate so other processes can hook on
1702+
subroutine cf_releaseDev(device_idx) &
1703+
& bind(C, name="cf_releaseDev")
1704+
use iso_c_binding
1705+
implicit none
1706+
integer(c_int), value :: device_idx
1707+
end subroutine cf_releaseDev
1708+
1709+
! cf_cleanupLock - cleanup shared memory lock on program exit
1710+
subroutine cf_cleanupLock() &
1711+
& bind(C, name="cf_cleanupLock")
1712+
use iso_c_binding
1713+
implicit none
1714+
end subroutine cf_cleanupLock
1715+
17011716
end interface
17021717

17031718
end module

f90/3D_MT/FWD_SP2/gpu_lock.h

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX
2+
// shared memory.
3+
// Multiple MPI ranks (should...) safely target the same GPU:
4+
5+
#ifndef MODEM_GPU_LOCK_H
6+
#define MODEM_GPU_LOCK_H
7+
8+
#include <atomic>
9+
#include <fcntl.h>
10+
#include <new>
11+
#include <sys/mman.h>
12+
#include <unistd.h>
13+
14+
// Define device state flags
15+
16+
#define DEVICE_FREE 0
17+
#define DEVICE_IN_USE 1
18+
19+
// Lock structure & global state
20+
// we probably don't have more than 64 devices on a single node
21+
static constexpr int LOCK_MAX_DEVICES = 64;
22+
23+
// cnstr: compiler-built-in atomic on raw int — safe before object lifetime begins.
24+
// All other members are std::atomic<int>, constructed by placement-new once,
25+
// then used via normal C++ atomic operations.
26+
struct alignas(64) GpuLock {
27+
int cnstr; // 0 = not constructed, 1 = atomics live
28+
std::atomic<int> occupied[LOCK_MAX_DEVICES];
29+
};
30+
31+
static GpuLock* g_lock = nullptr;
32+
static bool g_lock_inited = false;
33+
34+
// Internal: create / map shared-memory segment and construct atomics
35+
static inline int init_gpu_lock()
36+
{
37+
const char* name = "/ModEM_gpu_lock";
38+
39+
// Try to open existing shared memory segment first
40+
int fd = shm_open(name, O_RDWR, 0600);
41+
if (fd < 0) {
42+
// Doesn't exist -- create it
43+
fd = shm_open(name, O_CREAT | O_RDWR, 0600);
44+
if (fd < 0) return 1;
45+
if (ftruncate(fd, sizeof(GpuLock)) < 0) { close(fd); return 1; }
46+
}
47+
48+
g_lock = static_cast<GpuLock*>(mmap(nullptr, sizeof(GpuLock),
49+
PROT_READ | PROT_WRITE,
50+
MAP_SHARED, fd, 0));
51+
close(fd);
52+
if (g_lock == MAP_FAILED) { g_lock = nullptr; return 1; }
53+
54+
// Coordinate exactly-once construction of std::atomic members.
55+
// cnstr uses __atomic_* built-ins: safe on raw storage
56+
// the winner does placement-new on every std::atomic<int>
57+
// the other (losers) spin-wait with ACQUIRE on "cnstr" until it's 1,
58+
// then proceed to use the atomics.
59+
60+
if (__atomic_exchange_n(&g_lock->cnstr, 1, __ATOMIC_ACQ_REL) != 0) {
61+
// Another process won the race — spin-wait until construction finishes.
62+
while (!__atomic_load_n(&g_lock->cnstr, __ATOMIC_ACQUIRE))
63+
;
64+
} else {
65+
// We are the winner — construct all std::atomic members.
66+
for (int i = 0; i < LOCK_MAX_DEVICES; i++)
67+
new (&g_lock->occupied[i]) std::atomic<int>(DEVICE_FREE);
68+
69+
// cnstr is already 1 from the exchange above (ACQ_REL ensures
70+
// the constructed atomics are visible to other processes).
71+
}
72+
73+
g_lock_inited = true;
74+
return 0;
75+
}
76+
77+
// Public C-bindings (called from Fortran)
78+
79+
extern "C" void cf_releaseDev(int dev_idx)
80+
{
81+
if (g_lock_inited && g_lock != nullptr &&
82+
dev_idx >= 0 && dev_idx < LOCK_MAX_DEVICES)
83+
{
84+
g_lock->occupied[dev_idx].store(DEVICE_FREE, std::memory_order_release);
85+
}
86+
}
87+
88+
extern "C" void cf_cleanupLock()
89+
{
90+
if (g_lock != nullptr && g_lock != MAP_FAILED)
91+
munmap(g_lock, sizeof(GpuLock));
92+
shm_unlink("/ModEM_gpu_lock");
93+
g_lock = nullptr;
94+
g_lock_inited = false;
95+
}
96+
97+
#endif // MODEM_GPU_LOCK_H

f90/3D_MT/FWD_SP2/hipFortMap.f90

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ end function cublasDaxpy
685685

686686
! hipblasZaxpy
687687
integer(c_int) function cublasZaxpy(handle,n,alpha,x,incx,y,incy) &
688-
& bind(C,name="hipblasZaxpy_v2")
688+
& bind(C,name="hipblasZaxpy")
689689
! compute y = y + a*x with complex double precision
690690
! note that x and y should be located in GPU memory
691691
use iso_c_binding
@@ -715,7 +715,7 @@ end function cublasDcopy
715715

716716
! hipblasZcopy
717717
integer(c_int) function cublasZcopy(handle,n,x,incx,y,incy) &
718-
& bind(C,name="hipblasZcopy_v2")
718+
& bind(C,name="hipblasZcopy")
719719
! compute y = x with complex double precision
720720
use iso_c_binding
721721
implicit none
@@ -729,7 +729,7 @@ end function cublasZcopy
729729

730730
! hipblasDdot
731731
integer(c_int) function cublasDdot(handle,n,x,incx,y,incy,result) &
732-
& bind(C,name="hipblasDdot_v2")
732+
& bind(C,name="hipblasDdot")
733733
! compute result = x dot y with double precision
734734
use iso_c_binding
735735
implicit none
@@ -744,7 +744,7 @@ end function cublasDdot
744744

745745
! hipblasZdot
746746
integer(c_int) function cublasZdot(handle,n,x,incx,y,incy,result) &
747-
& bind(C,name="hipblasZdotc_v2")
747+
& bind(C,name="hipblasZdotc")
748748
! compute result = x dot y with complex double precision
749749
use iso_c_binding
750750
implicit none
@@ -772,7 +772,7 @@ end function cublasDnrm2
772772

773773
! hipblasZnrm2 there is no such thing like znrm2!
774774
integer(c_int) function cublasZnrm2(handle,n,x,incx,norm) &
775-
& bind(C,name="hipblasDznrm2_v2")
775+
& bind(C,name="hipblasDznrm2")
776776
! compute result = norm(x) in complex double precision
777777
use iso_c_binding
778778
implicit none
@@ -1704,6 +1704,21 @@ integer(c_int) function cf_resetFlag(device_idx) &
17041704
integer(c_int),value :: device_idx
17051705
end function cf_resetFlag
17061706

1707+
! cf_releaseDev - release the GPU gate so other processes can hook on
1708+
subroutine cf_releaseDev(device_idx) &
1709+
& bind(C, name="cf_releaseDev")
1710+
use iso_c_binding
1711+
implicit none
1712+
integer(c_int), value :: device_idx
1713+
end subroutine cf_releaseDev
1714+
1715+
! cf_cleanupLock - cleanup shared memory lock on program exit
1716+
subroutine cf_cleanupLock() &
1717+
& bind(C, name="cf_cleanupLock")
1718+
use iso_c_binding
1719+
implicit none
1720+
end subroutine cf_cleanupLock
1721+
17071722
end interface
17081723

17091724
end module hipFortMap

0 commit comments

Comments
 (0)