Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions f90/3D_MT/FWD_SP2/EMsolve3D.f90
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,15 @@ subroutine FWDSolve3D(bRHS,omega,eSol,device_id)
Call deall(tvec)
deallocate(KSSiter%rerr)

! Release GPU lock so other processes can hook on
! NOTE: do NOT call cf_cleanupLock() here — the shared-memory lock
! must persist across transmitter iterations
#if defined(CUDA) || defined(HIP)
if (device_id >= 0) then
call cf_releaseDev(device_id)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, NO- resetting this (while finishing one solve task for a transmitter) will reset the shared lock for the other processes - will definitely cause some (undefined) problems.

end if
#endif

end subroutine FWDsolve3D

#if defined(MPI) && defined(FG)
Expand Down Expand Up @@ -1194,6 +1203,15 @@ subroutine FWDsolve3Dfg(bRHS,omega,eSol,device_id,comm_local)
& rank_local
end if
end if

! Release GPU lock so other processes can hook on
! NOTE: do NOT call cf_cleanupLock() here — similar to FWDsolve3D.
#if defined(CUDA) || defined(HIP)
if (device_id >= 0 .and. size_local > 1) then
call cf_releaseDev(device_id)
end if
#endif

call MPI_BARRIER(comm_local,ierr)
if (rank_local .eq. 0) then ! Leader
! deallocate local temporary arrays
Expand Down
15 changes: 15 additions & 0 deletions f90/3D_MT/FWD_SP2/cudaFortMap.f90
Original file line number Diff line number Diff line change
Expand Up @@ -1698,6 +1698,21 @@ end function cf_hookDev
! ! get the number of GPU devices
! end function kernelc_getDevNum

! cf_releaseDev - release the GPU gate so other processes can hook on
subroutine cf_releaseDev(device_idx) &
& bind(C, name="cf_releaseDev")
use iso_c_binding
implicit none
integer(c_int), value :: device_idx
end subroutine cf_releaseDev

! cf_cleanupLock - cleanup shared memory lock on program exit
subroutine cf_cleanupLock() &
& bind(C, name="cf_cleanupLock")
use iso_c_binding
implicit none
end subroutine cf_cleanupLock

end interface

end module
97 changes: 97 additions & 0 deletions f90/3D_MT/FWD_SP2/gpu_lock.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX
// shared memory.
// Multiple MPI ranks (should...) safely target the same GPU:

#ifndef MODEM_GPU_LOCK_H
#define MODEM_GPU_LOCK_H

#include <atomic>
#include <fcntl.h>
#include <new>
#include <sys/mman.h>
#include <unistd.h>

// Define device state flags

#define DEVICE_FREE 0
#define DEVICE_IN_USE 1

// Lock structure & global state
// we probably don't have more than 64 devices on a single node
static constexpr int LOCK_MAX_DEVICES = 64;

// cnstr: compiler-built-in atomic on raw int — safe before object lifetime begins.
// All other members are std::atomic<int>, constructed by placement-new once,
// then used via normal C++ atomic operations.
struct alignas(64) GpuLock {
int cnstr; // 0 = not constructed, 1 = atomics live
std::atomic<int> occupied[LOCK_MAX_DEVICES];
};

static GpuLock* g_lock = nullptr;
static bool g_lock_inited = false;

// Internal: create / map shared-memory segment and construct atomics
static inline int init_gpu_lock()
{
const char* name = "/ModEM_gpu_lock";

// Try to open existing shared memory segment first
int fd = shm_open(name, O_RDWR, 0600);
if (fd < 0) {
// Doesn't exist -- create it
fd = shm_open(name, O_CREAT | O_RDWR, 0600);
if (fd < 0) return 1;
if (ftruncate(fd, sizeof(GpuLock)) < 0) { close(fd); return 1; }
}

g_lock = static_cast<GpuLock*>(mmap(nullptr, sizeof(GpuLock),
PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0));
close(fd);
if (g_lock == MAP_FAILED) { g_lock = nullptr; return 1; }

// Coordinate exactly-once construction of std::atomic members.
// cnstr uses __atomic_* built-ins: safe on raw storage
// the winner does placement-new on every std::atomic<int>
// the other (losers) spin-wait with ACQUIRE on "cnstr" until it's 1,
// then proceed to use the atomics.

if (__atomic_exchange_n(&g_lock->cnstr, 1, __ATOMIC_ACQ_REL) != 0) {
// Another process won the race — spin-wait until construction finishes.
while (!__atomic_load_n(&g_lock->cnstr, __ATOMIC_ACQUIRE))
;
} else {
// We are the winner — construct all std::atomic members.
for (int i = 0; i < LOCK_MAX_DEVICES; i++)
new (&g_lock->occupied[i]) std::atomic<int>(DEVICE_FREE);

// cnstr is already 1 from the exchange above (ACQ_REL ensures
// the constructed atomics are visible to other processes).
}

g_lock_inited = true;
return 0;
}

// Public C-bindings (called from Fortran)

extern "C" void cf_releaseDev(int dev_idx)
{
if (g_lock_inited && g_lock != nullptr &&
dev_idx >= 0 && dev_idx < LOCK_MAX_DEVICES)
{
g_lock->occupied[dev_idx].store(DEVICE_FREE, std::memory_order_release);
}
}

extern "C" void cf_cleanupLock()
{
if (g_lock != nullptr && g_lock != MAP_FAILED)
munmap(g_lock, sizeof(GpuLock));
shm_unlink("/ModEM_gpu_lock");
g_lock = nullptr;
g_lock_inited = false;
}

#endif // MODEM_GPU_LOCK_H
25 changes: 20 additions & 5 deletions f90/3D_MT/FWD_SP2/hipFortMap.f90
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ end function cublasDaxpy

! hipblasZaxpy
integer(c_int) function cublasZaxpy(handle,n,alpha,x,incx,y,incy) &
& bind(C,name="hipblasZaxpy_v2")
& bind(C,name="hipblasZaxpy")
! compute y = y + a*x with complex double precision
! note that x and y should be located in GPU memory
use iso_c_binding
Expand Down Expand Up @@ -715,7 +715,7 @@ end function cublasDcopy

! hipblasZcopy
integer(c_int) function cublasZcopy(handle,n,x,incx,y,incy) &
& bind(C,name="hipblasZcopy_v2")
& bind(C,name="hipblasZcopy")
! compute y = x with complex double precision
use iso_c_binding
implicit none
Expand All @@ -729,7 +729,7 @@ end function cublasZcopy

! hipblasDdot
integer(c_int) function cublasDdot(handle,n,x,incx,y,incy,result) &
& bind(C,name="hipblasDdot_v2")
& bind(C,name="hipblasDdot")
! compute result = x dot y with double precision
use iso_c_binding
implicit none
Expand All @@ -744,7 +744,7 @@ end function cublasDdot

! hipblasZdot
integer(c_int) function cublasZdot(handle,n,x,incx,y,incy,result) &
& bind(C,name="hipblasZdotc_v2")
& bind(C,name="hipblasZdotc")
! compute result = x dot y with complex double precision
use iso_c_binding
implicit none
Expand Down Expand Up @@ -772,7 +772,7 @@ end function cublasDnrm2

! hipblasZnrm2 there is no such thing like znrm2!
integer(c_int) function cublasZnrm2(handle,n,x,incx,norm) &
& bind(C,name="hipblasDznrm2_v2")
& bind(C,name="hipblasDznrm2")
! compute result = norm(x) in complex double precision
use iso_c_binding
implicit none
Expand Down Expand Up @@ -1704,6 +1704,21 @@ integer(c_int) function cf_resetFlag(device_idx) &
integer(c_int),value :: device_idx
end function cf_resetFlag

! cf_releaseDev - release the GPU gate so other processes can hook on
subroutine cf_releaseDev(device_idx) &
& bind(C, name="cf_releaseDev")
use iso_c_binding
implicit none
integer(c_int), value :: device_idx
end subroutine cf_releaseDev

! cf_cleanupLock - cleanup shared memory lock on program exit
subroutine cf_cleanupLock() &
& bind(C, name="cf_cleanupLock")
use iso_c_binding
implicit none
end subroutine cf_cleanupLock

end interface

end module hipFortMap
Loading
Loading