Merge pull request #54 from dong-hao/feature/add-atomic-gpu-lock

dong-hao · web-flow · commit 44b47c3a832a · 2026-05-19T13:50:36.000+08:00
feat: GPU locking via atomic operation using POSIX shared memory

It seems my (production) tests on the new gpu lock feature ran without problems - I will just go ahead and merge it (it is blocking as I plan to back port the new GPU parallel routines to the main).
diff --git a/f90/3D_MT/FWD_SP2/EMsolve3D.f90 b/f90/3D_MT/FWD_SP2/EMsolve3D.f90
@@ -542,6 +542,15 @@ subroutine FWDSolve3D(bRHS,omega,eSol,device_id)
     Call deall(tvec)
     deallocate(KSSiter%rerr)
 
+    ! Release GPU lock so other processes can hook on
+    ! NOTE: do NOT call cf_cleanupLock() here — the shared-memory lock
+    ! must persist across transmitter iterations
+#if defined(CUDA) || defined(HIP)
+    if (device_id >= 0) then
+        call cf_releaseDev(device_id)
+    end if
+#endif
+
   end subroutine FWDsolve3D
 
 #if defined(MPI) && defined(FG)
@@ -1194,6 +1203,15 @@ subroutine FWDsolve3Dfg(bRHS,omega,eSol,device_id,comm_local)
    &             rank_local
          end if
      end if
+
+    ! Release GPU lock so other processes can hook on
+    ! NOTE: do NOT call cf_cleanupLock() here — similar to FWDsolve3D.
+#if defined(CUDA) || defined(HIP)
+    if (device_id >= 0 .and. size_local > 1) then
+        call cf_releaseDev(device_id)
+    end if
+#endif
+
      call MPI_BARRIER(comm_local,ierr)
      if (rank_local .eq. 0) then ! Leader 
          ! deallocate local temporary arrays
diff --git a/f90/3D_MT/FWD_SP2/cudaFortMap.f90 b/f90/3D_MT/FWD_SP2/cudaFortMap.f90
@@ -1698,6 +1698,21 @@ end function cf_hookDev
    !   ! get the number of GPU devices  
    ! end function kernelc_getDevNum
 
+   ! cf_releaseDev - release the GPU gate so other processes can hook on
+   subroutine cf_releaseDev(device_idx) &
+    &              bind(C, name="cf_releaseDev")
+     use iso_c_binding
+     implicit none
+     integer(c_int), value :: device_idx
+   end subroutine cf_releaseDev
+
+   ! cf_cleanupLock - cleanup shared memory lock on program exit
+   subroutine cf_cleanupLock() &
+    &              bind(C, name="cf_cleanupLock")
+     use iso_c_binding
+     implicit none
+   end subroutine cf_cleanupLock
+
    end interface  
 
 end module
diff --git a/f90/3D_MT/FWD_SP2/gpu_lock.h b/f90/3D_MT/FWD_SP2/gpu_lock.h
@@ -0,0 +1,97 @@
+// gpu_lock.h — Platform-independent, cross-process GPU locking via POSIX 
+// shared memory.
+// Multiple MPI ranks (should...) safely target the same GPU:
+
+#ifndef MODEM_GPU_LOCK_H
+#define MODEM_GPU_LOCK_H
+
+#include <atomic>
+#include <fcntl.h>
+#include <new>
+#include <sys/mman.h>
+#include <unistd.h>
+
+// Define device state flags                                                 
+
+#define DEVICE_FREE  0
+#define DEVICE_IN_USE 1
+
+// Lock structure & global state 
+// we probably don't have more than 64 devices on a single node
+static constexpr int LOCK_MAX_DEVICES = 64;
+
+// cnstr: compiler-built-in atomic on raw int — safe before object lifetime begins.
+// All other members are std::atomic<int>, constructed by placement-new once,
+// then used via normal C++ atomic operations.
+struct alignas(64) GpuLock {
+    int cnstr;      // 0 = not constructed, 1 = atomics live
+    std::atomic<int> occupied[LOCK_MAX_DEVICES];
+};
+
+static GpuLock* g_lock       = nullptr;
+static bool     g_lock_inited = false;
+
+// Internal: create / map shared-memory segment and construct atomics 
+static inline int init_gpu_lock()
+{
+    const char* name = "/ModEM_gpu_lock";
+
+    // Try to open existing shared memory segment first 
+    int fd = shm_open(name, O_RDWR, 0600);
+    if (fd < 0) {
+        // Doesn't exist -- create it 
+        fd = shm_open(name, O_CREAT | O_RDWR, 0600);
+        if (fd < 0) return 1;
+        if (ftruncate(fd, sizeof(GpuLock)) < 0) { close(fd); return 1; }
+    }
+
+    g_lock = static_cast<GpuLock*>(mmap(nullptr, sizeof(GpuLock),
+                                         PROT_READ | PROT_WRITE,
+                                         MAP_SHARED, fd, 0));
+    close(fd);
+    if (g_lock == MAP_FAILED) { g_lock = nullptr; return 1; }
+
+    // Coordinate exactly-once construction of std::atomic members.   
+    // cnstr uses __atomic_* built-ins: safe on raw storage       
+    // the winner does placement-new on every std::atomic<int>     
+    // the other (losers) spin-wait with ACQUIRE on "cnstr" until it's 1, 
+    // then proceed to use the atomics.
+
+    if (__atomic_exchange_n(&g_lock->cnstr, 1, __ATOMIC_ACQ_REL) != 0) {
+        // Another process won the race — spin-wait until construction finishes.
+        while (!__atomic_load_n(&g_lock->cnstr, __ATOMIC_ACQUIRE))
+            ;
+    } else {
+        // We are the winner — construct all std::atomic members.
+        for (int i = 0; i < LOCK_MAX_DEVICES; i++)
+            new (&g_lock->occupied[i]) std::atomic<int>(DEVICE_FREE);
+
+        // cnstr is already 1 from the exchange above (ACQ_REL ensures
+        // the constructed atomics are visible to other processes).
+    }
+
+    g_lock_inited = true;
+    return 0;
+}
+
+// Public C-bindings (called from Fortran)                            
+
+extern "C" void cf_releaseDev(int dev_idx)
+{
+    if (g_lock_inited && g_lock != nullptr &&
+        dev_idx >= 0 && dev_idx < LOCK_MAX_DEVICES)
+    {
+        g_lock->occupied[dev_idx].store(DEVICE_FREE, std::memory_order_release);
+    }
+}
+
+extern "C" void cf_cleanupLock()
+{
+    if (g_lock != nullptr && g_lock != MAP_FAILED)
+        munmap(g_lock, sizeof(GpuLock));
+    shm_unlink("/ModEM_gpu_lock");
+    g_lock = nullptr;
+    g_lock_inited = false;
+}
+
+#endif // MODEM_GPU_LOCK_H
diff --git a/f90/3D_MT/FWD_SP2/hipFortMap.f90 b/f90/3D_MT/FWD_SP2/hipFortMap.f90
@@ -685,7 +685,7 @@ end function cublasDaxpy
 
    ! hipblasZaxpy
    integer(c_int) function cublasZaxpy(handle,n,alpha,x,incx,y,incy) &
-    &             bind(C,name="hipblasZaxpy_v2")
+    &             bind(C,name="hipblasZaxpy")
      ! compute y = y + a*x with complex double precision
      ! note that x and y should be located in GPU memory
      use iso_c_binding
@@ -715,7 +715,7 @@ end function cublasDcopy
 
    ! hipblasZcopy
    integer(c_int) function cublasZcopy(handle,n,x,incx,y,incy) &
-    &             bind(C,name="hipblasZcopy_v2")
+    &             bind(C,name="hipblasZcopy")
      ! compute y = x with complex double precision
      use iso_c_binding
      implicit none
@@ -729,7 +729,7 @@ end function cublasZcopy
 
    ! hipblasDdot
    integer(c_int) function cublasDdot(handle,n,x,incx,y,incy,result) &
-    &            bind(C,name="hipblasDdot_v2")
+    &            bind(C,name="hipblasDdot")
      ! compute result = x dot y with double precision
      use iso_c_binding
      implicit none
@@ -744,7 +744,7 @@ end function cublasDdot
 
    ! hipblasZdot
    integer(c_int) function cublasZdot(handle,n,x,incx,y,incy,result) &
-    &            bind(C,name="hipblasZdotc_v2")
+    &            bind(C,name="hipblasZdotc")
      ! compute result = x dot y with complex double precision
      use iso_c_binding
      implicit none
@@ -772,7 +772,7 @@ end function cublasDnrm2
 
    ! hipblasZnrm2 there is no such thing like znrm2!
    integer(c_int) function cublasZnrm2(handle,n,x,incx,norm) &
-    &            bind(C,name="hipblasDznrm2_v2")
+    &            bind(C,name="hipblasDznrm2")
      ! compute result = norm(x) in complex double precision
      use iso_c_binding
      implicit none
@@ -1704,6 +1704,21 @@ integer(c_int) function cf_resetFlag(device_idx) &
      integer(c_int),value ::  device_idx
    end function cf_resetFlag
 
+   ! cf_releaseDev - release the GPU gate so other processes can hook on
+   subroutine cf_releaseDev(device_idx) &
+    &              bind(C, name="cf_releaseDev")
+     use iso_c_binding
+     implicit none
+     integer(c_int), value :: device_idx
+   end subroutine cf_releaseDev
+
+   ! cf_cleanupLock - cleanup shared memory lock on program exit
+   subroutine cf_cleanupLock() &
+    &              bind(C, name="cf_cleanupLock")
+     use iso_c_binding
+     implicit none
+   end subroutine cf_cleanupLock
+
    end interface  
 
 end module hipFortMap
diff --git a/f90/3D_MT/FWD_SP2/kernel_c.cu b/f90/3D_MT/FWD_SP2/kernel_c.cu
diff --git a/f90/3D_MT/FWD_SP2/kernel_c.hip b/f90/3D_MT/FWD_SP2/kernel_c.hip