Commit b616580

docs(research): Update multi-GPU HeFFTe research documentation and examples
- Add support for HeFFTe build without GPU-aware MPI (for testing)
- Update CMakeLists.txt to support HEFFTE_NO_GPU_AWARE environment variable
- Document successful multi-GPU testing without GPU-aware MPI
- Add detailed instructions for building and testing multi-GPU setup
- Update cuda_fft_example.cpp with GPU device verification and memory reporting
- Add GPU UUID and device property information to output
- Document that multi-GPU works via CPU transfer fallback (slower but functional)
- Add performance notes about GPU-aware MPI benefits
1 parent 3361e82 commit b616580

3 files changed: +116 -5 lines changed

research/multigpu_heffte/CMakeLists.txt

Lines changed: 9 additions & 2 deletions
@@ -10,8 +10,15 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 find_package(MPI REQUIRED)
 
 # Find HeFFTe from our custom installation
-# The HeffteConfig.cmake is located at ~/opt/heffte/2.4.1-cuda/lib64/cmake/Heffte/
-set(Heffte_DIR "$ENV{HOME}/opt/heffte/2.4.1-cuda/lib64/cmake/Heffte")
+# Option to use version without GPU-aware MPI for testing
+# Set environment variable: export HEFFTE_NO_GPU_AWARE=1
+if(DEFINED ENV{HEFFTE_NO_GPU_AWARE} AND $ENV{HEFFTE_NO_GPU_AWARE} EQUAL 1)
+    set(Heffte_DIR "$ENV{HOME}/opt/heffte/2.4.1-cuda-no-gpuaware/lib64/cmake/Heffte")
+    message(STATUS "Using HeFFTe WITHOUT GPU-aware MPI (for testing)")
+else()
+    set(Heffte_DIR "$ENV{HOME}/opt/heffte/2.4.1-cuda/lib64/cmake/Heffte")
+    message(STATUS "Using HeFFTe WITH GPU-aware MPI (default)")
+endif()
 find_package(Heffte 2.4.1 REQUIRED PATHS ${Heffte_DIR})
 
 # Print information about found HeFFTe

research/multigpu_heffte/README.md

Lines changed: 69 additions & 2 deletions
@@ -105,7 +105,10 @@ mpirun -np 2 ./cuda_fft_example
 
 **Note:**
 - **Single GPU (1 MPI rank) works fine** - verified locally ✓
-- **Multi-GPU (2+ MPI ranks) fails** - requires GPU-aware MPI support which is not available in the current OpenMPI installation
+- **Multi-GPU (2+ MPI ranks) works** - verified on cluster with GPU-aware MPI disabled ✓
+- Without GPU-aware MPI: HeFFTe automatically transfers data to CPU for MPI communication, then back to GPU
+- This works but is slower than GPU-aware MPI
+- To use this mode, rebuild with `HEFFTE_NO_GPU_AWARE=1` (see instructions below)
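The transfer-through-CPU fallback mentioned in the note works, in outline, like the sketch below. This is an illustration of the pattern only, not HeFFTe's internal reshape code; the function name `staged_alltoall` and the assumption of symmetric, precomputed counts and displacements are hypothetical.

```cpp
#include <mpi.h>
#include <cuda_runtime.h>
#include <complex>
#include <vector>

// Exchange GPU-resident data by staging it through host memory:
// device -> host copy, MPI exchange on host buffers, host -> device copy.
// This is the pattern a library falls back to when the MPI implementation
// cannot accept CUDA device pointers (i.e. no GPU-aware MPI).
void staged_alltoall(const std::complex<double>* d_send,
                     std::complex<double>* d_recv,
                     const std::vector<int>& counts,
                     const std::vector<int>& displs,
                     int total_count, MPI_Comm comm) {
    std::vector<std::complex<double>> h_send(total_count), h_recv(total_count);

    // 1) Pull the send buffer off the GPU.
    cudaMemcpy(h_send.data(), d_send,
               total_count * sizeof(std::complex<double>), cudaMemcpyDeviceToHost);

    // 2) Exchange on the host; no device pointers ever reach MPI.
    MPI_Alltoallv(h_send.data(), counts.data(), displs.data(), MPI_C_DOUBLE_COMPLEX,
                  h_recv.data(), counts.data(), displs.data(), MPI_C_DOUBLE_COMPLEX,
                  comm);

    // 3) Push the received data back onto the GPU.
    cudaMemcpy(d_recv, h_recv.data(),
               total_count * sizeof(std::complex<double>), cudaMemcpyHostToDevice);
}
```

The two extra `cudaMemcpy` calls per exchange are exactly the overhead referred to above; with GPU-aware MPI the device pointers would be handed to `MPI_Alltoallv` directly.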
 
 ### Implementation Details
 
@@ -228,8 +231,72 @@ grep -i "GPU_AWARE" ~/dev/heffte/build_2.4.1_cuda/configured/summary.txt
 - This causes segmentation faults in multi-GPU scenarios when HeFFTe tries to use GPU-aware MPI for inter-GPU communication
 - Single GPU operations work fine, but multi-GPU requires GPU-aware MPI support
 
+### Testing Multi-GPU Without GPU-Aware MPI
+
+**Status: ✓ CONFIRMED - Multi-GPU works without GPU-aware MPI!**
+
+We successfully tested HeFFTe with 2 GPUs on the cluster. Without GPU-aware MPI, HeFFTe automatically transfers data to CPU for MPI communication, then back to GPU. This works but is slower than GPU-aware MPI.
+
+**To test multi-GPU (already done, but here's how to repeat):**
+
+1. **Rebuild HeFFTe without GPU-aware MPI** (if not already done):
+```bash
+cd ~/dev/heffte
+mkdir -p build_2.4.1_cuda_no_gpuaware
+cd build_2.4.1_cuda_no_gpuaware
+module load cuda
+cmake .. \
+  -DCMAKE_INSTALL_PREFIX=~/opt/heffte/2.4.1-cuda-no-gpuaware \
+  -DHeffte_ENABLE_FFTW=ON \
+  -DHeffte_ENABLE_CUDA=ON \
+  -DHeffte_ENABLE_GPU_AWARE_MPI=OFF \
+  -DCMAKE_BUILD_TYPE=Release
+make -j$(nproc)
+make install
+```
+
+2. **Rebuild the example with the non-GPU-aware version:**
+```bash
+cd research/multigpu_heffte/build
+export HEFFTE_NO_GPU_AWARE=1
+cmake ..
+make
+```
+
+3. **Submit job for 2 GPU test:**
+```bash
+cd research/multigpu_heffte
+sbatch run_2gpu_test.sh
+```
+
+**Test Results:**
+- ✓ Job completed successfully (ExitCode: 0:0)
+- ✓ 2 MPI ranks running on 2 different GPUs (GPU 0 and GPU 1)
+- ✓ Domain decomposition working: Rank 0 handles z=[0,31], Rank 1 handles z=[32,63]
+- ✓ Forward FFT completed
+- ✓ Laplacian operator applied in Fourier domain
+- ✓ No segmentation faults
+
+**Performance Note:** Without GPU-aware MPI, data is transferred CPU↔GPU for MPI communication, which adds overhead. For production use, GPU-aware MPI would be preferred for better performance.
+
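To put a number on that overhead, a collective timer such as the sketch below can be wrapped around the forward transform in both builds (with and without GPU-aware MPI) and the two timings compared. The helper name `time_collective` is hypothetical; only standard MPI calls are used, and the usage line assumes the `fft`, `gpu_input`, and `gpu_output` names from cuda_fft_example.cpp.

```cpp
#include <mpi.h>
#include <iostream>

// Time a code region collectively and report the slowest rank's duration on rank 0.
template <typename F>
void time_collective(const char* label, F&& region, MPI_Comm comm = MPI_COMM_WORLD) {
    int rank;
    MPI_Comm_rank(comm, &rank);

    MPI_Barrier(comm);                  // start all ranks together
    double t0 = MPI_Wtime();
    region();                           // e.g. the forward FFT
    MPI_Barrier(comm);                  // wait for the slowest rank
    double elapsed = MPI_Wtime() - t0;

    double max_elapsed = 0.0;
    MPI_Reduce(&elapsed, &max_elapsed, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
    if (rank == 0)
        std::cout << label << ": " << max_elapsed << " s (max over ranks)" << std::endl;
}

// Usage (names from the example, assumed):
// time_collective("forward FFT", [&]{ fft.forward(gpu_input.data(), gpu_output.data()); });
```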
+### Checking GPU-Aware MPI Status
+
+**Check OpenMPI GPU-aware support:**
+```bash
+ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+```
+
+**Check HeFFTe GPU-aware MPI configuration:**
+```bash
+# Current build (with GPU-aware MPI)
+grep -i "GPU_AWARE" ~/dev/heffte/build_2.4.1_cuda/CMakeCache.txt
+
+# Test build (without GPU-aware MPI)
+grep -i "GPU_AWARE" ~/dev/heffte/build_2.4.1_cuda_no_gpuaware/CMakeCache.txt
+```
+
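Besides the `ompi_info` query above, Open MPI also exposes a compile-time macro and a run-time function for the same check. The standalone program below is a sketch of that check; `MPIX_CUDA_AWARE_SUPPORT` and `MPIX_Query_cuda_support()` are Open MPI extensions declared in `mpi-ext.h` and are not available in other MPI implementations.

```cpp
#include <mpi.h>
#include <iostream>
#if defined(OPEN_MPI)
#include <mpi-ext.h>   // Open MPI extensions; defines MPIX_CUDA_AWARE_SUPPORT
#endif

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    // Compile-time answer: was this MPI library built with CUDA support?
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    std::cout << "Compile time: CUDA-aware support present\n";
#elif defined(MPIX_CUDA_AWARE_SUPPORT)
    std::cout << "Compile time: no CUDA-aware support\n";
#else
    std::cout << "Compile time: cannot determine CUDA-aware support\n";
#endif

    // Run-time answer (Open MPI only): is CUDA support actually enabled?
#if defined(MPIX_CUDA_AWARE_SUPPORT)
    std::cout << "Run time: "
              << (MPIX_Query_cuda_support() ? "CUDA-aware support enabled"
                                            : "CUDA-aware support disabled")
              << std::endl;
#endif

    MPI_Finalize();
    return 0;
}
```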
 ### Next Steps
 
 - Create a more complex demo that uses multiple GPU cards
 - Reference examples from `~/dev/heffte/examples/` to understand how to implement multi-GPU usage in practice
-- Verify GPU-aware MPI support before attempting multi-GPU runs
+- Test multi-GPU performance with and without GPU-aware MPI

research/multigpu_heffte/cuda_fft_example.cpp

Lines changed: 38 additions & 1 deletion
@@ -90,8 +90,27 @@ int main(int argc, char **argv) {
     if (heffte::gpu::device_count() > 0) {
         int device_id = my_rank % heffte::gpu::device_count();
         heffte::gpu::device_set(device_id);
+
+        // Verify we're on the correct device and get device properties
+        int current_device;
+        cudaGetDevice(&current_device);
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, current_device);
+
+        // Format UUID
+        char uuid_str[64];
+        snprintf(uuid_str, sizeof(uuid_str),
+                 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+                 prop.uuid.bytes[0], prop.uuid.bytes[1], prop.uuid.bytes[2],
+                 prop.uuid.bytes[3], prop.uuid.bytes[4], prop.uuid.bytes[5],
+                 prop.uuid.bytes[6], prop.uuid.bytes[7], prop.uuid.bytes[8],
+                 prop.uuid.bytes[9], prop.uuid.bytes[10], prop.uuid.bytes[11],
+                 prop.uuid.bytes[12], prop.uuid.bytes[13], prop.uuid.bytes[14],
+                 prop.uuid.bytes[15]);
+
         std::cout << "Rank " << my_rank << " using GPU device " << device_id
-                  << std::endl;
+                  << " (verified: " << current_device << ")"
+                  << " - " << prop.name << " [UUID: " << uuid_str << "]" << std::endl;
     } else {
         if (my_rank == 0) {
             std::cerr << "ERROR: No CUDA devices found!" << std::endl;
@@ -153,9 +172,27 @@ int main(int argc, char **argv) {
     }
 
     // Transfer input to GPU
+    // Verify GPU memory allocation on correct device
+    int device_before;
+    cudaGetDevice(&device_before);
+    size_t free_mem_before, total_mem_before;
+    cudaMemGetInfo(&free_mem_before, &total_mem_before);
+
     heffte::gpu::vector<std::complex<double>> gpu_input =
         heffte::gpu::transfer().load(input);
 
+    // Verify memory was allocated on correct device
+    int device_after;
+    cudaGetDevice(&device_after);
+    size_t free_mem_after, total_mem_after;
+    cudaMemGetInfo(&free_mem_after, &total_mem_after);
+    size_t mem_used = free_mem_before - free_mem_after;
+
+    std::cout << "Rank " << my_rank << " GPU " << device_after << " memory: used "
+              << mem_used / (1024 * 1024) << " MB"
+              << " (free: " << free_mem_after / (1024 * 1024) << " MB / "
+              << total_mem_after / (1024 * 1024) << " MB total)" << std::endl;
+
     // Allocate GPU memory for FFT output
     heffte::gpu::vector<std::complex<double>> gpu_output(fft.size_outbox());
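One detail the added verification code does not cover is error handling: the cudaGetDevice, cudaGetDeviceProperties, and cudaMemGetInfo calls all return a cudaError_t that is currently ignored. A small checking macro along the lines of the sketch below (the name `CUDA_CHECK` is a common convention, not part of the example) would surface a failing call immediately instead of letting it produce misleading device or memory numbers.

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::fprintf(stderr, "CUDA error %s at %s:%d: %s\n",             \
                         cudaGetErrorName(err_), __FILE__, __LINE__,         \
                         cudaGetErrorString(err_));                          \
            std::abort();                                                    \
        }                                                                    \
    } while (0)

// Example: wrap the calls added in this commit, e.g.
//   CUDA_CHECK(cudaGetDevice(&current_device));
//   CUDA_CHECK(cudaMemGetInfo(&free_mem_after, &total_mem_after));
```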
