Skip to content

HIP interface #646

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 78 additions & 77 deletions Common/CUDA/GD_AwTV.cu

Large diffs are not rendered by default.

159 changes: 80 additions & 79 deletions Common/CUDA/GD_TV.cu

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions Common/CUDA/GpuIds.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "GpuIds.hpp"
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime_api.h>
#include <hip/hip_runtime_api.h>

GpuIds::~GpuIds() {
free(m_piDeviceIds); m_piDeviceIds = nullptr;
Expand Down Expand Up @@ -52,12 +52,12 @@ void GpuIds::SetAllGpus(int iTotalDeviceCount) {

bool GpuIds::AreEqualDevices() const {
int deviceCount = this->GetLength();
const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name
const int devicenamelength = 256; // The length 256 is fixed by spec of hipDeviceProp_t::name
char devicename[devicenamelength];
cudaDeviceProp deviceProp;
hipDeviceProp_t deviceProp;
for (int dev = 0; dev < deviceCount; dev++) {
// cudaSetDevice(m_piDeviceIds[dev]);
cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]);
// hipSetDevice(m_piDeviceIds[dev]);
hipGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]);
if (dev>0) {
if (strcmp(devicename, deviceProp.name) != 0) {
return false;
Expand Down
77 changes: 39 additions & 38 deletions Common/CUDA/PICCS.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
/*-------------------------------------------------------------------------
*
* CUDA functions for Steepest descend in POCS-type algorithms.
Expand Down Expand Up @@ -60,10 +61,10 @@ Codes : https://github.com/CERN/TIGRE

/* Check for a pending HIP error and abort back to MATLAB with a readable
 * message when one is found.  Kernel launches do not return errors
 * directly, so this must be invoked after every launch / runtime call.
 * (Name kept as cudaCheckErrors so existing call sites are unchanged.) */
#define cudaCheckErrors(msg) \
do { \
        hipError_t __err = hipGetLastError(); \
        if (__err != hipSuccess) { \
                mexPrintf("ERROR in: %s \n",msg);\
                mexErrMsgIdAndTxt("err",hipGetErrorString(__err));\
        } \
} while (0)

Expand Down Expand Up @@ -263,9 +264,9 @@ do { \
/* Host-side debug helper: returns true when any element of the device
 * array vec (length size) is NaN, using the isnan_device kernel to set a
 * one-byte flag on the device.
 *
 * Fixes vs. original: d_nan is zero-initialised before the kernel runs
 * (the original read uninitialised device memory when no NaN was found)
 * and is freed (the original leaked one allocation per call).
 * NOTE(review): assumes isnan_device only ever writes `true` to the flag
 * -- confirm against the kernel definition. */
bool isnan_cuda(float* vec, size_t size){
    bool *d_nan;
    bool h_nan = false;
    hipMalloc((void **)&d_nan, sizeof(bool));
    hipMemset(d_nan, 0, sizeof(bool));   /* avoid uninitialised read when no NaN present */
    isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan);
    hipMemcpy(&h_nan, d_nan, sizeof(bool), hipMemcpyDeviceToHost);
    hipFree(d_nan);                      /* was leaked every call in the original */
    return h_nan;
}
Expand All @@ -281,24 +282,24 @@ bool isnan_cuda(float* vec, size_t size){

float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2;
// memory for image
cudaMalloc(&d_image, mem_size);
cudaMalloc(&d_prior, mem_size);
hipMalloc(&d_image, mem_size);
hipMalloc(&d_prior, mem_size);

cudaCheckErrors("Malloc Image error");
cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice);
hipMemcpy(d_image, img, mem_size, hipMemcpyHostToDevice);
hipMemcpy(d_prior, prior, mem_size, hipMemcpyHostToDevice);
cudaCheckErrors("Memory Malloc and Memset: SRC");
// memory for df
cudaMalloc(&d_dimgTV, mem_size);
cudaMalloc(&d_dpiccsTV, mem_size);
hipMalloc(&d_dimgTV, mem_size);
hipMalloc(&d_dpiccsTV, mem_size);
cudaCheckErrors("Memory Malloc and Memset: TV");
cudaMalloc(&d_norm2, mem_size);
hipMalloc(&d_norm2, mem_size);
cudaCheckErrors("Memory Malloc and Memset: TV");
cudaMalloc(&d_aux_image, mem_size);
hipMalloc(&d_aux_image, mem_size);
cudaCheckErrors("Memory Malloc and Memset: TV");

// memory for L2norm auxiliar
cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS);
hipMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS);
cudaCheckErrors("Memory Malloc and Memset: NORMAux");


Expand All @@ -315,64 +316,64 @@ bool isnan_cuda(float* vec, size_t size){

for(unsigned int i=0;i<maxIter;i++){

cudaMemcpy( d_aux_image,d_image, mem_size, cudaMemcpyDeviceToDevice);
hipMemcpy( d_aux_image,d_image, mem_size, hipMemcpyDeviceToDevice);
// mexPrintf("Iteration %d\n",(int)i);

// Compute the gradient of the TV norm
gradientTV<<<gridGrad, blockGrad>>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Gradient");
// mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");


multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Multiplication error");

substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Substraction error");

gradientTV<<<gridGrad, blockGrad>>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Gradient");
// mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");

multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Multiplication error");
// mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");


addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
//NOMRALIZE via reduction
//mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");
cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice);
hipMemcpy(d_norm2, d_dimgTV, mem_size, hipMemcpyDeviceToDevice);
cudaCheckErrors("Copy from gradient call error");
reduceNorm2 << <dimgridRed, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_norm2, d_aux_small, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("reduce1");
if (dimgridRed > 1) {
reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("reduce2");
cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy");
hipMemcpy(&sumnorm2, d_norm2, sizeof(float), hipMemcpyDeviceToHost);
cudaCheckErrors("hipMemcpy");

}
else {
cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy");
hipMemcpy(&sumnorm2, d_aux_small, sizeof(float), hipMemcpyDeviceToHost);
cudaCheckErrors("hipMemcpy");
}
// mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2));
//MULTIPLY HYPERPARAMETER sqrt(sumnorm2)
multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Multiplication error");
//SUBSTRACT GRADIENT
substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Substraction error");
// mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? "true" : "false");
// mexPrintf("\n");
Expand All @@ -381,18 +382,18 @@ bool isnan_cuda(float* vec, size_t size){

cudaCheckErrors("TV minimization");

cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost);
hipMemcpy(dst, d_image, mem_size, hipMemcpyDeviceToHost);
cudaCheckErrors("Copy result back");

cudaFree(d_image);
cudaFree(d_dpiccsTV);
cudaFree(d_aux_image);
cudaFree(d_aux_small);
cudaFree(d_prior);
cudaFree(d_norm2);
hipFree(d_image);
hipFree(d_dpiccsTV);
hipFree(d_aux_image);
hipFree(d_aux_small);
hipFree(d_prior);
hipFree(d_norm2);


cudaCheckErrors("Memory free");
cudaDeviceReset();
hipDeviceReset();
}

90 changes: 46 additions & 44 deletions Common/CUDA/RandomNumberGenerator.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
/*-------------------------------------------------------------------------
*
* CUDA functions for random number generator
Expand Down Expand Up @@ -45,48 +46,49 @@

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand_kernel.h>
#include <curand.h>
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <hiprand/hiprand.h>
#include <hiprand/hiprand.h>

#include "gpuUtils.hpp"
#include "RandomNumberGenerator.hpp"

/* Check for a pending HIP error; on failure print the caller-supplied
 * context, reset the device (releases all allocations of this process on
 * it) and raise a MATLAB error.  Call after every launch / runtime call.
 * (Name kept as cudaCheckErrors so existing call sites are unchanged.) */
#define cudaCheckErrors(msg) \
do { \
        hipError_t __err = hipGetLastError(); \
        if (__err != hipSuccess) { \
                mexPrintf("%s \n",msg);\
                hipDeviceReset();\
                mexErrMsgIdAndTxt("RandomNumberGenerator:",hipGetErrorString(__err));\
        } \
} while (0)


/* Initialize one hipRAND state per launched thread.
 * Every thread uses the same seed (1234) with its global thread index as
 * the sequence number and no offset, so the per-thread streams are
 * statistically independent and runs are reproducible.
 * Expects state[] to hold at least gridDim.x*blockDim.x entries. */
__global__ void setup_kernel(hiprandState *state) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    hiprand_init(1234, idx, 0, &state[idx]);
}

/* Per element: pfOut[i] = (float)Poisson(pfIn[i]).
 * Each thread covers ceil(uiLen / totalThreads) elements in blocks of
 * totalThreads, so any launch configuration covers the whole array.
 * state[] must have been initialised by setup_kernel for this launch
 * geometry. */
__global__ void GeneratePoisson(hiprandState *state, const float* pfIn, size_t uiLen, float* pfOut) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    /* Work on a register-local copy of the RNG state for efficiency. */
    hiprandState localState = state[idx];
    int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x);
    for (int iI = 0; iI < iIter; ++iI) {
        size_t uiPos = (size_t)blockDim.x*gridDim.x*iI + idx;
        if (uiPos < uiLen) {
            /* Poisson sample with mean pfIn[uiPos] */
            unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]);
            pfOut[uiPos] = (float)uiPoisson;
        }
    }
    /* Persist the advanced state so later launches continue the stream. */
    state[idx] = localState;
}

/* Per element: pfOut[i] = Poisson(pfIn[i]) + N(fGaussMu, fGaussSigma),
 * i.e. quantum (photon) noise plus additive electronic noise.
 * Same block-strided coverage scheme as GeneratePoisson; state[] must be
 * initialised by setup_kernel for this launch geometry.
 * NOTE(review): the signature tail (fGaussSigma, pfOut) and the final
 * state write-back fall in a collapsed diff region and are reconstructed
 * from the call site and from GeneratePoisson -- confirm against the
 * full file. */
__global__ void GeneratePoissonAddGaussian(hiprandState *state,
                                           const float* pfIn,
                                           size_t uiLen,
                                           float fGaussMu,
                                           float fGaussSigma,
                                           float* pfOut)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    /* Work on a register-local copy of the RNG state for efficiency. */
    hiprandState localState = state[idx];
    int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x);
    for (int iI = 0; iI < iIter; ++iI) {
        size_t uiPos = (size_t)blockDim.x*gridDim.x*iI + idx;
        if (uiPos < uiLen) {
            /* Poisson (photon) component */
            unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]);
            /* Additive Gaussian (electronic) component */
            float fNormal = hiprand_normal(&localState) * fGaussSigma + fGaussMu;
            pfOut[uiPos] = fNormal + (float)uiPoisson;
        }
    }
    /* Persist the advanced state so later launches continue the stream. */
    state[idx] = localState;
}
Expand All @@ -127,31 +129,31 @@ void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpu
// printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut);
float* d_pfIn = nullptr;
float* d_pfOut = nullptr;
cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail cudaMalloc 1");
cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail cudaMalloc 2");
cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("poisson_1d fail cudaMemcpy 1");
hipMalloc((void **)&d_pfIn, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail hipMalloc 1");
hipMalloc((void **)&d_pfOut, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail hipMalloc 2");
hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice);
cudaCheckErrors("poisson_1d fail hipMemcpy 1");

// float fMin, fMax;
// GetMinMax(pfIn, uiLen, fMin, fMax);
// printf("fMin, fMax = %f, %f\n", fMin, fMax);
curandState *curandStates = nullptr;
hiprandState *curandStates = nullptr;
const int kiBlockDim = 1024; // Threads per Block
const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim;
cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState));
cudaCheckErrors("poisson_1d fail cudaMalloc 3");
hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState));
cudaCheckErrors("poisson_1d fail hipMalloc 3");
setup_kernel<<<kiGridDim, kiBlockDim>>>(curandStates);
GeneratePoisson<<<kiGridDim, kiBlockDim>>>(curandStates, d_pfIn, uiLen, d_pfOut);
cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("poisson_1d fail cudaMemcpy 2");
hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost);
cudaCheckErrors("poisson_1d fail hipMemcpy 2");
// GetMinMax(pfOut, uiLen, fMin, fMax);
// printf("fMin, fMax = %f, %f\n", fMin, fMax);

cudaFree(d_pfIn); d_pfIn = nullptr;
cudaFree(d_pfOut); d_pfOut = nullptr;
cudaFree(curandStates); curandStates = nullptr;
hipFree(d_pfIn); d_pfIn = nullptr;
hipFree(d_pfOut); d_pfOut = nullptr;
hipFree(curandStates); curandStates = nullptr;
}

/* Host entry point: pfOut[i] = Poisson(pfIn[i]) + N(fGaussMu, fGaussSigma)
 * for each element -- quantum noise plus additive electronic noise.
 * Host in / host out; device buffers and RNG states are allocated, used
 * and freed entirely inside this call.
 * NOTE(review): parameters past pfIn are reconstructed from the debug
 * printf and the kernel call site (collapsed diff region) -- confirm
 * against the header.  gpuids is not used in the visible body. */
void poisson_gaussian_1d(const float* pfIn,
                         size_t uiLen,
                         float fGaussMu,
                         float fGaussSigma,
                         float* pfOut,
                         const GpuIds& gpuids) {
    float* d_pfIn = nullptr;
    float* d_pfOut = nullptr;
    hipMalloc((void **)&d_pfIn, uiLen * sizeof(float));
    cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 1");
    hipMalloc((void **)&d_pfOut, uiLen * sizeof(float));
    cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 2");
    hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice);
    cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 1");

    /* One RNG state per thread of the fixed 64x64 launch. */
    hiprandState *curandStates = nullptr;
    const int kiBlockDim = 64; /* threads per block */
    const int kiGridDim = 64;
    hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState));
    cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 3");
    setup_kernel<<<kiGridDim, kiBlockDim>>>(curandStates);
    GeneratePoissonAddGaussian<<<kiGridDim, kiBlockDim>>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut);
    /* Blocking copy-back also synchronizes with the kernels above. */
    hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost);
    cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 2");

    hipFree(d_pfIn); d_pfIn = nullptr;
    hipFree(d_pfOut); d_pfOut = nullptr;
    hipFree(curandStates); curandStates = nullptr;
}
Loading