Skip to content

HIP interface #646

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 78 additions & 77 deletions Common/CUDA/GD_AwTV.cu

Large diffs are not rendered by default.

159 changes: 80 additions & 79 deletions Common/CUDA/GD_TV.cu

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions Common/CUDA/GpuIds.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "GpuIds.hpp"
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime_api.h>
#include <hip/hip_runtime_api.h>

GpuIds::~GpuIds() {
free(m_piDeviceIds); m_piDeviceIds = nullptr;
Expand Down Expand Up @@ -52,12 +52,12 @@ void GpuIds::SetAllGpus(int iTotalDeviceCount) {

bool GpuIds::AreEqualDevices() const {
int deviceCount = this->GetLength();
const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name
const int devicenamelength = 256; // The length 256 is fixed by spec of hipDeviceProp_t::name
char devicename[devicenamelength];
cudaDeviceProp deviceProp;
hipDeviceProp_t deviceProp;
for (int dev = 0; dev < deviceCount; dev++) {
// cudaSetDevice(m_piDeviceIds[dev]);
cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]);
// hipSetDevice(m_piDeviceIds[dev]);
hipGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]);
if (dev>0) {
if (strcmp(devicename, deviceProp.name) != 0) {
return false;
Expand Down
77 changes: 39 additions & 38 deletions Common/CUDA/PICCS.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
/*-------------------------------------------------------------------------
*
* CUDA functions for Steepest descend in POCS-type algorithms.
Expand Down Expand Up @@ -60,10 +61,10 @@ Codes : https://github.com/CERN/TIGRE

/* Check for a pending HIP error and abort back to MATLAB with a readable
 * message when one is found.  Kernel launches do not return errors
 * directly, so this must be invoked after every launch / runtime call.
 * (Name kept as cudaCheckErrors so existing call sites are unchanged.) */
#define cudaCheckErrors(msg) \
do { \
        hipError_t __err = hipGetLastError(); \
        if (__err != hipSuccess) { \
                mexPrintf("ERROR in: %s \n",msg);\
                mexErrMsgIdAndTxt("err",hipGetErrorString(__err));\
        } \
} while (0)

Expand Down Expand Up @@ -263,9 +264,9 @@ do { \
/* Host-side debug helper: returns true when any element of the device
 * array vec (length size) is NaN, using the isnan_device kernel to set a
 * one-byte flag on the device.
 *
 * Fixes vs. original: d_nan is zero-initialised before the kernel runs
 * (the original read uninitialised device memory when no NaN was found)
 * and is freed (the original leaked one allocation per call).
 * NOTE(review): assumes isnan_device only ever writes `true` to the flag
 * -- confirm against the kernel definition. */
bool isnan_cuda(float* vec, size_t size){
    bool *d_nan;
    bool h_nan = false;
    hipMalloc((void **)&d_nan, sizeof(bool));
    hipMemset(d_nan, 0, sizeof(bool));   /* avoid uninitialised read when no NaN present */
    isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan);
    hipMemcpy(&h_nan, d_nan, sizeof(bool), hipMemcpyDeviceToHost);
    hipFree(d_nan);                      /* was leaked every call in the original */
    return h_nan;
}
Expand All @@ -281,24 +282,24 @@ bool isnan_cuda(float* vec, size_t size){

float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2;
// memory for image
cudaMalloc(&d_image, mem_size);
cudaMalloc(&d_prior, mem_size);
hipMalloc(&d_image, mem_size);
hipMalloc(&d_prior, mem_size);

cudaCheckErrors("Malloc Image error");
cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice);
hipMemcpy(d_image, img, mem_size, hipMemcpyHostToDevice);
hipMemcpy(d_prior, prior, mem_size, hipMemcpyHostToDevice);
cudaCheckErrors("Memory Malloc and Memset: SRC");
// memory for df
cudaMalloc(&d_dimgTV, mem_size);
cudaMalloc(&d_dpiccsTV, mem_size);
hipMalloc(&d_dimgTV, mem_size);
hipMalloc(&d_dpiccsTV, mem_size);
cudaCheckErrors("Memory Malloc and Memset: TV");
cudaMalloc(&d_norm2, mem_size);
hipMalloc(&d_norm2, mem_size);
cudaCheckErrors("Memory Malloc and Memset: TV");
cudaMalloc(&d_aux_image, mem_size);
hipMalloc(&d_aux_image, mem_size);
cudaCheckErrors("Memory Malloc and Memset: TV");

// memory for L2norm auxiliar
cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS);
hipMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS);
cudaCheckErrors("Memory Malloc and Memset: NORMAux");


Expand All @@ -315,64 +316,64 @@ bool isnan_cuda(float* vec, size_t size){

for(unsigned int i=0;i<maxIter;i++){

cudaMemcpy( d_aux_image,d_image, mem_size, cudaMemcpyDeviceToDevice);
hipMemcpy( d_aux_image,d_image, mem_size, hipMemcpyDeviceToDevice);
// mexPrintf("Iteration %d\n",(int)i);

// Compute the gradient of the TV norm
gradientTV<<<gridGrad, blockGrad>>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Gradient");
// mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");


multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Multiplication error");

substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Substraction error");

gradientTV<<<gridGrad, blockGrad>>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Gradient");
// mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");

multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Multiplication error");
// mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");


addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
//NOMRALIZE via reduction
//mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false");
cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice);
hipMemcpy(d_norm2, d_dimgTV, mem_size, hipMemcpyDeviceToDevice);
cudaCheckErrors("Copy from gradient call error");
reduceNorm2 << <dimgridRed, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_norm2, d_aux_small, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("reduce1");
if (dimgridRed > 1) {
reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("reduce2");
cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy");
hipMemcpy(&sumnorm2, d_norm2, sizeof(float), hipMemcpyDeviceToHost);
cudaCheckErrors("hipMemcpy");

}
else {
cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy");
hipMemcpy(&sumnorm2, d_aux_small, sizeof(float), hipMemcpyDeviceToHost);
cudaCheckErrors("hipMemcpy");
}
// mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2));
//MULTIPLY HYPERPARAMETER sqrt(sumnorm2)
multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Multiplication error");
//SUBSTRACT GRADIENT
substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels);
cudaDeviceSynchronize();
hipDeviceSynchronize();
cudaCheckErrors("Substraction error");
// mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? "true" : "false");
// mexPrintf("\n");
Expand All @@ -381,18 +382,18 @@ bool isnan_cuda(float* vec, size_t size){

cudaCheckErrors("TV minimization");

cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost);
hipMemcpy(dst, d_image, mem_size, hipMemcpyDeviceToHost);
cudaCheckErrors("Copy result back");

cudaFree(d_image);
cudaFree(d_dpiccsTV);
cudaFree(d_aux_image);
cudaFree(d_aux_small);
cudaFree(d_prior);
cudaFree(d_norm2);
hipFree(d_image);
hipFree(d_dpiccsTV);
hipFree(d_aux_image);
hipFree(d_aux_small);
hipFree(d_prior);
hipFree(d_norm2);


cudaCheckErrors("Memory free");
cudaDeviceReset();
hipDeviceReset();
}

90 changes: 46 additions & 44 deletions Common/CUDA/RandomNumberGenerator.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
/*-------------------------------------------------------------------------
*
* CUDA functions for random number generator
Expand Down Expand Up @@ -45,48 +46,49 @@

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand_kernel.h>
#include <curand.h>
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <hiprand/hiprand.h>
#include <hiprand/hiprand.h>

#include "gpuUtils.hpp"
#include "RandomNumberGenerator.hpp"

/* Check for a pending HIP error; on failure print the caller-supplied
 * context, reset the device (releases all allocations of this process on
 * it) and raise a MATLAB error.  Call after every launch / runtime call.
 * (Name kept as cudaCheckErrors so existing call sites are unchanged.) */
#define cudaCheckErrors(msg) \
do { \
        hipError_t __err = hipGetLastError(); \
        if (__err != hipSuccess) { \
                mexPrintf("%s \n",msg);\
                hipDeviceReset();\
                mexErrMsgIdAndTxt("RandomNumberGenerator:",hipGetErrorString(__err));\
        } \
} while (0)


/* Initialize one hipRAND state per launched thread.
 * Every thread uses the same seed (1234) with its global thread index as
 * the sequence number and no offset, so the per-thread streams are
 * statistically independent and runs are reproducible.
 * Expects state[] to hold at least gridDim.x*blockDim.x entries. */
__global__ void setup_kernel(hiprandState *state) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    hiprand_init(1234, idx, 0, &state[idx]);
}

/* Per element: pfOut[i] = (float)Poisson(pfIn[i]).
 * Each thread covers ceil(uiLen / totalThreads) elements in blocks of
 * totalThreads, so any launch configuration covers the whole array.
 * state[] must have been initialised by setup_kernel for this launch
 * geometry. */
__global__ void GeneratePoisson(hiprandState *state, const float* pfIn, size_t uiLen, float* pfOut) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    /* Work on a register-local copy of the RNG state for efficiency. */
    hiprandState localState = state[idx];
    int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x);
    for (int iI = 0; iI < iIter; ++iI) {
        size_t uiPos = (size_t)blockDim.x*gridDim.x*iI + idx;
        if (uiPos < uiLen) {
            /* Poisson sample with mean pfIn[uiPos] */
            unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]);
            pfOut[uiPos] = (float)uiPoisson;
        }
    }
    /* Persist the advanced state so later launches continue the stream. */
    state[idx] = localState;
}

/* Per element: pfOut[i] = Poisson(pfIn[i]) + N(fGaussMu, fGaussSigma),
 * i.e. quantum (photon) noise plus additive electronic noise.
 * Same block-strided coverage scheme as GeneratePoisson; state[] must be
 * initialised by setup_kernel for this launch geometry.
 * NOTE(review): the signature tail (fGaussSigma, pfOut) and the final
 * state write-back fall in a collapsed diff region and are reconstructed
 * from the call site and from GeneratePoisson -- confirm against the
 * full file. */
__global__ void GeneratePoissonAddGaussian(hiprandState *state,
                                           const float* pfIn,
                                           size_t uiLen,
                                           float fGaussMu,
                                           float fGaussSigma,
                                           float* pfOut)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    /* Work on a register-local copy of the RNG state for efficiency. */
    hiprandState localState = state[idx];
    int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x);
    for (int iI = 0; iI < iIter; ++iI) {
        size_t uiPos = (size_t)blockDim.x*gridDim.x*iI + idx;
        if (uiPos < uiLen) {
            /* Poisson (photon) component */
            unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]);
            /* Additive Gaussian (electronic) component */
            float fNormal = hiprand_normal(&localState) * fGaussSigma + fGaussMu;
            pfOut[uiPos] = fNormal + (float)uiPoisson;
        }
    }
    /* Persist the advanced state so later launches continue the stream. */
    state[idx] = localState;
}
Expand All @@ -127,31 +129,31 @@ void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpu
// printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut);
float* d_pfIn = nullptr;
float* d_pfOut = nullptr;
cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail cudaMalloc 1");
cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail cudaMalloc 2");
cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("poisson_1d fail cudaMemcpy 1");
hipMalloc((void **)&d_pfIn, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail hipMalloc 1");
hipMalloc((void **)&d_pfOut, uiLen * sizeof(float));
cudaCheckErrors("poisson_1d fail hipMalloc 2");
hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice);
cudaCheckErrors("poisson_1d fail hipMemcpy 1");

// float fMin, fMax;
// GetMinMax(pfIn, uiLen, fMin, fMax);
// printf("fMin, fMax = %f, %f\n", fMin, fMax);
curandState *curandStates = nullptr;
hiprandState *curandStates = nullptr;
const int kiBlockDim = 1024; // Threads per Block
const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim;
cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState));
cudaCheckErrors("poisson_1d fail cudaMalloc 3");
hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState));
cudaCheckErrors("poisson_1d fail hipMalloc 3");
setup_kernel<<<kiGridDim, kiBlockDim>>>(curandStates);
GeneratePoisson<<<kiGridDim, kiBlockDim>>>(curandStates, d_pfIn, uiLen, d_pfOut);
cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("poisson_1d fail cudaMemcpy 2");
hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost);
cudaCheckErrors("poisson_1d fail hipMemcpy 2");
// GetMinMax(pfOut, uiLen, fMin, fMax);
// printf("fMin, fMax = %f, %f\n", fMin, fMax);

cudaFree(d_pfIn); d_pfIn = nullptr;
cudaFree(d_pfOut); d_pfOut = nullptr;
cudaFree(curandStates); curandStates = nullptr;
hipFree(d_pfIn); d_pfIn = nullptr;
hipFree(d_pfOut); d_pfOut = nullptr;
hipFree(curandStates); curandStates = nullptr;
}

/* Host entry point: pfOut[i] = Poisson(pfIn[i]) + N(fGaussMu, fGaussSigma)
 * for each element -- quantum noise plus additive electronic noise.
 * Host in / host out; device buffers and RNG states are allocated, used
 * and freed entirely inside this call.
 * NOTE(review): parameters past pfIn are reconstructed from the debug
 * printf and the kernel call site (collapsed diff region) -- confirm
 * against the header.  gpuids is not used in the visible body. */
void poisson_gaussian_1d(const float* pfIn,
                         size_t uiLen,
                         float fGaussMu,
                         float fGaussSigma,
                         float* pfOut,
                         const GpuIds& gpuids) {
    float* d_pfIn = nullptr;
    float* d_pfOut = nullptr;
    hipMalloc((void **)&d_pfIn, uiLen * sizeof(float));
    cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 1");
    hipMalloc((void **)&d_pfOut, uiLen * sizeof(float));
    cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 2");
    hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice);
    cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 1");

    /* One RNG state per thread of the fixed 64x64 launch. */
    hiprandState *curandStates = nullptr;
    const int kiBlockDim = 64; /* threads per block */
    const int kiGridDim = 64;
    hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState));
    cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 3");
    setup_kernel<<<kiGridDim, kiBlockDim>>>(curandStates);
    GeneratePoissonAddGaussian<<<kiGridDim, kiBlockDim>>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut);
    /* Blocking copy-back also synchronizes with the kernels above. */
    hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost);
    cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 2");

    hipFree(d_pfIn); d_pfIn = nullptr;
    hipFree(d_pfOut); d_pfOut = nullptr;
    hipFree(curandStates); curandStates = nullptr;
}
Loading