Skip to content

Commit 5a05fad

Browse files
authored
Merge pull request #1750 from CEED/jeremy/no-handroll-blas
gpu - prefer cu/hipBlas over handrolls
2 parents d7c5932 + e84c3eb commit 5a05fad

File tree

2 files changed

+341
-211
lines changed

2 files changed

+341
-211
lines changed

backends/cuda-ref/ceed-cuda-ref-vector.c

Lines changed: 168 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -253,14 +253,30 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
253253
CeedScalar *copy_array;
254254

255255
CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
256+
#if (CUDA_VERSION >= 12000)
257+
cublasHandle_t handle;
258+
Ceed ceed;
259+
260+
CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
261+
CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
262+
#if defined(CEED_SCALAR_IS_FP32)
263+
CeedCallCublas(ceed, cublasScopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
264+
#else /* CEED_SCALAR */
265+
CeedCallCublas(ceed, cublasDcopy_64(handle, (int64_t)length, impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
266+
#endif /* CEED_SCALAR */
267+
CeedCallBackend(CeedDestroy(&ceed));
268+
#else /* CUDA_VERSION */
256269
CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, step, length, copy_array));
270+
#endif /* CUDA_VERSION */
257271
CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
272+
impl->h_array = NULL;
258273
} else if (impl->h_array) {
259274
CeedScalar *copy_array;
260275

261276
CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
262277
CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, step, length, copy_array));
263278
CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
279+
impl->d_array = NULL;
264280
} else {
265281
return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
266282
}
@@ -459,9 +475,9 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType
459475
static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
460476
Ceed ceed;
461477
CeedSize length;
462-
#if CUDA_VERSION < 12000
478+
#if (CUDA_VERSION < 12000)
463479
CeedSize num_calls;
464-
#endif
480+
#endif /* CUDA_VERSION */
465481
const CeedScalar *d_array;
466482
CeedVector_Cuda *impl;
467483
cublasHandle_t handle;
@@ -471,142 +487,142 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
471487
CeedCallBackend(CeedVectorGetLength(vec, &length));
472488
CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
473489

474-
#if CUDA_VERSION < 12000
490+
#if (CUDA_VERSION < 12000)
475491
// With CUDA 12, we can use the 64-bit integer interface. Prior to that,
476492
// we need to check if the vector is too long to handle with int32,
477493
// and if so, divide it into subsections for repeated cuBLAS calls.
478494
num_calls = length / INT_MAX;
479495
if (length % INT_MAX > 0) num_calls += 1;
480-
#endif
496+
#endif /* CUDA_VERSION */
481497

482498
// Compute norm
483499
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
484500
switch (type) {
485501
case CEED_NORM_1: {
486502
*norm = 0.0;
487-
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
488-
#if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers
489-
CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
490-
#else
491-
float sub_norm = 0.0;
492-
float *d_array_start;
493-
494-
for (CeedInt i = 0; i < num_calls; i++) {
495-
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
496-
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
497-
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
498-
499-
CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
500-
*norm += sub_norm;
501-
}
502-
#endif
503-
} else {
504-
#if CUDA_VERSION >= 12000
505-
CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
506-
#else
507-
double sub_norm = 0.0;
508-
double *d_array_start;
509-
510-
for (CeedInt i = 0; i < num_calls; i++) {
511-
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
512-
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
513-
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
514-
515-
CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
516-
*norm += sub_norm;
517-
}
518-
#endif
503+
#if defined(CEED_SCALAR_IS_FP32)
504+
#if (CUDA_VERSION >= 12000) // We have CUDA 12, and can use 64-bit integers
505+
CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
506+
#else /* CUDA_VERSION */
507+
float sub_norm = 0.0;
508+
float *d_array_start;
509+
510+
for (CeedInt i = 0; i < num_calls; i++) {
511+
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
512+
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
513+
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
514+
515+
CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
516+
*norm += sub_norm;
519517
}
518+
#endif /* CUDA_VERSION */
519+
#else /* CEED_SCALAR */
520+
#if (CUDA_VERSION >= 12000)
521+
CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
522+
#else /* CUDA_VERSION */
523+
double sub_norm = 0.0;
524+
double *d_array_start;
525+
526+
for (CeedInt i = 0; i < num_calls; i++) {
527+
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
528+
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
529+
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
530+
531+
CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
532+
*norm += sub_norm;
533+
}
534+
#endif /* CUDA_VERSION */
535+
#endif /* CEED_SCALAR */
520536
break;
521537
}
522538
case CEED_NORM_2: {
523-
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
524-
#if CUDA_VERSION >= 12000
525-
CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
526-
#else
527-
float sub_norm = 0.0, norm_sum = 0.0;
528-
float *d_array_start;
529-
530-
for (CeedInt i = 0; i < num_calls; i++) {
531-
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
532-
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
533-
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
534-
535-
CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
536-
norm_sum += sub_norm * sub_norm;
537-
}
538-
*norm = sqrt(norm_sum);
539-
#endif
540-
} else {
541-
#if CUDA_VERSION >= 12000
542-
CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
543-
#else
544-
double sub_norm = 0.0, norm_sum = 0.0;
545-
double *d_array_start;
546-
547-
for (CeedInt i = 0; i < num_calls; i++) {
548-
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
549-
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
550-
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
551-
552-
CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
553-
norm_sum += sub_norm * sub_norm;
554-
}
555-
*norm = sqrt(norm_sum);
556-
#endif
539+
#if defined(CEED_SCALAR_IS_FP32)
540+
#if (CUDA_VERSION >= 12000)
541+
CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
542+
#else /* CUDA_VERSION */
543+
float sub_norm = 0.0, norm_sum = 0.0;
544+
float *d_array_start;
545+
546+
for (CeedInt i = 0; i < num_calls; i++) {
547+
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
548+
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
549+
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
550+
551+
CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
552+
norm_sum += sub_norm * sub_norm;
553+
}
554+
*norm = sqrt(norm_sum);
555+
#endif /* CUDA_VERSION */
556+
#else /* CEED_SCALAR */
557+
#if (CUDA_VERSION >= 12000)
558+
CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
559+
#else /* CUDA_VERSION */
560+
double sub_norm = 0.0, norm_sum = 0.0;
561+
double *d_array_start;
562+
563+
for (CeedInt i = 0; i < num_calls; i++) {
564+
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
565+
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
566+
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
567+
568+
CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
569+
norm_sum += sub_norm * sub_norm;
557570
}
571+
*norm = sqrt(norm_sum);
572+
#endif /* CUDA_VERSION */
573+
#endif /* CEED_SCALAR */
558574
break;
559575
}
560576
case CEED_NORM_MAX: {
561-
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
562-
#if CUDA_VERSION >= 12000
563-
int64_t index;
564-
CeedScalar norm_no_abs;
565-
566-
CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
567-
CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
568-
*norm = fabs(norm_no_abs);
569-
#else
570-
CeedInt index;
571-
float sub_max = 0.0, current_max = 0.0;
572-
float *d_array_start;
573-
574-
for (CeedInt i = 0; i < num_calls; i++) {
575-
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
576-
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
577-
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
578-
579-
CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
580-
CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
581-
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
582-
}
583-
*norm = current_max;
584-
#endif
585-
} else {
586-
#if CUDA_VERSION >= 12000
587-
int64_t index;
588-
CeedScalar norm_no_abs;
589-
590-
CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
591-
CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
592-
*norm = fabs(norm_no_abs);
593-
#else
594-
CeedInt index;
595-
double sub_max = 0.0, current_max = 0.0;
596-
double *d_array_start;
597-
598-
for (CeedInt i = 0; i < num_calls; i++) {
599-
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
600-
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
601-
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
602-
603-
CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
604-
CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
605-
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
606-
}
607-
*norm = current_max;
608-
#endif
577+
#if defined(CEED_SCALAR_IS_FP32)
578+
#if (CUDA_VERSION >= 12000)
579+
int64_t index;
580+
CeedScalar norm_no_abs;
581+
582+
CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
583+
CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
584+
*norm = fabs(norm_no_abs);
585+
#else /* CUDA_VERSION */
586+
CeedInt index;
587+
float sub_max = 0.0, current_max = 0.0;
588+
float *d_array_start;
589+
590+
for (CeedInt i = 0; i < num_calls; i++) {
591+
d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX;
592+
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
593+
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
594+
595+
CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
596+
CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
597+
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
609598
}
599+
*norm = current_max;
600+
#endif /* CUDA_VERSION */
601+
#else /* CEED_SCALAR */
602+
#if (CUDA_VERSION >= 12000)
603+
int64_t index;
604+
CeedScalar norm_no_abs;
605+
606+
CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
607+
CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
608+
*norm = fabs(norm_no_abs);
609+
#else /* CUDA_VERSION */
610+
CeedInt index;
611+
double sub_max = 0.0, current_max = 0.0;
612+
double *d_array_start;
613+
614+
for (CeedInt i = 0; i < num_calls; i++) {
615+
d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX;
616+
CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
617+
CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
618+
619+
CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
620+
CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
621+
if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
622+
}
623+
*norm = current_max;
624+
#endif /* CUDA_VERSION */
625+
#endif /* CEED_SCALAR */
610626
break;
611627
}
612628
}
@@ -663,13 +679,29 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length)
663679
//------------------------------------------------------------------------------
664680
static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
665681
CeedSize length;
666-
CeedVector_Cuda *x_impl;
682+
CeedVector_Cuda *impl;
667683

668-
CeedCallBackend(CeedVectorGetData(x, &x_impl));
684+
CeedCallBackend(CeedVectorGetData(x, &impl));
669685
CeedCallBackend(CeedVectorGetLength(x, &length));
670686
// Set value for synced device/host array
671-
if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
672-
if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
687+
if (impl->d_array) {
688+
#if (CUDA_VERSION >= 12000)
689+
cublasHandle_t handle;
690+
691+
CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(x), &handle));
692+
#if defined(CEED_SCALAR_IS_FP32)
693+
CeedCallCublas(CeedVectorReturnCeed(x), cublasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
694+
#else /* CEED_SCALAR */
695+
CeedCallCublas(CeedVectorReturnCeed(x), cublasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
696+
#endif /* CEED_SCALAR */
697+
#else /* CUDA_VERSION */
698+
CeedCallBackend(CeedDeviceScale_Cuda(impl->d_array, alpha, length));
699+
#endif /* CUDA_VERSION */
700+
impl->h_array = NULL;
701+
} else if (impl->h_array) {
702+
CeedCallBackend(CeedHostScale_Cuda(impl->h_array, alpha, length));
703+
impl->d_array = NULL;
704+
}
673705
return CEED_ERROR_SUCCESS;
674706
}
675707

@@ -699,11 +731,23 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
699731
// Set value for synced device/host array
700732
if (y_impl->d_array) {
701733
CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
734+
#if (CUDA_VERSION >= 12000)
735+
cublasHandle_t handle;
736+
737+
CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(y), &handle));
738+
#if defined(CEED_SCALAR_IS_FP32)
739+
CeedCallCublas(CeedVectorReturnCeed(y), cublasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
740+
#else /* CEED_SCALAR */
741+
CeedCallCublas(CeedVectorReturnCeed(y), cublasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
742+
#endif /* CEED_SCALAR */
743+
#else /* CUDA_VERSION */
702744
CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
703-
}
704-
if (y_impl->h_array) {
745+
#endif /* CUDA_VERSION */
746+
y_impl->h_array = NULL;
747+
} else if (y_impl->h_array) {
705748
CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
706749
CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
750+
y_impl->d_array = NULL;
707751
}
708752
return CEED_ERROR_SUCCESS;
709753
}

0 commit comments

Comments
 (0)