@@ -253,14 +253,30 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
253253 CeedScalar * copy_array ;
254254
255255 CeedCallBackend (CeedVectorGetArray (vec_copy , CEED_MEM_DEVICE , & copy_array ));
256+ #if (CUDA_VERSION >= 12000 )
257+ cublasHandle_t handle ;
258+ Ceed ceed ;
259+
260+ CeedCallBackend (CeedVectorGetCeed (vec , & ceed ));
261+ CeedCallBackend (CeedGetCublasHandle_Cuda (ceed , & handle ));
262+ #if defined(CEED_SCALAR_IS_FP32 )
263+ CeedCallCublas (ceed , cublasScopy_64 (handle , (int64_t )length , impl -> d_array + start , (int64_t )step , copy_array + start , (int64_t )step ));
264+ #else /* CEED_SCALAR */
265+ CeedCallCublas (ceed , cublasDcopy_64 (handle , (int64_t )length , impl -> d_array + start , (int64_t )step , copy_array + start , (int64_t )step ));
266+ #endif /* CEED_SCALAR */
267+ CeedCallBackend (CeedDestroy (& ceed ));
268+ #else /* CUDA_VERSION */
256269 CeedCallBackend (CeedDeviceCopyStrided_Cuda (impl -> d_array , start , step , length , copy_array ));
270+ #endif /* CUDA_VERSION */
257271 CeedCallBackend (CeedVectorRestoreArray (vec_copy , & copy_array ));
272+ impl -> h_array = NULL ;
258273 } else if (impl -> h_array ) {
259274 CeedScalar * copy_array ;
260275
261276 CeedCallBackend (CeedVectorGetArray (vec_copy , CEED_MEM_HOST , & copy_array ));
262277 CeedCallBackend (CeedHostCopyStrided_Cuda (impl -> h_array , start , step , length , copy_array ));
263278 CeedCallBackend (CeedVectorRestoreArray (vec_copy , & copy_array ));
279+ impl -> d_array = NULL ;
264280 } else {
265281 return CeedError (CeedVectorReturnCeed (vec ), CEED_ERROR_BACKEND , "CeedVector must have valid data set" );
266282 }
@@ -459,9 +475,9 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType
459475static int CeedVectorNorm_Cuda (CeedVector vec , CeedNormType type , CeedScalar * norm ) {
460476 Ceed ceed ;
461477 CeedSize length ;
462- #if CUDA_VERSION < 12000
478+ #if ( CUDA_VERSION < 12000 )
463479 CeedSize num_calls ;
464- #endif
480+ #endif /* CUDA_VERSION */
465481 const CeedScalar * d_array ;
466482 CeedVector_Cuda * impl ;
467483 cublasHandle_t handle ;
@@ -471,142 +487,142 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
471487 CeedCallBackend (CeedVectorGetLength (vec , & length ));
472488 CeedCallBackend (CeedGetCublasHandle_Cuda (ceed , & handle ));
473489
474- #if CUDA_VERSION < 12000
490+ #if ( CUDA_VERSION < 12000 )
475491 // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
476492 // we need to check if the vector is too long to handle with int32,
477493 // and if so, divide it into subsections for repeated cuBLAS calls.
478494 num_calls = length / INT_MAX ;
479495 if (length % INT_MAX > 0 ) num_calls += 1 ;
480- #endif
496+ #endif /* CUDA_VERSION */
481497
482498 // Compute norm
483499 CeedCallBackend (CeedVectorGetArrayRead (vec , CEED_MEM_DEVICE , & d_array ));
484500 switch (type ) {
485501 case CEED_NORM_1 : {
486502 * norm = 0.0 ;
487- if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
488- #if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers
489- CeedCallCublas (ceed , cublasSasum_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
490- #else
491- float sub_norm = 0.0 ;
492- float * d_array_start ;
493-
494- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
495- d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
496- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
497- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
498-
499- CeedCallCublas (ceed , cublasSasum (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
500- * norm += sub_norm ;
501- }
502- #endif
503- } else {
504- #if CUDA_VERSION >= 12000
505- CeedCallCublas (ceed , cublasDasum_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
506- #else
507- double sub_norm = 0.0 ;
508- double * d_array_start ;
509-
510- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
511- d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
512- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
513- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
514-
515- CeedCallCublas (ceed , cublasDasum (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
516- * norm += sub_norm ;
517- }
518- #endif
503+ #if defined(CEED_SCALAR_IS_FP32 )
504+ #if (CUDA_VERSION >= 12000 ) // We have CUDA 12, and can use 64-bit integers
505+ CeedCallCublas (ceed , cublasSasum_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
506+ #else /* CUDA_VERSION */
507+ float sub_norm = 0.0 ;
508+ float * d_array_start ;
509+
510+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
511+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
512+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
513+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
514+
515+ CeedCallCublas (ceed , cublasSasum (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
516+ * norm += sub_norm ;
519517 }
518+ #endif /* CUDA_VERSION */
519+ #else /* CEED_SCALAR */
520+ #if (CUDA_VERSION >= 12000 )
521+ CeedCallCublas (ceed , cublasDasum_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
522+ #else /* CUDA_VERSION */
523+ double sub_norm = 0.0 ;
524+ double * d_array_start ;
525+
526+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
527+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
528+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
529+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
530+
531+ CeedCallCublas (ceed , cublasDasum (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
532+ * norm += sub_norm ;
533+ }
534+ #endif /* CUDA_VERSION */
535+ #endif /* CEED_SCALAR */
520536 break ;
521537 }
522538 case CEED_NORM_2 : {
523- if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
524- #if CUDA_VERSION >= 12000
525- CeedCallCublas (ceed , cublasSnrm2_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
526- #else
527- float sub_norm = 0.0 , norm_sum = 0.0 ;
528- float * d_array_start ;
529-
530- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
531- d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
532- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
533- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
534-
535- CeedCallCublas (ceed , cublasSnrm2 (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
536- norm_sum += sub_norm * sub_norm ;
537- }
538- * norm = sqrt (norm_sum );
539- #endif
540- } else {
541- #if CUDA_VERSION >= 12000
542- CeedCallCublas (ceed , cublasDnrm2_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
543- #else
544- double sub_norm = 0.0 , norm_sum = 0.0 ;
545- double * d_array_start ;
546-
547- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
548- d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
549- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
550- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
551-
552- CeedCallCublas (ceed , cublasDnrm2 (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
553- norm_sum += sub_norm * sub_norm ;
554- }
555- * norm = sqrt (norm_sum );
556- #endif
539+ #if defined(CEED_SCALAR_IS_FP32 )
540+ #if (CUDA_VERSION >= 12000 )
541+ CeedCallCublas (ceed , cublasSnrm2_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
542+ #else /* CUDA_VERSION */
543+ float sub_norm = 0.0 , norm_sum = 0.0 ;
544+ float * d_array_start ;
545+
546+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
547+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
548+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
549+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
550+
551+ CeedCallCublas (ceed , cublasSnrm2 (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
552+ norm_sum += sub_norm * sub_norm ;
553+ }
554+ * norm = sqrt (norm_sum );
555+ #endif /* CUDA_VERSION */
556+ #else /* CEED_SCALAR */
557+ #if (CUDA_VERSION >= 12000 )
558+ CeedCallCublas (ceed , cublasDnrm2_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
559+ #else /* CUDA_VERSION */
560+ double sub_norm = 0.0 , norm_sum = 0.0 ;
561+ double * d_array_start ;
562+
563+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
564+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
565+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
566+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
567+
568+ CeedCallCublas (ceed , cublasDnrm2 (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
569+ norm_sum += sub_norm * sub_norm ;
557570 }
571+ * norm = sqrt (norm_sum );
572+ #endif /* CUDA_VERSION */
573+ #endif /* CEED_SCALAR */
558574 break ;
559575 }
560576 case CEED_NORM_MAX : {
561- if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
562- #if CUDA_VERSION >= 12000
563- int64_t index ;
564- CeedScalar norm_no_abs ;
565-
566- CeedCallCublas (ceed , cublasIsamax_64 (handle , (int64_t )length , (float * )d_array , 1 , & index ));
567- CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
568- * norm = fabs (norm_no_abs );
569- #else
570- CeedInt index ;
571- float sub_max = 0.0 , current_max = 0.0 ;
572- float * d_array_start ;
573-
574- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
575- d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
576- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
577- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
578-
579- CeedCallCublas (ceed , cublasIsamax (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & index ));
580- CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
581- if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
582- }
583- * norm = current_max ;
584- #endif
585- } else {
586- #if CUDA_VERSION >= 12000
587- int64_t index ;
588- CeedScalar norm_no_abs ;
589-
590- CeedCallCublas (ceed , cublasIdamax_64 (handle , (int64_t )length , (double * )d_array , 1 , & index ));
591- CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
592- * norm = fabs (norm_no_abs );
593- #else
594- CeedInt index ;
595- double sub_max = 0.0 , current_max = 0.0 ;
596- double * d_array_start ;
597-
598- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
599- d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
600- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
601- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
602-
603- CeedCallCublas (ceed , cublasIdamax (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & index ));
604- CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
605- if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
606- }
607- * norm = current_max ;
608- #endif
577+ #if defined(CEED_SCALAR_IS_FP32 )
578+ #if (CUDA_VERSION >= 12000 )
579+ int64_t index ;
580+ CeedScalar norm_no_abs ;
581+
582+ CeedCallCublas (ceed , cublasIsamax_64 (handle , (int64_t )length , (float * )d_array , 1 , & index ));
583+ CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
584+ * norm = fabs (norm_no_abs );
585+ #else /* CUDA_VERSION */
586+ CeedInt index ;
587+ float sub_max = 0.0 , current_max = 0.0 ;
588+ float * d_array_start ;
589+
590+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
591+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
592+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
593+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
594+
595+ CeedCallCublas (ceed , cublasIsamax (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & index ));
596+ CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
597+ if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
609598 }
599+ * norm = current_max ;
600+ #endif /* CUDA_VERSION */
601+ #else /* CEED_SCALAR */
602+ #if (CUDA_VERSION >= 12000 )
603+ int64_t index ;
604+ CeedScalar norm_no_abs ;
605+
606+ CeedCallCublas (ceed , cublasIdamax_64 (handle , (int64_t )length , (double * )d_array , 1 , & index ));
607+ CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
608+ * norm = fabs (norm_no_abs );
609+ #else /* CUDA_VERSION */
610+ CeedInt index ;
611+ double sub_max = 0.0 , current_max = 0.0 ;
612+ double * d_array_start ;
613+
614+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
615+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
616+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
617+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
618+
619+ CeedCallCublas (ceed , cublasIdamax (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & index ));
620+ CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
621+ if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
622+ }
623+ * norm = current_max ;
624+ #endif /* CUDA_VERSION */
625+ #endif /* CEED_SCALAR */
610626 break ;
611627 }
612628 }
@@ -663,13 +679,29 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length)
663679//------------------------------------------------------------------------------
664680static int CeedVectorScale_Cuda (CeedVector x , CeedScalar alpha ) {
665681 CeedSize length ;
666- CeedVector_Cuda * x_impl ;
682+ CeedVector_Cuda * impl ;
667683
668- CeedCallBackend (CeedVectorGetData (x , & x_impl ));
684+ CeedCallBackend (CeedVectorGetData (x , & impl ));
669685 CeedCallBackend (CeedVectorGetLength (x , & length ));
670686 // Set value for synced device/host array
671- if (x_impl -> d_array ) CeedCallBackend (CeedDeviceScale_Cuda (x_impl -> d_array , alpha , length ));
672- if (x_impl -> h_array ) CeedCallBackend (CeedHostScale_Cuda (x_impl -> h_array , alpha , length ));
687+ if (impl -> d_array ) {
688+ #if (CUDA_VERSION >= 12000 )
689+ cublasHandle_t handle ;
690+
691+ CeedCallBackend (CeedGetCublasHandle_Cuda (CeedVectorReturnCeed (x ), & handle ));
692+ #if defined(CEED_SCALAR_IS_FP32 )
693+ CeedCallCublas (CeedVectorReturnCeed (x ), cublasSscal_64 (handle , (int64_t )length , & alpha , impl -> d_array , 1 ));
694+ #else /* CEED_SCALAR */
695+ CeedCallCublas (CeedVectorReturnCeed (x ), cublasDscal_64 (handle , (int64_t )length , & alpha , impl -> d_array , 1 ));
696+ #endif /* CEED_SCALAR */
697+ #else /* CUDA_VERSION */
698+ CeedCallBackend (CeedDeviceScale_Cuda (impl -> d_array , alpha , length ));
699+ #endif /* CUDA_VERSION */
700+ impl -> h_array = NULL ;
701+ } else if (impl -> h_array ) {
702+ CeedCallBackend (CeedHostScale_Cuda (impl -> h_array , alpha , length ));
703+ impl -> d_array = NULL ;
704+ }
673705 return CEED_ERROR_SUCCESS ;
674706}
675707
@@ -699,11 +731,23 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
699731 // Set value for synced device/host array
700732 if (y_impl -> d_array ) {
701733 CeedCallBackend (CeedVectorSyncArray (x , CEED_MEM_DEVICE ));
734+ #if (CUDA_VERSION >= 12000 )
735+ cublasHandle_t handle ;
736+
737+ CeedCallBackend (CeedGetCublasHandle_Cuda (CeedVectorReturnCeed (y ), & handle ));
738+ #if defined(CEED_SCALAR_IS_FP32 )
739+ CeedCallCublas (CeedVectorReturnCeed (y ), cublasSaxpy_64 (handle , (int64_t )length , & alpha , x_impl -> d_array , 1 , y_impl -> d_array , 1 ));
740+ #else /* CEED_SCALAR */
741+ CeedCallCublas (CeedVectorReturnCeed (y ), cublasDaxpy_64 (handle , (int64_t )length , & alpha , x_impl -> d_array , 1 , y_impl -> d_array , 1 ));
742+ #endif /* CEED_SCALAR */
743+ #else /* CUDA_VERSION */
702744 CeedCallBackend (CeedDeviceAXPY_Cuda (y_impl -> d_array , alpha , x_impl -> d_array , length ));
703- }
704- if (y_impl -> h_array ) {
745+ #endif /* CUDA_VERSION */
746+ y_impl -> h_array = NULL ;
747+ } else if (y_impl -> h_array ) {
705748 CeedCallBackend (CeedVectorSyncArray (x , CEED_MEM_HOST ));
706749 CeedCallBackend (CeedHostAXPY_Cuda (y_impl -> h_array , alpha , x_impl -> h_array , length ));
750+ y_impl -> d_array = NULL ;
707751 }
708752 return CEED_ERROR_SUCCESS ;
709753}
0 commit comments