@@ -253,14 +253,30 @@ static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize s
253
253
CeedScalar * copy_array ;
254
254
255
255
CeedCallBackend (CeedVectorGetArray (vec_copy , CEED_MEM_DEVICE , & copy_array ));
256
+ #if (CUDA_VERSION >= 12000 )
257
+ cublasHandle_t handle ;
258
+ Ceed ceed ;
259
+
260
+ CeedCallBackend (CeedVectorGetCeed (vec , & ceed ));
261
+ CeedCallBackend (CeedGetCublasHandle_Cuda (ceed , & handle ));
262
+ #if defined(CEED_SCALAR_IS_FP32 )
263
+ CeedCallCublas (ceed , cublasScopy_64 (handle , (int64_t )length , impl -> d_array + start , (int64_t )step , copy_array + start , (int64_t )step ));
264
+ #else /* CEED_SCALAR */
265
+ CeedCallCublas (ceed , cublasDcopy_64 (handle , (int64_t )length , impl -> d_array + start , (int64_t )step , copy_array + start , (int64_t )step ));
266
+ #endif /* CEED_SCALAR */
267
+ CeedCallBackend (CeedDestroy (& ceed ));
268
+ #else /* CUDA_VERSION */
256
269
CeedCallBackend (CeedDeviceCopyStrided_Cuda (impl -> d_array , start , step , length , copy_array ));
270
+ #endif /* CUDA_VERSION */
257
271
CeedCallBackend (CeedVectorRestoreArray (vec_copy , & copy_array ));
272
+ impl -> h_array = NULL ;
258
273
} else if (impl -> h_array ) {
259
274
CeedScalar * copy_array ;
260
275
261
276
CeedCallBackend (CeedVectorGetArray (vec_copy , CEED_MEM_HOST , & copy_array ));
262
277
CeedCallBackend (CeedHostCopyStrided_Cuda (impl -> h_array , start , step , length , copy_array ));
263
278
CeedCallBackend (CeedVectorRestoreArray (vec_copy , & copy_array ));
279
+ impl -> d_array = NULL ;
264
280
} else {
265
281
return CeedError (CeedVectorReturnCeed (vec ), CEED_ERROR_BACKEND , "CeedVector must have valid data set" );
266
282
}
@@ -459,9 +475,9 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType
459
475
static int CeedVectorNorm_Cuda (CeedVector vec , CeedNormType type , CeedScalar * norm ) {
460
476
Ceed ceed ;
461
477
CeedSize length ;
462
- #if CUDA_VERSION < 12000
478
+ #if ( CUDA_VERSION < 12000 )
463
479
CeedSize num_calls ;
464
- #endif
480
+ #endif /* CUDA_VERSION */
465
481
const CeedScalar * d_array ;
466
482
CeedVector_Cuda * impl ;
467
483
cublasHandle_t handle ;
@@ -471,142 +487,142 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
471
487
CeedCallBackend (CeedVectorGetLength (vec , & length ));
472
488
CeedCallBackend (CeedGetCublasHandle_Cuda (ceed , & handle ));
473
489
474
- #if CUDA_VERSION < 12000
490
+ #if ( CUDA_VERSION < 12000 )
475
491
// With CUDA 12, we can use the 64-bit integer interface. Prior to that,
476
492
// we need to check if the vector is too long to handle with int32,
477
493
// and if so, divide it into subsections for repeated cuBLAS calls.
478
494
num_calls = length / INT_MAX ;
479
495
if (length % INT_MAX > 0 ) num_calls += 1 ;
480
- #endif
496
+ #endif /* CUDA_VERSION */
481
497
482
498
// Compute norm
483
499
CeedCallBackend (CeedVectorGetArrayRead (vec , CEED_MEM_DEVICE , & d_array ));
484
500
switch (type ) {
485
501
case CEED_NORM_1 : {
486
502
* norm = 0.0 ;
487
- if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
488
- #if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers
489
- CeedCallCublas (ceed , cublasSasum_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
490
- #else
491
- float sub_norm = 0.0 ;
492
- float * d_array_start ;
493
-
494
- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
495
- d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
496
- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
497
- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
498
-
499
- CeedCallCublas (ceed , cublasSasum (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
500
- * norm += sub_norm ;
501
- }
502
- #endif
503
- } else {
504
- #if CUDA_VERSION >= 12000
505
- CeedCallCublas (ceed , cublasDasum_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
506
- #else
507
- double sub_norm = 0.0 ;
508
- double * d_array_start ;
509
-
510
- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
511
- d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
512
- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
513
- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
514
-
515
- CeedCallCublas (ceed , cublasDasum (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
516
- * norm += sub_norm ;
517
- }
518
- #endif
503
+ #if defined(CEED_SCALAR_IS_FP32 )
504
+ #if (CUDA_VERSION >= 12000 ) // We have CUDA 12, and can use 64-bit integers
505
+ CeedCallCublas (ceed , cublasSasum_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
506
+ #else /* CUDA_VERSION */
507
+ float sub_norm = 0.0 ;
508
+ float * d_array_start ;
509
+
510
+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
511
+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
512
+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
513
+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
514
+
515
+ CeedCallCublas (ceed , cublasSasum (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
516
+ * norm += sub_norm ;
519
517
}
518
+ #endif /* CUDA_VERSION */
519
+ #else /* CEED_SCALAR */
520
+ #if (CUDA_VERSION >= 12000 )
521
+ CeedCallCublas (ceed , cublasDasum_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
522
+ #else /* CUDA_VERSION */
523
+ double sub_norm = 0.0 ;
524
+ double * d_array_start ;
525
+
526
+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
527
+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
528
+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
529
+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
530
+
531
+ CeedCallCublas (ceed , cublasDasum (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
532
+ * norm += sub_norm ;
533
+ }
534
+ #endif /* CUDA_VERSION */
535
+ #endif /* CEED_SCALAR */
520
536
break ;
521
537
}
522
538
case CEED_NORM_2 : {
523
- if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
524
- #if CUDA_VERSION >= 12000
525
- CeedCallCublas (ceed , cublasSnrm2_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
526
- #else
527
- float sub_norm = 0.0 , norm_sum = 0.0 ;
528
- float * d_array_start ;
529
-
530
- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
531
- d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
532
- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
533
- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
534
-
535
- CeedCallCublas (ceed , cublasSnrm2 (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
536
- norm_sum += sub_norm * sub_norm ;
537
- }
538
- * norm = sqrt (norm_sum );
539
- #endif
540
- } else {
541
- #if CUDA_VERSION >= 12000
542
- CeedCallCublas (ceed , cublasDnrm2_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
543
- #else
544
- double sub_norm = 0.0 , norm_sum = 0.0 ;
545
- double * d_array_start ;
546
-
547
- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
548
- d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
549
- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
550
- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
551
-
552
- CeedCallCublas (ceed , cublasDnrm2 (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
553
- norm_sum += sub_norm * sub_norm ;
554
- }
555
- * norm = sqrt (norm_sum );
556
- #endif
539
+ #if defined(CEED_SCALAR_IS_FP32 )
540
+ #if (CUDA_VERSION >= 12000 )
541
+ CeedCallCublas (ceed , cublasSnrm2_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
542
+ #else /* CUDA_VERSION */
543
+ float sub_norm = 0.0 , norm_sum = 0.0 ;
544
+ float * d_array_start ;
545
+
546
+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
547
+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
548
+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
549
+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
550
+
551
+ CeedCallCublas (ceed , cublasSnrm2 (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
552
+ norm_sum += sub_norm * sub_norm ;
553
+ }
554
+ * norm = sqrt (norm_sum );
555
+ #endif /* CUDA_VERSION */
556
+ #else /* CEED_SCALAR */
557
+ #if (CUDA_VERSION >= 12000 )
558
+ CeedCallCublas (ceed , cublasDnrm2_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
559
+ #else /* CUDA_VERSION */
560
+ double sub_norm = 0.0 , norm_sum = 0.0 ;
561
+ double * d_array_start ;
562
+
563
+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
564
+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
565
+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
566
+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
567
+
568
+ CeedCallCublas (ceed , cublasDnrm2 (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
569
+ norm_sum += sub_norm * sub_norm ;
557
570
}
571
+ * norm = sqrt (norm_sum );
572
+ #endif /* CUDA_VERSION */
573
+ #endif /* CEED_SCALAR */
558
574
break ;
559
575
}
560
576
case CEED_NORM_MAX : {
561
- if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
562
- #if CUDA_VERSION >= 12000
563
- int64_t index ;
564
- CeedScalar norm_no_abs ;
565
-
566
- CeedCallCublas (ceed , cublasIsamax_64 (handle , (int64_t )length , (float * )d_array , 1 , & index ));
567
- CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
568
- * norm = fabs (norm_no_abs );
569
- #else
570
- CeedInt index ;
571
- float sub_max = 0.0 , current_max = 0.0 ;
572
- float * d_array_start ;
573
-
574
- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
575
- d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
576
- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
577
- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
578
-
579
- CeedCallCublas (ceed , cublasIsamax (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & index ));
580
- CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
581
- if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
582
- }
583
- * norm = current_max ;
584
- #endif
585
- } else {
586
- #if CUDA_VERSION >= 12000
587
- int64_t index ;
588
- CeedScalar norm_no_abs ;
589
-
590
- CeedCallCublas (ceed , cublasIdamax_64 (handle , (int64_t )length , (double * )d_array , 1 , & index ));
591
- CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
592
- * norm = fabs (norm_no_abs );
593
- #else
594
- CeedInt index ;
595
- double sub_max = 0.0 , current_max = 0.0 ;
596
- double * d_array_start ;
597
-
598
- for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
599
- d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
600
- CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
601
- CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
602
-
603
- CeedCallCublas (ceed , cublasIdamax (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & index ));
604
- CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
605
- if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
606
- }
607
- * norm = current_max ;
608
- #endif
577
+ #if defined(CEED_SCALAR_IS_FP32 )
578
+ #if (CUDA_VERSION >= 12000 )
579
+ int64_t index ;
580
+ CeedScalar norm_no_abs ;
581
+
582
+ CeedCallCublas (ceed , cublasIsamax_64 (handle , (int64_t )length , (float * )d_array , 1 , & index ));
583
+ CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
584
+ * norm = fabs (norm_no_abs );
585
+ #else /* CUDA_VERSION */
586
+ CeedInt index ;
587
+ float sub_max = 0.0 , current_max = 0.0 ;
588
+ float * d_array_start ;
589
+
590
+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
591
+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
592
+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
593
+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
594
+
595
+ CeedCallCublas (ceed , cublasIsamax (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & index ));
596
+ CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
597
+ if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
609
598
}
599
+ * norm = current_max ;
600
+ #endif /* CUDA_VERSION */
601
+ #else /* CEED_SCALAR */
602
+ #if (CUDA_VERSION >= 12000 )
603
+ int64_t index ;
604
+ CeedScalar norm_no_abs ;
605
+
606
+ CeedCallCublas (ceed , cublasIdamax_64 (handle , (int64_t )length , (double * )d_array , 1 , & index ));
607
+ CeedCallCuda (ceed , cudaMemcpy (& norm_no_abs , impl -> d_array + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
608
+ * norm = fabs (norm_no_abs );
609
+ #else /* CUDA_VERSION */
610
+ CeedInt index ;
611
+ double sub_max = 0.0 , current_max = 0.0 ;
612
+ double * d_array_start ;
613
+
614
+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
615
+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
616
+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
617
+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
618
+
619
+ CeedCallCublas (ceed , cublasIdamax (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & index ));
620
+ CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + index - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
621
+ if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
622
+ }
623
+ * norm = current_max ;
624
+ #endif /* CUDA_VERSION */
625
+ #endif /* CEED_SCALAR */
610
626
break ;
611
627
}
612
628
}
@@ -663,13 +679,29 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length)
663
679
//------------------------------------------------------------------------------
664
680
static int CeedVectorScale_Cuda (CeedVector x , CeedScalar alpha ) {
665
681
CeedSize length ;
666
- CeedVector_Cuda * x_impl ;
682
+ CeedVector_Cuda * impl ;
667
683
668
- CeedCallBackend (CeedVectorGetData (x , & x_impl ));
684
+ CeedCallBackend (CeedVectorGetData (x , & impl ));
669
685
CeedCallBackend (CeedVectorGetLength (x , & length ));
670
686
// Set value for synced device/host array
671
- if (x_impl -> d_array ) CeedCallBackend (CeedDeviceScale_Cuda (x_impl -> d_array , alpha , length ));
672
- if (x_impl -> h_array ) CeedCallBackend (CeedHostScale_Cuda (x_impl -> h_array , alpha , length ));
687
+ if (impl -> d_array ) {
688
+ #if (CUDA_VERSION >= 12000 )
689
+ cublasHandle_t handle ;
690
+
691
+ CeedCallBackend (CeedGetCublasHandle_Cuda (CeedVectorReturnCeed (x ), & handle ));
692
+ #if defined(CEED_SCALAR_IS_FP32 )
693
+ CeedCallCublas (CeedVectorReturnCeed (x ), cublasSscal_64 (handle , (int64_t )length , & alpha , impl -> d_array , 1 ));
694
+ #else /* CEED_SCALAR */
695
+ CeedCallCublas (CeedVectorReturnCeed (x ), cublasDscal_64 (handle , (int64_t )length , & alpha , impl -> d_array , 1 ));
696
+ #endif /* CEED_SCALAR */
697
+ #else /* CUDA_VERSION */
698
+ CeedCallBackend (CeedDeviceScale_Cuda (impl -> d_array , alpha , length ));
699
+ #endif /* CUDA_VERSION */
700
+ impl -> h_array = NULL ;
701
+ } else if (impl -> h_array ) {
702
+ CeedCallBackend (CeedHostScale_Cuda (impl -> h_array , alpha , length ));
703
+ impl -> d_array = NULL ;
704
+ }
673
705
return CEED_ERROR_SUCCESS ;
674
706
}
675
707
@@ -699,11 +731,23 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
699
731
// Set value for synced device/host array
700
732
if (y_impl -> d_array ) {
701
733
CeedCallBackend (CeedVectorSyncArray (x , CEED_MEM_DEVICE ));
734
+ #if (CUDA_VERSION >= 12000 )
735
+ cublasHandle_t handle ;
736
+
737
+ CeedCallBackend (CeedGetCublasHandle_Cuda (CeedVectorReturnCeed (y ), & handle ));
738
+ #if defined(CEED_SCALAR_IS_FP32 )
739
+ CeedCallCublas (CeedVectorReturnCeed (y ), cublasSaxpy_64 (handle , (int64_t )length , & alpha , x_impl -> d_array , 1 , y_impl -> d_array , 1 ));
740
+ #else /* CEED_SCALAR */
741
+ CeedCallCublas (CeedVectorReturnCeed (y ), cublasDaxpy_64 (handle , (int64_t )length , & alpha , x_impl -> d_array , 1 , y_impl -> d_array , 1 ));
742
+ #endif /* CEED_SCALAR */
743
+ #else /* CUDA_VERSION */
702
744
CeedCallBackend (CeedDeviceAXPY_Cuda (y_impl -> d_array , alpha , x_impl -> d_array , length ));
703
- }
704
- if (y_impl -> h_array ) {
745
+ #endif /* CUDA_VERSION */
746
+ y_impl -> h_array = NULL ;
747
+ } else if (y_impl -> h_array ) {
705
748
CeedCallBackend (CeedVectorSyncArray (x , CEED_MEM_HOST ));
706
749
CeedCallBackend (CeedHostAXPY_Cuda (y_impl -> h_array , alpha , x_impl -> h_array , length ));
750
+ y_impl -> d_array = NULL ;
707
751
}
708
752
return CEED_ERROR_SUCCESS ;
709
753
}
0 commit comments