@@ -5,14 +5,15 @@
  * @file tungsten_ops_kernels.cu
  * @brief CUDA kernel implementations for Tungsten-specific operations
  *
- * This file contains CUDA kernel code for Tungsten operations. It is only compiled when CUDA is enabled.
+ * This file contains CUDA kernel code for Tungsten operations. It is only compiled
+ * when CUDA is enabled.
  */
 
 #if defined(OpenPFC_ENABLE_CUDA)
 
 #include "tungsten_ops.hpp"
-#include <cuda_runtime.h>
 #include <cuComplex.h>
+#include <cuda_runtime.h>
 #include <type_traits>
 
 namespace tungsten {
@@ -23,10 +24,10 @@ namespace detail {
 template <typename RealType>
 __global__ void multiply_complex_real_kernel_impl(
     const typename std::conditional<std::is_same<RealType, double>::value,
-                                    cuDoubleComplex, cuFloatComplex>::type *a,
+                                     cuDoubleComplex, cuFloatComplex>::type *a,
     const RealType *b,
-    typename std::conditional<std::is_same<RealType, double>::value,
-                              cuDoubleComplex, cuFloatComplex>::type *out,
+    typename std::conditional<std::is_same<RealType, double>::value, cuDoubleComplex,
+                              cuFloatComplex>::type *out,
     size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
@@ -43,16 +44,18 @@ __global__ void multiply_complex_real_kernel_impl(
 }
 
 // Explicit instantiations for float and double (required for CUDA template kernels)
-template __global__ void multiply_complex_real_kernel_impl<double>(
-    const cuDoubleComplex *, const double *, cuDoubleComplex *, size_t);
-template __global__ void multiply_complex_real_kernel_impl<float>(
-    const cuFloatComplex *, const float *, cuFloatComplex *, size_t);
+template __global__ void
+multiply_complex_real_kernel_impl<double>(const cuDoubleComplex *, const double *,
+                                          cuDoubleComplex *, size_t);
+template __global__ void
+multiply_complex_real_kernel_impl<float>(const cuFloatComplex *, const float *,
+                                         cuFloatComplex *, size_t);
 
 // CUDA kernel: Compute nonlinear term (template-based for precision)
 template <typename RealType>
-__global__ void compute_nonlinear_kernel(
-    const RealType *u, const RealType *v, RealType p3, RealType p4, RealType q3,
-    RealType q4, RealType *out, size_t n) {
+__global__ void compute_nonlinear_kernel(const RealType *u, const RealType *v,
+                                         RealType p3, RealType p4, RealType q3,
+                                         RealType q4, RealType *out, size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
     RealType u_val = u[idx];
@@ -66,39 +69,43 @@ __global__ void compute_nonlinear_kernel(
 }
 
 // Explicit instantiations for float and double
-template __global__ void compute_nonlinear_kernel<double>(
-    const double *, const double *, double, double, double, double, double *,
-    size_t);
-template __global__ void compute_nonlinear_kernel<float>(
-    const float *, const float *, float, float, float, float, float *, size_t);
+template __global__ void compute_nonlinear_kernel<double>(const double *,
+                                                          const double *, double,
+                                                          double, double, double,
+                                                          double *, size_t);
+template __global__ void compute_nonlinear_kernel<float>(const float *,
+                                                         const float *, float, float,
+                                                         float, float, float *,
+                                                         size_t);
 
 // CUDA kernel: Apply stabilization (template-based for precision)
 template <typename RealType>
-__global__ void apply_stabilization_kernel(
-    const RealType *in, const RealType *field, RealType stabP, RealType *out,
-    size_t n) {
+__global__ void apply_stabilization_kernel(const RealType *in, const RealType *field,
+                                           RealType stabP, RealType *out, size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
     out[idx] = in[idx] - stabP * field[idx];
   }
 }
 
 // Explicit instantiations for float and double
-template __global__ void apply_stabilization_kernel<double>(
-    const double *, const double *, double, double *, size_t);
-template __global__ void apply_stabilization_kernel<float>(
-    const float *, const float *, float, float *, size_t);
+template __global__ void apply_stabilization_kernel<double>(const double *,
+                                                            const double *, double,
+                                                            double *, size_t);
+template __global__ void apply_stabilization_kernel<float>(const float *,
+                                                           const float *, float,
+                                                           float *, size_t);
 
 // CUDA kernel: Apply time integration (template-based for precision)
 template <typename RealType>
 __global__ void apply_time_integration_kernel_impl(
     const typename std::conditional<std::is_same<RealType, double>::value,
-                                    cuDoubleComplex, cuFloatComplex>::type *psi_F,
+                                     cuDoubleComplex, cuFloatComplex>::type *psi_F,
     const typename std::conditional<std::is_same<RealType, double>::value,
-                                    cuDoubleComplex, cuFloatComplex>::type *psiN_F,
+                                     cuDoubleComplex, cuFloatComplex>::type *psiN_F,
     const RealType *opL, const RealType *opN,
-    typename std::conditional<std::is_same<RealType, double>::value,
-                              cuDoubleComplex, cuFloatComplex>::type *out,
+    typename std::conditional<std::is_same<RealType, double>::value, cuDoubleComplex,
+                              cuFloatComplex>::type *out,
     size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
@@ -107,7 +114,7 @@ __global__ void apply_time_integration_kernel_impl(
     cuDoubleComplex psiN_F_val = psiN_F[idx];
     double opL_val = opL[idx];
     double opN_val = opN[idx];
-
+
     // out = opL * psi_F + opN * psiN_F
     cuDoubleComplex term1 = cuCmul(cuDoubleComplex{opL_val, 0.0}, psi_F_val);
     cuDoubleComplex term2 = cuCmul(cuDoubleComplex{opN_val, 0.0}, psiN_F_val);
@@ -117,7 +124,7 @@ __global__ void apply_time_integration_kernel_impl(
     cuFloatComplex psiN_F_val = psiN_F[idx];
     float opL_val = opL[idx];
     float opN_val = opN[idx];
-
+
     // out = opL * psi_F + opN * psiN_F
     cuFloatComplex term1 = cuCmulf(cuFloatComplex{opL_val, 0.0f}, psi_F_val);
     cuFloatComplex term2 = cuCmulf(cuFloatComplex{opN_val, 0.0f}, psiN_F_val);
@@ -130,9 +137,10 @@ __global__ void apply_time_integration_kernel_impl(
 template __global__ void apply_time_integration_kernel_impl<double>(
     const cuDoubleComplex *, const cuDoubleComplex *, const double *, const double *,
     cuDoubleComplex *, size_t);
-template __global__ void apply_time_integration_kernel_impl<float>(
-    const cuFloatComplex *, const cuFloatComplex *, const float *, const float *,
-    cuFloatComplex *, size_t);
+template __global__ void
+apply_time_integration_kernel_impl<float>(const cuFloatComplex *,
+                                          const cuFloatComplex *, const float *,
+                                          const float *, cuFloatComplex *, size_t);
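Stated as a formula, these kernels apply, per Fourier mode k (this simply reads off the in-code comment `out = opL * psi_F + opN * psiN_F`; interpreting `psi_F` and `psiN_F` as the transforms of the field and of its nonlinear part is an assumption based on the names):

\[ \widehat{\psi}^{\,\text{new}}_k \;=\; \mathrm{opL}_k\,\widehat{\psi}_k \;+\; \mathrm{opN}_k\,\widehat{\psi^N}_k \]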
 
 // Helper function to launch kernels with appropriate grid/block sizes
 // Optimized for modern GPUs (H100): use larger block size for better occupancy
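The body of launch_kernel sits outside the changed hunks, so it does not appear in this diff. For orientation, a minimal sketch of what such a helper typically looks like, inferred from the call sites below (`launch_kernel(N, blocks, threads_per_block)`); the block size of 512 is an assumed value, not taken from the actual source:

// Hypothetical sketch - the real helper is not shown in this diff.
// A larger block size (e.g. 512) tends to improve occupancy on H100-class GPUs.
inline void launch_kernel(size_t n, int &blocks, int &threads_per_block) {
  threads_per_block = 512; // assumed value
  blocks = static_cast<int>((n + threads_per_block - 1) / threads_per_block);
}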
@@ -182,13 +190,14 @@ void TungstenOps<pfc::backend::CudaTag, double>::multiply_complex_real_impl(
     throw std::runtime_error("CUDA kernel launch failed (multiply_complex_real): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
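Dropping the per-call cudaDeviceSynchronize() means these methods return while the kernel may still be running, so a caller that reads results on the host needs an explicit synchronization point. A minimal sketch of that caller-side pattern (`fetch_result`, `d_out`, and `h_out` are hypothetical names, not part of this commit):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Hypothetical caller-side pattern now that the implicit sync is gone.
void fetch_result(const double *d_out, double *h_out, size_t n) {
  // cudaMemcpy on the default stream is ordered after all prior kernels
  // and blocks the host until the copy completes, so it is a correct
  // synchronization point for host-side reads.
  cudaError_t err =
      cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    throw std::runtime_error(std::string("CUDA copy failed: ") +
                             cudaGetErrorString(err));
  }
}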
 
 void TungstenOps<pfc::backend::CudaTag, double>::compute_nonlinear_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &u,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &v,
-    double p3, double p4, double q3, double q4,
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &v, double p3,
+    double p4, double q3, double q4,
     pfc::core::DataBuffer<pfc::backend::CudaTag, double> &out) {
   const size_t N = u.size();
   if (v.size() != N || out.size() != N) {
@@ -209,13 +218,13 @@ void TungstenOps<pfc::backend::CudaTag, double>::compute_nonlinear_impl(
     throw std::runtime_error("CUDA kernel launch failed (compute_nonlinear): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 void TungstenOps<pfc::backend::CudaTag, double>::apply_stabilization_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &in,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &field,
-    double stabP,
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &field, double stabP,
     pfc::core::DataBuffer<pfc::backend::CudaTag, double> &out) {
   const size_t N = in.size();
   if (field.size() != N || out.size() != N) {
@@ -228,15 +237,16 @@ void TungstenOps<pfc::backend::CudaTag, double>::apply_stabilization_impl(
   int blocks, threads_per_block;
   launch_kernel(N, blocks, threads_per_block);
 
-  detail::apply_stabilization_kernel<double><<<blocks, threads_per_block>>>(
-      in.data(), field.data(), stabP, out.data(), N);
+  detail::apply_stabilization_kernel<double>
+      <<<blocks, threads_per_block>>>(in.data(), field.data(), stabP, out.data(), N);
 
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_stabilization): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 void TungstenOps<pfc::backend::CudaTag, double>::apply_time_integration_impl(
@@ -264,16 +274,16 @@ void TungstenOps<pfc::backend::CudaTag, double>::apply_time_integration_impl(
   const double *opN_ptr = opN.data();
   cuDoubleComplex *out_ptr = reinterpret_cast<cuDoubleComplex *>(out.data());
 
-  apply_time_integration_kernel_impl<double>
-      <<<blocks, threads_per_block>>>(psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr,
-                                      out_ptr, N);
+  apply_time_integration_kernel_impl<double><<<blocks, threads_per_block>>>(
+      psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr, out_ptr, N);
 
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_time_integration): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 // CUDA specialization for float precision - implement methods
@@ -304,14 +314,14 @@ void TungstenOps<pfc::backend::CudaTag, float>::multiply_complex_real_impl(
     throw std::runtime_error("CUDA kernel launch failed (multiply_complex_real): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 void TungstenOps<pfc::backend::CudaTag, float>::compute_nonlinear_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &u,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &v,
-    float p3, float p4, float q3, float q4,
-    pfc::core::DataBuffer<pfc::backend::CudaTag, float> &out) {
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &v, float p3, float p4,
+    float q3, float q4, pfc::core::DataBuffer<pfc::backend::CudaTag, float> &out) {
   const size_t N = u.size();
   if (v.size() != N || out.size() != N) {
     throw std::runtime_error("Size mismatch in compute_nonlinear");
@@ -331,13 +341,13 @@ void TungstenOps<pfc::backend::CudaTag, float>::compute_nonlinear_impl(
     throw std::runtime_error("CUDA kernel launch failed (compute_nonlinear): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 void TungstenOps<pfc::backend::CudaTag, float>::apply_stabilization_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &in,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &field,
-    float stabP,
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &field, float stabP,
     pfc::core::DataBuffer<pfc::backend::CudaTag, float> &out) {
   const size_t N = in.size();
   if (field.size() != N || out.size() != N) {
@@ -350,15 +360,16 @@ void TungstenOps<pfc::backend::CudaTag, float>::apply_stabilization_impl(
   int blocks, threads_per_block;
   launch_kernel(N, blocks, threads_per_block);
 
-  detail::apply_stabilization_kernel<float><<<blocks, threads_per_block>>>(
-      in.data(), field.data(), stabP, out.data(), N);
+  detail::apply_stabilization_kernel<float>
+      <<<blocks, threads_per_block>>>(in.data(), field.data(), stabP, out.data(), N);
 
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_stabilization): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 void TungstenOps<pfc::backend::CudaTag, float>::apply_time_integration_impl(
@@ -386,16 +397,16 @@ void TungstenOps<pfc::backend::CudaTag, float>::apply_time_integration_impl(
   const float *opN_ptr = opN.data();
   cuFloatComplex *out_ptr = reinterpret_cast<cuFloatComplex *>(out.data());
 
-  detail::apply_time_integration_kernel_impl<float>
-      <<<blocks, threads_per_block>>>(psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr,
-                                      out_ptr, N);
+  detail::apply_time_integration_kernel_impl<float><<<blocks, threads_per_block>>>(
+      psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr, out_ptr, N);
 
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_time_integration): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }
 
 } // namespace detail
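To make the launch pattern concrete end to end, here is a standalone sketch that drives detail::apply_stabilization_kernel<double> directly, mirroring the host wrappers above. The problem size, placeholder inputs, and stabP value are arbitrary, and the sketch assumes it lives in the same translation unit so the kernel is visible:

// Hypothetical standalone driver; assumes tungsten::detail::apply_stabilization_kernel
// (defined above) is visible in this translation unit.
#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;
  double *d_in = nullptr, *d_field = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(double));
  cudaMalloc(&d_field, n * sizeof(double));
  cudaMalloc(&d_out, n * sizeof(double));
  cudaMemset(d_in, 0, n * sizeof(double));    // placeholder input
  cudaMemset(d_field, 0, n * sizeof(double)); // placeholder field

  // Grid/block sizing computed inline; 512 is an arbitrary reasonable block size.
  const int threads_per_block = 512;
  const int blocks = static_cast<int>((n + threads_per_block - 1) / threads_per_block);

  // Computes out[i] = in[i] - stabP * field[i]; stabP = 0.5 is an example value.
  tungsten::detail::apply_stabilization_kernel<double>
      <<<blocks, threads_per_block>>>(d_in, d_field, 0.5, d_out, n);
  cudaDeviceSynchronize(); // explicit sync: the wrappers above no longer do this

  cudaFree(d_in);
  cudaFree(d_field);
  cudaFree(d_out);
  return 0;
}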