
Commit 4b36ea6

ci: Use forked clang-format-action with fail-on-error option
Switch to the ahojukka5/clang-format-action fork, which adds:

- fail-on-error option to treat formatting issues as warnings
- max-failing-files option for threshold-based failures

Benefits:

- No more CI failures due to clang-format version differences
- REUSE license checks still enforced (will fail the job)
- Cleaner workflow without continue-on-error hacks
- Formatting warnings still visible in action output

Also format tungsten_ops_kernels.cu to resolve current warnings.
1 parent 055e2d5 commit 4b36ea6

File tree

2 files changed: +73 additions, -71 deletions


.github/workflows/ci.yml

Lines changed: 2 additions & 11 deletions
@@ -28,21 +28,12 @@ jobs:
           submodules: true

       - name: Check code formatting (clang-format)
-        id: clang-format
-        uses: jidicula/clang-format-action@…
+        uses: ahojukka5/clang-format-action@main
         with:
           clang-format-version: '16'
           check-path: '.'
           exclude-regex: '(build|\.cache|extern|third[-_]party|doxygen-awesome-css)'
-        continue-on-error: true
-
-      - name: Warn on formatting issues
-        if: steps.clang-format.outcome == 'failure'
-        run: |
-          echo "⚠️ WARNING: Code formatting check failed"
-          echo "Different clang-format versions may format code differently."
-          echo "Please run 'clang-format -i' on modified files locally."
-          echo "This is a warning only and will not block the build."
+          fail-on-error: 'false'

       - name: Clean up clang-format artifacts
         run: rm -f failing-files.txt
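
For reference, a sketch of how the formatting step reads after this change, assembled from the hunk above (the indentation and surrounding job layout are assumed, not shown in this diff):

      - name: Check code formatting (clang-format)
        uses: ahojukka5/clang-format-action@main
        with:
          clang-format-version: '16'
          check-path: '.'
          exclude-regex: '(build|\.cache|extern|third[-_]party|doxygen-awesome-css)'
          fail-on-error: 'false'   # report formatting issues as warnings instead of failing the job

      - name: Clean up clang-format artifacts
        run: rm -f failing-files.txt

With fail-on-error set to 'false', formatting differences still show up in the action output but leave the job green; other checks in the workflow, such as the REUSE license check mentioned in the commit message, still fail it.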

apps/tungsten/tungsten_ops_kernels.cu

Lines changed: 71 additions & 60 deletions
@@ -5,14 +5,15 @@
  * @file tungsten_ops_kernels.cu
  * @brief CUDA kernel implementations for Tungsten-specific operations
  *
- * This file contains CUDA kernel code for Tungsten operations. It is only compiled when CUDA is enabled.
+ * This file contains CUDA kernel code for Tungsten operations. It is only compiled
+ * when CUDA is enabled.
  */

 #if defined(OpenPFC_ENABLE_CUDA)

 #include "tungsten_ops.hpp"
-#include <cuda_runtime.h>
 #include <cuComplex.h>
+#include <cuda_runtime.h>
 #include <type_traits>

 namespace tungsten {
@@ -23,10 +24,10 @@ namespace detail {
 template <typename RealType>
 __global__ void multiply_complex_real_kernel_impl(
     const typename std::conditional<std::is_same<RealType, double>::value,
-        cuDoubleComplex, cuFloatComplex>::type *a,
+                                    cuDoubleComplex, cuFloatComplex>::type *a,
     const RealType *b,
-    typename std::conditional<std::is_same<RealType, double>::value,
-        cuDoubleComplex, cuFloatComplex>::type *out,
+    typename std::conditional<std::is_same<RealType, double>::value, cuDoubleComplex,
+                              cuFloatComplex>::type *out,
     size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
@@ -43,16 +44,18 @@ __global__ void multiply_complex_real_kernel_impl(
 }

 // Explicit instantiations for float and double (required for CUDA template kernels)
-template __global__ void multiply_complex_real_kernel_impl<double>(
-    const cuDoubleComplex *, const double *, cuDoubleComplex *, size_t);
-template __global__ void multiply_complex_real_kernel_impl<float>(
-    const cuFloatComplex *, const float *, cuFloatComplex *, size_t);
+template __global__ void
+multiply_complex_real_kernel_impl<double>(const cuDoubleComplex *, const double *,
+                                          cuDoubleComplex *, size_t);
+template __global__ void
+multiply_complex_real_kernel_impl<float>(const cuFloatComplex *, const float *,
+                                         cuFloatComplex *, size_t);

 // CUDA kernel: Compute nonlinear term (template-based for precision)
 template <typename RealType>
-__global__ void compute_nonlinear_kernel(
-    const RealType *u, const RealType *v, RealType p3, RealType p4, RealType q3,
-    RealType q4, RealType *out, size_t n) {
+__global__ void compute_nonlinear_kernel(const RealType *u, const RealType *v,
+                                         RealType p3, RealType p4, RealType q3,
+                                         RealType q4, RealType *out, size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
     RealType u_val = u[idx];
@@ -66,39 +69,43 @@ __global__ void compute_nonlinear_kernel(
 }

 // Explicit instantiations for float and double
-template __global__ void compute_nonlinear_kernel<double>(
-    const double *, const double *, double, double, double, double, double *,
-    size_t);
-template __global__ void compute_nonlinear_kernel<float>(
-    const float *, const float *, float, float, float, float, float *, size_t);
+template __global__ void compute_nonlinear_kernel<double>(const double *,
+                                                           const double *, double,
+                                                           double, double, double,
+                                                           double *, size_t);
+template __global__ void compute_nonlinear_kernel<float>(const float *,
+                                                          const float *, float, float,
+                                                          float, float, float *,
+                                                          size_t);

 // CUDA kernel: Apply stabilization (template-based for precision)
 template <typename RealType>
-__global__ void apply_stabilization_kernel(
-    const RealType *in, const RealType *field, RealType stabP, RealType *out,
-    size_t n) {
+__global__ void apply_stabilization_kernel(const RealType *in, const RealType *field,
+                                           RealType stabP, RealType *out, size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
     out[idx] = in[idx] - stabP * field[idx];
   }
 }

 // Explicit instantiations for float and double
-template __global__ void apply_stabilization_kernel<double>(
-    const double *, const double *, double, double *, size_t);
-template __global__ void apply_stabilization_kernel<float>(
-    const float *, const float *, float, float *, size_t);
+template __global__ void apply_stabilization_kernel<double>(const double *,
+                                                             const double *, double,
+                                                             double *, size_t);
+template __global__ void apply_stabilization_kernel<float>(const float *,
+                                                            const float *, float,
+                                                            float *, size_t);

 // CUDA kernel: Apply time integration (template-based for precision)
 template <typename RealType>
 __global__ void apply_time_integration_kernel_impl(
     const typename std::conditional<std::is_same<RealType, double>::value,
-        cuDoubleComplex, cuFloatComplex>::type *psi_F,
+                                    cuDoubleComplex, cuFloatComplex>::type *psi_F,
     const typename std::conditional<std::is_same<RealType, double>::value,
-        cuDoubleComplex, cuFloatComplex>::type *psiN_F,
+                                    cuDoubleComplex, cuFloatComplex>::type *psiN_F,
     const RealType *opL, const RealType *opN,
-    typename std::conditional<std::is_same<RealType, double>::value,
-        cuDoubleComplex, cuFloatComplex>::type *out,
+    typename std::conditional<std::is_same<RealType, double>::value, cuDoubleComplex,
+                              cuFloatComplex>::type *out,
     size_t n) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < n) {
@@ -107,7 +114,7 @@ __global__ void apply_time_integration_kernel_impl(
     cuDoubleComplex psiN_F_val = psiN_F[idx];
     double opL_val = opL[idx];
     double opN_val = opN[idx];
-
+
     // out = opL * psi_F + opN * psiN_F
     cuDoubleComplex term1 = cuCmul(cuDoubleComplex{opL_val, 0.0}, psi_F_val);
     cuDoubleComplex term2 = cuCmul(cuDoubleComplex{opN_val, 0.0}, psiN_F_val);
@@ -117,7 +124,7 @@ __global__ void apply_time_integration_kernel_impl(
     cuFloatComplex psiN_F_val = psiN_F[idx];
     float opL_val = opL[idx];
     float opN_val = opN[idx];
-
+
     // out = opL * psi_F + opN * psiN_F
     cuFloatComplex term1 = cuCmulf(cuFloatComplex{opL_val, 0.0f}, psi_F_val);
     cuFloatComplex term2 = cuCmulf(cuFloatComplex{opN_val, 0.0f}, psiN_F_val);
@@ -130,9 +137,10 @@ __global__ void apply_time_integration_kernel_impl(
 template __global__ void apply_time_integration_kernel_impl<double>(
     const cuDoubleComplex *, const cuDoubleComplex *, const double *, const double *,
     cuDoubleComplex *, size_t);
-template __global__ void apply_time_integration_kernel_impl<float>(
-    const cuFloatComplex *, const cuFloatComplex *, const float *, const float *,
-    cuFloatComplex *, size_t);
+template __global__ void
+apply_time_integration_kernel_impl<float>(const cuFloatComplex *,
+                                          const cuFloatComplex *, const float *,
+                                          const float *, cuFloatComplex *, size_t);

 // Helper function to launch kernels with appropriate grid/block sizes
 // Optimized for modern GPUs (H100): use larger block size for better occupancy
@@ -182,13 +190,14 @@ void TungstenOps<pfc::backend::CudaTag, double>::multiply_complex_real_impl(
     throw std::runtime_error("CUDA kernel launch failed (multiply_complex_real): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 void TungstenOps<pfc::backend::CudaTag, double>::compute_nonlinear_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &u,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &v,
-    double p3, double p4, double q3, double q4,
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &v, double p3,
+    double p4, double q3, double q4,
     pfc::core::DataBuffer<pfc::backend::CudaTag, double> &out) {
   const size_t N = u.size();
   if (v.size() != N || out.size() != N) {
@@ -209,13 +218,13 @@ void TungstenOps<pfc::backend::CudaTag, double>::compute_nonlinear_impl(
     throw std::runtime_error("CUDA kernel launch failed (compute_nonlinear): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 void TungstenOps<pfc::backend::CudaTag, double>::apply_stabilization_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &in,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &field,
-    double stabP,
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, double> &field, double stabP,
     pfc::core::DataBuffer<pfc::backend::CudaTag, double> &out) {
   const size_t N = in.size();
   if (field.size() != N || out.size() != N) {
@@ -228,15 +237,16 @@ void TungstenOps<pfc::backend::CudaTag, double>::apply_stabilization_impl(
   int blocks, threads_per_block;
   launch_kernel(N, blocks, threads_per_block);

-  detail::apply_stabilization_kernel<double><<<blocks, threads_per_block>>>(
-      in.data(), field.data(), stabP, out.data(), N);
+  detail::apply_stabilization_kernel<double>
+      <<<blocks, threads_per_block>>>(in.data(), field.data(), stabP, out.data(), N);

   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_stabilization): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 void TungstenOps<pfc::backend::CudaTag, double>::apply_time_integration_impl(
@@ -264,16 +274,16 @@ void TungstenOps<pfc::backend::CudaTag, double>::apply_time_integration_impl(
   const double *opN_ptr = opN.data();
   cuDoubleComplex *out_ptr = reinterpret_cast<cuDoubleComplex *>(out.data());

-  apply_time_integration_kernel_impl<double>
-      <<<blocks, threads_per_block>>>(psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr,
-                                      out_ptr, N);
+  apply_time_integration_kernel_impl<double><<<blocks, threads_per_block>>>(
+      psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr, out_ptr, N);

   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_time_integration): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 // CUDA specialization for float precision - implement methods
@@ -304,14 +314,14 @@ void TungstenOps<pfc::backend::CudaTag, float>::multiply_complex_real_impl(
     throw std::runtime_error("CUDA kernel launch failed (multiply_complex_real): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 void TungstenOps<pfc::backend::CudaTag, float>::compute_nonlinear_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &u,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &v,
-    float p3, float p4, float q3, float q4,
-    pfc::core::DataBuffer<pfc::backend::CudaTag, float> &out) {
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &v, float p3, float p4,
+    float q3, float q4, pfc::core::DataBuffer<pfc::backend::CudaTag, float> &out) {
   const size_t N = u.size();
   if (v.size() != N || out.size() != N) {
     throw std::runtime_error("Size mismatch in compute_nonlinear");
@@ -331,13 +341,13 @@ void TungstenOps<pfc::backend::CudaTag, float>::compute_nonlinear_impl(
     throw std::runtime_error("CUDA kernel launch failed (compute_nonlinear): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 void TungstenOps<pfc::backend::CudaTag, float>::apply_stabilization_impl(
     const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &in,
-    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &field,
-    float stabP,
+    const pfc::core::DataBuffer<pfc::backend::CudaTag, float> &field, float stabP,
     pfc::core::DataBuffer<pfc::backend::CudaTag, float> &out) {
   const size_t N = in.size();
   if (field.size() != N || out.size() != N) {
@@ -350,15 +360,16 @@ void TungstenOps<pfc::backend::CudaTag, float>::apply_stabilization_impl(
   int blocks, threads_per_block;
   launch_kernel(N, blocks, threads_per_block);

-  detail::apply_stabilization_kernel<float><<<blocks, threads_per_block>>>(
-      in.data(), field.data(), stabP, out.data(), N);
+  detail::apply_stabilization_kernel<float>
+      <<<blocks, threads_per_block>>>(in.data(), field.data(), stabP, out.data(), N);

   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_stabilization): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 void TungstenOps<pfc::backend::CudaTag, float>::apply_time_integration_impl(
@@ -386,16 +397,16 @@ void TungstenOps<pfc::backend::CudaTag, float>::apply_time_integration_impl(
   const float *opN_ptr = opN.data();
   cuFloatComplex *out_ptr = reinterpret_cast<cuFloatComplex *>(out.data());

-  detail::apply_time_integration_kernel_impl<float>
-      <<<blocks, threads_per_block>>>(psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr,
-                                      out_ptr, N);
+  detail::apply_time_integration_kernel_impl<float><<<blocks, threads_per_block>>>(
+      psi_F_ptr, psiN_F_ptr, opL_ptr, opN_ptr, out_ptr, N);

   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     throw std::runtime_error("CUDA kernel launch failed (apply_time_integration): " +
                              std::string(cudaGetErrorString(err)));
   }
-  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU utilization
+  // Removed cudaDeviceSynchronize() - allows kernel overlap and better GPU
+  // utilization
 }

 } // namespace detail
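
The hunks above keep re-wrapping the same launch pattern used in every impl function: compute a grid, launch the kernel, check cudaGetLastError(), and skip cudaDeviceSynchronize() so independent kernels can overlap. A minimal, self-contained sketch of that pattern follows; the scale_kernel, launch_scale, and the 256-thread block size are hypothetical stand-ins and not code from this commit (the real launch_kernel() helper chooses the block size, per the H100 comment above):

#include <cuda_runtime.h>

#include <stdexcept>
#include <string>

// Hypothetical kernel standing in for the Tungsten kernels above.
__global__ void scale_kernel(const double *in, double s, double *out, size_t n) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    out[idx] = s * in[idx];
  }
}

// Sketch of the launch / error-check pattern used throughout this file:
// pick a grid, launch, check cudaGetLastError(), and do not synchronize.
void launch_scale(const double *in, double s, double *out, size_t n) {
  const int threads_per_block = 256; // assumed; the real helper picks this value
  const int blocks =
      static_cast<int>((n + threads_per_block - 1) / threads_per_block);

  scale_kernel<<<blocks, threads_per_block>>>(in, s, out, n);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    throw std::runtime_error("CUDA kernel launch failed (scale): " +
                             std::string(cudaGetErrorString(err)));
  }
  // No cudaDeviceSynchronize() here: execution errors surface at the next
  // synchronizing call (e.g. a copy back to the host) instead of blocking now.
}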
