Skip to content

Commit 28e7ae9

Browse files
committed
fixup! Add minimal clang-tidy pass to cmake and CI
1 parent b0cbe1f commit 28e7ae9

File tree

6 files changed

+43
-6
lines changed

6 files changed

+43
-6
lines changed

ci/matrix.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ workflows:
2121
# args: '--preset libcudacxx --lit-tests "cuda/utility/basic_any.pass.cpp"' }
2222
#
2323
override:
24+
- { jobs: ['build'], project: 'tidy', std: 'min', cxx: ['clang'], cudacxx: ['clang'], ctk: 'clang-cuda', sm: '75' }
2425

2526
pull_request:
2627
# Old CTK: Oldest/newest supported host compilers:

cudax/include/cuda/experimental/__cuco/hyperloglog_ref.cuh

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#endif // no system header
2323

2424
#include <cuda/std/__cstddef/types.h>
25+
#include <cuda/std/__type_traits/enable_if.h>
26+
#include <cuda/std/__type_traits/is_convertible.h>
2527
#include <cuda/std/span>
2628
#include <cuda/stream>
2729

@@ -105,8 +107,38 @@ public:
105107
//!
106108
//! @param __group CUDA Cooperative group this operation is executed in
107109
template <class _CG>
108-
_CCCL_DEVICE constexpr void clear(_CG __group) noexcept
110+
_CCCL_DEVICE constexpr ::cuda::std::enable_if_t<!::cuda::std::is_convertible_v<_CG, ::cuda::stream_ref>, void>
111+
clear(_CG __group) noexcept
109112
{
113+
// The enable_if above is to work around an incompatibility between host and device
114+
// overload preference for clang and NVCC. See
115+
// https://llvm.org/docs/CompileCudaWithLLVM.html#overloading-based-on-host-and-device-attributes
116+
// for further reading, but the bottom line is when:
117+
//
118+
// 1. Compiling in device mode (and clang compiles CUDA in a "hybrid" host-device mode,
119+
// also explained by the link above).
120+
// 2. And the current function is __host__ __device__.
121+
// 3. And the function whose overload needs to be resolved has both a __host__ __device__,
122+
// and __device__ (and/or __host__) overload.
123+
//
124+
// Then clang will prefer these overloads (assuming they have equal priority under C++
125+
// rules) in the following order:
126+
//
127+
// 1. __host__ __device__
128+
// 2. __device__
129+
// 3. __host__
130+
//
131+
// In this particular case, `clear(_CG)` conflicts with `clear(::cuda::stream_ref)` when called
132+
// from `hyperloglog::clear(::cuda::stream_ref)`. `hyperloglog::clear(::cuda::stream_ref)`
133+
// is constexpr, and therefore implicitly __host__ __device__. Since
134+
// `clear(::cuda::stream_ref)` on this class is only __host__, it will take lower priority
135+
// than `clear(_CG)`, and we get:
136+
//
137+
// cudax/include/cuda/experimental/__cuco/__hyperloglog/hyperloglog_impl.cuh:131:28: error: no member named
138+
// 'thread_rank' in 'cuda::stream_ref' [clang-diagnostic-error]
139+
//
140+
// 131 | for (int __i = __group.thread_rank(); __i < __sketch.size(); __i += __group.size())
141+
// | ~~~~~~~ ^
110142
__impl.__clear(__group);
111143
}
112144

cudax/include/cuda/experimental/__execution/stream/adaptor.cuh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,11 @@ private:
301301

302302
// without the following, the kernel in __host_start will fail to launch with
303303
// cudaErrorInvalidDeviceFunction.
304+
#if _CCCL_HAS_CDP()
305+
// clang<22 errors when compiling this, complaining that we are taking a reference to
306+
// __global__ function inside a __device__ function.
304307
::__cccl_unused(&__completion_kernel<__block_threads, _Rcvr, __results_t>);
308+
#endif
305309
__state.__state_.__complete_inline_ = true;
306310
execution::start(__state.__opstate_);
307311
}

cudax/include/cuda/experimental/__stf/internal/launch.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ public:
181181
template <typename Fun>
182182
void operator->*(Fun&& f)
183183
{
184-
# if __NVCOMPILER
185-
// With nvc++, all lambdas can run on host and device.
184+
# if __NVCOMPILER || _CCCL_CUDA_COMPILER(CLANG)
185+
// With nvc++ or clang, all lambdas can run on host and device.
186186
static constexpr bool is_extended_host_device_lambda_closure_type = true,
187187
is_extended_device_lambda_closure_type = false;
188188
# else
@@ -306,7 +306,7 @@ public:
306306
template <typename Fun>
307307
void operator->*(Fun&& f)
308308
{
309-
# if __NVCOMPILER
309+
# if __NVCOMPILER || _CCCL_CUDA_COMPILER(CLANG)
310310
// With nvc++ or clang, all lambdas can run on host and device.
311311
static constexpr bool is_extended_host_device_lambda_closure_type = true,
312312
is_extended_device_lambda_closure_type = false;

cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ public:
614614

615615
static constexpr bool need_reduction = (deps_ops_t::does_work || ...);
616616

617-
# if __NVCOMPILER
617+
# if __NVCOMPILER || _CCCL_CUDA_COMPILER(CLANG)
618618
// With nvc++ or clang, all lambdas can run on host and device.
619619
static constexpr bool is_extended_host_device_lambda_closure_type = true,
620620
is_extended_device_lambda_closure_type = false;

cudax/test/execution/test_let_value.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ C2H_TEST("let_value works when the function returns a dependent sender", "[adapt
467467

468468
#endif // _CCCL_HOST_COMPILATION()
469469

470-
#if !_CCCL_CUDA_COMPILER(NVCC) || !defined(_CCCL_CLANG_TIDY_INVOKED)
470+
#if !_CCCL_CUDA_COMPILER(NVCC) && !defined(_CCCL_CLANG_TIDY_INVOKED)
471471
// This example causes nvcc to segfault, and clang-tidy to error out with
472472
//
473473
// cudax/test/execution/test_let_value.cu:487:17: error: static assertion failed due to requirement

0 commit comments

Comments
 (0)