scverse · Intron7 · May 16, 2026 · May 16, 2026 · May 16, 2026 · coderabbitai
diff --git a/docs/release-notes/0.15.1.md b/docs/release-notes/0.15.1.md
@@ -21,3 +21,4 @@ is passed to ``tl.leiden`` and ``tl.louvain`` to match behaviour in Scanpy, and
 resolutions are passed. Previously it was always stored as a list. {pr}`648`. {smaller}`J Pintar`
 * Drop dependency on ``cuml.thirdparty_adapters.check_array`` (removed in cuml 26.06); ``init_pos`` validation in ``tl.umap`` and ``tl.draw_graph`` is now handled locally {pr}`660` {smaller}`S Dicks`
 * Unify cuBLAS handle creation across GMM and harmony {pr}`662` {smaller}`S Dicks`
+* Use C++ math overloads in templated CUDA kernels and replace ``1/sqrt`` with ``rsqrt`` where the small loss of precision is acceptable {pr}`666` {smaller}`S Dicks`
diff --git a/src/rapids_singlecell/_cuda/harmony/clustering/kernels_clustering.cuh b/src/rapids_singlecell/_cuda/harmony/clustering/kernels_clustering.cuh
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <type_traits>
 
 // ---- Fused entropy kernel ----
 // One block per row. Row-normalize R, accumulate x*log(x+eps), atomicAdd scaled
@@ -44,10 +43,7 @@ __global__ void entropy_kernel(const T* __restrict__ R, T sigma, int n_cells,
     T entropy = T(0);
     for (int col = threadIdx.x; col < n_clusters; col += blockDim.x) {
         T x = R_row[col] * inv_rsum;
-        if constexpr (std::is_same<T, float>::value)
-            entropy += x * logf(x + T(1e-12));
-        else
-            entropy += x * log(x + T(1e-12));
+        entropy += x * log(x + T(1e-12));
     }
 
 #pragma unroll
@@ -83,12 +79,7 @@ __global__ void diversity_kernel(const T* __restrict__ O,
         int batch = i / n_clusters;
         T numer = Stabilized ? (O[i] + E[i] + T(1)) : (O[i] + T(1));
         T ratio = numer / (E[i] + T(1));
-        T log_val;
-        if constexpr (std::is_same<T, float>::value)
-            log_val = logf(ratio);
-        else
-            log_val = log(ratio);
-        acc += theta[batch] * O[i] * log_val;
+        acc += theta[batch] * O[i] * log(ratio);
     }
 
 #pragma unroll

diff --git a/src/rapids_singlecell/_cuda/harmony/normalize/kernels_normalize.cuh b/src/rapids_singlecell/_cuda/harmony/normalize/kernels_normalize.cuh
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <type_traits>
 
 // ---- L2 row normalize ----
 // One block per row. Writes to separate output buffer.
@@ -42,13 +41,9 @@ __global__ void l2_row_normalize_kernel(const T* __restrict__ src,
         for (int offset = 16; offset > 0; offset >>= 1)
             val += __shfl_down_sync(0xffffffff, val, offset);
         if (threadIdx.x == 0) {
-            T norm = val;
-            if constexpr (std::is_same<T, float>::value)
-                norm = sqrtf(norm);
-            else
-                norm = sqrt(norm);
-            if (norm < T(1e-12)) norm = T(1e-12);
-            warp_sums[0] = T(1) / norm;
+            T inv_norm = rsqrt(val);
+            if (inv_norm > T(1e12)) inv_norm = T(1e12);
+            warp_sums[0] = inv_norm;
         }
     }
     __syncthreads();

diff --git a/src/rapids_singlecell/_cuda/harmony/pen/kernels_pen.cuh b/src/rapids_singlecell/_cuda/harmony/pen/kernels_pen.cuh
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <type_traits>
 
 // ---- Penalty kernel ----
 // Stabilized=false: penalty = pow((E+1) / (O+1), theta)       [Harmony1]
@@ -18,10 +17,7 @@ __global__ void penalty_kernel(const T* __restrict__ E, const T* __restrict__ O,
     T denom = Stabilized ? (O[i] + E[i] + T(1)) : (O[i] + T(1));
     T ratio = (E[i] + T(1)) / denom;
     T th = theta[batch];
-    if constexpr (std::is_same<T, float>::value)
-        penalty[i] = powf(ratio, th);
-    else
-        penalty[i] = pow(ratio, th);
+    penalty[i] = pow(ratio, th);
 }
 
 // ---- Fused penalty + normalize ----
@@ -45,11 +41,7 @@ __global__ void fused_pen_norm_kernel(const T* __restrict__ similarities,
     T local_sum = T(0);
     for (int col = threadIdx.x; col < n_cols; col += blockDim.x) {
         T sim = similarities[sim_row * n_cols + col];
-        T val;
-        if constexpr (std::is_same<T, float>::value)
-            val = expf(term * (T(1) - sim));
-        else
-            val = exp(term * (T(1) - sim));
+        T val = exp(term * (T(1) - sim));
         val *= penalty[(size_t)cat * n_cols + col];
         R_out[(size_t)row * n_cols + col] = val;
         local_sum += val;

diff --git a/src/rapids_singlecell/_cuda/nn_descent/kernels_dist.cuh b/src/rapids_singlecell/_cuda/nn_descent/kernels_dist.cuh
@@ -37,7 +37,7 @@ __global__ void compute_distances_cosine_kernel(
         float v = data[base1 + d];
         sum_i1 += v * v;  // powf(v, 2)
     }
-    float norm_i1 = sqrtf(sum_i1);
+    float inv_norm_i1 = (sum_i1 > 0.0f) ? rsqrtf(sum_i1) : 0.0f;
     for (long long j = 0; j < n_neighbors; ++j) {
         long long i2 = static_cast<long long>(pairs[i1 * n_neighbors + j]);
         float dot = 0.0f;
@@ -47,10 +47,10 @@ __global__ void compute_distances_cosine_kernel(
             float v1 = data[base1 + d];
             float v2 = data[base2 + d];
             dot += v1 * v2;
-            sum_i2 += v2 * v2;  // powf(v2, 2)
+            sum_i2 += v2 * v2;
         }
-        float denom = norm_i1 * sqrtf(sum_i2);
-        out[i1 * n_neighbors + j] = 1.0f - (denom > 0.0f ? dot / denom : 0.0f);
+        float inv_norm_i2 = (sum_i2 > 0.0f) ? rsqrtf(sum_i2) : 0.0f;
+        out[i1 * n_neighbors + j] = 1.0f - dot * inv_norm_i1 * inv_norm_i2;
     }
 }
 

diff --git a/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh b/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh
@@ -76,7 +76,7 @@ __global__ void dense_norm_res_kernel(const T* __restrict__ X,
     T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
     long long res_index = static_cast<long long>(cell) * n_genes + gene;
     T r = X[res_index] - mu;
-    r /= sqrt(mu + mu * mu * inv_theta);
+    r *= rsqrt(mu + mu * mu * inv_theta);
     if (r < -clip) r = -clip;
     if (r > clip) r = clip;
     residuals[res_index] = r;
-    T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
-    long long res_index = static_cast<long long>(cell) * n_genes + gene;
-    T r = X[res_index] - mu;
-    r /= sqrt(mu + mu * mu * inv_theta);
-    r *= rsqrt(mu + mu * mu * inv_theta);
-    if (r < -clip) r = -clip;
-    if (r > clip) r = clip;
-    residuals[res_index] = r;
+    T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
+    long long res_index = static_cast<long long>(cell) * n_genes + gene;
+    T r = X[res_index] - mu;
+    T var = mu + mu * mu * inv_theta;
+    if (var > T(0)) r *= rsqrt(var);
+    if (r < -clip) r = -clip;
+    if (r > clip) r = clip;
+    residuals[res_index] = r;
-    T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
-    long long res_index = static_cast<long long>(cell) * n_genes + gene;
-    T r = X[res_index] - mu;
-    r /= sqrt(mu + mu * mu * inv_theta);
-    r *= rsqrt(mu + mu * mu * inv_theta);
-    if (r < -clip) r = -clip;
-    if (r > clip) r = clip;
-    residuals[res_index] = r;
+    T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
+    long long res_index = static_cast<long long>(cell) * n_genes + gene;
+    T r = X[res_index] - mu;
+    T var = mu + mu * mu * inv_theta;
+    if (var > T(0)) r *= rsqrt(var);
+    if (r < -clip) r = -clip;
+    if (r > clip) r = clip;
+    residuals[res_index] = r;