Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/0.15.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ is passed to ``tl.leiden`` and ``tl.louvain`` to match behaviour in Scanpy, and
resolutions are passed. Previously it was always stored as a list. {pr}`648`. {smaller}`J Pintar`
* Drop dependency on ``cuml.thirdparty_adapters.check_array`` (removed in cuml 26.06); ``init_pos`` validation in ``tl.umap`` and ``tl.draw_graph`` is now handled locally {pr}`660` {smaller}`S Dicks`
* Unify cuBLAS handle creation across GMM and harmony {pr}`662` {smaller}`S Dicks`
* Use C++ math overloads in templated CUDA kernels and switch `1/sqrt` to `rsqrt` where precision-tolerant {pr}`666` {smaller}`S Dicks`
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#pragma once

#include <cuda_runtime.h>
#include <type_traits>

// ---- Fused entropy kernel ----
// One block per row. Row-normalize R, accumulate x*log(x+eps), atomicAdd scaled
Expand Down Expand Up @@ -44,10 +43,7 @@ __global__ void entropy_kernel(const T* __restrict__ R, T sigma, int n_cells,
T entropy = T(0);
for (int col = threadIdx.x; col < n_clusters; col += blockDim.x) {
T x = R_row[col] * inv_rsum;
if constexpr (std::is_same<T, float>::value)
entropy += x * logf(x + T(1e-12));
else
entropy += x * log(x + T(1e-12));
entropy += x * log(x + T(1e-12));
}

#pragma unroll
Expand Down Expand Up @@ -83,12 +79,7 @@ __global__ void diversity_kernel(const T* __restrict__ O,
int batch = i / n_clusters;
T numer = Stabilized ? (O[i] + E[i] + T(1)) : (O[i] + T(1));
T ratio = numer / (E[i] + T(1));
T log_val;
if constexpr (std::is_same<T, float>::value)
log_val = logf(ratio);
else
log_val = log(ratio);
acc += theta[batch] * O[i] * log_val;
acc += theta[batch] * O[i] * log(ratio);
}

#pragma unroll
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#pragma once

#include <cuda_runtime.h>
#include <type_traits>

// ---- L2 row normalize ----
// One block per row. Writes to separate output buffer.
Expand Down Expand Up @@ -42,13 +41,9 @@ __global__ void l2_row_normalize_kernel(const T* __restrict__ src,
for (int offset = 16; offset > 0; offset >>= 1)
val += __shfl_down_sync(0xffffffff, val, offset);
if (threadIdx.x == 0) {
T norm = val;
if constexpr (std::is_same<T, float>::value)
norm = sqrtf(norm);
else
norm = sqrt(norm);
if (norm < T(1e-12)) norm = T(1e-12);
warp_sums[0] = T(1) / norm;
T inv_norm = rsqrt(val);
if (inv_norm > T(1e12)) inv_norm = T(1e12);
warp_sums[0] = inv_norm;
}
}
__syncthreads();
Expand Down
12 changes: 2 additions & 10 deletions src/rapids_singlecell/_cuda/harmony/pen/kernels_pen.cuh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#pragma once

#include <cuda_runtime.h>
#include <type_traits>

// ---- Penalty kernel ----
// Stabilized=false: penalty = pow((E+1) / (O+1), theta) [Harmony1]
Expand All @@ -18,10 +17,7 @@ __global__ void penalty_kernel(const T* __restrict__ E, const T* __restrict__ O,
T denom = Stabilized ? (O[i] + E[i] + T(1)) : (O[i] + T(1));
T ratio = (E[i] + T(1)) / denom;
T th = theta[batch];
if constexpr (std::is_same<T, float>::value)
penalty[i] = powf(ratio, th);
else
penalty[i] = pow(ratio, th);
penalty[i] = pow(ratio, th);
}

// ---- Fused penalty + normalize ----
Expand All @@ -45,11 +41,7 @@ __global__ void fused_pen_norm_kernel(const T* __restrict__ similarities,
T local_sum = T(0);
for (int col = threadIdx.x; col < n_cols; col += blockDim.x) {
T sim = similarities[sim_row * n_cols + col];
T val;
if constexpr (std::is_same<T, float>::value)
val = expf(term * (T(1) - sim));
else
val = exp(term * (T(1) - sim));
T val = exp(term * (T(1) - sim));
val *= penalty[(size_t)cat * n_cols + col];
R_out[(size_t)row * n_cols + col] = val;
local_sum += val;
Expand Down
8 changes: 4 additions & 4 deletions src/rapids_singlecell/_cuda/nn_descent/kernels_dist.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ __global__ void compute_distances_cosine_kernel(
float v = data[base1 + d];
sum_i1 += v * v; // powf(v, 2)
}
float norm_i1 = sqrtf(sum_i1);
float inv_norm_i1 = (sum_i1 > 0.0f) ? rsqrtf(sum_i1) : 0.0f;
for (long long j = 0; j < n_neighbors; ++j) {
long long i2 = static_cast<long long>(pairs[i1 * n_neighbors + j]);
float dot = 0.0f;
Expand All @@ -47,10 +47,10 @@ __global__ void compute_distances_cosine_kernel(
float v1 = data[base1 + d];
float v2 = data[base2 + d];
dot += v1 * v2;
sum_i2 += v2 * v2; // powf(v2, 2)
sum_i2 += v2 * v2;
}
float denom = norm_i1 * sqrtf(sum_i2);
out[i1 * n_neighbors + j] = 1.0f - (denom > 0.0f ? dot / denom : 0.0f);
float inv_norm_i2 = (sum_i2 > 0.0f) ? rsqrtf(sum_i2) : 0.0f;
out[i1 * n_neighbors + j] = 1.0f - dot * inv_norm_i1 * inv_norm_i2;
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/rapids_singlecell/_cuda/pr/kernels_pr.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ __global__ void dense_norm_res_kernel(const T* __restrict__ X,
T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
long long res_index = static_cast<long long>(cell) * n_genes + gene;
T r = X[res_index] - mu;
r /= sqrt(mu + mu * mu * inv_theta);
r *= rsqrt(mu + mu * mu * inv_theta);
if (r < -clip) r = -clip;
if (r > clip) r = clip;
residuals[res_index] = r;
Comment on lines 76 to 82
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Potential NaN when mu = 0 and X = mu.

When mu = 0 (e.g., gene or cell with zero sum), rsqrt(0) = +inf. If X[res_index] also equals zero, then r = 0 and 0 * inf = NaN. NaN values will not be clamped by lines 80-81 (since NaN comparisons are always false).

This is not a regression from the rsqrt change (the original / sqrt(0) would also produce NaN), but worth considering a guard for robustness.

Suggested guard
     T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
     long long res_index = static_cast<long long>(cell) * n_genes + gene;
     T r = X[res_index] - mu;
-    r *= rsqrt(mu + mu * mu * inv_theta);
+    T var = mu + mu * mu * inv_theta;
+    if (var > T(0)) r *= rsqrt(var);
     if (r < -clip) r = -clip;
     if (r > clip) r = clip;
     residuals[res_index] = r;

As per coding guidelines: "add epsilon checks before division, handle... NaN/Inf in input data."

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
long long res_index = static_cast<long long>(cell) * n_genes + gene;
T r = X[res_index] - mu;
r /= sqrt(mu + mu * mu * inv_theta);
r *= rsqrt(mu + mu * mu * inv_theta);
if (r < -clip) r = -clip;
if (r > clip) r = clip;
residuals[res_index] = r;
T mu = sums_genes[gene] * sums_cells[cell] * inv_inv_sum_total;
long long res_index = static_cast<long long>(cell) * n_genes + gene;
T r = X[res_index] - mu;
T var = mu + mu * mu * inv_theta;
if (var > T(0)) r *= rsqrt(var);
if (r < -clip) r = -clip;
if (r > clip) r = clip;
residuals[res_index] = r;
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/rapids_singlecell/_cuda/pr/kernels_pr.cuh` around lines 76 - 82, The
computation can produce NaN when mu == 0 because rsqrt(0) is +inf and 0 * +inf
-> NaN; update the block computing r (using symbols mu, X[res_index], rsqrt,
inv_theta, residuals, clip, sums_genes, sums_cells, inv_inv_sum_total) to guard
against denom == 0: compute denom = mu + mu*mu*inv_theta and if denom is zero
(or extremely small) set r = 0 (or clamp to safe value) instead of calling
rsqrt; otherwise compute r = (X[res_index] - mu) * rsqrt(denom) and then apply
the existing clip logic so residuals[res_index] never becomes NaN/Inf.

Expand Down
Loading