Fix float64 precision loss in sparse Pearson residual kernels (#658)

Arshammik · Intron7 · web-flow · commit 42123b3721c5 · 2026-05-15T10:32:27.000Z
* Fix float64 precision loss in sparse Pearson residual kernels

The CSR and CSC Pearson residual kernels in _cuda/pr/kernels_pr.cuh
divided by `sqrtf`, the single-precision square-root intrinsic. Because
both kernels are templated on the element type `T`, a `T=double`
instantiation silently narrowed the variance term
`mu + mu * mu * inv_theta` to float32, evaluated the square root at
single precision, and promoted the result back to double. The float64
path of `pp.normalize_pearson_residuals` (and `pp.highly_variable_genes`
with `flavor='pearson_residuals'`) was therefore capped at ~7
significant digits regardless of the requested dtype. The dense kernel
`dense_norm_res_kernel` already used the overloaded `sqrt` and was
unaffected.

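Replace `sqrtf` with the overloaded `sqrt` on both sparse paths. `sqrt`
dispatches to the single-precision root for `T=float` and the
double-precision root for `T=double`, so the float32 path is
byte-identical to before and only the float64 path changes.

Note: the final revision of this PR (see "switch to rsqrt" below)
multiplies by the overloaded `rsqrt` instead of dividing by `sqrt`. The
byte-identical float32 statement and the measurements below appear to
have been written for the `sqrt` revision and may not hold bit-for-bit
for the `rsqrt` form.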

Hardware verification (NVIDIA H100 80GB HBM3, CUDA 12.6, sm_90):
A standalone harness compiled the real `sparse_norm_res_csr_kernel`
verbatim and ran it on a 4000x4000 synthetic CSR count matrix against a
host float64 reference.

  T=double, before fix:   max relative error 8.83e-08  (~7.1 digits)
  T=double, after fix:    max relative error 3.97e-16  (~15.4 digits)
  T=float,  before/after: bit-identical (max abs diff 0.0)

The float64 path is now ~8 orders of magnitude more accurate; the
float32 path is provably unchanged.

Add `test_normalize_pearson_residuals_float64_precision` to
tests/test_normalization.py. It pins the float64 CSR/CSC output to an
analytic float64 reference at rtol/atol 1e-9 -- tight enough to fail on
a single-precision result and pass on a genuine float64 one -- across
theta in {100, inf}.

* add PR number

* switch to rsqrt (multiply by the overloaded `rsqrt(...)` instead of dividing by the square root)

---------

Co-authored-by: Intron7 <severin.dicks@icloud.com>
Co-authored-by: Severin Dicks <37635888+Intron7@users.noreply.github.com>
diff --git a/docs/release-notes/0.15.1.md b/docs/release-notes/0.15.1.md
@@ -8,6 +8,7 @@
 ```{rubric} Bug fixes
 ```
 * Fixes `tl.rank_genes_groups` returning NaN/zero `logfoldchanges`/`pvals` with `groups=[subset]` and `reference='rest'` {pr}`651` {smaller}`S Dicks`
+* Fixes float64 precision loss in `pp.normalize_pearson_residuals` on CSR/CSC input {pr}`658` {smaller}`A Mikaeili & S Dicks`
 
 ```{rubric} Misc
 ```
diff --git a/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh b/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh
@@ -24,7 +24,7 @@ __global__ void sparse_norm_res_csc_kernel(
             ++sparse_idx;
         }
         residuals[res_index] -= mu;
-        residuals[res_index] /= sqrtf(mu + mu * mu * inv_theta);
+        residuals[res_index] *= rsqrt(mu + mu * mu * inv_theta);
         // clamp to [-clip, clip]
         if (residuals[res_index] < -clip) residuals[res_index] = -clip;
         if (residuals[res_index] > clip) residuals[res_index] = clip;
@@ -53,7 +53,7 @@ __global__ void sparse_norm_res_csr_kernel(
             ++sparse_idx;
         }
         residuals[res_index] -= mu;
-        residuals[res_index] /= sqrtf(mu + mu * mu * inv_theta);
+        residuals[res_index] *= rsqrt(mu + mu * mu * inv_theta);
 
         if (residuals[res_index] < -clip) residuals[res_index] = -clip;
         if (residuals[res_index] > clip) residuals[res_index] = clip;
diff --git a/tests/test_normalization.py b/tests/test_normalization.py
@@ -90,6 +90,46 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
         assert np.min(output_X) >= -clip
 
 
+@pytest.mark.parametrize(
+    "sparsity_func", [csr_matrix, csc_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize("theta", [100.0, np.inf])
+def test_normalize_pearson_residuals_float64_precision(sparsity_func, theta):
+    """Regression test: float64 precision of the sparse Pearson-residual kernels.
+
+    ``sparse_norm_res_csr_kernel`` / ``sparse_norm_res_csc_kernel`` (in
+    ``_cuda/pr/kernels_pr.cuh``) previously divided by the single-precision
+    intrinsic ``sqrtf``. Because the kernels are templated on the element
+    type, a ``float64`` instantiation silently narrowed the variance term
+    to ``float32``, capping accuracy at ~7 significant digits regardless of
+    the requested dtype. The ``rtol``/``atol`` of 1e-9 below is tight enough
+    to fail on a single-precision result and pass on a genuine float64 one.
+    """
+    rng = np.random.default_rng(0)
+    counts = rng.poisson(0.3, size=(300, 200)).astype(np.float64)
+    # ensure every gene and cell has a nonzero total so mu > 0 everywhere
+    counts[0, :] += 1
+    counts[:, 0] += 1
+    X = cp.asarray(counts)
+
+    # analytic float64 reference residuals (no clipping)
+    ns = cp.sum(X, axis=1)
+    ps = cp.sum(X, axis=0) / cp.sum(X)
+    mu = cp.outer(ns, ps)
+    if np.isinf(theta):
+        reference = (X - mu) / cp.sqrt(mu)
+    else:
+        reference = (X - mu) / cp.sqrt(mu + mu**2 / theta)
+
+    cudata = AnnData(X=sparsity_func(X, dtype=np.float64))
+    output = rsc.pp.normalize_pearson_residuals(
+        cudata, theta=theta, clip=np.inf, inplace=False
+    )
+
+    # the buggy `sqrtf` path is only ~1e-7 accurate; 1e-9 cleanly separates it
+    cp.testing.assert_allclose(output, reference, rtol=1e-9, atol=1e-9)
+
+
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 @pytest.mark.parametrize("sparse", [True, False])
 @pytest.mark.parametrize("base", [None, 2, 10])