move decoupler

Intron7 · Intron7 · commit 20cf11ec05f3 · 2025-09-16T12:31:22.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -56,6 +56,7 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_aggr_cuda         src/rapids_singlecell/_cuda/aggr/aggr.cu)
   add_nb_cuda_module(_spca_cuda         src/rapids_singlecell/_cuda/spca/spca.cu)
   add_nb_cuda_module(_ligrec_cuda       src/rapids_singlecell/_cuda/ligrec/ligrec.cu)
+  add_nb_cuda_module(_pv_cuda           src/rapids_singlecell/_cuda/pv/pv.cu)
   # Harmony CUDA modules
   add_nb_cuda_module(_harmony_scatter_cuda   src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
   add_nb_cuda_module(_harmony_outer_cuda     src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
diff --git a/src/rapids_singlecell/_cuda/pv/kernels_pv.cuh b/src/rapids_singlecell/_cuda/pv/kernels_pv.cuh
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+// Reverse cumulative minimum along each row of a row-major float64 matrix.
+//
+// For every row r: y[r, j] = min(x[r, j], x[r, j + 1], ..., x[r, m - 1]).
+//
+// x      : input, n_rows * m doubles, row r starting at offset r * m.
+// y      : output with the same layout; declared __restrict__, so it must not alias x.
+// n_rows : number of rows.
+// m      : number of elements per row.
+//
+// Launch layout: 1D grid with at least n_rows threads in total; each thread
+// scans one whole row serially from right to left. No shared memory is used.
+__global__ void rev_cummin64_kernel(const double* __restrict__ x, double* __restrict__ y,
+                                    int n_rows, int m) {
+  int r = blockDim.x * blockIdx.x + threadIdx.x;
+  // Surplus threads in the last block have no row to process. An empty row
+  // (m <= 0) has no last element, so xr[m - 1] would be out of bounds.
+  if (r >= n_rows || m <= 0) return;
+
+  const double* xr = x + (size_t)r * m;
+  double* yr = y + (size_t)r * m;
+
+  // Seed the running minimum with the last element of the row.
+  double cur = xr[m - 1];
+  yr[m - 1] = cur;
+
+  for (int j = m - 2; j >= 0; --j) {
+    double v = xr[j];
+    // A NaN in v fails the comparison and is skipped; a NaN already in cur sticks.
+    cur = (v < cur) ? v : cur;
+    yr[j] = cur;
+  }
+}
diff --git a/src/rapids_singlecell/_cuda/pv/pv.cu b/src/rapids_singlecell/_cuda/pv/pv.cu
@@ -0,0 +1,43 @@
+#include <cuda_runtime.h>
+#include <nanobind/nanobind.h>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "kernels_pv.cuh"
+
+namespace nb = nanobind;
+
+// Launch rev_cummin64_kernel over an (n_rows x m) row-major float64 matrix.
+//
+// x, y   : device addresses passed as integers (input and output buffers).
+// n_rows : number of rows; the grid is sized so there is one thread per row.
+// m      : number of elements per row.
+//
+// The kernel is launched without an explicit stream argument. Empty inputs
+// are a no-op because a zero-sized grid is an invalid launch configuration.
+// Throws std::runtime_error if CUDA reports an error after the launch.
+static inline void launch_rev_cummin64(std::uintptr_t x, std::uintptr_t y, int n_rows, int m) {
+  if (n_rows <= 0 || m <= 0) return;
+
+  const dim3 block(256);
+  const dim3 grid((static_cast<unsigned>(n_rows) + block.x - 1) / block.x);
+  rev_cummin64_kernel<<<grid, block>>>(reinterpret_cast<const double*>(x),
+                                       reinterpret_cast<double*>(y), n_rows, m);
+
+  // Kernel launches do not return a status; query it explicitly so a rejected
+  // launch raises instead of being silently dropped.
+  const cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    throw std::runtime_error(std::string("rev_cummin64 launch failed: ") +
+                             cudaGetErrorString(err));
+  }
+}
+
+// Python module exposing rev_cummin64(x, y, n_rows, m), where x and y are raw
+// device pointers given as integers.
+NB_MODULE(_pv_cuda, m) {
+  m.def("rev_cummin64", [](std::uintptr_t x, std::uintptr_t y, int n_rows, int m) {
+    launch_rev_cummin64(x, y, n_rows, m);
+  });
+}
diff --git a/src/rapids_singlecell/decoupler_gpu/_helper/_pv.py b/src/rapids_singlecell/decoupler_gpu/_helper/_pv.py
@@ -1,37 +1,19 @@
 from __future__ import annotations
 
+try:
+    from rapids_singlecell._cuda import _pv_cuda as _pv
+except ImportError:
+    _pv = None
 import cupy as cp
 import numba as nb
 import numpy as np
 
-# Reverse cumulative min along the last axis, per row (float64)
-_rev_cummin64 = cp.RawKernel(
-    r"""
-extern "C" __global__
-void rev_cummin64(const double* __restrict__ x,
-                  double* __restrict__ y,
-                  const int n_rows,
-                  const int m)
-{
-    int r = blockDim.x * blockIdx.x + threadIdx.x;
-    if (r >= n_rows) return;
-
-    const double* xr = x + (size_t)r * m;
-    double* yr       = y + (size_t)r * m;
-
-    double cur = xr[m - 1];
-    yr[m - 1] = cur;
-
-    // right -> left
-    for (int j = m - 2; j >= 0; --j) {
-        double v = xr[j];
-        cur = (v < cur) ? v : cur;
-        yr[j] = cur;
-    }
-}
-""",
-    "rev_cummin64",
-)
+
+def _rev_cummin64(x, n_rows, m):
+    y = cp.empty_like(x)
+
+    _pv.rev_cummin64(x.data.ptr, y.data.ptr, int(n_rows), int(m))
+    return y
 
 
 def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
@@ -78,7 +60,6 @@ def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
 
     out = cp.empty_like(ps, dtype=cp.float64)
 
-    threads = 256  # for the rev_cummin kernel
     for s in range(0, n_rows, B):
         e = min(n_rows, s + B)
         R = e - s
@@ -97,9 +78,7 @@ def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
         ps_bh = ps_sorted * scale  # (R, m) float64
 
         # 4) reverse cumulative min via custom kernel
-        ps_mon = cp.empty_like(ps_bh)
-        blocks = (R + threads - 1) // threads
-        _rev_cummin64((blocks,), (threads,), (ps_bh, ps_mon, R, m))
+        ps_mon = _rev_cummin64(ps_bh, R, m)
 
         # 5) build inverse permutation without argsort (scatter)
         inv_order = cp.empty_like(order, dtype=cp.int32)  # (R, m) int32
diff --git a/src/rapids_singlecell/tools/_kernels/_nan_mean_kernels.py b/src/rapids_singlecell/tools/_kernels/_nan_mean_kernels.py