scverse
diff --git a/‎.github/workflows/publish.yml‎
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/publish.yml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 12 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/rapids_singlecell/_cuda/wilcoxon/kernels_wilcoxon_ovo.cuh‎
Lines changed: 2 additions & 0 deletions b/‎src/rapids_singlecell/_cuda/wilcoxon/kernels_wilcoxon_ovo.cuh‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu‎
Lines changed: 26 additions & 22 deletions b/‎src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu‎
Lines changed: 26 additions & 22 deletions
diff --git a/‎src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_fast_common.cuh‎
Lines changed: 41 additions & 3 deletions b/‎src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_fast_common.cuh‎
Lines changed: 41 additions & 3 deletions
@@ -147,6 +147,8 @@ jobs:
             LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
             PATH=/usr/local/cuda/bin:$PATH
           CIBW_BEFORE_BUILD: >
+            rm -f build/.librmm_dir &&
+            mkdir -p build &&
             python -m pip install -U pip
             scikit-build-core cmake ninja nanobind
             librmm-cu${{ matrix.cuda_major }} &&
@@ -157,8 +159,8 @@ jobs:
             ln -sf "$RMM_ROOT/lib64/librmm.so" /usr/local/lib/librmm.so &&
             ln -sf "$LOG_ROOT/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
             ldconfig &&
-            python -c "import librmm; print(librmm.__path__[0])" > /tmp/.librmm_dir &&
-            echo "[rsc-build] marker=$(cat /tmp/.librmm_dir)"
+            python -c "import librmm; print(librmm.__path__[0])" > build/.librmm_dir &&
+            echo "[rsc-build] marker=$(cat build/.librmm_dir)"
           CIBW_TEST_SKIP: "*"
           CIBW_TEST_COMMAND: ""
           CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}"
 
@@ -51,4 +51,4 @@ CLAUDE.md
 
 # tmp_scripts
 tmp_scripts/
-benchmarks/
+/benchmarks/
@@ -50,9 +50,19 @@ if (RSC_BUILD_EXTENSIONS)
   if (RSC_PYTHON_RMM_DIR AND EXISTS "${RSC_PYTHON_RMM_DIR}/rmm-config.cmake")
     list(APPEND RSC_RMM_HINTS "${RSC_PYTHON_RMM_DIR}")
   endif()
-  if(EXISTS "/tmp/.librmm_dir")
-    file(READ "/tmp/.librmm_dir" _rsc_librmm_marker)
+  # Wheel builds install librmm/rapids_logger into the isolated build env and
+  # write build/.librmm_dir from CIBW_BEFORE_BUILD.  publish.yml also symlinks
+  # those shared libraries into /usr/local/lib so auditwheel can see and exclude
+  # them instead of bundling RAPIDS runtime libraries into the wheel.
+  if(DEFINED ENV{RSC_LIBRMM_DIR} AND EXISTS "$ENV{RSC_LIBRMM_DIR}/lib64/cmake/rmm/rmm-config.cmake")
+    set(_rsc_librmm_marker "$ENV{RSC_LIBRMM_DIR}")
+  elseif(EXISTS "${CMAKE_SOURCE_DIR}/build/.librmm_dir")
+    file(READ "${CMAKE_SOURCE_DIR}/build/.librmm_dir" _rsc_librmm_marker)
     string(STRIP "${_rsc_librmm_marker}" _rsc_librmm_marker)
+  else()
+    set(_rsc_librmm_marker "")
+  endif()
+  if(NOT "${_rsc_librmm_marker}" STREQUAL "" AND EXISTS "${_rsc_librmm_marker}/lib64/cmake/rmm/rmm-config.cmake")
     file(GLOB _rsc_marker_rmm_dirs "${_rsc_librmm_marker}/lib64/cmake/rmm")
     file(GLOB _rsc_marker_rapids_prefixes
       "${_rsc_librmm_marker}/lib64"
 
@@ -4,7 +4,9 @@ requires = [
     "nanobind>=2.0.0",
     "setuptools-scm>=8",
     # librmm headers/CMake config are needed at build time for Wilcoxon.
-    # CUDA wheel builds rewrite this to the matching cu12/cu13 package.
+    # Generic isolated source builds default to CUDA 12. CUDA wheel builds
+    # rewrite this to the matching cu12/cu13 package; CUDA 13 source builds
+    # should build in an existing RAPIDS env with --no-build-isolation.
     "librmm-cu12>=25.10",
 ]
 build-backend = "scikit_build_core.build"
 
@@ -2,6 +2,8 @@
 
 #include <cuda_runtime.h>
 
+#include "wilcoxon_fast_common.cuh"
+
 // ============================================================================
 // Warp reduction helper (sum doubles across block via warp_buf)
 // ============================================================================
 
@@ -41,17 +41,14 @@ static void launch_ovr_rank_dense_streaming(
     }
 
     size_t sub_items = (size_t)n_rows * sub_batch_cols;
-    if (sub_items > (size_t)std::numeric_limits<int>::max()) {
-        throw std::runtime_error(
-            "Dense OVR sub-batch exceeds CUB int item limit");
-    }
+    int sub_items_i32 = checked_cub_items(sub_items, "Dense OVR sub-batch");
 
     size_t cub_temp_bytes = 0;
     {
         auto* fk = reinterpret_cast<float*>(1);
         auto* iv = reinterpret_cast<int*>(1);
         cub::DeviceSegmentedRadixSort::SortPairs(
-            nullptr, cub_temp_bytes, fk, fk, iv, iv, (int)sub_items,
+            nullptr, cub_temp_bytes, fk, fk, iv, iv, sub_items_i32,
             sub_batch_cols, iv, iv + 1, BEGIN_BIT, END_BIT);
     }
 
@@ -97,7 +94,8 @@ static void launch_ovr_rank_dense_streaming(
     int batch_idx = 0;
     while (col < n_cols) {
         int sb_cols = std::min(sub_batch_cols, n_cols - col);
-        int sb_items = n_rows * sb_cols;
+        int sb_items = checked_int_product((size_t)n_rows, (size_t)sb_cols,
+                                           "Dense OVR active sub-batch");
         int s = batch_idx % n_streams;
         cudaStream_t stream = streams[s];
         auto& buf = bufs[s];
@@ -184,32 +182,30 @@ static void launch_ovo_rank_dense_tiered_impl(
         n_streams = (n_cols + sub_batch_cols - 1) / sub_batch_cols;
 
     size_t sub_ref_items = (size_t)n_ref * sub_batch_cols;
-    if (sub_ref_items > (size_t)std::numeric_limits<int>::max()) {
-        throw std::runtime_error(
-            "Dense OVO reference sub-batch exceeds CUB int item limit");
-    }
+    int sub_ref_items_i32 =
+        checked_cub_items(sub_ref_items, "Dense OVO reference sub-batch");
 
     size_t sub_grp_items = (size_t)n_all_grp * sub_batch_cols;
-    if (sub_grp_items > (size_t)std::numeric_limits<int>::max()) {
-        throw std::runtime_error(
-            "Dense OVO sub-batch exceeds CUB int item limit");
-    }
+    int sub_grp_items_i32 =
+        checked_cub_items(sub_grp_items, "Dense OVO group sub-batch");
 
     size_t grp_cub_temp_bytes = 0;
     if (needs_tier3) {
-        int max_grp_seg = n_sort_groups * sub_batch_cols;
+        int max_grp_seg =
+            checked_int_product((size_t)n_sort_groups, (size_t)sub_batch_cols,
+                                "Dense OVO group segment count");
         auto* fk = reinterpret_cast<float*>(1);
         auto* doff = reinterpret_cast<int*>(1);
         cub::DeviceSegmentedRadixSort::SortKeys(
-            nullptr, grp_cub_temp_bytes, fk, fk, (int)sub_grp_items,
-            max_grp_seg, doff, doff + 1, BEGIN_BIT, END_BIT);
+            nullptr, grp_cub_temp_bytes, fk, fk, sub_grp_items_i32, max_grp_seg,
+            doff, doff + 1, BEGIN_BIT, END_BIT);
     }
     size_t ref_cub_temp_bytes = 0;
     if (!ref_is_sorted) {
         auto* fk = reinterpret_cast<float*>(1);
         auto* doff = reinterpret_cast<int*>(1);
         cub::DeviceSegmentedRadixSort::SortKeys(
-            nullptr, ref_cub_temp_bytes, fk, fk, (int)sub_ref_items,
+            nullptr, ref_cub_temp_bytes, fk, fk, sub_ref_items_i32,
             sub_batch_cols, doff, doff + 1, BEGIN_BIT, END_BIT);
     }
 
@@ -270,7 +266,9 @@ static void launch_ovo_rank_dense_tiered_impl(
             pool.alloc<double>((size_t)n_groups * sub_batch_cols);
         if (needs_tier3) {
             bufs[s].grp_sorted = pool.alloc<float>(sub_grp_items);
-            int max_seg = n_sort_groups * sub_batch_cols;
+            int max_seg = checked_int_product((size_t)n_sort_groups,
+                                              (size_t)sub_batch_cols,
+                                              "Dense OVO group segment buffer");
             bufs[s].grp_seg_offsets = pool.alloc<int>(max_seg);
             bufs[s].grp_seg_ends = pool.alloc<int>(max_seg);
         } else {
@@ -287,8 +285,12 @@ static void launch_ovo_rank_dense_tiered_impl(
     int batch_idx = 0;
     while (col < n_cols) {
         int sb_cols = std::min(sub_batch_cols, n_cols - col);
-        int sb_ref_items_actual = n_ref * sb_cols;
-        int sb_grp_items_actual = n_all_grp * sb_cols;
+        int sb_ref_items_actual =
+            checked_int_product((size_t)n_ref, (size_t)sb_cols,
+                                "Dense OVO active reference sub-batch");
+        int sb_grp_items_actual =
+            checked_int_product((size_t)n_all_grp, (size_t)sb_cols,
+                                "Dense OVO active group sub-batch");
         int s = batch_idx % n_streams;
         cudaStream_t stream = streams[s];
         auto& buf = bufs[s];
@@ -343,7 +345,9 @@ static void launch_ovo_rank_dense_tiered_impl(
                 compute_tie_corr, padded_grp_size, upper_skip_le);
             CUDA_CHECK_LAST_ERROR(ovo_fused_sort_rank_kernel);
         } else if (needs_tier3) {
-            int sb_grp_seg = n_sort_groups * sb_cols;
+            int sb_grp_seg =
+                checked_int_product((size_t)n_sort_groups, (size_t)sb_cols,
+                                    "Dense OVO active group segment count");
             int blk = (sb_grp_seg + UTIL_BLOCK_SIZE - 1) / UTIL_BLOCK_SIZE;
             build_tier3_seg_begin_end_offsets_kernel<<<blk, UTIL_BLOCK_SIZE, 0,
                                                        stream>>>(
 
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstdint>
+#include <limits>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -48,6 +49,39 @@ constexpr int TIER1_GROUP_THRESHOLD = 2500;
 // 512 MB per stream dense slab + same for sorted copy ≈ 1 GB / stream.
 constexpr size_t GROUP_DENSE_BUDGET_ITEMS = 128 * 1024 * 1024;
 
+// Returns the value of cudaDevAttrMaxSharedMemoryPerBlock for the device that
+// cudaGetDevice reports as current, converted to size_t.
+//
+// Throws std::runtime_error when either runtime query fails.  Previously both
+// return codes were discarded, so a failure yielded 0 (or the attribute of
+// device 0) without any diagnostic.
+static inline size_t wilcoxon_max_smem_per_block() {
+    int device = 0;
+    cudaError_t err = cudaGetDevice(&device);
+    if (err != cudaSuccess) {
+        throw std::runtime_error(
+            std::string("wilcoxon_max_smem_per_block: cudaGetDevice failed: ") +
+            cudaGetErrorString(err));
+    }
+
+    int max_smem = 0;
+    err = cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock,
+                                 device);
+    if (err != cudaSuccess) {
+        throw std::runtime_error(
+            std::string("wilcoxon_max_smem_per_block: "
+                        "cudaDeviceGetAttribute failed: ") +
+            cudaGetErrorString(err));
+    }
+    return (size_t)max_smem;
+}
+
+// Narrows `count` to int.  Throws std::runtime_error with the message
+// "<context> exceeds CUB int item limit" when `count` is larger than
+// std::numeric_limits<int>::max().
+static inline int checked_cub_items(size_t count, const char* context) {
+    const size_t max_items =
+        static_cast<size_t>(std::numeric_limits<int>::max());
+    if (count <= max_items) {
+        return static_cast<int>(count);
+    }
+    std::string message(context);
+    message += " exceeds CUB int item limit";
+    throw std::runtime_error(message);
+}
+
+// Returns `count` as an int when it is representable; otherwise throws
+// std::runtime_error("<context> exceeds int32 offset limit").
+static inline int checked_int_span(size_t count, const char* context) {
+    const size_t int_ceiling = (size_t)std::numeric_limits<int>::max();
+    const bool too_large = count > int_ceiling;
+    if (!too_large) return (int)count;
+    throw std::runtime_error(std::string(context) +
+                             " exceeds int32 offset limit");
+}
+
+// Multiplies `a` by `b` and returns the product as an int.  The range check is
+// done by division before multiplying, so the size_t product is only formed
+// once it is known to be at most std::numeric_limits<int>::max().  Throws
+// std::runtime_error("<context> exceeds int32 item limit") otherwise.
+static inline int checked_int_product(size_t a, size_t b, const char* context) {
+    const size_t int_ceiling = (size_t)std::numeric_limits<int>::max();
+    // a == 0 can never overflow and must skip the division below.
+    const bool overflows = (a != 0) && (b > int_ceiling / a);
+    if (overflows) {
+        std::string message(context);
+        message += " exceeds int32 item limit";
+        throw std::runtime_error(message);
+    }
+    const size_t product = a * b;
+    return (int)product;
+}
+
 // ---------------------------------------------------------------------------
 // RAII guard for cudaHostRegister.  Unregisters on scope exit even when an
 // exception unwinds — prevents leaked host pinning on stream-sync failures.
@@ -60,9 +94,9 @@ struct HostRegisterGuard {
         if (p && bytes > 0) {
             cudaError_t err = cudaHostRegister(p, bytes, flags);
             if (err != cudaSuccess) {
-                // Already-registered memory is fine; anything else means the
-                // subsequent kernels would read garbage from an unmapped
-                // pointer, so surface the error immediately.
+                // Already-registered memory belongs to another owner; use it
+                // without unregistering here. Other failures mean mapped reads
+                // would be unsafe, so surface them immediately.
                 if (err == cudaErrorHostMemoryAlreadyRegistered) {
                     cudaGetLastError();  // clear sticky error flag
                 } else {
@@ -116,6 +150,10 @@ struct RmmScratchPool {
     template <typename T>
     T* alloc(size_t count) {
         if (count == 0) count = 1;
+        if (count > std::numeric_limits<size_t>::max() / sizeof(T)) {
+            throw std::runtime_error(
+                "Wilcoxon scratch allocation size overflow");
+        }
         size_t bytes = count * sizeof(T);
         void* ptr = wilcoxon_rmm_allocate(bytes);
         bufs.push_back({ptr, bytes});