scverse
diff --git a/.coderabbit.yaml
Lines changed: 4 additions & 0 deletions b/.coderabbit.yaml
Lines changed: 4 additions & 0 deletions
diff --git a/cuda_agents.md
Lines changed: 41 additions & 3 deletions b/cuda_agents.md
Lines changed: 41 additions & 3 deletions
diff --git a/python_agents.md
Lines changed: 10 additions & 1 deletion b/python_agents.md
Lines changed: 10 additions & 1 deletion
diff --git a/src/rapids_singlecell/_cuda/aggr/aggr.cu
Lines changed: 15 additions & 6 deletions b/src/rapids_singlecell/_cuda/aggr/aggr.cu
Lines changed: 15 additions & 6 deletions
diff --git a/src/rapids_singlecell/_cuda/aucell/aucell.cu
Lines changed: 4 additions & 2 deletions b/src/rapids_singlecell/_cuda/aucell/aucell.cu
Lines changed: 4 additions & 2 deletions
diff --git a/src/rapids_singlecell/_cuda/autocorr/autocorr.cu
Lines changed: 19 additions & 10 deletions b/src/rapids_singlecell/_cuda/autocorr/autocorr.cu
Lines changed: 19 additions & 10 deletions
diff --git a/src/rapids_singlecell/_cuda/bbknn/bbknn.cu
Lines changed: 9 additions & 5 deletions b/src/rapids_singlecell/_cuda/bbknn/bbknn.cu
Lines changed: 9 additions & 5 deletions
diff --git a/src/rapids_singlecell/_cuda/cooc/cooc.cu
Lines changed: 4 additions & 0 deletions b/src/rapids_singlecell/_cuda/cooc/cooc.cu
Lines changed: 4 additions & 0 deletions
diff --git a/src/rapids_singlecell/_cuda/edistance/edistance.cu
Lines changed: 2 additions & 1 deletion b/src/rapids_singlecell/_cuda/edistance/edistance.cu
Lines changed: 2 additions & 1 deletion
@@ -30,12 +30,16 @@ reviews:
         - Memory access patterns and coalescing
         - Correct use of atomicAdd and synchronization
         - Template parameter correctness (float vs double)
+        - MANDATORY: Every kernel launch (<<<grid, block, shared, stream>>>) MUST be followed by cudaGetLastError() to catch launch failures. Flag any kernel launch missing this check.
+        - MANDATORY: No magic numbers. All block sizes, tile sizes, grid calculations, and thresholds must use named constants (e.g., constexpr int BLOCK_SIZE = 256). Flag any raw numeric literal in dim3, grid, or shared-memory size calculations.
     - path: "src/rapids_singlecell/**/_kernels/**"
       instructions: |
         These are CuPy RawKernel definitions. Review for:
         - Correct CUDA kernel launch configurations
         - Shared memory bounds
         - Type safety (float32 vs float64 mismatches)
+        - No magic numbers in kernel launch configurations or kernel code. Block sizes, tile sizes, and thresholds must use named constants.
+        - Verify that each RawKernel launch is followed by a cp.cuda.runtime.getLastError() check so that silent launch failures are caught.
     - path: "tests/**"
       instructions: |
         Do not suggest changing test tolerances without strong justification.
 
@@ -19,7 +19,7 @@
 ### GPU/CUDA Errors
 - Race conditions in GPU kernels (shared memory, atomics)
 - Invalid memory access (out-of-bounds, host/device confusion)
-- Missing CUDA error checking after kernel launches
+- **Missing `cudaGetLastError()` after kernel launches**: Every kernel launch (`<<<grid, block, shared, stream>>>`) MUST be followed by `cudaGetLastError()` to detect launch failures (invalid config, shared memory overflow, etc.). Without this, errors are silently deferred and may corrupt later operations or produce garbage results.
 - Kernel launch with zero blocks/threads or invalid grid/block dimensions
 - **Template type mismatches**: kernel templated on `float` but receiving `double` data from Python
 - **Shared memory overflow**: exceeding device shared memory limit (varies by GPU, e.g. T4 = 64KB)
@@ -73,7 +73,7 @@
 ### Kernel Configuration
 - Hard-coded shared memory sizes that may exceed device limits
 - Fixed tile sizes that don't adapt to device capabilities
-- **Magic numbers** in grid/block calculations without descriptive constants
+- **Magic numbers**: all numeric literals for block sizes, tile dimensions, shared memory sizes, and heuristic thresholds MUST use named constants. `dim3 block(256)` is not acceptable — use `constexpr int BLOCK_SIZE = 256; dim3 block(BLOCK_SIZE);`
 
 ### Test Quality
 - Missing validation of numerical correctness against CPU reference
@@ -141,6 +141,43 @@ int max_shared = device.attributes["MaxSharedMemoryPerBlock"];
 int tile = select_tile(max_shared, dtype_size);
 ```
 
+**CRITICAL** (missing cudaGetLastError):
+```text
+CRITICAL: Missing cudaGetLastError() after kernel launch
+
+Issue: Kernel launched without error checking — launch failures are silently deferred
+Why: Invalid grid/block config, shared memory overflow, or other launch errors go undetected
+Impact: Garbage results that look like algorithm bugs, not CUDA errors
+
+Bad:
+my_kernel<<<grid, block, shared_mem, stream>>>(...);
+
+Good:
+my_kernel<<<grid, block, shared_mem, stream>>>(...);
+cudaError_t err = cudaGetLastError();
+if (err != cudaSuccess) {
+    throw std::runtime_error(std::string("Kernel launch failed: ") + cudaGetErrorString(err));
+}
+```
+
+**HIGH** (magic numbers):
+```text
+HIGH: Magic numbers in kernel configuration
+
+Issue: `dim3 block(64)` and `dim3 grid((n + 63) / 64)` use raw numeric literals
+Why: Obscures intent, error-prone when changing, harder to review
+Impact: Maintainability and correctness risk
+
+Bad:
+dim3 block(64);
+dim3 grid((n + 63) / 64);
+
+Good:
+constexpr int BLOCK_SIZE = 64;
+dim3 block(BLOCK_SIZE);
+dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+```
+
 **CRITICAL** (missing syncthreads):
 ```text
 CRITICAL: Missing __syncthreads() between shared memory write and read
@@ -266,8 +303,9 @@ module_name/
 ### When Reviewing Nanobind Bindings (.cu files)
 - [ ] Is the template type `T` dispatched correctly based on array dtype?
 - [ ] Are array dimensions validated before kernel launch?
-- [ ] Is error checking done after CUDA calls?
+- [ ] Is `cudaGetLastError()` called after every kernel launch to catch launch failures?
 - [ ] Are DLPack/array interface conversions correct?
+- [ ] Are all numeric literals for block sizes, tile sizes, and thresholds defined as named constants?
 
 ### When Reviewing CuPy RawKernels (_kernels/*.py)
 - [ ] Is the kernel string syntactically correct CUDA C?
 
@@ -93,12 +93,21 @@ At millions of cells, numerical edge cases that "never happen" on small data bec
 - Unsafe deserialization of data files
 - Missing bounds checking allowing resource exhaustion
 
+### Magic Numbers
+- Hard-coded numeric literals (128, 256, 512, 1024, etc.) in kernel configurations, thresholds, or tile sizes without named constants
+- Use descriptive constants: `BLOCK_SIZE = 256`, `SHARED_MEM_THRESHOLD = 48 * 1024`
+- Tile sizes, block dimensions, and heuristic thresholds must all be named
+
+### Missing Kernel Error Checking
+- After calling nanobind CUDA kernel wrappers from Python, a pending CUDA launch error may be silently consumed by the next, unrelated CuPy operation instead of being reported at the call that caused it
+- After RawKernel launches, call `cp.cuda.runtime.getLastError()` to surface launch failures immediately (e.g., shared memory overflow, invalid grid dimensions)
+- This is especially important in development and testing — a kernel that silently fails produces garbage results that look like algorithm bugs
+
 ## MEDIUM Issues (Comment Selectively)
 
 - Edge cases not handled (empty AnnData, single observation)
 - Deprecated API usage
 - Minor inefficiencies in non-critical code paths
-- Magic numbers without descriptive constant names
 
 ## Review Protocol
 
 
@@ -5,16 +5,20 @@
 
 using namespace nb::literals;
 
+constexpr int BLOCK_SIZE_SPARSE = 64;
+constexpr int BLOCK_SIZE_DENSE = 256;
+
 template <typename T>
 static inline void launch_csr_aggr(const int* indptr, const int* index,
                                    const T* data, double* out, const int* cats,
                                    const bool* mask, size_t n_cells,
                                    size_t n_genes, size_t n_groups,
                                    cudaStream_t stream) {
     dim3 grid((unsigned)n_cells);
-    dim3 block(64);
+    dim3 block(BLOCK_SIZE_SPARSE);
     csr_aggr_kernel<T><<<grid, block, 0, stream>>>(
         indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups);
+    CUDA_CHECK_LAST_ERROR(csr_aggr_kernel);
 }
 
 template <typename T>
@@ -24,31 +28,34 @@ static inline void launch_csc_aggr(const int* indptr, const int* index,
                                    size_t n_genes, size_t n_groups,
                                    cudaStream_t stream) {
     dim3 grid((unsigned)n_genes);
-    dim3 block(64);
+    dim3 block(BLOCK_SIZE_SPARSE);
     csc_aggr_kernel<T><<<grid, block, 0, stream>>>(
         indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups);
+    CUDA_CHECK_LAST_ERROR(csc_aggr_kernel);
 }
 
 template <typename T>
 static inline void launch_dense_aggr_C(const T* data, double* out,
                                        const int* cats, const bool* mask,
                                        size_t n_cells, size_t n_genes,
                                        size_t n_groups, cudaStream_t stream) {
-    dim3 block(256);
+    dim3 block(BLOCK_SIZE_DENSE);
     dim3 grid((unsigned)((n_cells * n_genes + block.x - 1) / block.x));
     dense_aggr_kernel_C<T><<<grid, block, 0, stream>>>(
         data, out, cats, mask, n_cells, n_genes, n_groups);
+    CUDA_CHECK_LAST_ERROR(dense_aggr_kernel_C);
 }
 
 template <typename T>
 static inline void launch_dense_aggr_F(const T* data, double* out,
                                        const int* cats, const bool* mask,
                                        size_t n_cells, size_t n_genes,
                                        size_t n_groups, cudaStream_t stream) {
-    dim3 block(256);
+    dim3 block(BLOCK_SIZE_DENSE);
     dim3 grid((unsigned)((n_cells * n_genes + block.x - 1) / block.x));
     dense_aggr_kernel_F<T><<<grid, block, 0, stream>>>(
         data, out, cats, mask, n_cells, n_genes, n_groups);
+    CUDA_CHECK_LAST_ERROR(dense_aggr_kernel_F);
 }
 
 template <typename T>
@@ -58,19 +65,21 @@ static inline void launch_csr_to_coo(const int* indptr, const int* index,
                                      const bool* mask, int n_cells,
                                      cudaStream_t stream) {
     dim3 grid((unsigned)n_cells);
-    dim3 block(64);
+    dim3 block(BLOCK_SIZE_SPARSE);
     csr_to_coo_kernel<T><<<grid, block, 0, stream>>>(
         indptr, index, data, row, col, ndata, cats, mask, n_cells);
+    CUDA_CHECK_LAST_ERROR(csr_to_coo_kernel);
 }
 
 static inline void launch_sparse_var(const int* indptr, const int* index,
                                      double* data, const double* mean_data,
                                      double* n_cells, int dof, int n_groups,
                                      cudaStream_t stream) {
     dim3 grid((unsigned)n_groups);
-    dim3 block(64);
+    dim3 block(BLOCK_SIZE_SPARSE);
     sparse_var_kernel<<<grid, block, 0, stream>>>(
         indptr, index, data, mean_data, n_cells, dof, n_groups);
+    CUDA_CHECK_LAST_ERROR(sparse_var_kernel);
 }
 
 template <typename T, typename Device>
 
@@ -36,10 +36,12 @@ static inline void launch_auc(const int* ranks, int R, int C, const int* cnct,
                               const int* starts, const int* lens, int n_sets,
                               int n_up, const float* max_aucs, float* es,
                               cudaStream_t stream) {
-    dim3 block(32);
-    dim3 grid((unsigned)n_sets, (unsigned)((R + block.x - 1) / block.x));
+    constexpr int BLOCK_SIZE = 32;
+    dim3 block(BLOCK_SIZE);
+    dim3 grid((unsigned)n_sets, (unsigned)((R + BLOCK_SIZE - 1) / BLOCK_SIZE));
     auc_kernel<<<grid, block, 0, stream>>>(ranks, R, C, cnct, starts, lens,
                                            n_sets, n_up, max_aucs, es);
+    CUDA_CHECK_LAST_ERROR(auc_kernel);
 }
 
 template <typename Device>
 
@@ -5,18 +5,23 @@
 
 using namespace nb::literals;
 
+constexpr int DENSE_BLOCK_DIM = 8;
+constexpr int SPARSE_BLOCK_SIZE = 1024;
+constexpr int ELEMENTWISE_BLOCK_SIZE = 32;
+
 template <typename T>
 static inline void launch_morans_dense(const T* data_centered,
                                        const int* adj_row_ptr,
                                        const int* adj_col_ind,
                                        const T* adj_data, T* num, int n_samples,
                                        int n_features, cudaStream_t stream) {
-    dim3 block(8, 8);
-    dim3 grid((n_features + block.x - 1) / block.x,
-              (n_samples + block.y - 1) / block.y);
+    dim3 block(DENSE_BLOCK_DIM, DENSE_BLOCK_DIM);
+    dim3 grid((n_features + DENSE_BLOCK_DIM - 1) / DENSE_BLOCK_DIM,
+              (n_samples + DENSE_BLOCK_DIM - 1) / DENSE_BLOCK_DIM);
     morans_I_num_dense_kernel<<<grid, block, 0, stream>>>(
         data_centered, adj_row_ptr, adj_col_ind, adj_data, num, n_samples,
         n_features);
+    CUDA_CHECK_LAST_ERROR(morans_I_num_dense_kernel);
 }
 
 template <typename T>
@@ -25,46 +30,50 @@ static inline void launch_morans_sparse(
     const int* data_row_ptr, const int* data_col_ind, const T* data_values,
     int n_samples, int n_features, const T* mean_array, T* num,
     cudaStream_t stream) {
-    dim3 block(1024);
+    dim3 block(SPARSE_BLOCK_SIZE);
     dim3 grid(n_samples);
     morans_I_num_sparse_kernel<<<grid, block, 0, stream>>>(
         adj_row_ptr, adj_col_ind, adj_data, data_row_ptr, data_col_ind,
         data_values, n_samples, n_features, mean_array, num);
+    CUDA_CHECK_LAST_ERROR(morans_I_num_sparse_kernel);
 }
 
 template <typename T>
 static inline void launch_gearys_dense(const T* data, const int* adj_row_ptr,
                                        const int* adj_col_ind,
                                        const T* adj_data, T* num, int n_samples,
                                        int n_features, cudaStream_t stream) {
-    dim3 block(8, 8);
-    dim3 grid((n_features + block.x - 1) / block.x,
-              (n_samples + block.y - 1) / block.y);
+    dim3 block(DENSE_BLOCK_DIM, DENSE_BLOCK_DIM);
+    dim3 grid((n_features + DENSE_BLOCK_DIM - 1) / DENSE_BLOCK_DIM,
+              (n_samples + DENSE_BLOCK_DIM - 1) / DENSE_BLOCK_DIM);
     gearys_C_num_dense_kernel<<<grid, block, 0, stream>>>(
         data, adj_row_ptr, adj_col_ind, adj_data, num, n_samples, n_features);
+    CUDA_CHECK_LAST_ERROR(gearys_C_num_dense_kernel);
 }
 
 template <typename T>
 static inline void launch_gearys_sparse(
     const int* adj_row_ptr, const int* adj_col_ind, const T* adj_data,
     const int* data_row_ptr, const int* data_col_ind, const T* data_values,
     int n_samples, int n_features, T* num, cudaStream_t stream) {
-    dim3 block(1024);
+    dim3 block(SPARSE_BLOCK_SIZE);
     dim3 grid(n_samples);
     gearys_C_num_sparse_kernel<<<grid, block, 0, stream>>>(
         adj_row_ptr, adj_col_ind, adj_data, data_row_ptr, data_col_ind,
         data_values, n_samples, n_features, num);
+    CUDA_CHECK_LAST_ERROR(gearys_C_num_sparse_kernel);
 }
 
 template <typename T>
 static inline void launch_pre_den_sparse(const int* data_col_ind,
                                          const T* data_values, int nnz,
                                          const T* mean_array, T* den,
                                          int* counter, cudaStream_t stream) {
-    dim3 block(32);
-    dim3 grid((nnz + block.x - 1) / block.x);
+    dim3 block(ELEMENTWISE_BLOCK_SIZE);
+    dim3 grid((nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE);
     pre_den_sparse_kernel<<<grid, block, 0, stream>>>(
         data_col_ind, data_values, nnz, mean_array, den, counter);
+    CUDA_CHECK_LAST_ERROR(pre_den_sparse_kernel);
 }
 
 template <typename Device>
 
@@ -5,25 +5,29 @@
 
 using namespace nb::literals;
 
+constexpr int BLOCK_SIZE = 64;
+
 static inline void launch_find_top_k_per_row(const float* data,
                                              const int* indptr, int n_rows,
                                              int trim, float* vals,
                                              cudaStream_t stream) {
-    dim3 block(64);
-    dim3 grid((n_rows + 64 - 1) / 64);
-    size_t shared_mem_size =
-        static_cast<size_t>(64) * static_cast<size_t>(trim) * sizeof(float);
+    dim3 block(BLOCK_SIZE);
+    dim3 grid((n_rows + BLOCK_SIZE - 1) / BLOCK_SIZE);
+    size_t shared_mem_size = static_cast<size_t>(BLOCK_SIZE) *
+                             static_cast<size_t>(trim) * sizeof(float);
     find_top_k_per_row_kernel<<<grid, block, shared_mem_size, stream>>>(
         data, indptr, n_rows, trim, vals);
+    CUDA_CHECK_LAST_ERROR(find_top_k_per_row_kernel);
 }
 
 static inline void launch_cut_smaller(int* indptr, int* index, float* data,
                                       float* vals, int n_rows,
                                       cudaStream_t stream) {
     dim3 grid(n_rows);
-    dim3 block(64);
+    dim3 block(BLOCK_SIZE);
     cut_smaller_kernel<<<grid, block, 0, stream>>>(indptr, index, data, vals,
                                                    n_rows);
+    CUDA_CHECK_LAST_ERROR(cut_smaller_kernel);
 }
 
 template <typename Device>
 
@@ -122,6 +122,7 @@ static void launch_csr_catpairs_kernel(
         <<<grid, block, shared_mem, stream>>>(
             spatial, thresholds, cat_offsets, cell_indices, pair_left,
             pair_right, counts, k, l_val, blocks_per_pair, l_pad);
+    CUDA_CHECK_LAST_ERROR(occur_count_kernel_csr_catpairs_tiled);
 }
 
 // Dispatch to correct template specialization based on cell_tile
@@ -192,6 +193,7 @@ static inline void launch_count_pairwise(const float* spatial,
     dim3 block(32);
     occur_count_kernel_pairwise<<<grid, block, 0, stream>>>(
         spatial, thresholds, labels, result, n, k, l_val);
+    CUDA_CHECK_LAST_ERROR(occur_count_kernel_pairwise);
 }
 
 // Shared memory reduction launch
@@ -213,6 +215,7 @@ static inline bool launch_reduce_shared(const int* result, float* out, int k,
         static_cast<size_t>(k) * static_cast<size_t>(k + 1) * sizeof(float);
     occur_reduction_kernel_shared<<<grid, block, smem, stream>>>(result, out, k,
                                                                  l_val, format);
+    CUDA_CHECK_LAST_ERROR(occur_reduction_kernel_shared);
     return true;
 }
 
@@ -225,6 +228,7 @@ static inline void launch_reduce_global(const int* result, float* inter_out,
     size_t smem = static_cast<size_t>(k) * sizeof(float);
     occur_reduction_kernel_global<<<grid, block, smem, stream>>>(
         result, inter_out, out, k, l_val, format);
+    CUDA_CHECK_LAST_ERROR(occur_reduction_kernel_global);
 }
 
 template <typename Device>
 
@@ -28,7 +28,7 @@ static int choose_feat_tile_64(int n_features) {
 static int choose_feat_tile(int n_features, size_t max_shared_bytes,
                             int cell_tile, int dtype_size) {
     // Shared memory: cell_tile * feat_tile * dtype_size + warp_sums overhead
-    size_t warp_sums_overhead = 32 * dtype_size;
+    size_t warp_sums_overhead = WARP_SIZE * dtype_size;
     size_t available_shared = max_shared_bytes - warp_sums_overhead;
 
     int best_tile = 32;  // default minimum
@@ -109,6 +109,7 @@ static void launch_edistance_kernel(const T* embedding, const int* cat_offsets,
         <<<grid, block, shared_mem, stream>>>(
             embedding, cat_offsets, cell_indices, pair_left, pair_right,
             pairwise_sums, n_features, blocks_per_pair);
+    CUDA_CHECK_LAST_ERROR(edistance_kernel);
 }
 
 // Dispatch to correct tile size specialization for float32