Fixed batching and added rapids-singlecell benchmarks

remydubois · remydubois · commit 500cd6c4e9ac · 2026-03-19T17:07:43.000Z
diff --git a/README.md b/README.md
@@ -1,13 +1,13 @@
 # illico
 `illico` is a python library performing fast and lightweight wilcoxon rank-sum tests (same as `scanpy.tl.rank_genes_groups(…, method="wilcoxon")`), useful for single-cell RNASeq data analyses and processing.
-Approximate speed benchmarks (done on a 8-CPUs machine) ran on k562-essential can be found below.
+Approximate speed benchmarks (done on an 8-CPU, 1-GPU machine) run on k562-essential (~300k cells, 8k genes, 2k perturbations) can be found below.
 
-|               Test               | Format | illico | scanpy | pdex |
-|----------------------------------|--------|--------|--------|------|
-| OVO (reference="non-targeting")  | Dense  |  ~20s  | ~1h    | ~20min  |
-| OVO (reference="non-targeting")  | Sparse |  ~15s  | ~1h30  | ~8min  |
-| OVR (reference=None)             | Dense  |  ~10s  | >10h   |  >10h   |
-| OVR (reference=None)             | Sparse |  ~10s  | >10h   |  >10h   |
+|               Test               | Format | illico | scanpy | pdex | rapids-singlecell (GPU) |
+|----------------------------------|--------|--------|--------|------|------------------ |
+| OVO (reference="non-targeting")  | Dense  |  ~20s  | ~1h    | ~20min  | ~25min |
+| OVO (reference="non-targeting")  | Sparse |  ~15s  | ~1h30min  | ~8min  | ~1h10min |
+| OVR (reference=None)             | Dense  |  ~10s  | >10h   |  >10h   | ~1min |
+| OVR (reference=None)             | Sparse |  ~10s  | >10h   |  >10h   | ~1min |
 
 ## Installation
 illico is compatible with python 3.11 and onward:
@@ -30,7 +30,7 @@ de_genes = asymptotic_wilcoxon(
        group_keys="perturbation",
        reference=["non-targeting"|None], # <- `None` computes cluster-wise DE genes. Any other `str` will be interpreted as label of the control cells.
        is_log1p=[False|True], # <-- Specify if your data underwent log1p or not
-       return_as_scanpy=[False|True], # <-- Whether to return a dict compatible with Scanpy's `rank_genes_groups` function, or a pd.DataFrame
+       return_as_scanpy=[False|True], # <-- Whether to return a dict compatible with Scanpy's `rank_genes_groups` function, or a pd.DataFrame holding all p-values, statistics, and fold-change
        )
 # Eventually, if return_as_scanpy=True:
 adata.uns["rank_genes_groups"] = de_genes
diff --git a/changelog.md b/changelog.md
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+Version 0.4.0
+------------
+- Added option to return scanpy-friendly output with `return_as_scanpy` arg. `asymptotic_wilcoxon` returns either:
+    - A `pandas.DataFrame` with columns `feature`, `p_value`, `fold_change`, and `statistic` (default), if `return_as_scanpy=False`
+    - A dictionary containing the same keys as the output of `scanpy.tl.rank_genes_groups`, if `return_as_scanpy=True`. As in scanpy, genes are ordered by decreasing z-score.
+- Improved the batching mechanism, fixed the 'auto' mode that was excluding the very last gene in previous versions.
+
 Version 0.3.0
 ------------
 - Rust backend is available for all tests. Compare Rust vs Numba with `poetry run pytest-benchmark compare 0003 0005`:
diff --git a/illico/asymptotic_wilcoxon.py b/illico/asymptotic_wilcoxon.py
@@ -1,4 +1,3 @@
-import math
 from typing import Literal
 
 import anndata as ad
@@ -12,6 +11,7 @@
 
 from illico.utils.compile import _precompile
 from illico.utils.groups import GroupContainer, encode_and_count_groups
+from illico.utils.math import compute_batch_bounds
 from illico.utils.memory import log_memory_usage
 from illico.utils.ranking import check_indices_sorted_per_parcel
 from illico.utils.registry import (
@@ -254,24 +254,9 @@ def asymptotic_wilcoxon(
     rows = pd.Series(unique_raw_groups, name="pert", dtype=str)
     results = np.empty((len(rows), len(cols), 4), dtype=np.float64)
 
-    # Adapt batch size to leverage multithreading regarding the number of genes, if requested
-    if n_genes < 256:
-        batch_size = n_genes  # No batching for small number of genes
-        n_threads = 1  # No multithreading for small number of genes
-        iterator = [[0, n_genes]]
-    elif isinstance(batch_size, int):
-        batch_size = min(batch_size, math.ceil(n_genes / n_threads))
-        bounds = np.append(np.arange(0, n_genes, batch_size), n_genes)
-        iterator = list(zip(bounds[:-1], bounds[1:]))
-    elif batch_size == "auto":
-        n_dispatches = max(int(n_genes / 256 / n_threads), 1)  # Aim for approximately 256 genes per chunk
-        splits = np.array_split(np.arange(n_genes + 1), indices_or_sections=n_threads * n_dispatches)
-        iterator = [[split[0], split[-1] + 1] for split in splits]
-        iterator[-1][-1] = n_genes  # Ensure the last upper bound is exactly n_genes
-        batch_size = int(np.ceil(n_genes / (n_dispatches * n_threads)))
-    else:
-        raise ValueError(f"Invalid batch_size value: {batch_size}. Must be 'auto' or an integer.")
-    logger.trace(f"Using batch size of {batch_size} for {n_threads} threads and {n_genes} genes.")
+    # Compute the batch bounds for each thread
+    iterator, batch_size = compute_batch_bounds(n_genes, batch_size, n_threads)
+    logger.trace(f"Processing {n_genes} genes through {len(iterator)} batches with {n_threads} threads.")
 
     # Compute estimated mem footprint
     _ = log_memory_usage(data_handler, group_container, batch_size, n_threads)
diff --git a/illico/utils/math.py b/illico/utils/math.py
@@ -2,7 +2,7 @@
 
 import math
 import warnings
-from typing import Literal
+from typing import List, Literal, Tuple
 
 import numpy as np
 from numba import njit
@@ -281,3 +281,49 @@ def chunk_and_fortranize(X: np.ndarray, chunk_lb: int, chunk_ub: int, indices: n
             for j in range(0, chunk_ub - chunk_lb):
                 chunk[i, j] = X[i, chunk_lb + j]
     return chunk
+
+
+def compute_batch_bounds(n_genes: int, batch_size: Literal["auto"] | int, n_threads: int) -> List[Tuple[int, int]]:
+    """Computes ideal batch bounds for processing genes in batches.
+    This function ensures no worker is starving. This could happen if we have 8 workers but 9 batches to allocate.
+    In this case, because each batch takes the same time to be processed, all but one workers will be idle waiting for one worker to process the last batch.
+
+    Args:
+        n_genes (int): Total number of genes
+        batch_size (Literal["auto"] | int): Batch size, or "auto" to compute ideal batch size.
+        n_threads (int): Number of threads to use.
+    Returns:
+        List[Tuple[int, int]]: List of (lower_bound, upper_bound) for each batch. Upper bound is excluding, following slicing conventions.
+    """
+    # No batching nor multithreading for small inputs
+    if n_genes < n_threads or n_genes < 256:
+        batch_size = n_genes
+        # n_threads = 1
+        batch_size = n_genes
+        bounds_iterator = [[0, n_genes]]
+    elif isinstance(batch_size, int):
+        # batch_size = min(batch_size, math.ceil(n_genes / n_threads))
+        bounds = list(range(0, n_genes + 1, batch_size))
+        if bounds[-1] != n_genes:
+            bounds.append(n_genes)
+        bounds_iterator = list(zip(bounds[:-1], bounds[1:]))
+    elif batch_size == "auto":
+        target_batch_size = 256
+        min_batches = (n_genes + target_batch_size - 1) // target_batch_size
+        num_batches = ((min_batches + n_threads - 1) // n_threads) * n_threads
+        base_size = n_genes // num_batches
+        remainder = n_genes % num_batches
+        bounds_iterator = []
+        start = 0
+        for i in range(num_batches):
+            end = start + base_size + (1 if i < remainder else 0)
+            bounds_iterator.append((start, end))
+            start = end
+        # Append the last gene as the right bound is excluding
+        if bounds_iterator[-1][1] != n_genes:
+            bounds_iterator[-1][1] = n_genes
+        batch_size = base_size
+    else:
+        raise ValueError(f"Invalid batch_size value: {batch_size}. Must be 'auto' or an integer.")
+
+    return bounds_iterator, batch_size
diff --git a/tests/test_asymptotic_wilcoxon.py b/tests/test_asymptotic_wilcoxon.py
@@ -494,6 +494,7 @@ def test_asymptotic_wilcoxon_auto_batchsize(eager_rand_adata):
     bigger_eager_rand_adata = ad.concat(
         [eager_rand_adata] * int(math.ceil(target_n_cols / eager_rand_adata.n_vars)), axis=1
     )
+    bigger_eager_rand_adata.var_names_make_unique()
     bigger_eager_rand_adata.obs = eager_rand_adata.obs.copy()
     asy_results = asymptotic_wilcoxon(
         adata=bigger_eager_rand_adata,

Original file line number	Diff line number	Diff line change
`@@ -494,6 +494,7 @@ def test_asymptotic_wilcoxon_auto_batchsize(eager_rand_adata):`
`494`	`494`	`bigger_eager_rand_adata = ad.concat(`
`495`	`495`	`[eager_rand_adata] * int(math.ceil(target_n_cols / eager_rand_adata.n_vars)), axis=1`
`496`	`496`	`)`
	`497`	`+ bigger_eager_rand_adata.var_names_make_unique()`
`497`	`498`	`bigger_eager_rand_adata.obs = eager_rand_adata.obs.copy()`
`498`	`499`	`asy_results = asymptotic_wilcoxon(`
`499`	`500`	`adata=bigger_eager_rand_adata,`