remydubois
diff --git a/‎illico/asymptotic_wilcoxon.py‎
Lines changed: 20 additions & 6 deletions b/‎illico/asymptotic_wilcoxon.py‎
Lines changed: 20 additions & 6 deletions
diff --git a/‎illico/ovo/dense_ovo.py‎
Lines changed: 18 additions & 10 deletions b/‎illico/ovo/dense_ovo.py‎
Lines changed: 18 additions & 10 deletions
diff --git a/‎illico/ovo/sparse_ovo.py‎
Lines changed: 36 additions & 26 deletions b/‎illico/ovo/sparse_ovo.py‎
Lines changed: 36 additions & 26 deletions
diff --git a/‎illico/ovr/dense_ovr.py‎
Lines changed: 37 additions & 11 deletions b/‎illico/ovr/dense_ovr.py‎
Lines changed: 37 additions & 11 deletions
@@ -218,6 +218,8 @@ def asymptotic_wilcoxon(
     tie_correct: bool = True,
     exp_post_agg: bool = False,
     layer: str | None = None,
+    groups: list[str] | None = None,
+    exclude_from_ovr: list[str] | None = None,
     precompile: bool = True,
     use_rust: bool = True,
     return_as_scanpy: bool = False,
@@ -264,6 +266,15 @@ def asymptotic_wilcoxon(
         Note that `scanpy.rank_genes_groups` assumes the data to be log1p, and exponentiates post aggregation by default.
     layer : str or None, default=None
         Layer in `adata.layers` to use for the data. If `None`, uses `adata.X`.
+    groups : list of str or None, default=None
+        Subset of groups to test. If `None`, tests all groups. This argument serves the same purpose as scanpy's `groups` argument in `rank_genes_groups`.
+        It is used to filter which groups to compare against the reference in the OVO scenario, or which groups to compare against the rest in the OVR scenario.
+        Note that in the OVR scenario, each comparison still happens against the entirety of the other groups, not just the ones listed in this argument.
+        Note that in the OVO scenario, the reference group is automatically added.
+    exclude_from_ovr : list of str or None, default=None
+        Subset of groups to exclude from the rest group in the OVR scenario. This argument is ignored in the OVO scenario.
+        This can be useful if, for instance, one of the groups is corrupted and contains meaningless data, and we don't want it to be part of the comparisons in the OVR scenario.
+        TODO: add warning about what values are okay or not okay taking interaction with `groups` into account.
     precompile : bool, default=True
         Whether to precompile necessary functions for performance. It is recommended to set this to `True`.
     use_rust : bool, default=True
@@ -376,21 +387,24 @@ def asymptotic_wilcoxon(
 
     # Process the groups information
     unique_raw_groups, group_container = encode_and_count_groups(
-        groups=adata.obs[group_keys].values, ref_group=reference
+        groups=adata.obs[group_keys].values,
+        ref_group=reference,
+        group_subset=groups,
+        exclude=exclude_from_ovr,
     )
     logger.info(
-        f"Found {group_container.counts.size} unique groups (min size: {group_container.counts.min()} cells; "
+        f"Found {group_container.counts.size} unique groups ({group_container.selected_group_ids.size} valid ones) (min size: {group_container.counts.min()} cells; "
         f"max size: {group_container.counts.max()} cells), with reference group: {reference}"
     )
     _, n_genes_total = X.shape
 
     # Allocate the results dataframes
     cols = pd.Series(adata.var_names, name="feature", dtype=str)
-    rows = pd.Series(unique_raw_groups, name="pert", dtype=str)
+    rows = pd.Series(unique_raw_groups[group_container.selected_group_ids], name="pert", dtype=str)
     results = np.empty((len(rows), len(cols), 4), dtype=np.float64)
 
     # Go through all the possible combinations
-    n_tests = n_genes_total * group_container.counts.size
+    n_tests = n_genes_total * group_container.selected_group_ids.size
     logger.trace(f"Performing a total of {n_tests:,d} tests.")
     with Parallel(n_threads, prefer="threads", return_as="generator_unordered") as pool:
         with tqdm(total=n_tests, smoothing=0.0, unit="it", unit_scale=True, unit_divisor=1000) as pbar:
@@ -427,7 +441,7 @@ def asymptotic_wilcoxon(
 
                 # Process chunks of columns one by one
                 for lb, ub in pool(all_purpose_operator(data_handler, lb, ub, group_container, is_log1p, use_continuity, alternative, tie_correct, exp_post_agg, use_rust, results) for lb, ub in iterator):  # fmt: skip
-                    pbar.update(group_container.counts.size * (ub - lb))
+                    pbar.update(group_container.selected_group_ids.size * (ub - lb))
 
     if not return_as_scanpy:
         if n_genes is not None:
@@ -444,7 +458,7 @@ def asymptotic_wilcoxon(
         # Return a dict formatted for Scanpy's rank_genes_groups results
         results = format_illico_results_for_scanpy(
             adata=adata,
-            unique_groups=unique_raw_groups,
+            unique_groups=unique_raw_groups[group_container.selected_group_ids],
             reference=reference,
             group_keys=group_keys,
             layer=layer,
 
@@ -4,7 +4,12 @@
 from numba import njit
 
 from illico.utils.groups import GroupContainer
-from illico.utils.math import chunk_and_fortranize, compute_pval, dense_fold_change
+from illico.utils.math import (
+    chunk_and_fortranize,
+    compute_pval,
+    dense_fold_change,
+    fancy_indexing_axis0,
+)
 from illico.utils.ranking import (
     _sort_along_axis_inplace,
     rank_sum_and_ties_from_sorted,
@@ -118,29 +123,32 @@ def dense_ovo_mwu_kernel_over_contiguous_col_chunk(
     ref_chunk = chunk_and_fortranize(X, chunk_lb, chunk_ub, ref_indices)
     _sort_along_axis_inplace(ref_chunk, axis=0)
 
-    pvalues = np.empty((n_groups, chunk_ub - chunk_lb), dtype=np.float64)
-    zscores = np.empty((n_groups, chunk_ub - chunk_lb), dtype=np.float64)
-    statistics = np.empty((n_groups, chunk_ub - chunk_lb), dtype=np.float64)
-    for group_id in range(n_groups):
+    n_selected_groups = grpc.selected_group_ids.size
+    pvalues = np.empty((n_selected_groups, chunk_ub - chunk_lb), dtype=np.float64)
+    zscores = np.empty((n_selected_groups, chunk_ub - chunk_lb), dtype=np.float64)
+    statistics = np.empty((n_selected_groups, chunk_ub - chunk_lb), dtype=np.float64)
+    for k, group_id in enumerate(grpc.selected_group_ids):
         if group_id == grpc.encoded_ref_group:
-            pvalues[group_id, :] = 1.0
-            zscores[group_id, :] = 0.0
-            statistics[group_id, :] = -1.0
+            pvalues[k, :] = 1.0
+            zscores[k, :] = 0.0
+            statistics[k, :] = -1.0
             continue
         tgt_indices = grpc.indices[grpc.indptr[group_id] : grpc.indptr[group_id + 1]]
         # tgt_chunk = np.asfortranarray(chunk[tgt_indices, :])
         tgt_chunk = chunk_and_fortranize(X, chunk_lb, chunk_ub, tgt_indices)
         _sort_along_axis_inplace(tgt_chunk, axis=0)
 
-        pvalues[group_id], statistics[group_id], zscores[group_id] = dense_ovo_mwu_kernel(
+        pvalues[k], statistics[k], zscores[k] = dense_ovo_mwu_kernel(
             sorted_ref_data=ref_chunk,
             sorted_tgt_data=tgt_chunk,
             use_continuity=use_continuity,
             tie_correct=tie_correct,
             alternative=alternative,
         )
 
-    # Compute fold change
+    # Compute fold change on all groups, but return it only for the selected groups
     fc = dense_fold_change(chunk, grpc, is_log1p=is_log1p, exp_post_agg=exp_post_agg)
+    if n_selected_groups < n_groups:
+        fc = fancy_indexing_axis0(fc, grpc.selected_group_ids)
 
     return pvalues, statistics, zscores, fc
@@ -4,7 +4,12 @@
 from numba import njit
 
 from illico.utils.groups import GroupContainer
-from illico.utils.math import compute_pval, diff, fold_change_from_summed_expr
+from illico.utils.math import (
+    compute_pval,
+    diff,
+    fancy_indexing_axis0,
+    fold_change_from_summed_expr,
+)
 from illico.utils.ranking import (
     _sort_csc_columns_inplace,
     rank_sum_and_ties_from_sorted,
@@ -171,15 +176,16 @@ def csc_ovo_mwu_kernel_over_contiguous_col_chunk(
     agg_counts = np.empty((n_groups, chunk_ub - chunk_lb), dtype=np.float64)
 
     # Now go through all the groups one by one
-    pvalues = np.empty((n_groups, csc_X_ref.shape[1]), dtype=np.float64)
-    zscores = np.empty((n_groups, csc_X_ref.shape[1]), dtype=np.float64)
-    statistics = np.empty((n_groups, csc_X_ref.shape[1]), dtype=np.float64)
-    for group_id in range(group_indptr.size - 1):
+    n_selected_groups = grpc.selected_group_ids.size
+    pvalues = np.empty((n_selected_groups, csc_X_ref.shape[1]), dtype=np.float64)
+    zscores = np.empty((n_selected_groups, csc_X_ref.shape[1]), dtype=np.float64)
+    statistics = np.empty((n_selected_groups, csc_X_ref.shape[1]), dtype=np.float64)
+    for k, group_id in enumerate(grpc.selected_group_ids):
         if group_id == ref_group_id:
-            pvalues[group_id, :] = 1.0
-            zscores[group_id, :] = 0.0
-            statistics[group_id, :] = -1.0
-            agg_counts[ref_group_id, :] = csc_sum_axis0(csc_X_ref, expm1=is_log1p & (not exp_post_agg))
+            pvalues[k, :] = 1.0
+            zscores[k, :] = 0.0
+            statistics[k, :] = -1.0
+            agg_counts[k, :] = csc_sum_axis0(csc_X_ref, expm1=is_log1p & (not exp_post_agg))
             continue
 
         # Chunk
@@ -197,11 +203,13 @@ def csc_ovo_mwu_kernel_over_contiguous_col_chunk(
             tie_correct=tie_correct,
             alternative=alternative,
         )
-        pvalues[group_id, :] = pvalue
-        statistics[group_id, :] = statistic
-        zscores[group_id, :] = zscore
+        pvalues[k, :] = pvalue
+        statistics[k, :] = statistic
+        zscores[k, :] = zscore
 
     fold_change = fold_change_from_summed_expr(agg_counts, grpc, exp_post_agg=exp_post_agg & is_log1p)
+    if n_selected_groups < n_groups:
+        fold_change = fancy_indexing_axis0(fold_change, grpc.selected_group_ids)
 
     return pvalues, statistics, zscores, fold_change
 
@@ -264,19 +272,18 @@ def csr_ovo_mwu_kernel_over_contiguous_col_chunk(
     # Sort
     _sort_csc_columns_inplace(csc_matrix=csc_X_ref)
 
-    # Initalize aggregated matrix to compute fold change later on
-    agg_counts = np.empty((n_groups, chunk_ub - chunk_lb), dtype=np.float64)
-
     # Now go through all the groups one by one
-    pvalues = np.empty((n_groups, csc_X_ref.shape[1]), dtype=np.float64)
-    zscores = np.empty((n_groups, csc_X_ref.shape[1]), dtype=np.float64)
-    statistics = np.empty((n_groups, csc_X_ref.shape[1]), dtype=np.float64)
-    for group_id in range(group_indptr.size - 1):
+    agg_counts = np.empty((n_groups, chunk_ub - chunk_lb), dtype=np.float64)
+    n_selected_groups = grpc.selected_group_ids.size
+    pvalues = np.empty((n_selected_groups, csc_X_ref.shape[1]), dtype=np.float64)
+    zscores = np.empty((n_selected_groups, csc_X_ref.shape[1]), dtype=np.float64)
+    statistics = np.empty((n_selected_groups, csc_X_ref.shape[1]), dtype=np.float64)
+    for k, group_id in enumerate(grpc.selected_group_ids):
         if group_id == ref_group_id:
-            pvalues[group_id, :] = 1.0
-            zscores[group_id, :] = 0.0
-            statistics[group_id, :] = -1.0
-            agg_counts[ref_group_id, :] = csc_sum_axis0(csc_X_ref, expm1=is_log1p & (not exp_post_agg))
+            pvalues[k, :] = 1.0
+            zscores[k, :] = 0.0
+            statistics[k, :] = -1.0
+            agg_counts[k, :] = csc_sum_axis0(csc_X_ref, expm1=is_log1p & (not exp_post_agg))
             continue
 
         # Chunk
@@ -294,10 +301,13 @@ def csr_ovo_mwu_kernel_over_contiguous_col_chunk(
             tie_correct=tie_correct,
             alternative=alternative,
         )
-        pvalues[group_id, :] = pvalue
-        statistics[group_id, :] = statistic
-        zscores[group_id, :] = zscore
+        pvalues[k, :] = pvalue
+        statistics[k, :] = statistic
+        zscores[k, :] = zscore
 
+    # Compute fold change for all groups, but return only the groups of interest
     fold_change = fold_change_from_summed_expr(agg_counts, grpc, exp_post_agg=exp_post_agg & is_log1p)
+    if n_selected_groups < n_groups:
+        fold_change = fancy_indexing_axis0(fold_change, grpc.selected_group_ids)
 
     return pvalues, statistics, zscores, fold_change
@@ -6,7 +6,13 @@
 from numba import njit
 
 from illico.utils.groups import GroupContainer
-from illico.utils.math import chunk_and_fortranize, compute_pval, dense_fold_change
+from illico.utils.math import (
+    _add_at_vec,
+    chunk_and_fortranize,
+    compute_pval,
+    fancy_indexing_axis0,
+    fold_change_from_summed_expr,
+)
 from illico.utils.ranking import _accumulate_group_ranksums_from_argsort
 from illico.utils.registry import KernelDataFormat, Test, nb_dispatcher_registry
 
@@ -46,14 +52,17 @@ def dense_ovr_mwu_kernel_over_contiguous_col_chunk(
 
     """
     # Convert to F-order for faster column access and sorting later
-    chunk = chunk_and_fortranize(X, chunk_lb, chunk_ub, None)
+    chunk = chunk_and_fortranize(X, chunk_lb, chunk_ub, grpc.ovr_inclusion_indices)
 
     # Get ranks and tie sums
     tie_sum = np.empty(chunk.shape[1], dtype=np.float64)
     ranksums = np.zeros(shape=(grpc.counts.size, chunk.shape[1]), dtype=np.float64)
+    included_groups_indicator = grpc.encoded_groups[grpc.ovr_inclusion_indices]
     for j in range(chunk.shape[1]):
         idxs = np.argsort(chunk[:, j])
-        col_tie_sum, _ = _accumulate_group_ranksums_from_argsort(chunk[:, j], idxs, grpc.encoded_groups, ranksums[:, j])
+        col_tie_sum, _ = _accumulate_group_ranksums_from_argsort(
+            chunk[:, j], idxs, included_groups_indicator, ranksums[:, j]
+        )
         tie_sum[j] = col_tie_sum
 
     # Compute U stats
@@ -63,22 +72,39 @@ def dense_ovr_mwu_kernel_over_contiguous_col_chunk(
     statistics = ranksums - n_tgt * (n_tgt + 1) / 2
     mu = n_ref * n_tgt / 2.0
     # Compute pvals
-    pvals = np.empty(shape=(grpc.counts.size, chunk.shape[1]), dtype=np.float64)
-    zscores = np.empty(shape=(grpc.counts.size, chunk.shape[1]), dtype=np.float64)
+    n_selected_groups = grpc.selected_group_ids.size
+    pvals = np.empty(shape=(n_selected_groups, chunk.shape[1]), dtype=np.float64)
+    zscores = np.empty(shape=(n_selected_groups, chunk.shape[1]), dtype=np.float64)
     for j in range(chunk.shape[1]):
-        for k in range(grpc.counts.size):
+        for k, grp_id in enumerate(grpc.selected_group_ids):
             pvals[k, j], zscores[k, j] = compute_pval(
-                n_ref=n_ref[k, 0],
-                n_tgt=n_tgt[k, 0],
+                n_ref=n_ref[grp_id, 0],
+                n_tgt=n_tgt[grp_id, 0],
                 n=n,
                 tie_sum=tie_sum[j] if tie_correct else 0.0,
-                U=statistics[k, j],
-                mu=mu[k, 0],
+                U=statistics[grp_id, j],
+                mu=mu[grp_id, 0],
                 contin_corr=0.5 if use_continuity else 0.0,
                 alternative=alternative,
             )
 
     # Get fold change
-    fold_change = dense_fold_change(chunk, grpc=grpc, is_log1p=is_log1p, exp_post_agg=exp_post_agg)
+    # Note: it would be a bit cumbersome to have dense_fold_change itself handle all the group
+    # selection and subsetting shenanigans. I find it clearer to have it here.
+    # TODO: actually not sure about this, because I ended up doing it in the sparse path.
+    group_agg_counts = np.zeros(shape=(grpc.counts.size, X.shape[1]), dtype=np.float64)
+    # Sum expressions per group
+    if is_log1p and not exp_post_agg:
+        _add_at_vec(group_agg_counts, grpc.encoded_groups[grpc.ovr_inclusion_indices], np.expm1(chunk))
+    else:
+        _add_at_vec(group_agg_counts, grpc.encoded_groups[grpc.ovr_inclusion_indices], chunk)
+    fold_change = fold_change_from_summed_expr(
+        group_agg_counts, grpc, exp_post_agg=exp_post_agg & is_log1p, sum_over_selected_groups_only=True
+    )
+
+    # Now filter on the groups to return, if needed
+    if n_selected_groups < grpc.counts.size:
+        fold_change = fancy_indexing_axis0(fold_change, grpc.selected_group_ids)
+        statistics = fancy_indexing_axis0(statistics, grpc.selected_group_ids)
 
     return pvals, statistics, zscores, fold_change