Faster gene set removal

TylerSagendorf · TylerSagendorf · commit 8641dcc302c1 · 2026-02-28T01:13:54.000-06:00
Reduced runtime and memory usage when removing gene sets with fewer than min_size or more than max_size genes prior to calculating enrichment scores. Also updated internal documentation.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: fast.ssgsea
 Type: Package
 Title: High-Performance Gene Set Enrichment Analysis (HP-GSEA)
-Version: 0.1.0.9028
-Date: 2026-02-26
+Version: 0.1.0.9029
+Date: 2026-02-28
 Authors@R: 
       person(given = "Tyler", family = "Sagendorf", 
              email = "tyler.sagendorf@pnnl.gov", 
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,7 +5,6 @@ export(read_gmt)
 exportPattern("^[[:alpha:]]+")
 import(dqrng)
 importFrom(Rcpp,evalCpp)
-importFrom(collapse,"%!iin%")
 importFrom(collapse,alloc)
 importFrom(collapse,allv)
 importFrom(collapse,any_duplicated)
diff --git a/R/utils.R b/R/utils.R
@@ -116,14 +116,17 @@
 }
 
 
-#' @title Convert gene sets to a list of indices for .calcES
+#' @title Convert gene sets to a list of indices for `.calc_ES`
 #'
 #' @inheritParams fast_ssgsea
 #'
 #' @returns A named list.
 #'
 #' @author Tyler Sagendorf
 #'
+#' @importFrom collapse allv anyv fmatch fsubset funique groupid vec vlengths
+#'   vtypes whichNA whichv
+#'
 #' @noRd
 .gene_sets_to_indices <- function(stats,
                                   gene_sets,
@@ -145,8 +148,7 @@
   }
 
   # Pre-filter to remove gene sets that are too small. We can not remove gene
-  # sets that are too large without first restricting the genes to the
-  # background.
+  # sets that are too large without first restricting genes to names(stats).
   set_sizes <- vlengths(gene_sets)
 
   keep_sets <- whichv(set_sizes >= min_size, TRUE)
@@ -161,10 +163,8 @@
   genes <- vec(gene_sets)
   unique_genes <- funique(genes)
 
-  # Determine if any elements have an expected direction of change
-  directional_sets <- 0L != length(
-    grep(";[ud]{1}", unique_genes, perl = TRUE)
-  )
+  # Determine if any genes have an expected direction of change
+  directional_sets <- anyv(grepl(";[ud]{1}", unique_genes, perl = TRUE), TRUE)
 
   if (directional_sets) {
     # Determine which genes are expected to be "down" and remove the direction
@@ -182,7 +182,7 @@
   }
 
   # names(stats) is first because it was sorted lexicographically to deal with
-  # ties in the gene-level values
+  # ties in stats
   unique_genes <- intersect(names(stats), unique_genes)
 
   if (length(unique_genes) == 0L) {
@@ -200,8 +200,8 @@
 
   unique_sets <- names(gene_sets)
 
+  # Remove genes not in names(stats)
   if (anyNA(gene_indices)) {
-    # Remove genes not in names(stats)
     keep_genes <- whichNA(gene_indices, invert = TRUE)
 
     gene_indices <- fsubset(gene_indices, keep_genes)
@@ -251,7 +251,7 @@
 }
 
 
-#' @title Fast, specialized rep.int
+#' @title Fast, specialized `rep.int`
 #'
 #' @description Equivalent to `rep.int(seq_along(times), times)`, but several
 #'   times faster when the output is large.
@@ -287,17 +287,50 @@
 }
 
 
+#' @title Remove gene sets that are too small or too large
+#'
+#' @param gene_indices integer vector; indices of the genes in each set,
+#'   arranged in contiguous blocks by gene set.
+#' @param extreme_set_indices integer vector; strictly increasing indices of
+#'   gene sets that are too small or too large to test, from 1 to `length(m)`.
+#' @param m integer vector; the number of genes in each set, where
+#'   `length(gene_indices) == sum(m)`.
+#'
+#' @returns A new integer vector: `gene_indices` without the blocks of genes
+#'   belonging to the extreme gene sets.
+#'
+#' @details This function is not called when all or no gene sets are extreme. If
+#'   all gene sets are extreme, an error will be thrown by `.calc_ES`.
+#'
+#'   The runtime of this function decreases as the number of extreme sets
+#'   increases.
+#'
+#' @author Tyler Sagendorf
+#'
+#' @noRd
+.C_remove_extreme_gene_sets <- function(gene_indices,
+                                        extreme_set_indices,
+                                        m) {
+  .Call(
+    "_C_remove_extreme_gene_sets",
+    gene_indices,
+    extreme_set_indices,
+    m
+  )
+}
+
+
 #' @title Calculate Enrichment Scores
 #'
 #' @param y_prime numeric vector of absolute gene-level values raised to the
 #'   power of `alpha` for genes that are members of at least one gene set.
 #' @param r_prime numeric vector of the ranks of the gene-level values for genes
 #'   that are members of at least one gene set.
 #' @param sum_ranks numeric; the sum of all ranks.
-#' @param i integer vector; indices of the genes in all sets. Used to index
-#'   vectors `y_prime` and `r_prime`.
+#' @param gene_indices integer vector; indices of the genes in all sets. Used to
+#'   index vectors `y_prime` and `r_prime`.
 #' @param m integer vector; the number of genes in each set. Used to select
-#'   elements of `i`.
+#'   elements of `gene_indices`.
 #' @param w integer vector; the number of genes that are not in each set.
 #' @inheritParams fast_ssgsea
 #'
@@ -339,8 +372,7 @@
 #'
 #' @author Tyler Sagendorf
 #'
-#' @importFrom collapse %!iin% allv anyv fmatch fmax fsubset funique groupid vec
-#'   vlengths vtypes whichNA whichv
+#' @importFrom collapse fmatch fmax fsubset
 #' @importFrom data.table frank
 #'
 #' @noRd
@@ -350,7 +382,7 @@
                      gene_sets,
                      min_size = 2L,
                      max_size = Inf) {
-  max_size <- max(min_size, min(max_size, n_genes - 1L))
+  max_size <- min(max_size, n_genes - 1L)
 
   storage.mode(min_size) <- storage.mode(max_size) <- "integer"
 
@@ -391,20 +423,22 @@
   } else if (length(extreme_set_indices)) {
     unique_sets <- fsubset(unique_sets, -extreme_set_indices)
 
-    m <- fsubset(m, -extreme_set_indices)
-
-    gene_indices <- fsubset(
-      .x = gene_indices,
-      subset = set_indices %!iin% extreme_set_indices
+    gene_indices <- .C_remove_extreme_gene_sets(
+      gene_indices = gene_indices,
+      extreme_set_indices = extreme_set_indices,
+      m = m
     )
 
-    if (directional_sets) {
-      m_d <- fsubset(m_d, -extreme_set_indices)
+    m <- fsubset(m, -extreme_set_indices)
 
-      gene_indices_down <- fsubset(
-        .x = gene_indices_down,
-        subset = set_indices_down %!iin% extreme_set_indices
+    if (directional_sets) {
+      gene_indices_down <- .C_remove_extreme_gene_sets(
+        gene_indices = gene_indices_down,
+        extreme_set_indices = extreme_set_indices,
+        m = m_d
       )
+
+      m_d <- fsubset(m_d, -extreme_set_indices)
     }
   }
 
@@ -439,7 +473,8 @@
 
   if (directional_sets) {
     # Calculate enrichment scores separately for the up-regulated and
-    # down-regulated genes
+    # down-regulated genes. Elements of m and m_d that are < min_size will be
+    # replaced with 0.
     ES_u <- .C_calc_ES(
       y_prime = y_prime,
       r_prime = r_prime,
diff --git a/src/C_functions.c b/src/C_functions.c
@@ -3,6 +3,57 @@
 #include <Rinternals.h>
 
 
+SEXP _C_remove_extreme_gene_sets(const SEXP gene_indices,
+                                 const SEXP extreme_set_indices,
+                                 const SEXP m) {  // returns a new INTSXP
+  const int n_genes = Rf_length(gene_indices);  // total entries in gene_indices
+  const int n_sets = Rf_length(m);
+  const int n_extreme_sets = Rf_length(extreme_set_indices);
+
+  const int *restrict pgene_indices = INTEGER(gene_indices);
+  const int *restrict pextreme_set_indices = INTEGER(extreme_set_indices);
+  const int *restrict pm = INTEGER(m);
+
+  int length_out = 0;
+  // Sum the sizes of the extreme sets; their indices are 1-based.
+  for (int j = 0; j < n_extreme_sets; ++j) {
+    length_out += pm[pextreme_set_indices[j] - 1];
+  }
+
+  length_out = n_genes - length_out;  // entries left after removal
+
+  SEXP out = PROTECT(Rf_allocVector(INTSXP, length_out));
+  int *restrict pout = INTEGER(out);
+
+  int start = 0;           // first entry of the current set's block
+  int end = 0;             // one past the last entry of that block
+  int j = 0;               // position of the next extreme set to skip
+  int shift_backward = 0;  // number of entries skipped so far
+  // Visit sets in order; extreme indices must be increasing to be matched.
+  for (int i = 0; i < n_sets && j < n_extreme_sets; ++i) {
+    start = end;
+    end = start + pm[i];
+
+    if (i != (pextreme_set_indices[j] - 1)) {  // set i is kept: copy its block
+      for (int k = start; k < end; ++k) {
+        pout[k - shift_backward] = pgene_indices[k];
+      }
+    } else {  // set i is extreme: skip its block
+      shift_backward += pm[i];
+      ++j;
+    }
+  }
+  // Copy everything after the last block visited by the loop above.
+  for (int k = end; k < n_genes; ++k) {
+    pout[k - shift_backward] = pgene_indices[k];
+  }
+
+  UNPROTECT(1);
+
+  return out;
+}
+
+
 // Calculate enrichment scores for all gene sets
 SEXP _C_calc_ES(const SEXP y_prime,
                 const SEXP r_prime,
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -62,15 +62,17 @@ END_RCPP
 RcppExport SEXP _C_calc_ES(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 RcppExport SEXP _C_group_sizes(SEXP, SEXP);
 RcppExport SEXP _C_pair_szudzik(SEXP, SEXP);
+RcppExport SEXP _C_remove_extreme_gene_sets(SEXP, SEXP, SEXP);
 RcppExport SEXP _C_rep_int(SEXP, SEXP);
 
 static const R_CallMethodDef CallEntries[] = {
     {"_fast_ssgsea_calc_ES_perm", (DL_FUNC) &_fast_ssgsea_calc_ES_perm, 13},
     {"_fast_ssgsea_calc_ES_perm_dir", (DL_FUNC) &_fast_ssgsea_calc_ES_perm_dir, 17},
-    {"_C_calc_ES",      (DL_FUNC) &_C_calc_ES,      7},
-    {"_C_group_sizes",  (DL_FUNC) &_C_group_sizes,  2},
-    {"_C_pair_szudzik", (DL_FUNC) &_C_pair_szudzik, 2},
-    {"_C_rep_int",      (DL_FUNC) &_C_rep_int,      2},
+    {"_C_calc_ES",                  (DL_FUNC) &_C_calc_ES,                  7},
+    {"_C_group_sizes",              (DL_FUNC) &_C_group_sizes,              2},
+    {"_C_pair_szudzik",             (DL_FUNC) &_C_pair_szudzik,             2},
+    {"_C_remove_extreme_gene_sets", (DL_FUNC) &_C_remove_extreme_gene_sets, 3},
+    {"_C_rep_int",                  (DL_FUNC) &_C_rep_int,                  2},
     {NULL, NULL, 0}
 };
 
diff --git a/tests/testthat/test-dots_C_remove_extreme_sets.R b/tests/testthat/test-dots_C_remove_extreme_sets.R
@@ -0,0 +1,29 @@
+test_that(".C_remove_extreme_gene_sets is correct", {
+  set_sizes <- sample.int(50L, size = 1000L, replace = TRUE)
+
+  sum_sizes <- sum(set_sizes)
+  # Use the positions 1..sum_sizes themselves as stand-in gene indices
+  gene_indices <- seq_len(sum_sizes)
+  set_indices <- .C_rep_int(set_sizes, length = sum_sizes)
+  # Treat sets with fewer than 5 or more than 40 genes as extreme
+  extreme_set_indices <- which(
+    set_sizes < 5L | set_sizes > 40L
+  )
+
+  res <- .C_remove_extreme_gene_sets(
+    gene_indices = gene_indices,
+    extreme_set_indices = extreme_set_indices,
+    m = set_sizes
+  )
+  # Reference result: keep positions whose set index is not extreme
+  expected <- fsubset(
+    .x = gene_indices,
+    # Equivalent to set_indices %!iin% extreme_set_indices (collapse)
+    subset = whichNA(fmatch(set_indices, extreme_set_indices))
+  )
+
+  expect_identical(
+    res,
+    expected
+  )
+})