Skip to content

Commit 8641dcc

Browse files
Faster gene set removal
Reduced runtime and memory usage when removing gene sets with fewer than min_size or more than max_size genes prior to calculating enrichment scores. Also updated internal documentation.
1 parent 935b92d commit 8641dcc

File tree

6 files changed

+150
-34
lines changed

6 files changed

+150
-34
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: fast.ssgsea
22
Type: Package
33
Title: High-Performance Gene Set Enrichment Analysis (HP-GSEA)
4-
Version: 0.1.0.9028
5-
Date: 2026-02-26
4+
Version: 0.1.0.9029
5+
Date: 2026-02-28
66
Authors@R:
77
person(given = "Tyler", family = "Sagendorf",
88
email = "tyler.sagendorf@pnnl.gov",

NAMESPACE

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ export(read_gmt)
55
exportPattern("^[[:alpha:]]+")
66
import(dqrng)
77
importFrom(Rcpp,evalCpp)
8-
importFrom(collapse,"%!iin%")
98
importFrom(collapse,alloc)
109
importFrom(collapse,allv)
1110
importFrom(collapse,any_duplicated)

R/utils.R

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -116,14 +116,17 @@
116116
}
117117

118118

119-
#' @title Convert gene sets to a list of indices for .calcES
119+
#' @title Convert gene sets to a list of indices for `.calc_ES`
120120
#'
121121
#' @inheritParams fast_ssgsea
122122
#'
123123
#' @returns A named list.
124124
#'
125125
#' @author Tyler Sagendorf
126126
#'
127+
#' @importFrom collapse allv anyv fmatch fsubset funique groupid vec vlengths
128+
#' vtypes whichNA whichv
129+
#'
127130
#' @noRd
128131
.gene_sets_to_indices <- function(stats,
129132
gene_sets,
@@ -145,8 +148,7 @@
145148
}
146149

147150
# Pre-filter to remove gene sets that are too small. We can not remove gene
148-
# sets that are too large without first restricting the genes to the
149-
# background.
151+
# sets that are too large without first restricting genes to names(stats).
150152
set_sizes <- vlengths(gene_sets)
151153

152154
keep_sets <- whichv(set_sizes >= min_size, TRUE)
@@ -161,10 +163,8 @@
161163
genes <- vec(gene_sets)
162164
unique_genes <- funique(genes)
163165

164-
# Determine if any elements have an expected direction of change
165-
directional_sets <- 0L != length(
166-
grep(";[ud]{1}", unique_genes, perl = TRUE)
167-
)
166+
# Determine if any genes have an expected direction of change
167+
directional_sets <- anyv(grepl(";[ud]{1}", unique_genes, perl = TRUE), TRUE)
168168

169169
if (directional_sets) {
170170
# Determine which genes are expected to be "down" and remove the direction
@@ -182,7 +182,7 @@
182182
}
183183

184184
# names(stats) is first because it was sorted lexicographically to deal with
185-
# ties in the gene-level values
185+
# ties in stats
186186
unique_genes <- intersect(names(stats), unique_genes)
187187

188188
if (length(unique_genes) == 0L) {
@@ -200,8 +200,8 @@
200200

201201
unique_sets <- names(gene_sets)
202202

203+
# Remove genes not in names(stats)
203204
if (anyNA(gene_indices)) {
204-
# Remove genes not in names(stats)
205205
keep_genes <- whichNA(gene_indices, invert = TRUE)
206206

207207
gene_indices <- fsubset(gene_indices, keep_genes)
@@ -251,7 +251,7 @@
251251
}
252252

253253

254-
#' @title Fast, specialized rep.int
254+
#' @title Fast, specialized `rep.int`
255255
#'
256256
#' @description Equivalent to `rep.int(seq_along(times), times)`, but several
257257
#' times faster when the output is large.
@@ -287,17 +287,50 @@
287287
}
288288

289289

290+
#' @title Remove gene sets that are too small or too large
291+
#'
292+
#' @param gene_indices integer vector; indices of the genes in each set,
293+
#' arranged in contiguous blocks by gene set.
294+
#' @param extreme_set_indices integer vector; indices of gene sets that are too
295+
#' small or too large to test. Indices can range from 1 to `length(m)`.
296+
#' @param m integer vector; the number of genes in each set, where
297+
#' `length(gene_indices) == sum(m)`.
298+
#'
299+
#' @returns The vector `gene_indices` with blocks of genes from the extreme gene
300+
#' sets removed.
301+
#'
302+
#' @details This function is not called when all or no gene sets are extreme. If
303+
#' all gene sets are extreme, an error will be thrown by `.calc_ES`.
304+
#'
305+
#' The runtime of this function decreases as the number of extreme sets
306+
#' increases.
307+
#'
308+
#' @author Tyler Sagendorf
309+
#'
310+
#' @noRd
311+
.C_remove_extreme_gene_sets <- function(gene_indices,
312+
extreme_set_indices,
313+
m) {
314+
.Call(
315+
"_C_remove_extreme_gene_sets",
316+
gene_indices,
317+
extreme_set_indices,
318+
m
319+
)
320+
}
321+
322+
290323
#' @title Calculate Enrichment Scores
291324
#'
292325
#' @param y_prime numeric vector of absolute gene-level values raised to the
293326
#' power of `alpha` for genes that are members of at least one gene set.
294327
#' @param r_prime numeric vector of the ranks of the gene-level values for genes
295328
#' that are members of at least one gene set.
296329
#' @param sum_ranks numeric; the sum of all ranks.
297-
#' @param i integer vector; indices of the genes in all sets. Used to index
298-
#' vectors `y_prime` and `r_prime`.
330+
#' @param gene_indices integer vector; indices of the genes in all sets. Used to
331+
#' index vectors `y_prime` and `r_prime`.
299332
#' @param m integer vector; the number of genes in each set. Used to select
300-
#' elements of `i`.
333+
#' elements of `gene_indices`.
301334
#' @param w integer vector; the number of genes that are not in each set.
302335
#' @inheritParams fast_ssgsea
303336
#'
@@ -339,8 +372,7 @@
339372
#'
340373
#' @author Tyler Sagendorf
341374
#'
342-
#' @importFrom collapse %!iin% allv anyv fmatch fmax fsubset funique groupid vec
343-
#' vlengths vtypes whichNA whichv
375+
#' @importFrom collapse fmatch fmax fsubset
344376
#' @importFrom data.table frank
345377
#'
346378
#' @noRd
@@ -350,7 +382,7 @@
350382
gene_sets,
351383
min_size = 2L,
352384
max_size = Inf) {
353-
max_size <- max(min_size, min(max_size, n_genes - 1L))
385+
max_size <- min(max_size, n_genes - 1L)
354386

355387
storage.mode(min_size) <- storage.mode(max_size) <- "integer"
356388

@@ -391,20 +423,22 @@
391423
} else if (length(extreme_set_indices)) {
392424
unique_sets <- fsubset(unique_sets, -extreme_set_indices)
393425

394-
m <- fsubset(m, -extreme_set_indices)
395-
396-
gene_indices <- fsubset(
397-
.x = gene_indices,
398-
subset = set_indices %!iin% extreme_set_indices
426+
gene_indices <- .C_remove_extreme_gene_sets(
427+
gene_indices = gene_indices,
428+
extreme_set_indices = extreme_set_indices,
429+
m = m
399430
)
400431

401-
if (directional_sets) {
402-
m_d <- fsubset(m_d, -extreme_set_indices)
432+
m <- fsubset(m, -extreme_set_indices)
403433

404-
gene_indices_down <- fsubset(
405-
.x = gene_indices_down,
406-
subset = set_indices_down %!iin% extreme_set_indices
434+
if (directional_sets) {
435+
gene_indices_down <- .C_remove_extreme_gene_sets(
436+
gene_indices = gene_indices_down,
437+
extreme_set_indices = extreme_set_indices,
438+
m = m_d
407439
)
440+
441+
m_d <- fsubset(m_d, -extreme_set_indices)
408442
}
409443
}
410444

@@ -439,7 +473,8 @@
439473

440474
if (directional_sets) {
441475
# Calculate enrichment scores separately for the up-regulated and
442-
# down-regulated genes
476+
# down-regulated genes. Elements of m and m_d that are < min_size will be
477+
# replaced with 0.
443478
ES_u <- .C_calc_ES(
444479
y_prime = y_prime,
445480
r_prime = r_prime,

src/C_functions.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,57 @@
33
#include <Rinternals.h>
44

55

6+
SEXP _C_remove_extreme_gene_sets(const SEXP gene_indices,
7+
const SEXP extreme_set_indices,
8+
const SEXP m) {
9+
const int n_genes = Rf_length(gene_indices);
10+
const int n_sets = Rf_length(m);
11+
const int n_extreme_sets = Rf_length(extreme_set_indices);
12+
13+
const int *restrict pgene_indices = INTEGER(gene_indices);
14+
const int *restrict pextreme_set_indices = INTEGER(extreme_set_indices);
15+
const int *restrict pm = INTEGER(m);
16+
17+
int length_out = 0;
18+
19+
for (int j = 0; j < n_extreme_sets; ++j) {
20+
length_out += pm[pextreme_set_indices[j] - 1];
21+
}
22+
23+
length_out = n_genes - length_out;
24+
25+
SEXP out = PROTECT(Rf_allocVector(INTSXP, length_out));
26+
int *restrict pout = INTEGER(out);
27+
28+
int start = 0;
29+
int end = 0;
30+
int j = 0;
31+
int shift_backward = 0;
32+
33+
for (int i = 0; i < n_sets && j < n_extreme_sets; ++i) {
34+
start = end;
35+
end = start + pm[i];
36+
37+
if (i != (pextreme_set_indices[j] - 1)) {
38+
for (int k = start; k < end; ++k) {
39+
pout[k - shift_backward] = pgene_indices[k];
40+
}
41+
} else {
42+
shift_backward += pm[i];
43+
++j;
44+
}
45+
}
46+
47+
for (int k = end; k < n_genes; ++k) {
48+
pout[k - shift_backward] = pgene_indices[k];
49+
}
50+
51+
UNPROTECT(1);
52+
53+
return out;
54+
}
55+
56+
657
// Calculate enrichment scores for all gene sets
758
SEXP _C_calc_ES(const SEXP y_prime,
859
const SEXP r_prime,

src/RcppExports.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,17 @@ END_RCPP
6262
RcppExport SEXP _C_calc_ES(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
6363
RcppExport SEXP _C_group_sizes(SEXP, SEXP);
6464
RcppExport SEXP _C_pair_szudzik(SEXP, SEXP);
65+
RcppExport SEXP _C_remove_extreme_gene_sets(SEXP, SEXP, SEXP);
6566
RcppExport SEXP _C_rep_int(SEXP, SEXP);
6667

6768
static const R_CallMethodDef CallEntries[] = {
6869
{"_fast_ssgsea_calc_ES_perm", (DL_FUNC) &_fast_ssgsea_calc_ES_perm, 13},
6970
{"_fast_ssgsea_calc_ES_perm_dir", (DL_FUNC) &_fast_ssgsea_calc_ES_perm_dir, 17},
70-
{"_C_calc_ES", (DL_FUNC) &_C_calc_ES, 7},
71-
{"_C_group_sizes", (DL_FUNC) &_C_group_sizes, 2},
72-
{"_C_pair_szudzik", (DL_FUNC) &_C_pair_szudzik, 2},
73-
{"_C_rep_int", (DL_FUNC) &_C_rep_int, 2},
71+
{"_C_calc_ES", (DL_FUNC) &_C_calc_ES, 7},
72+
{"_C_group_sizes", (DL_FUNC) &_C_group_sizes, 2},
73+
{"_C_pair_szudzik", (DL_FUNC) &_C_pair_szudzik, 2},
74+
{"_C_remove_extreme_gene_sets", (DL_FUNC) &_C_remove_extreme_gene_sets, 3},
75+
{"_C_rep_int", (DL_FUNC) &_C_rep_int, 2},
7476
{NULL, NULL, 0}
7577
};
7678

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
test_that(".C_remove_extreme_sets is correct", {
2+
set_sizes <- sample.int(50L, size = 1000L, replace = TRUE)
3+
4+
sum_sizes <- sum(set_sizes)
5+
6+
gene_indices <- seq_len(sum_sizes)
7+
set_indices <- .C_rep_int(set_sizes, length = sum_sizes)
8+
9+
extreme_set_indices <- which(
10+
set_sizes < 5L | set_sizes > 40L
11+
)
12+
13+
res <- .C_remove_extreme_gene_sets(
14+
gene_indices = gene_indices,
15+
extreme_set_indices = extreme_set_indices,
16+
m = set_sizes
17+
)
18+
19+
expected <- fsubset(
20+
.x = gene_indices,
21+
# collapse::`%!iin%`
22+
subset = whichNA(fmatch(set_indices, extreme_set_indices))
23+
)
24+
25+
expect_identical(
26+
res,
27+
expected
28+
)
29+
})

0 commit comments

Comments
 (0)