Skip to content

Commit eaf7104

Browse files
Improve incidence matrix creation
Improve speed and reduce memory usage when converting a named list of gene sets to a sparse incidence matrix.
1 parent b63c207 commit eaf7104

File tree

12 files changed

+215
-30
lines changed

12 files changed

+215
-30
lines changed

NAMESPACE

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@ export(read_gmt)
55
exportPattern("^[[:alpha:]]+")
66
import(RcppArmadillo)
77
import(dqrng)
8-
importFrom(Matrix,sparseMatrix)
98
importFrom(Rcpp,evalCpp)
10-
importFrom(collapse,alloc)
119
importFrom(collapse,allv)
1210
importFrom(collapse,anyv)
1311
importFrom(collapse,fsubset)

R/RcppExports.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,10 @@ NULL
184184
#' @noRd
185185
NULL
186186

187+
.Cpp_unsafe_sparseMatrix <- function(i, j, dims, dimnames) {
188+
.Call(`_fast_ssgsea_unsafe_sparseMatrix`, i, j, dims, dimnames)
189+
}
190+
187191
.Cpp_matmult_sparse <- function(X, Y) {
188192
.Call(`_fast_ssgsea_matmult_sparse`, X, Y)
189193
}

R/utils.R

Lines changed: 65 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,36 @@
132132
}
133133

134134

135+
#' @title Fast, specialized rep.int
136+
#'
137+
#' @param times integer vector of group sizes.
138+
#'
139+
#' @returns Integer vector. Equivalent to the result of
140+
#' `rep.int(seq_along(times), times)`, but several times faster.
141+
#'
142+
#' @author Tyler Sagendorf
143+
#'
144+
#' @noRd
145+
.C_rep_int <- function(sizes) {
146+
.Call("_C_rep_int", sizes)
147+
}
148+
149+
150+
#' @title Get indices of non-duplicate integer pairs
151+
#'
152+
#' @param x,y integer vectors. `y` is assumed to be sorted, and `x` is sorted
153+
#' within groups of `y`. That is, any duplicate pairs are contiguous.
154+
#'
155+
#' @returns Integer vector of the indices of the non-duplicate (x, y) pairs.
156+
#'
157+
#' @author Tyler Sagendorf
158+
#'
159+
#' @noRd
160+
.C_pairs_not_duplicated <- function(x, y) {
161+
.Call("_C_pairs_not_duplicated", x, y)
162+
}
163+
164+
135165
#' @title Create Sparse Incidence Matrices
136166
#'
137167
#' @description Create a list of sparse incidence matrices, where the unique
@@ -155,10 +185,9 @@
155185
#'
156186
#' @author Tyler Sagendorf
157187
#'
158-
#' @importFrom collapse alloc allv anyv fsubset funique groupid vec vlengths
159-
#' vtypes whichNA whichv
188+
#' @importFrom collapse allv anyv fsubset funique groupid vec vlengths vtypes
189+
#' whichNA whichv
160190
#' @importFrom data.table chmatch
161-
#' @importFrom Matrix sparseMatrix
162191
#'
163192
#' @noRd
164193
.sparseIncidence <- function(gene_sets,
@@ -242,11 +271,10 @@
242271
# Row indices for sparse matrix
243272
i <- chmatch(elements, unique_elements, nomatch = NA_integer_)
244273

245-
# Unique set names
246274
unique_sets <- names(gene_sets)
247275

248276
# Column indices for sparse matrix
249-
j <- rep.int(seq_along(unique_sets), set_sizes)
277+
j <- .C_rep_int(set_sizes)
250278

251279
if (anyNA(i)) {
252280
# Remove elements not in the background
@@ -255,61 +283,75 @@
255283
i <- fsubset(i, keep)
256284
j <- fsubset(j, keep)
257285

286+
if (any_dir) {
287+
direction_down <- fsubset(direction_down, keep)
288+
}
289+
258290
unique_sets <- fsubset(unique_sets, funique(j))
259291

260-
j <- groupid(j) # this works because each set is a contiguous batch
292+
j <- groupid(j) # this works because each set is a contiguous group
293+
class(j) <- "integer"
261294
}
262295

296+
# Must sort to remove duplicates easily and to use the unsafe sparseMatrix
297+
# function. This is the slowest part of this function.
298+
o <- order(j, i, method = "radix")
299+
i <- fsubset(i, o)
300+
263301
dim_names <- list(unique_elements, unique_sets)
264302
dims <- vlengths(dim_names)
265303

266304
# Split elements by direction of change to create two incidence matrices
267305
if (any_dir) {
306+
direction_down <- fsubset(direction_down, o)
268307
idx_down <- whichv(direction_down, TRUE)
269308
direction_down <- NULL # signal that this is no longer needed
270309

271310
i_down <- fsubset(i, idx_down)
272311
j_down <- fsubset(j, idx_down)
273312

313+
# Indices of unique (i_down, j_down) pairs
314+
unique_pairs_idx <- .C_pairs_not_duplicated(i_down, j_down)
315+
316+
if (length(unique_pairs_idx) != length(i_down)) {
317+
i_down <- fsubset(i_down, unique_pairs_idx)
318+
j_down <- fsubset(j_down, unique_pairs_idx)
319+
}
320+
274321
i <- fsubset(i, -idx_down)
275322
j <- fsubset(j, -idx_down)
276323
}
277324

325+
# Indices of unique (j, i) pairs
326+
unique_pairs_idx <- .C_pairs_not_duplicated(i, j)
327+
328+
if (length(unique_pairs_idx) != length(i)) {
329+
i <- fsubset(i, unique_pairs_idx)
330+
j <- fsubset(j, unique_pairs_idx)
331+
}
332+
278333
# Incidence matrix where a 1 indicates that the element is in the set. If
279334
# gene_sets is a directional database, then A will only contain elements that
280335
# are expected to be "up".
281-
A <- sparseMatrix(
336+
A <- .Cpp_unsafe_sparseMatrix(
282337
i = i,
283338
j = j,
284-
x = alloc(1, length(i)),
285339
dims = dims,
286-
dimnames = dim_names,
287-
check = FALSE,
288-
use.last.ij = FALSE
340+
dimnames = dim_names
289341
)
290342

291-
# In the unlikely event where an element appears multiple times in the same
292-
# set, some values of A will be > 1. Replace all values with 1. Could also use
293-
# the use.last.ij parameter in sparseMatrix(), but this is faster.
294-
attr(A, which = "x") <- alloc(1, length(attr(A, which = "x")))
295-
296343
A_d <- NULL # default
297344

298345
if (any_dir) {
299346
# Incidence matrix where a 1 indicates that an element is expected to be
300347
# down in the set
301-
A_d <- sparseMatrix(
348+
A_d <- .Cpp_unsafe_sparseMatrix(
302349
i = i_down,
303350
j = j_down,
304-
x = alloc(1, length(i_down)),
305351
dims = dims,
306-
dimnames = dim_names,
307-
check = FALSE,
308-
use.last.ij = FALSE
352+
dimnames = dim_names
309353
)
310354

311-
attr(A_d, which = "x") <- alloc(1, length(attr(A_d, which = "x")))
312-
313355
# The Hadamard product A * A_d should be a matrix of zeros, since genes can
314356
# not be "up" and "down" in the same set.
315357
if (length(attr(A * A_d, which = "x"))) {

man/figures/README-figure-1.png

-654 Bytes
Loading

man/figures/README-figure-2.png

-1.03 KB
Loading
-3 Bytes
Binary file not shown.

simulation/figures/figure-1.pdf

-8 Bytes
Binary file not shown.

simulation/figures/figure-2.pdf

-14 Bytes
Binary file not shown.

simulation/scripts/manuscript_figures.Rmd

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ plot_time <- function(x, y_breaks, y_max = NULL, expand_upper = 0.05) {
8282
) +
8383
scale_color_manual(
8484
name = "Number of gene sets: ",
85-
values = palette.colors(n = 3L),
85+
values = structure(palette.colors(n = 3L), names = NULL),
8686
breaks = c("1,000", "10,000", "50,000")
8787
) +
8888
labs(
@@ -142,17 +142,17 @@ time_res <- prepare_data("../data/fast-ssGSEA_timing_results.rds")
142142
143143
# 10,000 permutations
144144
pa <- filter(time_res, nperm == levels(nperm)[1L]) %>%
145-
plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.02) +
145+
plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.05) +
146146
labs(x = "")
147147
148148
# 100,000 permutations
149149
pb <- filter(time_res, nperm == levels(nperm)[2L]) %>%
150-
plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.02) +
150+
plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.05) +
151151
labs(y = "")
152152
153153
# 1,000,000 permutations
154154
pc <- filter(time_res, nperm == levels(nperm)[3L]) %>%
155-
plot_time(y_breaks = seq(0, 20, 4), y_max = 20, expand_upper = 5e-3) +
155+
plot_time(y_breaks = seq(0, 20, 4), y_max = 20, expand_upper = 0.05) +
156156
labs(x = "", y = "")
157157
158158
# Combine plots
@@ -187,7 +187,7 @@ time_res <- prepare_data("../data/FGSEA_timing_results_nproc_1.rds")
187187
188188
# 10,000 permutations
189189
pa <- filter(time_res, nperm == levels(nperm)[1L]) %>%
190-
plot_time(y_breaks = seq(0, 40, 10), y_max = 40, expand_upper = 5e-3) +
190+
plot_time(y_breaks = seq(0, 40, 10), y_max = 40, expand_upper = 0.05) +
191191
labs(x = "")
192192
193193
# 100,000 permutations

src/C_functions.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,89 @@
33
#include <Rinternals.h>
44

55

6+
SEXP _C_rep_int(SEXP times) {
7+
const int *restrict ptimes = INTEGER(times);
8+
9+
const int n_times = Rf_length(times);
10+
11+
int n = 0;
12+
13+
for (int i = 0; i < n_times; ++i) {
14+
n += ptimes[i];
15+
}
16+
17+
SEXP out = PROTECT(Rf_allocVector(INTSXP, n));
18+
int *restrict pout = INTEGER(out);
19+
20+
int val = 0;
21+
int start = 0;
22+
int end = 0;
23+
24+
for (int i = 0; i < n_times; ++i) {
25+
++val;
26+
start = end;
27+
end = start + ptimes[i];
28+
29+
for (int j = start; j < end; ++j) {
30+
pout[j] = val;
31+
}
32+
}
33+
34+
UNPROTECT(1);
35+
36+
return out;
37+
}
38+
39+
40+
SEXP _C_pairs_not_duplicated(SEXP x, SEXP y) {
41+
const int N = Rf_length(x);
42+
43+
if (N == 0) {
44+
return R_NilValue;
45+
}
46+
47+
if (N == 1) {
48+
return Rf_ScalarInteger(1);
49+
}
50+
51+
const int *restrict px = INTEGER(x);
52+
const int *restrict py = INTEGER(y);
53+
54+
SEXP temp = PROTECT(Rf_allocVector(INTSXP, N));
55+
int *restrict ptemp = INTEGER(temp);
56+
57+
ptemp[0] = 1; // the first pair is not a duplicate, by definition
58+
59+
int n_unique = 1;
60+
61+
// Due to how x and y are sorted, duplicate pairs will be contiguous
62+
for (int i = 1; i < N; ++i) {
63+
if (!((px[i] == px[i - 1]) & (py[i] == py[i - 1]))) { // not duplicated
64+
ptemp[n_unique] = i + 1;
65+
++n_unique;
66+
}
67+
}
68+
69+
if (n_unique == N) {
70+
UNPROTECT(1);
71+
72+
return temp;
73+
}
74+
75+
// Subset to the first n_unique elements (indices of non-duplicate pairs)
76+
SEXP out = PROTECT(Rf_allocVector(INTSXP, n_unique));
77+
int *restrict pout = INTEGER(out);
78+
79+
for (int i = 0; i < n_unique; ++i) {
80+
pout[i] = ptemp[i];
81+
}
82+
83+
UNPROTECT(2);
84+
85+
return out;
86+
}
87+
88+
689
/*
790
* See Rcpp_functions.cpp for documentation.
891
*/

0 commit comments

Comments
 (0)