|
116 | 116 | } |
117 | 117 |
|
118 | 118 |
|
119 | | -#' @title Convert gene sets to a list of indices for .calcES |
| 119 | +#' @title Convert gene sets to a list of indices for `.calc_ES` |
120 | 120 | #' |
121 | 121 | #' @inheritParams fast_ssgsea |
122 | 122 | #' |
123 | 123 | #' @returns A named list. |
124 | 124 | #' |
125 | 125 | #' @author Tyler Sagendorf |
126 | 126 | #' |
| 127 | +#' @importFrom collapse allv anyv fmatch fsubset funique groupid vec vlengths |
| 128 | +#' vtypes whichNA whichv |
| 129 | +#' |
127 | 130 | #' @noRd |
128 | 131 | .gene_sets_to_indices <- function(stats, |
129 | 132 | gene_sets, |
|
145 | 148 | } |
146 | 149 |
|
147 | 150 | # Pre-filter to remove gene sets that are too small. We can not remove gene |
148 | | - # sets that are too large without first restricting the genes to the |
149 | | - # background. |
| 151 | + # sets that are too large without first restricting genes to names(stats). |
150 | 152 | set_sizes <- vlengths(gene_sets) |
151 | 153 |
|
152 | 154 | keep_sets <- whichv(set_sizes >= min_size, TRUE) |
|
161 | 163 | genes <- vec(gene_sets) |
162 | 164 | unique_genes <- funique(genes) |
163 | 165 |
|
164 | | - # Determine if any elements have an expected direction of change |
165 | | - directional_sets <- 0L != length( |
166 | | - grep(";[ud]{1}", unique_genes, perl = TRUE) |
167 | | - ) |
| 166 | + # Determine if any genes have an expected direction of change |
| 167 | + directional_sets <- anyv(grepl(";[ud]{1}", unique_genes, perl = TRUE), TRUE) |
168 | 168 |
|
169 | 169 | if (directional_sets) { |
170 | 170 | # Determine which genes are expected to be "down" and remove the direction |
|
182 | 182 | } |
183 | 183 |
|
184 | 184 | # names(stats) is first because it was sorted lexicographically to deal with |
185 | | - # ties in the gene-level values |
| 185 | + # ties in stats |
186 | 186 | unique_genes <- intersect(names(stats), unique_genes) |
187 | 187 |
|
188 | 188 | if (length(unique_genes) == 0L) { |
|
200 | 200 |
|
201 | 201 | unique_sets <- names(gene_sets) |
202 | 202 |
|
| 203 | + # Remove genes not in names(stats) |
203 | 204 | if (anyNA(gene_indices)) { |
204 | | - # Remove genes not in names(stats) |
205 | 205 | keep_genes <- whichNA(gene_indices, invert = TRUE) |
206 | 206 |
|
207 | 207 | gene_indices <- fsubset(gene_indices, keep_genes) |
|
251 | 251 | } |
252 | 252 |
|
253 | 253 |
|
254 | | -#' @title Fast, specialized rep.int |
| 254 | +#' @title Fast, specialized `rep.int` |
255 | 255 | #' |
256 | 256 | #' @description Equivalent to `rep.int(seq_along(times), times)`, but several |
257 | 257 | #' times faster when the output is large. |
|
287 | 287 | } |
288 | 288 |
|
289 | 289 |
|
| 290 | +#' @title Remove gene sets that are too small or too large |
| 291 | +#' |
| 292 | +#' @param gene_indices integer vector; indices of the genes in each set, |
| 293 | +#' arranged in contiguous blocks by gene set. |
| 294 | +#' @param extreme_set_indices integer vector; indices of gene sets that are too |
| 295 | +#' small or too large to test. Indices can range from 1 to `length(m)`. |
| 296 | +#' @param m integer vector; the number of genes in each set, where |
| 297 | +#' `length(gene_indices) == sum(m)`. |
| 298 | +#' |
| 299 | +#' @returns The vector `gene_indices` with blocks of genes from the extreme gene |
| 300 | +#' sets removed. |
| 301 | +#' |
| 302 | +#' @details This function is not called when all or no gene sets are extreme. If |
| 303 | +#' all gene sets are extreme, an error will be thrown by `.calc_ES`. |
| 304 | +#' |
| 305 | +#' The runtime of this function decreases as the number of extreme sets |
| 306 | +#' increases. |
| 307 | +#' |
| 308 | +#' @author Tyler Sagendorf |
| 309 | +#' |
| 310 | +#' @noRd |
| 311 | +.C_remove_extreme_gene_sets <- function(gene_indices, |
| 312 | + extreme_set_indices, |
| 313 | + m) { |
| 314 | + .Call( |
| 315 | + "_C_remove_extreme_gene_sets", |
| 316 | + gene_indices, |
| 317 | + extreme_set_indices, |
| 318 | + m |
| 319 | + ) |
| 320 | +} |
| 321 | + |
| 322 | + |
290 | 323 | #' @title Calculate Enrichment Scores |
291 | 324 | #' |
292 | 325 | #' @param y_prime numeric vector of absolute gene-level values raised to the |
293 | 326 | #' power of `alpha` for genes that are members of at least one gene set. |
294 | 327 | #' @param r_prime numeric vector of the ranks of the gene-level values for genes |
295 | 328 | #' that are members of at least one gene set. |
296 | 329 | #' @param sum_ranks numeric; the sum of all ranks. |
297 | | -#' @param i integer vector; indices of the genes in all sets. Used to index |
298 | | -#' vectors `y_prime` and `r_prime`. |
| 330 | +#' @param gene_indices integer vector; indices of the genes in all sets. Used to |
| 331 | +#' index vectors `y_prime` and `r_prime`. |
299 | 332 | #' @param m integer vector; the number of genes in each set. Used to select |
300 | | -#' elements of `i`. |
| 333 | +#' elements of `gene_indices`. |
301 | 334 | #' @param w integer vector; the number of genes that are not in each set. |
302 | 335 | #' @inheritParams fast_ssgsea |
303 | 336 | #' |
|
339 | 372 | #' |
340 | 373 | #' @author Tyler Sagendorf |
341 | 374 | #' |
342 | | -#' @importFrom collapse %!iin% allv anyv fmatch fmax fsubset funique groupid vec |
343 | | -#' vlengths vtypes whichNA whichv |
| 375 | +#' @importFrom collapse fmatch fmax fsubset |
344 | 376 | #' @importFrom data.table frank |
345 | 377 | #' |
346 | 378 | #' @noRd |
|
350 | 382 | gene_sets, |
351 | 383 | min_size = 2L, |
352 | 384 | max_size = Inf) { |
353 | | - max_size <- max(min_size, min(max_size, n_genes - 1L)) |
| 385 | + max_size <- min(max_size, n_genes - 1L) |
354 | 386 |
|
355 | 387 | storage.mode(min_size) <- storage.mode(max_size) <- "integer" |
356 | 388 |
|
|
391 | 423 | } else if (length(extreme_set_indices)) { |
392 | 424 | unique_sets <- fsubset(unique_sets, -extreme_set_indices) |
393 | 425 |
|
394 | | - m <- fsubset(m, -extreme_set_indices) |
395 | | - |
396 | | - gene_indices <- fsubset( |
397 | | - .x = gene_indices, |
398 | | - subset = set_indices %!iin% extreme_set_indices |
| 426 | + gene_indices <- .C_remove_extreme_gene_sets( |
| 427 | + gene_indices = gene_indices, |
| 428 | + extreme_set_indices = extreme_set_indices, |
| 429 | + m = m |
399 | 430 | ) |
400 | 431 |
|
401 | | - if (directional_sets) { |
402 | | - m_d <- fsubset(m_d, -extreme_set_indices) |
| 432 | + m <- fsubset(m, -extreme_set_indices) |
403 | 433 |
|
404 | | - gene_indices_down <- fsubset( |
405 | | - .x = gene_indices_down, |
406 | | - subset = set_indices_down %!iin% extreme_set_indices |
| 434 | + if (directional_sets) { |
| 435 | + gene_indices_down <- .C_remove_extreme_gene_sets( |
| 436 | + gene_indices = gene_indices_down, |
| 437 | + extreme_set_indices = extreme_set_indices, |
| 438 | + m = m_d |
407 | 439 | ) |
| 440 | + |
| 441 | + m_d <- fsubset(m_d, -extreme_set_indices) |
408 | 442 | } |
409 | 443 | } |
410 | 444 |
|
|
439 | 473 |
|
440 | 474 | if (directional_sets) { |
441 | 475 | # Calculate enrichment scores separately for the up-regulated and |
442 | | - # down-regulated genes |
| 476 | + # down-regulated genes. Elements of m and m_d that are < min_size will be |
| 477 | + # replaced with 0. |
443 | 478 | ES_u <- .C_calc_ES( |
444 | 479 | y_prime = y_prime, |
445 | 480 | r_prime = r_prime, |
|
0 commit comments