pnnl
diff --git a/‎NAMESPACE‎
Lines changed: 0 additions & 2 deletions b/‎NAMESPACE‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎R/RcppExports.R‎
Lines changed: 4 additions & 0 deletions b/‎R/RcppExports.R‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/utils.R‎
Lines changed: 65 additions & 23 deletions b/‎R/utils.R‎
Lines changed: 65 additions & 23 deletions
diff --git a/‎man/figures/README-figure-1.png‎
-654 Bytes b/‎man/figures/README-figure-1.png‎
-654 Bytes
diff --git a/‎man/figures/README-figure-2.png‎
-1.03 KB b/‎man/figures/README-figure-2.png‎
-1.03 KB
diff --git a/‎simulation/data/fast-ssGSEA_timing_results.rds‎
-3 Bytes b/‎simulation/data/fast-ssGSEA_timing_results.rds‎
-3 Bytes
diff --git a/‎simulation/figures/figure-1.pdf‎
-8 Bytes b/‎simulation/figures/figure-1.pdf‎
-8 Bytes
diff --git a/‎simulation/figures/figure-2.pdf‎
-14 Bytes b/‎simulation/figures/figure-2.pdf‎
-14 Bytes
diff --git a/‎simulation/scripts/manuscript_figures.Rmd‎
Lines changed: 5 additions & 5 deletions b/‎simulation/scripts/manuscript_figures.Rmd‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎src/C_functions.c‎
Lines changed: 83 additions & 0 deletions b/‎src/C_functions.c‎
Lines changed: 83 additions & 0 deletions
@@ -5,9 +5,7 @@ export(read_gmt)
 exportPattern("^[[:alpha:]]+")
 import(RcppArmadillo)
 import(dqrng)
-importFrom(Matrix,sparseMatrix)
 importFrom(Rcpp,evalCpp)
-importFrom(collapse,alloc)
 importFrom(collapse,allv)
 importFrom(collapse,anyv)
 importFrom(collapse,fsubset)
 
@@ -184,6 +184,10 @@ NULL
 #' @noRd
 NULL
 
+.Cpp_unsafe_sparseMatrix <- function(i, j, dims, dimnames) {
+    .Call(`_fast_ssgsea_unsafe_sparseMatrix`, i, j, dims, dimnames)
+} # pass-through: all four arguments reach the native routine as-is
+
 .Cpp_matmult_sparse <- function(X, Y) {
     .Call(`_fast_ssgsea_matmult_sparse`, X, Y)
 } # pass-through: X and Y reach the native routine as-is
 
@@ -132,6 +132,36 @@
 }
 
 
+#' @title Fast, specialized rep.int
+#'
+#' @param sizes integer vector of non-negative group sizes.
+#'
+#' @returns Integer vector. Equivalent to the result of
+#'   `rep.int(seq_along(sizes), sizes)`, but several times faster.
+#'
+#' @author Tyler Sagendorf
+#'
+#' @noRd
+.C_rep_int <- function(sizes) {
+  .Call("_C_rep_int", sizes)
+}
+
+
+#' @title Get indices of non-duplicate integer pairs
+#'
+#' @param x,y integer vectors of equal length, ordered by `y` and then by `x`
+#'   within each run of `y`, so that any duplicate pairs are contiguous.
+#'
+#' @returns Integer vector of 1-based indices: the first of each run of (x, y).
+#'
+#' @author Tyler Sagendorf
+#'
+#' @noRd
+.C_pairs_not_duplicated <- function(x, y) {
+  .Call("_C_pairs_not_duplicated", x, y)
+}
+
+
 #' @title Create Sparse Incidence Matrices
 #'
 #' @description Create a list of sparse incidence matrices, where the unique
@@ -155,10 +185,9 @@
 #'
 #' @author Tyler Sagendorf
 #'
-#' @importFrom collapse alloc allv anyv fsubset funique groupid vec vlengths
-#'   vtypes whichNA whichv
+#' @importFrom collapse allv anyv fsubset funique groupid vec vlengths vtypes
+#'   whichNA whichv
 #' @importFrom data.table chmatch
-#' @importFrom Matrix sparseMatrix
 #'
 #' @noRd
 .sparseIncidence <- function(gene_sets,
@@ -242,11 +271,10 @@
   # Row indices for sparse matrix
   i <- chmatch(elements, unique_elements, nomatch = NA_integer_)
 
-  # Unique set names
   unique_sets <- names(gene_sets)
 
   # Column indices for sparse matrix
-  j <- rep.int(seq_along(unique_sets), set_sizes)
+  j <- .C_rep_int(set_sizes)
 
   if (anyNA(i)) {
     # Remove elements not in the background
@@ -255,61 +283,75 @@
     i <- fsubset(i, keep)
     j <- fsubset(j, keep)
 
+    if (any_dir) {
+      direction_down <- fsubset(direction_down, keep)
+    }
+
     unique_sets <- fsubset(unique_sets, funique(j))
 
-    j <- groupid(j) # this works because each set is a contiguous batch
+    j <- groupid(j) # this works because each set is a contiguous group
+    class(j) <- "integer"
   }
 
+  # Must sort to remove duplicates easily and to use the unsafe sparseMatrix
+  # function. This is the slowest part of this function.
+  o <- order(j, i, method = "radix")
+  i <- fsubset(i, o)
+
   dim_names <- list(unique_elements, unique_sets)
   dims <- vlengths(dim_names)
 
   # Split elements by direction of change to create two incidence matrices
   if (any_dir) {
+    direction_down <- fsubset(direction_down, o)
     idx_down <- whichv(direction_down, TRUE)
     direction_down <- NULL # signal that this is no longer needed
 
     i_down <- fsubset(i, idx_down)
     j_down <- fsubset(j, idx_down)
 
+    # Indices of unique (i_down, j_down) pairs
+    unique_pairs_idx <- .C_pairs_not_duplicated(i_down, j_down)
+
+    if (length(unique_pairs_idx) != length(i_down)) {
+      i_down <- fsubset(i_down, unique_pairs_idx)
+      j_down <- fsubset(j_down, unique_pairs_idx)
+    }
+
     i <- fsubset(i, -idx_down)
     j <- fsubset(j, -idx_down)
   }
 
+  # Indices of unique (i, j) pairs
+  unique_pairs_idx <- .C_pairs_not_duplicated(i, j)
+
+  if (length(unique_pairs_idx) != length(i)) {
+    i <- fsubset(i, unique_pairs_idx)
+    j <- fsubset(j, unique_pairs_idx)
+  }
+
   # Incidence matrix where a 1 indicates that the element is in the set. If
   # gene_sets is a directional database, then A will only contain elements that
   # are expected to be "up".
-  A <- sparseMatrix(
+  A <- .Cpp_unsafe_sparseMatrix(
     i = i,
     j = j,
-    x = alloc(1, length(i)),
     dims = dims,
-    dimnames = dim_names,
-    check = FALSE,
-    use.last.ij = FALSE
+    dimnames = dim_names
   )
 
-  # In the unlikely event where an element appears multiple times in the same
-  # set, some values of A will be > 1. Replace all values with 1. Could also use
-  # the use.last.ij parameter in sparseMatrix(), but this is faster.
-  attr(A, which = "x") <- alloc(1, length(attr(A, which = "x")))
-
   A_d <- NULL # default
 
   if (any_dir) {
     # Incidence matrix where a 1 indicates that an element is expected to be
     # down in the set
-    A_d <- sparseMatrix(
+    A_d <- .Cpp_unsafe_sparseMatrix(
       i = i_down,
       j = j_down,
-      x = alloc(1, length(i_down)),
       dims = dims,
-      dimnames = dim_names,
-      check = FALSE,
-      use.last.ij = FALSE
+      dimnames = dim_names
     )
 
-    attr(A_d, which = "x") <- alloc(1, length(attr(A_d, which = "x")))
-
     # The Hadamard product A * A_d should be a matrix of zeros, since genes can
     # not be "up" and "down" in the same set.
     if (length(attr(A * A_d, which = "x"))) {
 
@@ -82,7 +82,7 @@ plot_time <- function(x, y_breaks, y_max = NULL, expand_upper = 0.05) {
     ) +
     scale_color_manual(
       name = "Number of gene sets: ",
-      values = palette.colors(n = 3L),
+      values = structure(palette.colors(n = 3L), names = NULL),
       breaks = c("1,000", "10,000", "50,000")
     ) +
     labs(
@@ -142,17 +142,17 @@ time_res <- prepare_data("../data/fast-ssGSEA_timing_results.rds")
 
 # 10,000 permutations
 pa <- filter(time_res, nperm == levels(nperm)[1L]) %>%
-  plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.02) +
+  plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.05) +
   labs(x = "")
 
 # 100,000 permutations
 pb <- filter(time_res, nperm == levels(nperm)[2L]) %>%
-  plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.02) +
+  plot_time(y_breaks = seq(0, 4, 1), y_max = 4, expand_upper = 0.05) +
   labs(y = "")
 
 # 1,000,000 permutations
 pc <- filter(time_res, nperm == levels(nperm)[3L]) %>%
-  plot_time(y_breaks = seq(0, 20, 4), y_max = 20, expand_upper = 5e-3) +
+  plot_time(y_breaks = seq(0, 20, 4), y_max = 20, expand_upper = 0.05) +
   labs(x = "", y = "")
 
 # Combine plots
@@ -187,7 +187,7 @@ time_res <- prepare_data("../data/FGSEA_timing_results_nproc_1.rds")
 
 # 10,000 permutations
 pa <- filter(time_res, nperm == levels(nperm)[1L]) %>%
-  plot_time(y_breaks = seq(0, 40, 10), y_max = 40, expand_upper = 5e-3) +
+  plot_time(y_breaks = seq(0, 40, 10), y_max = 40, expand_upper = 0.05) +
   labs(x = "")
 
 # 100,000 permutations
 
@@ -3,6 +3,89 @@
 #include <Rinternals.h>
 
 
+/*
+ * Expand group sizes into 1-based group ids: the k-th entry of `times`
+ * contributes `times[k]` copies of k to the result, in order.
+ *
+ * times: integer vector of non-negative, non-NA counts.
+ *
+ * Returns a newly allocated integer vector whose length is the sum of
+ * `times`. Signals an R error if any count is negative or NA, or if the
+ * total length cannot be represented.
+ */
+SEXP _C_rep_int(SEXP times) {
+  const int *restrict ptimes = INTEGER(times);
+
+  const int n_times = Rf_length(times);
+
+  // Total output length. Accumulated as R_xlen_t so that summing many group
+  // sizes cannot overflow a 32-bit int.
+  R_xlen_t n = 0;
+
+  for (int i = 0; i < n_times; ++i) {
+    // NA_INTEGER is INT_MIN, so this test also rejects missing values.
+    if (ptimes[i] < 0) {
+      Rf_error("group sizes must be non-negative and not NA");
+    }
+
+    // n is never negative here, so the subtraction cannot overflow.
+    if (ptimes[i] > R_XLEN_T_MAX - n) {
+      Rf_error("result would exceed the maximum vector length");
+    }
+
+    n += ptimes[i];
+  }
+
+  SEXP out = PROTECT(Rf_allocVector(INTSXP, n));
+  int *restrict pout = INTEGER(out);
+
+  // Next position of `out` to be written
+  R_xlen_t pos = 0;
+
+  for (int i = 0; i < n_times; ++i) {
+    const int val = i + 1; // group ids are 1-based
+    const R_xlen_t end = pos + ptimes[i];
+
+    for (; pos < end; ++pos) {
+      pout[pos] = val;
+    }
+  }
+
+  UNPROTECT(1);
+
+  return out;
+}
+
+
+/*
+ * Return the 1-based positions of the (x, y) pairs that differ from the pair
+ * immediately before them. The first pair is always reported.
+ *
+ * x, y: integer vectors of equal length. Only adjacent pairs are compared, so
+ *       duplicates are detected only when they are contiguous.
+ *
+ * Returns a newly allocated integer vector of positions; it has length 0 when
+ * the input is empty. Signals an R error if the lengths of x and y differ.
+ */
+SEXP _C_pairs_not_duplicated(SEXP x, SEXP y) {
+  const int N = Rf_length(x);
+
+  // Both vectors are indexed up to N below, so a shorter y would be read out
+  // of bounds.
+  if (Rf_length(y) != N) {
+    Rf_error("'x' and 'y' must have the same length");
+  }
+
+  // Empty input: return integer(0) so the result is always an integer vector
+  if (N == 0) {
+    return Rf_allocVector(INTSXP, 0);
+  }
+
+  const int *restrict px = INTEGER(x);
+  const int *restrict py = INTEGER(y);
+
+  // At most N pairs can be reported, so N slots are always enough
+  SEXP temp = PROTECT(Rf_allocVector(INTSXP, N));
+  int *restrict ptemp = INTEGER(temp);
+
+  ptemp[0] = 1; // the first pair is not a duplicate, by definition
+
+  int n_unique = 1;
+
+  for (int i = 1; i < N; ++i) {
+    if (px[i] != px[i - 1] || py[i] != py[i - 1]) { // not duplicated
+      ptemp[n_unique] = i + 1; // convert to a 1-based position
+      ++n_unique;
+    }
+  }
+
+  // Nothing was dropped: the scratch vector is already the answer
+  if (n_unique == N) {
+    UNPROTECT(1);
+
+    return temp;
+  }
+
+  // Copy the first n_unique entries into a vector of the exact length
+  SEXP out = PROTECT(Rf_allocVector(INTSXP, n_unique));
+  int *restrict pout = INTEGER(out);
+
+  for (int i = 0; i < n_unique; ++i) {
+    pout[i] = ptemp[i];
+  }
+
+  UNPROTECT(2);
+
+  return out;
+}
+
+
 /*
  * See Rcpp_functions.cpp for documentation.
  */