Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
Package: fgsea
Title: Fast Gene Set Enrichment Analysis
Version: 1.37.0
Version: 1.37.1
Authors@R: c(person("Gennady", "Korotkevich", role = "aut"),
person("Vladimir", "Sukhov", role = "aut"),
person("Nikita", "Golikov", role = "aut"),
person("Nikolay", "Budin", role = "ctb"),
person("Nikita", "Gusak", role = "ctb"),
person("Zieman", "Mark", role = "ctb"),
Expand Down Expand Up @@ -42,7 +43,7 @@ Suggests:
License: MIT + file LICENCE
LazyData: true
LinkingTo: Rcpp, BH
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Encoding: UTF-8
VignetteBuilder: knitr
URL: https://github.com/ctlab/fgsea/
Expand Down
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
CHANGES IN VERSION 1.37.1
* Hash-based resolution of tied gene sets in the multilevel algorithm (by Nikita Golikov), properly fixes #151
* P-avlues for ES close to 1 are more accurate now
* Internally, the gene weights are now converted to integers (with scaling), which might introduce minor result discrepancies in rare cases

CHANGES IN VERSION 1.35.7
* fix handling of zero stat values with gseaParam=0 in plotEnrichment

Expand Down
30 changes: 28 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@ calcGseaStatCumulativeBatch <- function(stats, gseaParam, pathwayScores, pathway
.Call('_fgsea_calcGseaStatCumulativeBatch', PACKAGE = 'fgsea', stats, gseaParam, pathwayScores, pathwaysSizes, iterations, seed, scoreType)
}

#' Calculates GSEA statistic values for all the prefixes of a gene set
#'
#' Takes \emph{O(k^\{3/2\})} time, where \emph{k} is a size of `selectedSize`.
#' @param stats Named numeric vector with gene-level statistics
#' sorted in decreasing order (order is not checked)
#' @param selectedStats indexes of selected genes in a `stats` array
#' @param gseaParam GSEA weight parameter (0 is unweighted, suggested value is 1)
#' @return Numeric vector of GSEA statistics for all prefixes of selectedStats.
#' @export
#' @examples
#' data(exampleRanks)
#' data(examplePathways)
#' ranks <- sort(exampleRanks, decreasing=TRUE)
#' es <- calcGseaStatCumulative(ranks, na.omit(match(examplePathways[[1]], names(ranks))), 1)
calcGseaStatCumulative <- function(stats, selectedStats, gseaParam, scoreType = "std") {
.Call('_fgsea_calcGseaStatCumulative', PACKAGE = 'fgsea', stats, selectedStats, gseaParam, scoreType)
}
Expand All @@ -21,8 +35,20 @@ calcGseaStatBatchCpp <- function(stats, selectedGenes, geneRanks) {
.Call('_fgsea_calcGseaStatBatchCpp', PACKAGE = 'fgsea', stats, selectedGenes, geneRanks)
}

fgseaMultilevelCpp <- function(enrichmentScores, ranks, pathwaySize, sampleSize, seed, eps, sign) {
.Call('_fgsea_fgseaMultilevelCpp', PACKAGE = 'fgsea', enrichmentScores, ranks, pathwaySize, sampleSize, seed, eps, sign)
#' Calculates low GSEA p-values for a given gene set size using the multilevel split Monte Carlo approach.
#'
#' @param enrichmentScores A vector of enrichment scores, for which p-values should be calculated
#' @param ranks An integer vector with the gene-level statistics
#' @param pathwaySize A scalar with the size of the gene set
#' @param seed Random seed
#' @param eps P-values below eps aren't calculated
#' @param sign Controls whether ES^+ or ES score is used
#' @param moveScale Controls the number of MCMC iterations on each level
#' @param logStatus Controls whether debug output should be shown
#' @return table with p-values and estimation errors
#' @keyword internal
fgseaMultilevelCpp <- function(enrichmentScores, ranks, pathwaySize, sampleSize, seed, eps, sign, moveScale = 1.0, logStatus = FALSE) {
.Call('_fgsea_fgseaMultilevelCpp', PACKAGE = 'fgsea', enrichmentScores, ranks, pathwaySize, sampleSize, seed, eps, sign, moveScale, logStatus)
}

gesecaCpp <- function(E, inpScores, genesetSize, sampleSize, seed, eps) {
Expand Down
19 changes: 17 additions & 2 deletions R/fgsea.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ fgsea <- function(pathways, stats, minSize = 1, maxSize = length(stats)-1, gseaP
res
}


preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam, scoreType){
prepareStats <- function(stats, scoreType="std", gseaParam=1) {
# Error if stats are non-finite
if (any(!is.finite(stats))){
stop("Not all stats values are finite numbers")
Expand All @@ -61,6 +60,22 @@ preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam
stats <- sort(stats, decreasing=TRUE)
stats <- abs(stats) ^ gseaParam

# scaling weights to integer values for more stable calculations
# sum(stats) < 2**30 avoids integer overflows
scaleCoeff <- 2**30/sum(stats)
if (scaleCoeff >= 1) {
# scaling by an integer coefficient avoids information loss for integer input stats
scaleCoeff <- floor(scaleCoeff)
}

# TODO: special treatment for small non-zero values?
stats <- round(stats * scaleCoeff)
storage.mode(stats) <- "integer"
return(stats)
}

preparePathwaysAndStats <- function(pathways, stats, minSize, maxSize, gseaParam, scoreType){
stats <- prepareStats(stats, scoreType, gseaParam)
res <- preparePathways(pathways, universe=names(stats), minSize, maxSize)

res$stats <- stats
Expand Down
5 changes: 4 additions & 1 deletion man/plotEnrichmentData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,21 @@ BEGIN_RCPP
END_RCPP
}
// fgseaMultilevelCpp
DataFrame fgseaMultilevelCpp(const NumericVector& enrichmentScores, const NumericVector& ranks, int pathwaySize, int sampleSize, int seed, double eps, bool sign);
RcppExport SEXP _fgsea_fgseaMultilevelCpp(SEXP enrichmentScoresSEXP, SEXP ranksSEXP, SEXP pathwaySizeSEXP, SEXP sampleSizeSEXP, SEXP seedSEXP, SEXP epsSEXP, SEXP signSEXP) {
DataFrame fgseaMultilevelCpp(const NumericVector& enrichmentScores, const SEXP& ranks, int pathwaySize, int sampleSize, int seed, double eps, bool sign, double moveScale, bool logStatus);
RcppExport SEXP _fgsea_fgseaMultilevelCpp(SEXP enrichmentScoresSEXP, SEXP ranksSEXP, SEXP pathwaySizeSEXP, SEXP sampleSizeSEXP, SEXP seedSEXP, SEXP epsSEXP, SEXP signSEXP, SEXP moveScaleSEXP, SEXP logStatusSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const NumericVector& >::type enrichmentScores(enrichmentScoresSEXP);
Rcpp::traits::input_parameter< const NumericVector& >::type ranks(ranksSEXP);
Rcpp::traits::input_parameter< const SEXP& >::type ranks(ranksSEXP);
Rcpp::traits::input_parameter< int >::type pathwaySize(pathwaySizeSEXP);
Rcpp::traits::input_parameter< int >::type sampleSize(sampleSizeSEXP);
Rcpp::traits::input_parameter< int >::type seed(seedSEXP);
Rcpp::traits::input_parameter< double >::type eps(epsSEXP);
Rcpp::traits::input_parameter< bool >::type sign(signSEXP);
rcpp_result_gen = Rcpp::wrap(fgseaMultilevelCpp(enrichmentScores, ranks, pathwaySize, sampleSize, seed, eps, sign));
Rcpp::traits::input_parameter< double >::type moveScale(moveScaleSEXP);
Rcpp::traits::input_parameter< bool >::type logStatus(logStatusSEXP);
rcpp_result_gen = Rcpp::wrap(fgseaMultilevelCpp(enrichmentScores, ranks, pathwaySize, sampleSize, seed, eps, sign, moveScale, logStatus));
return rcpp_result_gen;
END_RCPP
}
Expand All @@ -92,7 +94,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_fgsea_calcGseaStatCumulativeBatch", (DL_FUNC) &_fgsea_calcGseaStatCumulativeBatch, 7},
{"_fgsea_calcGseaStatCumulative", (DL_FUNC) &_fgsea_calcGseaStatCumulative, 4},
{"_fgsea_calcGseaStatBatchCpp", (DL_FUNC) &_fgsea_calcGseaStatBatchCpp, 3},
{"_fgsea_fgseaMultilevelCpp", (DL_FUNC) &_fgsea_fgseaMultilevelCpp, 7},
{"_fgsea_fgseaMultilevelCpp", (DL_FUNC) &_fgsea_fgseaMultilevelCpp, 9},
{"_fgsea_gesecaCpp", (DL_FUNC) &_fgsea_gesecaCpp, 6},
{NULL, NULL, 0}
};
Expand Down
38 changes: 17 additions & 21 deletions src/esCalculation.cpp
Original file line number Diff line number Diff line change
@@ -1,59 +1,55 @@
#include "esCalculation.h"
#include <algorithm>


double calcES(const vector<double> &ranks, const vector<int> &p, double NS) {
score_t calcES(const vector<int64_t> &ranks, const vector<int> &p, int64_t NS) {
// p must be sorted
int n = (int) ranks.size();
int k = (int) p.size();
double res = 0.0;
double cur = 0.0;
double q1 = 1.0 / (n - k);
double q2 = 1.0 / NS;
score_t res{NS, 0, n - k, 0};
score_t cur{NS, 0, n - k, 0};
int last = -1;
for (int pos : p) {
cur -= q1 * (pos - last - 1);
if (abs(cur) > abs(res)) {
cur.coef_const += pos - last - 1;
if (res.abs() < cur.abs()) {
res = cur;
}
cur += q2 * ranks[pos];
if (abs(cur) > abs(res)) {
cur.coef_NS += ranks[pos];
if (res.abs() < cur.abs()) {
res = cur;
}
last = pos;
}
return res;
}

double calcES(const vector<double> &ranks, const vector<int> &p) {
score_t calcES(const vector<int64_t> &ranks, const vector<int> &p) {
// p must be sorted
double NS = 0.0;
int64_t NS = 0;
for (int pos : p) {
NS += ranks[pos];
}
return calcES(ranks, p, NS);
}

double calcPositiveES(const vector<double> &ranks, const vector<int> &p, double NS) {
score_t calcPositiveES(const vector<int64_t> &ranks, const vector<int> &p, int64_t NS) {
// p must be sorted
int n = (int) ranks.size();
int k = (int) p.size();
double res = 0.0;
double cur = 0.0;
double q1 = 1.0 / (n - k);
double q2 = 1.0 / NS;
score_t res{NS, 0, n - k, 0};
score_t cur{NS, 0, n - k, 0};
int last = -1;
for (int pos : p) {
cur += q2 * ranks[pos] - q1 * (pos - last - 1);
cur.coef_NS += ranks[pos];
cur.coef_const += pos - last - 1;
res = max(res, cur);
last = pos;
}
return res;
}

double calcPositiveES(const vector<double> &ranks, const vector<int> &p) {
score_t calcPositiveES(const vector<int64_t> &ranks, const vector<int> &p) {
// p must be sorted
double NS = 0.0;
int64_t NS = 0;
for (int pos : p) {
NS += ranks[pos];
}
Expand All @@ -76,4 +72,4 @@ bool compareStat(const vector<double> &ranks, const vector<int> &p, double NS, d
last = pos;
}
return false;
}
}
87 changes: 83 additions & 4 deletions src/esCalculation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,95 @@

#include <vector>
#include <cmath>
#include <cstdint>

using namespace std;

double calcES(const vector<double> &ranks, const vector<int> &p, double NS);
#include <boost/config.hpp>

double calcES(const vector<double> &ranks, const vector<int> &p);
#if defined(BOOST_HAS_INT128)

double calcPositiveES(const vector<double> &ranks, const vector<int> &p, double NS);
using int128 = __int128;
using uint128 = unsigned __int128;

double calcPositiveES(const vector<double> &ranks, const vector<int> &p);
#else

#include <boost/multiprecision/cpp_int.hpp>
using boost::multiprecision::int128_t;
using boost::multiprecision::uint128_t;

using int128 = int128_t;
using uint128 = uint128_t;

#endif

struct score_t {
// score = coef_ns / NS - coef_const / diff
int64_t NS;
int64_t coef_NS;
int64_t diff; // n - k
int64_t coef_const;

double getDouble() const {
return 1.0 * coef_NS / NS - 1.0 * coef_const / diff;
}

int64_t getNumerator() const {
return coef_NS * diff - coef_const * NS;
}

int128 Compare(score_t const& other) const {
int64_t P1 = coef_NS * diff + NS * (other.coef_const - coef_const);
int64_t Q1 = NS * diff;
int64_t P2 = other.coef_NS;
int64_t Q2 = other.NS;
return int128(P1) * Q2 - int128(P2) * Q1;
}

bool operator<(score_t const& other) const {
return Compare(other) < 0;
}

bool operator<=(score_t const& other) const {
return Compare(other) <= 0;
}

bool operator>(score_t const& other) const {
return Compare(other) > 0;
}

bool operator>=(score_t const& other) const {
return Compare(other) >= 0;
}

bool operator==(score_t const& other) const {
return Compare(other) == 0;
}

bool operator!=(score_t const& other) const {
return Compare(other) != 0;
}

static int64_t getMaxNS() {
return int64_t(1 << 30);
}

score_t operator-() const {
return score_t{NS, -coef_NS, diff, -coef_const};
}

score_t abs() const {
return std::max(*this, -(*this));
}
};

score_t calcES(const vector<int64_t> &ranks, const vector<int> &p, int64_t NS);

score_t calcES(const vector<int64_t> &ranks, const vector<int> &p);

score_t calcPositiveES(const vector<int64_t> &ranks, const vector<int> &p, int64_t NS);

score_t calcPositiveES(const vector<int64_t> &ranks, const vector<int> &p);

bool compareStat(const vector<double> &ranks, const vector<int> &p, double NS, double bound);

Expand Down
3 changes: 3 additions & 0 deletions src/fastGSEA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using namespace std;

const double eps = 1e-13;

// not actually a segment tree, but a square root heuristic
template <class T>
class SegmentTree {
private:
Expand Down Expand Up @@ -43,6 +44,7 @@ class SegmentTree {
b = vector<T>(k2, 0);
}

// works in O(sqrt(n))
void inc(int p, T delta) { // increase value at position p
int blockEnd = p - (p & blockMask) + blockMask + 1;
for (; p < blockEnd; ++p) {
Expand All @@ -54,6 +56,7 @@ class SegmentTree {
}
}

// works in O(1)
T queryR(int r) { // sum on interval [0, r)
if (r == 0) {
return 0;
Expand Down
2 changes: 1 addition & 1 deletion src/fastGSEA.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#pragma once
#include <Rcpp.h>
using namespace Rcpp;

Expand Down Expand Up @@ -28,7 +29,6 @@ List calcGseaStatCumulativeBatch(
//' data(examplePathways)
//' ranks <- sort(exampleRanks, decreasing=TRUE)
//' es <- calcGseaStatCumulative(ranks, na.omit(match(examplePathways[[1]], names(ranks))), 1)
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
NumericVector calcGseaStatCumulative(
NumericVector const& stats,
Expand Down
Loading
Loading