-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathstatistic-zscore.R
152 lines (139 loc) · 4.9 KB
/
statistic-zscore.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#' z-score
#'
#' @description
#' Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI.
#'
#' @details
#' The z-score calculates the mean of the molecular features of the known targets
#' for each regulator and adjusts it for the number of identified targets for the
#' regulator, the standard deviation of all molecular features (RoKAI), as well as
#' the mean of all moleculare features (KSEA).
#'
#' @inheritParams .decoupler_mat_format
#' @inheritParams .decoupler_network_format
#' @param sparse Deprecated parameter.
#' @param center Logical value indicating if `mat` must be centered by
#' [base::rowMeans()].
#' @param na.rm Should missing values (including NaN) be omitted from the
#' calculations of [base::rowMeans()]?
#' @param minsize Integer indicating the minimum number of targets per source.
#' @param flavor Whether the calculation should be based on RoKAI (default) or
#' KSEA.
#'
#' @return A long format tibble of the enrichment scores for each source
#' across the samples. Resulting tibble contains the following columns:
#' 1. `statistic`: Indicates which method is associated with which score.
#' 2. `source`: Source nodes of `network`.
#' 3. `condition`: Condition representing each column of `mat`.
#' 4. `score`: Regulatory activity (enrichment score).
#' @family decoupleR statistics
#' @export
#'
#' @importFrom stats coef lm summary.lm
#' @importFrom magrittr %<>% %>%
#' @importFrom dplyr ungroup
#' @examples
#' inputs_dir <- system.file("testdata", "inputs", package = "decoupleR")
#'
#' mat <- readRDS(file.path(inputs_dir, "mat.rds"))
#' net <- readRDS(file.path(inputs_dir, "net.rds"))
#'
#' run_zscore(mat, net, minsize=0)
run_zscore <- function(mat,
network,
.source = source,
.target = target,
.mor = mor,
.likelihood = likelihood,
sparse = FALSE,
center = FALSE,
na.rm = FALSE,
minsize = 5L,
flavor = "RoKAI"
) {
# NSE vs. R CMD check workaround
condition <- likelihood <- mor <- p_value <- score <-
source <- statistic <- target <- NULL
# Check for NAs/Infs in mat
mat %<>% check_nas_infs
network %>%
# Convert to standard tibble: source-target-mor.
rename_net(
{{ .source }},
{{ .target }},
{{ .mor }},
{{ .likelihood }}
) %>%
filt_minsize(rownames(mat), ., minsize) %>%
# Preprocessing -------------------------------------------------------
.fit_preprocessing(mat, center, na.rm, sparse) %>%
# Model evaluation ----------------------------------------------------
{.zscore_analysis(.$mat, .$mor_mat, flavor)} %>%
ungroup()
}
#' Wrapper to execute run_zscore() logic on preprocessed data
#'
#' Calculate a z-score from the molecular features of its targets and
#' normalise it by the background molecular features.
#'
#' @inheritParams run_zscore
#' @param mor_mat
#'
#' @inherit run_zscore return
#' @keywords intern
#' @importFrom stats pnorm
#' @importFrom dplyr filter rename mutate group_by summarise arrange
#' @importFrom tibble rownames_to_column tibble
#' @importFrom tidyr pivot_longer drop_na
#' @importFrom purrr map_dfr
#' @importFrom magrittr %<>% %>%
#' @noRd
.zscore_analysis <- function(mat, mor_mat, flavor) {
net <- mor_mat %>%
as.data.frame() %>%
rownames_to_column("target") %>%
pivot_longer(cols = -target, values_to = "mor", names_to = "source") %>%
filter(mor != 0)
scores <- purrr::map_dfr(seq_len(ncol(mat)), function(exp){
# Convert column of mat to data frame and drop NA rows
mat_c <- mat[, exp, drop = FALSE] %>%
data.frame() %>%
drop_na() %>%
rownames_to_column("target")
# Perform inner join and filter NA rows
KSdata <- full_join(net, mat_c, by = "target") %>%
drop_na() %>%
rename("stat" = colnames(.)[4])
# Calculate value based on mor and stat
KSdata <- KSdata %>%
mutate(value = mor * stat)
# Aggregate mean values for each source
Mean.FC <- KSdata %>%
group_by(source) %>%
summarise(mS = mean(value), m = n()) %>%
arrange(source)
# Calculate Enrichment and z-score
mean_value <- mean(mat_c[, 2], na.rm = TRUE)
if (flavor == "RoKAI") {
mean_value <- 0
}
Mean.FC <- Mean.FC %>%
mutate(
Enrichment = mS / abs(mean(mat_c[,2], na.rm = TRUE)),
z.score = ((mS - mean_value) * sqrt(m)) / sd(mat_c[, 2], na.rm = TRUE),
p.value = pnorm(-abs(z.score))
)
# Prepare and return result as tibble
tibble(
statistic = "z_score",
source = Mean.FC$source,
condition = colnames(mat_c)[2],
score = Mean.FC$z.score,
p_value = Mean.FC$p.value
)
})
# Arrange scores according to the order of sources in mor_mat
scores <- scores %>%
arrange(match(source, colnames(mor_mat)))
return(scores)
}