-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathOverRepresentationAnalysis.R
308 lines (270 loc) · 15.5 KB
/
OverRepresentationAnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#'## ---------------------------
##
## Script name: Over representation Analysis (ORA)
##
## Purpose of script: Run ORA on MetaProViz metabolite clusters from MCA or diffeential results from DMA
##
## Author: Christina Schmidt
##
## Date Created: 2023-07-03
##
## Copyright (c)
## Email:
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------
#################################
### ### ### ClusterORA ### ### ###
#################################
#' ClusterORA
#'
#' Uses enricher to run ORA on each of the metabolite cluster from any of the MCA functions using a pathway list
#'
#' @param InputData DF with metabolite names/metabolite IDs as row names. Metabolite names/IDs need to match the identifier type (e.g. HMDB IDs) in the PathwayFile.
#' @param SettingsInfo \emph{Optional: } Pass ColumnName of the column including the cluster names that ORA should be performed on (=ClusterColumn). BackgroundColumn passes the column name needed if RemoveBackground=TRUE. Also pass ColumnName for PathwayFile including term and feature names. (ClusterColumn= ColumnName InputData, BackgroundColumn = ColumnName InputData, PathwayTerm= ColumnName PathwayFile, PathwayFeature= ColumnName PathwayFile) \strong{c(FeatureName="Metabolite", ClusterColumn="RG2_Significant", BackgroundColumn="BG_Method", PathwayTerm= "term", PathwayFeature= "Metabolite")}
#' @param RemoveBackground \emph{Optional: } If TRUE, column BackgroundColumn name needs to be in SettingsInfo, which includes TRUE/FALSE for each metabolite to fall into background based on the chosen Background method for e.g. MCA_2Cond are removed from the universe. \strong{default: TRUE}
#' @param PathwayFile DF that must include column "term" with the pathway name, column "Feature" with the Metabolite name or ID and column "Description" with pathway description.
#' @param PathwayName \emph{Optional: } Name of the pathway list used \strong{default: ""}
#' @param minGSSize \emph{Optional: } minimum group size in ORA \strong{default: 10}
#' @param maxGSSize \emph{Optional: } maximum group size in ORA \strong{default: 1000}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt" \strong{default: "csv"}
#' @param FolderPath \emph{Optional:} Path to the folder the results should be saved at. \strong{default: NULL}
#'
#' @return Saves results as individual .csv files.
#'
#' @importFrom logger log_info
#' @importFrom dplyr rename select
#' @export
ClusterORA <- function(InputData,
SettingsInfo=c(ClusterColumn="RG2_Significant", BackgroundColumn="BG_Method", PathwayTerm= "term", PathwayFeature= "Metabolite"),
RemoveBackground=TRUE,
PathwayFile,
PathwayName="",
minGSSize=10,
maxGSSize=1000 ,
SaveAs_Table= "csv",
FolderPath = NULL){
## ------------ Create log file ----------- ##
MetaProViz_Init()
## ------------ Check Input files ----------- ##
Pathways <- CheckInput_ORA(InputData=InputData,
SettingsInfo=SettingsInfo,
RemoveBackground=RemoveBackground,
PathwayFile=PathwayFile,
PathwayName=PathwayName,
minGSSize=minGSSize,
maxGSSize=maxGSSize,
SaveAs_Table=SaveAs_Table,
pCutoff=NULL,
PercentageCutoff=NULL)
## ------------ Create Results output folder ----------- ##
if(is.null(SaveAs_Table)==FALSE){
Folder <- SavePath(FolderName= "ClusterORA",
FolderPath=FolderPath)
}
############################################################################################################
## ------------ Prepare Data ----------- ##
# open the data
if(RemoveBackground==TRUE){
df <- subset(InputData, !InputData[[SettingsInfo[["BackgroundColumn"]]]] == "FALSE")%>%
rownames_to_column("Metabolite")
} else{
df <- InputData%>%
rownames_to_column("Metabolite")
}
#Select universe
allMetabolites <- as.character(df$Metabolite)
#Select clusters
grps_labels <- unlist(unique(df[[SettingsInfo[["ClusterColumn"]]]]))
#Load Pathways
Pathway <- Pathways
Term2gene <- Pathway[,c("term", "gene")]# term and MetaboliteID (MetaboliteID= gene as syntax required for enricher)
term2name <- Pathway[,c("term", "Description")]# term and description
#Add the number of genes present in each pathway
Pathway$Count <- 1
Pathway_Mean <- aggregate(Pathway$Count, by=list(term=Pathway$term), FUN=sum)
names(Pathway_Mean)[names(Pathway_Mean) == "x"] <- "Metabolites_in_Pathway"
Pathway <- merge(x= Pathway, y=Pathway_Mean,by="term", all.x=TRUE)
Pathway$Count <- NULL
## ------------ Run ----------- ##
df_list = list()# Make an empty list to store the created DFs
clusterGo_list = list()
#Run ORA
for(g in grps_labels){
grpMetabolites <- subset(df, df[[SettingsInfo[["ClusterColumn"]]]] == g)
log_info("Number of metabolites in cluster `", g, "`: ", nrow(grpMetabolites), sep="")
clusterGo <- clusterProfiler::enricher(gene=as.character(grpMetabolites$Metabolite),
pvalueCutoff = 1,
pAdjustMethod = "BH",
universe = allMetabolites,
minGSSize=minGSSize,
maxGSSize=maxGSSize,
qvalueCutoff = 1,
TERM2GENE=Term2gene ,
TERM2NAME = term2name)
clusterGoSummary <- data.frame(clusterGo)
clusterGo_list[[g]]<- clusterGo
if(!(dim(clusterGoSummary)[1] == 0)){
#Add pathway information (% of genes in pathway detected)
clusterGoSummary <- merge(x= clusterGoSummary%>% select(-Description), y=Pathway%>% select(term, Metabolites_in_Pathway), by.x="ID",by.y="term", all=TRUE)
clusterGoSummary$Count[is.na(clusterGoSummary$Count)] <- 0
clusterGoSummary$Percentage_of_Pathway_detected <-round(((clusterGoSummary$Count/clusterGoSummary$Metabolites_in_Pathway)*100),digits=2)
clusterGoSummary <- clusterGoSummary[!duplicated(clusterGoSummary$ID),]
clusterGoSummary <- clusterGoSummary[order(clusterGoSummary$p.adjust),]
clusterGoSummary <- clusterGoSummary%>%
dplyr::rename("Metabolites_in_pathway"="geneID")
g_save <- gsub("/", "-", g)
df_list[[g_save]] <- clusterGoSummary
}else{
log_info("None of the Input_data Metabolites of the cluster ", g ," were present in any terms of the PathwayFile. Hence the ClusterGoSummary ouput will be empty for this cluster. Please check that the metabolite IDs match the pathway IDs.")
}
}
#Save files
suppressMessages(suppressWarnings(
SaveRes(InputList_DF=df_list,
InputList_Plot= NULL,
SaveAs_Table=SaveAs_Table,
SaveAs_Plot=NULL,
FolderPath= Folder,
FileName= paste("ClusterGoSummary",PathwayName, sep="_"),
CoRe=FALSE,
PrintPlot=FALSE)))
#return <- clusterGoSummary
ORA_Output <- list("DF"= df_list, "ClusterGo"=clusterGo_list)
invisible(return(ORA_Output))
}
###################################
### ### ### StandardORA ### ### ###
###################################
#' Uses enricher to run ORA on the differential metabolites (DM) using a pathway list
#'
#' @param InputData DF with metabolite names/metabolite IDs as row names. Metabolite names/IDs need to match the identifier type (e.g. HMDB IDs) in the PathwayFile.
#' @param SettingsInfo \emph{Optional: } Pass ColumnName of the column including parameters to use for pCutoff and PercentageCutoff. Also pass ColumnName for PathwayFile including term and feature names. (pvalColumn = ColumnName InputData, PercentageColumn= ColumnName InputData, PathwayTerm= ColumnName PathwayFile, PathwayFeature= ColumnName PathwayFile) \strong{c(pvalColumn="p.adj", PercentageColumn="t.val", PathwayTerm= "term", PathwayFeature= "Metabolite")}
#' @param pCutoff \emph{Optional: } p-adjusted value cutoff from ORA results. Must be a numeric value. \strong{default: 0.05}
#' @param PercentageCutoff \emph{Optional: } Percentage cutoff of metabolites that should be considered for ORA. Selects Top/Bottom % of selected PercentageColumn, usually t.val or Log2FC \strong{default: 10}
#' @param PathwayFile DF that must include column "term" with the pathway name, column "Metabolite" with the Metabolite name or ID and column "Description" with pathway description that will be depicted on the plots.
#' @param PathwayName \emph{Optional: } Name of the PathwayFile used \strong{default: ""}
#' @param minGSSize \emph{Optional: } minimum group size in ORA \strong{default: 10}
#' @param maxGSSize \emph{Optional: } maximum group size in ORA \strong{default: 1000}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt" \strong{default: "csv"}
#' @param FolderPath \emph{Optional:} Path to the folder the results should be saved at. \strong{default: NULL}
#'
#' @return Saves results as individual .csv files.
#'
#' @export
#'
StandardORA <- function(InputData,
SettingsInfo=c(pvalColumn="p.adj", PercentageColumn="t.val", PathwayTerm= "term", PathwayFeature= "Metabolite"),
pCutoff=0.05,
PercentageCutoff=10,
PathwayFile,
PathwayName="",
minGSSize=10,
maxGSSize=1000 ,
SaveAs_Table="csv",
FolderPath = NULL
){
## ------------ Create log file ----------- ##
MetaProViz_Init()
## ------------ Check Input files ----------- ##
Pathways <- CheckInput_ORA(InputData=InputData,
SettingsInfo=SettingsInfo,
RemoveBackground=FALSE,
PathwayFile=PathwayFile,
PathwayName=PathwayName,
minGSSize=minGSSize,
maxGSSize=maxGSSize,
SaveAs_Table=SaveAs_Table,
pCutoff=pCutoff,
PercentageCutoff=PercentageCutoff)
## ------------ Create Results output folder ----------- ##
if(is.null(SaveAs_Table)==FALSE){
Folder <- SavePath(FolderName= "ORA",
FolderPath=FolderPath)
}
############################################################################################################
## ------------ Load the data and check ----------- ##
InputData<- InputData %>%
rownames_to_column("Metabolite")
#Select universe
allMetabolites <- as.character(InputData$Metabolite)
#select top changed metabolites (Up and down together)
#check if the metabolites are significantly changed.
value <- PercentageCutoff/100
allMetabolites_DF <- InputData[order(InputData[[SettingsInfo[["PercentageColumn"]]]]),]# rank by t.val
selectMetabolites_DF <- allMetabolites_DF[c(1:(ceiling(value * nrow(allMetabolites_DF))),(nrow(allMetabolites_DF)-(ceiling(value * nrow(allMetabolites_DF)))):(nrow(allMetabolites_DF))),]
selectMetabolites_DF$`Top/Bottom`<- "TRUE"
selectMetabolites_DF <-merge(allMetabolites_DF,selectMetabolites_DF[,c("Metabolite", "Top/Bottom")], by="Metabolite", all.x=TRUE)
InputSelection <- selectMetabolites_DF%>%
mutate(`Top/Bottom_Percentage` = case_when(`Top/Bottom`==TRUE ~ 'TRUE',
TRUE ~ 'FALSE'))%>%
mutate(Significant = case_when(get(SettingsInfo[["pvalColumn"]]) <= pCutoff ~ 'TRUE',
TRUE ~ 'FALSE'))%>%
mutate(Cluster_ChangedMetabolites = case_when(Significant==TRUE & `Top/Bottom_Percentage`==TRUE ~ 'TRUE',
TRUE ~ 'FALSE'))
InputSelection$`Top/Bottom` <- NULL #remove column as its not needed for output
selectMetabolites <- InputSelection%>%
subset(Cluster_ChangedMetabolites==TRUE)
selectMetabolites <-as.character(selectMetabolites$Metabolite)
#Load Pathways
Pathway <- Pathways
Term2gene <- Pathway[,c("term", "gene")]# term and MetaboliteID (MetaboliteID= gene as syntax required for enricher)
term2name <- Pathway[,c("term", "Description")]# term and description
#Add the number of genes present in each pathway
Pathway$Count <- 1
Pathway_Mean <- aggregate(Pathway$Count, by=list(term=Pathway$term), FUN=sum)
names(Pathway_Mean)[names(Pathway_Mean) == "x"] <- "Metabolites_in_Pathway"
Pathway <- merge(x= Pathway, y=Pathway_Mean,by="term", all.x=TRUE)
Pathway$Count <- NULL
## ------------ Run ----------- ##
#Run ORA
clusterGo <- clusterProfiler::enricher(gene=selectMetabolites,
pvalueCutoff = 1,
pAdjustMethod = "BH",
universe = allMetabolites,
minGSSize=minGSSize,
maxGSSize=maxGSSize,
qvalueCutoff = 1,
TERM2GENE=Term2gene ,
TERM2NAME = term2name)
clusterGoSummary <- data.frame(clusterGo)
#Make DF:
if(!(dim(clusterGoSummary)[1] == 0)){
#Add pathway information % of genes in pathway detected)
clusterGoSummary <- merge(x= clusterGoSummary%>% select(-Description), y=Pathway%>% select(term, Metabolites_in_Pathway),by.x="ID",by.y="term", all=TRUE)
clusterGoSummary$Count[is.na(clusterGoSummary$Count)] <- 0
clusterGoSummary$Percentage_of_Pathway_detected <-round(((clusterGoSummary$Count/clusterGoSummary$Metabolites_in_Pathway)*100),digits=2)
clusterGoSummary <- clusterGoSummary[!duplicated(clusterGoSummary$ID),]
clusterGoSummary <- clusterGoSummary[order(clusterGoSummary$p.adjust),]
clusterGoSummary <- clusterGoSummary%>%
dplyr::rename("Metabolites_in_pathway"="geneID")
}else{
stop("None of the Input_data Metabolites were present in any terms of the PathwayFile. Hence the ClusterGoSummary ouput will be empty. Please check that the metabolite IDs match the pathway IDs.")
}
#Return and save list of DFs
ORA_output_list <- list("InputSelection" = InputSelection , "ClusterGoSummary" = clusterGoSummary)
#save:
suppressMessages(suppressWarnings(
SaveRes(InputList_DF=ORA_output_list,
InputList_Plot= NULL,
SaveAs_Table=SaveAs_Table,
SaveAs_Plot=NULL,
FolderPath= Folder,
FileName= paste(PathwayName),
CoRe=FALSE,
PrintPlot=FALSE)))
#Return
ORA_output_list <- c( ORA_output_list, list("ClusterGo"=clusterGo))
invisible(return(ORA_output_list))
}
#################################
### ### ### Helper Fishers exact test ### ### ###
#################################
#'
#'
# perform test on n groups of features. DF with column feature and column group
# output list of DFs named after groups