-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdifferentialExpressionAnalysis.R
283 lines (253 loc) · 10.4 KB
/
differentialExpressionAnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
## @input groups-samples file, feature-length file, relative abundance file, comparisons
## @param abundFile string abundance file
## @param featureLengthFile string feature length file
## @param groupSampFile string groups-samples file
## @param compar string comparisons to be made
## @output
## ---------------------------------------------------------------
## ABUNDANCE TAB
## ---------------------------------------------------------------
## FeatureID S1 S2 S3
## CDS1 2.3 3.9 3.8
## ...
## CDSN 3.8 2.1 3.0
## groupSampFile is a matrix (rows = groups, columns = samples)
## ---------------------------------------------------------------
## GROUP TAB
## ---------------------------------------------------------------
## GroupID S1 S2 S3 S4
## Group1 1 0 0 0
## Group2 0 1 1 0
## Group3 0 0 0 1
## Comparisons should be of the form:
## G1-G3;G2-G6-G3 (comparisons of samples of G1 vs samples of G3 ; and samples of G2 vs G6 vs G3) => ";-," cannot be used in group names
## RMK: Multiple groups comparisons is not yet implemented. If more than 2 groups are given, all combinations of 2 groups are taken as comparisons
## @usage R CMD BATCH '--args groupSampFile abundFile featureLengthFile compar methodName' differentialExpressionAnalysis.R
## ------ ---------------- FUNCTIONS ------------------------------------------------
## @brief pairwiseComparisonsDifferentialExpression
## dataObject === list data object with the following structure:
## dataObject -- "groupLabels" -- string list of group labels
## -- "groupSamples" -- group1 -- sample list (ex:c(sample1,sample5))
## -- ...
## -- groupN -- sample list (ex:c(sample2,sample8))
## -- "data" -- data.frame (rows = features, column = samples)
pairCompDiffExp <- function(
dataObject,
methodName, # "DESeq","KW"
withplot = 0 # generate plots
){
sampleNames <- colnames(dataObject$"data")
conds <- c()
resAll<- c()
## Get allcomb samples from groups
for (nam in sampleNames){
print(nam)
for (group in dataObject$"groupLabels"){
if (nam %in% dataObject$"groupSamples"[[group]]){
conds <- c(conds,group)
break
}
}
}
if (length(conds)!=length(sampleNames)){
stop("Some of the sample names in the group table are not found in the abundance table")
}
## going through all pairwise combinations
mystring<-dataObject$"groupLabels"
allpaircomb<-unlist(sapply(mystring[-length(mystring)],function(x) paste(x,mystring[(grep(x,mystring)+1):length(mystring)],sep=",")))
## check if biological replicates exist
for (group in dataObject$"groupLabels"){
if (length(dataObject$"groupSamples"[[group]]) == 1){
flagBlind <- 1
break
}else{
flagBlind <- 0
}
}
for (pair in allpaircomb){
if (methodName=="DESeq"){
g <- strsplit(pair,",")[[1]]
cds <- newCountDataSet(dataObject$"data",conds)
cds <- estimateSizeFactors(cds)
## cds <- estimateVarianceFunctions(cds)
if (flagBlind == 1){
#cds <- estimateVarianceFunctions(cds,method="blind")#,locfit_extra_args=list(maxk=200))
cds <- estimateDispersions(cds,method="blind",sharingMode="fit-only",fitType="local")
#cds <- estimateDispersions(cds,method="pooled")
}else{
#cds <- estimateVarianceFunctions(cds)
cds <- estimateDispersions(cds)
}
res <- nbinomTest(cds,g[1],g[2])
resAll<- c(resAll,res)
## Diagnostic plot to check the fit of the variance functions
if (withplot){
pdf(paste("checkVarFitPlot",".pdf",sep="",collapse=""))
for (group in dataObject$"groupLabels"){
diagForGroup<-varianceFitDiagnostics(cds,group)
smoothScatter(log10(diagForGroup$baseMean),log10(diagForGroup$baseVar))
lines(log10(fittedBaseVar)~log10(baseMean),diagForGroup[order(diagForGroup$baseMean),],col="red")
title(paste("Fit of variance - group ",group,sep="",collapse=""))
dev.off()
}
}
}else if (methodName == "KW"){
print(conds)
print(colnames(dataObject$"data"))
print(dataObject$"data" %in% conds)
myd <- dataObject$"data"#[,which(colnames(dataObject$"data") %in% conds)]
## colnames(myd)[which(colnames(myd)=="Deinococcus-Thermus_uncertain")]<-"Deinococcus_Thermus_uncertain"
## colnames(myd)[which(colnames(myd)=="Deinococcus-Thermus_uncertain")]<-"Deinococcus_Thermus_uncertain"
myd <- rbind(myd,conds)
rownames(myd)[nrow(myd)]<-"group"
myd<-t(myd)
## Check whether the first character is an integer and change the column name if this is the case
nbrList <- as.character(0:9)
if (substring(colnames(myd)[1],1,1) %in% nbrList){
colnames(myd)[1:(ncol(myd)-1)]<-unlist(lapply(colnames(myd)[1:(ncol(myd)-1)],function(x) paste("C",x,sep="",collapse="")))
}
resAll<-c()
resAll$names<-c()
resAll$pvalue<-c()
names<-c()
pvalue<-c()
for (i in 1:ncol(myd)){
print(i)
res<-kruskal.test(eval(parse(text=colnames(myd)[i])) ~ group,data=myd)
#res<-kruskal.test(colnames(myd)[i] ~ group,data=myd)
res$names<-colnames(myd)[i]
res$pvalue<-res$p.value
#resAll <- c(resAll,res)
names <- c(names,res$names)
pvalue <- c(pvalue,res$pvalue)
## resAll$names <- c(resAll$names,res$names)
## resAll$pvalue <- c(resAll$pvalue,res$pvalue)
}
print(names)
print(pvalue)
resAll$names <- names
resAll$pvalue <- pvalue
}
}
ret <- list(resAll=resAll, allpaircomb=allpaircomb)
}
## ----------------------------------------------------------------------------------
# @brief Generate table and figure for the DESeq method
DESeqGenFigAndTab <- function(
com,#comparison list
resAll,#result object
padj, #p-value
allpaircomb, # all pair combinations
dirout # output directory
){
bitmap(file.path(dirout,paste("meanVSvarPlot-",com[i],".pdf",sep="",collapse="")),type = "png256")
print(length(resAll))
print(class(resAll))
for (ir in 1:length(resAll)){
plot(
resAll[[ir]]$baseMean,
resAll[[ir]]$log2FoldChange,
log="x",pch=20,cex=0.1,
col = ifelse( resAll[[ir]]$padj < padj, "red", "black" ) ##
)
title(paste("Pair ",allpaircomb[ir],sep="",collapse=""))
dev.off()
}
## writing results
for (ir in 1:length(resAll)){
#write.table(resAll[[ir]],paste("diffExp-",com[i],"-",allpaircomb[ir],".csv",sep="",collapse=""))
write.table(resAll[[ir]],file.path(dirout,paste("diffExp-",com[i],".csv",sep="",collapse="")),row.names=F)
dat <- resAll[[ir]]
newTab <- dat[which(dat[,"padj"]<padj),c("id","padj")]
write.table(newTab,file.path(dirout,paste("summ",com[i],".csv",sep="",collapse="")),row.names=F)
}
}
## ----------------------------------------------------------------------------------
# @brief Generate table and figure for the DESeq method
KWGenFigAndTab <- function(
com,#comparison list
resAll,#result object
padj, #p-value
allpaircomb, # all pair combinations
outdir # output directory
){
for (ir in 1:length(resAll)){
## toW<-c()
## for (i in nrow(resAll[[ir]])){
## toW<-rbind(toW,cbind(resAll[[ir]]$names,resAll[[ir]]$pvalue))
## print(toW)
## }
write.table(resAll[[ir]],file.path(outdir,paste("diffExp-",com[ir],".csv",sep="",collapse="")),row.names=F)
}
}
## ----------------------------- MAIN -----------------------------------------------
## loading arguments
args <- commandArgs(TRUE)
abundFile <- args[1]
featureLengthFile <- args[2]
groupSampFile <- args[3]
compar <- args[4]
outdir <- args[5] # output directory
methodName <- args[6]
## Loading libraries
library(DESeq)
## Method names
metNames = c("DESeq","KW")
## making group comparisons
com <- strsplit(compar,";") # comparisons (ex: c("G1-G3","G2-G6-G3))
nbcom <-length(com) # nb of comparisons
## loading groups-sample table + loading relative abundance table
## nbrTot <- system(paste("head -n 2 ",featureLengthFile,sep="",collapse=""),intern=TRUE)
## nbrTot <- strsplit(nbrTot[2],"\t")[[1]] #list of total numbers of reads
abtab <- read.csv(abundFile,header=T,sep="\t",check.names=F)
rownames(abtab) <- abtab[,1] # 1st column contains the rownames
abtab <- abtab[,-1]
grtab <- read.csv(groupSampFile,header=T,sep="\t",check.names=F)
rownames(grtab) <- grtab[,1] # 1st column contains the rownames
grtab <- grtab[,-1]
## Statistics
allcomb<-list()
for (i in 1:length(com)){ # going through comparisons
allcomb[[i]] <- list()
grps <-strsplit(com[[i]],"-")[[1]] # sample groups (ex: c("G1","G3") )
allcomb[[i]]$"groupLabels" <- grps
allcomb[[i]]$"groupSamples" <- list()
sampList<-c()
for (grp in grps){ # going through groups
grpline <- grtab[which(rownames(grtab)==grp),] #
samps <- colnames(grtab)[which(grpline==1)] #sample list
allcomb[[i]]$"groupSamples"[[grp]] <- samps
sampList<-c(sampList,samps)
}
allcomb[[i]]$"data" <- abtab[,which(colnames(abtab) %in% sampList)]
res <- pairCompDiffExp(allcomb[[i]],methodName,withplot=0)
resAll <- res$resAll
if (length(allcomb)==1){
tmp<-resAll
rm(resAll)
resAll<-list()
resAll[[1]] <- tmp
}
allpaircomb <- res$allpaircomb
## Plotting files
padj = 0.1 ## p-value after correcting for multiple testing using Benjamini-Hochberg procedure which controls false discovery rate (FDR)
if (methodName == "DESeq"){
DESeqGenFigAndTab(
com,#comparison list
resAll,#result object
padj, #p-value
allpaircomb, # all pair combinations
outdir # output directory
)
}else if (methodName == "KW"){
KWGenFigAndTab(
com,#comparison list
resAll,#result object
padj, #p-value
allpaircomb, # all pair combinations
outdir # output directory
)
}else{
stop(paste("Method name must be in: ",metNames))
}
}