-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGLM_DE.R
130 lines (105 loc) · 4.37 KB
/
GLM_DE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Zinka 2020_06_16
# Differential expression analysis using EdgeR.
#
# Inputs:
# - Table with TPM values for all genes in all samples (all_abundnace_TPM.csv)
#
# Outputs:
# - Plots: Mean square deviation plot, dispertion plots, differential expression plots.
# - Data table with logFC and p-values for differential expression of trated vs. control T. pseudonana cultures at each time point.
library(edgeR)
library(dplyr)
library(readr)
library(ggplot2)
library(tidyr)
library(stringr)
setwd('~/Desktop/Thaps_Catl')
################################################################
# Importing files:
################################################################
#Reading in count file containing all experimental groups with only raw TPM counts, no annotations!
rawdata<- read.csv(file = '~/Desktop/Thaps_Catl/all_abundance_TPM.csv',
check.names=FALSE,
stringsAsFactors = FALSE,
header = TRUE)
head(rawdata)
################################################################
# Pre-processing data:
################################################################
#Format
counts <- select(rawdata, -"Gene ID") %>%
as.matrix()
genes <- select(rawdata, "Gene ID")
#Create metadata table for sample characteristics (time, group, replicate)
samples <- as.array(colnames(counts)) %>%
strsplit2( "[_]")
colnames(samples) <- c("time", "treatment", "rep")
#Put data into DGEList format:
y <- DGEList(counts=counts, samples = samples, genes=genes)
#TMM normalization:
y <- calcNormFactors(y) #Estimates normalization factors (used in Anders 2013)
# Mean Square deviation (MSD) plot:
pdf(file = "~/Desktop/Thaps_Catl/MSD.pdf", width = 10, height = 8)
plotMDS(y)
dev.off()
################################################################
# GLM model for differential expression analysis
################################################################
#Making the design matrix:
samples <- data.frame(samples)
Group <- factor(paste(samples$time, samples$treatment, sep="."))
samples <- cbind(samples,Group=Group)
design <- model.matrix(~0+Group)
colnames(design) <- levels(Group)
#Estimating dispersion:
disp = estimateDisp(y, design)
pdf(file = "~/Desktop/Thaps_Catl/Estimated_dispersion.pdf", width = 10, height = 8)
plotBCV(disp)
dev.off()
#Fitting GLMs to my data with the design matrix superimposed:
fit <- glmQLFit(disp, design)
pdf(file ="~/Desktop/Thaps_Catl/Fitted_dispersion.pdf", width = 10, height = 8)
plotQLDisp(fit)
dev.off()
################################################################
# Setting contrasts
################################################################
#Contrasts for pairwise comparisons between treated and control transcripts:
#Un-commemnt the one to be analyzed.
# Compare t24.filtrate to t24.cont: (-1*t24.cont, +1*t24.filtrate)
contrast = c(0,0,-1,1,0,0)
name = 't24.filtrate_vs_t24.cont'
# # Compare t72.filtrate to t72.cont: (-1*t72.cont, +1*t72.filtrate)
# contrast = c(0,0,0,0,-1,1)
# name = 't72.filtrate_vs_t72.cont'
# # Compare t120.filtrate to t120.cont: (-1*t120.cont, +1*t120.filtrate)
# contrast = c(-1,1,0,0,0,0)
# name = 't120.filtrate_vs_t120.cont'
################################################################
# Differential expression analsyis
################################################################
#DE with Quasi-Likelihood F-tests:
de = glmQLFTest(fit, contrast = contrast)
topTags(de)
#Calculate the total number of differentially expressed genes at 5% FDR
DEnumbers <- summary(decideTests(de))
DEnumbers = data.frame(DEnumbers)
pdf(file = "~/Desktop/Thaps_Catl/Num_DE_genes.pdf", width = 10, height = 8)
ylim <- c(0,1.1*max(DEnumbers$Freq))
xx <- barplot(DEnumbers$Freq,
main = DEnumbers$Var2[1],
names.arg = DEnumbers$Var1,
ylab = "Number of DE genes",
ylim = ylim)
DEnumbers$Freq <- as.numeric(DEnumbers$Freq)
text(x = xx, y = DEnumbers$Freq, label = DEnumbers$Freq, pos = 3)
dev.off()
#Plot log-fold change against log-counts per million, with DE genes highlighed:
pdf(file = "~/Desktop/Thaps_Catl/plotMD.pdf", width = 10, height = 8)
plotMD(de)
abline(h=c(-1,1), col="blue") #adding a blue line that indicates a 2-fold change.
dev.off()
#Export csv table of DE genes for the selected contrast:
top_genes = topTags(de, n=nrow(y), sort.by = "logFC")
file = paste("~/Desktop/Thaps_Catl/", name, ".csv", sep = "")
write.csv(top_genes$table, file=file)