-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstep3-DEG.R
112 lines (109 loc) · 5.01 KB
/
step3-DEG.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
####################################
# 5.对表达矩阵进行差异分析
####################################
#5.1 只需要提供表达矩阵efilt、分组信息group_list,就能使用limma进行分析
suppressMessages(library(limma))
#limma需要三个矩阵:表达矩阵(efilt)、分组矩阵(design)、比较矩阵(contrast)
#先做一个分组矩阵~design,说明MAO是哪几个样本,MNO又是哪几个,其中1代表“是”
design <- model.matrix(~0+factor(group_list))
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(efilt)
design
#再做一个比较矩阵【一般是case比control】
contrast<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast
#5.2 准备就绪,就可以开始差异分析
DEG <- function(efilt,design,contrast){
##step1
fit <- lmFit(efilt,design)
##step2
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)
##step3
mtx = topTable(fit2, coef=1, n=Inf)
deg_mtx = na.omit(mtx)
return(deg_mtx)
}
DEG_mtx <- DEG(efilt,design,contrast) #得到全部的差异基因矩阵
save(DEG_mtx, efilt, file = "GSE62832.DEG.Rdata")
#5.3 对一个小表达矩阵,如30、50个基因,可以用热图
top30_gene=head(rownames(DEG_mtx),30)
top30_matrix=efilt[top30_gene,] #得到top30的表达量矩阵
top30_matrix=t(scale(t(top30_matrix)))
#这里做个top30 gene heatmap
if(T){
library(ComplexHeatmap)
#install.packages("dendextend")
library(dendextend)
library(dendsort)
e_mean = tail(sort(apply(efilt,1,mean)),30)
#top30_cluster_cols = hclust(dist(t(top30_matrix)))
sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) #聚类的过滤函数
top30_cluster_cols = sort_hclust(hclust(dist(t(top30_matrix))))
top30_cluster_rows <- sort_hclust(hclust(dist(top30_matrix)))
base_mean = e_mean
ef_lable = HeatmapAnnotation(text = anno_text(colnames(efilt), rot = 45,offset = unit(23, "mm"), gp = gpar(fontsize = 8.5)))
p <- Heatmap(top30_matrix, name = "expresssion",
column_title = "Samples",
column_title_gp = gpar(fontsize = 14, fontface = "bold"),
row_title = "Genes",
row_title_gp = gpar(fontsize = 14, fontface = "bold"),
row_names_gp = gpar(fontsize = 8.5),
row_names_side = "left",
cluster_rows = color_branches(top30_cluster_rows, k = 4),
cluster_columns = color_branches(top30_cluster_cols, k = 3),
show_column_names = F,
bottom_annotation = ef_lable, bottom_annotation_height = unit(0.5, "cm"))+
Heatmap(base_mean, name = "base_mean", show_row_names = FALSE, width = unit(5, "mm"))
print(p)
}
#5.4 对一个大的表达矩阵,如全部的差异基因,可以用火山图
#火山图实际上就是根据两列进行作图:logFC、pvalue
plot(DEG_mtx$logFC, -log10(DEG_mtx$P.Value)) #最简单的图(能说明原理)
#一个好看点的图
DEG=DEG_mtx
if(T){logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
DEG$result = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3), #round保留小数位数
'\nThe number of up gene is ',nrow(DEG[DEG$result =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$result =='DOWN',])
)
library(ggplot2)
p <- ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=result)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## 这里要注意和之前设置的result三个因子相对应,DOWN就设为blue,NOT就设为black
print(p)
}
#使用DESeq2+EnhancedVolcano【待完善!】
if(F){
library("DESeq2")
dds <- DESeqDataSet(airway, design = ~cell + dex)
dds <- DESeq(dds, betaPrior = FALSE)
res2 <- results(dds, contrast = c("cell", "N061011", "N61311"))
res2 <- lfcShrink(dds, contrast = c("cell", "N061011", "N61311"), res = res2)
install.packages("devtools")
library(devtools)
devtools::install_github("kevinblighe/EnhancedVolcano")
library(EnhancedVolcano)
EnhancedVolcano(DEG,
lab = rownames(DEG),
x = "log2FoldChange", y = "padj",
selectLab = top30_gene,
xlab = bquote(~Log[2]~ "fold change"),
ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = 0.0001,
FCcutoff = 2.0,
xlim = c(-6,6),
transcriptPointSize = 1.8,
transcriptLabSize = 5.0,
colAlpha = 1,
legend=c("NS","Log2 FC","Adjusted p-value",
"Adjusted p-value & Log2 FC"),
legendPosition = "bottom",
legendLabSize = 10,
legendIconSize = 3.0)
}