-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNGS_DEPipeline.Rmd
225 lines (195 loc) · 8.16 KB
/
NGS_DEPipeline.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
---
title: "Next-Generation Sequencing Single Cell Differential Expression Analysis Pipeline"
author: "Anni Liu"
date: "June 19, 2022"
output:
rmdformats::downcute:
downcute_theme: "chaos"
fig_height: 4.5
fig_width: 4.5
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
options(scipen = 999, digits = 4),
cache = TRUE,
error = FALSE,
message = FALSE,
warning = FALSE,
tidy.opts = list(width.cutoff = 60),
tidy = TRUE,
fig.width = 12,
fig.height = 8
)
```
# Key points for the differential expression analysis pipeline
- Sequence reads quality control
+ rule: **trim_galore_trim**
- Quantify expression
+ rule: **kallisto_quant**
- Differential gene expression analysis with R
+ **DEReport.Rmd**
* Principal component analysis for different cell types
+ key functions: `rlog()` and `plotPCA()`
* Volcano plot
+ key functions: `ggplot()`, `geom_point()`, `do.call()`, and `grid.arrange()`
* Pairwise comparison for differential gene expression between cell types
+ key functions: `DESeq()` and `results()`
# Build the snakemake rule of taking FASTQ files from the seqencing facility, assessing the quality of reads, and quantifying abundances of transcripts from RNA-Seq data
```{python eval=FALSE}
# configfile: "config.yaml"
from collections import Counter
import re
SAMPLE_NAME = sorted(set(glob_wildcards("data/samples/{fname}_R1_001_copy{replicate}.fastq.gz").fname)) # Automatically detect the sorted unique sample names
SAMPLE_NAME_ALL = sorted(glob_wildcards("data/samples/{fname}_R1_001_copy{replicate}.fastq.gz").fname) # Automatically detect the sorted sample names
REPLICATE = sorted(set(glob_wildcards("data/samples/{fname}_R1_001_copy{replicate}.fastq.gz").replicate)) # Considering the user may have different numbers of technical replicates for each sample
CONDITION = [re.split(r"['_'\d]", element)[0] for element in SAMPLE_NAME] # Automatically detect the sorted conditions
CONDITION_ALL = [re.split(r"['_'\d]", element)[0] for element in SAMPLE_NAME_ALL] # Automatically detect the sorted conditions
BIOLOGICAL_REPLICATE = list(Counter(CONDITION).values()) # Automatically detect the number of conditions (i.e., the number of Bmem)
CONVERTED_LIST = [str(i) for i in BIOLOGICAL_REPLICATE] # Convert each integer element in a list into a string
rule all:
input:
expand("kallisto_results/{target}/abundance.tsv",
target = SAMPLE_NAME),
expand("kallisto_results/{target}/run_info.json",
target = SAMPLE_NAME)
output: "DEReport.html"
params:
sample_list = '"' + '","'.join(SAMPLE_NAME) + '"', # Construct a long string containing each individual sample name separated by comma, with the initial and ending double quotes
condition_list = '"' + '","'.join(CONDITION) + '"', # Construct a long string containing each individual condition name corresponding to each individual sample name separated by comma, with the initial and ending double quotes
condition_times = ",".join(CONVERTED_LIST) # Construct a long string containing the frequency of each condition separated by comma
shell:
"""
Rscript -e 'library(rmarkdown); library(rmdformats); rmarkdown::render(input = "DEReport.Rmd", params = list("SAMPLE_NAME" = c({params.sample_list}), "CONDITION" = c({params.condition_list}), "BIOLOGICAL_REPLICATE" = c({params.condition_times})), output_format = "all", output_file = "{output}")'
"""
rule trim_galore_trim:
input:
"data/samples/{sample}_R1_001_copy{replicate}.fastq.gz"
output:
"trimmed_reads/{sample}/{sample}_R1_001_copy{replicate}_trimmed.fq.gz"
threads: 4
shell:
"trim_galore {input} "
"--cores {threads} "
"--quality 20 "
"--output_dir trimmed_reads/{wildcards.sample} "
"--no_report_file"
rule kallisto_quant:
input:
expand("trimmed_reads/{{sample}}/{{sample}}_R1_001_copy{replicate}_trimmed.fq.gz",
replicate = REPLICATE)
output:
"kallisto_results/{sample}/abundance.tsv",
"kallisto_results/{sample}/run_info.json"
params:
index = "mus_musculus/transcriptome.idx"
threads: 4
shell:
"kallisto quant "
"--index {params.index} "
"--plaintext "
"--output-dir kallisto_results/{wildcards.sample} "
"--single -l 200 -s 20 "
"{input}"
```
# Flow from expression quantification to differential expression analysis in R
```{r eval=FALSE}
# The following contents are extracted from DEReport.Rmd
---
title: "Differential Expression Analysis Report"
author: "Anni Liu"
output:
rmdformats::downcute:
downcute_theme: "chaos"
date: "June 6 - June 12, 2022"
params:
SAMPLE_NAME: [""]
CONDITION: [""]
BIOLOGICAL_REPLICATE: [""]
---
# Load libraries
library(tidyverse)
library(tximport)
# Set the file path
dir <- c("/Users/anniliu/Desktop/DE_pipeline")
# Propagate SAMPLE_NAME from params -to-> {params.sample_list} in `rule all` -to-> YAML in head of rmarkdown -to-> here: getElement(params, "SAMPLE_NAME")
sample_name <- getElement(params, "SAMPLE_NAME")
files <- file.path(dir, "kallisto_results", sample_name, "abundance.tsv")
names(files) <- sample_name # Assign individual names to individual tsv file path
# Import tx2 gene file for mice
library(readr)
tx2gene_mice <- read.table("/Users/anniliu/Desktop/DE_pipeline/mus_musculus/transcripts_to_genes.txt",
header = FALSE) %>%
select(-V3) # Select away gene names
head(tx2gene_mice) # Take a peek at the first 6 records of the file
# Generate the count matrix for each sample
txi.kallisto.tsv <- tximport(files,
type = "kallisto",
tx2gene = tx2gene_mice,
ignoreAfterBar = TRUE) # Split the rownames at the first bar “|”, and only includes the first “ENST” identifier.
# Look at the structure of count data
cts <- txi.kallisto.tsv$counts
str(cts)
# Look at the number of sequence fragments that are assigned to each gene
head(cts) %>% # Take a peek at the first 6 records of gene expression in raw counts
knitr::kable()
# Create a data frame of sample information
condition_name <- getElement(params, "CONDITION")
condition_times <- getElement(params, "BIOLOGICAL_REPLICATE")
# print(condition_name)
# print(condition_times)
coldata <- data.frame(condition = condition_name, type = rep("single-read"))
rownames(coldata) <- sample_name
coldata %>%
knitr::kable()
# Factorize two variables
coldata$condition <- factor(coldata$condition)
coldata$type <- factor(coldata$type)
# Construct DESeqDataSetFromMatrix
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = round(cts),
colData = coldata,
design = ~ condition)
dds
# Pre-filtering least variable genes
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
head(dds)
# Perform differential expression analysis
dds <- DESeq(dds)
# Extract the pairwise combination of 2 conditions
pair_condition <- combn(unique(condition_name), 2, simplify = FALSE)
pair_number <- length(pair_condition)
res <- list(length = pair_number)
# Return a result table with base mean, log2 fold changes, p values and adjusted p values for each pairwise comparison of 2 conditions
for (i in 1:pair_number){
contrast <- c("condition", pair_condition[[i]])
# print(contrast)
res[i] <- list(results(dds, contrast = contrast))
}
print(res)
# Batch volcano plot
plot_array <- list()
for (i in 1:pair_number) {
res_new <- res[[i]] %>%
data.frame() %>%
rownames_to_column(var = "ensgene") %>%
mutate(threshold = padj < 0.5)
plot_array[[i]] <- ggplot(res_new) +
geom_point(mapping = aes(x = log2FoldChange,
y = -log10(padj),
color = threshold)) +
labs(
title = paste0(pair_condition[[i]][1], " vs ", pair_condition[[i]][2]),
x = "log2 fold change",
y = "-log10 adjusted p-value"
) +
xlim(-10, 10) +
scale_color_manual(values = c("black", "#CFB53B")) +
theme_bw() +
guides(color = "none")
}
library(gridExtra)
do.call("grid.arrange", c(plot_array, ncol = 3))
# Perform principal component analysis
plotPCA(rlog(dds), intgroup = "condition")
```