-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAlignFASTQ.Rmd
265 lines (228 loc) · 9.92 KB
/
AlignFASTQ.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
---
title: "ATAC-Seq Data Analysis with Human Data - Align the FASTQ files"
author: "Anni Liu"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
code_folding: show
---
```{r, shorcut, include=FALSE}
## RStudio keyboard shortcut
# Cursor at the beginning of a command line: Ctrl+A
# Cursor at the end of a command line: Ctrl+E
# Clear all the code from your console: Ctrl+L
# Create a pipe operator %>%: Ctrl+Shift+M (Windows) or Cmd+Shift+M (Mac)
# Create an assignment operator <-: Alt+- (Windows) or Option+- (Mac)
# Knit a document (knitr): Ctrl+Shift+K (Windows) or Cmd+Shift+K (Mac)
# Comment or uncomment current selection: Ctrl+Shift+C (Windows) or Cmd+Shift+C (Mac)
```
# Run the FastQC to check the read quality in FASTQ files
```{bash}
zcat *.fastq.gz | fastqc stdin --outdir=../report/
# zcat ATACSample_r1.fastq.gz ATACSample_r2.fastq.gz | fastqc stdin --outdir=../report/
```
# Align the FASTQ files
## Create a reference genome
```{r}
library(BSgenome.Hsapiens.UCSC.hg19)
mainChromosomes <- paste0("chr", c(1:21,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,
function(x) BSgenome.Hsapiens.UCSC.hg19[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
Biostrings::writeXStringSet(x = mainChrSeqSet,
filepath = "BSgenome.Hsapiens.UCSC.hg19.mainChrs.fa")
```
## Update the reference genome based on hg38
```{r}
# https://gatk.broadinstitute.org/hc/en-us/articles/360035890951-Human-genome-reference-builds-GRCh38-or-hg38-b37-hg19
library(BSgenome.Hsapiens.UCSC.hg38)
main_chromosomes <- paste0("chr", c(1:22, "X", "Y", "M"))
main_chr_seq <- lapply(main_chromosomes,
function(x) BSgenome.Hsapiens.UCSC.hg38[[x]])
names(main_chr_seq) <- main_chromosomes
main_chr_seq_set <- DNAStringSet(main_chr_seq)
Biostrings::writeXStringSet(x = main_chr_seq_set,
filepath = "BSgenome.Hsapiens.UCSC.hg38.mainchr.fa")
```
## Update the reference genome based on hs1
```{r}
library(BSgenome.Hsapiens.UCSC.hs1)
main_chromosomes <- paste0("chr", c(1:22, "X", "Y", "M"))
main_chr_seq <- lapply(main_chromosomes,
function(x) BSgenome.Hsapiens.UCSC.hs1[[x]])
names(main_chr_seq) <- main_chromosomes
main_chr_seq_set <- DNAStringSet(main_chr_seq)
Biostrings::writeXStringSet(x = main_chr_seq_set,
filepath = "BSgenome.Hsapiens.UCSC.hs1.mainchr.fa")
```
## Build the index
```{r}
library(Rsubread)
buildindex(basename = "BSgenome.Hsapiens.UCSC.hg19.mainChrs",
reference = "BSgenome.Hsapiens.UCSC.hg19.mainChrs.fa",
indexSplit = TRUE,
memory = 1000)
# indexSplit: logical indicating whether the index can be split into multiple blocks. The block size is determined by the value of memory. FALSE by default (ie. a single-block index is generated).
# memory: a numeric value specifying the amount of memory (in megabytes) used for storing the index during read mapping. 8000 MB by default. Note that this option is ignored when indexSplit is FALSE.
```
## Align **paired-end** sequence reads from Greenleaf
```{r}
# Take a peek at 2 fastq files
library(ShortRead)
# The following 2 fastq files data downloaded from https://www.ebi.ac.uk/ena/browser/view/SRR891269
read1 <- readFastq(dirPath = "./data/ATACSample_r1.fastq.gz")
read2 <- readFastq(dirPath = "./data/ATACSample_r2.fastq.gz")
id(read1)[1]
# BStringSet object of length 1:
# width seq
# [1] 59 HISEQ:236:HA03EADXX:1:1101:1147:2237 1:Y:0:TAAGGCGACTCTCTAT
id(read2)[1]
# BStringSet object of length 1:
# width seq
# [1] 59 HISEQ:236:HA03EADXX:1:1101:1147:2237 2:Y:0:TAAGGCGACTCTCTAT
# These 2 IDs indicate 2 reads are paired, stemming from the same fragment
library(Rsubread)
align(index = "BSgenome.Hsapiens.UCSC.hg19.mainChrs",
readfile1 = "./data/ATACSample_r1.fastq.gz",
readfile2 = "./data/ATACSample_r2.fastq.gz",
output_file = "ATAC_50K_2.bam",
nthreads = 2, type = 1,
unique = TRUE, maxFragLength = 2000)
# type: a character string or an integer giving the type of sequencing data. Possible values include rna (or 0; RNA-seq data) and dna (or *1; genomic DNA-seq data such as WGS, WES, *ChIP-seq data etc.). Character strings are case insensitive.
# unique: logical indicating if only uniquely mapped reads should be reported. A uniquely mapped read has one single mapping location that has less mis-matched bases than any other candidate locations. By default, multi-mapping reads will be reported in addition to uniquely mapped reads. Number of alignments reported for each multi-mapping read is determined by the nBestLocations parameter.
# maxFragLength: numeric value giving the maximum fragment length. 600 by default.
```
## Perform the alignment relative to hg38 using the new ATACseq data from female lung
```{bash}
# https://www.ebi.ac.uk/ena/browser/view/SAMN08743398?show=reads
wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR687/008/SRR6870408/SRR6870408_2.fastq.gz
wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR687/008/SRR6870408/SRR6870408_1.fastq.gz
# -nc: if a file with the same name already exists, wget will not overwrite it but will skip the download.
```
```{r}
r1 <- readFastq(dirPath = "./data/SRR6870408_1.fastq.gz")
r2 <- readFastq(dirPath = "./data/SRR6870408_2.fastq.gz")
id(r1)[1]
# BStringSet object of length 1:
# width seq
# [1] 16 SRR6870408.1 1/1
id(r2)[1]
# BStringSet object of length 1:
# width seq
# [1] 16 SRR6870408.1 1/2
align(index = "BSgenome.Hsapiens.UCSC.hg38.mainchr",
readfile1 = "./data/SRR6870408_1.fastq.gz",
readfile2 = "./data/SRR6870408_2.fastq.gz",
output_file = "ATAC_female_lung.bam",
nthreads = 4, type = 1,
unique = TRUE, maxFragLength = 600)
```
## Alternative alignment approach: `Rbowtie2`
### Build the Rbowtie2 index relative to hg19
```{r}
library(Rbowtie2)
bowtie2_build(references = "BSgenome.Hsapiens.UCSC.hg19.mainChrs.fa",
bt2Index = "BSgenome.Hsapiens.UCSC.hg19.mainChrs_bowtie2")
```
### Build the Rbowtie2 index relative to hg38
```{r}
bowtie2_build(references = "BSgenome.Hsapiens.UCSC.hg38.mainchr.fa",
bt2Index = "BSgenome.Hsapiens.UCSC.hg38.mainchr_bowtie2")
```
### Decompress the FASTQ files
```{bash}
gunzip ./ATAC_Data/ATAC_FQs/SRR891269_1.fastq.gz
gunzip ./ATAC_Data/ATAC_FQs/SRR891269_2.fastq.gz
# Keep the compressed file:
# gunzip -c ./ATAC_Data/ATAC_FQs/SRR891269_1.fastq.gz > ./ATAC_Data/ATAC_FQs/SRR891269_1.fastq
```
```{bash}
gunzip ./data/SRR6870408_1.fastq.gz
gunzip ./data/SRR6870408_2.fastq.gz
```
### Align the paired reads relative to hg19
```{r}
bowtie2(bt2Index = "BSgenome.Hsapiens.UCSC.hg19.mainChrs_bowtie2",
samOutput = "ATAC_50K_2_bowtie2.sam",
seq1 = "./ATAC_Data/ATAC_FQs/SRR891269_1.fastq",
seq2 = "./ATAC_Data/ATAC_FQs/SRR891269_2.fastq")
# Convert the output SAM file to a more useable BAM file
library(Rsamtools)
asBam("ATAC_50K_2_bowtie2.sam")
# https://rdrr.io/bioc/Rsamtools/src/R/asBam.R
# asBam, asSam return the file name of the destination file.
# To save the disk space, we are removing the uncompressed FASTQ and SAM files, while retaining the BAM file
unlink(c("./ATAC_Data/ATAC_FQs/SRR891269_1.fastq",
"./ATAC_Data/ATAC_FQs/SRR891269_2.fastq",
"ATAC_50K_2_bowtie2.sam"))
```
### Align the paired reads relative to hg38
```{r}
bowtie2(bt2Index = "BSgenome.Hsapiens.UCSC.hg38.mainchr_bowtie2",
samOutput = "ATAC_female_lung_bowtie2.sam",
seq1 = "SRR6870408_1.fastq",
seq2 = "SRR6870408_2.fastq")
# arguments 'show.output.on.console', 'minimized' and 'invisible' are for Windows only
# 43819122 reads; of these:
# 43819122 (100.00%) were paired; of these:
# 4382152 (10.00%) aligned concordantly 0 times
# 26983067 (61.58%) aligned concordantly exactly 1 time
# 12453903 (28.42%) aligned concordantly >1 times
# ----
# 4382152 pairs aligned concordantly 0 times; of these:
# 1142730 (26.08%) aligned discordantly 1 time
# ----
# 3239422 pairs aligned 0 times concordantly or discordantly; of these:
# 6478844 mates make up the pairs; of these:
# 4752093 (73.35%) aligned 0 times
# 742250 (11.46%) aligned exactly 1 time
# 984501 (15.20%) aligned >1 times
# 94.58% overall alignment rate
asBam("ATAC_female_lung_bowtie2.sam")
# [bam_sort_core] merging from 42 files and 1 in-memory blocks...
# [1] "ATAC_female_lung_bowtie2.bam"
unlink(c("SRR6870408_1.fastq",
"SRR6870408_2.fastq",
"ATAC_female_lung_bowtie2.sam"))
```
## Sort and index the aligned reads
```{r}
library(Rsamtools)
out_bam <- 'ATAC_female_lung_bowtie2.bam'
sorted_bam <- file.path(dirname(out_bam),
paste0("Sorted_", basename(out_bam))) # Construct the path to a file from components
sortBam(out_bam, gsub("\\.bam", "", basename(sorted_bam)))
# [1] "Sorted_ATAC_female_lung_bowtie2.bam"
indexBam(sorted_bam)
# ./Sorted_ATAC_female_lung_bowtie2.bam
# "./Sorted_ATAC_female_lung_bowtie2.bam.bai"
# basename("../ATACseq/res.bam")
# [1] "res.bam"
```
## [QC] Check the distribution of mapped reads on every chromosome
```{r}
library(Rsamtools)
mapped_reads <- idxstatsBam(sorted_bam)
# idxstatsBam visit the index in index(file), and quickly returns the number of mapped and unmapped reads on each seqname.
saveRDS(mapped_reads, file = "ATAC_female_lung_idxstats.RDS")
mapped_reads <- readRDS("ATAC_female_lung_idxstats.RDS")
mapped_reads |> View()
```
```{r use.data.in.package}
file_path <- system.file("extdata", "ex1.bam", package = "Rsamtools", mustWork = T)
indexBam(file_path)
idxstatsBam(file_path)
# seqnames: chromosome names
# mapped: how many reads are mapped on every chromosome
```
```{r visual}
library(ggplot2)
ggplot(data = mapped_reads,
mapping = aes(x = seqnames, y = mapped, fill = seqnames))+
geom_bar(stat = "identity") +
geom_text(mapping = aes(label = mapped), vjust = 0, color = "black", size = 3.5)+
coord_flip() +
labs(fill = 'Chromosome')
# The barplot shows lots of mapped mitochondrial reads
```