Glaucoma-variant-calling/Marwa Abd elazim at master · SamahElmahdi/Glaucoma-variant-calling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
###Downloading glaucoma sample file
##SRA Toolkit installation
wget "http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz"
#unbacking Toolkit
tar -xzf sratoolkit.current-centos_linux64.tar.gz
#Configuring the Toolkit
cd bin
./vdb-config -i
##Make directory and start download
$mkdir Project
$cd Project
$ fastq-dump -X 15000000 -Z SRR5858204 | fastq-dump -I --split-files SRR5858204
Command 'fastq-dump' not found
$ sudo apt install sra-toolkit
$ fastq-dump -X 15000000 -Z SRR5858204 | fastq-dump -I --split-files SRR5858204
#using another command to download and split reads in the same command
$fastq-dump -I --split-files -X 8000000 SRR5858204
#FASTQC for each read end
for f in ~/Project/*.fastq;do fastqc -t 1 -f fastq -noextract $f;done
#Download the whole annotated genome
wget ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.chr.gtf.gz
#Download the reference chromosome 20
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
#Extract Chromosome 20 from the whole annotated genome
gunzip -k Homo_sapiens.GRCh38.99.chr.gtf.gz
grep "^#" Homo_sapiens.GRCh38.99.chr.gtf > Homo_sapiens.chr20.gtf
grep "^20" Homo_sapiens.GRCh38.99.chr.gtf >> Homo_sapiens.chr20.gtf
###Alignment by HISAT2
##Install  HISAT2
conda install -c bioconda -y hisat2
##Indexing
cd Sample_data
gunzip -k Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
mkdir -p ~/Project/hisat_align/hisatIndex
cd  ~/Project/hisat_align/hisatIndex
https://linuxize.com/post/how-to-create-symbolic-links-in-linux-using-the-ln-command/
ln -s ~/workdir/sample_data/Homo_sapiens.GRCh38.dna.chromosome.20.fa
hisat2_extract_splice_sites.py  ~/Project/Sample_data/Homo_sapiens.chr20.gtf > splicesites.tsv
hisat2_extract_exons.py  ~/Project/Sample_data/Homo_sapiens.chr20.gtf > exons.tsv
hisat2-build -p 1 --ss splicesites.tsv --exon exons.tsv Homo_sapiens.GRCh38.dna.chromosome.20.fa Homo_sapiens.GRCh38.dna.chromosome.20
####Complete Alignment step with chromosome 20 only as areference
R1="$HOME/GlaucomaProject/sample_data/SRR5858204_1.fastq"(ngs1)
R2="$HOME/GlaucomaProject/sample_data/SRR5858204_2.fastq"(ngs1)
hisat2 -p 1 -x hisatIndex/Homo_sapiens.GRCh38.dna.chromosome.20 -1 $R1 -2 $R2 -S SRR5858204.sam
8000000 reads; of these:
  8000000 (100.00%) were paired; of these:
    7830528 (97.88%) aligned concordantly 0 times
    163664 (2.05%) aligned concordantly exactly 1 time
    5808 (0.07%) aligned concordantly >1 times
    ----
    7830528 pairs aligned concordantly 0 times; of these:
      2917 (0.04%) aligned discordantly 1 time
    ----
    7827611 pairs aligned 0 times concordantly or discordantly; of these:
      15655222 mates make up the pairs; of these:
        15575523 (99.49%) aligned 0 times
        70117 (0.45%) aligned exactly 1 time
        9582 (0.06%) aligned >1 times
2.65% overall alignment rate
#Alignment with read group information
hisat2 -p 1 --rg ID:SRR5858204_F6_II.1 --rg SM:SRR5858204_F6_II.2 --rg PL:ILLUMINA --rg LB:SRR5858204_F6_II.1 --rg PU:SRR5858204_F6_II.1 -x hisatIndex/Homo_sapiens.GRCh38.dna.chromosome.20 -1 $R1 -2 $R2 -S SRR5858204.sam
#generate & sort BAM file
samtools view -hbo SRR5858204.bam SRR5858204.sam
samtools sort SRR5858204.bam -o SRR5858204.sorted.bam
#Explore file size (to make sure it isn`t corrupted)
ls -trl SRR5858204.sorted.bam
#mapping QC
 samtools depth $SRR5858204.bam | awk '{{sum+=$3}} END {{print "Average = ",sum/NR, "No of covered Nuc = ", NR}}' > $SRR5858204.cov
  samtools flagstat SRR5858204.bam > SRR5858204.stat
  16067103 + 0 in total (QC-passed reads + QC-failed reads)
67103 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
491580 + 0 mapped (3.06% : N/A)
16000000 + 0 paired in sequencing
8000000 + 0 read1
8000000 + 0 read2
338944 + 0 properly paired (2.12% : N/A)
351192 + 0 with itself and mate mapped
73285 + 0 singletons (0.46% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
SRR5858204.stat (END)
#Mark duplicates
# Install Picard tools
conda install -c bioconda picard
picard_path=$CONDA_PREFIX/share/picard-* ## 2.21.7-0
java  -Xmx2g -jar $picard_path/picard.jar MarkDuplicates INPUT=SRR5858204.sorted.bam OUTPUT=SRR5858204.dedup.bam METRICS_FILE=SRR5858204.metrics.txt
##Install GATK
conda install -c bioconda gatk4
##indexing step of both sample and reference
# sample
java -Xmx2g -jar $picard_path/picard.jar BuildBamIndex VALIDATION_STRINGENCY=LENIENT INPUT=SRR5858204.dedup.bam
#Reference
ln -s ~/GlaucomaProject/sample_data/Homo_sapiens.GRCh38.dna.chromosome.20.fa
java -Xmx2g -jar $picard_path/picard.jar CreateSequenceDictionary R=Homo_sapiens.GRCh38.dna.chromosome.20.fa O=Homo_sapiens.GRCh38.dna.chromosome.20.dict
samtools faidx Homo_sapiens.GRCh38.dna.chromosome.20.fa
less -S Homo_sapiens.GRCh38.dna.chromosome.20.fa.fai
20
ls -trl Homo_sapiens.GRCh38.dna.chromosome.20.dict
156
##Download known varinats
# Download known polymorphic sites
wget ftp://ftp.ensembl.org/pub/release-99/variation/vcf/homo_sapiens/homo_sapiens-chr20.vcf.gz
##Recalibrate Bases BQSR
#1st step
gunzip -k homo_sapiens-chr20.vcf.gz
gatk IndexFeatureFile -I homo_sapiens-chr20.vcf
 gatk --java-options "-Xmx2G" BaseRecalibrator \
-R Homo_sapiens.GRCh38.dna.chromosome.20.fa -I SRR5858204.dedup.bam --known-sites homo_sapiens-chr20.vcf \
-O SRR5858204.report
#2nd step
gatk --java-options "-Xmx2G" ApplyBQSR \
-R Homo_sapiens.GRCh38.dna.chromosome.20.fa -I SRR5858204.dedup.bam -bqsr SRR5858204.report \
-O SRR5858204.bqsr.bam --add-output-sam-program-record --emit-original-quals
#Joint variant calling using HaplotypeCaller
gatk --java-options "-Xmx2G" HaplotypeCaller \
  -R Homo_sapiens.GRCh38.dna.chromosome.20.fa -I SRR5858204.dedup.bam \
  --emit-ref-confidence GVCF \
  --pcr-indel-model NONE \
  -O SRR5858204.gvcf


**To overcome this law alignment rate ,the whole genome will be used as areference
#####Downloading whole genome reference
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
##Downloading annotated whole genome
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gtf.gz
##Indexing
mkdir -p ~/Project/hisat2_align/hisatIndex2
cd  ~/Project/hisat_align2/hisatIndex2
ln -s ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.dna.primary_assembly.fa .
hisat2_extract_splice_sites.py ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.99.chr.gtf > splicesites.tsv
hisat2_extract_exons.py ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.99.chr.gtf > exons.tsv
hisat2-build -p 1 --ss splicesites.tsv --exon exons.tsv Homo_sapiens.GRCh38.dna.primary_assembly.fa \
Homo_sapiens.GRCh38.dna.primary_assembly


##Due to space issuses we will choose a fewer number of chromosomes and merge them toghther as areference to align in
#Download annotated whole genome
wget wget ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.chr.gtf.gz
#Download fewer number of chromosomes (3-6-11-20)
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.6.fa.gz
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.11.fa.gz
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
#Concatinate chromosomes files of (3-6-11-20) in one file (merged.fa)
gunzip *.fa.gz -k
cat *.fa > merged.fa
#Selecting annotated chromosomes (3-6-11-20) from the whole annotated genome
gunzip -k  Homo_sapiens.GRCh38.99.chr.gtf.gz
grep "^#" Homo_sapiens.GRCh38.99.chr.gtf > Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf
grep "^3" Homo_sapiens.GRCh38.99.chr.gtf >> Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf
grep "^6" Homo_sapiens.GRCh38.99.chr.gtf >> Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf
grep "^11" Homo_sapiens.GRCh38.99.chr.gtf >> Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf
grep "^20" Homo_sapiens.GRCh38.99.chr.gtf >> Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf
$ cd sample_data
Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf     Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
Homo_sapiens.GRCh38.99.chr.gtf               Homo_sapiens.GRCh38.dna.chromosome.3.fa
Homo_sapiens.GRCh38.99.chr.gtf.gz            Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz
Homo_sapiens.GRCh38.dna.chromosome.11.fa     Homo_sapiens.GRCh38.dna.chromosome.6.fa
Homo_sapiens.GRCh38.dna.chromosome.11.fa.gz  Homo_sapiens.GRCh38.dna.chromosome.6.fa.gz
Homo_sapiens.GRCh38.dna.chromosome.20.fa     merged.fa
#To view merged file size
ls -trl merged.fa
-rw-rw-r-- 1 ngs ngs 578109761 Feb 26 04:47 merged.fa
#To view annotated chromosomes gtf file
ls -trl Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf size
-rw-rw-r-- 1 ngs ngs 239070933 Feb 26 05:31 Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf
##Indexing
conda install -c bioconda -y hisat2
mkdir -p ~/GlaucomaProject/hisat_align/hisatIndex
cd ~/GlaucomaProject/hisat_align/hisatIndex
ln -s ~/GlaucomaProject/sample_data/chr3_6_11_20.fa
hisat2_extract_splice_sites.py ~/GlaucomaProject/sample_data/Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf > splicesites.tsv
hisat2_extract_exons.py ~/GlaucomaProject/sample_data/Homo_sapiens.GHCh38.99.chr_3_6_11_20.gtf > exons.tsv
hisat2-build -p 1 --ss splicesites.tsv --exon exons.tsv chr3_6_11_20.fa chr3_6_11_20
# Memory dump while indexing (Ran out of memory; automatically trying more memory-economical parameters).!!