Glaucoma-variant-calling/Samah at master · SamahElmahdi/Glaucoma-variant-calling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
## create the conda env if you have not already did
#conda create -y --name ngs1 python=3.6
## Activating the enviornment
conda activate ngs1
## Creating a directory for the tutorial
mkdir glaucomaproject && cd glaucomaproject
## Downloading and installing the SRA Toolkit
wget "http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz"
tar -xzf sratoolkit.current-centos_linux64.tar.gz
sudo apt install sra-toolkit
## Creating a directory for the sampledata
mkdir sample_data && cd sample_data
## subsetting of data and splitting of reads using SRA Toolkit
fastq-dump -I --split-files -X 8000000 SRR5858162
## Installing fastqc & multiqc
#conda install -c bioconda fastqc
## Creating a directory for the FASTQC
mkdir -p ~/glaucomaproject/FASTQC_tut && cd ~/glaucomaproject/FASTQC_tut
## Run the FASTQC for each read
for f in ~/glaucomaproject/sample_data/*.fastq;do fastqc -t 1 -f fastq -noextract $f;done
cd glaucomaproject/sample_data
## Downloading the reference chromosome chr20
wget ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
gunzip -k Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz
## Downloading whole genome annotation file
wget ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.chr.gtf.gz
gunzip -k Homo_sapiens.GRCh38.99.chr.gtf.gz
## Select only chr20 annotaion
grep "^#" Homo_sapiens.GRCh38.99.chr.gtf > Homo_sapiens.GRCh38.99.chr_chr20.gtf
grep "^20" Homo_sapiens.GRCh38.99.chr.gtf >> Homo_sapiens.GRCh38.99.chr_chr20.gtf
## Installing Hisat aligner
#conda install -c bioconda -y hisat2
## Indexing
mkdir -p ~/glaucomaproject/hisat_align/hisatIndex && cd ~/glaucomaproject/hisat_align/hisatIndex
ln -s ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.dna.chromosome.20.fa .
hisat2_extract_splice_sites.py ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.99.chr_chr20.gtf > splicesites.tsv
hisat2_extract_exons.py ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.99.chr_chr20.gtf > exons.tsv
hisat2-build -p 1 --ss splicesites.tsv --exon exons.tsv Homo_sapiens.GRCh38.dna.chromosome.20.fa Homo_sapiens.GRCh38.dna.chromosome.20
## Creating a directory for the GATK
mkdir -p ~/glaucomaproject/GATK_tutorial && cd ~/glaucomaproject/GATK_tutorial
## Add Read group information and align all reads
R1="$HOME/glaucomaproject/sample_data/SRR5858162_1.fastq"
R2="$HOME/glaucomaproject/sample_data/SRR5858162_2.fastq"
myhisatindex="$HOME/glaucomaproject/hisat_align/hisatIndex/Homo_sapiens.GRCh38.dna.chromosome.20"
hisat2 -p 1 --rg ID:SRR5858162_F8_III.3 --rg SM:SRR5858162_F8_III.3 --rg PL:ILLUMINA --rg LB:SRR5858162_F8_III.3 \
--rg PU:SRR5858162_F8_III.3 -x $myhisatindex -1 $R1 -2 $R2 -S SRR5858162.sam
###Check the Alignment summary!
8000000 reads; of these:
  8000000 (100.00%) were paired; of these:
    7821340 (97.77%) aligned concordantly 0 times
    172034 (2.15%) aligned concordantly exactly 1 time
    6626 (0.08%) aligned concordantly >1 times
    ----
    7821340 pairs aligned concordantly 0 times; of these:
      2960 (0.04%) aligned discordantly 1 time
    ----
    7818380 pairs aligned 0 times concordantly or discordantly; of these:
      15636760 mates make up the pairs; of these:
        15549528 (99.44%) aligned 0 times
        76157 (0.49%) aligned exactly 1 time
        11075 (0.07%) aligned >1 times
2.82% overall alignment rate
## installing Samtools
#conda install -y samtools
## generating & sorting BAM file
samtools view -hbo SRR5858162.bam SRR5858162.sam
samtools sort SRR5858162.bam -o SRR5858162.sorted.bam
## Installing Picard tools
conda install -c bioconda picard
picard_path=$CONDA_PREFIX/share/picard-* ## 2.21.7-0
## mapping QC
samtools depth SRR5858162.sorted.bam| awk '{{sum+=$3}} END {{print "Average = ",sum/NR, "No of covered Nuc = ", NR}}' > SRR5858162.cov
samtools flagstat SRR5858162.sorted.bam > SRR5858162.stat
## Marking duplicate
java  -Xmx2g -jar $picard_path/picard.jar MarkDuplicates INPUT=SRR5858162.sorted.bam OUTPUT=SRR5858162.dedup.bam \
METRICS_FILE=SRR5858162.metrics.txt
## GATK installing
conda install -c bioconda gatk4
## GATK indexing
## for sample
java -Xmx2g -jar $picard_path/picard.jar BuildBamIndex VALIDATION_STRINGENCY=LENIENT INPUT=SRR5858162.dedup.bam
## for reference
ln -s ~/glaucomaproject/sample_data/Homo_sapiens.GRCh38.dna.chromosome.20.fa .
java -Xmx2g -jar $picard_path/picard.jar CreateSequenceDictionary R=Homo_sapiens.GRCh38.dna.chromosome.20.fa \
O=Homo_sapiens.GRCh38.dna.chromosome.20.dict
samtools faidx Homo_sapiens.GRCh38.dna.chromosome.20.fa
## Downloading known varinats for chr20
ftp://ftp.ensembl.org/pub/release-99/variation/vcf/homo_sapiens/homo_sapiens-chr20.vcf.gz
gunzip -k homo_sapiens-chr20.vcf.gz
gatk IndexFeatureFile -I homo_sapiens-chr20.vcf
## Recalibrate Bases BQSR
gatk --java-options "-Xmx2G" BaseRecalibrator \
-R Homo_sapiens.GRCh38.dna.chromosome.20.fa -I SRR5858162.dedup.bam --known-sites homo_sapiens-chr20.vcf \
-O SRR5858162.report
gatk --java-options "-Xmx2G" ApplyBQSR \
-R Homo_sapiens.GRCh38.dna.chromosome.20.fa -I SRR5858162.dedup.bam -bqsr SRR5858162.report \
-O SRR5858162.bqsr.bam --add-output-sam-program-record --emit-original-quals
## Joint variant calling using HaplotypeCaller
## Call germline SNPs and indels via local re-assembly of haplotypes
## assess genotype likelihood per-sample
gatk --java-options "-Xmx2G" HaplotypeCaller  -R Homo_sapiens.GRCh38.dna.chromosome.20.fa \
-I SRR5858162.bqsr.bam --emit-ref-confidence GVCF  --pcr-indel-model NONE  -O SRR5858162.gvcf
## 2 samples (SRR5858204.gvcf & RR5858157.gvcf) and 2 controls (SRR5858160.gvcf & SRR5858161.gvcf) which done by my colleagues \
was uploaded to the machine for the compination with the fifth one ( sample (SRR5858162.gvcf)).
## combine samples
gatk --java-options "-Xmx2G" CombineGVCFs -R Homo_sapiens.GRCh38.dna.chromosome.20.fa -V SRR5858162.gvcf \
-V SRR5858204.gvcf -V SRR5858157.gvcf -V SRR5858160.gvcf -V SRR5858161.gvcf -O raw_variants.gvcf
## Joint Genotyping with annotated output
gatk --java-options "-Xmx60G" GenotypeGVCFs -R Homo_sapiens.GRCh38.dna.chromosome.20.fa \
-V raw_variants.gvcf --max-alternate-alleles 2 --dbsnp homo_sapiens-chr20.vcf -O raw_variants_ann.vcf
## check how many variant got annotated
grep -v "^#" raw_variants_ann.vcf | awk '{print $3}' | grep "^rs" | wc -l
###10900 known variants
## check no of all vaiants
grep -v "^#" raw_variants_ann.vcf | awk '{print $3}' | wc -l
###35472 all variants
## VCF statitics
## Index the VCF file
# conda install -c bioconda tabix
bgzip -c raw_variants_ann.vcf > raw_variants_ann.vcf.gz
tabix -p vcf raw_variants_ann.vcf.gz
## Calc some stats about your vcf
# conda install -c bioconda rtg-tools
rtg vcfstats raw_variants_ann.vcf.gz > stats.txt
## Split SNPs and indels
gatk --java-options "-Xmx2G" SelectVariants -R Homo_sapiens.GRCh38.dna.chromosome.20.fa \
-V raw_variants_ann.vcf --select-type-to-include SNP -O raw_variants_ann_SNP.vcf
gatk --java-options "-Xmx2G" SelectVariants -R Homo_sapiens.GRCh38.dna.chromosome.20.fa \
-V raw_variants_ann.vcf --select-type-to-include INDEL -O raw_variants_ann_INDEL.vcf
## Assess the different filters in both known and novel
for var in "SNP" "INDEL";do
 input="raw_variants_ann_"$var".vcf"
 for filter in "QD" "MQ" "MQRankSum" "FS" "SOR" "ReadPosRankSum" "AN" "DP";do
  filterValues=$var.$filter
  awk -v k="$filter=" '!/#/{n=split($8,a,";"); for(i=1;i<=n;i++) if(a[i]~"^"k) {sub(k,$3" ",a[i]); print a[i]}}' $input > $filterValues
  grep -v "^\." $filterValues > known.$var.$filter
  grep "^\." $filterValues > novel.$var.$filter
done; done
## creating directory for filters
mkdir filters && cd filters
mv ../{*.SNP.*,SNP.*,*.INDEL.*,INDEL.*} .
# downloading densityCurves.R script and installing plotting packages
wget https://raw.githubusercontent.com/dib-lab/dogSeq/master/scripts/densityCurves.R
Rscript -e "install.packages('ggplot2', contriburl=contrib.url('http://cran.r-project.org/'))"
# plotting Figures
for f in SNP.* INDEL.*;do
  Rscript densityCurves.R "$f"
done