Tips from here: https://medium.com/computational-biology/how-to-download-all-bacterial-assemblies-from-ncbi-35f4bc5435f9.
mkdir -p /lizardfs/guarracino/seqwish-paper/bacteria/assemblies
cd /lizardfs/guarracino/seqwish-paper/bacteria/assembliesDownload assembly information:
wget ftp://ftp.ncbi.nih.gov/genomes/genbank/bacteria/assembly_summary.txt && pigz assembly_summary.txt
# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for column descriptionsGet IDs, species, and links for Reference/representative genome, latest and Complete Genome assemblies coming from Full data:
zcat assembly_summary.txt.gz | awk -F '\t' '{if($11=="latest" && $12=="Complete Genome" && $14=="Full" && $20 != "na") print $1"\t"$8"\t"$20 }' | pigz -c > assembly.bacteria.txt.gzTop species with more assemblies:
zcat assembly.bacteria.txt.gz | cut -f 2 | cut -f 1,2 -d ' ' | sort | uniq -c | sort -k 1nr | head -n 15
2621 Escherichia coli + 1 synthetic Escherichia coli C321.deltaA
1596 Salmonella enterica
1179 Klebsiella pneumoniae
1067 Staphylococcus aureus
754 Bordetella pertussis
408 Enterococcus faecalis
375 Pseudomonas aeruginosa
312 Acinetobacter baumannii
307 Mycobacterium tuberculosis
282 Campylobacter jejuni
273 Listeria monocytogenes
253 Streptococcus pyogenes
250 Helicobacter pylori
240 Chlamydia trachomatis
239 Enterococcus faeciumPrepare the URLs for a few species:
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
mkdir -p $gspecies
zgrep "$genus_species" assembly.bacteria.txt.gz | cut -f 3 | while read f; do
# Slow: name=$(basename $f);
# Slow: name=$(echo $f | rev | cut -f 1 -d '/' | rev)
# Slow: name=$(echo $f | grep -o '[^/]*$')
name=$(echo $f | awk -F '/' '{print $NF}')
echo $f/${name}_genomic.fna.gz >> $gspecies/$gspecies.urls;
done
doneDownload the assemblies:
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
sbatch -p lowmem --wrap "cd /lizardfs/guarracino/seqwish-paper/bacteria/assemblies/'$gspecies'; cat *.urls | parallel -j 16 'wget -q {} && echo got {}'"
done
#sbatch -p headnode -w octopus01 --wrap 'zcat assembly.bacteria.txt.gz | cut -f 3 | while read f; do echo $f; name=$(basename $f); wget -c $f/${name}_genomic.fna.gz; done'
# Check integrity
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
ls $gspecies/*gz | while read f; do gzip -t $f; done
doneTrim headers and add prefixes:
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
cd $gspecies
ls *.fna.gz | while read f; do
prefix=$(echo $f | cut -f 1,2 -d '_');
echo $prefix
# `cut -f 1` to trim the headers
~/tools/fastix/target/release/fastix-331c1159ea16625ee79d1a82522e800c99206834 -p "${prefix}#" <(zcat $f | cut -f 1) | bgzip -@ 48 -c > $prefix.fa.gz;
samtools faidx $prefix.fa.gz
done
cd ..
donePut all together:
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
cd $gspecies
num_assemblies=$(ls *.fa.gz | wc -l)
zcat *.fa.gz | cut -f 1 | bgzip -@ 48 -c > $gspecies${num_assemblies}.fasta.gz; samtools faidx $gspecies${num_assemblies}.fasta.gz
cd ..
doneNumber of total contigs (1st column) for each species:
wc */*fasta.gz.fai
8660 43300 440033 ecoli/ecoli2622.fasta.gz.fai
292 1460 14683 hpylori/hpylori250.fasta.gz.fai
5094 25470 256697 kpneumoniae/kpneumoniae1179.fasta.gz.fai
3361 16805 170949 senterica/senterica1596.fasta.gz.fai
17407 87035 882362 totalDistances:
# guix install mash
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
cd $gspecies
ls *.fa.gz | while read f; do mash sketch $f; done
mash triangle *.fa.gz >$gspecies.mash_triangle.txt
cd ..
donesbatch -p headnode -c 1 --wrap 'bash /lizardfs/guarracino/seqwish-paper/bacteria/assemblies/mash_triangle.sh'
Top distances:
for genus_species in "Escherichia coli" "Salmonella enterica" "Klebsiella pneumoniae" "Helicobacter pylori"; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
sed 1,1d $gspecies/$gspecies.mash_triangle.txt | tr '\t' '\n' | grep GCA -v | grep e -v | sort -g -k 1nr | uniq | head -n 5
done
Escherichia coli
1
0.295981
0.263022
0.243761
0.23011
Salmonella enterica
1
0.295981
0.263022
0.243761
0.23011
Klebsiella pneumoniae
0.0653942
0.0645437
0.0642645
0.0639874
0.0637124
Helicobacter pylori
0.0794837
0.0786628
0.0782587
0.0759181
0.0755413Mapping:
for genus_species in "Helicobacter pylori" ; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
mkdir -p /lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies
ASSEMBLIES=/lizardfs/guarracino/seqwish-paper/bacteria/assemblies/$gspecies/*fasta.gz
NUM_HAPLOTYPES=$(cut -f 1 -d '#' $ASSEMBLIES.fai | sort | uniq | wc -l)
FILENAME=$(basename $ASSEMBLIES .fasta.gz)
for s in 5k 10k; do
for p in 98 95 90; do
s_no_k=${s::-1}
l_no_k=$(echo $s_no_k '*' 3 | bc)
l=${l_no_k}k
APPROX_PAF=/lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.approx.paf
sbatch -p workers -c 48 --job-name $gspecies --wrap 'hostname; cd /scratch; \time -v ~/tools/wfmash/build/bin/wfmash-948f1683d14927745aef781cdabeb66ac6c7880b '$ASSEMBLIES' '$ASSEMBLIES' -X -s '$s' -l '$l' -p '$p' -n '$NUM_HAPLOTYPES' -t 48 -m > '${APPROX_PAF}
done
done
doneSplit mappings:
for genus_species in "Helicobacter pylori" ; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
ASSEMBLIES=/lizardfs/guarracino/seqwish-paper/bacteria/assemblies/$gspecies/*fasta.gz
NUM_HAPLOTYPES=$(cut -f 1 -d '#' $ASSEMBLIES.fai | sort | uniq | wc -l)
FILENAME=$(basename $ASSEMBLIES .fasta.gz)
for s in 5k 10k; do
for p in 98 95 90; do
s_no_k=${s::-1}
l_no_k=$(echo $s_no_k '*' 3 | bc)
l=${l_no_k}k
APPROX_PAF=/lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.approx.paf
python3 ~/tools/wfmash/scripts/split_approx_mappings_in_chunks.py $APPROX_PAF 10
done
done
doneAlignment:
for genus_species in "Helicobacter pylori" ; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
ASSEMBLIES=/lizardfs/guarracino/seqwish-paper/bacteria/assemblies/$gspecies/*fasta.gz
NUM_HAPLOTYPES=$(cut -f 1 -d '#' $ASSEMBLIES.fai | sort | uniq | wc -l)
FILENAME=$(basename $ASSEMBLIES .fasta.gz)
for s in 5k 10k; do
for p in 98 95 90; do
s_no_k=${s::-1}
l_no_k=$(echo $s_no_k '*' 3 | bc)
l=${l_no_k}k
seq 0 9 | while read i; do
APPROX_PAF=/lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.approx.paf.chunk_$i.paf
UNFILTERED_PAF=/lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.chunk_$i.paf.gz
sbatch -p workers -c 48 --job-name $gspecies --wrap 'hostname; cd /scratch; \time -v ~/tools/wfmash/build/bin/wfmash-948f1683d14927745aef781cdabeb66ac6c7880b '$ASSEMBLIES' '$ASSEMBLIES' -X -s '$s' -l '$l' -p '$p' -n '$NUM_HAPLOTYPES' -t 48 -i '$APPROX_PAF' | pigz -c > '$UNFILTERED_PAF
done
done
done
doneFiltering:
for genus_species in "Helicobacter pylori" ; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
ASSEMBLIES=/lizardfs/guarracino/seqwish-paper/bacteria/assemblies/$gspecies/*fasta.gz
NUM_HAPLOTYPES=$(cut -f 1 -d '#' $ASSEMBLIES.fai | sort | uniq | wc -l)
FILENAME=$(basename $ASSEMBLIES .fasta.gz)
for s in 5k 10k; do
for p in 98 95 90; do
s_no_k=${s::-1}
l_no_k=$(echo $s_no_k '*' 3 | bc)
l=${l_no_k}k
p_threshold=$(echo "scale=2; $p/100.0" | bc)
seq 0 9 | while read i; do
echo $s $p $i
UNFILTERED_PAF=/lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.chunk_$i.paf.gz
PAF=/lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.chunk_$i.filtered.paf.gz
zcat $UNFILTERED_PAF | awk -v p=$p_threshold '{split($13, gi, /:/); if(gi[3] >= p) {print $0}}' | pigz -c > $PAF
done
done
done
donefor genus_species in "Helicobacter pylori" ; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
if [ $gspecies == "hpylori" ]; then
LOGSECS=1 # Graph induction is fast because the dataset is small
else
LOGSECS=10
fi
mkdir -p /lizardfs/guarracino/seqwish-paper/bacteria/graphs/$gspecies
ASSEMBLIES=/lizardfs/guarracino/seqwish-paper/bacteria/assemblies/$gspecies/*fasta.gz
NUM_HAPLOTYPES=$(cut -f 1 -d '#' $ASSEMBLIES.fai | sort | uniq | wc -l)
FILENAME=$(basename $ASSEMBLIES .fasta.gz)
for s in 10k 5k; do
for p in 98 95 90; do
s_no_k=${s::-1}
l_no_k=$(echo $s_no_k '*' 3 | bc)
l=${l_no_k}k
PAFS=$(ls /lizardfs/guarracino/seqwish-paper/bacteria/alignment/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.chunk_*.filtered.paf.gz | tr '\n' ',')
PAFS=${PAFS::-1}
for k in 0 11 29 49 79 127 179 229 311; do
for B in 10M 50M; do
GFA=/scratch/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.k$k.B$B.gfa
LOG=/scratch/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.k$k.B$B.size.log
sbatch -p 386mem -c 48 --job-name $gspecies --wrap 'bash /lizardfs/guarracino/seqwish-paper/scripts/seqwish_with_logging.sh '$ASSEMBLIES' '$PAFS' '$GFA' '$k' '$B' '$LOG' '$LOGSECS'; mv '$GFA' /lizardfs/guarracino/seqwish-paper/bacteria/graphs/'$gspecies'; mv '$LOG' /lizardfs/guarracino/seqwish-paper/logs/'
done
done
done
done
donefor genus_species in "Helicobacter pylori" ; do
echo $genus_species
genus_species_lower=$(echo $genus_species | tr '[:upper:]' '[:lower:]')
g=$(echo $genus_species_lower | cut -f 1 -d ' ')
g=${g:0:1} # fist letter
species=$(echo $genus_species_lower | cut -f 2 -d ' ')
gspecies=$(echo $g$species)
if [ $gspecies == "hpylori" ]; then
LOGSECS=1 # Graph induction is fast because the dataset is small
else
LOGSECS=10
fi
mkdir -p /lizardfs/guarracino/seqwish-paper/bacteria/graphs/$gspecies
ASSEMBLIES=/lizardfs/guarracino/seqwish-paper/bacteria/assemblies/$gspecies/*fasta.gz
NUM_HAPLOTYPES=$(cut -f 1 -d '#' $ASSEMBLIES.fai | sort | uniq | wc -l)
FILENAME=$(basename $ASSEMBLIES .fasta.gz)
for s in 5k 10k; do
for p in 98 95 90; do
s_no_k=${s::-1}
l_no_k=$(echo $s_no_k '*' 3 | bc)
l=${l_no_k}k
for k in 0 11 29 49 79 127 179 229 311; do
for B in 10M 50M; do
GFA=/lizardfs/guarracino/seqwish-paper/bacteria/graphs/$gspecies/$FILENAME.s$s.l$l.p$p.n${NUM_HAPLOTYPES}.k$k.B$B.gfa
sbatch -p workers -c 24 --job-name ${gspecies}_stats --wrap 'hostname; cd /scratch && ~/tools/odgi/bin/odgi-67a7e5bb2f328888e194845a362cef9c8ccc488f stats -i '$GFA' -S -b -L -W -t 24 -P > '$GFA'.og.stats.txt';
done
done
done
done
done
# Compress GFA files
ls /lizardfs/guarracino/seqwish-paper/bacteria/graphs/hpylori/*.gfa | while read f; do echo $f; pigz $f; done