@@ -2,11 +2,13 @@ version 1.0
22
33workflow Glimpse2LowPassImputation {
44 input {
5- String pipeline_version = "0.0.3"
5+ String pipeline_version = "0.0.4"
6+
7+ # List of files, one per line
68
79 Array [String ] contigs
810
9- # this is the path the a directory that contains sites vcf, sites tabke , and reference chunks file. should end with a "/
11+ # this is the path to a directory that contains the sites vcf, sites table, and reference chunks file. It should end with a "/"
1012 String reference_panel_prefix
1113
1214 File ? input_vcf
@@ -134,11 +136,25 @@ workflow Glimpse2LowPassImputation {
134136 docker = glimpse_docker
135137 }
136138 Array [File ] contig_coverage_metrics = select_all (GlimpsePhase .coverage_metrics )
139+
140+ call SelectVariantRecordsOnly {
141+ input :
142+ vcf = GlimpseLigate .imputed_vcf ,
143+ vcf_index = GlimpseLigate .imputed_vcf_index ,
144+ basename = output_basename + "." + contig + ".imputed.only_variants" ,
145+ }
146+
147+ call CreateHomRefSitesOnlyVcf {
148+ input :
149+ vcf = GlimpseLigate .imputed_vcf ,
150+ vcf_index = GlimpseLigate .imputed_vcf_index ,
151+ basename = output_basename + "." + contig + ".imputed.only_hom_ref.sites_only" ,
152+ }
137153 }
138154
139155 call GatherVcfsNoIndex {
140156 input :
141- input_vcfs = GlimpseLigate . imputed_vcf ,
157+ input_vcfs = SelectVariantRecordsOnly . output_vcf ,
142158 output_vcf_basename = output_basename + ".imputed" ,
143159 gatk_docker = gatk_docker
144160 }
@@ -150,6 +166,20 @@ workflow Glimpse2LowPassImputation {
150166 preemptible = 0
151167 }
152168
169+ call GatherVcfsNoIndex as GatherVcfsNoIndexHomRefOnly {
170+ input :
171+ input_vcfs = CreateHomRefSitesOnlyVcf .output_vcf ,
172+ output_vcf_basename = output_basename + ".imputed.hom_ref_sites_only" ,
173+ gatk_docker = gatk_docker
174+ }
175+
176+ call CreateVcfIndexAndMd5 as CreateVcfIndexAndMd5HomRefOnly {
177+ input :
178+ vcf_input = GatherVcfsNoIndexHomRefOnly .output_vcf ,
179+ gatk_docker = gatk_docker ,
180+ preemptible = 0
181+ }
182+
153183 Array [File ] genome_coverage_metrics = flatten (contig_coverage_metrics )
154184 if (length (genome_coverage_metrics ) > 0 ) {
155185 call CombineCoverageMetrics {
@@ -171,6 +201,10 @@ workflow Glimpse2LowPassImputation {
171201 File imputed_vcf_index = CreateVcfIndexAndMd5 .output_vcf_index
172202 File imputed_vcf_md5sum = CreateVcfIndexAndMd5 .output_vcf_md5sum
173203
204+ File imputed_hom_ref_sites_only_vcf = CreateVcfIndexAndMd5HomRefOnly .output_vcf
205+ File imputed_hom_ref_sites_only_vcf_inex = CreateVcfIndexAndMd5HomRefOnly .output_vcf_index
206+ File imputed_hom_ref_sites_only_vcf_md5 = CreateVcfIndexAndMd5HomRefOnly .output_vcf_md5sum
207+
174208 File qc_metrics = CollectQCMetrics .qc_metrics
175209 File ? coverage_metrics = CombineCoverageMetrics .coverage_metrics
176210 }
@@ -214,6 +248,7 @@ task SplitIntoBatches {
214248 disks : "local-disk 10 HDD"
215249 memory : "1 GiB"
216250 preemptible : 3
251+ noAddress : true
217252 }
218253
219254 output {
@@ -249,6 +284,7 @@ task ComputeShardsAndMemoryPerShard {
249284
250285 runtime {
251286 docker : "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
287+ noAddress : true
252288 }
253289
254290 output {
@@ -297,6 +333,7 @@ task BcftoolsMpileup {
297333 cpu : cpu
298334 preemptible : preemptible
299335 maxRetries : max_retries
336+ noAddress : true
300337 }
301338
302339 output {
@@ -332,6 +369,7 @@ task BcftoolsCall {
332369 cpu : cpu
333370 preemptible : preemptible
334371 maxRetries : max_retries
372+ noAddress : true
335373 }
336374
337375 output {
@@ -366,6 +404,7 @@ task BcftoolsNorm {
366404 cpu : cpu
367405 preemptible : preemptible
368406 maxRetries : max_retries
407+ noAddress : true
369408 }
370409
371410 output {
@@ -401,6 +440,7 @@ task BcftoolsMerge {
401440 cpu : cpu
402441 preemptible : preemptible
403442 maxRetries : max_retries
443+ noAddress : true
404444 }
405445
406446 output {
@@ -515,6 +555,7 @@ task GlimpsePhase {
515555 preemptible : preemptible
516556 maxRetries : max_retries
517557 checkpointFile : "checkpoint.bin"
558+ noAddress : true
518559 }
519560
520561 output {
@@ -551,6 +592,7 @@ task GlimpseLigate {
551592 bcftools view -h --no-version ligated.vcf.gz > old_header.vcf
552593 java -jar /picard.jar UpdateVcfSequenceDictionary -I old_header.vcf --SD ~{ref_dict } -O new_header.vcf
553594 bcftools reheader -h new_header.vcf -o ~{output_basename }.imputed.vcf.gz ligated.vcf.gz
595+ tabix ~{output_basename }.imputed.vcf.gz
554596 >>>
555597
556598 runtime {
@@ -560,10 +602,12 @@ task GlimpseLigate {
560602 cpu : cpu
561603 preemptible : preemptible
562604 maxRetries : max_retries
605+ noAddress : true
563606 }
564607
565608 output {
566609 File imputed_vcf = "~{output_basename }.imputed.vcf.gz"
610+ File imputed_vcf_index = "~{output_basename }.imputed.vcf.gz.tbi"
567611 }
568612}
569613
@@ -572,10 +616,10 @@ task CollectQCMetrics {
572616 File imputed_vcf
573617 String output_basename
574618
575- Int preemptible = 1
619+ Int preemptible = 0
576620 String docker = "hailgenetics/hail:0.2.126-py3.11"
577621 Int cpu = 4
578- Int mem_gb = 16
622+ Int mem_gb = 8
579623 }
580624
581625 parameter_meta {
@@ -584,7 +628,7 @@ task CollectQCMetrics {
584628 }
585629 }
586630
587- Int disk_size_gb = 100
631+ Int disk_size_gb = ceil ( 2 * size ( imputed_vcf , "GiB" ) + 50 )
588632
589633 command <<<
590634 set -euo pipefail
@@ -612,6 +656,7 @@ task CollectQCMetrics {
612656 memory : mem_gb + " GiB"
613657 cpu : cpu
614658 preemptible : preemptible
659+ noAddress : true
615660 }
616661
617662 output {
@@ -638,6 +683,7 @@ task CountSamples {
638683 disks : "local-disk ${disk_size_gb } HDD"
639684 memory : "${memory_mb } MiB"
640685 cpu : cpu
686+ noAddress : true
641687 }
642688 output {
643689 Int nSamples = read_int (stdout ())
@@ -680,7 +726,8 @@ task CombineCoverageMetrics
680726 >>>
681727
682728 runtime {
683- docker : "ubuntu:24.04"
729+ docker : "us.gcr.io/broad-dsde-methods/ubuntu:20.04"
730+ noAddress : true
684731 }
685732
686733 output {
@@ -734,15 +781,16 @@ task CreateVcfIndexAndMd5 {
734781 Int preemptible = 3
735782 }
736783
737- String vcf_basename = basename (vcf_input )
784+ String vcf_basename = basename (vcf_input , ".vcf.gz" )
738785
739786 command <<<
740787 set -e -o pipefail
741788
742- ln -sf ~{vcf_input } ~{vcf_basename }
789+ ln -sf ~{vcf_input } ~{vcf_basename }.vcf.gz
790+
791+ bcftools index -t ~{vcf_basename }.vcf.gz
743792
744- bcftools index -t ~{vcf_basename }
745- md5sum ~{vcf_basename } | awk '{ print $1 }' > ~{vcf_basename }.md5sum
793+ md5sum ~{vcf_basename }.vcf.gz | awk '{ print $1 }' > ~{vcf_basename }.md5sum
746794 >>>
747795 runtime {
748796 docker : gatk_docker
@@ -754,8 +802,86 @@ task CreateVcfIndexAndMd5 {
754802 noAddress : true
755803 }
756804 output {
757- File output_vcf = "~{vcf_basename }"
758- File output_vcf_index = "~{vcf_basename }.tbi"
805+ File output_vcf = "~{vcf_basename }.vcf.gz "
806+ File output_vcf_index = "~{vcf_basename }.vcf.gz. tbi"
759807 File output_vcf_md5sum = "~{vcf_basename }.md5sum"
760808 }
761809}
810+
# Writes a bgzipped copy of the input VCF that keeps only the records matching
# the bcftools expression GT[*]="alt", i.e. records with an alternate-allele
# genotype; hom-ref-only records are dropped.
task SelectVariantRecordsOnly {
  input {
    # VCF to filter.
    File vcf
    # Index of `vcf`. The command below never references it directly; it is
    # declared so that it is localized together with the VCF.
    File vcf_index
    # Output file name without the ".vcf.gz" extension.
    String basename

    # Room for the input plus the (at most equally large) output, plus slack.
    Int disk_size_gb = ceil(2 * size(vcf, "GiB")) + 10
    Int cpu = 1
    Int memory_mb = 3000
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
  }

  command <<<
    set -e -o pipefail

    # keep alt sites (i.e. remove hom ref sites)
    bcftools view -i 'GT[*]="alt"' -Oz -o ~{basename}.vcf.gz ~{vcf}
  >>>

  runtime {
    docker: gatk_docker
    disks: "local-disk ~{disk_size_gb} SSD"
    memory: "~{memory_mb} MiB"
    cpu: cpu
    maxRetries: 1
    preemptible: 3
    noAddress: true
  }

  output {
    # Bgzipped VCF containing only the selected records (no index is produced here).
    File output_vcf = "~{basename}.vcf.gz"
  }
}
846+
# Builds a bgzipped sites-only VCF (first 8 columns, no FORMAT/sample columns)
# from the records of the input VCF that do NOT match the bcftools expression
# GT[*]="alt", i.e. the records excluded by the alt-genotype filter.
task CreateHomRefSitesOnlyVcf {
  input {
    # VCF to extract hom-ref sites from.
    File vcf
    # Index of `vcf`. The command below never references it directly; it is
    # declared so that it is localized together with the VCF.
    File vcf_index
    # Output file name without the ".vcf.gz" extension.
    String basename

    # Room for the input plus the uncompressed intermediate/output, plus slack.
    Int disk_size_gb = ceil(2 * size(vcf, "GiB")) + 10
    Int cpu = 1
    Int memory_mb = 6000
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
  }

  command <<<
    set -e -o pipefail

    # create header with only first 8 columns and store that:
    # the "##" meta lines are copied verbatim, while the "#CHROM ..." column
    # line is truncated to its first 8 tab-separated fields.
    bcftools view -h ~{vcf} | grep "^##" > ~{basename}.vcf
    bcftools view -h ~{vcf} | grep -v "^##" | cut -f1-8 >> ~{basename}.vcf

    # append first 8 columns of hom ref sites to previously stored header
    bcftools query -e 'GT[*]="alt"' -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%INFO\n' ~{vcf} >> ~{basename}.vcf

    # bgzip replaces <basename>.vcf with <basename>.vcf.gz
    bgzip ~{basename}.vcf
  >>>

  runtime {
    docker: gatk_docker
    disks: "local-disk ~{disk_size_gb} SSD"
    memory: "~{memory_mb} MiB"
    cpu: cpu
    maxRetries: 1
    preemptible: 3
    noAddress: true
  }

  output {
    # Bgzipped sites-only VCF of the hom-ref records (no index is produced here).
    File output_vcf = "~{basename}.vcf.gz"
  }
}
0 commit comments