Skip to content

Commit 9c73323

Browse files
jsotobroadJose Sotoactions-user
authored
TSPS-846 move imputed hom ref sites to new sites only vcf output (#1806)
* low pass wgs imputation - split out hom ref sites into their own sites only vcf --------- Co-authored-by: Jose Soto <jsoto@broadinstitute.org> Co-authored-by: GitHub Action <action@github.com>
1 parent 6072ef1 commit 9c73323

File tree

3 files changed

+145
-14
lines changed

3 files changed

+145
-14
lines changed

pipeline_versions.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ BuildIndices 5.1.0 2026-02-13
44
CramToUnmappedBams 1.1.3 2024-08-02
55
ExomeGermlineSingleSample 3.2.7 2026-01-21
66
ExomeReprocessing 3.3.7 2026-01-21
7-
Glimpse2LowPassImputation 0.0.3 2026-03-25
7+
Glimpse2LowPassImputation 0.0.4 2026-04-01
88
IlluminaGenotypingArray 1.12.27 2026-01-21
99
Imputation 1.1.23 2025-10-03
1010
ImputationBeagle 3.0.1 2026-02-23

pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# 0.0.4
2+
2026-04-01 (Date of Last Commit)
3+
4+
* Split out imputed hom-ref sites into their own sites-only VCF output file
5+
16
# 0.0.3
27
2026-03-25 (Date of Last Commit)
38

pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl

Lines changed: 139 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@ version 1.0
22

33
workflow Glimpse2LowPassImputation {
44
input {
5-
String pipeline_version = "0.0.3"
5+
String pipeline_version = "0.0.4"
6+
7+
# List of files, one per line
68
79
Array[String] contigs
810

9-
# this is the path the a directory that contains sites vcf, sites tabke, and reference chunks file. should end with a "/
11+
# this is the path to a directory that contains the sites vcf, sites table, and reference chunks file. should end with a "/"
1012
String reference_panel_prefix
1113

1214
File? input_vcf
@@ -134,11 +136,25 @@ workflow Glimpse2LowPassImputation {
134136
docker = glimpse_docker
135137
}
136138
Array[File] contig_coverage_metrics = select_all(GlimpsePhase.coverage_metrics)
139+
140+
call SelectVariantRecordsOnly {
141+
input:
142+
vcf = GlimpseLigate.imputed_vcf,
143+
vcf_index = GlimpseLigate.imputed_vcf_index,
144+
basename = output_basename + "." + contig + ".imputed.only_variants",
145+
}
146+
147+
call CreateHomRefSitesOnlyVcf {
148+
input:
149+
vcf = GlimpseLigate.imputed_vcf,
150+
vcf_index = GlimpseLigate.imputed_vcf_index,
151+
basename = output_basename + "." + contig + ".imputed.only_hom_ref.sites_only",
152+
}
137153
}
138154
139155
call GatherVcfsNoIndex {
140156
input:
141-
input_vcfs = GlimpseLigate.imputed_vcf,
157+
input_vcfs = SelectVariantRecordsOnly.output_vcf,
142158
output_vcf_basename = output_basename + ".imputed",
143159
gatk_docker = gatk_docker
144160
}
@@ -150,6 +166,20 @@ workflow Glimpse2LowPassImputation {
150166
preemptible = 0
151167
}
152168
169+
call GatherVcfsNoIndex as GatherVcfsNoIndexHomRefOnly {
170+
input:
171+
input_vcfs = CreateHomRefSitesOnlyVcf.output_vcf,
172+
output_vcf_basename = output_basename + ".imputed.hom_ref_sites_only",
173+
gatk_docker = gatk_docker
174+
}
175+
176+
call CreateVcfIndexAndMd5 as CreateVcfIndexAndMd5HomRefOnly {
177+
input:
178+
vcf_input = GatherVcfsNoIndexHomRefOnly.output_vcf,
179+
gatk_docker = gatk_docker,
180+
preemptible = 0
181+
}
182+
153183
Array[File] genome_coverage_metrics = flatten(contig_coverage_metrics)
154184
if (length(genome_coverage_metrics) > 0) {
155185
call CombineCoverageMetrics {
@@ -171,6 +201,10 @@ workflow Glimpse2LowPassImputation {
171201
File imputed_vcf_index = CreateVcfIndexAndMd5.output_vcf_index
172202
File imputed_vcf_md5sum = CreateVcfIndexAndMd5.output_vcf_md5sum
173203

204+
File imputed_hom_ref_sites_only_vcf = CreateVcfIndexAndMd5HomRefOnly.output_vcf
205+
File imputed_hom_ref_sites_only_vcf_inex = CreateVcfIndexAndMd5HomRefOnly.output_vcf_index
206+
File imputed_hom_ref_sites_only_vcf_md5 = CreateVcfIndexAndMd5HomRefOnly.output_vcf_md5sum
207+
174208
File qc_metrics = CollectQCMetrics.qc_metrics
175209
File? coverage_metrics = CombineCoverageMetrics.coverage_metrics
176210
}
@@ -214,6 +248,7 @@ task SplitIntoBatches {
214248
disks: "local-disk 10 HDD"
215249
memory: "1 GiB"
216250
preemptible: 3
251+
noAddress: true
217252
}
218253

219254
output {
@@ -249,6 +284,7 @@ task ComputeShardsAndMemoryPerShard {
249284

250285
runtime {
251286
docker : "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
287+
noAddress: true
252288
}
253289

254290
output {
@@ -297,6 +333,7 @@ task BcftoolsMpileup {
297333
cpu: cpu
298334
preemptible: preemptible
299335
maxRetries: max_retries
336+
noAddress: true
300337
}
301338

302339
output {
@@ -332,6 +369,7 @@ task BcftoolsCall {
332369
cpu: cpu
333370
preemptible: preemptible
334371
maxRetries: max_retries
372+
noAddress: true
335373
}
336374

337375
output {
@@ -366,6 +404,7 @@ task BcftoolsNorm {
366404
cpu: cpu
367405
preemptible: preemptible
368406
maxRetries: max_retries
407+
noAddress: true
369408
}
370409

371410
output {
@@ -401,6 +440,7 @@ task BcftoolsMerge {
401440
cpu: cpu
402441
preemptible: preemptible
403442
maxRetries: max_retries
443+
noAddress: true
404444
}
405445

406446
output {
@@ -515,6 +555,7 @@ task GlimpsePhase {
515555
preemptible: preemptible
516556
maxRetries: max_retries
517557
checkpointFile: "checkpoint.bin"
558+
noAddress: true
518559
}
519560

520561
output {
@@ -551,6 +592,7 @@ task GlimpseLigate {
551592
bcftools view -h --no-version ligated.vcf.gz > old_header.vcf
552593
java -jar /picard.jar UpdateVcfSequenceDictionary -I old_header.vcf --SD ~{ref_dict} -O new_header.vcf
553594
bcftools reheader -h new_header.vcf -o ~{output_basename}.imputed.vcf.gz ligated.vcf.gz
595+
tabix ~{output_basename}.imputed.vcf.gz
554596
>>>
555597

556598
runtime {
@@ -560,10 +602,12 @@ task GlimpseLigate {
560602
cpu: cpu
561603
preemptible: preemptible
562604
maxRetries: max_retries
605+
noAddress: true
563606
}
564607

565608
output {
566609
File imputed_vcf = "~{output_basename}.imputed.vcf.gz"
610+
File imputed_vcf_index = "~{output_basename}.imputed.vcf.gz.tbi"
567611
}
568612
}
569613

@@ -572,10 +616,10 @@ task CollectQCMetrics {
572616
File imputed_vcf
573617
String output_basename
574618

575-
Int preemptible = 1
619+
Int preemptible = 0
576620
String docker = "hailgenetics/hail:0.2.126-py3.11"
577621
Int cpu = 4
578-
Int mem_gb = 16
622+
Int mem_gb = 8
579623
}
580624

581625
parameter_meta {
@@ -584,7 +628,7 @@ task CollectQCMetrics {
584628
}
585629
}
586630

587-
Int disk_size_gb = 100
631+
Int disk_size_gb = ceil(2*size(imputed_vcf, "GiB") + 50)
588632

589633
command <<<
590634
set -euo pipefail
@@ -612,6 +656,7 @@ task CollectQCMetrics {
612656
memory: mem_gb + " GiB"
613657
cpu: cpu
614658
preemptible: preemptible
659+
noAddress: true
615660
}
616661

617662
output {
@@ -638,6 +683,7 @@ task CountSamples {
638683
disks: "local-disk ${disk_size_gb} HDD"
639684
memory: "${memory_mb} MiB"
640685
cpu: cpu
686+
noAddress: true
641687
}
642688
output {
643689
Int nSamples = read_int(stdout())
@@ -680,7 +726,8 @@ task CombineCoverageMetrics
680726
>>>
681727

682728
runtime {
683-
docker: "ubuntu:24.04"
729+
docker: "us.gcr.io/broad-dsde-methods/ubuntu:20.04"
730+
noAddress: true
684731
}
685732

686733
output {
@@ -734,15 +781,16 @@ task CreateVcfIndexAndMd5 {
734781
Int preemptible = 3
735782
}
736783

737-
String vcf_basename = basename(vcf_input)
784+
String vcf_basename = basename(vcf_input, ".vcf.gz")
738785

739786
command <<<
740787
set -e -o pipefail
741788
742-
ln -sf ~{vcf_input} ~{vcf_basename}
789+
ln -sf ~{vcf_input} ~{vcf_basename}.vcf.gz
790+
791+
bcftools index -t ~{vcf_basename}.vcf.gz
743792
744-
bcftools index -t ~{vcf_basename}
745-
md5sum ~{vcf_basename} | awk '{ print $1 }' > ~{vcf_basename}.md5sum
793+
md5sum ~{vcf_basename}.vcf.gz | awk '{ print $1 }' > ~{vcf_basename}.md5sum
746794
>>>
747795
runtime {
748796
docker: gatk_docker
@@ -754,8 +802,86 @@ task CreateVcfIndexAndMd5 {
754802
noAddress: true
755803
}
756804
output {
757-
File output_vcf = "~{vcf_basename}"
758-
File output_vcf_index = "~{vcf_basename}.tbi"
805+
File output_vcf = "~{vcf_basename}.vcf.gz"
806+
File output_vcf_index = "~{vcf_basename}.vcf.gz.tbi"
759807
File output_vcf_md5sum = "~{vcf_basename}.md5sum"
760808
}
761809
}
810+
811+
# Writes a bgzipped copy of the input VCF that keeps only "alt" records, i.e. the
# hom ref sites are removed. Selection is done with the bcftools expression
# GT[*]="alt" (presumably: a record is kept when any sample genotype carries an
# alternate allele -- confirm against the bcftools expression documentation).
task SelectVariantRecordsOnly {
  input {
    # VCF to filter.
    File vcf
    # Index of `vcf`. The command below does not reference it directly; it is kept
    # as an input so existing callers that pass it continue to work.
    File vcf_index
    # Output file name without the ".vcf.gz" suffix.
    String basename

    # Room for the input plus the filtered copy, with 10 GiB of headroom.
    Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 10
    Int cpu = 1
    Int memory_mb = 3000
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
  }

  command <<<
    set -e -o pipefail

    # keep alt sites (i.e. remove hom ref sites)
    bcftools view -i 'GT[*]="alt"' -Oz -o ~{basename}.vcf.gz ~{vcf}
  >>>

  runtime {
    docker: gatk_docker
    disks: "local-disk ${disk_size_gb} SSD"
    memory: "${memory_mb} MiB"
    cpu: cpu
    maxRetries: 1
    preemptible: 3
    noAddress: true
  }

  output {
    # Filtered, bgzip-compressed VCF (no index is produced by this task).
    File output_vcf = "~{basename}.vcf.gz"
  }
}
846+
847+
# Builds a sites-only, bgzipped VCF from the records of the input VCF that are NOT
# selected by the bcftools expression GT[*]="alt" (the hom ref sites). The output
# keeps the original "##" header lines, truncates the column header line to its
# first 8 columns, and writes only the first 8 columns of each retained record.
task CreateHomRefSitesOnlyVcf {
  input {
    # VCF to extract hom ref sites from.
    File vcf
    # Index of `vcf`. The command below does not reference it directly; it is kept
    # as an input so existing callers that pass it continue to work.
    File vcf_index
    # Output file name without the ".vcf.gz" suffix.
    String basename

    # Room for the input plus the uncompressed intermediate, with 10 GiB of headroom.
    Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 10
    Int cpu = 1
    Int memory_mb = 6000
    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
  }

  command <<<
    set -e -o pipefail

    # create header with only first 8 columns and store that
    bcftools view -h ~{vcf} | grep "^##" > ~{basename}.vcf
    bcftools view -h ~{vcf} | grep -v "^##" | cut -f1-8 >> ~{basename}.vcf

    # append first 8 columns of hom ref sites to previously stored header
    bcftools query -e 'GT[*]="alt"' -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\t%INFO\n' ~{vcf} >> ~{basename}.vcf

    # compress in place; bgzip replaces <basename>.vcf with <basename>.vcf.gz
    bgzip ~{basename}.vcf
  >>>

  runtime {
    docker: gatk_docker
    disks: "local-disk ${disk_size_gb} SSD"
    memory: "${memory_mb} MiB"
    cpu: cpu
    maxRetries: 1
    preemptible: 3
    noAddress: true
  }

  output {
    # Sites-only, bgzip-compressed VCF (no index is produced by this task).
    File output_vcf = "~{basename}.vcf.gz"
  }
}

0 commit comments

Comments
 (0)