Skip to content

Commit 5cff5ad

Browse files
authored by [author name lost in page extraction]
Allow missing vcf samples or gvcfs in Module00c (#207)
1 parent 6a2a7ba commit 5cff5ad

File tree

5 files changed

+105
-117
lines changed

5 files changed

+105
-117
lines changed

input_values/dockers.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
"sv_base_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-base:lint-24b9cda",
1414
"sv_base_mini_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:lint-24b9cda",
1515
"sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline-base:lint-24b9cda",
16-
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:lint-24b9cda",
16+
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-missing-baf-f14df3b",
1717
"sv_pipeline_qc_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline-qc:lint-24b9cda",
1818
"sv_pipeline_rdtest_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline-rdtest:lint-24b9cda",
1919
"wham_docker" : "us.gcr.io/broad-dsde-methods/wham:8645aa",

src/sv-pipeline/02_evidence_assessment/02d_baftest/scripts/Filegenerate/generate_baf.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import argparse
44
import numpy as np
55
import pysam
6-
import sys
76

87

98
def filter_record(record, unfiltered=False):
@@ -17,7 +16,7 @@ def filter_record(record, unfiltered=False):
1716
1817
Parameters
1918
----------
20-
records : iterator of pysam.VariantRecords
19+
record : Object of pysam.VariantRecords
2120
2221
Returns
2322
------
@@ -41,7 +40,7 @@ def filter_record(record, unfiltered=False):
4140
return False
4241

4342

44-
def calc_BAF(record, samples=None):
43+
def calc_BAF(record, samples):
4544
"""
4645
4746
Parameters
@@ -71,10 +70,7 @@ def _calc_BAF(sample):
7170
else:
7271
return np.nan
7372

74-
if samples is None:
75-
samples = record.samples.keys()
76-
77-
bafs = np.array([_calc_BAF(sample) for sample in samples], dtype=np.float)
73+
bafs = np.array([_calc_BAF(sample) for sample in samples], dtype=float)
7874

7975
return bafs, samples
8076

@@ -116,14 +112,25 @@ def main():
116112
parser = argparse.ArgumentParser(
117113
description=__doc__,
118114
formatter_class=argparse.RawDescriptionHelpFormatter)
119-
parser.add_argument('--samples-list', default=None)
115+
parser.add_argument('vcf')
116+
parser.add_argument('--samples-list')
120117
parser.add_argument('--unfiltered', action='store_true')
118+
parser.add_argument('--ignore-missing-vcf-samples', action='store_true')
121119
args = parser.parse_args()
122-
vcf = pysam.VariantFile(sys.stdin)
120+
vcf = pysam.VariantFile(args.vcf)
121+
vcf_samples = vcf.header.samples
123122
if args.samples_list is not None:
124123
samples_list = read_samples_list(args.samples_list)
124+
samples_list_intersection = set(samples_list).intersection(set(vcf_samples))
125+
if args.ignore_missing_vcf_samples:
126+
samples_list = [s for s in samples_list if s in samples_list_intersection] # Preserves order
127+
elif len(samples_list) > len(samples_list_intersection):
128+
missing_samples = set(samples_list) - samples_list_intersection
129+
raise ValueError("VCF is missing samples in the samples list. Use --ignore-missing-vcf-samples to bypass "
130+
"this error. Samples: {}".format(", ".join(missing_samples)))
125131
else:
126-
samples_list = None
132+
samples_list = vcf_samples
133+
127134
# While loop to iterate over all records, then break if reach the end
128135
for record in vcf:
129136
if not filter_record(record, args.unfiltered):

wdl/BAFFromGVCFs.wdl

Lines changed: 44 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ import "Structs.wdl"
66

77
workflow BAFFromGVCFs {
88
input {
9-
Array[File] gvcfs
9+
Array[File?] gvcfs
1010
Array[String] samples
11+
Boolean ignore_missing_gvcfs
1112
File unpadded_intervals_file
1213
File dbsnp_vcf
1314
File? dbsnp_vcf_index
@@ -26,17 +27,24 @@ workflow BAFFromGVCFs {
2627
}
2728

2829
Int num_of_original_intervals = length(read_lines(unpadded_intervals_file))
29-
Int num_gvcfs = length(gvcfs)
3030

3131
# Make a 2.5:1 interval number to samples in callset ratio interval list
3232
Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5)
3333
Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1
3434

3535
File dbsnp_vcf_index_ = if defined(dbsnp_vcf_index) then select_first([dbsnp_vcf_index]) else dbsnp_vcf + ".idx"
3636

37-
scatter (gvcf in gvcfs) {
38-
File gvcf_indexes = gvcf + ".tbi"
37+
scatter (i in range(length(samples))) {
38+
if (defined(gvcfs[i])) {
39+
String defined_samples_optional_ = samples[i]
40+
File defined_gvcfs_optional_ = select_first([gvcfs[i]])
41+
File gvcf_indexes_optional_ = select_first([gvcfs[i]]) + ".tbi"
42+
}
3943
}
44+
Array[String] defined_samples_ = select_all(defined_samples_optional_)
45+
Array[File] defined_gvcfs_ = select_all(defined_gvcfs_optional_)
46+
Array[File] gvcf_indexes_ = select_all(gvcf_indexes_optional_)
47+
Int num_gvcfs = length(defined_gvcfs_)
4048

4149
call DynamicallyCombineIntervals {
4250
input:
@@ -47,14 +55,14 @@ workflow BAFFromGVCFs {
4755
4856
Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals)
4957
50-
Int disk_size_gb = 10 + ceil((size(gvcfs, "GB") + size(gvcf_indexes, "GB")) * 1.5)
58+
Int disk_size_gb = 10 + ceil((size(defined_gvcfs_, "GB") + size(gvcf_indexes_, "GB")) * 1.5)
5159
5260
scatter (idx in range(length(unpadded_intervals))) {
5361
call ImportGVCFs {
5462
input:
55-
sample_names = samples,
56-
input_gvcfs = gvcfs,
57-
input_gvcfs_indices = gvcf_indexes,
63+
sample_names = defined_samples_,
64+
input_gvcfs = defined_gvcfs_,
65+
input_gvcfs_indices = gvcf_indexes_,
5866
interval = unpadded_intervals[idx],
5967
workspace_dir_name = "genomicsdb",
6068
disk_size = disk_size_gb,
@@ -82,7 +90,8 @@ workflow BAFFromGVCFs {
8290
call GenerateBAF {
8391
input:
8492
vcf = GenotypeGVCFs.output_vcf,
85-
vcf_index = GenotypeGVCFs.output_vcf_index,
93+
samples = samples,
94+
ignore_missing_vcf_samples = ignore_missing_gvcfs,
8695
batch = batch,
8796
shard = "~{idx}",
8897
sv_pipeline_docker = sv_pipeline_docker,
@@ -92,8 +101,8 @@ workflow BAFFromGVCFs {
92101
93102
call MergeEvidenceFiles {
94103
input:
95-
files = GenerateBAF.BAF,
96-
indexes = GenerateBAF.BAF_idx,
104+
files = GenerateBAF.out,
105+
indexes = GenerateBAF.out_index,
97106
batch = batch,
98107
evidence = "BAF",
99108
inclusion_bed = inclusion_bed,
@@ -312,84 +321,50 @@ task ImportGVCFs {
312321
task GenerateBAF {
313322
input {
314323
File vcf
315-
File vcf_index
324+
File? vcf_header
325+
Array[String] samples
326+
Boolean ignore_missing_vcf_samples
316327
String batch
317328
String shard
318329
String sv_pipeline_docker
319330
RuntimeAttr? runtime_attr_override
320331
}
321332

322-
RuntimeAttr default_attr = object {
323-
cpu_cores: 1,
324-
mem_gb: 3.75,
325-
disk_gb: 10,
326-
boot_disk_gb: 10,
327-
preemptible_tries: 3,
328-
max_retries: 1
329-
}
330-
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
331-
332-
output {
333-
File BAF = "~{batch}.BAF.shard-~{shard}.txt.gz"
334-
File BAF_idx = "~{batch}.BAF.shard-~{shard}.txt.gz.tbi"
335-
}
336-
command <<<
337-
338-
set -euo pipefail
339-
bcftools view -M2 -v snps ~{vcf} \
340-
| python /opt/sv-pipeline/02_evidence_assessment/02d_baftest/scripts/Filegenerate/generate_baf.py --unfiltered \
341-
| bgzip -c \
342-
> ~{batch}.BAF.shard-~{shard}.txt.gz
343-
tabix -f -s1 -b 2 -e 2 ~{batch}.BAF.shard-~{shard}.txt.gz
344-
345-
>>>
346-
runtime {
347-
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
348-
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
349-
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
350-
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
351-
docker: sv_pipeline_docker
352-
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
353-
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
354-
}
355-
356-
}
357-
358-
task GatherBAF {
359-
input {
360-
String batch
361-
Array[File] BAF
362-
String sv_base_mini_docker
363-
RuntimeAttr? runtime_attr_override
364-
}
333+
String ignore_missing_vcf_samples_flag = if (ignore_missing_vcf_samples) then "--ignore-missing-vcf-samples" else ""
365334

366335
RuntimeAttr default_attr = object {
367-
cpu_cores: 1,
368-
mem_gb: 3.75,
369-
disk_gb: 100,
370-
boot_disk_gb: 10,
371-
preemptible_tries: 3,
372-
max_retries: 1
373-
}
336+
cpu_cores: 1,
337+
mem_gb: 3.75,
338+
disk_gb: 10,
339+
boot_disk_gb: 10,
340+
preemptible_tries: 3,
341+
max_retries: 1
342+
}
374343
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
375344

376345
output {
377-
File out = "~{batch}.BAF.txt.gz"
378-
File out_index = "~{batch}.BAF.txt.gz.tbi"
346+
File out = "~{batch}.~{shard}.txt.gz"
347+
File out_index = "~{batch}.~{shard}.txt.gz.tbi"
379348
}
380349
command <<<
381350
382351
set -euo pipefail
383-
cat ~{sep=" " BAF} | bgzip -c > ~{batch}.BAF.txt.gz
384-
tabix -f -s1 -b 2 -e 2 ~{batch}.BAF.txt.gz
385-
352+
python /opt/sv-pipeline/02_evidence_assessment/02d_baftest/scripts/Filegenerate/generate_baf.py \
353+
--unfiltered \
354+
--samples-list ~{write_lines(samples)} \
355+
~{ignore_missing_vcf_samples_flag} \
356+
~{if defined(vcf_header) then "<(cat ~{vcf_header} ~{vcf})" else vcf} \
357+
| bgzip \
358+
> ~{batch}.~{shard}.txt.gz
359+
360+
tabix -s1 -b2 -e2 ~{batch}.~{shard}.txt.gz
386361
>>>
387362
runtime {
388363
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
389364
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
390365
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
391366
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
392-
docker: sv_base_mini_docker
367+
docker: sv_pipeline_docker
393368
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
394369
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
395370
}
@@ -436,7 +411,7 @@ task MergeEvidenceFiles {
436411
437412
mkdir tmp
438413
sort -m -k1,1V -k2,2n -T tmp data/*.txt | bgzip -c > ~{batch}.~{evidence}.txt.gz
439-
tabix -f -s1 -b2 -e2 ~{batch}.~{evidence}.txt.gz
414+
tabix -s1 -b2 -e2 ~{batch}.~{evidence}.txt.gz
440415
441416
>>>
442417
runtime {

0 commit comments

Comments (0)