Skip to content

Commit ed9ba7a

Browse files
authored
Remove greps in clean_vcf_part1 which could match sample IDs to variant IDs (#157)
* make script use bcftools instead of greps to calculate sex-specific median values
* update sv-pipeline docker
1 parent a90c76e commit ed9ba7a

File tree

9 files changed

+26
-15
lines changed

9 files changed

+26
-15
lines changed

input_values/dockers.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
"sv_base_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:mw-gnomad-02-6a66c96",
1414
"sv_base_mini_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:rlc_posthoc_filtering_cnv_mcnv_compatability_9a8561",
1515
"sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline-base:rlc_posthoc_filtering_cnv_mcnv_compatability_9a8561",
16-
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:cw_cleanvcf1_ev_script_7cf6798",
16+
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/cwhelan/sv-pipeline:cw_clean_vcf_part1_rm_allosome_greps_8348649",
1717
"sv_pipeline_qc_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-qc:mw-xz-fixes-7cbffee",
1818
"sv_pipeline_rdtest_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline-rdtest:mw-gnomad-02-6a66c96",
1919
"wham_docker" : "us.gcr.io/broad-dsde-methods/wham:8645aa",

src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ BCFTOOLS=/usr/local/bin/bcftools
1919
vcf=$1
2020
backgroundlist=$2
2121
famfile=$3
22+
allosome_fai=$4
2223

2324
##get sampleids from VCF##
2425
zcat $vcf \
@@ -58,6 +59,7 @@ zcat convertsvtype.vcf.gz \
5859
|awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' OFS='\t' vargq.persample - \
5960
|bgzip \
6061
>cleaninfo.vcf.gz
62+
tabix -p vcf cleaninfo.vcf.gz
6163

6264

6365
##fix sex chr if necessary##
@@ -83,21 +85,20 @@ awk '{if ($5==2) print $2}' $famfile \
8385
if [ $(cat clean.bed.ids.txt|wc -l) -gt 0 ]
8486
then
8587

86-
zcat cleaninfo.vcf.gz \
87-
|awk '{if ($1!~"#" && ($1~"X" || $1~"Y")) $1=$3;print}' OFS="\t"\
88-
|vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \
89-
|awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"\t"header[j] "\t" $j }' \
90-
|fgrep -wf clean.bed.ids.txt \
91-
|awk '{if ($3!=".") print}' \
92-
|gzip \
93-
>RD_CN.sexcheck.FORMAT.gz
94-
95-
zcat RD_CN.sexcheck.FORMAT.gz|fgrep -wf male.txt| Rscript -e 'd<-read.table("stdin")' \
88+
awk '{print $1"\t0\t"$2}' < ${allosome_fai} > allosomes.list
89+
${BCFTOOLS} query -R allosomes.list -S male.txt -i 'ID=@clean.bed.ids.txt' -f '[%ID\t%SAMPLE\t%RD_CN\n]' cleaninfo.vcf.gz \
90+
| awk '{if ($3!=".") print}' \
91+
| gzip > RD_CN.sexcheck.FORMAT.male.gz
92+
93+
${BCFTOOLS} query -R allosomes.list -S female.txt -i 'ID=@clean.bed.ids.txt' -f '[%ID\t%SAMPLE\t%RD_CN\n]' cleaninfo.vcf.gz \
94+
| awk '{if ($3!=".") print}' \
95+
| gzip > RD_CN.sexcheck.FORMAT.female.gz
96+
97+
zcat RD_CN.sexcheck.FORMAT.male.gz| Rscript -e 'd<-read.table("stdin")' \
9698
-e 'x<-tapply(d[,3],d[,1],median)' \
9799
-e 'write.table(x,"male.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")'
98100

99-
100-
zcat RD_CN.sexcheck.FORMAT.gz|fgrep -wf female.txt| Rscript -e 'd<-read.table("stdin")' \
101+
zcat RD_CN.sexcheck.FORMAT.female.gz| Rscript -e 'd<-read.table("stdin")' \
101102
-e 'x<-tapply(d[,3],d[,1],median)' \
102103
-e 'write.table(x,"female.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")'
103104
fi
@@ -242,7 +243,6 @@ fi
242243

243244

244245
tabix -p vcf combinedsex.vcf.gz
245-
tabix -p vcf cleaninfo.vcf.gz
246246

247247
zcat combinedsex.vcf.gz|awk '{if ($1!~"#") print $3}'>modified.ids.txt
248248

test_input_templates/module0506/Module0506.json.tmpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
"Module0506.bin_exclude": {{ reference_resources.bin_exclude | tojson }},
33
"Module0506.contig_list": {{ reference_resources.primary_contigs_fai | tojson }},
4+
"Module0506.allosome_fai": {{ reference_resources.allosome_file | tojson }},
45
"Module0506.cytobands": {{ reference_resources.cytobands | tojson }},
56
"Module0506.mei_bed": {{ reference_resources.mei_bed | tojson }},
67
"Module0506.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }},

test_input_templates/module0506/Module0506Test.json.tmpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120

121121
"Module0506Test.Module0506.bin_exclude": {{ reference_resources.bin_exclude | tojson }},
122122
"Module0506Test.Module0506.contig_list": {{ reference_resources.primary_contigs_fai | tojson }},
123+
"Module0506Test.Module0506.allosome_fai": {{ reference_resources.allosome_file | tojson }},
123124
"Module0506Test.Module0506.cytobands": {{ reference_resources.cytobands | tojson }},
124125
"Module0506Test.Module0506.mei_bed": {{ reference_resources.mei_bed | tojson }},
125126
"Module0506Test.Module0506.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }},

wdl/CleanVcf.wdl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ workflow CleanVcf {
99
String contig
1010
File background_list
1111
File ped_file
12+
File allosome_fai
1213
String prefix
1314
Int max_shards_per_chrom_step1
1415
File bothsides_pass_list
@@ -60,6 +61,7 @@ workflow CleanVcf {
6061
ped_file=ped_file,
6162
sv_pipeline_docker=sv_pipeline_docker,
6263
bothsides_pass_list=bothsides_pass_list,
64+
allosome_fai=allosome_fai,
6365
runtime_attr_override=runtime_override_clean_vcf_1a
6466
}
6567
}
@@ -203,6 +205,7 @@ task CleanVcf1a {
203205
File ped_file
204206
String sv_pipeline_docker
205207
File bothsides_pass_list
208+
File allosome_fai
206209
RuntimeAttr? runtime_attr_override
207210
}
208211

@@ -235,7 +238,7 @@ task CleanVcf1a {
235238
command <<<
236239
set -eu -o pipefail
237240
238-
/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh ~{vcf} ~{background_list} ~{ped_file}
241+
/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh ~{vcf} ~{background_list} ~{ped_file} ~{allosome_fai}
239242
/opt/sv-pipeline/04_variant_resolution/scripts/add_bothsides_support_filter.py \
240243
--bgzip \
241244
--outfile int.w_bothsides.vcf.gz \

wdl/GATKSVPipelineBatch.wdl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ workflow GATKSVPipelineBatch {
251251
pesr_vcfs=[Module04.genotyped_pesr_vcf],
252252
depth_vcfs=Module04b.regenotyped_depth_vcfs,
253253
contig_list=primary_contigs_fai,
254+
allosome_fai=allosome_file,
254255
ref_dict=reference_dict,
255256
disc_files=[GATKSVPipelinePhase1.merged_PE],
256257
disc_files_index=[GATKSVPipelinePhase1.merged_PE_index],

wdl/GATKSVPipelineSingleSample.wdl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,7 @@ workflow GATKSVPipelineSingleSample {
871871
pesr_vcfs=[ConvertCNVsWithoutDepthSupportToBNDs.out_vcf],
872872
depth_vcfs=[Module04.genotyped_depth_vcf],
873873
contig_list=primary_contigs_fai,
874+
allosome_fai=allosome_file,
874875
ref_dict=reference_dict,
875876
876877
merge_complex_genotype_vcfs = true,

wdl/Module0506.wdl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ workflow Module0506 {
3131

3232
File bin_exclude
3333
File contig_list
34+
File allosome_fai
3435
Int max_shards_per_chrom
3536
Int min_variants_per_shard
3637
File cytobands
@@ -270,6 +271,7 @@ workflow Module0506 {
270271
complex_resolve_background_fail_lists=Module0506ComplexResolve.complex_resolve_background_fail_lists,
271272
merged_ped_file=ped_file,
272273
contig_list=contig_list,
274+
allosome_fai=allosome_fai,
273275
max_shards_per_chrom=max_shards_per_chrom,
274276
max_shards_per_chrom_clean_vcf_step1=max_shards_per_chrom_clean_vcf_step1,
275277
min_records_per_shard_clean_vcf_step1=min_records_per_shard_clean_vcf_step1,

wdl/Module0506Clean.wdl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ workflow Module0506Clean {
1313
File merged_ped_file
1414

1515
File contig_list
16+
File allosome_fai
1617
Int max_shards_per_chrom
1718
Int max_shards_per_chrom_clean_vcf_step1
1819
Int min_records_per_shard_clean_vcf_step1
@@ -57,6 +58,7 @@ workflow Module0506Clean {
5758
background_list=complex_resolve_background_fail_lists[i],
5859
ped_file=merged_ped_file,
5960
bothsides_pass_list=complex_resolve_bothside_pass_lists[i],
61+
allosome_fai=allosome_fai,
6062
prefix=cohort_name,
6163
max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1,
6264
min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1,

0 commit comments

Comments
 (0)