Skip to content

Commit b53986e

Browse files
authored
feat: add wgs normal sv artefact db (#1578)
#### Added - artefact database filter for WGS SVs
1 parent 92e1987 commit b53986e

18 files changed

+114
-95
lines changed

BALSAMIC/commands/config/case.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
OPTION_ANALYSIS_DIR,
1313
OPTION_ANALYSIS_WORKFLOW,
1414
OPTION_ARTEFACT_SNV_OBSERVATIONS,
15+
OPTION_ARTEFACT_SV_OBSERVATIONS,
1516
OPTION_BACKGROUND_VARIANTS,
1617
OPTION_BALSAMIC_CACHE,
1718
OPTION_CACHE_VERSION,
@@ -79,6 +80,7 @@
7980
@OPTION_ANALYSIS_DIR
8081
@OPTION_ANALYSIS_WORKFLOW
8182
@OPTION_ARTEFACT_SNV_OBSERVATIONS
83+
@OPTION_ARTEFACT_SV_OBSERVATIONS
8284
@OPTION_BACKGROUND_VARIANTS
8385
@OPTION_BALSAMIC_CACHE
8486
@OPTION_CACHE_VERSION
@@ -114,6 +116,7 @@ def case_config(
114116
analysis_dir: Path,
115117
analysis_workflow: AnalysisWorkflow,
116118
artefact_snv_observations: Path,
119+
artefact_sv_observations: Path,
117120
background_variants: Path,
118121
balsamic_cache: Path,
119122
cache_version: str,
@@ -184,6 +187,7 @@ def case_config(
184187
)
185188
variants_observations = {
186189
"artefact_snv_observations": artefact_snv_observations,
190+
"artefact_sv_observations": artefact_sv_observations,
187191
"clinical_snv_observations": clinical_snv_observations,
188192
"clinical_sv_observations": clinical_sv_observations,
189193
"cancer_germline_snv_observations": cancer_germline_snv_observations,
@@ -200,6 +204,7 @@ def case_config(
200204
if path is not None
201205
}
202206
)
207+
LOG.info(f"Collected references: {references}")
203208

204209
# Re-organises references into a subdict and adds metadata when available
205210
references = add_reference_metadata(

BALSAMIC/commands/options.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@
4848
required=False,
4949
help="VCF path of somatic SNVs called in high coverage normal samples (used in all workflows)",
5050
)
51+
OPTION_ARTEFACT_SV_OBSERVATIONS = click.option(
52+
"--artefact-sv-observations",
53+
type=click.Path(exists=True, resolve_path=True),
54+
required=False,
55+
help="VCF path of somatic SVs called in high coverage wgs normal samples used in WGS",
56+
)
5157
OPTION_BACKGROUND_VARIANTS = click.option(
5258
"-b",
5359
"--background-variants",

BALSAMIC/commands/run/analysis.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import logging
44
import os
55
import subprocess
6-
import shlex
76
import sys
87
from pathlib import Path
98
from typing import List

BALSAMIC/constants/analysis.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ class AnnotationCategory(StrEnum):
144144
"names": ["ArtefactFrq", "ArtefactObs", "ArtefactHom"],
145145
"category": AnnotationCategory.CLINICAL,
146146
},
147+
"artefact_sv_observations": {
148+
"fields": ["Frq", "Obs", "Hom"],
149+
"ops": ["self", "self", "self"],
150+
"names": ["ArtefactFrq", "ArtefactObs", "ArtefactHom"],
151+
"category": AnnotationCategory.CLINICAL,
152+
},
147153
"clinical_snv_observations": {
148154
"fields": ["Frq", "Obs", "Hom"],
149155
"ops": ["self", "self", "self"],

BALSAMIC/constants/rules.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
"snakemake_rules/align/bam_compress.rule",
3535
],
3636
"varcall": [
37+
"snakemake_rules/variant_calling/sv_quality_filter.rule",
3738
"snakemake_rules/variant_calling/snv_quality_filter.rule",
3839
"snakemake_rules/variant_calling/tnscope_post_process.rule",
3940
],

BALSAMIC/constants/variant_filters.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,11 @@ def get_tag_and_filtername(filters: List[VCFFilter], filter_name: str) -> List[s
472472
"filter_name": "Frq",
473473
"field": "INFO",
474474
},
475+
"loqusdb_artefact_sv_obs": {
476+
"tag_value": 4,
477+
"filter_name": "ArtefactObs",
478+
"field": "INFO",
479+
},
475480
"varcaller_name": "svdb",
476481
"filter_type": "general",
477482
"analysis_type": "tumor_only,tumor_normal",

BALSAMIC/constants/workflow_profile/config.yaml

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -401,18 +401,14 @@ set-resources:
401401
bcftools_filter_merged_research_tumor_normal:
402402
runtime: 280
403403
mem_mb: 10000
404-
svdb_annotate_clinical_obs_somatic_clinical_sv:
405-
runtime: 600
406-
mem_mb: 15000
407-
svdb_annotate_somatic_obs_somatic_clinical_sv:
404+
svdb_annotate_clinical_sv:
408405
runtime: 600
409406
mem_mb: 15000
410407
svdb_annotate_somatic_research_sv:
411408
runtime: 720
412409
mem_mb: 25000
413410
vep_somatic_research_sv:
414411
runtime: 720
415-
mem_mb: 140000
416412
vcfanno_annotate_somaticSNV_clinical:
417413
runtime: 1080
418414
mem_mb: 10000
@@ -423,7 +419,7 @@ set-resources:
423419
runtime: 60
424420
mem_mb: 4000
425421
cadd_annotate_somaticINDEL_research:
426-
runtime: 1080
422+
runtime: 1580
427423
mem_mb: 20000
428424
bcftools_get_somaticINDEL_research:
429425
runtime: 60
@@ -626,8 +622,7 @@ set-threads:
626622
bcftools_filter_TNscope_umi_research_tumor_normal: 4
627623
bcftools_filter_merged_clinical_tumor_normal: 8
628624
bcftools_filter_merged_research_tumor_normal: 8
629-
svdb_annotate_clinical_obs_somatic_clinical_sv: 8
630-
svdb_annotate_somatic_obs_somatic_clinical_sv: 8
625+
svdb_annotate_clinical_sv: 8
631626
svdb_annotate_somatic_research_sv: 36
632627
vep_somatic_research_sv: 36
633628
vcfanno_annotate_somaticSNV_clinical: 12

BALSAMIC/models/params.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ class StructuralVariantFilters(BaseModel):
275275
Attributes:
276276
swegen_sv_freq: VCFAttributes (optional); maximum swegen sv allele frequency
277277
loqusdb_clinical_sv_freq: VCFAttributes (optional); maximum loqusdb clinical sv allele frequency
278+
loqusdb_artefact_sv_obs: VCFAttributes (optional); maximum loqusdb artefact sv allele obs
278279
low_pr_sr_count: VCFAttributes (optional); minumum Manta variant read support
279280
varcaller_name: str (required); variant caller name
280281
filter_type: str (required); filter name for variant caller
@@ -284,6 +285,7 @@ class StructuralVariantFilters(BaseModel):
284285

285286
swegen_sv_freq: Optional[VCFFilter] = None
286287
loqusdb_clinical_sv_freq: Optional[VCFFilter] = None
288+
loqusdb_artefact_sv_obs: Optional[VCFFilter] = None
287289
low_pr_sr_count: Optional[VCFFilter] = None
288290
varcaller_name: str
289291
filter_type: str

BALSAMIC/snakemake_rules/annotation/somatic_sv_annotation.rule

Lines changed: 26 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ rule vep_somatic_research_sv:
1111
vcf_research_vep = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vep.vcf.gz",
1212
benchmark:
1313
Path(benchmark_dir, "vep_somatic_research_sv." + config["analysis"]["case_id"] + ".svdb.tsv").as_posix()
14+
resources:
15+
mem_mb = lambda wc: (140000 if config_model.analysis.sequencing_type == SequencingType.WGS else 25000)
1416
singularity:
1517
Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix()
1618
params:
@@ -68,66 +70,52 @@ tabix -p vcf -f {output.vcf_research};
6870
"""
6971

7072

71-
rule svdb_annotate_clinical_obs_somatic_clinical_sv:
73+
rule svdb_annotate_clinical_sv:
7274
input:
73-
vcf_sv_research = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.filtered.pass.vcf.gz",
75+
vcf_research = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vcf.gz",
7476
output:
75-
vcf_sv_clinical_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf.gz"),
77+
vcf_sv_clinical = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.vcf.gz",
7678
benchmark:
77-
Path(benchmark_dir, 'svdb_annotate_clinical_obs_somatic_clinical_sv.' + config["analysis"]["case_id"] + ".tsv")
79+
Path(benchmark_dir, 'svdb_annotate_clinical_sv_.' + config["analysis"]["case_id"] + ".tsv")
7880
singularity:
7981
Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix()
8082
params:
8183
case_name = config["analysis"]["case_id"],
8284
clinical_sv_observations = clinical_sv,
83-
vcf_clinical_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf"),
85+
somatic_sv_observations = somatic_sv,
86+
artefact_sv_observations = artefact_sv_obs,
87+
vcf_clinical_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf.gz"),
88+
vcf_somatic_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.somatic_obs.vcf.gz"),
89+
vcf_intermediate = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.intermediate.vcf"),
8490
message:
8591
"Annotating structural and copy number variants with clinical observations using SVDB for {params.case_name}",
8692
shell:
8793
"""
8894
if [[ -f "{params.clinical_sv_observations}" ]]; then
8995
svdb --query --bnd_distance 10000 --overlap 0.80 \
9096
--in_occ Obs --out_occ clin_obs --in_frq Frq --out_frq Frq \
91-
--db {params.clinical_sv_observations} --query_vcf {input.vcf_sv_research} > {params.vcf_clinical_obs}
92-
bgzip -l 9 -c {params.vcf_clinical_obs} > {output.vcf_sv_clinical_obs};
97+
--db {params.clinical_sv_observations} --query_vcf {input.vcf_research} > {params.vcf_intermediate} ;
98+
bgzip -l 9 -c {params.vcf_intermediate} > {params.vcf_clinical_obs};
9399
else
94-
cp {input.vcf_sv_research} {output.vcf_sv_clinical_obs};
100+
cp {input.vcf_research} {params.vcf_clinical_obs};
95101
fi
96-
97-
tabix -p vcf -f {output.vcf_sv_clinical_obs};
98-
99-
rm {params.vcf_clinical_obs}
100-
"""
101-
102-
103-
rule svdb_annotate_somatic_obs_somatic_clinical_sv:
104-
input:
105-
vcf_sv_clinical_obs = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf.gz",
106-
output:
107-
vcf_sv_clinical = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.vcf.gz",
108-
benchmark:
109-
Path(benchmark_dir, 'svdb_annotate_somatic_obs_somatic_clinical_sv.' + config["analysis"]["case_id"] + ".tsv")
110-
singularity:
111-
Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix()
112-
params:
113-
case_name = config["analysis"]["case_id"],
114-
somatic_sv_observations = somatic_sv,
115-
vcf_somatic_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.somatic_obs.vcf"),
116-
message:
117-
"Annotating structural and copy number variants with clinical observations using SVDB for {params.case_name}",
118-
shell:
119-
"""
120102
if [[ -f "{params.somatic_sv_observations}" ]]; then
121103
svdb --query --bnd_distance 10000 --overlap 0.80 \
122104
--in_occ Obs --out_occ Cancer_Somatic_Obs --in_frq Frq --out_frq Cancer_Somatic_Frq \
123-
--db {params.somatic_sv_observations} --query_vcf {input.vcf_sv_clinical_obs} > {params.vcf_somatic_obs}
124-
bgzip -l 9 -c {params.vcf_somatic_obs} > {output.vcf_sv_clinical};
125-
rm {params.vcf_somatic_obs};
105+
--db {params.somatic_sv_observations} --query_vcf {params.vcf_clinical_obs} > {params.vcf_intermediate} ;
106+
bgzip -l 9 -c {params.vcf_intermediate} > {params.vcf_somatic_obs};
126107
else
127-
cp {input.vcf_sv_clinical_obs} {output.vcf_sv_clinical};
108+
cp {params.vcf_clinical_obs} {params.vcf_somatic_obs};
109+
fi
110+
if [[ -f "{params.artefact_sv_observations}" ]]; then
111+
svdb --query --bnd_distance 500 --overlap 0.80 \
112+
--in_occ Obs --out_occ ArtefactObs --in_frq Frq --out_frq ArtefactFrq \
113+
--db {params.artefact_sv_observations} --query_vcf {params.vcf_somatic_obs} > {params.vcf_intermediate} ;
114+
bgzip -l 9 -c {params.vcf_intermediate} > {output.vcf_sv_clinical};
115+
else
116+
cp {params.vcf_somatic_obs} {output.vcf_sv_clinical};
128117
fi
129118
130119
tabix -p vcf -f {output.vcf_sv_clinical};
120+
"""
131121

132-
rm {input.vcf_sv_clinical_obs};
133-
"""

BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ tabix -p vcf -f {output.vcf_pass_svdb};
2929
bcftools +counts {output.vcf_pass_svdb} > {output.bcftools_counts};
3030
"""
3131

32-
3332
rule bcftools_filter_sv_clinical:
3433
input:
3534
vcf_sv_clinical = vep_dir + "SV.somatic.{case_name}.svdb.clinical.vcf.gz",
@@ -43,19 +42,31 @@ rule bcftools_filter_sv_clinical:
4342
Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
4443
params:
4544
case_name = "{case_name}",
45+
artefact_sv_observations = artefact_sv_obs,
4646
swegen_freq = [SVDB_FILTERS.swegen_sv_freq.tag_value, SVDB_FILTERS.swegen_sv_freq.filter_name],
4747
loqusdb_clinical_freq = [SVDB_FILTERS.loqusdb_clinical_sv_freq.tag_value, SVDB_FILTERS.loqusdb_clinical_sv_freq.filter_name],
48+
loqusdb_artefact_sv_obs = [SVDB_FILTERS.loqusdb_artefact_sv_obs.tag_value ,SVDB_FILTERS.loqusdb_artefact_sv_obs.filter_name],
49+
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
4850
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"},
4951
message:
5052
"Filtering merged clinical structural and copy number variants using bcftools for {params.case_name}"
5153
shell:
5254
"""
5355
bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_sv_clinical} |\
5456
bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\
55-
bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\
56-
bcftools view --threads {threads} -f .,PASS -O z -o {output.vcf_pass_svdb};
57+
bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' -O z -o {params.tmpdir}/SV.somatic.tmp.svdb.clinical.filtered.vcf.gz
58+
59+
if [[ -f "{params.artefact_sv_observations}" ]]; then
60+
bcftools filter --threads {threads} --include 'INFO/ArtefactObs <= {params.loqusdb_artefact_sv_obs[0]} || INFO/ArtefactObs == \".\"' --soft-filter '{params.loqusdb_artefact_sv_obs[1]}' --mode '+' {params.tmpdir}/SV.somatic.tmp.svdb.clinical.filtered.vcf.gz -O z -o {params.tmpdir}/SV.somatic.artefactfiltered.svdb.clinical.filtered.vcf.gz
61+
else
62+
mv {params.tmpdir}/SV.somatic.tmp.svdb.clinical.filtered.vcf.gz {params.tmpdir}/SV.somatic.artefactfiltered.svdb.clinical.filtered.vcf.gz ;
63+
fi
64+
65+
bcftools view --threads {threads} -f .,PASS -O z -o {output.vcf_pass_svdb} {params.tmpdir}/SV.somatic.artefactfiltered.svdb.clinical.filtered.vcf.gz ;
5766
5867
tabix -p vcf -f {output.vcf_pass_svdb};
5968
6069
bcftools +counts {output.vcf_pass_svdb} > {output.bcftools_counts};
61-
"""
70+
"""
71+
72+

0 commit comments

Comments
 (0)