Skip to content

Commit fc5e163

Browse files
authored
chore: add clnvid cache rule (#1576)
#### Added - rule and script for adding CLNVID to clinvar INFO field
1 parent 56e2f8c commit fc5e163

File tree

11 files changed

+150
-14
lines changed

11 files changed

+150
-14
lines changed

.github/workflows/pytest_and_coveralls.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,4 @@ jobs:
6161
file: ./coverage.xml
6262
flags: unittests
6363
fail_ci_if_error: true
64-
verbose: true
64+
verbose: true
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import pysam
2+
import click
3+
4+
5+
def add_clnvid_header(output_handle) -> None:
6+
"""
7+
Writes the INFO header line for the CLNVID field to the output VCF.
8+
"""
9+
# Use String to be safe; ID column can contain non-numeric identifiers.
10+
header_line = '##INFO=<ID=CLNVID,Number=1,Type=String,Description="ClinVar Variation ID taken from the VCF ID column">'
11+
output_handle.write(f"{header_line}\n".encode("utf-8"))
12+
13+
14+
def process_vcf(input_path: str, output_path: str) -> None:
15+
"""
16+
Processes a bgzipped VCF file using pysam, adds the CLNVID INFO field based on the ID column,
17+
and writes to a new bgzipped VCF file in a tabix-compatible format.
18+
"""
19+
saw_clnvid_header = False
20+
21+
with pysam.BGZFile(input_path, "r") as infile, pysam.BGZFile(
22+
output_path, "w"
23+
) as outfile:
24+
for raw_line in infile:
25+
# Pass through meta headers; track if CLNVID header already exists
26+
if raw_line.startswith(b"##"):
27+
if b"##INFO=<ID=CLNVID" in raw_line:
28+
saw_clnvid_header = True
29+
outfile.write(raw_line)
30+
continue
31+
32+
# Column header line: add CLNVID header if missing, then write
33+
if raw_line.startswith(b"#"):
34+
if not saw_clnvid_header:
35+
add_clnvid_header(outfile)
36+
saw_clnvid_header = True
37+
outfile.write(raw_line)
38+
continue
39+
40+
# Variant line
41+
line = raw_line.decode("utf-8").rstrip("\n")
42+
fields = line.split("\t")
43+
44+
# Ensure we have at least up to INFO column
45+
if len(fields) < 8:
46+
# Malformed line, write back unchanged
47+
outfile.write((line + "\n").encode("utf-8"))
48+
continue
49+
50+
vcf_id = fields[2]
51+
info = fields[7]
52+
53+
# Only add CLNVID when ID column is not '.'
54+
if vcf_id != ".":
55+
if info == "." or info == "":
56+
info = f"CLNVID={vcf_id}"
57+
elif "CLNVID=" not in info:
58+
info = f"{info};CLNVID={vcf_id}"
59+
60+
# Write updated INFO back to fields
61+
fields[7] = info
62+
modified_line = "\t".join(fields) + "\n"
63+
outfile.write(modified_line.encode("utf-8"))
64+
65+
66+
@click.command()
67+
@click.argument(
68+
"input_path", type=click.Path(exists=True, readable=True, dir_okay=False)
69+
)
70+
@click.argument("output_path", type=click.Path(writable=True, dir_okay=False))
71+
def main(input_path: str, output_path: str) -> None:
72+
"""
73+
Adds a CLNVID INFO field to each record in a bgzipped VCF file based on the ID column.
74+
75+
INPUT_PATH: Path to the input VCF file (.vcf.gz).
76+
OUTPUT_PATH: Path to the output VCF file (.vcf.gz).
77+
"""
78+
process_vcf(input_path, output_path)
79+
80+
81+
if __name__ == "__main__":
82+
main()

BALSAMIC/commands/init/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import subprocess
55
import sys
6+
import shlex
67
from datetime import datetime
78
from pathlib import Path
89
from typing import List, Optional, Union

BALSAMIC/commands/run/analysis.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import subprocess
6+
import shlex
67
import sys
78
from pathlib import Path
89
from typing import List

BALSAMIC/constants/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ class DockerContainers(StrEnum):
390390
"url": "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz",
391391
"file_type": FileType.VCF,
392392
"gzip": True,
393-
"file_name": "clinvar.vcf",
393+
"file_name": "clinvar_preprocess.vcf",
394394
"dir_name": "variants",
395395
},
396396
"somalier_sites": {

BALSAMIC/models/cache.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Balsamic reference cache models."""
2+
import re
23
import logging
34
from pathlib import Path
45
from typing import Dict, Optional, List, Union
@@ -148,6 +149,12 @@ def get_cadd_snv_file_paths(self) -> List[str]:
148149
"""Return CADD SNV reference output files."""
149150
return [self.cadd_snv.file_path, f"{self.cadd_snv.file_path}.{FileType.TBI}"]
150151

152+
def get_processed_clinvar_file_path(self) -> str:
153+
return re.sub(r"\.vcf$", "_processed.vcf.gz", self.clinvar.file_path)
154+
155+
def get_processed_clinvar_index_path(self) -> str:
156+
return re.sub(r"\.vcf$", "_processed.vcf.gz.tbi", self.clinvar.file_path)
157+
151158
def get_delly_file_paths(self) -> List[str]:
152159
"""Return Delly associated output files."""
153160
return [
@@ -400,12 +407,13 @@ def get_reference_output_paths(self) -> List[str]:
400407
self.references.ascat_gc_correction.file_path,
401408
self.references.cadd_snv.file_path,
402409
self.references.simple_repeat.file_path,
403-
f"{self.references.clinvar.file_path}.{FileType.GZ}",
404410
f"{self.references.cosmic.file_path}.{FileType.GZ}",
405411
f"{self.references.dbsnp.file_path}.{FileType.GZ}",
406412
self.references.rank_score.file_path,
407413
f"{self.references.somalier_sites.file_path}.{FileType.GZ}",
408414
self.references.wgs_calling_regions.file_path,
415+
self.references.get_processed_clinvar_file_path(),
416+
self.references.get_processed_clinvar_index_path(),
409417
*self.get_compressed_indexed_vcf_paths(),
410418
*self.references.get_1k_genome_file_paths(),
411419
*self.references.get_cadd_snv_file_paths(),
@@ -434,7 +442,7 @@ def get_analysis_references(
434442
ascat_gc_correction=self.references.ascat_gc_correction.file_path,
435443
cadd_snv=self.references.cadd_snv.file_path,
436444
simple_repeat=self.references.simple_repeat.file_path,
437-
clinvar=f"{self.references.clinvar.file_path}.{FileType.GZ}",
445+
clinvar=self.references.get_processed_clinvar_file_path(),
438446
cosmic=f"{self.references.cosmic.file_path}.{FileType.GZ}",
439447
dbsnp=f"{self.references.dbsnp.file_path}.{FileType.GZ}",
440448
delly_exclusion=self.references.delly_exclusion.file_path,

BALSAMIC/snakemake_rules/cache/reference_vcf.rule

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Rules to process VCF reference files."""
22

3+
from BALSAMIC.utils.rule import get_script_path
34

45
wildcard_constraints:
56
vcf="|".join(cache_config.get_reference_file_paths_by_file_type(file_type=FileType.VCF)),
@@ -45,3 +46,42 @@ rule index_vcfs:
4546
"""
4647
tabix -p vcf "{input.vcf_gz}" &> "{log}"
4748
"""
49+
50+
rule postprocess_clinvar:
51+
"""Add new info-field based on clinvar variant ID: CLNVID"""
52+
input:
53+
clinvar=f"{cache_config.references.clinvar.file_path}.{FileType.GZ}",
54+
output:
55+
clinvar=cache_config.references.get_processed_clinvar_file_path(),
56+
params:
57+
add_clnvid_script=get_script_path("add_clnvid_field.py"),
58+
message:
59+
"Adding INFO field CLNVID to ClinVar VCF: {input.clinvar}"
60+
benchmark:
61+
f"{cache_config.references.clinvar.file_path}.postprocess_clinvar.benchmark.{FileType.TSV}"
62+
log:
63+
f"{cache_config.references.clinvar.file_path}.postprocess_clinvar.{FileType.LOG}",
64+
shell:
65+
"""
66+
python {params.add_clnvid_script} {input.clinvar} {output.clinvar}
67+
"""
68+
69+
rule index_postprocessed_clinvar_vcf:
70+
"""Index postprocessed clinvar VCF"""
71+
input:
72+
clinvar=cache_config.references.get_processed_clinvar_file_path(),
73+
singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['tabix']}.{FileType.SIF}",
74+
output:
75+
clinvar_tbi=cache_config.references.get_processed_clinvar_index_path(),
76+
singularity:
77+
f"{config['containers_dir']}/{config['bioinfo_tools']['tabix']}.{FileType.SIF}"
78+
message:
79+
"Index postprocessed ClinVar VCF: {input.clinvar}"
80+
benchmark:
81+
f"{cache_config.references.clinvar.file_path}.tabix_postprocess_clinvar.benchmark.{FileType.TSV}"
82+
log:
83+
f"{cache_config.references.clinvar.file_path}.tabix_postprocess_clinvar.{FileType.LOG}",
84+
shell:
85+
"""
86+
tabix -p vcf {input.clinvar}
87+
"""

BALSAMIC/workflows/reference.smk

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ from BALSAMIC.constants.rules import SNAKEMAKE_RULES
1212
from BALSAMIC.models.cache import CacheConfig, AnalysisReferences
1313
from BALSAMIC.utils.io import write_finish_file, write_json
1414
from BALSAMIC.utils.utils import get_relative_paths_dict
15-
15+
from BALSAMIC.utils.rule import (
16+
get_script_path,
17+
)
1618
LOG = logging.getLogger(__name__)
1719

1820
# Balsamic cache configuration model

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Changed:
2222
* use median_target_coverage as threshold for wgs normals coverage instead of median coverage https://github.com/Clinical-Genomics/BALSAMIC/pull/1587
2323

2424

25+
2526
Removed:
2627
^^^^^^^^
2728
* removed immediate submit functionality https://github.com/Clinical-Genomics/BALSAMIC/pull/1604
@@ -44,6 +45,7 @@ Added:
4445

4546

4647

48+
4749
Changed:
4850
^^^^^^^^
4951

tests/conftest.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2300,12 +2300,14 @@ def fixture_delly_exclusion_converted_file(session_tmp_path: Path) -> Path:
23002300
return reference_file
23012301

23022302

2303-
@pytest.fixture(scope="session", name="clinvar_file")
2303+
@pytest.fixture(scope="session", name="clinvar_processed")
23042304
def fixture_clinvar_file(session_tmp_path: Path) -> Path:
23052305
"""Return dummy clinvar file."""
2306-
clinvar_file: Path = Path(session_tmp_path, "variants", "clinvar.vcf.gz")
2307-
clinvar_file.touch()
2308-
return clinvar_file
2306+
clinvar_processed: Path = Path(
2307+
session_tmp_path, "variants", "clinvar_processed.vcf.gz"
2308+
)
2309+
clinvar_processed.touch()
2310+
return clinvar_processed
23092311

23102312

23112313
@pytest.fixture(scope="session", name="cosmic_file")
@@ -2379,7 +2381,7 @@ def fixture_analysis_references_hg_data(
23792381
cache_config: CacheConfig,
23802382
analysis_references_data: Dict[str, Path],
23812383
delly_exclusion_converted_file: Path,
2382-
clinvar_file: Path,
2384+
clinvar_processed: Path,
23832385
cosmic_file: Path,
23842386
dbsnp_file: Path,
23852387
hc_vcf_1kg_file: Path,
@@ -2397,7 +2399,7 @@ def fixture_analysis_references_hg_data(
23972399
),
23982400
"cadd_snv": Path(cache_config.references.cadd_snv.file_path),
23992401
"simple_repeat": Path(cache_config.references.simple_repeat.file_path),
2400-
"clinvar": clinvar_file,
2402+
"clinvar": clinvar_processed,
24012403
"cosmic": cosmic_file,
24022404
"dbsnp": dbsnp_file,
24032405
"delly_exclusion": Path(cache_config.references.delly_exclusion.file_path),

0 commit comments

Comments
 (0)