Skip to content

Commit 2d411a2

Browse files
authored
Merge pull request #1608 from Clinical-Genomics/release_v18.0.0
feat: release v18.0.0
2 parents e5742a1 + 2a6a1bc commit 2d411a2

File tree

182 files changed

+10493
-4360
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

182 files changed

+10493
-4360
lines changed

.github/workflows/pytest_and_coveralls.yml

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@ on:
55
- "CHANGELOG.rst"
66
- "docs/**"
77
push:
8-
branches:
9-
- master
8+
branches: [ master ]
109
paths-ignore:
1110
- "CHANGELOG.rst"
1211
- "docs/**"
@@ -16,49 +15,41 @@ jobs:
1615
name: run PyTest
1716
runs-on: ubuntu-22.04
1817
steps:
19-
# Checkout BALSAMIC
2018
- name: Git checkout
21-
id: git_checkout
2219
uses: actions/checkout@v3
23-
# Conda env create
20+
2421
- name: setup conda
25-
id: setup_conda
2622
uses: conda-incubator/setup-miniconda@v2
2723
with:
2824
activate-environment: balsamic
2925
environment-file: BALSAMIC/conda/balsamic.yaml
26+
# optional speed-ups:
27+
# auto-activate-base: false
28+
# use-mamba: true
29+
3030
- name: Install the HTML to PDF renderer
3131
run: sudo apt-get update && sudo apt-get install -y wkhtmltopdf
32-
# Install BALSAMIC
33-
- name: Install BALSAMIC
34-
id: install_balsamic
35-
shell: bash -l {0}
36-
run: |
37-
conda activate balsamic
38-
pip install --no-cache-dir .
39-
# Install pytest coveralls dependencies
40-
- name: Install PyTest and Coveralls
41-
id: install_pytest
32+
33+
- name: Install BALSAMIC + test extras
4234
shell: bash -l {0}
4335
run: |
4436
conda activate balsamic
45-
pip install --no-cache-dir -r requirements-dev.txt
46-
# Run PyTest
37+
pip install --no-cache-dir -e .[test]
38+
4739
- name: Run PyTest
48-
id: pytest
4940
shell: bash -l {0}
50-
run: |
51-
conda activate balsamic
52-
py.test --cov-report=xml --cov=BALSAMIC -rsxv tests/*
5341
env:
5442
SENTIEON_LICENSE: dummy_license
5543
SENTIEON_INSTALL_DIR: dummy_install_dir
56-
# Run Codecov
44+
run: |
45+
conda activate balsamic
46+
pytest --cov-report=xml --cov=BALSAMIC -rsxv tests/*
47+
5748
- name: Upload coverage to Codecov
5849
uses: codecov/codecov-action@v3
5950
with:
6051
token: ${{ secrets.CODECOV_TOKEN }}
6152
file: ./coverage.xml
6253
flags: unittests
6354
fail_ci_if_error: true
64-
verbose: true
55+
verbose: true
File renamed without changes.

BALSAMIC/assets/analysis_metadata/rescue_snvs.vcf

Lines changed: 5822 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import pysam
2+
import click
3+
4+
5+
def add_clnvid_header(output_handle) -> None:
    """Emit the ``##INFO`` meta line that declares the CLNVID field.

    Args:
        output_handle: Writable binary handle; receives the declaration as
            UTF-8 bytes terminated by a single newline.
    """
    # Declared as String rather than Integer because the ID column is not
    # guaranteed to hold purely numeric identifiers.
    clnvid_declaration = (
        "##INFO=<ID=CLNVID,Number=1,Type=String,"
        'Description="ClinVar Variation ID taken from the VCF ID column">'
        "\n"
    )
    output_handle.write(clnvid_declaration.encode("utf-8"))
12+
13+
14+
def _append_clnvid(info: str, vcf_id: str) -> str:
    """Return *info* with ``CLNVID=<vcf_id>`` appended when it is missing.

    The INFO string is returned unchanged when the ID is absent (``.`` or
    empty) or when a ``CLNVID`` key is already present.
    """
    if vcf_id in (".", ""):
        return info
    if info in (".", ""):
        return f"CLNVID={vcf_id}"
    # Compare INFO keys exactly. A plain substring test for "CLNVID=" would
    # also fire on any key that merely ends in "CLNVID" or on a value that
    # happens to contain that text, silently skipping the annotation.
    info_keys = {entry.partition("=")[0] for entry in info.split(";")}
    if "CLNVID" in info_keys:
        return info
    return f"{info};CLNVID={vcf_id}"


def process_vcf(input_path: str, output_path: str) -> None:
    """Copy a bgzipped VCF, adding a CLNVID INFO entry taken from the ID column.

    Meta-information lines (``##``) are passed through. The CLNVID ``##INFO``
    declaration is inserted immediately before the ``#CHROM`` column header
    unless the input already declares it. Every variant line with at least
    eight columns and a non-missing ID gets ``CLNVID=<ID>`` appended to its
    INFO column, unless that key is already present. Lines with fewer than
    eight columns are written back unchanged.

    Args:
        input_path: Path to the bgzipped input VCF.
        output_path: Path of the bgzipped VCF to write.
    """
    clnvid_declared = False

    with pysam.BGZFile(input_path, "r") as infile, pysam.BGZFile(
        output_path, "w"
    ) as outfile:
        for raw_line in infile:
            # Strip any terminator and re-add exactly one "\n" on output.
            # This keeps the result well-formed whether or not the reader
            # yields the trailing newline, and prevents a stray "\r" from
            # ending up inside the last column of CRLF input.
            record = raw_line.rstrip(b"\r\n")

            # Meta headers: pass through, remembering an existing declaration.
            if record.startswith(b"##"):
                if record.startswith(b"##INFO=<ID=CLNVID,"):
                    clnvid_declared = True
                outfile.write(record + b"\n")
                continue

            # Column header line: the declaration must precede it.
            if record.startswith(b"#"):
                if not clnvid_declared:
                    add_clnvid_header(outfile)
                    clnvid_declared = True
                outfile.write(record + b"\n")
                continue

            # Variant line.
            fields = record.decode("utf-8").split("\t")
            if len(fields) >= 8:
                fields[7] = _append_clnvid(info=fields[7], vcf_id=fields[2])
            outfile.write(("\t".join(fields) + "\n").encode("utf-8"))
64+
65+
66+
@click.command()
@click.argument(
    "input_path",
    type=click.Path(exists=True, readable=True, dir_okay=False),
)
@click.argument(
    "output_path",
    type=click.Path(writable=True, dir_okay=False),
)
def main(input_path: str, output_path: str) -> None:
    """
    Adds a CLNVID INFO field to each record in a bgzipped VCF file based on the ID column.

    INPUT_PATH: Path to the input VCF file (.vcf.gz).
    OUTPUT_PATH: Path to the output VCF file (.vcf.gz).
    """
    # The docstring above is rendered by click as the command's --help text,
    # so its wording is user-facing and kept verbatim.
    process_vcf(input_path=input_path, output_path=output_path)
79+
80+
81+
if __name__ == "__main__":
82+
main()

BALSAMIC/assets/scripts/collect_qc_metrics.py

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
get_analysis_type,
1515
get_capture_kit,
1616
get_sample_type_from_sample_name,
17+
get_sample_name_from_sample_type,
1718
get_sequencing_type,
1819
)
1920

@@ -35,17 +36,17 @@ def collect_qc_metrics(
3536
config_path: Path,
3637
output_path: Path,
3738
multiqc_data_path: Path,
38-
sex_prediction_path: Path,
3939
counts_path: List[Path],
40+
sex_prediction_path: Path,
4041
):
4142
"""Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file
4243
4344
Args:
4445
config_path: Path; case config file path
4546
output_path: Path; destination path for the extracted YAML formatted metrics
4647
multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted
47-
sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
4848
counts_path: Path; list of variant caller specific files containing the number of variants
49+
sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
4950
"""
5051

5152
config = read_json(config_path)
@@ -76,7 +77,7 @@ def collect_qc_metrics(
7677
)
7778

7879

79-
def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
80+
def get_multiqc_data_source(multiqc_data: dict, sampleid: str, tool: str) -> str:
8081
"""Extracts the metrics data source associated with a specific sample and tool
8182
8283
Args:
@@ -104,26 +105,18 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
104105
subtool_name[1].lower() in source_tool.lower()
105106
and subtool_name[2].lower() in source_subtool.lower()
106107
):
107-
try:
108-
return os.path.basename(
109-
multiqc_data["report_data_sources"][source_tool][
110-
source_subtool
111-
][sample]
112-
)
113-
except KeyError:
114-
# Deletes pair orientation information from the sample name (insertSize metrics)
115-
sample = sample.rsplit("_", 1)[0]
116-
return os.path.basename(
117-
multiqc_data["report_data_sources"][source_tool][
118-
source_subtool
119-
][sample]
120-
)
108+
source_dict = multiqc_data["report_data_sources"][source_tool][
109+
source_subtool
110+
]
111+
metric_file = next(
112+
(v for k, v in source_dict.items() if sampleid in k), None
113+
)
114+
return os.path.basename(metric_file)
121115

122116

123117
def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
124118
"""Retrieves the sex check metrics and returns them as a Metric list."""
125119
metric = "compare_predicted_to_given_sex"
126-
case_id: str = config["analysis"]["case_id"]
127120
sex_prediction: dict = read_json(sex_prediction_path)
128121

129122
given_sex: str = config["analysis"]["gender"]
@@ -133,8 +126,9 @@ def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
133126
for sample_type in ["tumor", "normal"]:
134127
if sample_type in sex_prediction:
135128
predicted_sex = sex_prediction[sample_type]["predicted_sex"]
129+
sample_name = get_sample_name_from_sample_type(config, sample_type)
136130
sex_prediction_metrics = Metric(
137-
id=f"{case_id}_{sample_type}",
131+
id=sample_name,
138132
input=os.path.basename(sex_prediction_path),
139133
name=metric.upper(),
140134
step="sex_check",
@@ -224,7 +218,7 @@ def get_metric_condition(
224218
req_metrics = requested_metrics[metric]["condition"]
225219
if sequencing_type == "wgs" and (
226220
(metric == "PCT_60X" and sample_type == "normal")
227-
or (metric == "MEDIAN_COVERAGE" and sample_type == "tumor")
221+
or (metric == "MEDIAN_TARGET_COVERAGE" and sample_type == "tumor")
228222
):
229223
req_metrics = None
230224

@@ -264,7 +258,7 @@ def extract(data, output_metrics, multiqc_key=None, source=None):
264258
Metric(
265259
id=get_sample_id(multiqc_key),
266260
input=get_multiqc_data_source(
267-
multiqc_data, multiqc_key, source
261+
multiqc_data, get_sample_id(multiqc_key), source
268262
),
269263
name=k,
270264
step=source,

BALSAMIC/assets/scripts/immediate_submit.py

Lines changed: 0 additions & 78 deletions
This file was deleted.

0 commit comments

Comments
 (0)