Skip to content

Commit 56e2f8c

Browse files
authored
feat: use predicted sex in cnv analysis (#1581)
#### Added - gender unknown gender assignment - function for retrieving predicted case gender if gender is unknown - warning_qc metric thresholds - moved sex check to a warning metric - added predicted sex to inputs for various cnv rules #### Changed - refactored validate_metric - modified validate_metric to not fail when metric is a warning_qc metric - added balsamic.log as logfile to snakemake workflows - create new balsamic.log.[number] unique with each new start of balsamic - divided balsamic run and balsamic config into separate logs (necessary to keep logs clean for reruns) #### Removed - balsamic status check which was never used
1 parent 6d72b4c commit 56e2f8c

29 files changed

+284
-138
lines changed

BALSAMIC/assets/scripts/collect_qc_metrics.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
get_analysis_type,
1515
get_capture_kit,
1616
get_sample_type_from_sample_name,
17+
get_sample_name_from_sample_type,
1718
get_sequencing_type,
1819
)
1920

@@ -35,17 +36,17 @@ def collect_qc_metrics(
3536
config_path: Path,
3637
output_path: Path,
3738
multiqc_data_path: Path,
38-
sex_prediction_path: Path,
3939
counts_path: List[Path],
40+
sex_prediction_path: Path,
4041
):
4142
"""Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file
4243
4344
Args:
4445
config_path: Path; case config file path
4546
output_path: Path; destination path for the extracted YAML formatted metrics
4647
multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted
47-
sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
4848
counts_path: Path; list of variant caller specific files containing the number of variants
49+
sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
4950
"""
5051

5152
config = read_json(config_path)
@@ -123,7 +124,6 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
123124
def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
124125
"""Retrieves the sex check metrics and returns them as a Metric list."""
125126
metric = "compare_predicted_to_given_sex"
126-
case_id: str = config["analysis"]["case_id"]
127127
sex_prediction: dict = read_json(sex_prediction_path)
128128

129129
given_sex: str = config["analysis"]["gender"]
@@ -133,8 +133,9 @@ def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
133133
for sample_type in ["tumor", "normal"]:
134134
if sample_type in sex_prediction:
135135
predicted_sex = sex_prediction[sample_type]["predicted_sex"]
136+
sample_name = get_sample_name_from_sample_type(config, sample_type)
136137
sex_prediction_metrics = Metric(
137-
id=f"{case_id}_{sample_type}",
138+
id=sample_name,
138139
input=os.path.basename(sex_prediction_path),
139140
name=metric.upper(),
140141
step="sex_check",

BALSAMIC/commands/config/case.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,6 @@ def case_config(
191191
if path is not None
192192
}
193193
)
194-
LOG.info(f"Collected references: {references}")
195194

196195
analysis_fastq_dir: str = get_analysis_fastq_files_directory(
197196
case_dir=Path(analysis_dir, case_id).as_posix(), fastq_path=fastq_path

BALSAMIC/commands/options.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,8 @@
230230
OPTION_GENDER = click.option(
231231
"--gender",
232232
required=False,
233-
type=click.Choice([Gender.FEMALE, Gender.MALE]),
234-
default=Gender.FEMALE,
233+
type=click.Choice([Gender.FEMALE, Gender.MALE, Gender.UNKNOWN]),
234+
default=Gender.UNKNOWN,
235235
show_default=True,
236236
help="Case associated Gender",
237237
)

BALSAMIC/constants/analysis.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class Gender(StrEnum):
4545

4646
FEMALE: str = "female"
4747
MALE: str = "male"
48+
UNKNOWN: str = "unknown"
4849

4950

5051
class AnalysisType(StrEnum):

BALSAMIC/constants/metrics.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""QC metrics constants."""
22
import operator
3-
from typing import Dict, Callable
3+
from typing import Dict, Callable, List
44

55

66
VALID_OPS: Dict[str, Callable] = {
@@ -12,6 +12,7 @@
1212
"gt": operator.gt,
1313
}
1414

15+
METRIC_WARNINGS = {"COMPARE_PREDICTED_TO_GIVEN_SEX"}
1516

1617
METRICS: Dict[str, dict] = {
1718
"targeted": {

BALSAMIC/models/config.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Balsamic analysis config case models."""
22

33
import re
4+
import os
45
from glob import glob
56
from pathlib import Path
67
from typing import Annotated, Dict, List, Optional
@@ -23,6 +24,7 @@
2324
)
2425
from BALSAMIC.models.params import QCModel
2526
from BALSAMIC.models.validators import is_dir, is_file
27+
from BALSAMIC.utils.io import read_json
2628

2729

2830
class FastqInfoModel(BaseModel):
@@ -143,7 +145,7 @@ class AnalysisModel(BaseModel):
143145
144146
Raises:
145147
ValueError:
146-
When gender is set to any other than [female, male]
148+
When gender is set to any other than [female, male, unknown]
147149
When analysis_type is set to any value other than [single, paired, pon]
148150
When sequencing_type is set to any value other than [wgs, targeted]
149151
When analysis_workflow is set to any other than [balsamic, balsamic-qc, balsamic-umi]
@@ -473,3 +475,30 @@ def get_cnv_report_plots(self) -> List[str]:
473475
f"CNV.somatic.{self.analysis.case_id}.ascat.germline.png",
474476
f"CNV.somatic.{self.analysis.case_id}.ascat.sunrise.png",
475477
]
478+
479+
def get_gender(self, wildcards, input):
480+
"""Return the bioinformatically predicted sex of the case if the given sex is unknown."""
481+
482+
if self.analysis.gender != Gender.UNKNOWN:
483+
return self.analysis.gender # Default to using assigned gender
484+
485+
if not os.path.exists(input.sex_prediction_json):
486+
return Gender.FEMALE # Only necessary for snakemake dry-run
487+
488+
sex_prediction = read_json(input.sex_prediction_json)
489+
490+
gender = Gender.UNKNOWN
491+
if self.analysis.analysis_type == AnalysisType.PAIRED:
492+
# Prioritise normal gender if available
493+
gender = sex_prediction[SampleType.NORMAL]["predicted_sex"]
494+
495+
if gender == Gender.UNKNOWN:
496+
# Fall back to use tumor gender
497+
gender = sex_prediction[SampleType.TUMOR]["predicted_sex"]
498+
499+
if gender == Gender.UNKNOWN:
500+
# If gender is unknown, default to using female gender
501+
return Gender.FEMALE
502+
else:
503+
# Return predicted gender
504+
return gender

BALSAMIC/models/metrics.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""QC validation metrics model."""
22
import logging
3-
from typing import Optional, Any, List, Annotated
3+
from typing import Optional, Any, List, Annotated, Callable
44

55
from pydantic import BaseModel, AfterValidator
66

7-
from BALSAMIC.constants.metrics import VALID_OPS
7+
from BALSAMIC.constants.metrics import VALID_OPS, METRIC_WARNINGS
88

99
LOG = logging.getLogger(__name__)
1010

@@ -50,6 +50,10 @@ def validate_metric(metric: Metric):
5050
threshold: Optional[Any] = metric.condition.threshold
5151
value: Any = metric.value
5252

53+
# Ignore warning metrics from failing
54+
if metric.name in METRIC_WARNINGS:
55+
return metric
56+
5357
# Validate the norm operator
5458
if norm not in VALID_OPS:
5559
raise ValueError(f"Unsupported operation: {norm}")
@@ -67,7 +71,6 @@ def validate_metric(metric: Metric):
6771
f"are not compatible with operator {norm}. (ID: {metric.id})."
6872
)
6973

70-
LOG.info(f"QC metric {metric.name}: {metric.value} meets its condition.")
7174
return metric
7275

7376

BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ if config["analysis"]["sequencing_type"] != 'wgs':
66
rule vcf2cytosure_convert:
77
input:
88
cnvkit_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf.gz",
9-
cnvkit_cnr = cnv_dir + "tumor.merged" + ".cnr"
9+
cnvkit_cnr = cnv_dir + "tumor.merged" + ".cnr",
10+
sex_prediction_json = qc_dir + "sex_prediction.json"
1011
output:
1112
cgh_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf2cytosure.cgh"
1213
benchmark:
@@ -15,7 +16,7 @@ if config["analysis"]["sequencing_type"] != 'wgs':
1516
Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix()
1617
params:
1718
case_name = config["analysis"]["case_id"],
18-
gender = config["analysis"]["gender"],
19+
gender = config_model.get_gender,
1920
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"},
2021
message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}"
2122
shell:
@@ -28,6 +29,7 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal
2829
input:
2930
delly_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".dellycnv.vcf.gz",
3031
tiddit_cov_tumor = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tumor.tiddit_cov.bed",
32+
sex_prediction_json = qc_dir + "sex_prediction.json"
3133
output:
3234
cgh_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".tumor.vcf2cytosure.cgh"
3335
benchmark:
@@ -36,7 +38,7 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal
3638
Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix()
3739
params:
3840
case_name = config["analysis"]["case_id"],
39-
gender = config["analysis"]["gender"],
41+
gender = config_model.get_gender,
4042
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"},
4143
message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}"
4244
shell:
@@ -50,6 +52,7 @@ elif config["analysis"]["sequencing_type"] == "wgs" and config["analysis"]["anal
5052
ascat_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.vcf.gz",
5153
tiddit_cov_tumor = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tumor.tiddit_cov.bed",
5254
tiddit_cov_normal = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".normal.tiddit_cov.bed",
55+
sex_prediction_json = qc_dir + "sex_prediction.json"
5356
output:
5457
ascat_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".filtered.ascat.vcf.gz",
5558
cgh_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".tumor.vcf2cytosure.cgh",
@@ -60,7 +63,7 @@ elif config["analysis"]["sequencing_type"] == "wgs" and config["analysis"]["anal
6063
Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix()
6164
params:
6265
case_name = config["analysis"]["case_id"],
63-
gender= config["analysis"]["gender"],
66+
gender = config_model.get_gender,
6467
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"},
6568
message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}"
6669
shell:

BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,19 @@ if config["analysis"]["analysis_workflow"] != "balsamic-qc":
1919
rule collect_custom_qc_metrics:
2020
input:
2121
bcftools_counts = bcftools_counts_input,
22-
sex_prediction_json = qc_dir + "sex_prediction.json",
23-
json = qc_dir + "multiqc_data/multiqc_data.json"
22+
json = qc_dir + "multiqc_data/multiqc_data.json",
23+
sex_prediction_json= qc_dir + "sex_prediction.json"
2424
output:
2525
yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml"
2626
params:
27-
config_path = f"{analysis_dir_home}/{case_id}/{case_id}.json",
27+
config_path = f"{case_dir}/{case_id}.json",
2828
collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"),
2929
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"}
3030
message:
3131
"Extract the manually specified QC metric for validation and delivery"
3232
shell:
3333
"""
34-
python {params.collect_qc_metrics_script} --sex-prediction-path {input.sex_prediction_json} {params.config_path} {output.yaml} {input.json} {input.bcftools_counts}
34+
python {params.collect_qc_metrics_script} --sex-prediction-path {input.sex_prediction_json} {params.config_path} {output.yaml} {input.json} {input.bcftools_counts}
3535
"""
3636
else:
3737
rule collect_custom_qc_metrics:
@@ -41,7 +41,7 @@ else:
4141
output:
4242
yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml"
4343
params:
44-
config_path = f"{analysis_dir_home}/{case_id}/{case_id}.json",
44+
config_path = f"{case_dir}/{case_id}.json",
4545
collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"),
4646
housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"}
4747
message:

BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ rule cnvkit_call_CNV_research:
206206
cns_initial = cnv_dir + "tumor.initial.cns",
207207
cnr = cnv_dir + "tumor.merged.cnr",
208208
snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz",
209+
sex_prediction_json = qc_dir + "sex_prediction.json"
209210
output:
210211
cns = cnv_dir + "tumor.merged.cns",
211212
gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks",
@@ -223,7 +224,7 @@ rule cnvkit_call_CNV_research:
223224
cnv_dir = cnv_dir,
224225
cnsr = lambda wc: "tumor.merged.cn{s,r}",
225226
case_name = config["analysis"]["case_id"],
226-
gender = config["analysis"]["gender"],
227+
gender = config_model.get_gender,
227228
tumor_sample_id = "TUMOR",
228229
normal_sample_id = "NORMAL",
229230
message:

0 commit comments

Comments
 (0)