Clinical-Genomics
diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py
Lines changed: 5 additions & 4 deletions
diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
Lines changed: 0 additions & 1 deletion
diff --git a/BALSAMIC/commands/options.py b/BALSAMIC/commands/options.py
Lines changed: 2 additions & 2 deletions
diff --git a/BALSAMIC/constants/analysis.py b/BALSAMIC/constants/analysis.py
Lines changed: 1 addition & 0 deletions
diff --git a/BALSAMIC/constants/metrics.py b/BALSAMIC/constants/metrics.py
Lines changed: 2 additions & 1 deletion
diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py
Lines changed: 30 additions & 1 deletion
diff --git a/BALSAMIC/models/metrics.py b/BALSAMIC/models/metrics.py
Lines changed: 6 additions & 3 deletions
diff --git a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule
Lines changed: 7 additions & 4 deletions
diff --git a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule
Lines changed: 5 additions & 5 deletions
diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule
Lines changed: 2 additions & 1 deletion
@@ -14,6 +14,7 @@
     get_analysis_type,
     get_capture_kit,
     get_sample_type_from_sample_name,
+    get_sample_name_from_sample_type,
     get_sequencing_type,
 )
 
@@ -35,17 +36,17 @@ def collect_qc_metrics(
     config_path: Path,
     output_path: Path,
     multiqc_data_path: Path,
-    sex_prediction_path: Path,
     counts_path: List[Path],
+    sex_prediction_path: Path,
 ):
     """Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file
 
     Args:
         config_path: Path; case config file path
         output_path: Path; destination path for the extracted YAML formatted metrics
         multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted
-        sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
         counts_path: Path; list of variant caller specific files containing the number of variants
+        sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
     """
 
     config = read_json(config_path)
@@ -123,7 +124,6 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
 def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
     """Retrieves the sex check metrics and returns them as a Metric list."""
     metric = "compare_predicted_to_given_sex"
-    case_id: str = config["analysis"]["case_id"]
     sex_prediction: dict = read_json(sex_prediction_path)
 
     given_sex: str = config["analysis"]["gender"]
@@ -133,8 +133,9 @@ def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
     for sample_type in ["tumor", "normal"]:
         if sample_type in sex_prediction:
             predicted_sex = sex_prediction[sample_type]["predicted_sex"]
+            sample_name = get_sample_name_from_sample_type(config, sample_type)
             sex_prediction_metrics = Metric(
-                id=f"{case_id}_{sample_type}",
+                id=sample_name,
                 input=os.path.basename(sex_prediction_path),
                 name=metric.upper(),
                 step="sex_check",
 
@@ -191,7 +191,6 @@ def case_config(
             if path is not None
         }
     )
-    LOG.info(f"Collected references: {references}")
 
     analysis_fastq_dir: str = get_analysis_fastq_files_directory(
         case_dir=Path(analysis_dir, case_id).as_posix(), fastq_path=fastq_path
 
@@ -230,8 +230,8 @@
 OPTION_GENDER = click.option(
     "--gender",
     required=False,
-    type=click.Choice([Gender.FEMALE, Gender.MALE]),
-    default=Gender.FEMALE,
+    type=click.Choice([Gender.FEMALE, Gender.MALE, Gender.UNKNOWN]),
+    default=Gender.UNKNOWN,
     show_default=True,
     help="Case associated Gender",
 )
 
@@ -45,6 +45,7 @@ class Gender(StrEnum):
 
     FEMALE: str = "female"
     MALE: str = "male"
+    UNKNOWN: str = "unknown"
 
 
 class AnalysisType(StrEnum):
 
@@ -1,6 +1,6 @@
 """QC metrics constants."""
 import operator
-from typing import Dict, Callable
+from typing import Dict, Callable, List
 
 
 VALID_OPS: Dict[str, Callable] = {
@@ -12,6 +12,7 @@
     "gt": operator.gt,
 }
 
+METRIC_WARNINGS = {"COMPARE_PREDICTED_TO_GIVEN_SEX"}
 
 METRICS: Dict[str, dict] = {
     "targeted": {
 
@@ -1,6 +1,7 @@
 """Balsamic analysis config case models."""
 
 import re
+import os
 from glob import glob
 from pathlib import Path
 from typing import Annotated, Dict, List, Optional
@@ -23,6 +24,7 @@
 )
 from BALSAMIC.models.params import QCModel
 from BALSAMIC.models.validators import is_dir, is_file
+from BALSAMIC.utils.io import read_json
 
 
 class FastqInfoModel(BaseModel):
@@ -143,7 +145,7 @@ class AnalysisModel(BaseModel):
 
     Raises:
         ValueError:
-            When gender is set to any other than [female, male]
+            When gender is set to any other than [female, male, unknown]
             When analysis_type is set to any value other than [single, paired, pon]
             When sequencing_type is set to any value other than [wgs, targeted]
             When analysis_workflow is set to any other than [balsamic, balsamic-qc, balsamic-umi]
@@ -473,3 +475,30 @@ def get_cnv_report_plots(self) -> List[str]:
             f"CNV.somatic.{self.analysis.case_id}.ascat.germline.png",
             f"CNV.somatic.{self.analysis.case_id}.ascat.sunrise.png",
         ]
+
+    def get_gender(self, wildcards, input):
+        """Return the bioinformatically predicted sex of the case if the given sex is unknown.
+
+        The (wildcards, input) signature lets this method be handed directly to a
+        Snakemake rule as a ``params`` function (``gender = config_model.get_gender``).
+
+        Args:
+            wildcards: rule wildcards; accepted for the call signature but not used here
+            input: rule input object; only its ``sex_prediction_json`` attribute is read
+
+        Returns:
+            The configured gender when it is not ``Gender.UNKNOWN``. Otherwise the
+            ``predicted_sex`` value read from the sex prediction JSON (normal sample
+            first for paired analyses, then tumor). ``Gender.FEMALE`` is returned when
+            the JSON file does not exist or the resulting prediction is still unknown.
+        """
+
+        if self.analysis.gender != Gender.UNKNOWN:
+            return self.analysis.gender  # Default to using assigned gender
+
+        # No prediction file on disk yet: return a placeholder instead of failing to read it
+        if not os.path.exists(input.sex_prediction_json):
+            return Gender.FEMALE  # Only necessary for snakemake dry-run
+
+        # Read below as sex_prediction[<sample type>]["predicted_sex"]
+        sex_prediction = read_json(input.sex_prediction_json)
+
+        gender = Gender.UNKNOWN
+        if self.analysis.analysis_type == AnalysisType.PAIRED:
+            # Prioritise normal gender if available
+            gender = sex_prediction[SampleType.NORMAL]["predicted_sex"]
+
+        # Reached for non-paired analyses, or when the normal prediction is unknown
+        if gender == Gender.UNKNOWN:
+            # Fall back to use tumor gender
+            gender = sex_prediction[SampleType.TUMOR]["predicted_sex"]
+
+        if gender == Gender.UNKNOWN:
+            # If gender is unknown, default to using female gender
+            return Gender.FEMALE
+        else:
+            # Return predicted gender
+            return gender
@@ -1,10 +1,10 @@
 """QC validation metrics model."""
 import logging
-from typing import Optional, Any, List, Annotated
+from typing import Optional, Any, List, Annotated, Callable
 
 from pydantic import BaseModel, AfterValidator
 
-from BALSAMIC.constants.metrics import VALID_OPS
+from BALSAMIC.constants.metrics import VALID_OPS, METRIC_WARNINGS
 
 LOG = logging.getLogger(__name__)
 
@@ -50,6 +50,10 @@ def validate_metric(metric: Metric):
         threshold: Optional[Any] = metric.condition.threshold
         value: Any = metric.value
 
+        # Skip the condition check for warning-only metrics (METRIC_WARNINGS) so they never fail validation
+        if metric.name in METRIC_WARNINGS:
+            return metric
+
         # Validate the norm operator
         if norm not in VALID_OPS:
             raise ValueError(f"Unsupported operation: {norm}")
@@ -67,7 +71,6 @@ def validate_metric(metric: Metric):
                 f"are not compatible with operator {norm}. (ID: {metric.id})."
             )
 
-    LOG.info(f"QC metric {metric.name}: {metric.value} meets its condition.")
     return metric
 
 
 
@@ -6,7 +6,8 @@ if config["analysis"]["sequencing_type"] != 'wgs':
     rule vcf2cytosure_convert:
         input:
             cnvkit_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf.gz",
-            cnvkit_cnr = cnv_dir + "tumor.merged" + ".cnr"
+            cnvkit_cnr = cnv_dir + "tumor.merged" + ".cnr",
+            sex_prediction_json = qc_dir + "sex_prediction.json"
         output:
             cgh_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf2cytosure.cgh"
         benchmark:
@@ -15,7 +16,7 @@ if config["analysis"]["sequencing_type"] != 'wgs':
             Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix()
         params:
             case_name = config["analysis"]["case_id"],
-            gender = config["analysis"]["gender"],
+            gender = config_model.get_gender,
             housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"},
         message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}"
         shell:
@@ -28,6 +29,7 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal
         input:
             delly_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".dellycnv.vcf.gz",
             tiddit_cov_tumor = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tumor.tiddit_cov.bed",
+            sex_prediction_json = qc_dir + "sex_prediction.json"
         output:
             cgh_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".tumor.vcf2cytosure.cgh"
         benchmark:
@@ -36,7 +38,7 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal
             Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix()
         params:
             case_name = config["analysis"]["case_id"],
-            gender = config["analysis"]["gender"],
+            gender = config_model.get_gender,
             housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"},
         message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}"
         shell:
@@ -50,6 +52,7 @@ elif config["analysis"]["sequencing_type"] == "wgs" and config["analysis"]["anal
             ascat_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.vcf.gz",
             tiddit_cov_tumor = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tumor.tiddit_cov.bed",
             tiddit_cov_normal = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".normal.tiddit_cov.bed",
+            sex_prediction_json = qc_dir + "sex_prediction.json"
         output:
             ascat_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".filtered.ascat.vcf.gz",
             cgh_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".tumor.vcf2cytosure.cgh",
@@ -60,7 +63,7 @@ elif config["analysis"]["sequencing_type"] == "wgs" and config["analysis"]["anal
             Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix()
         params:
             case_name = config["analysis"]["case_id"],
-            gender= config["analysis"]["gender"],
+            gender = config_model.get_gender,
             housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"},
         message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}"
         shell:
 
@@ -19,19 +19,19 @@ if config["analysis"]["analysis_workflow"] != "balsamic-qc":
     rule collect_custom_qc_metrics:
         input:
             bcftools_counts = bcftools_counts_input,
-            sex_prediction_json = qc_dir + "sex_prediction.json",
-            json = qc_dir + "multiqc_data/multiqc_data.json"
+            json = qc_dir + "multiqc_data/multiqc_data.json",
+            sex_prediction_json= qc_dir + "sex_prediction.json"
         output:
             yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml"
         params:
-            config_path = f"{analysis_dir_home}/{case_id}/{case_id}.json",
+            config_path = f"{case_dir}/{case_id}.json",
             collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"),
             housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"}
         message:
             "Extract the manually specified QC metric for validation and delivery"
         shell:
             """
-    python {params.collect_qc_metrics_script} --sex-prediction-path {input.sex_prediction_json} {params.config_path} {output.yaml} {input.json} {input.bcftools_counts}
+    python {params.collect_qc_metrics_script}  --sex-prediction-path {input.sex_prediction_json} {params.config_path} {output.yaml} {input.json} {input.bcftools_counts}
             """
 else:
     rule collect_custom_qc_metrics:
@@ -41,7 +41,7 @@ else:
         output:
             yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml"
         params:
-            config_path = f"{analysis_dir_home}/{case_id}/{case_id}.json",
+            config_path = f"{case_dir}/{case_id}.json",
             collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"),
             housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"}
         message:
 
@@ -206,6 +206,7 @@ rule cnvkit_call_CNV_research:
         cns_initial = cnv_dir + "tumor.initial.cns",
         cnr = cnv_dir + "tumor.merged.cnr",
         snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz",
+        sex_prediction_json = qc_dir + "sex_prediction.json"
     output:
         cns = cnv_dir + "tumor.merged.cns",
         gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks",
@@ -223,7 +224,7 @@ rule cnvkit_call_CNV_research:
         cnv_dir = cnv_dir,
         cnsr = lambda wc: "tumor.merged.cn{s,r}",
         case_name = config["analysis"]["case_id"],
-        gender = config["analysis"]["gender"],
+        gender = config_model.get_gender,
         tumor_sample_id = "TUMOR",
         normal_sample_id = "NORMAL",
     message:
Original file line number	Diff line number	Diff line change
`@@ -191,7 +191,6 @@ def case_config(`
`191`	`191`	`if path is not None`
`192`	`192`	`}`
`193`	`193`	`)`
`194`		`- LOG.info(f"Collected references: {references}")`
`195`	`194`
`196`	`195`	`analysis_fastq_dir: str = get_analysis_fastq_files_directory(`
`197`	`196`	`case_dir=Path(analysis_dir, case_id).as_posix(), fastq_path=fastq_path`