Clinical-Genomics
diff --git a/.github/workflows/pytest_and_coveralls.yml b/.github/workflows/pytest_and_coveralls.yml
Lines changed: 15 additions & 24 deletions
diff --git a/BALSAMIC/snakemake_rules/misc/__init__.py b/BALSAMIC/assets/analysis_metadata/__init__.py
BALSAMIC/snakemake_rules/misc/__init__.py renamed to BALSAMIC/assets/analysis_metadata/__init__.py
diff --git a/BALSAMIC/assets/analysis_metadata/rescue_snvs.vcf b/BALSAMIC/assets/analysis_metadata/rescue_snvs.vcf
Lines changed: 5822 additions & 0 deletions
diff --git a/BALSAMIC/assets/scripts/add_clnvid_field.py b/BALSAMIC/assets/scripts/add_clnvid_field.py
Lines changed: 82 additions & 0 deletions
diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py
Lines changed: 15 additions & 21 deletions
diff --git a/BALSAMIC/assets/scripts/immediate_submit.py b/BALSAMIC/assets/scripts/immediate_submit.py
Lines changed: 0 additions & 78 deletions
@@ -5,8 +5,7 @@ on:
       - "CHANGELOG.rst"
       - "docs/**"
   push:
-    branches:
-      - master
+    branches: [ master ]
     paths-ignore:
       - "CHANGELOG.rst"
       - "docs/**"
@@ -16,49 +15,41 @@ jobs:
     name: run PyTest
     runs-on: ubuntu-22.04
     steps:
-      # Checkout BALSAMIC
       - name: Git checkout
-        id: git_checkout
         uses: actions/checkout@v3
-      # Conda env create
+
       - name: setup conda
-        id: setup_conda
         uses: conda-incubator/setup-miniconda@v2
         with:
           activate-environment: balsamic
           environment-file: BALSAMIC/conda/balsamic.yaml
+          # optional speed-ups:
+          # auto-activate-base: false
+          # use-mamba: true
+
       - name: Install the HTML to PDF renderer
         run: sudo apt-get update && sudo apt-get install -y wkhtmltopdf
-      # Install BALSAMIC
-      - name: Install BALSAMIC
-        id: install_balsamic
-        shell: bash -l {0}
-        run: |
-          conda activate balsamic
-          pip install --no-cache-dir .
-      # Install pytest coveralls dependencies
-      - name: Install PyTest and Coveralls
-        id: install_pytest
+
+      - name: Install BALSAMIC + test extras
         shell: bash -l {0}
         run: |
           conda activate balsamic
-          pip install --no-cache-dir -r requirements-dev.txt
-      # Run PyTest
+          pip install --no-cache-dir -e .[test]
+
       - name: Run PyTest
-        id: pytest
         shell: bash -l {0}
-        run: |
-          conda activate balsamic
-          py.test --cov-report=xml --cov=BALSAMIC -rsxv tests/*
         env:
           SENTIEON_LICENSE: dummy_license
           SENTIEON_INSTALL_DIR: dummy_install_dir
-      # Run Codecov
+        run: |
+          conda activate balsamic
+          pytest --cov-report=xml --cov=BALSAMIC -rsxv tests/*
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           file: ./coverage.xml
           flags: unittests
           fail_ci_if_error: true
-          verbose: true
+          verbose: true
@@ -0,0 +1,82 @@
+import pysam
+import click
+
+
+def add_clnvid_header(output_handle) -> None:
+    """Emit the ``##INFO`` meta line that declares the CLNVID field.
+
+    Args:
+        output_handle: writable binary handle; it receives exactly one
+            UTF-8 encoded, newline-terminated header line.
+    """
+    # Declared as String rather than Integer: the ID column is not guaranteed to be numeric.
+    clnvid_declaration = (
+        "##INFO=<ID=CLNVID,Number=1,Type=String,"
+        'Description="ClinVar Variation ID taken from the VCF ID column">'
+    )
+    output_handle.write((clnvid_declaration + "\n").encode("utf-8"))
+
+
+def process_vcf(input_path: str, output_path: str) -> None:
+    """
+    Copies a bgzipped VCF to a new bgzipped VCF, adding a CLNVID INFO field taken from the ID column.
+
+    Behaviour per line type:
+      * Meta lines (``##``) are passed through.
+      * The column header line (``#``) is preceded by the CLNVID ``##INFO`` declaration
+        unless the input already declares it.
+      * Variant lines whose ID column is not ``.`` get ``CLNVID=<ID>`` added to INFO,
+        unless INFO already carries a CLNVID key.
+      * Lines with fewer than eight columns are written back unchanged.
+
+    Every output line is terminated with exactly one newline, whatever terminator
+    (none, LF or CRLF) the reader handed back for the corresponding input line.
+
+    Args:
+        input_path: path to the bgzipped input VCF.
+        output_path: path to the bgzipped output VCF.
+    """
+    clnvid_declared = False
+
+    with pysam.BGZFile(input_path, "r") as infile, pysam.BGZFile(
+        output_path, "w"
+    ) as outfile:
+        for raw_line in infile:
+            # The reader may yield lines with or without their terminator, and the input
+            # may use CRLF. Strip whatever is there and re-terminate uniformly so header
+            # lines are never glued together and no "\r" leaks into the last column.
+            record = raw_line.rstrip(b"\r\n")
+
+            # Pass through meta headers; track if the CLNVID declaration already exists.
+            if record.startswith(b"##"):
+                # Anchor on the full key (trailing comma) so that a different INFO ID
+                # which merely starts with "CLNVID" is not mistaken for it.
+                if record.startswith(b"##INFO=<ID=CLNVID,"):
+                    clnvid_declared = True
+                outfile.write(record + b"\n")
+                continue
+
+            # Column header line: declare CLNVID first if the input did not.
+            if record.startswith(b"#"):
+                if not clnvid_declared:
+                    add_clnvid_header(outfile)
+                    clnvid_declared = True
+                outfile.write(record + b"\n")
+                continue
+
+            # Variant line.
+            fields = record.decode("utf-8").split("\t")
+
+            # Only touch lines that reach the INFO column; shorter (malformed) lines
+            # fall through and are re-joined unchanged.
+            if len(fields) >= 8:
+                vcf_id = fields[2]
+                info = fields[7]
+
+                # NOTE(review): an ID column holding several semicolon-separated IDs would
+                # spill into separate INFO entries here - confirm inputs carry a single ID.
+                if vcf_id != ".":
+                    if info in (".", ""):
+                        fields[7] = f"CLNVID={vcf_id}"
+                    else:
+                        # Compare whole INFO keys, not substrings, so that a key such as
+                        # "OLD_CLNVID" does not suppress the annotation.
+                        info_keys = {
+                            entry.split("=", 1)[0] for entry in info.split(";")
+                        }
+                        if "CLNVID" not in info_keys:
+                            fields[7] = f"{info};CLNVID={vcf_id}"
+
+            outfile.write(("\t".join(fields) + "\n").encode("utf-8"))
+
+
+# Command-line entry point; both paths are positional arguments validated by click.
+@click.command()
+# The input must already exist as a readable regular file (not a directory).
+@click.argument(
+    "input_path", type=click.Path(exists=True, readable=True, dir_okay=False)
+)
+# The output path is declared writable and must not be a directory.
+@click.argument("output_path", type=click.Path(writable=True, dir_okay=False))
+def main(input_path: str, output_path: str) -> None:
+    # NOTE: click renders the docstring below as the command's --help text,
+    # so its wording is user-facing output.
+    """
+    Adds a CLNVID INFO field to each record in a bgzipped VCF file based on the ID column.
+
+    INPUT_PATH: Path to the input VCF file (.vcf.gz).
+    OUTPUT_PATH: Path to the output VCF file (.vcf.gz).
+    """
+    # All work is delegated to process_vcf; this wrapper only handles argument parsing.
+    process_vcf(input_path, output_path)
+
+
+# Run the click command only when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -14,6 +14,7 @@
     get_analysis_type,
     get_capture_kit,
     get_sample_type_from_sample_name,
+    get_sample_name_from_sample_type,
     get_sequencing_type,
 )
 
@@ -35,17 +36,17 @@ def collect_qc_metrics(
     config_path: Path,
     output_path: Path,
     multiqc_data_path: Path,
-    sex_prediction_path: Path,
     counts_path: List[Path],
+    sex_prediction_path: Path,
 ):
     """Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file
 
     Args:
         config_path: Path; case config file path
         output_path: Path; destination path for the extracted YAML formatted metrics
         multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted
-        sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
         counts_path: Path; list of variant caller specific files containing the number of variants
+        sex_prediction_path: Path; sex prediction JSON path from which sex prediction info will be extracted
     """
 
     config = read_json(config_path)
@@ -76,7 +77,7 @@ def collect_qc_metrics(
         )
 
 
-def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
+def get_multiqc_data_source(multiqc_data: dict, sampleid: str, tool: str) -> str:
     """Extracts the metrics data source associated with a specific sample and tool
 
     Args:
@@ -104,26 +105,18 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
                 subtool_name[1].lower() in source_tool.lower()
                 and subtool_name[2].lower() in source_subtool.lower()
             ):
-                try:
-                    return os.path.basename(
-                        multiqc_data["report_data_sources"][source_tool][
-                            source_subtool
-                        ][sample]
-                    )
-                except KeyError:
-                    # Deletes pair orientation information from the sample name (insertSize metrics)
-                    sample = sample.rsplit("_", 1)[0]
-                    return os.path.basename(
-                        multiqc_data["report_data_sources"][source_tool][
-                            source_subtool
-                        ][sample]
-                    )
+                source_dict = multiqc_data["report_data_sources"][source_tool][
+                    source_subtool
+                ]
+                metric_file = next(
+                    (v for k, v in source_dict.items() if sampleid in k), None
+                )
+                return os.path.basename(metric_file)
 
 
 def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
     """Retrieves the sex check metrics and returns them as a Metric list."""
     metric = "compare_predicted_to_given_sex"
-    case_id: str = config["analysis"]["case_id"]
     sex_prediction: dict = read_json(sex_prediction_path)
 
     given_sex: str = config["analysis"]["gender"]
@@ -133,8 +126,9 @@ def get_sex_check_metrics(sex_prediction_path: str, config: dict) -> list:
     for sample_type in ["tumor", "normal"]:
         if sample_type in sex_prediction:
             predicted_sex = sex_prediction[sample_type]["predicted_sex"]
+            sample_name = get_sample_name_from_sample_type(config, sample_type)
             sex_prediction_metrics = Metric(
-                id=f"{case_id}_{sample_type}",
+                id=sample_name,
                 input=os.path.basename(sex_prediction_path),
                 name=metric.upper(),
                 step="sex_check",
@@ -224,7 +218,7 @@ def get_metric_condition(
     req_metrics = requested_metrics[metric]["condition"]
     if sequencing_type == "wgs" and (
         (metric == "PCT_60X" and sample_type == "normal")
-        or (metric == "MEDIAN_COVERAGE" and sample_type == "tumor")
+        or (metric == "MEDIAN_TARGET_COVERAGE" and sample_type == "tumor")
     ):
         req_metrics = None
 
@@ -264,7 +258,7 @@ def extract(data, output_metrics, multiqc_key=None, source=None):
                             Metric(
                                 id=get_sample_id(multiqc_key),
                                 input=get_multiqc_data_source(
-                                    multiqc_data, multiqc_key, source
+                                    multiqc_data, get_sample_id(multiqc_key), source
                                 ),
                                 name=k,
                                 step=source,