@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import logging
-import re
 import subprocess
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional
@@ -12,6 +11,9 @@
 
 LOG = logging.getLogger(__name__)
 
+FAILURE_LIKE_STATES = {"FAILED", "CANCELLED", "TIMEOUT", "OUT_OF_MEMORY"}
+SUCCESS_STATES = {"COMPLETED"}
+
 
 def find_job_logs(log_root: Path) -> Dict[str, Path]:
     """
@@ -30,35 +32,57 @@ def find_job_logs(log_root: Path) -> Dict[str, Path]:
 
 def get_job_state(jobid: str) -> Optional[str]:
     """
-    Return raw output of `scontrol show job JOBID`, or None if the query fails.
+    Look up the final job state via `sacct`.
+
+    We prefer the top-level job record (e.g. '10683002')
+    over step records (e.g. '10683002.batch', '10683002.0').
+
+    Returns a normalized state string like 'COMPLETED', 'FAILED',
+    'CANCELLED', etc., or None if not found.
     """
+    cmd = [
+        "sacct",
+        "-j",
+        jobid,
+        "--noheader",  # no column headers
+        "--parsable2",  # '|' separator, stable columns
+        "-o",
+        "JobID,State",
+    ]
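+    # For reference, this is equivalent to running (jobid shown is illustrative):
+    #   sacct -j 10683002 --noheader --parsable2 -o JobID,State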
+
     try:
-        LOG.debug(f"Running show job scontrol {jobid}")
+        LOG.debug("Running: %s", " ".join(cmd))
         result = subprocess.run(
-            ["scontrol", "show", "job", jobid],
+            cmd,
             capture_output=True,
             text=True,
             check=True,
         )
-        return result.stdout
     except FileNotFoundError:
-        LOG.error("scontrol executable not found: scontrol")
+        LOG.error("sacct executable not found: sacct")
         return None
     except subprocess.CalledProcessError as e:
-        LOG.warning(f"Could not check job {jobid} (may not exist). rc={e.returncode}")
-        LOG.debug(f"scontrol stderr for {jobid} {e.stderr}")
+        LOG.warning("Could not check job %s via sacct (rc=%s)", jobid, e.returncode)
+        LOG.debug("sacct stderr for %s: %s", jobid, e.stderr)
         return None
 
+    lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
+    if not lines:
+        LOG.debug("No sacct records returned for job %s", jobid)
+        return None
+
+    # Each line looks like: "10683002|FAILED" or "10683002.0|CANCELLED+"
+    records = [ln.split("|") for ln in lines]
+
+    # Prefer the exact jobid (no step suffix)
+    parent_record = next((r for r in records if r[0] == jobid), None)
+    chosen = parent_record or records[0]
+
+    raw_state = chosen[1]
+    # Normalize things like "CANCELLED+" or "CANCELLED by <uid>"
+    state = raw_state.split()[0].rstrip("+")
+    LOG.debug("Job %s sacct raw state=%r -> normalized=%r", jobid, raw_state, state)
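+    # Illustrative example: if sacct printed
+    #   10683002|FAILED
+    #   10683002.batch|FAILED
+    #   10683002.0|CANCELLED+
+    # the top-level record "10683002" wins and "FAILED" is returned.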
 
-def parse_state(scontrol_output: str) -> Optional[str]:
-    """
-    Extract JobState from scontrol text, e.g. 'JobState=FAILED'.
-    Returns the state string (e.g. 'FAILED') or None if not found.
-    """
-    m = re.search(r"JobState=(\S+)", scontrol_output)
-    state = m.group(1) if m else None
-    if state is None:
-        LOG.debug("JobState not found in scontrol output")
     return state
 
 
@@ -67,44 +91,153 @@ def write_results(
     failed: List[Tuple[str, Path]],
     cancelled: List[Tuple[str, Path]],
     unknown: List[str],
+    resolved_failures: List[Tuple[str, Path, str, str]],
 ) -> None:
     """
     Append job results to output_file.
     Each run is prefixed with a timestamp header.
+
+    resolved_failures items are (jobid, log_path, state, rule_key).
     """
     output_file.parent.mkdir(parents=True, exist_ok=True)
 
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
     with output_file.open("a") as out_f:
-        out_f.write(f"=== Job status check at {timestamp} ===\n")
+        out_f.write(f"=== Job status check at {timestamp} ===\n\n")
+
+        # If there are no *unresolved* failures/cancellations, consider the run successful.
+        if not failed and not cancelled:
+            out_f.write("SUCCESSFUL\n\n")
 
         if failed:
-            out_f.write("Failed jobs:\n")
+            out_f.write("FAILED JOBS (no successful retry):\n")
             for jobid, log_path in failed:
                 out_f.write(f"{jobid}\t{log_path}\n")
             out_f.write("\n")
 
         if cancelled:
-            out_f.write("Cancelled jobs:\n")
+            out_f.write("CANCELLED JOBS (no successful retry):\n")
             for jobid, log_path in cancelled:
                 out_f.write(f"{jobid}\t{log_path}\n")
             out_f.write("\n")
 
+        if resolved_failures:
+            out_f.write("\n")
+            out_f.write("NOTE:\n")
+            out_f.write("Some jobs failed but succeeded on retry:\n")
+            for jobid, log_path, state, rule_key in resolved_failures:
+                out_f.write(f"{jobid}\t{log_path}\t{state}\n")
+            out_f.write("\n")
+
         if unknown:
             out_f.write("Unknown status jobs:\n")
             for jobid in unknown:
                 out_f.write(f"{jobid}\tNA\n")
             out_f.write("\n")
 
-        if not failed and not cancelled:
-            out_f.write("SUCCESSFUL\n\n")
-
     LOG.info(
-        f"Appended results to {output_file} (failed={len(failed)}, cancelled={len(cancelled)} unknown={len(unknown)})"
+        "Appended results to %s (failed=%d, cancelled=%d, resolved_failures=%d, unknown=%d)",
+        output_file,
+        len(failed),
+        len(cancelled),
+        len(resolved_failures),
+        len(unknown),
     )
 
 
+def derive_rule_key(log_root: Path, log_path: Path) -> str:
+    """
+    Derive the rule's unique log-directory path (including any wildcards) as a grouping key.
+
+    Prefer the path relative to log_root; fall back to the absolute parent directory.
+    """
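+    # Illustrative example (hypothetical paths): with log_root=/work/logs and
+    # log_path=/work/logs/align/sampleA/10683002.log, the key is "align/sampleA".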
+    try:
+        return str(log_path.parent.relative_to(log_root))
+    except ValueError:
+        return str(log_path.parent)
+
+
+def group_jobs_by_rule(
+    log_dir: Path, job_logs: Dict[str, Path]
+) -> Tuple[Dict[str, List[Tuple[str, Path, Optional[str]]]], List[str]]:
+    """
+    Query sacct for each job and group them per rule directory.
+
+    Returns:
+        rule_to_jobs: {rule_key -> [(jobid, log_path, state), ...]}
+        unknown: list of jobids with missing sacct info
+    """
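+    # Illustrative shape of the return value (hypothetical ids and paths):
+    #   ({"align/sampleA": [("10683001", Path("..."), "FAILED"),
+    #                       ("10683002", Path("..."), "COMPLETED")]},
+    #    ["10683099"])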
+    rule_to_jobs: Dict[str, List[Tuple[str, Path, Optional[str]]]] = {}
+    unknown: List[str] = []
+
+    for jobid in sorted(job_logs.keys(), key=int):
+        log_path = job_logs[jobid]
+
+        state = get_job_state(jobid)
+        if state is None:
+            LOG.warning(
+                "Missing sacct state for job %s -- setting status UNKNOWN", jobid
+            )
+            unknown.append(jobid)
+            continue
+        rule_key = derive_rule_key(log_dir, log_path)
+
+        LOG.debug("Job %s in rule dir %s has state %s", jobid, rule_key, state)
+
+        rule_to_jobs.setdefault(rule_key, []).append((jobid, log_path, state))
+
+    return rule_to_jobs, unknown
+
+
+def classify_jobs(
+    rule_to_jobs: Dict[str, List[Tuple[str, Path, Optional[str]]]]
+) -> Tuple[
+    List[Tuple[str, Path]],
+    List[Tuple[str, Path]],
+    List[Tuple[str, Path, str, str]],
+]:
+    """
+    Classify jobs into:
+    - failed (no successful retry for that rule) -> **only the latest attempt**
+    - cancelled (no successful retry for that rule) -> **only the latest attempt**
+    - resolved_failures (failed/cancelled but with a successful retry)
+    """
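+    # Worked example (hypothetical jobids): if rule "align/sampleA" ran job 100
+    # (FAILED) and then job 101 (COMPLETED), job 100 ends up in resolved_failures.
+    # If rule "call/sampleB" ran job 102 (FAILED) and then job 103 (TIMEOUT) with
+    # no success, only the latest attempt (103) is reported, under cancelled.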
+    failed: List[Tuple[str, Path]] = []
+    cancelled: List[Tuple[str, Path]] = []
+    resolved_failures: List[Tuple[str, Path, str, str]] = []
+
+    for rule_key, jobs in rule_to_jobs.items():
+        # jobs for this rule are in ascending jobid order (because of sorted(...) in group_jobs_by_rule)
+        has_success = any(state in SUCCESS_STATES for _, _, state in jobs)
+
+        # All failure-like attempts for this rule
+        failure_like_jobs = [
+            (jobid, log_path, state)
+            for jobid, log_path, state in jobs
+            if state in FAILURE_LIKE_STATES
+        ]
+
+        if not failure_like_jobs:
+            # Nothing to do for this rule
+            continue
+
+        if has_success:
+            # Rule eventually succeeded; treat *all* failure-like attempts as resolved.
+            for jobid, log_path, state in failure_like_jobs:
+                resolved_failures.append((jobid, log_path, state, rule_key))
+        else:
+            # Rule never succeeded: only report the **latest** failure-like attempt.
+            latest_jobid, latest_log_path, latest_state = failure_like_jobs[-1]
+            if latest_state == "FAILED":
+                failed.append((latest_jobid, latest_log_path))
+            else:
+                # CANCELLED, TIMEOUT, OUT_OF_MEMORY, ...
+                cancelled.append((latest_jobid, latest_log_path))
+
+    return failed, cancelled, resolved_failures
+
+
 @click.command()
 @click.argument(
     "log_dir", type=click.Path(exists=True, file_okay=False, path_type=Path)
@@ -128,7 +261,11 @@ def write_results(
 def check_failed_jobs(log_dir: Path, output: Path, log_level: str) -> None:
     """
     Recursively scan LOG_DIR for SLURM *.log files (stdout+stderr combined),
-    extract job IDs from filenames, and check their states via `scontrol show job JOBID`.
+    extract job IDs from filenames, and check their states via `sacct`.
+
+    If multiple jobs share the same rule log directory and at least one of them
+    completes successfully, earlier failures in that directory are reported in a
+    separate NOTE section as jobs that failed but succeeded on retry.
     """
     logging.basicConfig(
         level=getattr(logging, log_level.upper(), logging.INFO),
@@ -138,33 +275,14 @@ def check_failed_jobs(log_dir: Path, output: Path, log_level: str) -> None:
     LOG.info("Scanning logs under: %s", log_dir)
     job_logs = find_job_logs(log_dir)
 
-    failed: List[Tuple[str, Path]] = []
-    cancelled: List[Tuple[str, Path]] = []
-    unknown: List[str] = []
-
     if not job_logs:
         LOG.warning("No job logs found (no files matching '*.log')")
         return
 
-    for jobid in sorted(job_logs.keys(), key=int):
-        out_text = get_job_state(jobid)
-        if not out_text:
-            # Can't classify without job info; skip but note it.
-            LOG.warning(
-                f"Missing scontrol output for job {jobid} -- setting status UNKNOWN"
-            )
-            unknown.append(jobid)
-            continue
-
-        state = parse_state(out_text)
-        if state == "FAILED":
-            failed.append((jobid, job_logs[jobid]))
-        elif state in ["CANCELLED", "TIMEOUT", "OUT_OF_MEMORY"]:
-            cancelled.append((jobid, job_logs[jobid]))
-        else:
-            LOG.debug(f"Job {jobid} state is {state}")
+    rule_to_jobs, unknown = group_jobs_by_rule(log_dir, job_logs)
+    failed, cancelled, resolved_failures = classify_jobs(rule_to_jobs)
 
-    write_results(output, failed, cancelled, unknown)
+    write_results(output, failed, cancelled, unknown, resolved_failures)
 
 
 if __name__ == "__main__":