Skip to content

Commit 2a89aa2

Browse files
committed
Fix BD Rhapsody FASTQ staging and metrics summary parsing
1 parent b313f7a commit 2a89aa2

3 files changed

Lines changed: 195 additions & 90 deletions

File tree

rules/singlecell_bdrhapsody_import.smk

Lines changed: 24 additions & 47 deletions
Original file line number · Diff line number · Diff line change
@@ -33,18 +33,9 @@ def get_sample_path_list(attr, sample):
3333

3434

3535
def get_required_reference_archive(wildcards):
36-
archive = get_sample_config_value(
37-
"reference_archive",
38-
wildcards.sample,
39-
getattr(reference, "reference_archive_bdrhapsody", ""),
40-
)
41-
if archive in (None, ""):
42-
raise ValueError(
43-
"reference_archive is required for the BD Rhapsody pipeline. "
44-
"Set config.reference_archive or define reference.reference_archive_bdrhapsody."
45-
)
46-
return archive
47-
36+
if getattr(reference, "reference_archive_bdrhapsody", False) is False:
37+
raise ValueError("reference_archive_bdrhapsody is a required config parameter for the BD Rhapsody pipeline.")
38+
return getattr(reference, "reference_archive_bdrhapsody", False)
4839

4940
def shell_join_args(arg_pairs):
5041
args = []
@@ -63,25 +54,6 @@ def build_write_inputs_args(wildcards):
6354
sample = wildcards.sample
6455
scalar_pairs = [
6556
("--file-field", f"Reference_Archive={get_required_reference_archive(wildcards)}"),
66-
("--scalar", f"Run_Name={get_sample_config_value('run_name', sample, sample)}"),
67-
("--scalar", f"Sample_Tags_Version={get_sample_config_value('sample_tags_version', sample)}"),
68-
("--scalar", f"VDJ_Version={get_sample_config_value('vdj_version', sample)}"),
69-
("--scalar", f"Cell_Calling_Data={get_sample_config_value('cell_calling_data', sample)}"),
70-
(
71-
"--scalar",
72-
"Cell_Calling_Bioproduct_Algorithm="
73-
f"{get_sample_config_value('cell_calling_bioproduct_algorithm', sample)}",
74-
),
75-
(
76-
"--scalar",
77-
"Cell_Calling_ATAC_Algorithm="
78-
f"{get_sample_config_value('cell_calling_atac_algorithm', sample)}",
79-
),
80-
("--scalar", f"Exact_Cell_Count={get_sample_config_value('exact_cell_count', sample)}"),
81-
("--scalar", f"Expected_Cell_Count={get_sample_config_value('expected_cell_count', sample)}"),
82-
("--scalar", f"Maximum_Threads={get_sample_config_value('maximum_threads', sample)}"),
83-
("--scalar", f"Custom_STAR_Params={get_sample_config_value('custom_star_params', sample)}"),
84-
("--scalar", f"Custom_bwa_mem2_Params={get_sample_config_value('custom_bwa_mem2_params', sample)}"),
8557
]
8658
args = shell_join_args(scalar_pairs)
8759

@@ -129,13 +101,17 @@ rule write_bdrhapsody_inputs:
129101
r1=rules.make_fastq_concat.output.r1,
130102
r2=rules.make_fastq_concat.output.r2,
131103
output:
132-
"{sample}/pipeline_inputs.yml"
104+
"{sample}_pipeline_inputs.yml"
133105
params:
134106
extra_args=build_write_inputs_args,
135107
script=os.path.join(analysis, "workflow/scripts/bdrhapsody/write_inputs.py"),
136108
shell:
137109
"""
138-
python {params.script} --output {output} --read {input.r1} --read {input.r2} {params.extra_args}
110+
python {params.script} \
111+
--output {output} \
112+
--read {input.r1} \
113+
--read {input.r2} \
114+
{params.extra_args}
139115
"""
140116

141117

@@ -145,34 +121,35 @@ rule count:
145121
r1=rules.make_fastq_concat.output.r1,
146122
r2=rules.make_fastq_concat.output.r2,
147123
output:
148-
web_summary="{sample}/outs/web_summary.html",
149-
metrics="{sample}/outs/metrics_summary.csv",
150-
matrix="{sample}/outs/filtered_feature_bc_matrix/matrix.mtx.gz",
151-
features="{sample}/outs/filtered_feature_bc_matrix/features.tsv.gz",
152-
barcodes="{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz",
124+
web_summary="{sample}/{sample}_Pipeline_Report.html",
125+
metrics="{sample}/{sample}_Metrics_Summary.csv",
153126
log:
154127
err="run_{sample}_bdrhapsody.err",
155128
log="run_{sample}_bdrhapsody.log",
156129
params:
157-
rawdir=lambda wildcards: os.path.join(analysis, wildcards.sample, "bdrhapsody_raw"),
130+
outdir=os.path.join(analysis, "{sample}"),
158131
script=os.path.join(analysis, "workflow/scripts/bdrhapsody/collect_outputs.py"),
159132
vendor=program.bdrhapsody,
160133
shell:
161134
r"""
162135
set -euo pipefail
163-
rm -rf {params.rawdir} {wildcards.sample}/outs
164-
mkdir -p {params.rawdir}
165-
166-
{params.vendor} pipeline --no-parallel --outdir {params.rawdir} {input.yaml} > {log.log} 2> {log.err} || true
167-
168-
python {params.script} --sample {wildcards.sample} --rawdir {params.rawdir} --outs {wildcards.sample}/outs --vendor-log {log.log} --vendor-err {log.err}
136+
rm -rf {params.outdir}
137+
echo "Before module load R/4.5.2:" > {log.log}
138+
which R >> {log.log}
139+
module load R/4.5.2
140+
echo "After module load R/4.5.2:" >> {log.log}
141+
which R >> {log.log}
142+
{params.vendor} pipeline --no-parallel \
143+
--outdir {params.outdir}\
144+
{input.yaml} \
145+
>> {log.log} 2> {log.err} || true
169146
"""
170147

171148

172149
rule summaryFiles:
173150
input:
174-
expand("{sample}/outs/metrics_summary.csv", sample=samples),
175-
expand("{sample}/outs/web_summary.html", sample=samples),
151+
expand(rules.count.output.metrics, sample=samples),
152+
expand(rules.count.output.web_summary, sample=samples),
176153
output:
177154
"finalreport/metric_summary.xlsx",
178155
expand("finalreport/summaries/{sample}_web_summary.html", sample=samples),

rules/singlecell_import.smk

Lines changed: 6 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -205,11 +205,11 @@ def filterFastq4nopipe(wildcards):
205205
break
206206

207207
if detected_sample_folder is None:
208-
sys.stderr.write(f"\nError: No FASTQ folder found for sample {detected_sample_folder}. Check the directory structure.\n\n")
208+
sys.stderr.write(f"\nError: No FASTQ folder found for sample {wildcards.sample}. Check the directory structure.\n\n")
209209
sys.exit(1)
210210

211-
# Define new FASTQ output directory
212-
path_fq_new = f"fastq/{detected_sample_folder}/"
211+
# Stage symlinks under the workflow sample id and keep source-folder detection separate.
212+
path_fq_new = f"fastq/{wildcards.sample}/"
213213

214214
# Create directory if it doesn't exist
215215
os.makedirs(path_fq_new, exist_ok=True)
@@ -271,10 +271,11 @@ def prep_fastq_folder_ln(sample, get_dict_only=False):
271271
break
272272

273273
if detected_sample_folder is None:
274-
sys.stderr.write(f"\nError: No FASTQ folder found for sample {detected_sample_folder}. Check the directory structure.\n\n")
274+
sys.stderr.write(f"\nError: No FASTQ folder found for sample {sample}. Check the directory structure.\n\n")
275275
sys.exit(1)
276276

277-
path_fq_new = os.path.join(analysis, f"fastq/{detected_sample_folder}/")
277+
# Stage symlinks under the workflow sample id and keep source-folder detection separate.
278+
path_fq_new = os.path.join(analysis, f"fastq/{sample}/")
278279
if not get_dict_only:
279280
if os.path.exists(path_fq_new):
280281
for file in os.listdir(path_fq_new):

scripts/bdrhapsody/generateSummaryFiles.py

Lines changed: 165 additions & 38 deletions
Original file line number · Diff line number · Diff line change
@@ -3,64 +3,157 @@
33
import csv
44
import glob
55
import os
6+
import re
7+
from pathlib import Path
68
from shutil import copyfile
79

810
import xlsxwriter
911

1012

1113
METRICS_PATH = "finalreport/"
1214
SUMMARY_PATH = "finalreport/summaries/"
15+
PIPELINE_VERSION_RE = re.compile(r"^(?P<name>.+?) Version (?P<version>[^ ]+)$")
16+
INT_RE = re.compile(r"^[+-]?\d+$")
17+
FLOAT_RE = re.compile(r"^[+-]?(?:\d+\.\d*|\d*\.\d+)$")
1318

1419

1520
def coerce_value(value):
1621
value = str(value).strip()
17-
if value == "":
22+
if value in ("", "-"):
1823
return value, None
1924
if value.endswith("%"):
20-
try:
21-
return float(value[:-1]) / 100.0, "percent"
22-
except ValueError:
23-
return value, None
24-
try:
25-
if "." in value:
26-
return float(value), "float"
27-
return int(value), "int"
28-
except ValueError:
25+
numeric = value[:-1].strip().replace(",", "")
26+
if INT_RE.match(numeric) or FLOAT_RE.match(numeric):
27+
return float(numeric) / 100.0, "percent"
2928
return value, None
3029

31-
32-
def main(output_name="metric_summary"):
33-
os.makedirs(METRICS_PATH, exist_ok=True)
34-
os.makedirs(SUMMARY_PATH, exist_ok=True)
35-
36-
files = sorted(glob.glob("./*/outs/metrics_summary.csv"))
37-
workbook = xlsxwriter.Workbook(os.path.join(METRICS_PATH, f"{output_name}.xlsx"))
38-
worksheet = workbook.add_worksheet("metrics_summary")
39-
30+
numeric = value.replace(",", "")
31+
if INT_RE.match(numeric):
32+
return int(numeric), "int"
33+
if FLOAT_RE.match(numeric):
34+
return float(numeric), "float"
35+
return value, None
36+
37+
38+
def next_nonempty(lines, start_idx):
39+
idx = start_idx
40+
while idx < len(lines) and not lines[idx].strip():
41+
idx += 1
42+
return idx
43+
44+
45+
def parse_metadata_line(line):
46+
content = line[2:].strip()
47+
if not content or set(content) == {"#"}:
48+
return []
49+
if " - " in content:
50+
section, remainder = content.split(" - ", 1)
51+
entries = []
52+
for item in remainder.split(" | "):
53+
if ": " in item:
54+
key, value = item.split(": ", 1)
55+
entries.append((f"{section}.{key}", value))
56+
else:
57+
entries.append((section, item))
58+
return entries
59+
60+
match = PIPELINE_VERSION_RE.match(content)
61+
if match:
62+
return [
63+
("Pipeline.Name", match.group("name")),
64+
("Pipeline.Version", match.group("version")),
65+
]
66+
67+
return [("Pipeline.Info", content)]
68+
69+
70+
def flatten_section(section_name, header, rows):
71+
flattened = {}
72+
if not rows:
73+
return flattened
74+
75+
id_column = None
76+
for candidate in ("Library", "Bioproduct_Type"):
77+
if candidate in header:
78+
id_column = candidate
79+
break
80+
81+
multiple_rows = len(rows) > 1
82+
for row in rows:
83+
row_label = None
84+
if multiple_rows and id_column:
85+
row_label = row.get(id_column, "")
86+
for column in header:
87+
if column == id_column:
88+
continue
89+
key = f"{section_name}.{column}"
90+
if row_label:
91+
key = f"{section_name}[{row_label}].{column}"
92+
flattened[key] = row.get(column, "")
93+
return flattened
94+
95+
96+
def parse_metrics_summary(filename):
97+
lines = Path(filename).read_text(encoding="utf-8", errors="replace").splitlines()
98+
metadata = {}
99+
metrics = {}
100+
idx = 0
101+
102+
while idx < len(lines):
103+
line = lines[idx].strip()
104+
if not line:
105+
idx += 1
106+
continue
107+
if line.startswith("##"):
108+
for key, value in parse_metadata_line(line):
109+
metadata[key] = value
110+
idx += 1
111+
continue
112+
if line.startswith("#"):
113+
section_name = line.strip("#").strip()
114+
idx = next_nonempty(lines, idx + 1)
115+
if idx >= len(lines):
116+
break
117+
header_line = lines[idx].strip()
118+
if not header_line or header_line.startswith("#"):
119+
continue
120+
header = next(csv.reader([header_line]))
121+
idx += 1
122+
rows = []
123+
while idx < len(lines):
124+
candidate = lines[idx].strip()
125+
if not candidate:
126+
idx += 1
127+
break
128+
if candidate.startswith("#"):
129+
break
130+
values = next(csv.reader([candidate]))
131+
rows.append(dict(zip(header, values)))
132+
idx += 1
133+
metrics.update(flatten_section(section_name, header, rows))
134+
continue
135+
idx += 1
136+
137+
return metadata, metrics
138+
139+
140+
def write_sheet(workbook, name, headers, rows, coerce_numbers=True):
141+
worksheet = workbook.add_worksheet(name)
40142
format_num = workbook.add_format({"num_format": "#,##0"})
41143
format_float = workbook.add_format({"num_format": "#,##0.00"})
42144
format_per = workbook.add_format({"num_format": "0.00%"})
43145
format_head = workbook.add_format({"bold": True, "italic": True, "text_wrap": True, "align": "center"})
44146

45-
headers = ["Sample"]
46-
rows = []
47-
for filename in files:
48-
with open(filename, newline="", encoding="utf-8") as handle:
49-
reader = csv.DictReader(handle)
50-
data = next(reader)
51-
sample = filename.split("/")[1]
52-
for key in data:
53-
if key not in headers:
54-
headers.append(key)
55-
rows.append((sample, data))
56-
57147
for col, header in enumerate(headers):
58148
worksheet.write(0, col, header, format_head)
59149

60-
for row_idx, (sample, data) in enumerate(rows, start=1):
61-
worksheet.write(row_idx, 0, sample)
62-
for col_idx, header in enumerate(headers[1:], start=1):
63-
value, kind = coerce_value(data.get(header, ""))
150+
for row_idx, row in enumerate(rows, start=1):
151+
for col_idx, header in enumerate(headers):
152+
raw_value = row.get(header, "")
153+
if not coerce_numbers:
154+
worksheet.write(row_idx, col_idx, raw_value)
155+
continue
156+
value, kind = coerce_value(raw_value)
64157
if kind == "percent":
65158
worksheet.write(row_idx, col_idx, value, format_per)
66159
elif kind == "float":
@@ -70,11 +163,45 @@ def main(output_name="metric_summary"):
70163
else:
71164
worksheet.write(row_idx, col_idx, value)
72165

73-
worksheet.set_column(0, len(headers) - 1, 18)
166+
worksheet.set_column(0, len(headers) - 1, 22)
167+
168+
169+
def main(output_name="metric_summary"):
170+
os.makedirs(METRICS_PATH, exist_ok=True)
171+
os.makedirs(SUMMARY_PATH, exist_ok=True)
172+
173+
files = sorted(glob.glob("./*/*_Metrics_Summary.csv"))
174+
workbook = xlsxwriter.Workbook(os.path.join(METRICS_PATH, f"{output_name}.xlsx"))
175+
176+
summary_headers = ["Sample"]
177+
metadata_headers = ["Sample"]
178+
summary_rows = []
179+
metadata_rows = []
180+
181+
for filename in files:
182+
sample = Path(filename).parent.name
183+
metadata, metrics = parse_metrics_summary(filename)
184+
185+
summary_row = {"Sample": sample, **metrics}
186+
metadata_row = {"Sample": sample, **metadata}
187+
summary_rows.append(summary_row)
188+
metadata_rows.append(metadata_row)
189+
190+
for key in metrics:
191+
if key not in summary_headers:
192+
summary_headers.append(key)
193+
for key in metadata:
194+
if key not in metadata_headers:
195+
metadata_headers.append(key)
196+
197+
write_sheet(workbook, "metrics_summary", summary_headers, summary_rows)
198+
if len(metadata_headers) > 1:
199+
write_sheet(workbook, "run_metadata", metadata_headers, metadata_rows, coerce_numbers=False)
200+
74201
workbook.close()
75202

76-
for filename in sorted(glob.glob("./*/outs/web_summary.html")):
77-
sample = filename.split("/")[1]
203+
for filename in sorted(glob.glob("./*/*_Pipeline_Report.html")):
204+
sample = Path(filename).parent.name
78205
copyfile(filename, os.path.join(SUMMARY_PATH, f"{sample}_web_summary.html"))
79206

80207

0 commit comments

Comments (0)