Skip to content

Commit 2a89aa2

Browse files
committed
Fix BD Rhapsody FASTQ staging and metrics summary parsing
1 parent b313f7a commit 2a89aa2

3 files changed

Lines changed: 195 additions & 90 deletions

File tree

rules/singlecell_bdrhapsody_import.smk

Lines changed: 24 additions & 47 deletions
Original file line number · Diff line number · Diff line change
@@ -33,18 +33,9 @@ def get_sample_path_list(attr, sample):
3333

3434

3535
def get_required_reference_archive(wildcards):
36-
archive = get_sample_config_value(
37-
"reference_archive",
38-
wildcards.sample,
39-
getattr(reference, "reference_archive_bdrhapsody", ""),
40-
)
41-
if archive in (None, ""):
42-
raise ValueError(
43-
"reference_archive is required for the BD Rhapsody pipeline. "
44-
"Set config.reference_archive or define reference.reference_archive_bdrhapsody."
45-
)
46-
return archive
47-
36+
if getattr(reference, "reference_archive_bdrhapsody", False) is False:
37+
raise ValueError("reference_archive_bdrhapsody is a required config parameter for the BD Rhapsody pipeline.")
38+
return getattr(reference, "reference_archive_bdrhapsody", False)
4839

4940
def shell_join_args(arg_pairs):
5041
args = []
@@ -63,25 +54,6 @@ def build_write_inputs_args(wildcards):
6354
sample = wildcards.sample
6455
scalar_pairs = [
6556
("--file-field", f"Reference_Archive={get_required_reference_archive(wildcards)}"),
66-
("--scalar", f"Run_Name={get_sample_config_value('run_name', sample, sample)}"),
67-
("--scalar", f"Sample_Tags_Version={get_sample_config_value('sample_tags_version', sample)}"),
68-
("--scalar", f"VDJ_Version={get_sample_config_value('vdj_version', sample)}"),
69-
("--scalar", f"Cell_Calling_Data={get_sample_config_value('cell_calling_data', sample)}"),
70-
(
71-
"--scalar",
72-
"Cell_Calling_Bioproduct_Algorithm="
73-
f"{get_sample_config_value('cell_calling_bioproduct_algorithm', sample)}",
74-
),
75-
(
76-
"--scalar",
77-
"Cell_Calling_ATAC_Algorithm="
78-
f"{get_sample_config_value('cell_calling_atac_algorithm', sample)}",
79-
),
80-
("--scalar", f"Exact_Cell_Count={get_sample_config_value('exact_cell_count', sample)}"),
81-
("--scalar", f"Expected_Cell_Count={get_sample_config_value('expected_cell_count', sample)}"),
82-
("--scalar", f"Maximum_Threads={get_sample_config_value('maximum_threads', sample)}"),
83-
("--scalar", f"Custom_STAR_Params={get_sample_config_value('custom_star_params', sample)}"),
84-
("--scalar", f"Custom_bwa_mem2_Params={get_sample_config_value('custom_bwa_mem2_params', sample)}"),
8557
]
8658
args = shell_join_args(scalar_pairs)
8759

@@ -129,13 +101,17 @@ rule write_bdrhapsody_inputs:
129101
r1=rules.make_fastq_concat.output.r1,
130102
r2=rules.make_fastq_concat.output.r2,
131103
output:
132-
"{sample}/pipeline_inputs.yml"
104+
"{sample}_pipeline_inputs.yml"
133105
params:
134106
extra_args=build_write_inputs_args,
135107
script=os.path.join(analysis, "workflow/scripts/bdrhapsody/write_inputs.py"),
136108
shell:
137109
"""
138-
python {params.script} --output {output} --read {input.r1} --read {input.r2} {params.extra_args}
110+
python {params.script} \
111+
--output {output} \
112+
--read {input.r1} \
113+
--read {input.r2} \
114+
{params.extra_args}
139115
"""
140116

141117

@@ -145,34 +121,35 @@ rule count:
145121
r1=rules.make_fastq_concat.output.r1,
146122
r2=rules.make_fastq_concat.output.r2,
147123
output:
148-
web_summary="{sample}/outs/web_summary.html",
149-
metrics="{sample}/outs/metrics_summary.csv",
150-
matrix="{sample}/outs/filtered_feature_bc_matrix/matrix.mtx.gz",
151-
features="{sample}/outs/filtered_feature_bc_matrix/features.tsv.gz",
152-
barcodes="{sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz",
124+
web_summary="{sample}/{sample}_Pipeline_Report.html",
125+
metrics="{sample}/{sample}_Metrics_Summary.csv",
153126
log:
154127
err="run_{sample}_bdrhapsody.err",
155128
log="run_{sample}_bdrhapsody.log",
156129
params:
157-
rawdir=lambda wildcards: os.path.join(analysis, wildcards.sample, "bdrhapsody_raw"),
130+
outdir=os.path.join(analysis, "{sample}"),
158131
script=os.path.join(analysis, "workflow/scripts/bdrhapsody/collect_outputs.py"),
159132
vendor=program.bdrhapsody,
160133
shell:
161134
r"""
162135
set -euo pipefail
163-
rm -rf {params.rawdir} {wildcards.sample}/outs
164-
mkdir -p {params.rawdir}
165-
166-
{params.vendor} pipeline --no-parallel --outdir {params.rawdir} {input.yaml} > {log.log} 2> {log.err} || true
167-
168-
python {params.script} --sample {wildcards.sample} --rawdir {params.rawdir} --outs {wildcards.sample}/outs --vendor-log {log.log} --vendor-err {log.err}
136+
rm -rf {params.outdir}
137+
echo "Before module load R/4.5.2:" > {log.log}
138+
which R >> {log.log}
139+
module load R/4.5.2
140+
echo "After module load R/4.5.2:" >> {log.log}
141+
which R >> {log.log}
142+
{params.vendor} pipeline --no-parallel \
143+
--outdir {params.outdir}\
144+
{input.yaml} \
145+
>> {log.log} 2> {log.err} || true
169146
"""
170147

171148

172149
rule summaryFiles:
173150
input:
174-
expand("{sample}/outs/metrics_summary.csv", sample=samples),
175-
expand("{sample}/outs/web_summary.html", sample=samples),
151+
expand(rules.count.output.metrics, sample=samples),
152+
expand(rules.count.output.web_summary, sample=samples),
176153
output:
177154
"finalreport/metric_summary.xlsx",
178155
expand("finalreport/summaries/{sample}_web_summary.html", sample=samples),

rules/singlecell_import.smk

Lines changed: 6 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -205,11 +205,11 @@ def filterFastq4nopipe(wildcards):
205205
break
206206

207207
if detected_sample_folder is None:
208-
sys.stderr.write(f"\nError: No FASTQ folder found for sample {detected_sample_folder}. Check the directory structure.\n\n")
208+
sys.stderr.write(f"\nError: No FASTQ folder found for sample {wildcards.sample}. Check the directory structure.\n\n")
209209
sys.exit(1)
210210

211-
# Define new FASTQ output directory
212-
path_fq_new = f"fastq/{detected_sample_folder}/"
211+
# Stage symlinks under the workflow sample id and keep source-folder detection separate.
212+
path_fq_new = f"fastq/{wildcards.sample}/"
213213

214214
# Create directory if it doesn't exist
215215
os.makedirs(path_fq_new, exist_ok=True)
@@ -271,10 +271,11 @@ def prep_fastq_folder_ln(sample, get_dict_only=False):
271271
break
272272

273273
if detected_sample_folder is None:
274-
sys.stderr.write(f"\nError: No FASTQ folder found for sample {detected_sample_folder}. Check the directory structure.\n\n")
274+
sys.stderr.write(f"\nError: No FASTQ folder found for sample {sample}. Check the directory structure.\n\n")
275275
sys.exit(1)
276276

277-
path_fq_new = os.path.join(analysis, f"fastq/{detected_sample_folder}/")
277+
# Stage symlinks under the workflow sample id and keep source-folder detection separate.
278+
path_fq_new = os.path.join(analysis, f"fastq/{sample}/")
278279
if not get_dict_only:
279280
if os.path.exists(path_fq_new):
280281
for file in os.listdir(path_fq_new):

scripts/bdrhapsody/generateSummaryFiles.py

Lines changed: 165 additions & 38 deletions
Original file line number · Diff line number · Diff line change
@@ -3,64 +3,157 @@
33
import csv
44
import glob
55
import os
6+
import re
7+
from pathlib import Path
68
from shutil import copyfile
79

810
import xlsxwriter
911

1012

1113
METRICS_PATH = "finalreport/"
1214
SUMMARY_PATH = "finalreport/summaries/"
15+
PIPELINE_VERSION_RE = re.compile(r"^(?P<name>.+?) Version (?P<version>[^ ]+)$")
16+
INT_RE = re.compile(r"^[+-]?\d+$")
17+
FLOAT_RE = re.compile(r"^[+-]?(?:\d+\.\d*|\d*\.\d+)$")
1318

1419

1520
def coerce_value(value):
1621
value = str(value).strip()
17-
if value == "":
22+
if value in ("", "-"):
1823
return value, None
1924
if value.endswith("%"):
20-
try:
21-
return float(value[:-1]) / 100.0, "percent"
22-
except ValueError:
23-
return value, None
24-
try:
25-
if "." in value:
26-
return float(value), "float"
27-
return int(value), "int"
28-
except ValueError:
25+
numeric = value[:-1].strip().replace(",", "")
26+
if INT_RE.match(numeric) or FLOAT_RE.match(numeric):
27+
return float(numeric) / 100.0, "percent"
2928
return value, None
3029

31-
32-
def main(output_name="metric_summary"):
33-
os.makedirs(METRICS_PATH, exist_ok=True)
34-
os.makedirs(SUMMARY_PATH, exist_ok=True)
35-
36-
files = sorted(glob.glob("./*/outs/metrics_summary.csv"))
37-
workbook = xlsxwriter.Workbook(os.path.join(METRICS_PATH, f"{output_name}.xlsx"))
38-
worksheet = workbook.add_worksheet("metrics_summary")
39-
30+
numeric = value.replace(",", "")
31+
if INT_RE.match(numeric):
32+
return int(numeric), "int"
33+
if FLOAT_RE.match(numeric):
34+
return float(numeric), "float"
35+
return value, None
36+
37+
38+
def next_nonempty(lines, start_idx):
39+
idx = start_idx
40+
while idx < len(lines) and not lines[idx].strip():
41+
idx += 1
42+
return idx
43+
44+
45+
def parse_metadata_line(line):
46+
content = line[2:].strip()
47+
if not content or set(content) == {"#"}:
48+
return []
49+
if " - " in content:
50+
section, remainder = content.split(" - ", 1)
51+
entries = []
52+
for item in remainder.split(" | "):
53+
if ": " in item:
54+
key, value = item.split(": ", 1)
55+
entries.append((f"{section}.{key}", value))
56+
else:
57+
entries.append((section, item))
58+
return entries
59+
60+
match = PIPELINE_VERSION_RE.match(content)
61+
if match:
62+
return [
63+
("Pipeline.Name", match.group("name")),
64+
("Pipeline.Version", match.group("version")),
65+
]
66+
67+
return [("Pipeline.Info", content)]
68+
69+
70+
def flatten_section(section_name, header, rows):
71+
flattened = {}
72+
if not rows:
73+
return flattened
74+
75+
id_column = None
76+
for candidate in ("Library", "Bioproduct_Type"):
77+
if candidate in header:
78+
id_column = candidate
79+
break
80+
81+
multiple_rows = len(rows) > 1
82+
for row in rows:
83+
row_label = None
84+
if multiple_rows and id_column:
85+
row_label = row.get(id_column, "")
86+
for column in header:
87+
if column == id_column:
88+
continue
89+
key = f"{section_name}.{column}"
90+
if row_label:
91+
key = f"{section_name}[{row_label}].{column}"
92+
flattened[key] = row.get(column, "")
93+
return flattened
94+
95+
96+
def parse_metrics_summary(filename):
97+
lines = Path(filename).read_text(encoding="utf-8", errors="replace").splitlines()
98+
metadata = {}
99+
metrics = {}
100+
idx = 0
101+
102+
while idx < len(lines):
103+
line = lines[idx].strip()
104+
if not line:
105+
idx += 1
106+
continue
107+
if line.startswith("##"):
108+
for key, value in parse_metadata_line(line):
109+
metadata[key] = value
110+
idx += 1
111+
continue
112+
if line.startswith("#"):
113+
section_name = line.strip("#").strip()
114+
idx = next_nonempty(lines, idx + 1)
115+
if idx >= len(lines):
116+
break
117+
header_line = lines[idx].strip()
118+
if not header_line or header_line.startswith("#"):
119+
continue
120+
header = next(csv.reader([header_line]))
121+
idx += 1
122+
rows = []
123+
while idx < len(lines):
124+
candidate = lines[idx].strip()
125+
if not candidate:
126+
idx += 1
127+
break
128+
if candidate.startswith("#"):
129+
break
130+
values = next(csv.reader([candidate]))
131+
rows.append(dict(zip(header, values)))
132+
idx += 1
133+
metrics.update(flatten_section(section_name, header, rows))
134+
continue
135+
idx += 1
136+
137+
return metadata, metrics
138+
139+
140+
def write_sheet(workbook, name, headers, rows, coerce_numbers=True):
141+
worksheet = workbook.add_worksheet(name)
40142
format_num = workbook.add_format({"num_format": "#,##0"})
41143
format_float = workbook.add_format({"num_format": "#,##0.00"})
42144
format_per = workbook.add_format({"num_format": "0.00%"})
43145
format_head = workbook.add_format({"bold": True, "italic": True, "text_wrap": True, "align": "center"})
44146

45-
headers = ["Sample"]
46-
rows = []
47-
for filename in files:
48-
with open(filename, newline="", encoding="utf-8") as handle:
49-
reader = csv.DictReader(handle)
50-
data = next(reader)
51-
sample = filename.split("/")[1]
52-
for key in data:
53-
if key not in headers:
54-
headers.append(key)
55-
rows.append((sample, data))
56-
57147
for col, header in enumerate(headers):
58148
worksheet.write(0, col, header, format_head)
59149

60-
for row_idx, (sample, data) in enumerate(rows, start=1):
61-
worksheet.write(row_idx, 0, sample)
62-
for col_idx, header in enumerate(headers[1:], start=1):
63-
value, kind = coerce_value(data.get(header, ""))
150+
for row_idx, row in enumerate(rows, start=1):
151+
for col_idx, header in enumerate(headers):
152+
raw_value = row.get(header, "")
153+
if not coerce_numbers:
154+
worksheet.write(row_idx, col_idx, raw_value)
155+
continue
156+
value, kind = coerce_value(raw_value)
64157
if kind == "percent":
65158
worksheet.write(row_idx, col_idx, value, format_per)
66159
elif kind == "float":
@@ -70,11 +163,45 @@ def main(output_name="metric_summary"):
70163
else:
71164
worksheet.write(row_idx, col_idx, value)
72165

73-
worksheet.set_column(0, len(headers) - 1, 18)
166+
worksheet.set_column(0, len(headers) - 1, 22)
167+
168+
169+
def main(output_name="metric_summary"):
170+
os.makedirs(METRICS_PATH, exist_ok=True)
171+
os.makedirs(SUMMARY_PATH, exist_ok=True)
172+
173+
files = sorted(glob.glob("./*/*_Metrics_Summary.csv"))
174+
workbook = xlsxwriter.Workbook(os.path.join(METRICS_PATH, f"{output_name}.xlsx"))
175+
176+
summary_headers = ["Sample"]
177+
metadata_headers = ["Sample"]
178+
summary_rows = []
179+
metadata_rows = []
180+
181+
for filename in files:
182+
sample = Path(filename).parent.name
183+
metadata, metrics = parse_metrics_summary(filename)
184+
185+
summary_row = {"Sample": sample, **metrics}
186+
metadata_row = {"Sample": sample, **metadata}
187+
summary_rows.append(summary_row)
188+
metadata_rows.append(metadata_row)
189+
190+
for key in metrics:
191+
if key not in summary_headers:
192+
summary_headers.append(key)
193+
for key in metadata:
194+
if key not in metadata_headers:
195+
metadata_headers.append(key)
196+
197+
write_sheet(workbook, "metrics_summary", summary_headers, summary_rows)
198+
if len(metadata_headers) > 1:
199+
write_sheet(workbook, "run_metadata", metadata_headers, metadata_rows, coerce_numbers=False)
200+
74201
workbook.close()
75202

76-
for filename in sorted(glob.glob("./*/outs/web_summary.html")):
77-
sample = filename.split("/")[1]
203+
for filename in sorted(glob.glob("./*/*_Pipeline_Report.html")):
204+
sample = Path(filename).parent.name
78205
copyfile(filename, os.path.join(SUMMARY_PATH, f"{sample}_web_summary.html"))
79206

80207

0 commit comments

Comments (0)