Skip to content

Commit 7d5ee69

Browse files
authored
Merge pull request #276 from ypriverol/dev
DIANN convert for SDRF
2 parents 8c65265 + 3000f18 commit 7d5ee69

File tree

7 files changed

+371
-52
lines changed

7 files changed

+371
-52
lines changed

src/sdrf_pipelines/converters/diann/diann.py

Lines changed: 151 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,23 @@
22
33
Converts SDRF files to DIA-NN configuration files:
44
- diann_config.cfg: DIA-NN command-line flags (enzyme, mods, channels, tolerances, scan ranges)
5-
- diann_filemap.tsv: Per-file metadata (tolerances, labels, mods, scan ranges)
5+
- diann_design.tsv: Per-file metadata (tolerances, labels, mods, scan ranges, experimental design)
66
"""
77

88
import logging
99
import re
1010

1111
import pandas as pd
1212

13-
from sdrf_pipelines.converters.base import BaseConverter
13+
from sdrf_pipelines.converters.base import BaseConverter, ConditionBuilder
1414
from sdrf_pipelines.converters.diann.constants import ENZYME_NAME_MAPPINGS, ENZYME_SPECIFICITY
1515
from sdrf_pipelines.converters.diann.modifications import DiannModificationConverter
1616
from sdrf_pipelines.converters.diann.plexdia import (
1717
build_channels_flag,
1818
build_fixed_mod_flag,
1919
detect_plexdia_type,
2020
)
21+
from sdrf_pipelines.converters.openms.experimental_design import FractionGroupTracker
2122
from sdrf_pipelines.converters.openms.utils import parse_tolerance
2223

2324
logger = logging.getLogger(__name__)
@@ -56,7 +57,7 @@ def diann_convert(self, sdrf_file: str) -> None:
5657
5758
Generates:
5859
- diann_config.cfg: DIA-NN CLI flags
59-
- diann_filemap.tsv: Per-file metadata
60+
- diann_design.tsv: Per-file metadata
6061
6162
Args:
6263
sdrf_file: Path to the SDRF file
@@ -98,11 +99,14 @@ def diann_convert(self, sdrf_file: str) -> None:
9899
# Compute global scan ranges across all runs
99100
scan_range_summary = self._compute_global_scan_ranges(file_data)
100101

102+
# Extract experimental design
103+
design_rows = self._extract_experimental_design(sdrf, file_data)
104+
101105
# Write config file
102106
self._write_config(enzyme, diann_fixed, diann_var, plex_info, tolerance_summary, scan_range_summary)
103107

104108
# Write filemap
105-
self._write_filemap(file_data, plex_info)
109+
self._write_filemap(file_data, plex_info, design_rows)
106110

107111
self.report_warnings()
108112

@@ -321,10 +325,13 @@ def _extract_modifications(self, row: pd.Series, mod_cols: list[str]) -> tuple[l
321325
mod_str = str(row.get(col, "")).strip()
322326
if not mod_str or mod_str.lower() in ("nan", "not available", ""):
323327
continue
324-
if "MT=fixed" in mod_str or "mt=fixed" in mod_str:
325-
fixed.append(mod_str)
326-
elif "MT=variable" in mod_str or "mt=variable" in mod_str:
327-
var.append(mod_str)
328+
# Normalize MT key-value to lowercase for consistent comparison
329+
normalized = re.sub(r"(?i)\bMT=\w+", lambda m: m.group().lower(), mod_str)
330+
mod_lower = normalized.lower()
331+
if "mt=fixed" in mod_lower:
332+
fixed.append(normalized)
333+
elif "mt=variable" in mod_lower:
334+
var.append(normalized)
328335
return fixed, var
329336

330337
def _extract_tolerance(self, row: pd.Series, column: str) -> tuple:
@@ -388,6 +395,11 @@ def _extract_scan_range(self, row: pd.Series, ms_level: str) -> tuple[float | No
388395

389396
# Resolve: range takes precedence over discrete
390397
if range_min is not None and range_max is not None:
398+
if range_min >= range_max:
399+
raise ValueError(
400+
f"Inverted {ms_level} scan range: min ({range_min}) >= max ({range_max}). "
401+
f"Check your SDRF annotation."
402+
)
391403
if discrete_min is not None or discrete_max is not None:
392404
self.add_warning(
393405
f"Both interval ('{range_col}') and discrete min/max columns found for {ms_level}. "
@@ -396,7 +408,115 @@ def _extract_scan_range(self, row: pd.Series, ms_level: str) -> tuple[float | No
396408
return range_min, range_max
397409

398410
# Fall back to discrete
399-
return discrete_min, discrete_max
411+
min_mz, max_mz = discrete_min, discrete_max
412+
if min_mz is not None and max_mz is not None and min_mz >= max_mz:
413+
raise ValueError(
414+
f"Inverted {ms_level} scan range: min ({min_mz}) >= max ({max_mz}). Check your SDRF annotation."
415+
)
416+
return min_mz, max_mz
417+
418+
@staticmethod
def _extract_acquisition_method(row: pd.Series) -> str:
    """Return the data acquisition method annotated on one SDRF row.

    Reads ``comment[proteomics data acquisition method]``. When the cell is a
    ``key=value`` annotation such as
    ``NT=Data-Independent Acquisition;AC=NCIT:C161786`` only the ``NT`` value
    is returned; any other non-empty cell is returned stripped but otherwise
    unchanged.

    Args:
        row: One SDRF row.

    Returns:
        The method name, or ``""`` when the column is absent or the cell is
        empty, ``nan`` or ``not available`` (case-insensitive).
    """
    col = "comment[proteomics data acquisition method]"
    if col not in row.index:
        return ""

    value = str(row[col]).strip()
    if value.lower() in ("", "nan", "not available"):
        return ""

    # Match NT only as a whole key (start of cell or right after ';') and
    # case-insensitively. A plain substring search for "NT=" would also fire
    # on keys that merely end in "NT", e.g. "COMMENT=...".
    nt_match = re.search(r"(?i)(?:^|;)\s*NT\s*=\s*([^;]+)", value)
    if nt_match:
        return nt_match.group(1).strip()
    return value
431+
432+
@staticmethod
def _extract_dissociation_method(row: pd.Series) -> str:
    """Return the dissociation method annotated on one SDRF row.

    Reads ``comment[dissociation method]``. When the cell is a ``key=value``
    annotation such as ``NT=HCD;AC=PRIDE:0000590`` only the ``NT`` value is
    used. Long ontology names listed in the mapping below are translated to
    their short form (``CID``, ``HCD``, ``ETD``, ``ECD``); any other value is
    returned as written.

    Args:
        row: One SDRF row.

    Returns:
        The (possibly abbreviated) method name, or ``""`` when the column is
        absent or the cell is empty, ``nan`` or ``not available``
        (case-insensitive).
    """
    col = "comment[dissociation method]"
    if col not in row.index:
        return ""

    value = str(row[col]).strip()
    if value.lower() in ("", "nan", "not available"):
        return ""

    # Match NT only as a whole key (start of cell or right after ';') and
    # case-insensitively. A plain substring search for "NT=" would also fire
    # on keys that merely end in "NT", e.g. "COMMENT=...".
    nt_match = re.search(r"(?i)(?:^|;)\s*NT\s*=\s*([^;]+)", value)
    if nt_match:
        value = nt_match.group(1).strip()

    # Lookup is done on the lowercased name; unknown names pass through.
    mapping = {
        "collision-induced dissociation": "CID",
        "beam-type collision-induced dissociation": "HCD",
        "higher energy beam-type collision-induced dissociation": "HCD",
        "electron transfer dissociation": "ETD",
        "electron capture dissociation": "ECD",
    }
    return mapping.get(value.lower(), value)
452+
453+
def _extract_experimental_design(self, sdrf: pd.DataFrame, file_data: dict) -> list[dict]:
    """Extract experimental design metadata from SDRF.

    Returns a list of dicts, one per SDRF row. For plexDIA, each channel row
    produces its own entry with its own Condition/BioReplicate.

    Source names are numbered 1..N in order of first appearance; that number
    is used for both ``sample`` and ``bioreplicate``. The raw fraction group
    of a data file is the sum of the maximum technical-replicate numbers of
    all earlier source names plus the row's own technical replicate. Rows
    that repeat an already-seen data file reuse that file's fraction group.

    Args:
        sdrf: The SDRF table; must contain ``source name`` and
            ``comment[data file]`` columns.
        file_data: Per-file metadata. Not read here; kept so the call
            signature stays unchanged.

    Returns:
        One dict per row with the keys ``filename``, ``label``, ``sample``,
        ``fraction_group``, ``fraction``, ``condition``, ``bioreplicate``,
        ``acquisition_method`` and ``dissociation_method``.

    Raises:
        ValueError: If a technical replicate cell is neither missing nor an
            integer value.
    """

    def technical_replicate(row: pd.Series) -> int:
        """Parse the technical replicate cell; missing values default to 1."""
        raw = str(row.get("comment[technical replicate]", "1")).strip()
        if raw.lower() in ("", "nan", "not available"):
            return 1
        try:
            return int(raw)
        except ValueError:
            # Numeric columns that contain NaN are read as float, so "1"
            # arrives as "1.0". Accept integral floats, reject anything else.
            as_float = float(raw)
            if not as_float.is_integer():
                raise
            return int(as_float)

    factor_cols = [c for c in sdrf.columns if c.startswith("factor value[")]
    condition_builder = ConditionBuilder(factor_cols)
    fraction_tracker = FractionGroupTracker()

    # Pass 1: highest technical replicate per source name. Dict insertion
    # order records the order in which source names first appear.
    max_reps: dict[str, int] = {}
    for _, row in sdrf.iterrows():
        sn = str(row["source name"])
        rep = technical_replicate(row)
        max_reps[sn] = rep if sn not in max_reps else max(max_reps[sn], rep)

    # 1-based index per source and the running replicate total of all
    # earlier sources, computed once instead of per row.
    source_index: dict[str, int] = {}
    group_offset: dict[str, int] = {}
    running_total = 0
    for position, (sn, n_reps) in enumerate(max_reps.items(), start=1):
        source_index[sn] = position
        group_offset[sn] = running_total
        running_total += n_reps

    # Pass 2: one design entry per SDRF row.
    design_rows: list[dict] = []
    seen_files: set[str] = set()

    for _, row in sdrf.iterrows():
        filename = str(row["comment[data file]"])
        sn = str(row["source name"])
        fraction = self.get_fraction_identifier(row)

        if filename not in seen_files:
            seen_files.add(filename)
            raw_frac_group = group_offset[sn] + technical_replicate(row)
            frac_group = fraction_tracker.get_fraction_group(filename, raw_frac_group)
        else:
            # Further rows of the same file (e.g. plexDIA channels) share
            # the group assigned on first sight.
            frac_group = fraction_tracker.fraction_groups[filename]

        condition = condition_builder.add_from_row(row, fallback=sn)

        design_rows.append(
            {
                "filename": filename,
                "label": self._extract_label(row),
                "sample": source_index[sn],
                "fraction_group": frac_group,
                "fraction": int(fraction),
                "condition": condition,
                "bioreplicate": source_index[sn],
                "acquisition_method": self._extract_acquisition_method(row),
                "dissociation_method": self._extract_dissociation_method(row),
            }
        )

    return design_rows
400520

401521
def _write_config(
402522
self,
@@ -478,34 +598,46 @@ def _write_config(
478598
if val is not None:
479599
parts.append(f"{flag} {val}")
480600

481-
with open("diann_config.cfg", "w") as f:
601+
with open("diann_config.cfg", "w", encoding="utf-8") as f:
482602
f.write(" ".join(parts))
483603

484-
def _write_filemap(self, file_data: dict, plex_info: dict | None) -> None:
485-
"""Write diann_filemap.tsv."""
604+
def _write_filemap(self, file_data: dict, plex_info: dict | None, design_rows: list[dict] | None = None) -> None:
605+
"""Write diann_design.tsv (unified design file)."""
486606
rows = []
487607
label_type = plex_info["type"] if plex_info else "label free"
488608

609+
design_lookup: dict[tuple[str, str], dict] = {}
610+
if design_rows:
611+
for d in design_rows:
612+
design_lookup[(d["filename"], d["label"])] = d
613+
489614
for filename, fd in file_data.items():
490615
if plex_info is not None:
491-
# For plexDIA: one row per channel per file
492616
for label in fd["labels"]:
493-
rows.append(self._filemap_row(filename, fd, label, label_type))
617+
design = design_lookup.get((filename, label))
618+
rows.append(self._filemap_row(filename, fd, label, label_type, design))
494619
else:
495-
# Label-free: one row per file
496620
label = fd["labels"][0] if fd["labels"] else "label free sample"
497-
rows.append(self._filemap_row(filename, fd, label, label_type))
621+
design = design_lookup.get((filename, label))
622+
rows.append(self._filemap_row(filename, fd, label, label_type, design))
498623

499624
df = pd.DataFrame(rows)
500-
df.to_csv("diann_filemap.tsv", sep="\t", index=False)
625+
df.to_csv("diann_design.tsv", sep="\t", index=False, encoding="utf-8")
501626

502-
def _filemap_row(self, filename: str, fd: dict, label: str, label_type: str) -> dict:
503-
"""Build a single filemap row."""
627+
def _filemap_row(self, filename: str, fd: dict, label: str, label_type: str, design: dict | None = None) -> dict:
628+
"""Build a single design file row."""
504629
return {
505630
"Filename": filename,
506631
"URI": fd.get("uri", ""),
632+
"Sample": design["sample"] if design else "",
633+
"FractionGroup": design["fraction_group"] if design else "",
634+
"Fraction": design["fraction"] if design else 1,
507635
"Label": label,
508636
"LabelType": label_type,
637+
"AcquisitionMethod": design["acquisition_method"] if design else "",
638+
"DissociationMethod": design["dissociation_method"] if design else "",
639+
"Condition": design["condition"] if design else "",
640+
"BioReplicate": design["bioreplicate"] if design else "",
509641
"Enzyme": fd["enzyme"],
510642
"FixedModifications": ";".join(fd["fixed_mods"]),
511643
"VariableModifications": ";".join(fd["var_mods"]),
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file] comment[ms1 scan range] comment[ms2 scan range] comment[proteomics data acquisition method] comment[dissociation method] comment[fraction identifier] comment[technical replicate] factor value[treatment]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1_gpf1.raw 400 m/z-600 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 1 1 control
3+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1_gpf2.raw 600 m/z-800 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 2 1 control
4+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1_gpf3.raw 800 m/z-1000 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 3 1 control
5+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample2_gpf1.raw 400 m/z-600 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 1 1 treated
6+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample2_gpf2.raw 600 m/z-800 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 2 1 treated
7+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample2_gpf3.raw 800 m/z-1000 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 3 1 treated
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file] comment[ms1 scan range] comment[ms2 scan range] comment[proteomics data acquisition method] comment[dissociation method] comment[fraction identifier] comment[technical replicate] factor value[treatment]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw 400 m/z-600 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 1 1 treatment_A
3+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 5 ppm 15 ppm sample2.raw 600 m/z-800 m/z 100 m/z-1800 m/z Data-Independent Acquisition beam-type collision-induced dissociation 1 1 treatment_B
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file] comment[proteomics data acquisition method] comment[dissociation method] comment[fraction identifier] comment[technical replicate] factor value[treatment]
2+
Sample 1 Homo sapiens run 1 ch0 NT=MTRAQ0;AC=PRIDE:0000851 AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm run1.raw Data-Independent Acquisition beam-type collision-induced dissociation 1 1 control
3+
Sample 2 Homo sapiens run 1 ch4 NT=MTRAQ4;AC=PRIDE:0000852 AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm run1.raw Data-Independent Acquisition beam-type collision-induced dissociation 1 1 low_dose
4+
Sample 3 Homo sapiens run 1 ch8 NT=MTRAQ8;AC=PRIDE:0000853 AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm run1.raw Data-Independent Acquisition beam-type collision-induced dissociation 1 1 high_dose
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file] comment[ms1 scan range] comment[ms2 scan range]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw 1200 m/z-400 m/z 100 m/z-1800 m/z

0 commit comments

Comments
 (0)