"""
Converts SDRF files to DIA-NN configuration files:
- diann_config.cfg: DIA-NN command-line flags (enzyme, mods, channels, tolerances, scan ranges)
- diann_design.tsv: Per-file metadata (tolerances, labels, mods, scan ranges, experimental design)
"""
88import logging
99import re
1010
1111import pandas as pd
1212
13- from sdrf_pipelines .converters .base import BaseConverter
13+ from sdrf_pipelines .converters .base import BaseConverter , ConditionBuilder
1414from sdrf_pipelines .converters .diann .constants import ENZYME_NAME_MAPPINGS , ENZYME_SPECIFICITY
1515from sdrf_pipelines .converters .diann .modifications import DiannModificationConverter
1616from sdrf_pipelines .converters .diann .plexdia import (
1717 build_channels_flag ,
1818 build_fixed_mod_flag ,
1919 detect_plexdia_type ,
2020)
21+ from sdrf_pipelines .converters .openms .experimental_design import FractionGroupTracker
2122from sdrf_pipelines .converters .openms .utils import parse_tolerance
2223
2324logger = logging .getLogger (__name__ )
@@ -56,7 +57,7 @@ def diann_convert(self, sdrf_file: str) -> None:
5657
5758 Generates:
5859 - diann_config.cfg: DIA-NN CLI flags
59- - diann_filemap .tsv: Per-file metadata
60+ - diann_design .tsv: Per-file metadata
6061
6162 Args:
6263 sdrf_file: Path to the SDRF file
@@ -98,11 +99,14 @@ def diann_convert(self, sdrf_file: str) -> None:
9899 # Compute global scan ranges across all runs
99100 scan_range_summary = self ._compute_global_scan_ranges (file_data )
100101
102+ # Extract experimental design
103+ design_rows = self ._extract_experimental_design (sdrf , file_data )
104+
101105 # Write config file
102106 self ._write_config (enzyme , diann_fixed , diann_var , plex_info , tolerance_summary , scan_range_summary )
103107
104108 # Write filemap
105- self ._write_filemap (file_data , plex_info )
109+ self ._write_filemap (file_data , plex_info , design_rows )
106110
107111 self .report_warnings ()
108112
@@ -321,10 +325,13 @@ def _extract_modifications(self, row: pd.Series, mod_cols: list[str]) -> tuple[l
321325 mod_str = str (row .get (col , "" )).strip ()
322326 if not mod_str or mod_str .lower () in ("nan" , "not available" , "" ):
323327 continue
324- if "MT=fixed" in mod_str or "mt=fixed" in mod_str :
325- fixed .append (mod_str )
326- elif "MT=variable" in mod_str or "mt=variable" in mod_str :
327- var .append (mod_str )
328+ # Normalize MT key-value to lowercase for consistent comparison
329+ normalized = re .sub (r"(?i)\bMT=\w+" , lambda m : m .group ().lower (), mod_str )
330+ mod_lower = normalized .lower ()
331+ if "mt=fixed" in mod_lower :
332+ fixed .append (normalized )
333+ elif "mt=variable" in mod_lower :
334+ var .append (normalized )
328335 return fixed , var
329336
330337 def _extract_tolerance (self , row : pd .Series , column : str ) -> tuple :
@@ -388,6 +395,11 @@ def _extract_scan_range(self, row: pd.Series, ms_level: str) -> tuple[float | No
388395
389396 # Resolve: range takes precedence over discrete
390397 if range_min is not None and range_max is not None :
398+ if range_min >= range_max :
399+ raise ValueError (
400+ f"Inverted { ms_level } scan range: min ({ range_min } ) >= max ({ range_max } ). "
401+ f"Check your SDRF annotation."
402+ )
391403 if discrete_min is not None or discrete_max is not None :
392404 self .add_warning (
393405 f"Both interval ('{ range_col } ') and discrete min/max columns found for { ms_level } . "
@@ -396,7 +408,115 @@ def _extract_scan_range(self, row: pd.Series, ms_level: str) -> tuple[float | No
396408 return range_min , range_max
397409
398410 # Fall back to discrete
399- return discrete_min , discrete_max
411+ min_mz , max_mz = discrete_min , discrete_max
412+ if min_mz is not None and max_mz is not None and min_mz >= max_mz :
413+ raise ValueError (
414+ f"Inverted { ms_level } scan range: min ({ min_mz } ) >= max ({ max_mz } ). Check your SDRF annotation."
415+ )
416+ return min_mz , max_mz
417+
418+ @staticmethod
419+ def _extract_acquisition_method (row : pd .Series ) -> str :
420+ col = "comment[proteomics data acquisition method]"
421+ if col in row .index :
422+ value = str (row [col ]).strip ()
423+ if value .lower () not in ("" , "nan" , "not available" ):
424+ # Extract NT= value if present (e.g. "NT=Data-Independent Acquisition;AC=NCIT:C161786")
425+ if "NT=" in value :
426+ nt_match = re .search (r"NT=([^;]+)" , value )
427+ if nt_match :
428+ value = nt_match .group (1 ).strip ()
429+ return value
430+ return ""
431+
432+ @staticmethod
433+ def _extract_dissociation_method (row : pd .Series ) -> str :
434+ col = "comment[dissociation method]"
435+ if col in row .index :
436+ value = str (row [col ]).strip ()
437+ if value .lower () not in ("" , "nan" , "not available" ):
438+ # Extract NT= value if present (e.g. "NT=HCD;AC=PRIDE:0000590" -> "HCD")
439+ if "NT=" in value :
440+ nt_match = re .search (r"NT=([^;]+)" , value )
441+ if nt_match :
442+ value = nt_match .group (1 ).strip ()
443+ mapping = {
444+ "collision-induced dissociation" : "CID" ,
445+ "beam-type collision-induced dissociation" : "HCD" ,
446+ "higher energy beam-type collision-induced dissociation" : "HCD" ,
447+ "electron transfer dissociation" : "ETD" ,
448+ "electron capture dissociation" : "ECD" ,
449+ }
450+ return mapping .get (value .lower (), value )
451+ return ""
452+
    def _extract_experimental_design(self, sdrf: pd.DataFrame, file_data: dict) -> list[dict]:
        """Extract experimental design metadata from SDRF.

        Returns a list of dicts, one per SDRF row. For plexDIA, each channel row
        produces its own entry with its own Condition/BioReplicate.

        Args:
            sdrf: Parsed SDRF table (one row per raw-file/channel combination).
            file_data: Per-file metadata collected earlier. NOTE(review): not
                read in this method — presumably kept for interface symmetry;
                confirm whether it can be dropped.

        Returns:
            One dict per SDRF row with keys: filename, label, sample,
            fraction_group, fraction, condition, bioreplicate,
            acquisition_method, dissociation_method.
        """
        # Conditions are built from every "factor value[...]" column.
        factor_cols = [c for c in sdrf.columns if c.startswith("factor value[")]
        condition_builder = ConditionBuilder(factor_cols)
        fraction_tracker = FractionGroupTracker()

        # First pass: record source names in first-seen order and the maximum
        # technical-replicate number observed per source (missing/NA -> "1").
        source_name_list: list[str] = []
        source_name2n_reps: dict[str, int] = {}
        for _, row in sdrf.iterrows():
            sn = str(row["source name"])
            tech_rep = str(row.get("comment[technical replicate]", "1"))
            if tech_rep.lower() in ("", "nan", "not available"):
                tech_rep = "1"
            if sn not in source_name_list:
                source_name_list.append(sn)
                source_name2n_reps[sn] = int(tech_rep)
            else:
                source_name2n_reps[sn] = max(source_name2n_reps[sn], int(tech_rep))

        # Sample and BioReplicate indices are 1-based, assigned in first-seen
        # order; both maps are currently identical per source.
        source_to_sample: dict[str, int] = {}
        source_to_biorep: dict[str, int] = {}
        for i, sn in enumerate(source_name_list, start=1):
            source_to_sample[sn] = i
            source_to_biorep[sn] = i

        design_rows: list[dict] = []
        seen_files: set[str] = set()

        # Second pass: emit one design row per SDRF row. The fraction group is
        # computed only for the first row of each file; later rows for the same
        # file (e.g. plexDIA channels) reuse the tracker's cached group.
        for _, row in sdrf.iterrows():
            filename = str(row["comment[data file]"])
            sn = str(row["source name"])
            tech_rep = str(row.get("comment[technical replicate]", "1"))
            if tech_rep.lower() in ("", "nan", "not available"):
                tech_rep = "1"

            if filename not in seen_files:
                seen_files.add(filename)
                fraction = self.get_fraction_identifier(row)
                # Raw fraction group = cumulative replicate count of all
                # preceding sources + this row's technical replicate number.
                source_idx = source_name_list.index(sn)
                offset = sum(source_name2n_reps[source_name_list[i]] for i in range(source_idx))
                raw_frac_group = offset + int(tech_rep)
                frac_group = fraction_tracker.get_fraction_group(filename, raw_frac_group)
            else:
                fraction = self.get_fraction_identifier(row)
                frac_group = fraction_tracker.fraction_groups[filename]

            # Condition falls back to the source name when no factor values exist.
            condition = condition_builder.add_from_row(row, fallback=sn)

            design_rows.append(
                {
                    "filename": filename,
                    "label": self._extract_label(row),
                    "sample": source_to_sample[sn],
                    "fraction_group": frac_group,
                    # int() assumes get_fraction_identifier returns a numeric
                    # string — TODO confirm against BaseConverter.
                    "fraction": int(fraction),
                    "condition": condition,
                    "bioreplicate": source_to_biorep[sn],
                    "acquisition_method": self._extract_acquisition_method(row),
                    "dissociation_method": self._extract_dissociation_method(row),
                }
            )

        return design_rows
400520
401521 def _write_config (
402522 self ,
@@ -478,34 +598,46 @@ def _write_config(
478598 if val is not None :
479599 parts .append (f"{ flag } { val } " )
480600
481- with open ("diann_config.cfg" , "w" ) as f :
601+ with open ("diann_config.cfg" , "w" , encoding = "utf-8" ) as f :
482602 f .write (" " .join (parts ))
483603
484- def _write_filemap (self , file_data : dict , plex_info : dict | None ) -> None :
485- """Write diann_filemap .tsv."""
604+ def _write_filemap (self , file_data : dict , plex_info : dict | None , design_rows : list [ dict ] | None = None ) -> None :
605+ """Write diann_design .tsv (unified design file) ."""
486606 rows = []
487607 label_type = plex_info ["type" ] if plex_info else "label free"
488608
609+ design_lookup : dict [tuple [str , str ], dict ] = {}
610+ if design_rows :
611+ for d in design_rows :
612+ design_lookup [(d ["filename" ], d ["label" ])] = d
613+
489614 for filename , fd in file_data .items ():
490615 if plex_info is not None :
491- # For plexDIA: one row per channel per file
492616 for label in fd ["labels" ]:
493- rows .append (self ._filemap_row (filename , fd , label , label_type ))
617+ design = design_lookup .get ((filename , label ))
618+ rows .append (self ._filemap_row (filename , fd , label , label_type , design ))
494619 else :
495- # Label-free: one row per file
496620 label = fd ["labels" ][0 ] if fd ["labels" ] else "label free sample"
497- rows .append (self ._filemap_row (filename , fd , label , label_type ))
621+ design = design_lookup .get ((filename , label ))
622+ rows .append (self ._filemap_row (filename , fd , label , label_type , design ))
498623
499624 df = pd .DataFrame (rows )
500- df .to_csv ("diann_filemap .tsv" , sep = "\t " , index = False )
625+ df .to_csv ("diann_design .tsv" , sep = "\t " , index = False , encoding = "utf-8" )
501626
502- def _filemap_row (self , filename : str , fd : dict , label : str , label_type : str ) -> dict :
503- """Build a single filemap row."""
627+ def _filemap_row (self , filename : str , fd : dict , label : str , label_type : str , design : dict | None = None ) -> dict :
628+ """Build a single design file row."""
504629 return {
505630 "Filename" : filename ,
506631 "URI" : fd .get ("uri" , "" ),
632+ "Sample" : design ["sample" ] if design else "" ,
633+ "FractionGroup" : design ["fraction_group" ] if design else "" ,
634+ "Fraction" : design ["fraction" ] if design else 1 ,
507635 "Label" : label ,
508636 "LabelType" : label_type ,
637+ "AcquisitionMethod" : design ["acquisition_method" ] if design else "" ,
638+ "DissociationMethod" : design ["dissociation_method" ] if design else "" ,
639+ "Condition" : design ["condition" ] if design else "" ,
640+ "BioReplicate" : design ["bioreplicate" ] if design else "" ,
509641 "Enzyme" : fd ["enzyme" ],
510642 "FixedModifications" : ";" .join (fd ["fixed_mods" ]),
511643 "VariableModifications" : ";" .join (fd ["var_mods" ]),
0 commit comments