From 01518f54811487c2685c00a5ebdf6fc47281afd0 Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Mon, 9 Jun 2025 17:36:09 +0100 Subject: [PATCH 1/7] Add skeleton untargeted metabolomics pipeline plan (#293) * Add skeleton untargeted pipeline directory with plan * fix flake8 errors in generate chemicals script * Add lint/test guidelines and minimal untargeted test * Implement dataset setup script * Add mzML generation step * Tune chromatogram width * Add ground truth table generation * Add MGF export for simulated chemicals * docs: mark simulation step complete --- AGENTS.md | 23 ++++ demo/untargeted/generate_chemicals.py | 36 ++++++ demo/untargeted/generate_dataset.py | 160 ++++++++++++++++++++++++++ demo/untargeted/plan.md | 40 +++++++ tests/test_untargeted_demo.py | 51 ++++++++ 5 files changed, 310 insertions(+) create mode 100644 AGENTS.md create mode 100644 demo/untargeted/generate_chemicals.py create mode 100644 demo/untargeted/generate_dataset.py create mode 100644 demo/untargeted/plan.md create mode 100644 tests/test_untargeted_demo.py diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..01b1dd7 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,23 @@ +# Development Guidelines + +## Linting +Run the same flake8 commands used in CI before committing: + +```bash +poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics +poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics +``` + +Ensure there are zero linting errors for changed files. + +## Testing +Only execute tests relevant to the area you are working on to save time. For changes in the `demo/untargeted` demo, run: + +```bash +poetry run pytest tests/test_untargeted_demo.py +``` + +## Good Practices +- Write clear commit messages describing your changes. +- Keep functions short and well documented. +- Prefer small, focused pull requests. diff --git a/demo/untargeted/generate_chemicals.py b/demo/untargeted/generate_chemicals.py new file mode 100644 index 0000000..218bbda --- /dev/null +++ b/demo/untargeted/generate_chemicals.py @@ -0,0 +1,36 @@ +from pathlib import Path + +from vimms.ChemicalSamplers import ( + DatabaseFormulaSampler, + UniformRTAndIntensitySampler, + GaussianChromatogramSampler, +) +from vimms.Chemicals import ChemicalMixtureCreator +from vimms.Common import ADDUCT_DICT_POS_MH, load_obj, save_obj + + +FIXTURES = Path(__file__).resolve().parents[2] / "tests" / "fixtures" + + +def generate_chemicals(n_chemicals: int = 100): + """Return a list of simulated chemicals for testing.""" + hmdb = load_obj(FIXTURES / "hmdb_compounds.p") + fs = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000) + ri = UniformRTAndIntensitySampler(min_rt=0, max_rt=180) + chromatogram_sampler = GaussianChromatogramSampler(sigma=1) + cmc = ChemicalMixtureCreator( + fs, + rt_and_intensity_sampler=ri, + chromatogram_sampler=chromatogram_sampler, + adduct_prior_dict=ADDUCT_DICT_POS_MH, + ) + return cmc.sample(n_chemicals, 2) + + +def main(): + chemicals = generate_chemicals() + save_obj(chemicals, str(FIXTURES / "demo_chemicals.p")) + + +if __name__ == "__main__": + main() diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py new file mode 100644 index 0000000..5e7641f --- /dev/null +++ b/demo/untargeted/generate_dataset.py @@ -0,0 +1,160 @@ +"""Prepare a simulated dataset design for untargeted pipeline testing. + +This module creates chemicals and an experimental design consisting of two +groups (case and control). It provides utilities to generate mzML files +for each sample using a simple Top-1 DDA controller and produces a ground +truth table of the underlying peaks. +""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Tuple + +from vimms.Common import POSITIVE, PROTON_MASS +from vimms.Controller import TopNController +from vimms.Environment import Environment +from vimms.MassSpec import IndependentMassSpectrometer + +from .generate_chemicals import generate_chemicals +import pandas as pd + + +@dataclass +class ExperimentalDesign: + """Simple container describing sample groups.""" + + samples: Dict[str, List[str]] + + +def create_design(n_samples_per_group: int = 5) -> ExperimentalDesign: + """Return an experimental design with case and control groups.""" + + design = { + "case": [f"case_{i + 1}" for i in range(n_samples_per_group)], + "control": [f"control_{i + 1}" for i in range(n_samples_per_group)], + } + return ExperimentalDesign(samples=design) + + +def setup_simulation( + n_chemicals: int = 100, n_samples_per_group: int = 5 +) -> Tuple[list, ExperimentalDesign]: + """Generate chemicals and an experimental design.""" + + chemicals = generate_chemicals(n_chemicals) + design = create_design(n_samples_per_group) + return chemicals, design + + +def generate_mzml_files( + chemicals: list, + design: ExperimentalDesign, + out_dir: Path, + max_rt: int = 180, + top_n: int = 1, +) -> None: + """Generate an mzML file for each sample in ``design``. + + Parameters + ---------- + chemicals: + List of simulated chemicals. + design: + Experimental design describing the sample groups. + out_dir: + Directory where the mzML files will be written. + max_rt: + Maximum retention time (seconds) for the simulation. + top_n: + Number of precursors to fragment in each cycle. + """ + + for group, samples in design.samples.items(): + group_dir = out_dir / group + group_dir.mkdir(parents=True, exist_ok=True) + for sample in samples: + ms = IndependentMassSpectrometer(POSITIVE, chemicals) + controller = TopNController( + POSITIVE, top_n, 1, 10, 15, 1000 + ) + env = Environment( + ms, + controller, + 0, + max_rt, + progress_bar=False, + out_dir=group_dir, + out_file=f"{sample}.mzML", + ) + env.run() + + +def generate_ground_truth_table( + chemicals: list, design: ExperimentalDesign +) -> pd.DataFrame: + """Return a table describing the true peaks for each sample.""" + + records = [] + for compound_id, chem in enumerate(chemicals): + mz = chem.mass + PROTON_MASS + rt_min = chem.get_min_rt() + rt_max = chem.get_max_rt() + rt_apex = chem.get_apex_rt() + for group_samples in design.samples.values(): + for sample in group_samples: + records.append( + { + "sample": sample, + "compound_id": compound_id, + "mz_apex": mz, + "rt_apex": rt_apex, + "intensity": chem.max_intensity, + "mz_min": mz - 0.01, + "mz_max": mz + 0.01, + "rt_min": rt_min, + "rt_max": rt_max, + } + ) + return pd.DataFrame.from_records(records) + + +def write_ground_truth_mgf(chemicals: list, out_file: Path) -> None: + """Write a simple MGF file of MS2 spectra for each chemical.""" + + with open(out_file, "w") as f: + for compound_id, chem in enumerate(chemicals): + if not chem.children: + continue + pepmass = chem.mass + PROTON_MASS + rt_apex = chem.get_apex_rt() + + f.write("BEGIN IONS\n") + f.write(f"TITLE=compound_{compound_id}\n") + f.write(f"PEPMASS={pepmass:.5f}\n") + f.write("CHARGE=1+\n") + f.write(f"RTINSECONDS={rt_apex:.2f}\n") + for frag in chem.children: + mz = frag.isotopes[0][0] + intensity = chem.max_intensity * frag.prop_ms2_mass + f.write(f"{mz:.5f} {intensity:.1f}\n") + f.write("END IONS\n") + + +def main() -> None: + """Entry point for dataset preparation.""" + + chemicals, design = setup_simulation() + out_dir = Path("./out") + out_dir.mkdir(parents=True, exist_ok=True) + + print(f"Generated {len(chemicals)} chemicals") + for group, names in design.samples.items(): + print(group, names) + generate_mzml_files(chemicals, design, out_dir) + gt = generate_ground_truth_table(chemicals, design) + gt.to_csv(out_dir / "ground_truth.csv", index=False) + write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf") + + +if __name__ == "__main__": + main() diff --git a/demo/untargeted/plan.md b/demo/untargeted/plan.md new file mode 100644 index 0000000..15e7d81 --- /dev/null +++ b/demo/untargeted/plan.md @@ -0,0 +1,40 @@ +# Untargeted Metabolomics Pipeline Plan + +This document outlines the tasks for building a simple untargeted metabolomics +processing pipeline. Each task contains a short description and a status field to +track progress. + +## Tasks + +1. **Simulated Data Generation** + - Create a Python script to generate 100 chemicals using `ChemicalMixtureCreator`. + - Split chemicals into two groups (case/control) with 5 samples each. + - Use reasonable metabolomics ranges (e.g. m/z 100--1000, RT <3 min) with + chromatograms roughly 5 s wide (``sigma≈1``). + - Acquire data with Top‑1 DDA in positive mode including common adducts. + - Export mzML files, a ground‑truth table linking peaks to compounds, and a + simple MGF library of MS2 spectra. + - **Status:** Completed (mzML, ground truth table, and MGF library generated) + +2. **Peak Picking** + - Since the chemicals are simulated, derive peak information directly from the + ground truth and produce a table of peaks per sample (bounding box, apex m/z, + RT, intensity, sample name, etc.). + - **Status:** Not started + +3. **Alignment** + - Implement a simple join aligner that merges peaks across samples to create + an aligned feature table. Optionally use MZmine for comparison. + - **Status:** Not started + +4. **Grouping Related Peaks** + - Cluster related peaks (e.g. isotopes/adducts) using a simple RT clustering + approach such as a Dirichlet process mixture model. Provide an option to + run a similar step in MZmine. + - **Status:** Not started + +5. **Identification** + - Match observed peaks to the ground‑truth MSP library to assess performance. + - **Status:** Not started + +A main pipeline script will orchestrate these steps once they are implemented. diff --git a/tests/test_untargeted_demo.py b/tests/test_untargeted_demo.py new file mode 100644 index 0000000..af07288 --- /dev/null +++ b/tests/test_untargeted_demo.py @@ -0,0 +1,51 @@ + +from demo.untargeted.generate_chemicals import generate_chemicals +from demo.untargeted.generate_dataset import ( + create_design, + generate_mzml_files, + generate_ground_truth_table, + write_ground_truth_mgf, + setup_simulation, +) + + +def test_generate_chemicals_length(): + chems = generate_chemicals(10) + assert len(chems) == 10 + # ensure each chemical has expected attributes + for chem in chems: + assert hasattr(chem, 'mass') + assert hasattr(chem, 'rt') + + +def test_create_design(): + design = create_design(3) + assert design.samples['case'] == ['case_1', 'case_2', 'case_3'] + assert len(design.samples['control']) == 3 + + +def test_setup_simulation_returns_data(): + chems, design = setup_simulation(20, 2) + assert len(chems) == 20 + assert len(design.samples['case']) == 2 + + +def test_generate_mzml_files(tmp_path): + chems, design = setup_simulation(3, 1) + generate_mzml_files(chems, design, tmp_path, max_rt=20, top_n=1) + assert (tmp_path / 'case' / 'case_1.mzML').is_file() + + +def test_generate_ground_truth_table(): + chems, design = setup_simulation(2, 1) + df = generate_ground_truth_table(chems, design) + assert {'sample', 'compound_id', 'mz_min', 'mz_max'} <= set(df.columns) + assert len(df) == 4 # 2 samples (case/control) * 2 chemicals + + +def test_write_ground_truth_mgf(tmp_path): + chems = generate_chemicals(2) + out_file = tmp_path / "lib.mgf" + write_ground_truth_mgf(chems, out_file) + text = out_file.read_text() + assert text.startswith("BEGIN IONS") From d42626f222bdb2b15ba30c08af717cc7126032d8 Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Mon, 9 Jun 2025 22:07:10 +0100 Subject: [PATCH 2/7] Add pipeline script and evaluation (#295) --- demo/untargeted/evaluation/__init__.py | 5 ++ demo/untargeted/evaluation/metrics.py | 33 +++++++++ demo/untargeted/join_aligner.py | 65 ++++++++++++++++++ demo/untargeted/peak_picking.py | 43 ++++++++++++ demo/untargeted/pipeline.py | 75 +++++++++++++++++++++ demo/untargeted/plan.md | 23 ++++--- tests/test_untargeted_demo.py | 92 ++++++++++++++++++++++++++ 7 files changed, 328 insertions(+), 8 deletions(-) create mode 100644 demo/untargeted/evaluation/__init__.py create mode 100644 demo/untargeted/evaluation/metrics.py create mode 100644 demo/untargeted/join_aligner.py create mode 100644 demo/untargeted/peak_picking.py create mode 100644 demo/untargeted/pipeline.py diff --git a/demo/untargeted/evaluation/__init__.py b/demo/untargeted/evaluation/__init__.py new file mode 100644 index 0000000..be29703 --- /dev/null +++ b/demo/untargeted/evaluation/__init__.py @@ -0,0 +1,5 @@ +"""Evaluation utilities for the untargeted demo.""" + +from .metrics import compute_group_metrics + +__all__ = ["compute_group_metrics"] diff --git a/demo/untargeted/evaluation/metrics.py b/demo/untargeted/evaluation/metrics.py new file mode 100644 index 0000000..e132f20 --- /dev/null +++ b/demo/untargeted/evaluation/metrics.py @@ -0,0 +1,33 @@ +import pandas as pd +from sklearn.metrics import adjusted_rand_score, pair_confusion_matrix + + +def compute_group_metrics( + peaks: pd.DataFrame, + compound_col: str = "compound_id", + group_col: str = "group", +) -> dict: + """Return alignment metrics comparing predicted groups to ground truth. + + Parameters + ---------- + peaks : pd.DataFrame + Table containing columns ``compound_col`` and ``group_col``. + compound_col : str, optional + Column giving ground truth compound identifiers. + group_col : str, optional + Column giving predicted alignment group labels. + + Returns + ------- + dict + Dictionary with ``precision``, ``recall``, ``f1`` and ``ari`` keys. + """ + labels_true = peaks[compound_col].to_numpy() + labels_pred = peaks[group_col].to_numpy() + tn, fp, fn, tp = pair_confusion_matrix(labels_true, labels_pred).ravel() + precision = tp / (tp + fp) if tp + fp else 0.0 + recall = tp / (tp + fn) if tp + fn else 0.0 + f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0 + ari = adjusted_rand_score(labels_true, labels_pred) + return {"precision": precision, "recall": recall, "f1": f1, "ari": ari} diff --git a/demo/untargeted/join_aligner.py b/demo/untargeted/join_aligner.py new file mode 100644 index 0000000..04034f1 --- /dev/null +++ b/demo/untargeted/join_aligner.py @@ -0,0 +1,65 @@ +import pandas as pd + + +def join_align( + peaks: pd.DataFrame, + mz_tol: float, + rt_tol: float, + return_labels: bool = False, +) -> pd.DataFrame: + """Align peaks across samples based on m/z and RT tolerances. + + Parameters + ---------- + peaks : pd.DataFrame + Table containing ``sample``, ``mz``, ``rt`` and ``intensity`` columns. + mz_tol : float + Absolute m/z tolerance for grouping peaks. + rt_tol : float + Absolute retention time tolerance for grouping peaks. + return_labels : bool, optional + If ``True``, also return the input peaks with a ``group`` column + describing the assigned alignment group. + + Returns + ------- + pd.DataFrame + An intensity table with groups as rows and samples as columns. If + ``return_labels`` is ``True`` a tuple ``(aligned, peaks)`` is returned + where ``peaks`` contains the assigned ``group`` for each row. + """ + required = {"sample", "mz", "rt", "intensity"} + if not required <= set(peaks.columns): + raise ValueError(f"Input peaks must contain columns {required}") + + peaks = peaks.copy().sort_values(["mz", "rt"]).reset_index(drop=True) + groups: list[dict] = [] + group_ids: list[int] = [-1] * len(peaks) + + for idx, row in peaks.iterrows(): + assigned = False + for gid, g in enumerate(groups): + if abs(row["mz"] - g["mz"]) <= mz_tol and abs(row["rt"] - g["rt"]) <= rt_tol: + n = g["n"] + 1 + g["mz"] = (g["mz"] * g["n"] + row["mz"]) / n + g["rt"] = (g["rt"] * g["n"] + row["rt"]) / n + g["n"] = n + group_ids[idx] = gid + assigned = True + break + if not assigned: + groups.append({"mz": row["mz"], "rt": row["rt"], "n": 1}) + group_ids[idx] = len(groups) - 1 + + peaks["group"] = group_ids + aligned = peaks.pivot_table( + index="group", + columns="sample", + values="intensity", + aggfunc="max", + fill_value=0, + ).sort_index() + + if return_labels: + return aligned, peaks + return aligned diff --git a/demo/untargeted/peak_picking.py b/demo/untargeted/peak_picking.py new file mode 100644 index 0000000..584eb47 --- /dev/null +++ b/demo/untargeted/peak_picking.py @@ -0,0 +1,43 @@ +import pandas as pd +from pathlib import Path + + +def peak_table_from_ground_truth(gt: pd.DataFrame) -> pd.DataFrame: + """Return a simple peak table derived from the ground truth. + + Parameters + ---------- + gt : pd.DataFrame + Ground truth table produced by :func:`generate_ground_truth_table`. + + Returns + ------- + pd.DataFrame + DataFrame with columns ``sample``, ``mz``, ``rt`` and ``intensity``. + """ + required = {"sample", "mz_apex", "rt_apex", "intensity"} + if not required <= set(gt.columns): + raise ValueError(f"Ground truth must contain columns {required}") + return gt.rename(columns={"mz_apex": "mz", "rt_apex": "rt"})[ + ["sample", "mz", "rt", "intensity"] + ] + + +def write_peak_table(gt_file: Path, out_file: Path) -> pd.DataFrame: + """Read ``gt_file`` and write the peak table to ``out_file``.""" + gt = pd.read_csv(gt_file) + peaks = peak_table_from_ground_truth(gt) + out_file.parent.mkdir(parents=True, exist_ok=True) + peaks.to_csv(out_file, index=False) + return peaks + + +def main() -> None: + """Entry point for the peak picking step.""" + gt_file = Path("./out/ground_truth.csv") + out_file = Path("./out/peaks.csv") + write_peak_table(gt_file, out_file) + + +if __name__ == "__main__": + main() diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py new file mode 100644 index 0000000..5880343 --- /dev/null +++ b/demo/untargeted/pipeline.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import json +from pathlib import Path + + +from .generate_dataset import ( + setup_simulation, + generate_mzml_files, + generate_ground_truth_table, + write_ground_truth_mgf, +) +from .peak_picking import peak_table_from_ground_truth +from .join_aligner import join_align +from .evaluation import compute_group_metrics + + +def run_pipeline( + out_dir: Path = Path("./out"), + n_chemicals: int = 100, + n_samples_per_group: int = 5, + mz_tol: float = 0.01, + rt_tol: float = 0.5, + max_rt: int = 180, + top_n: int = 1, +) -> dict: + """Run the full untargeted demo pipeline and return alignment metrics.""" + + out_dir.mkdir(parents=True, exist_ok=True) + + # Simulate chemicals and dataset design + chemicals, design = setup_simulation(n_chemicals, n_samples_per_group) + + # Generate mzML files + generate_mzml_files(chemicals, design, out_dir, max_rt=max_rt, top_n=top_n) + + # Ground truth table and library + gt = generate_ground_truth_table(chemicals, design) + gt_file = out_dir / "ground_truth.csv" + gt.to_csv(gt_file, index=False) + write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf") + + # Peak picking from ground truth + peaks = peak_table_from_ground_truth(gt) + peaks_file = out_dir / "peaks.csv" + peaks.to_csv(peaks_file, index=False) + + # Alignment + aligned, labeled = join_align(peaks, mz_tol, rt_tol, return_labels=True) + aligned.to_csv(out_dir / "aligned.csv") + + # Join group labels back to ground truth for evaluation + gt_eval = gt[["sample", "compound_id", "mz_apex", "rt_apex", "intensity"]].rename( + columns={"mz_apex": "mz", "rt_apex": "rt"} + ) + df_eval = labeled.merge(gt_eval, on=["sample", "mz", "rt", "intensity"], how="left") + + metrics = compute_group_metrics(df_eval, compound_col="compound_id", group_col="group") + + (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) + + return metrics + + +def main() -> None: + """Command-line entry point for running the pipeline.""" + + metrics = run_pipeline() + print("Alignment metrics:") + for k, v in metrics.items(): + print(f"{k}: {v:.4f}") + + +if __name__ == "__main__": + main() diff --git a/demo/untargeted/plan.md b/demo/untargeted/plan.md index 15e7d81..b625336 100644 --- a/demo/untargeted/plan.md +++ b/demo/untargeted/plan.md @@ -9,10 +9,10 @@ track progress. 1. **Simulated Data Generation** - Create a Python script to generate 100 chemicals using `ChemicalMixtureCreator`. - Split chemicals into two groups (case/control) with 5 samples each. - - Use reasonable metabolomics ranges (e.g. m/z 100--1000, RT <3 min) with - chromatograms roughly 5 s wide (``sigma≈1``). - - Acquire data with Top‑1 DDA in positive mode including common adducts. - - Export mzML files, a ground‑truth table linking peaks to compounds, and a + - Use reasonable metabolomics ranges (e.g. m/z 100--1000, RT <3 min) with + chromatograms roughly 5 s wide (``sigma~1``). + - Acquire data with Top-1 DDA in positive mode including common adducts. + - Export mzML files, a ground truth table linking peaks to compounds, and a simple MGF library of MS2 spectra. - **Status:** Completed (mzML, ground truth table, and MGF library generated) @@ -20,12 +20,12 @@ track progress. - Since the chemicals are simulated, derive peak information directly from the ground truth and produce a table of peaks per sample (bounding box, apex m/z, RT, intensity, sample name, etc.). - - **Status:** Not started + - **Status:** Completed (`peak_picking.py` generates `peaks.csv`) 3. **Alignment** - Implement a simple join aligner that merges peaks across samples to create an aligned feature table. Optionally use MZmine for comparison. - - **Status:** Not started + - **Status:** Completed (``join_align`` implemented in ``join_aligner.py``) 4. **Grouping Related Peaks** - Cluster related peaks (e.g. isotopes/adducts) using a simple RT clustering @@ -34,7 +34,14 @@ track progress. - **Status:** Not started 5. **Identification** - - Match observed peaks to the ground‑truth MSP library to assess performance. + - Match observed peaks to the ground truth MSP library to assess performance. - **Status:** Not started -A main pipeline script will orchestrate these steps once they are implemented. +6. **Evaluation** + - Measure alignment accuracy against the ground truth using metrics such as + precision, recall, F1 score and adjusted Rand index. + - **Status:** Completed (evaluated in `pipeline.py`) + +7. **Pipeline Script** + - Orchestrate all steps from data generation through evaluation. + - **Status:** Completed (`pipeline.py` chains the full workflow) diff --git a/tests/test_untargeted_demo.py b/tests/test_untargeted_demo.py index af07288..5f9c3a4 100644 --- a/tests/test_untargeted_demo.py +++ b/tests/test_untargeted_demo.py @@ -8,6 +8,12 @@ setup_simulation, ) +from demo.untargeted.join_aligner import join_align +from demo.untargeted.evaluation import compute_group_metrics +from demo.untargeted.peak_picking import peak_table_from_ground_truth +from demo.untargeted.pipeline import run_pipeline +import pandas as pd + def test_generate_chemicals_length(): chems = generate_chemicals(10) @@ -49,3 +55,89 @@ def test_write_ground_truth_mgf(tmp_path): write_ground_truth_mgf(chems, out_file) text = out_file.read_text() assert text.startswith("BEGIN IONS") + + +def test_join_align_groups_by_tolerances(): + peaks = pd.DataFrame( + { + "sample": ["s1", "s1", "s2", "s3"], + "mz": [100.0, 105.0, 100.01, 200.0], + "rt": [1.0, 1.5, 1.01, 2.0], + "intensity": [10, 5, 20, 30], + } + ) + aligned = join_align(peaks, mz_tol=0.02, rt_tol=0.05) + assert set(aligned.columns) == {"s1", "s2", "s3"} + assert len(aligned) == 3 + assert aligned.loc[0, "s1"] == 10 + assert aligned.loc[0, "s2"] == 20 + assert aligned.loc[1, "s1"] == 5 + assert aligned.loc[2, "s3"] == 30 + + +def test_join_align_handles_missing_samples(): + peaks = pd.DataFrame( + { + "sample": ["s1", "s2"], + "mz": [50.0, 50.01], + "rt": [0.5, 0.49], + "intensity": [5, 15], + } + ) + aligned = join_align(peaks, mz_tol=0.02, rt_tol=0.02) + assert len(aligned) == 1 + assert aligned.loc[0, "s1"] == 5 + assert aligned.loc[0, "s2"] == 15 + + +def test_compute_group_metrics_perfect(): + df = pd.DataFrame({ + "group": [0, 0, 1, 1], + "compound_id": [1, 1, 2, 2], + }) + metrics = compute_group_metrics(df) + assert metrics["f1"] == 1.0 + assert metrics["ari"] == 1.0 + + +def test_compute_group_metrics_imperfect(): + df = pd.DataFrame({ + "group": [0, 0, 1, 1], + "compound_id": [1, 2, 1, 2], + }) + metrics = compute_group_metrics(df) + assert 0 <= metrics["precision"] < 1.0 + + +def test_peak_table_from_ground_truth(): + gt = pd.DataFrame( + { + "sample": ["s1", "s2"], + "compound_id": [0, 1], + "mz_apex": [100.0, 200.0], + "rt_apex": [1.0, 2.0], + "intensity": [10, 20], + "mz_min": [99.9, 199.9], + "mz_max": [100.1, 200.1], + "rt_min": [0.9, 1.9], + "rt_max": [1.1, 2.1], + } + ) + peaks = peak_table_from_ground_truth(gt) + assert set(peaks.columns) == {"sample", "mz", "rt", "intensity"} + assert len(peaks) == 2 + assert peaks.loc[1, "rt"] == 2.0 + + +def test_run_pipeline(tmp_path): + metrics = run_pipeline( + out_dir=tmp_path, + n_chemicals=1, + n_samples_per_group=1, + mz_tol=0.02, + rt_tol=0.1, + max_rt=20, + top_n=1, + ) + assert set(metrics) == {"precision", "recall", "f1", "ari"} + assert (tmp_path / "aligned.csv").is_file() From 39780de872385a400b635296179f76afc5077fc5 Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Tue, 10 Jun 2025 17:50:34 +0100 Subject: [PATCH 3/7] Add helper to report alignment metrics (#296) * Add metric reporting helper to pipeline * Add linear column RT noise option * Add RT drift models and use in demo pipeline * Refactor drift models into main package * Clean up drift and demo scripts --- demo/untargeted/generate_dataset.py | 64 ++++++++++++++++++++----- demo/untargeted/pipeline.py | 74 ++++++++++++++++++++++------- vimms/ColumnDrift.py | 71 +++++++++++++++++++++++++++ 3 files changed, 181 insertions(+), 28 deletions(-) create mode 100644 vimms/ColumnDrift.py diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py index 5e7641f..6805522 100644 --- a/demo/untargeted/generate_dataset.py +++ b/demo/untargeted/generate_dataset.py @@ -8,12 +8,13 @@ from dataclasses import dataclass from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List from vimms.Common import POSITIVE, PROTON_MASS from vimms.Controller import TopNController from vimms.Environment import Environment from vimms.MassSpec import IndependentMassSpectrometer +from vimms.ColumnDrift import SimulatedDriftModel from .generate_chemicals import generate_chemicals import pandas as pd @@ -37,8 +38,8 @@ def create_design(n_samples_per_group: int = 5) -> ExperimentalDesign: def setup_simulation( - n_chemicals: int = 100, n_samples_per_group: int = 5 -) -> Tuple[list, ExperimentalDesign]: + n_chemicals=100, n_samples_per_group=5 +): """Generate chemicals and an experimental design.""" chemicals = generate_chemicals(n_chemicals) @@ -52,7 +53,8 @@ def generate_mzml_files( out_dir: Path, max_rt: int = 180, top_n: int = 1, -) -> None: + column_params=None, +) -> dict: """Generate an mzML file for each sample in ``design``. Parameters @@ -67,13 +69,34 @@ def generate_mzml_files( Maximum retention time (seconds) for the simulation. top_n: Number of precursors to fragment in each cycle. + column_params: + Optional parameters for ``SimulatedDriftModel`` specifying + ``noise_sd``, ``intercept_params`` and ``linear_params``. + + Returns + ------- + dict + Mapping sample names to the chemicals used for simulation. """ + sample_chems = {} for group, samples in design.samples.items(): group_dir = out_dir / group group_dir.mkdir(parents=True, exist_ok=True) for sample in samples: - ms = IndependentMassSpectrometer(POSITIVE, chemicals) + if column_params is not None: + drift = SimulatedDriftModel( + intercept_mu=column_params.get("intercept_params", (0.0, 5.0))[0], + intercept_sd=column_params.get("intercept_params", (0.0, 5.0))[1], + slope_mu=1.0 + column_params.get("linear_params", (0.0, 0.001))[0], + slope_sd=column_params.get("linear_params", (0.0, 0.001))[1], + noise_sd=column_params.get("noise_sd", 0.1), + ) + col = drift.make_column(chemicals) + sample_data = col.get_dataset() + else: + sample_data = chemicals + ms = IndependentMassSpectrometer(POSITIVE, sample_data) controller = TopNController( POSITIVE, top_n, 1, 10, 15, 1000 ) @@ -87,28 +110,47 @@ def generate_mzml_files( out_file=f"{sample}.mzML", ) env.run() + sample_chems[sample] = sample_data + return sample_chems def generate_ground_truth_table( - chemicals: list, design: ExperimentalDesign + chemicals, + design: ExperimentalDesign, + per_sample_chems=None, ) -> pd.DataFrame: - """Return a table describing the true peaks for each sample.""" + """Return a table describing the true peaks for each sample. + + Parameters + ---------- + chemicals: + Base chemical list used when ``per_sample_chems`` is ``None``. + design: + Experimental design describing the samples. + per_sample_chems: + Optional mapping of sample name to a list of chemicals with RT drift + applied. When provided, ground truth retention times are taken from this + mapping instead of ``chemicals``. + """ records = [] for compound_id, chem in enumerate(chemicals): mz = chem.mass + PROTON_MASS - rt_min = chem.get_min_rt() - rt_max = chem.get_max_rt() - rt_apex = chem.get_apex_rt() + base_rt_min = chem.get_min_rt() + base_rt_max = chem.get_max_rt() for group_samples in design.samples.values(): for sample in group_samples: + s_chem = chem if per_sample_chems is None else per_sample_chems[sample][compound_id] + rt_apex = s_chem.get_apex_rt() + rt_min = s_chem.get_min_rt() if per_sample_chems else base_rt_min + rt_max = s_chem.get_max_rt() if per_sample_chems else base_rt_max records.append( { "sample": sample, "compound_id": compound_id, "mz_apex": mz, "rt_apex": rt_apex, - "intensity": chem.max_intensity, + "intensity": s_chem.max_intensity, "mz_min": mz - 0.01, "mz_max": mz + 0.01, "rt_min": rt_min, diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py index 5880343..18c5a2e 100644 --- a/demo/untargeted/pipeline.py +++ b/demo/untargeted/pipeline.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import json from pathlib import Path @@ -15,27 +13,71 @@ from .evaluation import compute_group_metrics +def report_metrics(metrics): + """Print alignment metrics in a friendly format.""" + + print("Alignment metrics:") + for key, value in metrics.items(): + print(f"{key}: {value:.4f}") + + def run_pipeline( - out_dir: Path = Path("./out"), - n_chemicals: int = 100, - n_samples_per_group: int = 5, - mz_tol: float = 0.01, - rt_tol: float = 0.5, - max_rt: int = 180, - top_n: int = 1, -) -> dict: - """Run the full untargeted demo pipeline and return alignment metrics.""" + out_dir=Path("./out"), + n_chemicals=100, + n_samples_per_group=5, + mz_tol=0.01, + rt_tol=0.5, + max_rt=180, + top_n=1, + use_rt_noise=False, + noise_sd=0.1, + intercept_params=(0.0, 5.0), + linear_params=(0.0, 0.001), +): + """Run the full untargeted demo pipeline and return alignment metrics. + + Parameters + ---------- + use_rt_noise: + Whether to apply retention time drift to each injection using a + ``SimulatedDriftModel``. + noise_sd: + Standard deviation of random noise added around the drift function. + intercept_params: + Mean and standard deviation of the intercept term (seconds). + linear_params: + Mean and standard deviation of the linear term. + """ out_dir.mkdir(parents=True, exist_ok=True) # Simulate chemicals and dataset design chemicals, design = setup_simulation(n_chemicals, n_samples_per_group) + column_params = None + if use_rt_noise: + column_params = { + "noise_sd": noise_sd, + "intercept_params": intercept_params, + "linear_params": linear_params, + } + # Generate mzML files - generate_mzml_files(chemicals, design, out_dir, max_rt=max_rt, top_n=top_n) + sample_chems = generate_mzml_files( + chemicals, + design, + out_dir, + max_rt=max_rt, + top_n=top_n, + column_params=column_params, + ) # Ground truth table and library - gt = generate_ground_truth_table(chemicals, design) + gt = generate_ground_truth_table( + chemicals, + design, + per_sample_chems=sample_chems if use_rt_noise else None, + ) gt_file = out_dir / "ground_truth.csv" gt.to_csv(gt_file, index=False) write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf") @@ -65,10 +107,8 @@ def run_pipeline( def main() -> None: """Command-line entry point for running the pipeline.""" - metrics = run_pipeline() - print("Alignment metrics:") - for k, v in metrics.items(): - print(f"{k}: {v:.4f}") + metrics = run_pipeline(use_rt_noise=True) + report_metrics(metrics) if __name__ == "__main__": diff --git a/vimms/ColumnDrift.py b/vimms/ColumnDrift.py new file mode 100644 index 0000000..08c46c8 --- /dev/null +++ b/vimms/ColumnDrift.py @@ -0,0 +1,71 @@ +"""Models run-to-run retention time drift for LC-MS injections.""" + +import numpy as np +from abc import ABCMeta, abstractmethod + +from .Column import LinearColumn + + +class BaseColumnDriftModel(metaclass=ABCMeta): + """Base class for objects that provide one drifted ``LinearColumn``.""" + + def __init__(self, noise_sd=0.0, random_state=None): + self.noise_sd = noise_sd + self._rng = np.random.default_rng(random_state) + + @abstractmethod + def _sample_drift(self): + """Return ``(intercept_shift, slope_scale)`` for one injection.""" + + def make_column(self, dataset): + """Return a ``LinearColumn`` with drift for ``dataset``.""" + intercept, scale = self._sample_drift() + return LinearColumn.from_fixed_offsets(dataset, self.noise_sd, intercept, scale - 1.0) + + +class SimulatedDriftModel(BaseColumnDriftModel): + """Draw drift parameters from Normal distributions.""" + + def __init__( + self, + intercept_mu=0.0, + intercept_sd=0.0, + slope_mu=1.0, + slope_sd=0.0, + noise_sd=0.0, + random_state=None, + ): + super().__init__(noise_sd, random_state) + self.intercept_mu = intercept_mu + self.intercept_sd = intercept_sd + self.slope_mu = slope_mu + self.slope_sd = slope_sd + + def _sample_drift(self): + if self.intercept_sd > 0: + intercept = self._rng.normal(self.intercept_mu, self.intercept_sd) + else: + intercept = self.intercept_mu + if self.slope_sd > 0: + slope = self._rng.normal(self.slope_mu, self.slope_sd) + else: + slope = self.slope_mu + return intercept, slope + + +class DataDrivenDriftModel(BaseColumnDriftModel): + """Replay empirically observed drift parameters.""" + + def __init__(self, intercepts, slopes, noise_sd=0.0, random_state=None): + if len(intercepts) != len(slopes): + raise ValueError("intercepts and slopes must have the same length") + super().__init__(noise_sd, random_state) + self._intercepts = list(intercepts) + self._slopes = list(slopes) + self._cursor = 0 + + def _sample_drift(self): + intercept = self._intercepts[self._cursor] + slope = self._slopes[self._cursor] + self._cursor = (self._cursor + 1) % len(self._intercepts) + return intercept, slope From 2146ac97d7fdf49951de470734af90e0051d913a Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Wed, 11 Jun 2025 00:09:19 +0100 Subject: [PATCH 4/7] Refactor pipeline output handling (#300) * Refactor output handling and add get_peak_data * Refactor pipeline into classes --- demo/untargeted/generate_dataset.py | 75 ++++++++++-- demo/untargeted/pipeline.py | 161 +++++++++++------------- demo/untargeted/processing.py | 182 ++++++++++++++++++++++++++++ tests/test_untargeted_demo.py | 8 +- 4 files changed, 321 insertions(+), 105 deletions(-) create mode 100644 demo/untargeted/processing.py diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py index 6805522..6cf8293 100644 --- a/demo/untargeted/generate_dataset.py +++ b/demo/untargeted/generate_dataset.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional from vimms.Common import POSITIVE, PROTON_MASS from vimms.Controller import TopNController @@ -27,6 +27,16 @@ class ExperimentalDesign: samples: Dict[str, List[str]] +@dataclass +class Dataset: + """Container describing input data for the pipeline.""" + + design: ExperimentalDesign + mzml_files: Dict[str, Path] + ground_truth: Optional[pd.DataFrame] = None + mgf_file: Optional[Path] = None + + def create_design(n_samples_per_group: int = 5) -> ExperimentalDesign: """Return an experimental design with case and control groups.""" @@ -182,20 +192,63 @@ def write_ground_truth_mgf(chemicals: list, out_file: Path) -> None: f.write("END IONS\n") -def main() -> None: - """Entry point for dataset preparation.""" +def generate_synthetic_dataset( + out_dir: Path, + n_chemicals: int = 100, + n_samples_per_group: int = 5, + max_rt: int = 180, + top_n: int = 1, + use_rt_noise: bool = False, + noise_sd: float = 0.1, + intercept_params: tuple[float, float] = (0.0, 5.0), + linear_params: tuple[float, float] = (0.0, 0.001), +) -> Dataset: + """Generate a synthetic dataset and return a :class:`Dataset` object.""" - chemicals, design = setup_simulation() - out_dir = Path("./out") out_dir.mkdir(parents=True, exist_ok=True) + chemicals, design = setup_simulation(n_chemicals, n_samples_per_group) + + column_params = None + if use_rt_noise: + column_params = { + "noise_sd": noise_sd, + "intercept_params": intercept_params, + "linear_params": linear_params, + } + + sample_chems = generate_mzml_files( + chemicals, + design, + out_dir, + max_rt=max_rt, + top_n=top_n, + column_params=column_params, + ) + + gt = generate_ground_truth_table( + chemicals, + design, + per_sample_chems=sample_chems if use_rt_noise else None, + ) + gt_file = out_dir / "ground_truth.csv" + gt.to_csv(gt_file, index=False) + mgf_file = out_dir / "ground_truth.mgf" + write_ground_truth_mgf(chemicals, mgf_file) + + mzml_files = { + sample: out_dir / group / f"{sample}.mzML" + for group, samples in design.samples.items() + for sample in samples + } + + return Dataset(design=design, mzml_files=mzml_files, ground_truth=gt, mgf_file=mgf_file) - print(f"Generated {len(chemicals)} chemicals") - for group, names in design.samples.items(): + +def main() -> None: + """Entry point for dataset preparation.""" + dataset = generate_synthetic_dataset(Path("./out")) + for group, names in dataset.design.samples.items(): print(group, names) - generate_mzml_files(chemicals, design, out_dir) - gt = generate_ground_truth_table(chemicals, design) - gt.to_csv(out_dir / "ground_truth.csv", index=False) - write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf") if __name__ == "__main__": diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py index 18c5a2e..f2d4043 100644 --- a/demo/untargeted/pipeline.py +++ b/demo/untargeted/pipeline.py @@ -1,16 +1,69 @@ -import json from pathlib import Path +from abc import ABC, abstractmethod - -from .generate_dataset import ( - setup_simulation, - generate_mzml_files, - generate_ground_truth_table, - write_ground_truth_mgf, -) -from .peak_picking import peak_table_from_ground_truth +from .generate_dataset import Dataset, generate_synthetic_dataset from .join_aligner import join_align -from .evaluation import compute_group_metrics +from .processing import ( + get_peak_data, + OutputWriter, + group_related_peaks, + identify_compounds, + annotate_spectra, + batch_normalize, + impute_missing_values, + compute_statistics, + generate_report, +) + + +class BasePipeline(ABC): + """Base class encapsulating preprocessing logic.""" + + def __init__( + self, + dataset: Dataset, + out_dir: Path, + mz_tol: float = 0.01, + rt_tol: float = 0.5, + ) -> None: + self.dataset = dataset + self.out_dir = out_dir + self.mz_tol = mz_tol + self.rt_tol = rt_tol + self.writer = OutputWriter(out_dir) + + def run(self) -> dict | None: + """Run the preprocessing pipeline.""" + + peaks = self.get_peaks() + aligned, labeled = join_align( + peaks, self.mz_tol, self.rt_tol, return_labels=True + ) + + grouped = group_related_peaks(aligned) + identified = identify_compounds(grouped, self.dataset.mgf_file) + annotated = annotate_spectra(identified, self.dataset.mgf_file) + normalized = batch_normalize(annotated, self.dataset.design) + _ = impute_missing_values(normalized) + + metrics = compute_statistics(labeled, self.dataset) + self.writer.write_all(peaks=peaks, aligned=aligned, metrics=metrics) + _ = generate_report(metrics) + return metrics + + @abstractmethod + def get_peaks(self): + """Return the peak table for alignment.""" + + +class SyntheticPipeline(BasePipeline): + """Pipeline that expects ground-truth derived peaks.""" + + def get_peaks(self): + if self.dataset.ground_truth is None: + raise ValueError("Ground truth is required for the demo pipeline") + return get_peak_data(self.dataset.ground_truth) + def report_metrics(metrics): @@ -22,92 +75,20 @@ def report_metrics(metrics): def run_pipeline( - out_dir=Path("./out"), - n_chemicals=100, - n_samples_per_group=5, - mz_tol=0.01, - rt_tol=0.5, - max_rt=180, - top_n=1, - use_rt_noise=False, - noise_sd=0.1, - intercept_params=(0.0, 5.0), - linear_params=(0.0, 0.001), -): - """Run the full untargeted demo pipeline and return alignment metrics. - - Parameters - ---------- - use_rt_noise: - Whether to apply retention time drift to each injection using a - ``SimulatedDriftModel``. - noise_sd: - Standard deviation of random noise added around the drift function. - intercept_params: - Mean and standard deviation of the intercept term (seconds). - linear_params: - Mean and standard deviation of the linear term. - """ - - out_dir.mkdir(parents=True, exist_ok=True) - - # Simulate chemicals and dataset design - chemicals, design = setup_simulation(n_chemicals, n_samples_per_group) - - column_params = None - if use_rt_noise: - column_params = { - "noise_sd": noise_sd, - "intercept_params": intercept_params, - "linear_params": linear_params, - } - - # Generate mzML files - sample_chems = generate_mzml_files( - chemicals, - design, - out_dir, - max_rt=max_rt, - top_n=top_n, - column_params=column_params, - ) + dataset: Dataset, out_dir: Path, mz_tol: float = 0.01, rt_tol: float = 0.5 +) -> dict | None: + """Convenience wrapper executing the synthetic pipeline.""" - # Ground truth table and library - gt = generate_ground_truth_table( - chemicals, - design, - per_sample_chems=sample_chems if use_rt_noise else None, + pipeline = SyntheticPipeline( + dataset=dataset, out_dir=out_dir, mz_tol=mz_tol, rt_tol=rt_tol ) - gt_file = out_dir / "ground_truth.csv" - gt.to_csv(gt_file, index=False) - write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf") - - # Peak picking from ground truth - peaks = peak_table_from_ground_truth(gt) - peaks_file = out_dir / "peaks.csv" - peaks.to_csv(peaks_file, index=False) - - # Alignment - aligned, labeled = join_align(peaks, mz_tol, rt_tol, return_labels=True) - aligned.to_csv(out_dir / "aligned.csv") - - # Join group labels back to ground truth for evaluation - gt_eval = gt[["sample", "compound_id", "mz_apex", "rt_apex", "intensity"]].rename( - columns={"mz_apex": "mz", "rt_apex": "rt"} - ) - df_eval = labeled.merge(gt_eval, on=["sample", "mz", "rt", "intensity"], how="left") - - metrics = compute_group_metrics(df_eval, compound_col="compound_id", group_col="group") - - (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) - - return metrics + return pipeline.run() def main() -> None: """Command-line entry point for running the pipeline.""" - - metrics = run_pipeline(use_rt_noise=True) + dataset = generate_synthetic_dataset(Path("./out"), use_rt_noise=True) + metrics = run_pipeline(dataset, Path("./out")) report_metrics(metrics) diff --git a/demo/untargeted/processing.py b/demo/untargeted/processing.py new file mode 100644 index 0000000..9fa9037 --- /dev/null +++ b/demo/untargeted/processing.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +"""Stub processing steps for the untargeted workflow.""" + +from pathlib import Path +import json +import pandas as pd + +from .generate_dataset import Dataset, ExperimentalDesign +from .evaluation import compute_group_metrics +from .peak_picking import peak_table_from_ground_truth + + +def get_peak_data(gt: pd.DataFrame, out_dir: Path | None = None) -> pd.DataFrame: + """Return peaks derived from ``gt`` and optionally write to ``out_dir``. + + Parameters + ---------- + gt: + Ground truth table describing the true peaks. + out_dir: + Directory where ``peaks.csv`` will be written when provided. + + Returns + ------- + pd.DataFrame + Peak table containing ``sample``, ``mz``, ``rt`` and ``intensity``. + """ + peaks = peak_table_from_ground_truth(gt) + if out_dir is not None: + out_dir.mkdir(parents=True, exist_ok=True) + peaks.to_csv(out_dir / "peaks.csv", index=False) + return peaks + + +class OutputWriter: + """Utility to persist pipeline outputs to disk.""" + + def __init__(self, out_dir: Path) -> None: + self.out_dir = out_dir + self.out_dir.mkdir(parents=True, exist_ok=True) + + def write_all( + self, + peaks: pd.DataFrame | None = None, + aligned: pd.DataFrame | None = None, + metrics: dict | None = None, + ) -> None: + """Write provided outputs to :pyattr:`out_dir`.""" + + if peaks is not None: + peaks.to_csv(self.out_dir / "peaks.csv", index=False) + if aligned is not None: + aligned.to_csv(self.out_dir / "aligned.csv", index=False) + if metrics is not None: + (self.out_dir / "metrics.json").write_text( + json.dumps(metrics, indent=2) + ) + + +def group_related_peaks(aligned: pd.DataFrame) -> pd.DataFrame: + """Group isotopes and adducts together. + + Parameters + ---------- + aligned: + Aligned feature table with sample intensities. + + Returns + ------- + pd.DataFrame + Table of grouped peaks. Currently returned unchanged. + """ + return aligned + + +def identify_compounds( + grouped: pd.DataFrame, library_file: Path | None = None +) -> pd.DataFrame: + """Identify compounds by matching grouped peaks to a library. + + Parameters + ---------- + grouped: + Peak table after grouping related peaks. + library_file: + Optional path to an MGF or MSP library for matching. + + Returns + ------- + pd.DataFrame + Peak table with identification columns added. Returned unchanged for now. + """ + return grouped + + +def annotate_spectra( + identified: pd.DataFrame, library_file: Path | None = None +) -> pd.DataFrame: + """Add spectral annotations to identified features. + + Parameters + ---------- + identified: + Peak table containing compound identifications. + library_file: + Optional path to a spectral library. + + Returns + ------- + pd.DataFrame + Annotated peak table. Returned unchanged for now. + """ + return identified + + +def batch_normalize(peaks: pd.DataFrame, design: ExperimentalDesign) -> pd.DataFrame: + """Correct systematic effects between sample groups or batches. + + Parameters + ---------- + peaks: + Table of peak intensities. + design: + Experimental design describing sample groups. + + Returns + ------- + pd.DataFrame + Normalized peak table. Returned unchanged for now. + """ + return peaks + + +def impute_missing_values(peaks: pd.DataFrame) -> pd.DataFrame: + """Fill in missing intensity values. + + Parameters + ---------- + peaks: + Table of peak intensities. + + Returns + ------- + pd.DataFrame + Peak table with missing values imputed. Returned unchanged for now. + """ + return peaks + + +def compute_statistics(labeled: pd.DataFrame, dataset: Dataset) -> dict: + """Compute evaluation metrics for the aligned peaks. + + Parameters + ---------- + labeled: + Peak-level table with grouping labels from the aligner. + dataset: + Dataset object containing the ground-truth table. + + Returns + ------- + dict + Dictionary of computed metrics. Empty if ground truth is unavailable. + """ + if dataset.ground_truth is None: + return {} + + gt_eval = dataset.ground_truth[ + ["sample", "compound_id", "mz_apex", "rt_apex", "intensity"] + ].rename(columns={"mz_apex": "mz", "rt_apex": "rt"}) + df_eval = labeled.merge( + gt_eval, on=["sample", "mz", "rt", "intensity"], how="left" + ) + metrics = compute_group_metrics(df_eval, compound_col="compound_id", group_col="group") + return metrics + + +def generate_report(metrics: dict) -> str: + """Return a JSON report summarizing ``metrics``.""" + + return json.dumps(metrics, indent=2) diff --git a/tests/test_untargeted_demo.py b/tests/test_untargeted_demo.py index 5f9c3a4..36d8f54 100644 --- a/tests/test_untargeted_demo.py +++ b/tests/test_untargeted_demo.py @@ -6,6 +6,7 @@ generate_ground_truth_table, write_ground_truth_mgf, setup_simulation, + generate_synthetic_dataset, ) from demo.untargeted.join_aligner import join_align @@ -130,14 +131,13 @@ def test_peak_table_from_ground_truth(): def test_run_pipeline(tmp_path): - metrics = run_pipeline( - out_dir=tmp_path, + dataset = generate_synthetic_dataset( + tmp_path, n_chemicals=1, n_samples_per_group=1, - mz_tol=0.02, - rt_tol=0.1, max_rt=20, top_n=1, ) + metrics = run_pipeline(dataset, tmp_path, mz_tol=0.02, rt_tol=0.1) assert set(metrics) == {"precision", "recall", "f1", "ari"} assert (tmp_path / "aligned.csv").is_file() From 93b7e951e79492df1ebe7bf071031d0bc370eb66 Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Wed, 11 Jun 2025 09:45:29 +0100 Subject: [PATCH 5/7] Consolidate demo entrypoints (#301) --- demo/untargeted/generate_chemicals.py | 7 ------- demo/untargeted/generate_dataset.py | 12 ------------ demo/untargeted/peak_picking.py | 9 --------- demo/untargeted/pipeline.py | 16 ++++++++++++---- 4 files changed, 12 insertions(+), 32 deletions(-) diff --git a/demo/untargeted/generate_chemicals.py b/demo/untargeted/generate_chemicals.py index 218bbda..293f6d9 100644 --- a/demo/untargeted/generate_chemicals.py +++ b/demo/untargeted/generate_chemicals.py @@ -27,10 +27,3 @@ def generate_chemicals(n_chemicals: int = 100): return cmc.sample(n_chemicals, 2) -def main(): - chemicals = generate_chemicals() - save_obj(chemicals, str(FIXTURES / "demo_chemicals.p")) - - -if __name__ == "__main__": - main() diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py index 6cf8293..0c4f3ca 100644 --- a/demo/untargeted/generate_dataset.py +++ b/demo/untargeted/generate_dataset.py @@ -240,16 +240,4 @@ def generate_synthetic_dataset( for group, samples in design.samples.items() for sample in samples } - return Dataset(design=design, mzml_files=mzml_files, ground_truth=gt, mgf_file=mgf_file) - - -def main() -> None: - """Entry point for dataset preparation.""" - dataset = generate_synthetic_dataset(Path("./out")) - for group, names in dataset.design.samples.items(): - print(group, names) - - -if __name__ == "__main__": - main() diff --git a/demo/untargeted/peak_picking.py b/demo/untargeted/peak_picking.py index 584eb47..537bffc 100644 --- a/demo/untargeted/peak_picking.py +++ b/demo/untargeted/peak_picking.py @@ -32,12 +32,3 @@ def write_peak_table(gt_file: Path, out_file: Path) -> pd.DataFrame: return peaks -def main() -> None: - """Entry point for the peak picking step.""" - gt_file = Path("./out/ground_truth.csv") - out_file = Path("./out/peaks.csv") - write_peak_table(gt_file, out_file) - - -if __name__ == "__main__": - main() diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py index f2d4043..2dfe9cb 100644 --- a/demo/untargeted/pipeline.py +++ b/demo/untargeted/pipeline.py @@ -1,5 +1,6 @@ from pathlib import Path from abc import ABC, abstractmethod +import argparse from .generate_dataset import Dataset, generate_synthetic_dataset from .join_aligner import join_align @@ -84,11 +85,18 @@ def run_pipeline( ) return pipeline.run() +def main(argv: list[str] | None = None) -> None: + """Command-line entry point for generating data and running the pipeline.""" -def main() -> None: - """Command-line entry point for running the pipeline.""" - dataset = generate_synthetic_dataset(Path("./out"), use_rt_noise=True) - metrics = run_pipeline(dataset, Path("./out")) + parser = argparse.ArgumentParser(description="Run the untargeted demo pipeline") + parser.add_argument("--out-dir", type=Path, default=Path("./out")) + parser.add_argument("--use-rt-noise", action="store_true", help="Add RT noise to the synthetic dataset") + parser.add_argument("--mz-tol", type=float, default=0.01) + parser.add_argument("--rt-tol", type=float, default=0.5) + args = parser.parse_args(argv) + + dataset = generate_synthetic_dataset(args.out_dir, use_rt_noise=args.use_rt_noise) + metrics = run_pipeline(dataset, args.out_dir, mz_tol=args.mz_tol, rt_tol=args.rt_tol) report_metrics(metrics) From b31e26ad79e4024a7756eacbc4c90a710ab081b3 Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Wed, 11 Jun 2025 10:12:28 +0100 Subject: [PATCH 6/7] Make pipeline steps overridable (#303) --- demo/untargeted/pipeline.py | 43 +++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py index 2dfe9cb..9a08baf 100644 --- a/demo/untargeted/pipeline.py +++ b/demo/untargeted/pipeline.py @@ -33,6 +33,37 @@ def __init__( self.rt_tol = rt_tol self.writer = OutputWriter(out_dir) + # ------------------------------------------------------------------ + # Steps in the preprocessing pipeline. These methods wrap the helper + # functions defined in :mod:`processing` so subclasses can override + # individual steps when needed. + # ------------------------------------------------------------------ + + def group_peaks(self, aligned): + """Group isotopes/adducts together.""" + + return group_related_peaks(aligned) + + def identify(self, grouped): + """Match grouped peaks to library entries.""" + + return identify_compounds(grouped, self.dataset.mgf_file) + + def annotate(self, identified): + """Annotate identified peaks with spectral information.""" + + return annotate_spectra(identified, self.dataset.mgf_file) + + def normalize(self, annotated): + """Apply batch normalisation.""" + + return batch_normalize(annotated, self.dataset.design) + + def impute(self, normalized): + """Fill in missing values in the peak table.""" + + return impute_missing_values(normalized) + def run(self) -> dict | None: """Run the preprocessing pipeline.""" @@ -41,11 +72,11 @@ def run(self) -> dict | None: peaks, self.mz_tol, self.rt_tol, return_labels=True ) - grouped = group_related_peaks(aligned) - identified = identify_compounds(grouped, self.dataset.mgf_file) - annotated = annotate_spectra(identified, self.dataset.mgf_file) - normalized = batch_normalize(annotated, self.dataset.design) - _ = impute_missing_values(normalized) + grouped = self.group_peaks(aligned) + identified = self.identify(grouped) + annotated = self.annotate(identified) + normalized = self.normalize(annotated) + _ = self.impute(normalized) metrics = compute_statistics(labeled, self.dataset) self.writer.write_all(peaks=peaks, aligned=aligned, metrics=metrics) @@ -66,7 +97,6 @@ def get_peaks(self): return get_peak_data(self.dataset.ground_truth) - def report_metrics(metrics): """Print alignment metrics in a friendly format.""" @@ -85,6 +115,7 @@ def run_pipeline( ) return pipeline.run() + def main(argv: list[str] | None = None) -> None: """Command-line entry point for generating data and running the pipeline.""" From 017688248c82a0ea54db5422055e89470f26b23f Mon Sep 17 00:00:00 2001 From: Joe Wandy Date: Thu, 12 Jun 2025 09:04:18 +0100 Subject: [PATCH 7/7] Make heavy dependencies optional (#304) --- poetry.lock | 76 ++++++++++++++++++++++++++++--------- pyproject.toml | 16 +++++--- vimms/BoxVisualise.py | 53 ++++++++++++++++++++++++-- vimms/InSilicoSimulation.py | 20 +++++----- 4 files changed, 130 insertions(+), 35 deletions(-) diff --git a/poetry.lock b/poetry.lock index f18d81f..6dfd182 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4,9 +4,10 @@ name = "alembic" version = "1.16.1" description = "A database migration tool for SQLAlchemy." -optional = false +optional = true python-versions = ">=3.9" groups = ["main"] +markers = "extra == \"optimize\"" files = [ {file = "alembic-1.16.1-py3-none-any.whl", hash = "sha256:0cdd48acada30d93aa1035767d67dff25702f8de74d7c3919f2e8492c8db2e67"}, {file = "alembic-1.16.1.tar.gz", hash = "sha256:43d37ba24b3d17bc1eb1024fe0f51cd1dc95aeb5464594a02c6bb9ca9864bfa4"}, @@ -49,11 +50,11 @@ description = "Disable App Nap on macOS >= 10.9" optional = false python-versions = ">=3.6" groups = ["main", "dev"] -markers = "platform_system == \"Darwin\"" files = [ {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, ] +markers = {main = "extra == \"parallel\" and platform_system == \"Darwin\"", dev = "platform_system == \"Darwin\""} [[package]] name = "argon2-cffi" @@ -139,6 +140,7 @@ files = [ {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"}, {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] astroid = ["astroid (>=2,<4)"] @@ -381,7 +383,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {main = "implementation_name == \"pypy\""} +markers = {main = "extra == \"parallel\" and implementation_name == \"pypy\""} [package.dependencies] pycparser = "*" @@ -522,19 +524,20 @@ description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["main", "dev"] +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\"", dev = "sys_platform == \"win32\" or platform_system == \"Windows\""} [[package]] name = "colorlog" version = "6.9.0" description = "Add colours to the output of Python's logging module." -optional = false +optional = true python-versions = ">=3.6" groups = ["main"] +markers = "extra == \"optimize\"" files = [ {file = "colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff"}, {file = "colorlog-6.9.0.tar.gz", hash = "sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2"}, @@ -557,6 +560,7 @@ files = [ {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] traitlets = ">=4" @@ -772,6 +776,7 @@ files = [ {file = "debugpy-1.8.14-py2.py3-none-any.whl", hash = "sha256:5cd9a579d553b6cb9759a7908a41988ee6280b961f24f63336835d9418216a20"}, {file = "debugpy-1.8.14.tar.gz", hash = "sha256:7cd287184318416850aa8b60ac90105837bb1e59531898c07569d197d2ed5322"}, ] +markers = {main = "extra == \"parallel\""} [[package]] name = "decorator" @@ -784,6 +789,7 @@ files = [ {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"}, {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"}, ] +markers = {main = "extra == \"parallel\""} [[package]] name = "defusedxml" @@ -820,6 +826,7 @@ files = [ {file = "entrypoints-0.4-py3-none-any.whl", hash = "sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f"}, {file = "entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4"}, ] +markers = {main = "extra == \"parallel\""} [[package]] name = "events" @@ -843,6 +850,7 @@ files = [ {file = "executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"}, {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] @@ -1188,6 +1196,7 @@ files = [ {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] appnope = {version = "*", markers = "platform_system == \"Darwin\""} @@ -1215,9 +1224,10 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio name = "ipyparallel" version = "9.0.1" description = "Interactive Parallel Computing with IPython" -optional = false +optional = true python-versions = ">=3.8" groups = ["main"] +markers = "extra == \"parallel\"" files = [ {file = "ipyparallel-9.0.1-py3-none-any.whl", hash = "sha256:cc0702e26af416a1c35c73eb6dbc3a6a2c49417cb24dda4511be85c9fe87ea64"}, {file = "ipyparallel-9.0.1.tar.gz", hash = "sha256:2e592cad2200c5a94fbbff639bff36e6ec9122f34b36b2fc6b4d678d9e98f29c"}, @@ -1254,6 +1264,7 @@ files = [ {file = "ipython-9.3.0-py3-none-any.whl", hash = "sha256:1a0b6dd9221a1f5dddf725b57ac0cb6fddc7b5f470576231ae9162b9b3455a04"}, {file = "ipython-9.3.0.tar.gz", hash = "sha256:79eb896f9f23f50ad16c3bc205f686f6e030ad246cc309c6279a242b14afe9d8"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -1287,6 +1298,7 @@ files = [ {file = "ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c"}, {file = "ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] pygments = "*" @@ -1339,6 +1351,7 @@ files = [ {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] parso = ">=0.8.4,<0.9.0" @@ -1480,6 +1493,7 @@ files = [ {file = "jupyter_client-7.4.9-py3-none-any.whl", hash = "sha256:214668aaea208195f4c13d28eb272ba79f945fc0cf3f11c7092c20b2ca1980e7"}, {file = "jupyter_client-7.4.9.tar.gz", hash = "sha256:52be28e04171f07aed8f20e1616a5a552ab9fee9cbbe6c1896ae170c3880d392"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] entrypoints = "*" @@ -1505,6 +1519,7 @@ files = [ {file = "jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0"}, {file = "jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] platformdirs = ">=2.5" @@ -1991,9 +2006,10 @@ source = ["Cython (>=3.0.11,<3.1.0)"] name = "mako" version = "1.3.10" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." -optional = false +optional = true python-versions = ">=3.8" groups = ["main"] +markers = "extra == \"optimize\"" files = [ {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"}, {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"}, @@ -2093,6 +2109,7 @@ files = [ {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] +markers = {main = "extra == \"optimize\""} [[package]] name = "mass-spec-utils" @@ -2183,6 +2200,7 @@ files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] traitlets = "*" @@ -2345,9 +2363,10 @@ files = [ name = "narwhals" version = "1.41.0" description = "Extremely lightweight compatibility layer between dataframe libraries" -optional = false +optional = true python-versions = ">=3.8" groups = ["main"] +markers = "extra == \"visual\"" files = [ {file = "narwhals-1.41.0-py3-none-any.whl", hash = "sha256:d958336b40952e4c4b7aeef259a7074851da0800cf902186a58f2faeff97be02"}, {file = "narwhals-1.41.0.tar.gz", hash = "sha256:0ab2e5a1757a19b071e37ca74b53b0b5426789321d68939738337dfddea629b5"}, @@ -2459,6 +2478,7 @@ files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, ] +markers = {main = "extra == \"parallel\""} [[package]] name = "networkx" @@ -2635,9 +2655,10 @@ files = [ name = "optuna" version = "4.3.0" description = "A hyperparameter optimization framework" -optional = false +optional = true python-versions = ">=3.8" groups = ["main"] +markers = "extra == \"optimize\"" files = [ {file = "optuna-4.3.0-py3-none-any.whl", hash = "sha256:0ea1a01c99c09cbdf3e2dcd9af01dea86778d9fa20ca26f0238a98e7462d8dcb"}, {file = "optuna-4.3.0.tar.gz", hash = "sha256:b3866842a84bc0bbb9906363bd846cfc39d09d3196265354bfdfda6a2f123b84"}, @@ -2792,6 +2813,7 @@ files = [ {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] @@ -2849,11 +2871,11 @@ description = "Pexpect allows easy control of interactive console applications." optional = false python-versions = "*" groups = ["main", "dev"] -markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\"" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, ] +markers = {main = "extra == \"parallel\" and sys_platform != \"win32\" and sys_platform != \"emscripten\"", dev = "sys_platform != \"win32\" and sys_platform != \"emscripten\""} [package.dependencies] ptyprocess = ">=0.5" @@ -2969,6 +2991,7 @@ files = [ {file = "platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4"}, {file = "platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] @@ -2979,9 +3002,10 @@ type = ["mypy (>=1.14.1)"] name = "plotly" version = "6.1.2" description = "An open-source interactive data visualization library for Python" -optional = false +optional = true python-versions = ">=3.8" groups = ["main"] +markers = "extra == \"visual\"" files = [ {file = "plotly-6.1.2-py3-none-any.whl", hash = "sha256:f1548a8ed9158d59e03d7fed548c7db5549f3130d9ae19293c8638c202648f6d"}, {file = "plotly-6.1.2.tar.gz", hash = "sha256:4fdaa228926ba3e3a213f4d1713287e69dcad1a7e66cf2025bd7d7026d5014b4"}, @@ -3057,6 +3081,7 @@ files = [ {file = "prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07"}, {file = "prompt_toolkit-3.0.51.tar.gz", hash = "sha256:931a162e3b27fc90c86f1b48bb1fb2c528c2761475e57c9c06de13311c7b54ed"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] wcwidth = "*" @@ -3103,6 +3128,7 @@ files = [ {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"}, {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] @@ -3119,7 +3145,7 @@ files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, ] -markers = {main = "sys_platform != \"win32\" and sys_platform != \"emscripten\"", dev = "sys_platform != \"win32\" and sys_platform != \"emscripten\" or os_name != \"nt\""} +markers = {main = "extra == \"parallel\" and sys_platform != \"win32\" and sys_platform != \"emscripten\"", dev = "sys_platform != \"win32\" and sys_platform != \"emscripten\" or os_name != \"nt\""} [[package]] name = "pure-eval" @@ -3132,6 +3158,7 @@ files = [ {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] tests = ["pytest"] @@ -3159,7 +3186,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -markers = {main = "implementation_name == \"pypy\""} +markers = {main = "extra == \"parallel\" and implementation_name == \"pypy\""} [[package]] name = "pyflakes" @@ -3184,6 +3211,7 @@ files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] windows-terminal = ["colorama (>=0.4.6)"] @@ -3350,7 +3378,6 @@ description = "Python for Window Extensions" optional = false python-versions = "*" groups = ["main", "dev"] -markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\"" files = [ {file = "pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1"}, {file = "pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d"}, @@ -3369,6 +3396,7 @@ files = [ {file = "pywin32-310-cp39-cp39-win32.whl", hash = "sha256:851c8d927af0d879221e616ae1f66145253537bbdd321a77e8ef701b443a9a1a"}, {file = "pywin32-310-cp39-cp39-win_amd64.whl", hash = "sha256:96867217335559ac619f00ad70e513c0fcf84b8a3af9fc2bba3b59b97da70475"}, ] +markers = {main = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\" and extra == \"parallel\"", dev = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\""} [[package]] name = "pywinpty" @@ -3450,6 +3478,7 @@ files = [ {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +markers = {main = "extra == \"optimize\""} [[package]] name = "pyyaml-env-tag" @@ -3568,6 +3597,7 @@ files = [ {file = "pyzmq-26.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:49b6ca2e625b46f499fb081aaf7819a177f41eeb555acb05758aa97f4f95d147"}, {file = "pyzmq-26.4.0.tar.gz", hash = "sha256:4bd13f85f80962f91a651a7356fe0472791a5f7a92f227822b5acf44795c626d"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} @@ -3992,9 +4022,10 @@ test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis name = "seaborn" version = "0.13.2" description = "Statistical data visualization" -optional = false +optional = true python-versions = ">=3.8" groups = ["main"] +markers = "extra == \"visual\"" files = [ {file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"}, {file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"}, @@ -4203,6 +4234,7 @@ files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, ] +markers = {main = "extra == \"parallel\""} [package.dependencies] asttokens = ">=2.1.0" @@ -4268,9 +4300,10 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n name = "tabulate" version = "0.9.0" description = "Pretty-print tabular data" -optional = false +optional = true python-versions = ">=3.7" groups = ["main"] +markers = "extra == \"summary\"" files = [ {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, @@ -4353,6 +4386,7 @@ files = [ {file = "tornado-6.5.1-cp39-abi3-win_arm64.whl", hash = "sha256:02420a0eb7bf617257b9935e2b754d1b63897525d8a289c9d65690d580b4dcf7"}, {file = "tornado-6.5.1.tar.gz", hash = "sha256:84ceece391e8eb9b2b95578db65e920d2a61070260594819589609ba9bc6308c"}, ] +markers = {main = "extra == \"parallel\""} [[package]] name = "tqdm" @@ -4387,6 +4421,7 @@ files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, ] +markers = {main = "extra == \"parallel\""} [package.extras] docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] @@ -4536,6 +4571,7 @@ files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +markers = {main = "extra == \"parallel\""} [[package]] name = "webcolors" @@ -4606,7 +4642,13 @@ files = [ [package.extras] dev = ["black (>=19.3b0) ; python_version >= \"3.6\"", "pytest (>=4.6.2)"] +[extras] +optimize = ["optuna"] +parallel = ["ipyparallel"] +summary = ["tabulate"] +visual = ["plotly", "seaborn"] + [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "ce68268ccd858a3d837c021a8d3cf9b55e7a033410f1db3338a8b1f6bfce5e5f" +content-hash = "b46c146dfaf7e858dbacbe96c7d6daee3e9ec5cc0895c8644ba4700ae81b5a3f" diff --git a/pyproject.toml b/pyproject.toml index e3fc0ce..37140be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,18 +18,18 @@ numpy = "^2.2.6" pandas = "^2.2.3" scipy = "^1.15.3" matplotlib = "^3.10.3" -seaborn = "^0.13.2" -plotly = "^6.1.2" +seaborn = {version = "^0.13.2", optional = true} +plotly = {version = "^6.1.2", optional = true} scikit-learn = "^1.6.1" tqdm = "^4.67.1" joblib = "^1.5.1" -ipyparallel = "^9.0.1" +ipyparallel = {version = "^9.0.1", optional = true} requests = "^2.32.3" loguru = "^0.7.3" networkx = "^3.5" jsonpickle = "^4.1.1" statsmodels = "^0.14.4" -tabulate = "^0.9.0" +tabulate = {version = "^0.9.0", optional = true} intervaltree = "^3.1.0" events = "^0.5" pymzml = "^2.5.11" @@ -38,7 +38,13 @@ mass-spec-utils = "^0.0.12" pysmiles = "^2.0.0" numba = "^0.61.2" numba-stats = "^1.10.1" -optuna = "^4.3.0" +optuna = {version = "^4.3.0", optional = true} + +[tool.poetry.extras] +visual = ["plotly", "seaborn"] +parallel = ["ipyparallel"] +optimize = ["optuna"] +summary = ["tabulate"] [tool.poetry.group.dev.dependencies] jupyterlab = "^4.4.3" diff --git a/vimms/BoxVisualise.py b/vimms/BoxVisualise.py index e015696..13f0d70 100644 --- a/vimms/BoxVisualise.py +++ b/vimms/BoxVisualise.py @@ -12,10 +12,19 @@ from PIL import ImageColor import matplotlib.patches as patches import matplotlib.pyplot as plt -import seaborn as sns -import plotly.graph_objects as go -from plotly.subplots import make_subplots -from plotly.colors import DEFAULT_PLOTLY_COLORS +try: + import seaborn as sns +except ImportError: # pragma: no cover - optional dependency + sns = None + +try: + import plotly.graph_objects as go + from plotly.subplots import make_subplots + from plotly.colors import DEFAULT_PLOTLY_COLORS +except ImportError: # pragma: no cover - optional dependency + go = None + make_subplots = None + DEFAULT_PLOTLY_COLORS = None from vimms.Common import path_or_mzml, get_scan_times_combined, get_scan_times from vimms.Box import GenericBox, BoxGrid @@ -395,6 +404,9 @@ def plotly_ms1s( abs_scaling=False, ): + if go is None: + raise ImportError("plotly is required for plotly_ms1s; install vimms[visual]") + self.points_in_bounds(min_rt=min_rt, max_rt=max_rt, min_mz=min_mz, max_mz=max_mz) if np.sum(self.active_ms1) == 0: return @@ -451,6 +463,9 @@ def plotly_ms2s( abs_scaling=False, ): + if go is None: + raise ImportError("plotly is required for plotly_ms2s; install vimms[visual]") + self.points_in_bounds(min_rt=min_rt, max_rt=max_rt, min_mz=min_mz, max_mz=max_mz) if np.sum(self.active_ms2) == 0: return @@ -898,6 +913,10 @@ def mpl_fragmented_boxes(exp_name, eva, mode="max", min_intensity=0.0): def plotly_results_plot(experiment_names, evals, min_intensity=0.0, suptitle=None): + if go is None: + raise ImportError( + "plotly is required for plotly_results_plot; install vimms[visual]" + ) num_chems = len(evals[0].chems) results = [eva.evaluation_report(min_intensity=min_intensity) for eva in evals] repeat = max(r["num_runs"] for r in results) @@ -962,6 +981,8 @@ def plotly_results_plot(experiment_names, evals, min_intensity=0.0, suptitle=Non def plotly_mzml(mzml, draw_minm=0.0, colour_minm=None, show_precursors=False): + if go is None: + raise ImportError("plotly is required for plotly_mzml; install vimms[visual]") fig = go.Figure() mzml = path_or_mzml(mzml) @@ -983,6 +1004,10 @@ def plotly_mzml(mzml, draw_minm=0.0, colour_minm=None, show_precursors=False): def plotly_timing_hist(processing_times, title, binsize=None): + if go is None: + raise ImportError( + "plotly is required for plotly_timing_hist; install vimms[visual]" + ) fig = make_subplots(rows=1, cols=len(processing_times)) for i, run_times in enumerate(processing_times): @@ -1007,6 +1032,10 @@ def plotly_timing_hist(processing_times, title, binsize=None): def plotly_fragmentation_events(exp_name, mzmls, colour_minm=None): + if go is None: + raise ImportError( + "plotly is required for plotly_fragmentation_events; install vimms[visual]" + ) fig = make_subplots(rows=len(mzmls), cols=1, shared_xaxes="all", shared_yaxes="all") for i, mzml in enumerate(mzmls): @@ -1026,6 +1055,10 @@ def plotly_fragmentation_events(exp_name, mzmls, colour_minm=None): def seaborn_hist(data, xlabel, binsize=None): + if sns is None: + raise ImportError( + "seaborn is required for seaborn_hist; install vimms[visual]" + ) fig, axes = plt.subplots(1, len(data), figsize=(15, 5), sharey=True) try: @@ -1041,6 +1074,10 @@ def seaborn_hist(data, xlabel, binsize=None): def seaborn_mzml_timing_hist(mzmls, binsize=None, mode="combined"): + if sns is None: + raise ImportError( + "seaborn is required for seaborn_mzml_timing_hist; install vimms[visual]" + ) if mode == "combined": timings = [get_scan_times_combined(mzmls)] else: @@ -1072,10 +1109,18 @@ def seaborn_mzml_timing_hist(mzmls, binsize=None, mode="combined"): def seaborn_timing_hist(processing_times, binsize=None): + if sns is None: + raise ImportError( + "seaborn is required for seaborn_timing_hist; install vimms[visual]" + ) return seaborn_hist(processing_times, xlabel="Processing time (secs)", binsize=binsize) def seaborn_uncovered_area_hist(eva, box_likes, min_intensity=0.0, cumulative=False, binsize=None): + if sns is None: + raise ImportError( + "seaborn is required for seaborn_uncovered_area_hist; install vimms[visual]" + ) boxes = [[b.to_box(0, 0) for b in ls] for ls in box_likes] geom = BoxGrid() diff --git a/vimms/InSilicoSimulation.py b/vimms/InSilicoSimulation.py index 6b7aa3c..becbe3c 100644 --- a/vimms/InSilicoSimulation.py +++ b/vimms/InSilicoSimulation.py @@ -5,7 +5,10 @@ import os from pathlib import Path -import ipyparallel as ipp +try: + import ipyparallel as ipp +except ImportError: # pragma: no cover - optional dependency + ipp = None import matplotlib.pyplot as plt import numpy as np from loguru import logger @@ -181,14 +184,14 @@ def run_SmartROI(chems, scan_duration, params, out_dir): logger.warning("Running controllers in parallel, please wait ...") run_serial = False try: + if ipp is None: + raise ImportError rc = ipp.Client() dview = rc[:] # use all engines with dview.sync_imports(): pass dview.map_sync(run_single_SmartROI, params_list) - except OSError: # cluster has not been started - run_serial = True - except ipp.error.TimeoutError: # takes too long to run + except Exception: run_serial = True if run_serial: # if any exception from above, try to run it serially @@ -271,17 +274,16 @@ def run_WeightedDEW(chems, scan_duration, params, out_dir): # Try to run the controllers in parallel. If fails, then run it serially logger.warning("Running controllers in parallel, please wait ...") + run_serial = False try: - import ipyparallel as ipp - + if ipp is None: + raise ImportError rc = ipp.Client() dview = rc[:] # use all engines with dview.sync_imports(): pass dview.map_sync(run_single_WeightedDEW, params_list) - except OSError: # cluster has not been started - run_serial = True - except ipp.error.TimeoutError: # takes too long to run + except Exception: run_serial = True if run_serial: # if any exception from above, try to run it serially