From 01518f54811487c2685c00a5ebdf6fc47281afd0 Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Mon, 9 Jun 2025 17:36:09 +0100
Subject: [PATCH 1/7] Add skeleton untargeted metabolomics pipeline plan (#293)

* Add skeleton untargeted pipeline directory with plan

* fix flake8 errors in generate chemicals script

* Add lint/test guidelines and minimal untargeted test

* Implement dataset setup script

* Add mzML generation step

* Tune chromatogram width

* Add ground truth table generation

* Add MGF export for simulated chemicals

* docs: mark simulation step complete
---
 AGENTS.md                             |  23 ++++
 demo/untargeted/generate_chemicals.py |  36 ++++++
 demo/untargeted/generate_dataset.py   | 160 ++++++++++++++++++++++++++
 demo/untargeted/plan.md               |  40 +++++++
 tests/test_untargeted_demo.py         |  51 ++++++++
 5 files changed, 310 insertions(+)
 create mode 100644 AGENTS.md
 create mode 100644 demo/untargeted/generate_chemicals.py
 create mode 100644 demo/untargeted/generate_dataset.py
 create mode 100644 demo/untargeted/plan.md
 create mode 100644 tests/test_untargeted_demo.py

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..01b1dd7
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,23 @@
+# Development Guidelines
+
+## Linting
+Run the same flake8 commands used in CI before committing:
+
+```bash
+poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+```
+
+Ensure there are zero linting errors for changed files.
+
+## Testing
+Only execute tests relevant to the area you are working on to save time. For changes in the `demo/untargeted` demo, run:
+
+```bash
+poetry run pytest tests/test_untargeted_demo.py
+```
+
+## Good Practices
+- Write clear commit messages describing your changes.
+- Keep functions short and well documented.
+- Prefer small, focused pull requests.
diff --git a/demo/untargeted/generate_chemicals.py b/demo/untargeted/generate_chemicals.py
new file mode 100644
index 0000000..218bbda
--- /dev/null
+++ b/demo/untargeted/generate_chemicals.py
@@ -0,0 +1,36 @@
+from pathlib import Path
+
+from vimms.ChemicalSamplers import (
+    DatabaseFormulaSampler,
+    UniformRTAndIntensitySampler,
+    GaussianChromatogramSampler,
+)
+from vimms.Chemicals import ChemicalMixtureCreator
+from vimms.Common import ADDUCT_DICT_POS_MH, load_obj, save_obj
+
+
+FIXTURES = Path(__file__).resolve().parents[2] / "tests" / "fixtures"
+
+
+def generate_chemicals(n_chemicals: int = 100):
+    """Return a list of simulated chemicals for testing."""
+    hmdb = load_obj(FIXTURES / "hmdb_compounds.p")
+    fs = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
+    ri = UniformRTAndIntensitySampler(min_rt=0, max_rt=180)
+    chromatogram_sampler = GaussianChromatogramSampler(sigma=1)
+    cmc = ChemicalMixtureCreator(
+        fs,
+        rt_and_intensity_sampler=ri,
+        chromatogram_sampler=chromatogram_sampler,
+        adduct_prior_dict=ADDUCT_DICT_POS_MH,
+    )
+    return cmc.sample(n_chemicals, 2)
+
+
+def main():
+    chemicals = generate_chemicals()
+    save_obj(chemicals, str(FIXTURES / "demo_chemicals.p"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py
new file mode 100644
index 0000000..5e7641f
--- /dev/null
+++ b/demo/untargeted/generate_dataset.py
@@ -0,0 +1,160 @@
+"""Prepare a simulated dataset design for untargeted pipeline testing.
+
+This module creates chemicals and an experimental design consisting of two
+groups (case and control). It provides utilities to generate mzML files
+for each sample using a simple Top-1 DDA controller and produces a ground
+truth table of the underlying peaks.
+"""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+from vimms.Common import POSITIVE, PROTON_MASS
+from vimms.Controller import TopNController
+from vimms.Environment import Environment
+from vimms.MassSpec import IndependentMassSpectrometer
+
+from .generate_chemicals import generate_chemicals
+import pandas as pd
+
+
+@dataclass
+class ExperimentalDesign:
+    """Simple container describing sample groups."""
+
+    samples: Dict[str, List[str]]
+
+
+def create_design(n_samples_per_group: int = 5) -> ExperimentalDesign:
+    """Return an experimental design with case and control groups."""
+
+    design = {
+        "case": [f"case_{i + 1}" for i in range(n_samples_per_group)],
+        "control": [f"control_{i + 1}" for i in range(n_samples_per_group)],
+    }
+    return ExperimentalDesign(samples=design)
+
+
+def setup_simulation(
+    n_chemicals: int = 100, n_samples_per_group: int = 5
+) -> Tuple[list, ExperimentalDesign]:
+    """Generate chemicals and an experimental design."""
+
+    chemicals = generate_chemicals(n_chemicals)
+    design = create_design(n_samples_per_group)
+    return chemicals, design
+
+
+def generate_mzml_files(
+    chemicals: list,
+    design: ExperimentalDesign,
+    out_dir: Path,
+    max_rt: int = 180,
+    top_n: int = 1,
+) -> None:
+    """Generate an mzML file for each sample in ``design``.
+
+    Parameters
+    ----------
+    chemicals:
+        List of simulated chemicals.
+    design:
+        Experimental design describing the sample groups.
+    out_dir:
+        Directory where the mzML files will be written.
+    max_rt:
+        Maximum retention time (seconds) for the simulation.
+    top_n:
+        Number of precursors to fragment in each cycle.
+    """
+
+    for group, samples in design.samples.items():
+        group_dir = out_dir / group
+        group_dir.mkdir(parents=True, exist_ok=True)
+        for sample in samples:
+            ms = IndependentMassSpectrometer(POSITIVE, chemicals)
+            controller = TopNController(
+                POSITIVE, top_n, 1, 10, 15, 1000
+            )
+            env = Environment(
+                ms,
+                controller,
+                0,
+                max_rt,
+                progress_bar=False,
+                out_dir=group_dir,
+                out_file=f"{sample}.mzML",
+            )
+            env.run()
+
+
+def generate_ground_truth_table(
+    chemicals: list, design: ExperimentalDesign
+) -> pd.DataFrame:
+    """Return a table describing the true peaks for each sample."""
+
+    records = []
+    for compound_id, chem in enumerate(chemicals):
+        mz = chem.mass + PROTON_MASS
+        rt_min = chem.get_min_rt()
+        rt_max = chem.get_max_rt()
+        rt_apex = chem.get_apex_rt()
+        for group_samples in design.samples.values():
+            for sample in group_samples:
+                records.append(
+                    {
+                        "sample": sample,
+                        "compound_id": compound_id,
+                        "mz_apex": mz,
+                        "rt_apex": rt_apex,
+                        "intensity": chem.max_intensity,
+                        "mz_min": mz - 0.01,
+                        "mz_max": mz + 0.01,
+                        "rt_min": rt_min,
+                        "rt_max": rt_max,
+                    }
+                )
+    return pd.DataFrame.from_records(records)
+
+
+def write_ground_truth_mgf(chemicals: list, out_file: Path) -> None:
+    """Write a simple MGF file of MS2 spectra for each chemical."""
+
+    with open(out_file, "w") as f:
+        for compound_id, chem in enumerate(chemicals):
+            if not chem.children:
+                continue
+            pepmass = chem.mass + PROTON_MASS
+            rt_apex = chem.get_apex_rt()
+
+            f.write("BEGIN IONS\n")
+            f.write(f"TITLE=compound_{compound_id}\n")
+            f.write(f"PEPMASS={pepmass:.5f}\n")
+            f.write("CHARGE=1+\n")
+            f.write(f"RTINSECONDS={rt_apex:.2f}\n")
+            for frag in chem.children:
+                mz = frag.isotopes[0][0]
+                intensity = chem.max_intensity * frag.prop_ms2_mass
+                f.write(f"{mz:.5f} {intensity:.1f}\n")
+            f.write("END IONS\n")
+
+
+def main() -> None:
+    """Entry point for dataset preparation."""
+
+    chemicals, design = setup_simulation()
+    out_dir = Path("./out")
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Generated {len(chemicals)} chemicals")
+    for group, names in design.samples.items():
+        print(group, names)
+    generate_mzml_files(chemicals, design, out_dir)
+    gt = generate_ground_truth_table(chemicals, design)
+    gt.to_csv(out_dir / "ground_truth.csv", index=False)
+    write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/untargeted/plan.md b/demo/untargeted/plan.md
new file mode 100644
index 0000000..15e7d81
--- /dev/null
+++ b/demo/untargeted/plan.md
@@ -0,0 +1,40 @@
+# Untargeted Metabolomics Pipeline Plan
+
+This document outlines the tasks for building a simple untargeted metabolomics
+processing pipeline. Each task contains a short description and a status field to
+track progress.
+
+## Tasks
+
+1. **Simulated Data Generation**
+   - Create a Python script to generate 100 chemicals using `ChemicalMixtureCreator`.
+   - Split chemicals into two groups (case/control) with 5 samples each.
+   - Use reasonable metabolomics ranges (e.g. m/z 100--1000, RT <3 min) with
+     chromatograms roughly 5 s wide (``sigma≈1``).
+   - Acquire data with Top‑1 DDA in positive mode including common adducts.
+   - Export mzML files, a ground‑truth table linking peaks to compounds, and a
+     simple MGF library of MS2 spectra.
+   - **Status:** Completed (mzML, ground truth table, and MGF library generated)
+
+2. **Peak Picking**
+   - Since the chemicals are simulated, derive peak information directly from the
+     ground truth and produce a table of peaks per sample (bounding box, apex m/z,
+     RT, intensity, sample name, etc.).
+   - **Status:** Not started
+
+3. **Alignment**
+   - Implement a simple join aligner that merges peaks across samples to create
+     an aligned feature table. Optionally use MZmine for comparison.
+   - **Status:** Not started
+
+4. **Grouping Related Peaks**
+   - Cluster related peaks (e.g. isotopes/adducts) using a simple RT clustering
+     approach such as a Dirichlet process mixture model. Provide an option to
+     run a similar step in MZmine.
+   - **Status:** Not started
+
+5. **Identification**
+   - Match observed peaks to the ground‑truth MSP library to assess performance.
+   - **Status:** Not started
+
+A main pipeline script will orchestrate these steps once they are implemented.
diff --git a/tests/test_untargeted_demo.py b/tests/test_untargeted_demo.py
new file mode 100644
index 0000000..af07288
--- /dev/null
+++ b/tests/test_untargeted_demo.py
@@ -0,0 +1,51 @@
+
+from demo.untargeted.generate_chemicals import generate_chemicals
+from demo.untargeted.generate_dataset import (
+    create_design,
+    generate_mzml_files,
+    generate_ground_truth_table,
+    write_ground_truth_mgf,
+    setup_simulation,
+)
+
+
+def test_generate_chemicals_length():
+    chems = generate_chemicals(10)
+    assert len(chems) == 10
+    # ensure each chemical has expected attributes
+    for chem in chems:
+        assert hasattr(chem, 'mass')
+        assert hasattr(chem, 'rt')
+
+
+def test_create_design():
+    design = create_design(3)
+    assert design.samples['case'] == ['case_1', 'case_2', 'case_3']
+    assert len(design.samples['control']) == 3
+
+
+def test_setup_simulation_returns_data():
+    chems, design = setup_simulation(20, 2)
+    assert len(chems) == 20
+    assert len(design.samples['case']) == 2
+
+
+def test_generate_mzml_files(tmp_path):
+    chems, design = setup_simulation(3, 1)
+    generate_mzml_files(chems, design, tmp_path, max_rt=20, top_n=1)
+    assert (tmp_path / 'case' / 'case_1.mzML').is_file()
+
+
+def test_generate_ground_truth_table():
+    chems, design = setup_simulation(2, 1)
+    df = generate_ground_truth_table(chems, design)
+    assert {'sample', 'compound_id', 'mz_min', 'mz_max'} <= set(df.columns)
+    assert len(df) == 4  # 2 samples (case/control) * 2 chemicals
+
+
+def test_write_ground_truth_mgf(tmp_path):
+    chems = generate_chemicals(2)
+    out_file = tmp_path / "lib.mgf"
+    write_ground_truth_mgf(chems, out_file)
+    text = out_file.read_text()
+    assert text.startswith("BEGIN IONS")

From d42626f222bdb2b15ba30c08af717cc7126032d8 Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Mon, 9 Jun 2025 22:07:10 +0100
Subject: [PATCH 2/7] Add pipeline script and evaluation (#295)

---
 demo/untargeted/evaluation/__init__.py |  5 ++
 demo/untargeted/evaluation/metrics.py  | 33 +++++++++
 demo/untargeted/join_aligner.py        | 65 ++++++++++++++++++
 demo/untargeted/peak_picking.py        | 43 ++++++++++++
 demo/untargeted/pipeline.py            | 75 +++++++++++++++++++++
 demo/untargeted/plan.md                | 23 ++++---
 tests/test_untargeted_demo.py          | 92 ++++++++++++++++++++++++++
 7 files changed, 328 insertions(+), 8 deletions(-)
 create mode 100644 demo/untargeted/evaluation/__init__.py
 create mode 100644 demo/untargeted/evaluation/metrics.py
 create mode 100644 demo/untargeted/join_aligner.py
 create mode 100644 demo/untargeted/peak_picking.py
 create mode 100644 demo/untargeted/pipeline.py

diff --git a/demo/untargeted/evaluation/__init__.py b/demo/untargeted/evaluation/__init__.py
new file mode 100644
index 0000000..be29703
--- /dev/null
+++ b/demo/untargeted/evaluation/__init__.py
@@ -0,0 +1,5 @@
+"""Evaluation utilities for the untargeted demo."""
+
+from .metrics import compute_group_metrics
+
+__all__ = ["compute_group_metrics"]
diff --git a/demo/untargeted/evaluation/metrics.py b/demo/untargeted/evaluation/metrics.py
new file mode 100644
index 0000000..e132f20
--- /dev/null
+++ b/demo/untargeted/evaluation/metrics.py
@@ -0,0 +1,33 @@
+import pandas as pd
+from sklearn.metrics import adjusted_rand_score, pair_confusion_matrix
+
+
+def compute_group_metrics(
+    peaks: pd.DataFrame,
+    compound_col: str = "compound_id",
+    group_col: str = "group",
+) -> dict:
+    """Return alignment metrics comparing predicted groups to ground truth.
+
+    Parameters
+    ----------
+    peaks : pd.DataFrame
+        Table containing columns ``compound_col`` and ``group_col``.
+    compound_col : str, optional
+        Column giving ground truth compound identifiers.
+    group_col : str, optional
+        Column giving predicted alignment group labels.
+
+    Returns
+    -------
+    dict
+        Dictionary with ``precision``, ``recall``, ``f1`` and ``ari`` keys.
+    """
+    labels_true = peaks[compound_col].to_numpy()
+    labels_pred = peaks[group_col].to_numpy()
+    tn, fp, fn, tp = pair_confusion_matrix(labels_true, labels_pred).ravel()
+    precision = tp / (tp + fp) if tp + fp else 0.0
+    recall = tp / (tp + fn) if tp + fn else 0.0
+    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+    ari = adjusted_rand_score(labels_true, labels_pred)
+    return {"precision": precision, "recall": recall, "f1": f1, "ari": ari}
diff --git a/demo/untargeted/join_aligner.py b/demo/untargeted/join_aligner.py
new file mode 100644
index 0000000..04034f1
--- /dev/null
+++ b/demo/untargeted/join_aligner.py
@@ -0,0 +1,65 @@
+import pandas as pd
+
+
+def join_align(
+    peaks: pd.DataFrame,
+    mz_tol: float,
+    rt_tol: float,
+    return_labels: bool = False,
+) -> pd.DataFrame:
+    """Align peaks across samples based on m/z and RT tolerances.
+
+    Parameters
+    ----------
+    peaks : pd.DataFrame
+        Table containing ``sample``, ``mz``, ``rt`` and ``intensity`` columns.
+    mz_tol : float
+        Absolute m/z tolerance for grouping peaks.
+    rt_tol : float
+        Absolute retention time tolerance for grouping peaks.
+    return_labels : bool, optional
+        If ``True``, also return the input peaks with a ``group`` column
+        describing the assigned alignment group.
+
+    Returns
+    -------
+    pd.DataFrame
+        An intensity table with groups as rows and samples as columns. If
+        ``return_labels`` is ``True`` a tuple ``(aligned, peaks)`` is returned
+        where ``peaks`` contains the assigned ``group`` for each row.
+    """
+    required = {"sample", "mz", "rt", "intensity"}
+    if not required <= set(peaks.columns):
+        raise ValueError(f"Input peaks must contain columns {required}")
+
+    peaks = peaks.copy().sort_values(["mz", "rt"]).reset_index(drop=True)
+    groups: list[dict] = []
+    group_ids: list[int] = [-1] * len(peaks)
+
+    for idx, row in peaks.iterrows():
+        assigned = False
+        for gid, g in enumerate(groups):
+            if abs(row["mz"] - g["mz"]) <= mz_tol and abs(row["rt"] - g["rt"]) <= rt_tol:
+                n = g["n"] + 1
+                g["mz"] = (g["mz"] * g["n"] + row["mz"]) / n
+                g["rt"] = (g["rt"] * g["n"] + row["rt"]) / n
+                g["n"] = n
+                group_ids[idx] = gid
+                assigned = True
+                break
+        if not assigned:
+            groups.append({"mz": row["mz"], "rt": row["rt"], "n": 1})
+            group_ids[idx] = len(groups) - 1
+
+    peaks["group"] = group_ids
+    aligned = peaks.pivot_table(
+        index="group",
+        columns="sample",
+        values="intensity",
+        aggfunc="max",
+        fill_value=0,
+    ).sort_index()
+
+    if return_labels:
+        return aligned, peaks
+    return aligned
diff --git a/demo/untargeted/peak_picking.py b/demo/untargeted/peak_picking.py
new file mode 100644
index 0000000..584eb47
--- /dev/null
+++ b/demo/untargeted/peak_picking.py
@@ -0,0 +1,43 @@
+import pandas as pd
+from pathlib import Path
+
+
+def peak_table_from_ground_truth(gt: pd.DataFrame) -> pd.DataFrame:
+    """Return a simple peak table derived from the ground truth.
+
+    Parameters
+    ----------
+    gt : pd.DataFrame
+        Ground truth table produced by :func:`generate_ground_truth_table`.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with columns ``sample``, ``mz``, ``rt`` and ``intensity``.
+    """
+    required = {"sample", "mz_apex", "rt_apex", "intensity"}
+    if not required <= set(gt.columns):
+        raise ValueError(f"Ground truth must contain columns {required}")
+    return gt.rename(columns={"mz_apex": "mz", "rt_apex": "rt"})[
+        ["sample", "mz", "rt", "intensity"]
+    ]
+
+
+def write_peak_table(gt_file: Path, out_file: Path) -> pd.DataFrame:
+    """Read ``gt_file`` and write the peak table to ``out_file``."""
+    gt = pd.read_csv(gt_file)
+    peaks = peak_table_from_ground_truth(gt)
+    out_file.parent.mkdir(parents=True, exist_ok=True)
+    peaks.to_csv(out_file, index=False)
+    return peaks
+
+
+def main() -> None:
+    """Entry point for the peak picking step."""
+    gt_file = Path("./out/ground_truth.csv")
+    out_file = Path("./out/peaks.csv")
+    write_peak_table(gt_file, out_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py
new file mode 100644
index 0000000..5880343
--- /dev/null
+++ b/demo/untargeted/pipeline.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+from .generate_dataset import (
+    setup_simulation,
+    generate_mzml_files,
+    generate_ground_truth_table,
+    write_ground_truth_mgf,
+)
+from .peak_picking import peak_table_from_ground_truth
+from .join_aligner import join_align
+from .evaluation import compute_group_metrics
+
+
+def run_pipeline(
+    out_dir: Path = Path("./out"),
+    n_chemicals: int = 100,
+    n_samples_per_group: int = 5,
+    mz_tol: float = 0.01,
+    rt_tol: float = 0.5,
+    max_rt: int = 180,
+    top_n: int = 1,
+) -> dict:
+    """Run the full untargeted demo pipeline and return alignment metrics."""
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    # Simulate chemicals and dataset design
+    chemicals, design = setup_simulation(n_chemicals, n_samples_per_group)
+
+    # Generate mzML files
+    generate_mzml_files(chemicals, design, out_dir, max_rt=max_rt, top_n=top_n)
+
+    # Ground truth table and library
+    gt = generate_ground_truth_table(chemicals, design)
+    gt_file = out_dir / "ground_truth.csv"
+    gt.to_csv(gt_file, index=False)
+    write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf")
+
+    # Peak picking from ground truth
+    peaks = peak_table_from_ground_truth(gt)
+    peaks_file = out_dir / "peaks.csv"
+    peaks.to_csv(peaks_file, index=False)
+
+    # Alignment
+    aligned, labeled = join_align(peaks, mz_tol, rt_tol, return_labels=True)
+    aligned.to_csv(out_dir / "aligned.csv")
+
+    # Join group labels back to ground truth for evaluation
+    gt_eval = gt[["sample", "compound_id", "mz_apex", "rt_apex", "intensity"]].rename(
+        columns={"mz_apex": "mz", "rt_apex": "rt"}
+    )
+    df_eval = labeled.merge(gt_eval, on=["sample", "mz", "rt", "intensity"], how="left")
+
+    metrics = compute_group_metrics(df_eval, compound_col="compound_id", group_col="group")
+
+    (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2))
+
+    return metrics
+
+
+def main() -> None:
+    """Command-line entry point for running the pipeline."""
+
+    metrics = run_pipeline()
+    print("Alignment metrics:")
+    for k, v in metrics.items():
+        print(f"{k}: {v:.4f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/untargeted/plan.md b/demo/untargeted/plan.md
index 15e7d81..b625336 100644
--- a/demo/untargeted/plan.md
+++ b/demo/untargeted/plan.md
@@ -9,10 +9,10 @@ track progress.
 1. **Simulated Data Generation**
    - Create a Python script to generate 100 chemicals using `ChemicalMixtureCreator`.
    - Split chemicals into two groups (case/control) with 5 samples each.
-   - Use reasonable metabolomics ranges (e.g. m/z 100--1000, RT <3 min) with
-     chromatograms roughly 5 s wide (``sigma≈1``).
-   - Acquire data with Top‑1 DDA in positive mode including common adducts.
-   - Export mzML files, a ground‑truth table linking peaks to compounds, and a
+   - Use reasonable metabolomics ranges (e.g. m/z 100--1000, RT <3 min) with
+     chromatograms roughly 5 s wide (``sigma~1``).
+   - Acquire data with Top-1 DDA in positive mode including common adducts.
+   - Export mzML files, a ground   truth table linking peaks to compounds, and a
      simple MGF library of MS2 spectra.
    - **Status:** Completed (mzML, ground truth table, and MGF library generated)
 
@@ -20,12 +20,12 @@ track progress.
    - Since the chemicals are simulated, derive peak information directly from the
      ground truth and produce a table of peaks per sample (bounding box, apex m/z,
      RT, intensity, sample name, etc.).
-   - **Status:** Not started
+   - **Status:** Completed (`peak_picking.py` generates `peaks.csv`)
 
 3. **Alignment**
    - Implement a simple join aligner that merges peaks across samples to create
      an aligned feature table. Optionally use MZmine for comparison.
-   - **Status:** Not started
+   - **Status:** Completed (``join_align`` implemented in ``join_aligner.py``)
 
 4. **Grouping Related Peaks**
    - Cluster related peaks (e.g. isotopes/adducts) using a simple RT clustering
@@ -34,7 +34,14 @@ track progress.
    - **Status:** Not started
 
 5. **Identification**
-   - Match observed peaks to the ground‑truth MSP library to assess performance.
+   - Match observed peaks to the ground   truth MSP library to assess performance.
    - **Status:** Not started
 
-A main pipeline script will orchestrate these steps once they are implemented.
+6. **Evaluation**
+   - Measure alignment accuracy against the ground truth using metrics such as
+     precision, recall, F1 score and adjusted Rand index.
+   - **Status:** Completed (evaluated in `pipeline.py`)
+
+7. **Pipeline Script**
+   - Orchestrate all steps from data generation through evaluation.
+   - **Status:** Completed (`pipeline.py` chains the full workflow)
diff --git a/tests/test_untargeted_demo.py b/tests/test_untargeted_demo.py
index af07288..5f9c3a4 100644
--- a/tests/test_untargeted_demo.py
+++ b/tests/test_untargeted_demo.py
@@ -8,6 +8,12 @@
     setup_simulation,
 )
 
+from demo.untargeted.join_aligner import join_align
+from demo.untargeted.evaluation import compute_group_metrics
+from demo.untargeted.peak_picking import peak_table_from_ground_truth
+from demo.untargeted.pipeline import run_pipeline
+import pandas as pd
+
 
 def test_generate_chemicals_length():
     chems = generate_chemicals(10)
@@ -49,3 +55,89 @@ def test_write_ground_truth_mgf(tmp_path):
     write_ground_truth_mgf(chems, out_file)
     text = out_file.read_text()
     assert text.startswith("BEGIN IONS")
+
+
+def test_join_align_groups_by_tolerances():
+    peaks = pd.DataFrame(
+        {
+            "sample": ["s1", "s1", "s2", "s3"],
+            "mz": [100.0, 105.0, 100.01, 200.0],
+            "rt": [1.0, 1.5, 1.01, 2.0],
+            "intensity": [10, 5, 20, 30],
+        }
+    )
+    aligned = join_align(peaks, mz_tol=0.02, rt_tol=0.05)
+    assert set(aligned.columns) == {"s1", "s2", "s3"}
+    assert len(aligned) == 3
+    assert aligned.loc[0, "s1"] == 10
+    assert aligned.loc[0, "s2"] == 20
+    assert aligned.loc[1, "s1"] == 5
+    assert aligned.loc[2, "s3"] == 30
+
+
+def test_join_align_handles_missing_samples():
+    peaks = pd.DataFrame(
+        {
+            "sample": ["s1", "s2"],
+            "mz": [50.0, 50.01],
+            "rt": [0.5, 0.49],
+            "intensity": [5, 15],
+        }
+    )
+    aligned = join_align(peaks, mz_tol=0.02, rt_tol=0.02)
+    assert len(aligned) == 1
+    assert aligned.loc[0, "s1"] == 5
+    assert aligned.loc[0, "s2"] == 15
+
+
+def test_compute_group_metrics_perfect():
+    df = pd.DataFrame({
+        "group": [0, 0, 1, 1],
+        "compound_id": [1, 1, 2, 2],
+    })
+    metrics = compute_group_metrics(df)
+    assert metrics["f1"] == 1.0
+    assert metrics["ari"] == 1.0
+
+
+def test_compute_group_metrics_imperfect():
+    df = pd.DataFrame({
+        "group": [0, 0, 1, 1],
+        "compound_id": [1, 2, 1, 2],
+    })
+    metrics = compute_group_metrics(df)
+    assert 0 <= metrics["precision"] < 1.0
+
+
+def test_peak_table_from_ground_truth():
+    gt = pd.DataFrame(
+        {
+            "sample": ["s1", "s2"],
+            "compound_id": [0, 1],
+            "mz_apex": [100.0, 200.0],
+            "rt_apex": [1.0, 2.0],
+            "intensity": [10, 20],
+            "mz_min": [99.9, 199.9],
+            "mz_max": [100.1, 200.1],
+            "rt_min": [0.9, 1.9],
+            "rt_max": [1.1, 2.1],
+        }
+    )
+    peaks = peak_table_from_ground_truth(gt)
+    assert set(peaks.columns) == {"sample", "mz", "rt", "intensity"}
+    assert len(peaks) == 2
+    assert peaks.loc[1, "rt"] == 2.0
+
+
+def test_run_pipeline(tmp_path):
+    metrics = run_pipeline(
+        out_dir=tmp_path,
+        n_chemicals=1,
+        n_samples_per_group=1,
+        mz_tol=0.02,
+        rt_tol=0.1,
+        max_rt=20,
+        top_n=1,
+    )
+    assert set(metrics) == {"precision", "recall", "f1", "ari"}
+    assert (tmp_path / "aligned.csv").is_file()

From 39780de872385a400b635296179f76afc5077fc5 Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Tue, 10 Jun 2025 17:50:34 +0100
Subject: [PATCH 3/7] Add helper to report alignment metrics (#296)

* Add metric reporting helper to pipeline

* Add linear column RT noise option

* Add RT drift models and use in demo pipeline

* Refactor drift models into main package

* Clean up drift and demo scripts
---
 demo/untargeted/generate_dataset.py | 64 ++++++++++++++++++++-----
 demo/untargeted/pipeline.py         | 74 ++++++++++++++++++++++-------
 vimms/ColumnDrift.py                | 71 +++++++++++++++++++++++++++
 3 files changed, 181 insertions(+), 28 deletions(-)
 create mode 100644 vimms/ColumnDrift.py

diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py
index 5e7641f..6805522 100644
--- a/demo/untargeted/generate_dataset.py
+++ b/demo/untargeted/generate_dataset.py
@@ -8,12 +8,13 @@
 
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List
 
 from vimms.Common import POSITIVE, PROTON_MASS
 from vimms.Controller import TopNController
 from vimms.Environment import Environment
 from vimms.MassSpec import IndependentMassSpectrometer
+from vimms.ColumnDrift import SimulatedDriftModel
 
 from .generate_chemicals import generate_chemicals
 import pandas as pd
@@ -37,8 +38,8 @@ def create_design(n_samples_per_group: int = 5) -> ExperimentalDesign:
 
 
 def setup_simulation(
-    n_chemicals: int = 100, n_samples_per_group: int = 5
-) -> Tuple[list, ExperimentalDesign]:
+    n_chemicals=100, n_samples_per_group=5
+):
     """Generate chemicals and an experimental design."""
 
     chemicals = generate_chemicals(n_chemicals)
@@ -52,7 +53,8 @@ def generate_mzml_files(
     out_dir: Path,
     max_rt: int = 180,
     top_n: int = 1,
-) -> None:
+    column_params=None,
+) -> dict:
     """Generate an mzML file for each sample in ``design``.
 
     Parameters
@@ -67,13 +69,34 @@ def generate_mzml_files(
         Maximum retention time (seconds) for the simulation.
     top_n:
         Number of precursors to fragment in each cycle.
+    column_params:
+        Optional parameters for ``SimulatedDriftModel`` specifying
+        ``noise_sd``, ``intercept_params`` and ``linear_params``.
+
+    Returns
+    -------
+    dict
+        Mapping sample names to the chemicals used for simulation.
     """
 
+    sample_chems = {}
     for group, samples in design.samples.items():
         group_dir = out_dir / group
         group_dir.mkdir(parents=True, exist_ok=True)
         for sample in samples:
-            ms = IndependentMassSpectrometer(POSITIVE, chemicals)
+            if column_params is not None:
+                drift = SimulatedDriftModel(
+                    intercept_mu=column_params.get("intercept_params", (0.0, 5.0))[0],
+                    intercept_sd=column_params.get("intercept_params", (0.0, 5.0))[1],
+                    slope_mu=1.0 + column_params.get("linear_params", (0.0, 0.001))[0],
+                    slope_sd=column_params.get("linear_params", (0.0, 0.001))[1],
+                    noise_sd=column_params.get("noise_sd", 0.1),
+                )
+                col = drift.make_column(chemicals)
+                sample_data = col.get_dataset()
+            else:
+                sample_data = chemicals
+            ms = IndependentMassSpectrometer(POSITIVE, sample_data)
             controller = TopNController(
                 POSITIVE, top_n, 1, 10, 15, 1000
             )
@@ -87,28 +110,47 @@ def generate_mzml_files(
                 out_file=f"{sample}.mzML",
             )
             env.run()
+            sample_chems[sample] = sample_data
+    return sample_chems
 
 
 def generate_ground_truth_table(
-    chemicals: list, design: ExperimentalDesign
+    chemicals,
+    design: ExperimentalDesign,
+    per_sample_chems=None,
 ) -> pd.DataFrame:
-    """Return a table describing the true peaks for each sample."""
+    """Return a table describing the true peaks for each sample.
+
+    Parameters
+    ----------
+    chemicals:
+        Base chemical list used when ``per_sample_chems`` is ``None``.
+    design:
+        Experimental design describing the samples.
+    per_sample_chems:
+        Optional mapping of sample name to a list of chemicals with RT drift
+        applied. When provided, ground truth retention times are taken from this
+        mapping instead of ``chemicals``.
+    """
 
     records = []
     for compound_id, chem in enumerate(chemicals):
         mz = chem.mass + PROTON_MASS
-        rt_min = chem.get_min_rt()
-        rt_max = chem.get_max_rt()
-        rt_apex = chem.get_apex_rt()
+        base_rt_min = chem.get_min_rt()
+        base_rt_max = chem.get_max_rt()
         for group_samples in design.samples.values():
             for sample in group_samples:
+                s_chem = chem if per_sample_chems is None else per_sample_chems[sample][compound_id]
+                rt_apex = s_chem.get_apex_rt()
+                rt_min = s_chem.get_min_rt() if per_sample_chems else base_rt_min
+                rt_max = s_chem.get_max_rt() if per_sample_chems else base_rt_max
                 records.append(
                     {
                         "sample": sample,
                         "compound_id": compound_id,
                         "mz_apex": mz,
                         "rt_apex": rt_apex,
-                        "intensity": chem.max_intensity,
+                        "intensity": s_chem.max_intensity,
                         "mz_min": mz - 0.01,
                         "mz_max": mz + 0.01,
                         "rt_min": rt_min,
diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py
index 5880343..18c5a2e 100644
--- a/demo/untargeted/pipeline.py
+++ b/demo/untargeted/pipeline.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import json
 from pathlib import Path
 
@@ -15,27 +13,71 @@
 from .evaluation import compute_group_metrics
 
 
+def report_metrics(metrics):
+    """Print alignment metrics in a friendly format."""
+
+    print("Alignment metrics:")
+    for key, value in metrics.items():
+        print(f"{key}: {value:.4f}")
+
+
 def run_pipeline(
-    out_dir: Path = Path("./out"),
-    n_chemicals: int = 100,
-    n_samples_per_group: int = 5,
-    mz_tol: float = 0.01,
-    rt_tol: float = 0.5,
-    max_rt: int = 180,
-    top_n: int = 1,
-) -> dict:
-    """Run the full untargeted demo pipeline and return alignment metrics."""
+    out_dir=Path("./out"),
+    n_chemicals=100,
+    n_samples_per_group=5,
+    mz_tol=0.01,
+    rt_tol=0.5,
+    max_rt=180,
+    top_n=1,
+    use_rt_noise=False,
+    noise_sd=0.1,
+    intercept_params=(0.0, 5.0),
+    linear_params=(0.0, 0.001),
+):
+    """Run the full untargeted demo pipeline and return alignment metrics.
+
+    Parameters
+    ----------
+    use_rt_noise:
+        Whether to apply retention time drift to each injection using a
+        ``SimulatedDriftModel``.
+    noise_sd:
+        Standard deviation of random noise added around the drift function.
+    intercept_params:
+        Mean and standard deviation of the intercept term (seconds).
+    linear_params:
+        Mean and standard deviation of the linear term.
+    """
 
     out_dir.mkdir(parents=True, exist_ok=True)
 
     # Simulate chemicals and dataset design
     chemicals, design = setup_simulation(n_chemicals, n_samples_per_group)
 
+    column_params = None
+    if use_rt_noise:
+        column_params = {
+            "noise_sd": noise_sd,
+            "intercept_params": intercept_params,
+            "linear_params": linear_params,
+        }
+
     # Generate mzML files
-    generate_mzml_files(chemicals, design, out_dir, max_rt=max_rt, top_n=top_n)
+    sample_chems = generate_mzml_files(
+        chemicals,
+        design,
+        out_dir,
+        max_rt=max_rt,
+        top_n=top_n,
+        column_params=column_params,
+    )
 
     # Ground truth table and library
-    gt = generate_ground_truth_table(chemicals, design)
+    gt = generate_ground_truth_table(
+        chemicals,
+        design,
+        per_sample_chems=sample_chems if use_rt_noise else None,
+    )
     gt_file = out_dir / "ground_truth.csv"
     gt.to_csv(gt_file, index=False)
     write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf")
@@ -65,10 +107,8 @@ def run_pipeline(
 def main() -> None:
     """Command-line entry point for running the pipeline."""
 
-    metrics = run_pipeline()
-    print("Alignment metrics:")
-    for k, v in metrics.items():
-        print(f"{k}: {v:.4f}")
+    metrics = run_pipeline(use_rt_noise=True)
+    report_metrics(metrics)
 
 
 if __name__ == "__main__":
diff --git a/vimms/ColumnDrift.py b/vimms/ColumnDrift.py
new file mode 100644
index 0000000..08c46c8
--- /dev/null
+++ b/vimms/ColumnDrift.py
@@ -0,0 +1,71 @@
+"""Models run-to-run retention time drift for LC-MS injections."""
+
+import numpy as np
+from abc import ABCMeta, abstractmethod
+
+from .Column import LinearColumn
+
+
+class BaseColumnDriftModel(metaclass=ABCMeta):
+    """Base class for objects that provide one drifted ``LinearColumn``."""
+
+    def __init__(self, noise_sd=0.0, random_state=None):
+        self.noise_sd = noise_sd
+        self._rng = np.random.default_rng(random_state)
+
+    @abstractmethod
+    def _sample_drift(self):
+        """Return ``(intercept_shift, slope_scale)`` for one injection."""
+
+    def make_column(self, dataset):
+        """Return a ``LinearColumn`` with drift for ``dataset``."""
+        intercept, scale = self._sample_drift()
+        return LinearColumn.from_fixed_offsets(dataset, self.noise_sd, intercept, scale - 1.0)
+
+
+class SimulatedDriftModel(BaseColumnDriftModel):
+    """Draw drift parameters from Normal distributions."""
+
+    def __init__(
+        self,
+        intercept_mu=0.0,
+        intercept_sd=0.0,
+        slope_mu=1.0,
+        slope_sd=0.0,
+        noise_sd=0.0,
+        random_state=None,
+    ):
+        super().__init__(noise_sd, random_state)
+        self.intercept_mu = intercept_mu
+        self.intercept_sd = intercept_sd
+        self.slope_mu = slope_mu
+        self.slope_sd = slope_sd
+
+    def _sample_drift(self):
+        if self.intercept_sd > 0:
+            intercept = self._rng.normal(self.intercept_mu, self.intercept_sd)
+        else:
+            intercept = self.intercept_mu
+        if self.slope_sd > 0:
+            slope = self._rng.normal(self.slope_mu, self.slope_sd)
+        else:
+            slope = self.slope_mu
+        return intercept, slope
+
+
+class DataDrivenDriftModel(BaseColumnDriftModel):
+    """Replay empirically observed drift parameters."""
+
+    def __init__(self, intercepts, slopes, noise_sd=0.0, random_state=None):
+        if len(intercepts) != len(slopes):
+            raise ValueError("intercepts and slopes must have the same length")
+        super().__init__(noise_sd, random_state)
+        self._intercepts = list(intercepts)
+        self._slopes = list(slopes)
+        self._cursor = 0
+
+    def _sample_drift(self):
+        intercept = self._intercepts[self._cursor]
+        slope = self._slopes[self._cursor]
+        self._cursor = (self._cursor + 1) % len(self._intercepts)
+        return intercept, slope

From 2146ac97d7fdf49951de470734af90e0051d913a Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Wed, 11 Jun 2025 00:09:19 +0100
Subject: [PATCH 4/7] Refactor pipeline output handling (#300)

* Refactor output handling and add get_peak_data

* Refactor pipeline into classes
---
 demo/untargeted/generate_dataset.py |  75 ++++++++++--
 demo/untargeted/pipeline.py         | 161 +++++++++++-------------
 demo/untargeted/processing.py       | 182 ++++++++++++++++++++++++++++
 tests/test_untargeted_demo.py       |   8 +-
 4 files changed, 321 insertions(+), 105 deletions(-)
 create mode 100644 demo/untargeted/processing.py

diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py
index 6805522..6cf8293 100644
--- a/demo/untargeted/generate_dataset.py
+++ b/demo/untargeted/generate_dataset.py
@@ -8,7 +8,7 @@
 
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from vimms.Common import POSITIVE, PROTON_MASS
 from vimms.Controller import TopNController
@@ -27,6 +27,16 @@ class ExperimentalDesign:
     samples: Dict[str, List[str]]
 
 
+@dataclass
+class Dataset:
+    """Container describing input data for the pipeline."""
+
+    design: ExperimentalDesign
+    mzml_files: Dict[str, Path]
+    ground_truth: Optional[pd.DataFrame] = None
+    mgf_file: Optional[Path] = None
+
+
 def create_design(n_samples_per_group: int = 5) -> ExperimentalDesign:
     """Return an experimental design with case and control groups."""
 
@@ -182,20 +192,63 @@ def write_ground_truth_mgf(chemicals: list, out_file: Path) -> None:
             f.write("END IONS\n")
 
 
-def main() -> None:
-    """Entry point for dataset preparation."""
+def generate_synthetic_dataset(
+    out_dir: Path,
+    n_chemicals: int = 100,
+    n_samples_per_group: int = 5,
+    max_rt: int = 180,
+    top_n: int = 1,
+    use_rt_noise: bool = False,
+    noise_sd: float = 0.1,
+    intercept_params: tuple[float, float] = (0.0, 5.0),
+    linear_params: tuple[float, float] = (0.0, 0.001),
+) -> Dataset:
+    """Generate a synthetic dataset and return a :class:`Dataset` object."""
 
-    chemicals, design = setup_simulation()
-    out_dir = Path("./out")
     out_dir.mkdir(parents=True, exist_ok=True)
+    chemicals, design = setup_simulation(n_chemicals, n_samples_per_group)
+
+    column_params = None
+    if use_rt_noise:
+        column_params = {
+            "noise_sd": noise_sd,
+            "intercept_params": intercept_params,
+            "linear_params": linear_params,
+        }
+
+    sample_chems = generate_mzml_files(
+        chemicals,
+        design,
+        out_dir,
+        max_rt=max_rt,
+        top_n=top_n,
+        column_params=column_params,
+    )
+
+    gt = generate_ground_truth_table(
+        chemicals,
+        design,
+        per_sample_chems=sample_chems if use_rt_noise else None,
+    )
+    gt_file = out_dir / "ground_truth.csv"
+    gt.to_csv(gt_file, index=False)
+    mgf_file = out_dir / "ground_truth.mgf"
+    write_ground_truth_mgf(chemicals, mgf_file)
+
+    mzml_files = {
+        sample: out_dir / group / f"{sample}.mzML"
+        for group, samples in design.samples.items()
+        for sample in samples
+    }
+
+    return Dataset(design=design, mzml_files=mzml_files, ground_truth=gt, mgf_file=mgf_file)
 
-    print(f"Generated {len(chemicals)} chemicals")
-    for group, names in design.samples.items():
+
+def main() -> None:
+    """Entry point for dataset preparation."""
+    dataset = generate_synthetic_dataset(Path("./out"))
+    for group, names in dataset.design.samples.items():
         print(group, names)
-    generate_mzml_files(chemicals, design, out_dir)
-    gt = generate_ground_truth_table(chemicals, design)
-    gt.to_csv(out_dir / "ground_truth.csv", index=False)
-    write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf")
 
 
 if __name__ == "__main__":
diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py
index 18c5a2e..f2d4043 100644
--- a/demo/untargeted/pipeline.py
+++ b/demo/untargeted/pipeline.py
@@ -1,16 +1,69 @@
-import json
 from pathlib import Path
+from abc import ABC, abstractmethod
 
-
-from .generate_dataset import (
-    setup_simulation,
-    generate_mzml_files,
-    generate_ground_truth_table,
-    write_ground_truth_mgf,
-)
-from .peak_picking import peak_table_from_ground_truth
+from .generate_dataset import Dataset, generate_synthetic_dataset
 from .join_aligner import join_align
-from .evaluation import compute_group_metrics
+from .processing import (
+    get_peak_data,
+    OutputWriter,
+    group_related_peaks,
+    identify_compounds,
+    annotate_spectra,
+    batch_normalize,
+    impute_missing_values,
+    compute_statistics,
+    generate_report,
+)
+
+
+class BasePipeline(ABC):
+    """Base class encapsulating preprocessing logic."""
+
+    def __init__(
+        self,
+        dataset: Dataset,
+        out_dir: Path,
+        mz_tol: float = 0.01,
+        rt_tol: float = 0.5,
+    ) -> None:
+        self.dataset = dataset
+        self.out_dir = out_dir
+        self.mz_tol = mz_tol
+        self.rt_tol = rt_tol
+        self.writer = OutputWriter(out_dir)
+
+    def run(self) -> dict | None:
+        """Run the preprocessing pipeline."""
+
+        peaks = self.get_peaks()
+        aligned, labeled = join_align(
+            peaks, self.mz_tol, self.rt_tol, return_labels=True
+        )
+
+        grouped = group_related_peaks(aligned)
+        identified = identify_compounds(grouped, self.dataset.mgf_file)
+        annotated = annotate_spectra(identified, self.dataset.mgf_file)
+        normalized = batch_normalize(annotated, self.dataset.design)
+        _ = impute_missing_values(normalized)
+
+        metrics = compute_statistics(labeled, self.dataset)
+        self.writer.write_all(peaks=peaks, aligned=aligned, metrics=metrics)
+        _ = generate_report(metrics)
+        return metrics
+
+    @abstractmethod
+    def get_peaks(self):
+        """Return the peak table for alignment."""
+
+
+class SyntheticPipeline(BasePipeline):
+    """Pipeline that expects ground-truth derived peaks."""
+
+    def get_peaks(self):
+        if self.dataset.ground_truth is None:
+            raise ValueError("Ground truth is required for the demo pipeline")
+        return get_peak_data(self.dataset.ground_truth)
+
 
 
 def report_metrics(metrics):
@@ -22,92 +75,20 @@ def report_metrics(metrics):
 
 
 def run_pipeline(
-    out_dir=Path("./out"),
-    n_chemicals=100,
-    n_samples_per_group=5,
-    mz_tol=0.01,
-    rt_tol=0.5,
-    max_rt=180,
-    top_n=1,
-    use_rt_noise=False,
-    noise_sd=0.1,
-    intercept_params=(0.0, 5.0),
-    linear_params=(0.0, 0.001),
-):
-    """Run the full untargeted demo pipeline and return alignment metrics.
-
-    Parameters
-    ----------
-    use_rt_noise:
-        Whether to apply retention time drift to each injection using a
-        ``SimulatedDriftModel``.
-    noise_sd:
-        Standard deviation of random noise added around the drift function.
-    intercept_params:
-        Mean and standard deviation of the intercept term (seconds).
-    linear_params:
-        Mean and standard deviation of the linear term.
-    """
-
-    out_dir.mkdir(parents=True, exist_ok=True)
-
-    # Simulate chemicals and dataset design
-    chemicals, design = setup_simulation(n_chemicals, n_samples_per_group)
-
-    column_params = None
-    if use_rt_noise:
-        column_params = {
-            "noise_sd": noise_sd,
-            "intercept_params": intercept_params,
-            "linear_params": linear_params,
-        }
-
-    # Generate mzML files
-    sample_chems = generate_mzml_files(
-        chemicals,
-        design,
-        out_dir,
-        max_rt=max_rt,
-        top_n=top_n,
-        column_params=column_params,
-    )
+    dataset: Dataset, out_dir: Path, mz_tol: float = 0.01, rt_tol: float = 0.5
+) -> dict | None:
+    """Convenience wrapper executing the synthetic pipeline."""
 
-    # Ground truth table and library
-    gt = generate_ground_truth_table(
-        chemicals,
-        design,
-        per_sample_chems=sample_chems if use_rt_noise else None,
+    pipeline = SyntheticPipeline(
+        dataset=dataset, out_dir=out_dir, mz_tol=mz_tol, rt_tol=rt_tol
     )
-    gt_file = out_dir / "ground_truth.csv"
-    gt.to_csv(gt_file, index=False)
-    write_ground_truth_mgf(chemicals, out_dir / "ground_truth.mgf")
-
-    # Peak picking from ground truth
-    peaks = peak_table_from_ground_truth(gt)
-    peaks_file = out_dir / "peaks.csv"
-    peaks.to_csv(peaks_file, index=False)
-
-    # Alignment
-    aligned, labeled = join_align(peaks, mz_tol, rt_tol, return_labels=True)
-    aligned.to_csv(out_dir / "aligned.csv")
-
-    # Join group labels back to ground truth for evaluation
-    gt_eval = gt[["sample", "compound_id", "mz_apex", "rt_apex", "intensity"]].rename(
-        columns={"mz_apex": "mz", "rt_apex": "rt"}
-    )
-    df_eval = labeled.merge(gt_eval, on=["sample", "mz", "rt", "intensity"], how="left")
-
-    metrics = compute_group_metrics(df_eval, compound_col="compound_id", group_col="group")
-
-    (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2))
-
-    return metrics
+    return pipeline.run()
 
 
 def main() -> None:
     """Command-line entry point for running the pipeline."""
-
-    metrics = run_pipeline(use_rt_noise=True)
+    dataset = generate_synthetic_dataset(Path("./out"), use_rt_noise=True)
+    metrics = run_pipeline(dataset, Path("./out"))
     report_metrics(metrics)
 
 
diff --git a/demo/untargeted/processing.py b/demo/untargeted/processing.py
new file mode 100644
index 0000000..9fa9037
--- /dev/null
+++ b/demo/untargeted/processing.py
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+"""Stub processing steps for the untargeted workflow."""
+
+from pathlib import Path
+import json
+import pandas as pd
+
+from .generate_dataset import Dataset, ExperimentalDesign
+from .evaluation import compute_group_metrics
+from .peak_picking import peak_table_from_ground_truth
+
+
+def get_peak_data(gt: pd.DataFrame, out_dir: Path | None = None) -> pd.DataFrame:
+    """Return peaks derived from ``gt`` and optionally write to ``out_dir``.
+
+    Parameters
+    ----------
+    gt:
+        Ground truth table describing the true peaks.
+    out_dir:
+        Directory where ``peaks.csv`` will be written when provided.
+
+    Returns
+    -------
+    pd.DataFrame
+        Peak table containing ``sample``, ``mz``, ``rt`` and ``intensity``.
+    """
+    peaks = peak_table_from_ground_truth(gt)
+    if out_dir is not None:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        peaks.to_csv(out_dir / "peaks.csv", index=False)
+    return peaks
+
+
+class OutputWriter:
+    """Utility to persist pipeline outputs to disk."""
+
+    def __init__(self, out_dir: Path) -> None:
+        self.out_dir = out_dir
+        self.out_dir.mkdir(parents=True, exist_ok=True)
+
+    def write_all(
+        self,
+        peaks: pd.DataFrame | None = None,
+        aligned: pd.DataFrame | None = None,
+        metrics: dict | None = None,
+    ) -> None:
+        """Write provided outputs to :pyattr:`out_dir`."""
+
+        if peaks is not None:
+            peaks.to_csv(self.out_dir / "peaks.csv", index=False)
+        if aligned is not None:
+            aligned.to_csv(self.out_dir / "aligned.csv", index=False)
+        if metrics is not None:
+            (self.out_dir / "metrics.json").write_text(
+                json.dumps(metrics, indent=2)
+            )
+
+
+def group_related_peaks(aligned: pd.DataFrame) -> pd.DataFrame:
+    """Group isotopes and adducts together.
+
+    Parameters
+    ----------
+    aligned:
+        Aligned feature table with sample intensities.
+
+    Returns
+    -------
+    pd.DataFrame
+        Table of grouped peaks. Currently returned unchanged.
+    """
+    return aligned
+
+
+def identify_compounds(
+    grouped: pd.DataFrame, library_file: Path | None = None
+) -> pd.DataFrame:
+    """Identify compounds by matching grouped peaks to a library.
+
+    Parameters
+    ----------
+    grouped:
+        Peak table after grouping related peaks.
+    library_file:
+        Optional path to an MGF or MSP library for matching.
+
+    Returns
+    -------
+    pd.DataFrame
+        Peak table with identification columns added. Returned unchanged for now.
+    """
+    return grouped
+
+
+def annotate_spectra(
+    identified: pd.DataFrame, library_file: Path | None = None
+) -> pd.DataFrame:
+    """Add spectral annotations to identified features.
+
+    Parameters
+    ----------
+    identified:
+        Peak table containing compound identifications.
+    library_file:
+        Optional path to a spectral library.
+
+    Returns
+    -------
+    pd.DataFrame
+        Annotated peak table. Returned unchanged for now.
+    """
+    return identified
+
+
+def batch_normalize(peaks: pd.DataFrame, design: ExperimentalDesign) -> pd.DataFrame:
+    """Correct systematic effects between sample groups or batches.
+
+    Parameters
+    ----------
+    peaks:
+        Table of peak intensities.
+    design:
+        Experimental design describing sample groups.
+
+    Returns
+    -------
+    pd.DataFrame
+        Normalized peak table. Returned unchanged for now.
+    """
+    return peaks
+
+
+def impute_missing_values(peaks: pd.DataFrame) -> pd.DataFrame:
+    """Fill in missing intensity values.
+
+    Parameters
+    ----------
+    peaks:
+        Table of peak intensities.
+
+    Returns
+    -------
+    pd.DataFrame
+        Peak table with missing values imputed. Returned unchanged for now.
+    """
+    return peaks
+
+
+def compute_statistics(labeled: pd.DataFrame, dataset: Dataset) -> dict:
+    """Compute evaluation metrics for the aligned peaks.
+
+    Parameters
+    ----------
+    labeled:
+        Peak-level table with grouping labels from the aligner.
+    dataset:
+        Dataset object containing the ground-truth table.
+
+    Returns
+    -------
+    dict
+        Dictionary of computed metrics. Empty if ground truth is unavailable.
+    """
+    if dataset.ground_truth is None:
+        return {}
+
+    gt_eval = dataset.ground_truth[
+        ["sample", "compound_id", "mz_apex", "rt_apex", "intensity"]
+    ].rename(columns={"mz_apex": "mz", "rt_apex": "rt"})
+    df_eval = labeled.merge(
+        gt_eval, on=["sample", "mz", "rt", "intensity"], how="left"
+    )
+    metrics = compute_group_metrics(df_eval, compound_col="compound_id", group_col="group")
+    return metrics
+
+
+def generate_report(metrics: dict) -> str:
+    """Return a JSON report summarizing ``metrics``."""
+
+    return json.dumps(metrics, indent=2)
diff --git a/tests/test_untargeted_demo.py b/tests/test_untargeted_demo.py
index 5f9c3a4..36d8f54 100644
--- a/tests/test_untargeted_demo.py
+++ b/tests/test_untargeted_demo.py
@@ -6,6 +6,7 @@
     generate_ground_truth_table,
     write_ground_truth_mgf,
     setup_simulation,
+    generate_synthetic_dataset,
 )
 
 from demo.untargeted.join_aligner import join_align
@@ -130,14 +131,13 @@ def test_peak_table_from_ground_truth():
 
 
 def test_run_pipeline(tmp_path):
-    metrics = run_pipeline(
-        out_dir=tmp_path,
+    dataset = generate_synthetic_dataset(
+        tmp_path,
         n_chemicals=1,
         n_samples_per_group=1,
-        mz_tol=0.02,
-        rt_tol=0.1,
         max_rt=20,
         top_n=1,
     )
+    metrics = run_pipeline(dataset, tmp_path, mz_tol=0.02, rt_tol=0.1)
     assert set(metrics) == {"precision", "recall", "f1", "ari"}
     assert (tmp_path / "aligned.csv").is_file()

From 93b7e951e79492df1ebe7bf071031d0bc370eb66 Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Wed, 11 Jun 2025 09:45:29 +0100
Subject: [PATCH 5/7] Consolidate demo entrypoints (#301)

---
 demo/untargeted/generate_chemicals.py |  7 -------
 demo/untargeted/generate_dataset.py   | 12 ------------
 demo/untargeted/peak_picking.py       |  9 ---------
 demo/untargeted/pipeline.py           | 16 ++++++++++++----
 4 files changed, 12 insertions(+), 32 deletions(-)

diff --git a/demo/untargeted/generate_chemicals.py b/demo/untargeted/generate_chemicals.py
index 218bbda..293f6d9 100644
--- a/demo/untargeted/generate_chemicals.py
+++ b/demo/untargeted/generate_chemicals.py
@@ -27,10 +27,3 @@ def generate_chemicals(n_chemicals: int = 100):
     return cmc.sample(n_chemicals, 2)
 
 
-def main():
-    chemicals = generate_chemicals()
-    save_obj(chemicals, str(FIXTURES / "demo_chemicals.p"))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/demo/untargeted/generate_dataset.py b/demo/untargeted/generate_dataset.py
index 6cf8293..0c4f3ca 100644
--- a/demo/untargeted/generate_dataset.py
+++ b/demo/untargeted/generate_dataset.py
@@ -240,16 +240,4 @@ def generate_synthetic_dataset(
         for group, samples in design.samples.items()
         for sample in samples
     }
-
     return Dataset(design=design, mzml_files=mzml_files, ground_truth=gt, mgf_file=mgf_file)
-
-
-def main() -> None:
-    """Entry point for dataset preparation."""
-    dataset = generate_synthetic_dataset(Path("./out"))
-    for group, names in dataset.design.samples.items():
-        print(group, names)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/demo/untargeted/peak_picking.py b/demo/untargeted/peak_picking.py
index 584eb47..537bffc 100644
--- a/demo/untargeted/peak_picking.py
+++ b/demo/untargeted/peak_picking.py
@@ -32,12 +32,3 @@ def write_peak_table(gt_file: Path, out_file: Path) -> pd.DataFrame:
     return peaks
 
 
-def main() -> None:
-    """Entry point for the peak picking step."""
-    gt_file = Path("./out/ground_truth.csv")
-    out_file = Path("./out/peaks.csv")
-    write_peak_table(gt_file, out_file)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py
index f2d4043..2dfe9cb 100644
--- a/demo/untargeted/pipeline.py
+++ b/demo/untargeted/pipeline.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from abc import ABC, abstractmethod
+import argparse
 
 from .generate_dataset import Dataset, generate_synthetic_dataset
 from .join_aligner import join_align
@@ -84,11 +85,18 @@ def run_pipeline(
     )
     return pipeline.run()
 
+def main(argv: list[str] | None = None) -> None:
+    """Command-line entry point for generating data and running the pipeline."""
 
-def main() -> None:
-    """Command-line entry point for running the pipeline."""
-    dataset = generate_synthetic_dataset(Path("./out"), use_rt_noise=True)
-    metrics = run_pipeline(dataset, Path("./out"))
+    parser = argparse.ArgumentParser(description="Run the untargeted demo pipeline")
+    parser.add_argument("--out-dir", type=Path, default=Path("./out"))
+    parser.add_argument("--use-rt-noise", action="store_true", help="Add RT noise to the synthetic dataset")
+    parser.add_argument("--mz-tol", type=float, default=0.01)
+    parser.add_argument("--rt-tol", type=float, default=0.5)
+    args = parser.parse_args(argv)
+
+    dataset = generate_synthetic_dataset(args.out_dir, use_rt_noise=args.use_rt_noise)
+    metrics = run_pipeline(dataset, args.out_dir, mz_tol=args.mz_tol, rt_tol=args.rt_tol)
     report_metrics(metrics)
 
 

From b31e26ad79e4024a7756eacbc4c90a710ab081b3 Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Wed, 11 Jun 2025 10:12:28 +0100
Subject: [PATCH 6/7] Make pipeline steps overridable (#303)

---
 demo/untargeted/pipeline.py | 43 +++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/demo/untargeted/pipeline.py b/demo/untargeted/pipeline.py
index 2dfe9cb..9a08baf 100644
--- a/demo/untargeted/pipeline.py
+++ b/demo/untargeted/pipeline.py
@@ -33,6 +33,37 @@ def __init__(
         self.rt_tol = rt_tol
         self.writer = OutputWriter(out_dir)
 
+    # ------------------------------------------------------------------
+    # Steps in the preprocessing pipeline. These methods wrap the helper
+    # functions defined in :mod:`processing` so subclasses can override
+    # individual steps when needed.
+    # ------------------------------------------------------------------
+
+    def group_peaks(self, aligned):
+        """Group isotopes/adducts together."""
+
+        return group_related_peaks(aligned)
+
+    def identify(self, grouped):
+        """Match grouped peaks to library entries."""
+
+        return identify_compounds(grouped, self.dataset.mgf_file)
+
+    def annotate(self, identified):
+        """Annotate identified peaks with spectral information."""
+
+        return annotate_spectra(identified, self.dataset.mgf_file)
+
+    def normalize(self, annotated):
+        """Apply batch normalisation."""
+
+        return batch_normalize(annotated, self.dataset.design)
+
+    def impute(self, normalized):
+        """Fill in missing values in the peak table."""
+
+        return impute_missing_values(normalized)
+
     def run(self) -> dict | None:
         """Run the preprocessing pipeline."""
 
@@ -41,11 +72,11 @@ def run(self) -> dict | None:
             peaks, self.mz_tol, self.rt_tol, return_labels=True
         )
 
-        grouped = group_related_peaks(aligned)
-        identified = identify_compounds(grouped, self.dataset.mgf_file)
-        annotated = annotate_spectra(identified, self.dataset.mgf_file)
-        normalized = batch_normalize(annotated, self.dataset.design)
-        _ = impute_missing_values(normalized)
+        grouped = self.group_peaks(aligned)
+        identified = self.identify(grouped)
+        annotated = self.annotate(identified)
+        normalized = self.normalize(annotated)
+        _ = self.impute(normalized)
 
         metrics = compute_statistics(labeled, self.dataset)
         self.writer.write_all(peaks=peaks, aligned=aligned, metrics=metrics)
@@ -66,7 +97,6 @@ def get_peaks(self):
         return get_peak_data(self.dataset.ground_truth)
 
 
-
 def report_metrics(metrics):
     """Print alignment metrics in a friendly format."""
 
@@ -85,6 +115,7 @@ def run_pipeline(
     )
     return pipeline.run()
 
+
 def main(argv: list[str] | None = None) -> None:
     """Command-line entry point for generating data and running the pipeline."""
 

From 017688248c82a0ea54db5422055e89470f26b23f Mon Sep 17 00:00:00 2001
From: Joe Wandy <joe.wandy@glasgow.ac.uk>
Date: Thu, 12 Jun 2025 09:04:18 +0100
Subject: [PATCH 7/7] Make heavy dependencies optional (#304)

---
 poetry.lock                 | 76 ++++++++++++++++++++++++++++---------
 pyproject.toml              | 16 +++++---
 vimms/BoxVisualise.py       | 53 ++++++++++++++++++++++++--
 vimms/InSilicoSimulation.py | 20 +++++-----
 4 files changed, 130 insertions(+), 35 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index f18d81f..6dfd182 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4,9 +4,10 @@
 name = "alembic"
 version = "1.16.1"
 description = "A database migration tool for SQLAlchemy."
-optional = false
+optional = true
 python-versions = ">=3.9"
 groups = ["main"]
+markers = "extra == \"optimize\""
 files = [
     {file = "alembic-1.16.1-py3-none-any.whl", hash = "sha256:0cdd48acada30d93aa1035767d67dff25702f8de74d7c3919f2e8492c8db2e67"},
     {file = "alembic-1.16.1.tar.gz", hash = "sha256:43d37ba24b3d17bc1eb1024fe0f51cd1dc95aeb5464594a02c6bb9ca9864bfa4"},
@@ -49,11 +50,11 @@ description = "Disable App Nap on macOS >= 10.9"
 optional = false
 python-versions = ">=3.6"
 groups = ["main", "dev"]
-markers = "platform_system == \"Darwin\""
 files = [
     {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"},
     {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"},
 ]
+markers = {main = "extra == \"parallel\" and platform_system == \"Darwin\"", dev = "platform_system == \"Darwin\""}
 
 [[package]]
 name = "argon2-cffi"
@@ -139,6 +140,7 @@ files = [
     {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"},
     {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 astroid = ["astroid (>=2,<4)"]
@@ -381,7 +383,7 @@ files = [
     {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"},
     {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
 ]
-markers = {main = "implementation_name == \"pypy\""}
+markers = {main = "extra == \"parallel\" and implementation_name == \"pypy\""}
 
 [package.dependencies]
 pycparser = "*"
@@ -522,19 +524,20 @@ description = "Cross-platform colored terminal text."
 optional = false
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
 groups = ["main", "dev"]
+markers = "sys_platform == \"win32\" or platform_system == \"Windows\""
 files = [
     {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
-markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\"", dev = "sys_platform == \"win32\" or platform_system == \"Windows\""}
 
 [[package]]
 name = "colorlog"
 version = "6.9.0"
 description = "Add colours to the output of Python's logging module."
-optional = false
+optional = true
 python-versions = ">=3.6"
 groups = ["main"]
+markers = "extra == \"optimize\""
 files = [
     {file = "colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff"},
     {file = "colorlog-6.9.0.tar.gz", hash = "sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2"},
@@ -557,6 +560,7 @@ files = [
     {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"},
     {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 traitlets = ">=4"
@@ -772,6 +776,7 @@ files = [
     {file = "debugpy-1.8.14-py2.py3-none-any.whl", hash = "sha256:5cd9a579d553b6cb9759a7908a41988ee6280b961f24f63336835d9418216a20"},
     {file = "debugpy-1.8.14.tar.gz", hash = "sha256:7cd287184318416850aa8b60ac90105837bb1e59531898c07569d197d2ed5322"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [[package]]
 name = "decorator"
@@ -784,6 +789,7 @@ files = [
     {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"},
     {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [[package]]
 name = "defusedxml"
@@ -820,6 +826,7 @@ files = [
     {file = "entrypoints-0.4-py3-none-any.whl", hash = "sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f"},
     {file = "entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [[package]]
 name = "events"
@@ -843,6 +850,7 @@ files = [
     {file = "executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"},
     {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""]
@@ -1188,6 +1196,7 @@ files = [
     {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"},
     {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 appnope = {version = "*", markers = "platform_system == \"Darwin\""}
@@ -1215,9 +1224,10 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio
 name = "ipyparallel"
 version = "9.0.1"
 description = "Interactive Parallel Computing with IPython"
-optional = false
+optional = true
 python-versions = ">=3.8"
 groups = ["main"]
+markers = "extra == \"parallel\""
 files = [
     {file = "ipyparallel-9.0.1-py3-none-any.whl", hash = "sha256:cc0702e26af416a1c35c73eb6dbc3a6a2c49417cb24dda4511be85c9fe87ea64"},
     {file = "ipyparallel-9.0.1.tar.gz", hash = "sha256:2e592cad2200c5a94fbbff639bff36e6ec9122f34b36b2fc6b4d678d9e98f29c"},
@@ -1254,6 +1264,7 @@ files = [
     {file = "ipython-9.3.0-py3-none-any.whl", hash = "sha256:1a0b6dd9221a1f5dddf725b57ac0cb6fddc7b5f470576231ae9162b9b3455a04"},
     {file = "ipython-9.3.0.tar.gz", hash = "sha256:79eb896f9f23f50ad16c3bc205f686f6e030ad246cc309c6279a242b14afe9d8"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 colorama = {version = "*", markers = "sys_platform == \"win32\""}
@@ -1287,6 +1298,7 @@ files = [
     {file = "ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c"},
     {file = "ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 pygments = "*"
@@ -1339,6 +1351,7 @@ files = [
     {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"},
     {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 parso = ">=0.8.4,<0.9.0"
@@ -1480,6 +1493,7 @@ files = [
     {file = "jupyter_client-7.4.9-py3-none-any.whl", hash = "sha256:214668aaea208195f4c13d28eb272ba79f945fc0cf3f11c7092c20b2ca1980e7"},
     {file = "jupyter_client-7.4.9.tar.gz", hash = "sha256:52be28e04171f07aed8f20e1616a5a552ab9fee9cbbe6c1896ae170c3880d392"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 entrypoints = "*"
@@ -1505,6 +1519,7 @@ files = [
     {file = "jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0"},
     {file = "jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 platformdirs = ">=2.5"
@@ -1991,9 +2006,10 @@ source = ["Cython (>=3.0.11,<3.1.0)"]
 name = "mako"
 version = "1.3.10"
 description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
-optional = false
+optional = true
 python-versions = ">=3.8"
 groups = ["main"]
+markers = "extra == \"optimize\""
 files = [
     {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"},
     {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"},
@@ -2093,6 +2109,7 @@ files = [
     {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"},
     {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"},
 ]
+markers = {main = "extra == \"optimize\""}
 
 [[package]]
 name = "mass-spec-utils"
@@ -2183,6 +2200,7 @@ files = [
     {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"},
     {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 traitlets = "*"
@@ -2345,9 +2363,10 @@ files = [
 name = "narwhals"
 version = "1.41.0"
 description = "Extremely lightweight compatibility layer between dataframe libraries"
-optional = false
+optional = true
 python-versions = ">=3.8"
 groups = ["main"]
+markers = "extra == \"visual\""
 files = [
     {file = "narwhals-1.41.0-py3-none-any.whl", hash = "sha256:d958336b40952e4c4b7aeef259a7074851da0800cf902186a58f2faeff97be02"},
     {file = "narwhals-1.41.0.tar.gz", hash = "sha256:0ab2e5a1757a19b071e37ca74b53b0b5426789321d68939738337dfddea629b5"},
@@ -2459,6 +2478,7 @@ files = [
     {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
     {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [[package]]
 name = "networkx"
@@ -2635,9 +2655,10 @@ files = [
 name = "optuna"
 version = "4.3.0"
 description = "A hyperparameter optimization framework"
-optional = false
+optional = true
 python-versions = ">=3.8"
 groups = ["main"]
+markers = "extra == \"optimize\""
 files = [
     {file = "optuna-4.3.0-py3-none-any.whl", hash = "sha256:0ea1a01c99c09cbdf3e2dcd9af01dea86778d9fa20ca26f0238a98e7462d8dcb"},
     {file = "optuna-4.3.0.tar.gz", hash = "sha256:b3866842a84bc0bbb9906363bd846cfc39d09d3196265354bfdfda6a2f123b84"},
@@ -2792,6 +2813,7 @@ files = [
     {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"},
     {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"]
@@ -2849,11 +2871,11 @@ description = "Pexpect allows easy control of interactive console applications."
 optional = false
 python-versions = "*"
 groups = ["main", "dev"]
-markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\""
 files = [
     {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"},
     {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
 ]
+markers = {main = "extra == \"parallel\" and sys_platform != \"win32\" and sys_platform != \"emscripten\"", dev = "sys_platform != \"win32\" and sys_platform != \"emscripten\""}
 
 [package.dependencies]
 ptyprocess = ">=0.5"
@@ -2969,6 +2991,7 @@ files = [
     {file = "platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4"},
     {file = "platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
@@ -2979,9 +3002,10 @@ type = ["mypy (>=1.14.1)"]
 name = "plotly"
 version = "6.1.2"
 description = "An open-source interactive data visualization library for Python"
-optional = false
+optional = true
 python-versions = ">=3.8"
 groups = ["main"]
+markers = "extra == \"visual\""
 files = [
     {file = "plotly-6.1.2-py3-none-any.whl", hash = "sha256:f1548a8ed9158d59e03d7fed548c7db5549f3130d9ae19293c8638c202648f6d"},
     {file = "plotly-6.1.2.tar.gz", hash = "sha256:4fdaa228926ba3e3a213f4d1713287e69dcad1a7e66cf2025bd7d7026d5014b4"},
@@ -3057,6 +3081,7 @@ files = [
     {file = "prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07"},
     {file = "prompt_toolkit-3.0.51.tar.gz", hash = "sha256:931a162e3b27fc90c86f1b48bb1fb2c528c2761475e57c9c06de13311c7b54ed"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 wcwidth = "*"
@@ -3103,6 +3128,7 @@ files = [
     {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"},
     {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"]
@@ -3119,7 +3145,7 @@ files = [
     {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
     {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"},
 ]
-markers = {main = "sys_platform != \"win32\" and sys_platform != \"emscripten\"", dev = "sys_platform != \"win32\" and sys_platform != \"emscripten\" or os_name != \"nt\""}
+markers = {main = "extra == \"parallel\" and sys_platform != \"win32\" and sys_platform != \"emscripten\"", dev = "sys_platform != \"win32\" and sys_platform != \"emscripten\" or os_name != \"nt\""}
 
 [[package]]
 name = "pure-eval"
@@ -3132,6 +3158,7 @@ files = [
     {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"},
     {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 tests = ["pytest"]
@@ -3159,7 +3186,7 @@ files = [
     {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
     {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
 ]
-markers = {main = "implementation_name == \"pypy\""}
+markers = {main = "extra == \"parallel\" and implementation_name == \"pypy\""}
 
 [[package]]
 name = "pyflakes"
@@ -3184,6 +3211,7 @@ files = [
     {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
     {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 windows-terminal = ["colorama (>=0.4.6)"]
@@ -3350,7 +3378,6 @@ description = "Python for Window Extensions"
 optional = false
 python-versions = "*"
 groups = ["main", "dev"]
-markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\""
 files = [
     {file = "pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1"},
     {file = "pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d"},
@@ -3369,6 +3396,7 @@ files = [
     {file = "pywin32-310-cp39-cp39-win32.whl", hash = "sha256:851c8d927af0d879221e616ae1f66145253537bbdd321a77e8ef701b443a9a1a"},
     {file = "pywin32-310-cp39-cp39-win_amd64.whl", hash = "sha256:96867217335559ac619f00ad70e513c0fcf84b8a3af9fc2bba3b59b97da70475"},
 ]
+markers = {main = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\" and extra == \"parallel\"", dev = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\""}
 
 [[package]]
 name = "pywinpty"
@@ -3450,6 +3478,7 @@ files = [
     {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
     {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
 ]
+markers = {main = "extra == \"optimize\""}
 
 [[package]]
 name = "pyyaml-env-tag"
@@ -3568,6 +3597,7 @@ files = [
     {file = "pyzmq-26.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:49b6ca2e625b46f499fb081aaf7819a177f41eeb555acb05758aa97f4f95d147"},
     {file = "pyzmq-26.4.0.tar.gz", hash = "sha256:4bd13f85f80962f91a651a7356fe0472791a5f7a92f227822b5acf44795c626d"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 cffi = {version = "*", markers = "implementation_name == \"pypy\""}
@@ -3992,9 +4022,10 @@ test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis
 name = "seaborn"
 version = "0.13.2"
 description = "Statistical data visualization"
-optional = false
+optional = true
 python-versions = ">=3.8"
 groups = ["main"]
+markers = "extra == \"visual\""
 files = [
     {file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"},
     {file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"},
@@ -4203,6 +4234,7 @@ files = [
     {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"},
     {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.dependencies]
 asttokens = ">=2.1.0"
@@ -4268,9 +4300,10 @@ docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "n
 name = "tabulate"
 version = "0.9.0"
 description = "Pretty-print tabular data"
-optional = false
+optional = true
 python-versions = ">=3.7"
 groups = ["main"]
+markers = "extra == \"summary\""
 files = [
     {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
     {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
@@ -4353,6 +4386,7 @@ files = [
     {file = "tornado-6.5.1-cp39-abi3-win_arm64.whl", hash = "sha256:02420a0eb7bf617257b9935e2b754d1b63897525d8a289c9d65690d580b4dcf7"},
     {file = "tornado-6.5.1.tar.gz", hash = "sha256:84ceece391e8eb9b2b95578db65e920d2a61070260594819589609ba9bc6308c"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [[package]]
 name = "tqdm"
@@ -4387,6 +4421,7 @@ files = [
     {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"},
     {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [package.extras]
 docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
@@ -4536,6 +4571,7 @@ files = [
     {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"},
     {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
 ]
+markers = {main = "extra == \"parallel\""}
 
 [[package]]
 name = "webcolors"
@@ -4606,7 +4642,13 @@ files = [
 [package.extras]
 dev = ["black (>=19.3b0) ; python_version >= \"3.6\"", "pytest (>=4.6.2)"]
 
+[extras]
+optimize = ["optuna"]
+parallel = ["ipyparallel"]
+summary = ["tabulate"]
+visual = ["plotly", "seaborn"]
+
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.11,<3.13"
-content-hash = "ce68268ccd858a3d837c021a8d3cf9b55e7a033410f1db3338a8b1f6bfce5e5f"
+content-hash = "b46c146dfaf7e858dbacbe96c7d6daee3e9ec5cc0895c8644ba4700ae81b5a3f"
diff --git a/pyproject.toml b/pyproject.toml
index e3fc0ce..37140be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,18 +18,18 @@ numpy = "^2.2.6"
 pandas = "^2.2.3"
 scipy = "^1.15.3"
 matplotlib = "^3.10.3"
-seaborn = "^0.13.2"
-plotly = "^6.1.2"
+seaborn = {version = "^0.13.2", optional = true}
+plotly = {version = "^6.1.2", optional = true}
 scikit-learn = "^1.6.1"
 tqdm = "^4.67.1"
 joblib = "^1.5.1"
-ipyparallel = "^9.0.1"
+ipyparallel = {version = "^9.0.1", optional = true}
 requests = "^2.32.3"
 loguru = "^0.7.3"
 networkx = "^3.5"
 jsonpickle = "^4.1.1"
 statsmodels = "^0.14.4"
-tabulate = "^0.9.0"
+tabulate = {version = "^0.9.0", optional = true}
 intervaltree = "^3.1.0"
 events = "^0.5"
 pymzml = "^2.5.11"
@@ -38,7 +38,13 @@ mass-spec-utils = "^0.0.12"
 pysmiles = "^2.0.0"
 numba = "^0.61.2"
 numba-stats = "^1.10.1"
-optuna = "^4.3.0"
+optuna = {version = "^4.3.0", optional = true}
+
+[tool.poetry.extras]
+visual = ["plotly", "seaborn"]
+parallel = ["ipyparallel"]
+optimize = ["optuna"]
+summary = ["tabulate"]
 
 [tool.poetry.group.dev.dependencies]
 jupyterlab = "^4.4.3"
diff --git a/vimms/BoxVisualise.py b/vimms/BoxVisualise.py
index e015696..13f0d70 100644
--- a/vimms/BoxVisualise.py
+++ b/vimms/BoxVisualise.py
@@ -12,10 +12,19 @@
 from PIL import ImageColor
 import matplotlib.patches as patches
 import matplotlib.pyplot as plt
-import seaborn as sns
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-from plotly.colors import DEFAULT_PLOTLY_COLORS
+try:
+    import seaborn as sns
+except ImportError:  # pragma: no cover - optional dependency
+    sns = None
+
+try:
+    import plotly.graph_objects as go
+    from plotly.subplots import make_subplots
+    from plotly.colors import DEFAULT_PLOTLY_COLORS
+except ImportError:  # pragma: no cover - optional dependency
+    go = None
+    make_subplots = None
+    DEFAULT_PLOTLY_COLORS = None
 
 from vimms.Common import path_or_mzml, get_scan_times_combined, get_scan_times
 from vimms.Box import GenericBox, BoxGrid
@@ -395,6 +404,9 @@ def plotly_ms1s(
         abs_scaling=False,
     ):
 
+        if go is None:
+            raise ImportError("plotly is required for plotly_ms1s; install vimms[visual]")
+
         self.points_in_bounds(min_rt=min_rt, max_rt=max_rt, min_mz=min_mz, max_mz=max_mz)
         if np.sum(self.active_ms1) == 0:
             return
@@ -451,6 +463,9 @@ def plotly_ms2s(
         abs_scaling=False,
     ):
 
+        if go is None:
+            raise ImportError("plotly is required for plotly_ms2s; install vimms[visual]")
+
         self.points_in_bounds(min_rt=min_rt, max_rt=max_rt, min_mz=min_mz, max_mz=max_mz)
         if np.sum(self.active_ms2) == 0:
             return
@@ -898,6 +913,10 @@ def mpl_fragmented_boxes(exp_name, eva, mode="max", min_intensity=0.0):
 
 
 def plotly_results_plot(experiment_names, evals, min_intensity=0.0, suptitle=None):
+    if go is None:
+        raise ImportError(
+            "plotly is required for plotly_results_plot; install vimms[visual]"
+        )
     num_chems = len(evals[0].chems)
     results = [eva.evaluation_report(min_intensity=min_intensity) for eva in evals]
     repeat = max(r["num_runs"] for r in results)
@@ -962,6 +981,8 @@ def plotly_results_plot(experiment_names, evals, min_intensity=0.0, suptitle=Non
 
 
 def plotly_mzml(mzml, draw_minm=0.0, colour_minm=None, show_precursors=False):
+    if go is None:
+        raise ImportError("plotly is required for plotly_mzml; install vimms[visual]")
     fig = go.Figure()
 
     mzml = path_or_mzml(mzml)
@@ -983,6 +1004,10 @@ def plotly_mzml(mzml, draw_minm=0.0, colour_minm=None, show_precursors=False):
 
 
 def plotly_timing_hist(processing_times, title, binsize=None):
+    if go is None:
+        raise ImportError(
+            "plotly is required for plotly_timing_hist; install vimms[visual]"
+        )
     fig = make_subplots(rows=1, cols=len(processing_times))
 
     for i, run_times in enumerate(processing_times):
@@ -1007,6 +1032,10 @@ def plotly_timing_hist(processing_times, title, binsize=None):
 
 
 def plotly_fragmentation_events(exp_name, mzmls, colour_minm=None):
+    if go is None:
+        raise ImportError(
+            "plotly is required for plotly_fragmentation_events; install vimms[visual]"
+        )
     fig = make_subplots(rows=len(mzmls), cols=1, shared_xaxes="all", shared_yaxes="all")
 
     for i, mzml in enumerate(mzmls):
@@ -1026,6 +1055,10 @@ def plotly_fragmentation_events(exp_name, mzmls, colour_minm=None):
 
 
 def seaborn_hist(data, xlabel, binsize=None):
+    if sns is None:
+        raise ImportError(
+            "seaborn is required for seaborn_hist; install vimms[visual]"
+        )
     fig, axes = plt.subplots(1, len(data), figsize=(15, 5), sharey=True)
 
     try:
@@ -1041,6 +1074,10 @@ def seaborn_hist(data, xlabel, binsize=None):
 
 
 def seaborn_mzml_timing_hist(mzmls, binsize=None, mode="combined"):
+    if sns is None:
+        raise ImportError(
+            "seaborn is required for seaborn_mzml_timing_hist; install vimms[visual]"
+        )
     if mode == "combined":
         timings = [get_scan_times_combined(mzmls)]
     else:
@@ -1072,10 +1109,18 @@ def seaborn_mzml_timing_hist(mzmls, binsize=None, mode="combined"):
 
 
 def seaborn_timing_hist(processing_times, binsize=None):
+    if sns is None:
+        raise ImportError(
+            "seaborn is required for seaborn_timing_hist; install vimms[visual]"
+        )
     return seaborn_hist(processing_times, xlabel="Processing time (secs)", binsize=binsize)
 
 
 def seaborn_uncovered_area_hist(eva, box_likes, min_intensity=0.0, cumulative=False, binsize=None):
+    if sns is None:
+        raise ImportError(
+            "seaborn is required for seaborn_uncovered_area_hist; install vimms[visual]"
+        )
 
     boxes = [[b.to_box(0, 0) for b in ls] for ls in box_likes]
     geom = BoxGrid()
diff --git a/vimms/InSilicoSimulation.py b/vimms/InSilicoSimulation.py
index 6b7aa3c..becbe3c 100644
--- a/vimms/InSilicoSimulation.py
+++ b/vimms/InSilicoSimulation.py
@@ -5,7 +5,10 @@
 import os
 from pathlib import Path
 
-import ipyparallel as ipp
+try:
+    import ipyparallel as ipp
+except ImportError:  # pragma: no cover - optional dependency
+    ipp = None
 import matplotlib.pyplot as plt
 import numpy as np
 from loguru import logger
@@ -181,14 +184,14 @@ def run_SmartROI(chems, scan_duration, params, out_dir):
     logger.warning("Running controllers in parallel, please wait ...")
     run_serial = False
     try:
+        if ipp is None:
+            raise ImportError
         rc = ipp.Client()
         dview = rc[:]  # use all engines
         with dview.sync_imports():
             pass
         dview.map_sync(run_single_SmartROI, params_list)
-    except OSError:  # cluster has not been started
-        run_serial = True
-    except ipp.error.TimeoutError:  # takes too long to run
+    except Exception:
         run_serial = True
 
     if run_serial:  # if any exception from above, try to run it serially
@@ -271,17 +274,16 @@ def run_WeightedDEW(chems, scan_duration, params, out_dir):
 
     # Try to run the controllers in parallel. If fails, then run it serially
     logger.warning("Running controllers in parallel, please wait ...")
+    run_serial = False
     try:
-        import ipyparallel as ipp
-
+        if ipp is None:
+            raise ImportError
         rc = ipp.Client()
         dview = rc[:]  # use all engines
         with dview.sync_imports():
             pass
         dview.map_sync(run_single_WeightedDEW, params_list)
-    except OSError:  # cluster has not been started
-        run_serial = True
-    except ipp.error.TimeoutError:  # takes too long to run
+    except Exception:
         run_serial = True
 
     if run_serial:  # if any exception from above, try to run it serially