openpipelines-bio · lazappi · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,8 @@
 
 * `labels_transfer/cellmapper`: New component that transfers labels from a reference to a query with a shared embedding using CellMapper (PR #1169, PR #1177)
 
+* `annotate/calculate_marker_genes`: New component that calculates cluster marker genes using `scanpy` (PR #1168)
+
 ## MAJOR CHANGES
 
 * `qc/calculate_qc_metrics`: major improvements to memory consumption and runtimes (PR #1140).

diff --git a/src/annotate/calculate_marker_genes_scanpy/config.vsh.yaml b/src/annotate/calculate_marker_genes_scanpy/config.vsh.yaml
@@ -0,0 +1,132 @@
+name: calculate_marker_genes_scanpy
+namespace: annotate
+scope: "public"
+
+description: |
+  Calculate marker genes for clusters in a MuData object using scanpy's
+  `rank_genes_groups()` function.
+
+argument_groups:
+  - name: Inputs
+    description: Arguments that define the input
+    arguments:
+      - name: --input
+        type: file
+        required: true
+        example: input.h5mu
+        direction: input
+        description: Path to the input MuData H5MU file.
+      - name: --modality
+        type: string
+        default: rna
+        description: Name of the modality in the input MuData object.
+      - name: --input_layer
+        type: string
+        required: false
+        description: |
+          Name of the layer to use. This should contain log-normalized data.
+          If not set, the .X matrix will be used.
+      - name: --obs_clustering
+        type: string
+        required: true
+        example: leiden_0.5
+        description: |
+          Name of the `.obs` column that contains cluster assignments to
+          calculate marker genes for.
+
+  - name: Outputs
+    description: Arguments that define the output
+    arguments:
+      - name: --output
+        type: file
+        direction: output
+        default: output.h5mu
+        example: output.h5mu
+        description: Path to the output MuData H5MU file.
+      - name: --output_markers
+        type: file
+        direction: output
+        default: markers.csv
+        example: markers.csv
+        description: Path to the output CSV file where marker genes will be saved.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+  - name: Rank genes groups
+    description: Arguments for scanpy `rank_genes_groups()`
+    arguments:
+      - name: --method
+        type: string
+        default: t-test
+        choices: [logreg, t-test, wilcoxon, t-test_overestim_var]
+        description: Statistical test used to identify marker genes.
+      - name: --corr_method
+        type: string
+        default: benjamini-hochberg
+        choices: [benjamini-hochberg, bonferroni]
+        description: |
+          Method used to correct p-values for multiple testing. Not used when
+          `--method` is `logreg`.
+      - name: --tie_correct
+        type: boolean
+        default: false
+        description: |
+          Whether to apply tie correction to the Wilcoxon test statistic. Only
+          used when `--method` is `wilcoxon`.
+      - name: --key_added
+        type: string
+        default: rank_genes_groups
+        description: Key in `.uns` where the results will be stored.
+
+  - name: Filter rank genes groups
+    description: Arguments for scanpy `filter_rank_genes_groups()`
+    arguments:
+      - name: --filter_results
+        type: boolean
+        default: false
+        description: |
+          Whether to filter the results of `rank_genes_groups()` using
+          `filter_rank_genes_groups()`.
+      - name: --min_in_group_fraction
+        type: double
+        default: 0.25
+        description: Minimum in-group fraction required for a marker gene.
+      - name: --max_out_group_fraction
+        type: double
+        default: 0.5
+        description: Maximum out-group fraction allowed for a marker gene.
+      - name: --min_fold_change
+        type: double
+        default: 1
+        description: Minimum log fold change required for a marker gene.
+      - name: --compare_abs
+        type: boolean
+        default: false
+        description: |
+          Whether to compare against absolute fold changes when filtering.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/compress_h5mu.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, /src/base/requirements/anndata_mudata.yaml ]
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midcpu, midmem]
diff --git a/src/annotate/calculate_marker_genes_scanpy/script.py b/src/annotate/calculate_marker_genes_scanpy/script.py
@@ -0,0 +1,155 @@
+import sys
+
+import mudata as mu
+import scanpy as sc
+
+## VIASH START
+par = {
+    "input": "input.h5mu",
+    "modality": "rna",
+    "input_layer": None,
+    "obs_clustering": "leiden_0.5",
+    "output": "output.h5mu",
+    "output_markers": "markers.csv",
+    "output_compression": "gzip",
+    "method": "t-test",
+    "corr_method": "benjamini-hochberg",
+    "tie_correct": False,
+    "key_added": "rank_genes_groups",
+    "filter_results": False,
+    "min_in_group_fraction": 0.25,
+    "max_out_group_fraction": 0.5,
+    "min_fold_change": 1,
+    "compare_abs": False,
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from compress_h5mu import write_h5ad_to_h5mu_with_compression
+
+logger = setup_logger()
+
+
+def calculate_marker_genes(adata, par):
+    input_layer = par.get("input_layer")
+    if input_layer is not None:
+        if input_layer not in adata.layers:
+            raise ValueError(
+                f"'{input_layer}' is not a layer in the input data. Available layers: {list(adata.layers.keys())}."
+            )
+        logger.info("Using layer '%s'", input_layer)
+    else:
+        logger.info("Using .X matrix")
+
+    logger.info("Using '%s' method", par["method"])
+    sc.tl.rank_genes_groups(
+        adata,
+        groupby=par["obs_clustering"],
+        use_raw=False,
+        layer=input_layer,
+        method=par["method"],
+        corr_method=par["corr_method"],
+        tie_correct=par["tie_correct"],
+        pts=True,
+        key_added=par["key_added"],
+    )
+
+
+def filter_marker_genes(adata, par):
+    logger.info("Min in-group fraction: %s", par["min_in_group_fraction"])
+    logger.info("Min fold change: %s", par["min_fold_change"])
+    logger.info("Max out-group fraction: %s", par["max_out_group_fraction"])
+    logger.info("Compare absolute values: %s", par["compare_abs"])
+
+    filtered_key_added = f"{par['key_added']}_filtered"
+    sc.tl.filter_rank_genes_groups(
+        adata,
+        groupby=par["obs_clustering"],
+        use_raw=False,
+        key_added=filtered_key_added,
+        min_in_group_fraction=par["min_in_group_fraction"],
+        min_fold_change=par["min_fold_change"],
+        max_out_group_fraction=par["max_out_group_fraction"],
+        compare_abs=par["compare_abs"],
+    )
+
+    filtered_markers = {}
+    for group in adata.obs[par["obs_clustering"]].cat.categories:
+        filtered_results = sc.get.rank_genes_groups_df(
+            adata, group=group, key=filtered_key_added
+        )
+        filtered_markers[group] = filtered_results["names"].tolist()
+
+    filtering_params = adata.uns[filtered_key_added]["params"]
+    del adata.uns[filtered_key_added]
+    adata.uns[filtered_key_added] = {"params": filtering_params}
+    adata.uns[filtered_key_added]["filters"] = {
+        "min_in_group_fraction": par["min_in_group_fraction"],
+        "min_fold_change": par["min_fold_change"],
+        "max_out_group_fraction": par["max_out_group_fraction"],
+        "compare_abs": par["compare_abs"],
+    }
+    adata.uns[filtered_key_added]["markers"] = filtered_markers
+
+    return filtered_key_added, filtered_markers
+
+
+def format_results(adata, par, filtered_key_added=None):
+    results = sc.get.rank_genes_groups_df(adata, group=None, key=par["key_added"])
+
+    if par["filter_results"]:
+        results["is_marker"] = False
+        for group in adata.obs[par["obs_clustering"]].cat.categories:
+            group_markers = adata.uns[filtered_key_added]["markers"][group]
+            is_group_marker = results["names"].isin(group_markers) & (
+                results["group"] == group
+            )
+            results.loc[is_group_marker, "is_marker"] = True
+
+    return results
+
+
+def main(par):
+    logger.info(
+        "Loading AnnData from '%s' modality '%s'", par["input"], par["modality"]
+    )
+    adata = mu.read_anndata(par["input"], mod=par["modality"])
+
+    logger.info("Selecting clustering column '%s'", par["obs_clustering"])
+    if par["obs_clustering"] not in adata.obs.columns:
+        raise ValueError(
+            f"'{par['obs_clustering']}' is not a column in .obs of the input data"
+        )
+
+    logger.info("Calculating marker genes")
+    calculate_marker_genes(adata, par)
+
+    filtered_key_added = f"{par['key_added']}_filtered"
+    if par["filter_results"]:
+        logger.info("Filtering marker genes")
+        filtered_key_added, filtered_markers = filter_marker_genes(adata, par)
+        for group, markers in filtered_markers.items():
+            logger.info("Group '%s': %s markers", group, len(markers))
+
+    logger.info("Formatting results")
+    results = format_results(adata, par, filtered_key_added)
+
+    logger.info("Writing MuData output to '%s'", par["output"])
+    write_h5ad_to_h5mu_with_compression(
+        par["output"],
+        par["input"],
+        par["modality"],
+        adata,
+        par["output_compression"],
+    )
+
+    logger.info("Writing marker results to '%s'", par["output_markers"])
+    results.to_csv(par["output_markers"], index=False)
+
+    logger.info("Done")
+
+
+if __name__ == "__main__":
+    sys.exit(main(par))
diff --git a/src/annotate/calculate_marker_genes_scanpy/test.py b/src/annotate/calculate_marker_genes_scanpy/test.py
@@ -0,0 +1,90 @@
+import sys
+
+import mudata as mu
+import pandas as pd
+import pytest
+
+## VIASH START
+meta = {"resources_dir": "resources_test/"}
+## VIASH END
+
+input_path = meta["resources_dir"] + "/TS_Blood_filtered.h5mu"
+
+
+def test_default_execution(run_component, tmp_path):
+    output_path = tmp_path / "output.h5mu"
+    markers_path = tmp_path / "markers.csv"
+
+    run_component(
+        [
+            "--input",
+            input_path,
+            "--modality",
+            "rna",
+            "--obs_clustering",
+            "cell_type",
+            "--output",
+            str(output_path),
+            "--output_markers",
+            str(markers_path),
+        ]
+    )
+
+    assert output_path.is_file(), "Output MuData file was not created"
+    output_mdata = mu.read_h5mu(output_path)
+    assert "rna" in output_mdata.mod
+    assert "rank_genes_groups" in output_mdata["rna"].uns
+
+    assert markers_path.is_file(), "Output markers CSV was not created"
+    results = pd.read_csv(markers_path)
+    expected_columns = {
+        "group",
+        "names",
+        "scores",
+        "pvals",
+        "pvals_adj",
+        "logfoldchanges",
+    }
+    assert expected_columns.issubset(results.columns)
+    assert len(results) > 0
+    assert "is_marker" not in results.columns
+
+
+def test_filter_results(run_component, tmp_path):
+    output_path = tmp_path / "output_filtered.h5mu"
+    markers_path = tmp_path / "markers_filtered.csv"
+
+    run_component(
+        [
+            "--input",
+            input_path,
+            "--modality",
+            "rna",
+            "--obs_clustering",
+            "cell_type",
+            "--output",
+            str(output_path),
+            "--output_markers",
+            str(markers_path),
+            "--filter_results",
+            "true",
+        ]
+    )
+
+    output_mdata = mu.read_h5mu(output_path)
+    assert "rank_genes_groups" in output_mdata["rna"].uns
+
+    filtered_key = "rank_genes_groups_filtered"
+    assert filtered_key in output_mdata["rna"].uns
+    filtered_uns = output_mdata["rna"].uns[filtered_key]
+    assert "params" in filtered_uns
+    assert "filters" in filtered_uns
+    assert "markers" in filtered_uns
+
+    results = pd.read_csv(markers_path)
+    assert "is_marker" in results.columns
+    assert set(results["is_marker"].unique()).issubset({True, False})
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))