openpipelines-bio
diff --git a/‎.github/workflows/integration-test.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/integration-test.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/release-build.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/release-build.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/viash-test.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/viash-test.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 28 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎src/annotate/consensus_vote/config.vsh.yaml‎
Lines changed: 111 additions & 0 deletions b/‎src/annotate/consensus_vote/config.vsh.yaml‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎src/annotate/consensus_vote/script.py‎
Lines changed: 123 additions & 0 deletions b/‎src/annotate/consensus_vote/script.py‎
Lines changed: 123 additions & 0 deletions
@@ -73,7 +73,7 @@ jobs:
 
     - uses: viash-io/viash-actions/setup@v6
 
-    - uses: nf-core/setup-nextflow@v2.1.4
+    - uses: nf-core/setup-nextflow@v3.0.0
 
     # use cache
     - name: Cache resources data
 
@@ -62,7 +62,7 @@ jobs:
 
     - uses: viash-io/viash-actions/setup@v6
 
-    - uses: nf-core/setup-nextflow@v2.1.4
+    - uses: nf-core/setup-nextflow@v3.0.0
 
     # use cache
     - name: Cache resources data
 
@@ -21,6 +21,7 @@ jobs:
         uses: actions/setup-python@v6
       - uses: r-lib/actions/setup-r@v2
         with:
+          r-version: 4.5.3
           use-public-rspm: true
       - run: python -m pip install pre-commit
         shell: bash
 
@@ -8,10 +8,26 @@
 
 * `workflows/rna/rna_multisample`, `workflows/multiomics/process_batches`, `feature_annotation/highly_variable_features_scanpy`: add an option to exclude features before running highly variable gene calculation based on a user-defined list of feature names (PR #1121).
 
+* `annotate/consensus_vote`: new component computing a (weighted) majority vote across cell type labels from multiple annotation methods (PR #1151).
+
+* `filter/filter_with_quantile`: added a component to filter numerical .obs or .var columns based on quantile thresholds, with optional subsetting (PR #1146).
+
+* `dimred/pca`: added possibility to do chunked processing using arguments `chunks` and `chunk_size`. Also added a `seed` argument in order to better control the variability between executions (PR #1157).
+
+* `workflows/multiomics/process_singlesample`: New workflow for processing RNA, protein and GDO modalities of individual samples (PR #1147).
+
+* `transform/clear_slots`: New component that can be used to remove all items from slots of a MuData object (PR #1171).
+
+* `workflows/multiomics/process_singlesample`, `workflows/multiomics/process_samples`, `workflows/multiomics/process_batches`: add `--intersect_obs` option to remove observations that are not present in all processed modalities, so each modality shares the same set of cells (PR #1173, PR #1175).
+
+* `labels_transfer/cellmapper`: New component that transfers labels from a reference to a query with a shared embedding using CellMapper (PR #1169, PR #1177).
+
 ## MAJOR CHANGES
 
 * `qc/calculate_qc_metrics`: major improvements to memory consumption and runtimes (PR #1140).
 
+* `annotate/popv`: bump version to 0.6.1 (PR #1167).
+
 ## MINOR CHANGES
 
 * `dataflow/split_modalities`: improve memory consumption by only reading one modality at the same time (PR #1152).
@@ -28,6 +44,18 @@
 
 * `workflows/annotation/scanvi_scarches`: set `--input_obs_batch_label` and `--reference_obs_batch_label` defaults to `sample_id` and `--reference_var_hvg` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
 
+* `cluster/leiden`: added `flavor`, `n_iterations` and `seed` arguments (PR #1132).
+
+* `cluster/leiden`: avoid creating unnecessary copies of the output data (PR #1132).
+
+* `workflows/multiomics/process_samples`: refactored to use a shared `process_singlesample_base` subworkflow, which is also used by the new `process_singlesample` workflow to avoid code duplication (PR #1147).
+
+* Bump anndata to `0.12.11` (PR #1174).
+
+* Add missing `example` fields to several component and workflow configurations (PR #1067).
+
+* Testing: bump `viashpy` to 0.10.0 (PR #1178).
+
 ## BUG FIXES
 
 * `dataflow/split_h5mu`: pin scipy version to 1.16.3 to avoid regression that corrupts large sparse matrix indexing (PR #1153).
 
@@ -0,0 +1,111 @@
+# Component identity and human-readable summary for `annotate/consensus_vote`.
+name: consensus_vote
+namespace: annotate
+scope: "public"
+# Literal block scalar: every character (including trailing spaces) is part of
+# the description, so lines must not end in whitespace.
+description: |
+  Combines cell type predictions from multiple annotation methods into a single consensus prediction using a weighted majority vote.
+  For each cell, each method votes for its predicted cell type, optionally weighted by the probability score and/or a per-method weight.
+  The consensus prediction is the cell type with the highest total weighted vote.
+  Note that this method does not leverage pre-existing ontology or perform any reconciliation of cell type labels across methods, so the same cell type may be represented by different labels in different methods and will be treated as distinct cell types in the vote.
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author ]
+
+argument_groups:
+  # Input file, modality and the `.obs` columns that take part in the vote.
+  - name: Inputs
+    description: Input dataset arguments.
+    arguments:
+      - name: "--input"
+        type: file
+        description: Input h5mu file containing cell type predictions in .obs.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        type: string
+        description: Which modality to process.
+        default: "rna"
+        required: false
+      - name: "--input_obs_predictions"
+        type: string
+        description: |
+          One or more .obs column names containing cell type predictions (labels) from
+          different annotation methods.
+        required: true
+        multiple: true
+        example: ["scanvi_pred", "celltypist_pred"]
+      - name: "--input_obs_probabilities"
+        type: string
+        description: |
+          One or more .obs column names containing prediction probability scores,
+          one per method in --input_obs_predictions. When provided, each method's
+          vote is scaled by the probability score for that cell (in addition to
+          any per-method --weights). Must be the same length as --input_obs_predictions.
+        required: false
+        multiple: true
+        # Keep one entry per entry of the --input_obs_predictions example:
+        # the script rejects lists of different lengths.
+        example: ["scanvi_prob", "celltypist_prob"]
+      - name: "--tie_label"
+        type: string
+        description: |
+          Label to assign when two or more cell types receive equal votes.
+          If not provided, tied cells are assigned None (missing value).
+        required: false
+        example: "Unknown"
+      - name: "--weights"
+        type: double
+        description: |
+          Per-method weights for the consensus vote. Must be the same length as
+          --input_obs_predictions when provided. Weights are normalized to sum to 1
+          before use. If not provided, all methods are weighted equally.
+        required: false
+        multiple: true
+        # One weight per entry of the --input_obs_predictions example.
+        example: [1.0, 2.0]
+
+  # Destination file and the names of the `.obs` columns that receive the result.
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        alternatives: ["-o"]
+        type: file
+        direction: output
+        description: Output h5mu file.
+        example: "output.h5mu"
+      - name: "--output_obs_predictions"
+        type: string
+        required: false
+        default: "consensus_pred"
+        description: |
+          In which `.obs` slot to store the consensus predicted cell type.
+      - name: "--output_obs_score"
+        type: string
+        required: false
+        default: "consensus_score"
+        description: |
+          In which `.obs` slot to store the consensus score, defined as the fraction
+          of total weight assigned to the winning cell type.
+    # Combine this group with the shared compression argument definition.
+    # NOTE(review): presumably this supplies `--output_compression`, which
+    # script.py reads as par["output_compression"] - confirm.
+    __merge__:
+      - "."
+      - /src/base/h5_compression_argument.yaml
+
+# Files shipped with the component: the entry script plus two shared helpers
+# that script.py imports (setup_logger, compress_h5mu).
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/utils/compress_h5mu.py
+
+test_resources:
+  - type: python_script
+    path: test.py
+
+# Container definition; Python requirements are merged in from shared files.
+engines:
+  - type: docker
+    image: "python:3.13-slim"
+    setup:
+      - type: apt
+        packages: [procps]
+      - type: python
+        __merge__:
+          - /src/base/requirements/anndata_mudata.yaml
+          - "."
+    __merge__:
+      - /src/base/requirements/python_test_setup.yaml
+      - "."
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label:
+        - lowcpu
+        - lowmem
+        - lowdisk
@@ -0,0 +1,123 @@
+import sys
+import mudata as mu
+import numpy as np
+import pandas as pd
+
+# Placeholder parameters and metadata so the script can be run directly
+# during development.
+# NOTE(review): presumably Viash replaces everything between the
+# VIASH START / VIASH END markers with the real argument values at build
+# time - confirm against the Viash documentation.
+## VIASH START
+par = {
+    "input": "test_with_probabilities.h5mu",
+    "modality": "rna",
+    "input_obs_predictions": ["scanvi_pred", "celltypist_pred", "singler_pred"],
+    "input_obs_probabilities": ["scanvi_prob", "celltypist_prob", "singler_prob"],
+    "weights": None,
+    "tie_label": None,
+    "output": "consensus_test_output.h5mu",
+    "output_obs_predictions": "consensus_pred",
+    "output_obs_score": "consensus_score",
+    "output_compression": "gzip",
+}
+meta = {"resources_dir": "src/utils"}
+## VIASH END
+
+# The helper modules are looked up in the resources directory, so it has to be
+# on the import path before they are imported.
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+from compress_h5mu import write_h5ad_to_h5mu_with_compression
+
+logger = setup_logger()
+
+
+def main():
+    prediction_cols = par["input_obs_predictions"]
+    prob_cols = par["input_obs_probabilities"]
+    weights = par["weights"]
+
+    if weights and len(weights) != len(prediction_cols):
+        raise ValueError(
+            f"--weights must have the same length as --input_obs_predictions. "
+            f"Got {len(weights)} weights for {len(prediction_cols)} prediction columns."
+        )
+    if prob_cols and len(prob_cols) != len(prediction_cols):
+        raise ValueError(
+            f"--input_obs_probabilities must have the same length as --input_obs_predictions. "
+            f"Got {len(prob_cols)} probability columns for {len(prediction_cols)} prediction columns."
+        )
+
+    logger.info("Reading input data.")
+    adata = mu.read_h5ad(par["input"], mod=par["modality"])
+
+    cols_to_check = [prediction_cols]
+    if prob_cols:
+        cols_to_check.append(prob_cols)
+    for cols in cols_to_check:
+        for col in cols:
+            if col not in adata.obs.columns:
+                raise ValueError(f"Column '{col}' not found in .obs.")
+
+    # Each method is treated equally by default, unless user specific weights are provided
+    n_methods = len(prediction_cols)
+    logger.info("Initializing weights to matrix of ones")
+    weights_arr = np.ones(n_methods, dtype=np.float32)
+    if weights:
+        logger.info("Applying user-provided weights.")
+        weights_arr = np.array(weights, dtype=np.float32)
+    logger.info("Normalizing weights")
+    weights_arr = weights_arr / weights_arr.sum()
+
+    # Apply the weights to the probabilities in the data
+    weights = pd.DataFrame(
+        [weights_arr] * adata.n_obs, index=adata.obs.index, columns=prediction_cols
+    )
+    if prob_cols:
+        logger.info("Scaling the weights with the probabilities from each method")
+        weights = weights * adata.obs[prob_cols].astype(np.float32).to_numpy()
+        assert pd.notna(weights).all(axis=None)
+
+    logger.info("Computing weighted majority vote.")
+    pred_df = adata.obs[prediction_cols].astype(str)
+
+    # For each cell and each method (index), get the label and the weight
+    incidences_weights = pd.DataFrame(
+        {"label": pred_df.stack(), "weights": weights.stack()}
+    )
+    # Move the label to the index, there might be duplicate indices now
+    incidences_weights = incidences_weights.set_index("label", append=True).rename_axis(
+        ["cell_id", "method", "label"]
+    )
+    # Sum the weights per label, from this the labels with the largest weights need to be selected
+    summed_weights = incidences_weights.groupby(level=["cell_id", "label"]).sum()
+    # Find the weight that is the largest per group
+    max_weight_per_group = summed_weights.groupby(level="cell_id").transform("max")
+    # Use the value to look-up the corresponding IDs and labels
+    max_weights_mask = summed_weights["weights"] == max_weight_per_group["weights"]
+    entries_for_max_weights = summed_weights[max_weights_mask].reset_index(
+        level="label"
+    )
+    # Find the cases where there is a tie
+    is_duplicated = max_weights_mask.groupby(level="cell_id").sum() > 1
+    # For the ties, overwrite the label. If a cell is in the frame more than once it is because of a tie.
+    entries_for_max_weights.loc[is_duplicated, ["label"]] = par["tie_label"]
+    # Now its safe to just take the first index in case of duplicates, since the label and the score is the same.
+    entries_for_max_weights = entries_for_max_weights[
+        ~entries_for_max_weights.index.duplicated()
+    ]
+    # Normalize the weights
+    normalized_scores = (
+        entries_for_max_weights["weights"]
+        / incidences_weights["weights"].groupby(level="cell_id").sum()
+    )
+    # Handle devision by 0
+    normalized_scores = normalized_scores.replace([np.inf, -np.inf], 0.0).fillna(0.0)
+    logger.info("Moving the output to the anndata.")
+    adata.obs[par["output_obs_predictions"]] = entries_for_max_weights["label"].astype(
+        "category"
+    )
+    adata.obs[par["output_obs_score"]] = normalized_scores
+
+    logger.info("Writing output data...")
+    write_h5ad_to_h5mu_with_compression(
+        par["output"], par["input"], par["modality"], adata, par["output_compression"]
+    )
+
+
+if __name__ == "__main__":
+    main()