openpipelines-bio · dorien-er · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,12 +6,20 @@
 
 * `workflows/qc/qc`, `workflows/rna/rna_multisample`, `workflows/prot/prot_multisample`, `workflows/multiomics/process_samples` and `workflows/multiomics/process_batches`: expose the `--log1p_transform` flag (PR #1182).
 
+* `workflows/multiomics/process_singlesample`: now calculates basic QC metrics for the RNA and protein modalities by default, adding metric columns to `.obs` and `.var` of the output. This can be disabled with the new `--skip_qc_metrics` flag (PR #1148).
+
+* `workflows/rna/rna_singlesample`, `workflows/multiomics/process_singlesample`, `workflows/multiomics/process_samples`: cells classified as doublets by scrublet are now removed from the output. Scrublet detection runs on the full count matrix and tags cells, and the existing `do_filter` step now also applies the `filter_with_scrublet` tag (alongside the count, mitochondrial, ribosomal and percentile filters) in a single filtering pass. Previously doublets were only annotated, not removed (PR #1183).
+
 ## NEW FEATURES
 
 * `filter/filter_with_scrublet`: added `--scrublet_score_threshold` argument to allow manually setting the doublet score threshold instead of relying on automatic detection (PR #1183).
 
 * `workflows/multiomics/process_singlesample`, `workflows/multiomics/process_samples` and `workflows/rna/rna_singlesample`: surfaced `--scrublet_score_threshold` argument to allow manually setting the doublet score threshold instead of relying on automatic detection (PR #1183).
 
+* `workflows/rna/rna_singlesample`, `workflows/multiomics/process_singlesample`, `workflows/multiomics/process_samples`: surfaced additional scrublet pass-through arguments (`--scrublet_expected_doublet_rate`, `--scrublet_stdev_doublet_rate`, `--scrublet_n_neighbors`, `--scrublet_sim_doublet_ratio`, `--scrublet_min_counts`, `--scrublet_min_cells`, `--scrublet_min_gene_variability_percent`, `--scrublet_num_pca_components`, `--scrublet_distance_metric`, `--scrublet_allow_automatic_threshold_detection_fail`) to allow tuning doublet detection (PR #1183).
+
+* `workflows/rna/rna_singlesample`, `workflows/prot/prot_singlesample`, `workflows/multiomics/process_singlesample`, `workflows/multiomics/process_samples`: enable filtering of the RNA and protein modalities by percentile of log-transformed counts (PR #1148).
+
 ## MINOR CHANGES
 
 * `qc/calculate_qc_metrics`: parametrize the names of the top-n-vars `.obs` output columns with the `--output_obs_top_n_vars` flag (PR #1182).
@@ -33,7 +41,7 @@
 * `workflows/rna/rna_multisample`, `workflows/multiomics/process_batches`, `feature_annotation/highly_variable_features_scanpy`: add an option to exclude features before running highly variable gene calculation based on a user-defined list of feature names (PR #1121).
 
 * `annotate/consensus_vote`: new component computing a (weighted) majority vote across cell type labels from multiple annotation methods (PR #1151).
-* 
+
 * `filter/filter_with_quantile`: added a component to filter numerical .obs or .var columns based on quantile thresholds, with optional subsetting (PR #1146).
 
 * `dimred/pca`: added possibility to do chunked processing using arguments `chunks` and `chunk_size`. Also added a `seed` argument in order to better control the variability between executions (PR #1157).

diff --git a/src/filter/filter_with_scrublet/script.py b/src/filter/filter_with_scrublet/script.py
@@ -84,7 +84,7 @@
     keep_cells = np.invert(predicted_doublets)
 except TypeError:
     # Scrublet might not throw an error and return None if it fails to detect doublets...
-    if par["scrublet_score_threshold"]:
+    if par["scrublet_score_threshold"] is not None:
         raise RuntimeError(
             "Scrublet could not detect doublets even with a manual threshold set."
         )

diff --git a/src/workflows/multiomics/process_samples/config.vsh.yaml b/src/workflows/multiomics/process_samples/config.vsh.yaml
@@ -14,6 +14,8 @@ info:
       namespace: test_workflows/multiomics/process_samples
     - name: assert_test_workflow_10_output
       namespace: test_workflows/multiomics/process_samples
+    - name: assert_test_workflow_11_output
+      namespace: test_workflows/multiomics/process_samples
 authors:
   - __merge__: /src/authors/dries_schaumont.yaml
     roles: [ author, maintainer ]
@@ -83,6 +85,19 @@ argument_groups:
         type: integer
         description: Maximum number of counts captured per cell.
 
+      - name: "--rna_min_percentile_counts"
+        example: 0.05
+        type: double
+        description: |
+          Minimum percentile of total RNA counts captured per cell. Quantile-based filtering is always
+          performed on the log-transformed total counts.
+      - name: "--rna_max_percentile_counts"
+        example: 0.95
+        type: double
+        description: |
+          Maximum percentile of total RNA counts captured per cell. Quantile-based filtering is always
+          performed on the log-transformed total counts.
+
       - name: "--rna_min_genes_per_cell"
         type: integer
         example: 200
@@ -128,6 +143,66 @@ argument_groups:
           Manual doublet score threshold passed to filter_with_scrublet. Cells with a
           doublet score above this value are classified as doublets. If not provided,
           the threshold is determined automatically by Scrublet.
+      - name: "--scrublet_expected_doublet_rate"
+        type: double
+        required: false
+        min: 0
+        max: 1
+        description: |
+          The estimated fraction of doublets expected from the experimental setup, passed to
+          filter_with_scrublet. If not provided, the component default is used.
+      - name: "--scrublet_stdev_doublet_rate"
+        type: double
+        required: false
+        min: 0
+        description: Uncertainty in the expected doublet rate, passed to filter_with_scrublet.
+      - name: "--scrublet_n_neighbors"
+        type: integer
+        required: false
+        min: 0
+        description: |
+          Number of neighbors used to construct the KNN classifier of observed transcriptomes
+          and simulated doublets, passed to filter_with_scrublet.
+      - name: "--scrublet_sim_doublet_ratio"
+        type: double
+        required: false
+        min: 0
+        description: |
+          Number of doublets to simulate relative to the number of observed transcriptomes,
+          passed to filter_with_scrublet.
+      - name: "--scrublet_min_counts"
+        type: integer
+        required: false
+        description: |
+          The minimum number of UMI counts per cell that have to be present for initial cell
+          detection during scrublet doublet detection.
+      - name: "--scrublet_min_cells"
+        type: integer
+        required: false
+        description: |
+          The minimum number of cells in which UMIs for a gene must be detected, used during scrublet
+          doublet detection.
+      - name: "--scrublet_min_gene_variability_percent"
+        type: double
+        required: false
+        description: |
+          Keep the most highly variable genes (in the top percentile) as measured by the
+          v-statistic, used for gene filtering prior to PCA during scrublet doublet detection.
+      - name: "--scrublet_num_pca_components"
+        type: integer
+        required: false
+        description: |
+          Number of principal components used during PCA dimensionality reduction in scrublet
+          doublet detection.
+      - name: "--scrublet_distance_metric"
+        type: string
+        required: false
+        description: The distance metric used for computing similarities during scrublet doublet detection.
+      - name: "--scrublet_allow_automatic_threshold_detection_fail"
+        type: boolean_true
+        description: |
+          When scrublet fails to automatically determine the doublet score threshold, allow the
+          pipeline to continue and set the output columns to NA.
 
   - name: "CITE-seq filtering options"
     arguments:
@@ -140,6 +215,19 @@ argument_groups:
         type: integer
         example: 5000000
 
+      - name: "--prot_min_percentile_counts"
+        example: 0.05
+        type: double
+        description: |
+          Minimum percentile of total protein counts captured per cell. Quantile-based filtering is always
+          performed on the log-transformed total counts.
+      - name: "--prot_max_percentile_counts"
+        example: 0.95
+        type: double
+        description: |
+          Maximum percentile of total protein counts captured per cell. Quantile-based filtering is always
+          performed on the log-transformed total counts.
+
       - name: "--prot_min_proteins_per_cell"
         type: integer
         example: 200
@@ -374,6 +462,12 @@ test_resources:
   - type: nextflow_script
     path: test.nf
     entrypoint: test_wf9
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf10
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf11
   - path: /resources_test/concat_test_data
   - path: /resources_test/pbmc_1k_protein_v3
   - path: /resources_test/10x_5k_lung_crispr

diff --git a/src/workflows/multiomics/process_samples/integration_test.sh b/src/workflows/multiomics/process_samples/integration_test.sh
@@ -102,18 +102,26 @@ nextflow \
   -c src/workflows/utils/labels_ci.config \
   -c src/workflows/utils/integration_tests.config
 
-  nextflow \
+nextflow \
   run . \
   -main-script src/workflows/multiomics/process_samples/test.nf \
   -entry test_wf9 \
   -profile docker,no_publish \
   -c src/workflows/utils/labels_ci.config \
   -c src/workflows/utils/integration_tests.config
 
-  nextflow \
+nextflow \
   run . \
   -main-script src/workflows/multiomics/process_samples/test.nf \
   -entry test_wf10 \
   -profile docker,no_publish \
   -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/integration_tests.config
+
+nextflow \
+  run . \
+  -main-script src/workflows/multiomics/process_samples/test.nf \
+  -entry test_wf11 \
+  -profile docker,no_publish \
+  -c src/workflows/utils/labels_ci.config \
   -c src/workflows/utils/integration_tests.config
diff --git a/src/workflows/multiomics/process_samples/main.nf b/src/workflows/multiomics/process_samples/main.nf
@@ -14,6 +14,8 @@ workflow run_wf {
       "add_id_make_observation_keys_unique": "add_id_make_observation_keys_unique",
       "rna_min_counts": "rna_min_counts",
       "rna_max_counts": "rna_max_counts",
+      "rna_min_percentile_counts": "rna_min_percentile_counts",
+      "rna_max_percentile_counts": "rna_max_percentile_counts",
       "rna_min_genes_per_cell": "rna_min_genes_per_cell",
       "rna_max_genes_per_cell": "rna_max_genes_per_cell",
       "rna_min_cells_per_gene": "rna_min_cells_per_gene",
@@ -23,8 +25,20 @@ workflow run_wf {
       "rna_max_fraction_ribo": "rna_max_fraction_ribo",
       "skip_scrublet_doublet_detection": "skip_scrublet_doublet_detection",
       "scrublet_score_threshold": "scrublet_score_threshold",
+      "scrublet_expected_doublet_rate": "scrublet_expected_doublet_rate",
+      "scrublet_stdev_doublet_rate": "scrublet_stdev_doublet_rate",
+      "scrublet_n_neighbors": "scrublet_n_neighbors",
+      "scrublet_sim_doublet_ratio": "scrublet_sim_doublet_ratio",
+      "scrublet_min_counts": "scrublet_min_counts",
+      "scrublet_min_cells": "scrublet_min_cells",
+      "scrublet_min_gene_variability_percent": "scrublet_min_gene_variability_percent",
+      "scrublet_num_pca_components": "scrublet_num_pca_components",
+      "scrublet_distance_metric": "scrublet_distance_metric",
+      "scrublet_allow_automatic_threshold_detection_fail": "scrublet_allow_automatic_threshold_detection_fail",
       "prot_min_counts": "prot_min_counts",
       "prot_max_counts": "prot_max_counts",
+      "prot_min_percentile_counts": "prot_min_percentile_counts",
+      "prot_max_percentile_counts": "prot_max_percentile_counts",
       "prot_min_proteins_per_cell": "prot_min_proteins_per_cell",
       "prot_max_proteins_per_cell": "prot_max_proteins_per_cell",
       "prot_min_cells_per_protein": "prot_min_cells_per_protein",

diff --git a/src/workflows/multiomics/process_samples/test.nf b/src/workflows/multiomics/process_samples/test.nf
@@ -8,6 +8,7 @@ include { process_samples } from targetDir + "/workflows/multiomics/process_samp
 include { assert_test_workflow_2_output } from targetTestDir + "/test_workflows/multiomics/process_samples/assert_test_workflow_2_output/main.nf"
 include { assert_test_workflow_9_output } from targetTestDir + "/test_workflows/multiomics/process_samples/assert_test_workflow_9_output/main.nf"
 include { assert_test_workflow_10_output } from targetTestDir + "/test_workflows/multiomics/process_samples/assert_test_workflow_10_output/main.nf"
+include { assert_test_workflow_11_output } from targetTestDir + "/test_workflows/multiomics/process_samples/assert_test_workflow_11_output/main.nf"
 
 params.resources_test = params.rootDir + "/resources_test"
 
@@ -61,6 +62,9 @@ workflow test_wf2 {
         prot_min_counts: 3,
         add_id_to_obs: true,
         add_id_make_observation_keys_unique: true,
+        // Skip scrublet so the raw-expression equality checks hold (doublet
+        // removal would otherwise drop RNA cells from the output).
+        skip_scrublet_doublet_detection: true,
         add_id_obs_output: "sample_id"
     ],
     [
@@ -84,6 +88,7 @@ workflow test_wf2 {
       obs_name_ribosomal_fraction: 'fraction_ribosomal',
       add_id_to_obs: true,
       add_id_make_observation_keys_unique: true,
+      skip_scrublet_doublet_detection: true,
       add_id_obs_output: "sample_id"
     ],
   ])
@@ -516,6 +521,60 @@ workflow test_wf10 {
       [id, new_state]
     }
     | assert_test_workflow_10_output.run(
+
+      fromState: [
+        "input": "output",
+        "orig_input": "input"
+      ],
+    )
+}
+
+workflow test_wf11 {
+
+  resources_test = file(params.resources_test)
+
+  input_ch = Channel.fromList([
+    [
+      id: "pbmc",
+      input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"),
+      rna_min_counts: 2,
+      rna_min_percentile_counts: 0.05,
+      rna_max_percentile_counts: 0.95,
+      prot_min_counts: 3,
+      prot_min_percentile_counts: 0.05,
+      prot_max_percentile_counts: 0.95,
+      add_id_to_obs: true,
+      add_id_make_observation_keys_unique: true,
+      add_id_obs_output: "sample_id"
+    ]
+  ])
+  | map{ state -> [state.id, state] }
+
+  processed_ch = input_ch
+    | process_samples.run(
+        toState: { id, output, state -> output }
+    )
+
+  assert_ch = processed_ch
+    | toSortedList()
+    | map { output_list ->
+      assert output_list.size() == 1 : "output channel should contain one event"
+      assert output_list[0][0] == "merged" : "Output ID should be 'merged'"
+      output_list
+    }
+
+  test_ch = processed_ch.combine(input_ch)
+    | map {output_id, output_state, input_id, input_state ->
+      def new_event = [output_id, output_state + ["input": input_state.input]]
+      return new_event
+    }
+    | groupTuple()
+    | map { id, events ->
+      def output_file = events[0].output
+      def new_state = ["output": output_file, "input": events.collect{it.input}]
+      [id, new_state]
+    }
+    | assert_test_workflow_11_output.run(
       fromState: [
         "input": "output",
         "orig_input": "input"