From ba388ef1af9ad17307528a883c7515b80f111cac Mon Sep 17 00:00:00 2001
From: Xichen Wu <xichenwu@outlook.com>
Date: Tue, 17 Oct 2023 15:34:09 +0200
Subject: [PATCH 1/4] add dsb component

---
 .../dsb_index/config.vsh.yaml                 | 48 +++++++++++
 src/protein_processing/dsb_index/script.py    | 84 +++++++++++++++++++
 src/protein_processing/dsb_index/test.py      | 50 +++++++++++
 .../dsb_normalize/config.vsh.yaml             | 61 ++++++++++++++
 .../dsb_normalize/script.py                   | 66 +++++++++++++++
 src/protein_processing/dsb_normalize/test.py  | 48 +++++++++++
 6 files changed, 357 insertions(+)
 create mode 100644 src/protein_processing/dsb_index/config.vsh.yaml
 create mode 100644 src/protein_processing/dsb_index/script.py
 create mode 100644 src/protein_processing/dsb_index/test.py
 create mode 100644 src/protein_processing/dsb_normalize/config.vsh.yaml
 create mode 100644 src/protein_processing/dsb_normalize/script.py
 create mode 100644 src/protein_processing/dsb_normalize/test.py

diff --git a/src/protein_processing/dsb_index/config.vsh.yaml b/src/protein_processing/dsb_index/config.vsh.yaml
new file mode 100644
index 00000000000..4a7c47f6fbb
--- /dev/null
+++ b/src/protein_processing/dsb_index/config.vsh.yaml
@@ -0,0 +1,48 @@
+functionality:
+  name: dsb_index
+  namespace: protein_processing
+  description: "Filter background and foreground signals for normalising protein expression with DSB (Denoised and Scaled by Background)."
+  authors:
+    - name: Xichen Wu
+  arguments:
+    - name: "--data_raw"
+      type: file
+      required: true
+      description: "A ``MuData`` object containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities."
+    - name: "--cell_index"
+      type: file
+      description: "A csv file containing filtered cell barcodes."
+    - name: "--empty_counts_range"
+      type: double
+      description: "Specify the minimum and maximum log10-counts for a droplet to be considered empty."
+      multiple: true
+    - name: "--cell_counts_range"
+      type: double
+      description: "Specify the minimum and maximum log10-counts for a droplet to be considered not empty."
+      multiple: true
+    - name: "--output"
+      type: file
+      direction: output
+      description: dsb_index output directory
+      example: "dsb_output"
+  resources:
+    - type: python_script
+      path: script.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: ../../../resources_test/pbmc_1k_protein_v3
+platforms:
+  - type: docker
+    image: python:3.8
+    setup:
+      - type: python
+        packages:
+          - scanpy~=1.9.1
+          - muon
+          - numpy
+          - mudata~=0.2.0
+          - anndata~=0.8.0
+  - type: nextflow
+    directives:
+      label: lowcpu
diff --git a/src/protein_processing/dsb_index/script.py b/src/protein_processing/dsb_index/script.py
new file mode 100644
index 00000000000..6a961d2f99c
--- /dev/null
+++ b/src/protein_processing/dsb_index/script.py
@@ -0,0 +1,84 @@
+from warnings import warn
+import scanpy as sc
+import muon as mu
+import numpy as np
+from mudata import MuData
+import pandas as pd
+import os
+from anndata import AnnData
+
+## VIASH START
+par = {
+    "data_raw": "mudata_raw.h5mu",
+    "output": "dsb_processed",
+    "cell_index": None,
+    "empty_counts_range": [1.5, 2.8],
+    "cell_counts_range": None
+}
+
+## VIASH END
+
+if par['data_raw'].endswith('h5mu'):
+    raw_data = mu.read_h5mu(par['data_raw'])
+    if "prot" not in raw_data.mod or "rna" not in raw_data.mod:
+        raise TypeError("Raw data does not contain 'prot' or 'rna' modalities")
+    if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs:
+        raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.")
+else:
+    raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities")
+
+droplet_barcode = raw_data.mod["prot"].obs_names
+if par["cell_index"] is not None:
+    cell_barcode = pd.read_csv(par["cell_index"],header=None).iloc[:, 0].tolist()
+    # cell_barcode = list(set(droplet_barcode) & set(cell_barcode))
+    # [idx for idx in droplet_barcode if idx in cell_index_list_given]
+    empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
+    # [idx for idx in droplet_barcode if idx not in cell_index_list_given]
+
+else:
+    cell_barcode = None
+    empty_barcode = None
+
+log10umi = np.log10(np.asarray(raw_data.mod["rna"].X.sum(axis=1)).squeeze() + 1)
+
+if par['empty_counts_range'] is not None:
+    if len(par['empty_counts_range']) != 2:
+        raise ValueError("Invalid count ranges provided for the empty droplets.")
+    if par['cell_counts_range'] is not None and max(*par['empty_counts_range']) > min(*par['cell_counts_range']):
+        raise ValueError("Overlapping count ranges")
+    empty_idx = np.where(
+        (log10umi >= min(*par['empty_counts_range'])) & (log10umi < max(*par['empty_counts_range'])))[0]
+    empty_idx = droplet_barcode[empty_idx]
+    if empty_barcode is not None:
+        empty_barcode = list(set(empty_barcode) & set(empty_idx))
+        # [idx for idx in empty_barcode if idx in empty_idx]
+    else:
+        empty_barcode = empty_idx
+
+
+if par['cell_counts_range'] is not None:
+    if len(par['cell_counts_range']) != 2:
+        raise ValueError("Invalid count ranges provided for true cells.")
+    cell_idx = np.where(
+    (log10umi >= min(*par['cell_counts_range'])) & (log10umi < max(*par['cell_counts_range'])))[0]
+    cell_idx = droplet_barcode[cell_idx]
+    if cell_barcode is not None:
+        cell_barcode = list(set(cell_barcode) & set(cell_idx))
+        #[idx for idx in cell_barcode if idx in cell_idx]
+    else:
+        cell_barcode = cell_idx
+
+if empty_barcode is None:
+    if cell_barcode is None:
+        raise ValueError("Neither cell_index nor counts ranges for empty droplets "
+                         "or cells provided for filtering empty droplets.")
+    empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
+    # [idx for idx in droplet_barcode if idx not in cell_barcode]
+elif cell_barcode is None:
+    cell_barcode = list(set(droplet_barcode).difference(empty_barcode))
+    # [idx for idx in droplet_barcode if idx not in empty_barcode]
+
+if not os.path.exists(par["output"]):
+    os.makedirs(par["output"])
+pd.DataFrame(cell_barcode).to_csv(os.path.join(par["output"], "cell_idx.csv"), header=None, index=None)
+pd.DataFrame(empty_barcode).to_csv(os.path.join(par["output"], "empty_idx.csv"), header=None, index=None)
diff --git a/src/protein_processing/dsb_index/test.py b/src/protein_processing/dsb_index/test.py
new file mode 100644
index 00000000000..284871a808e
--- /dev/null
+++ b/src/protein_processing/dsb_index/test.py
@@ -0,0 +1,50 @@
+import subprocess
+from os import path
+import muon as mu
+import logging
+from sys import stdout
+import pandas as pd
+
+## VIASH START
+meta = {
+    'functionality_name': 'dsb_index',
+    'executable': './target/dsb_index',
+    'resources_dir': 'resources_test/'
+    
+}
+## VIASH END
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler(stdout)
+logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+console_handler.setFormatter(logFormatter)
+logger.addHandler(console_handler)
+
+input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu"
+cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
+output_cell_idx = "dsb_output/cell_idx.csv"
+output_empty_idx = "dsb_output/empty_idx.csv"
+cmd_pars = [
+    meta["executable"],
+    "--data_raw", input,
+    "--output", "dsb_output",
+    "--cell_index", cell_index,
+    "--empty_counts_range", "1.5:2.8"
+]
+try:
+    subprocess.check_output(cmd_pars)
+except subprocess.CalledProcessError as e:
+    print(e.stdout.decode("utf-8"))
+    raise e
+
+logger.info("> Check if output was created.")
+assert path.exists(output_cell_idx), "No output for cell index was created."
+assert path.exists(output_empty_idx), "No output for empty index was created."
+
+
+logger.info("> Check whether output cell index has the samw shape as the cell_index input.")
+cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist()
+output_cell_index = pd.read_csv(output_cell_idx, header=None).iloc[:, 0].tolist()
+assert len(output_cell_index) == len(cell_index), 'Output cell index has the samw shape as the cell_index input.'
+
diff --git a/src/protein_processing/dsb_normalize/config.vsh.yaml b/src/protein_processing/dsb_normalize/config.vsh.yaml
new file mode 100644
index 00000000000..72d020abb95
--- /dev/null
+++ b/src/protein_processing/dsb_normalize/config.vsh.yaml
@@ -0,0 +1,61 @@
+functionality:
+  name: dsb_normalize
+  namespace: protein_processing
+  description: "Normalize protein expression with DSB (Denoised and Scaled by Background)."
+  authors:
+    - name: Xichen Wu
+  arguments:
+    - name: "--data_raw"
+      type: file
+      required: true
+      description: "AnnData object with protein expression counts or MuData object with 'prot' modality containing raw (unfiltered, including empty droplets) data."
+    - name: "--cell_index"
+      type: file
+      description: "A csv file containing filtered cell barcodes."
+    - name: "--empty_index"
+      type: file
+      description: "A csv file containing empty cell barcodes."
+    - name: "--pseudocount"
+      type: integer
+      default: 10
+      description: "Pseudocount to add before log-transform."
+    - name: "--denoise_counts"
+      type: boolean_true
+      description: "Whether to perform denoising."
+    - name: "--isotype_controls"
+      type: string
+      multiple: true
+      description: "Names of the isotype controls. If ``None``, isotype controls will not be used."
+    - name: "--add_layer"
+      type: boolean_true
+      description: "Whether to add a ``'dsb'`` layer instead of assigning to the X matrix."
+    - name: "--random_state"
+      type: integer
+      default: 1
+      description: "Random seed."
+    - name: "--output"
+      type: file
+      direction: output
+      description: dsb_normalize output directory
+      example: "dsb_output"
+  resources:
+    - type: python_script
+      path: script.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: ../../../resources_test/pbmc_1k_protein_v3
+platforms:
+  - type: docker
+    image: python:3.8
+    setup:
+      - type: python
+        packages:
+          - scanpy~=1.9.1
+          - muon
+          - numpy
+          - mudata~=0.2.0
+          - anndata~=0.8.0
+  - type: nextflow
+    directives:
+      label: midcpu
diff --git a/src/protein_processing/dsb_normalize/script.py b/src/protein_processing/dsb_normalize/script.py
new file mode 100644
index 00000000000..2e12b0fbf6f
--- /dev/null
+++ b/src/protein_processing/dsb_normalize/script.py
@@ -0,0 +1,66 @@
+from warnings import warn
+import os
+import scanpy as sc
+import muon as mu
+import numpy as np
+from mudata import MuData
+import pandas as pd
+from anndata import AnnData
+from muon import prot as pt
+
+## VIASH START
+par = {
+    "data_raw": "../dsb_index/mudata_raw.h5mu",
+    "output": "dsb_processed",
+    "cell_index":  "../dsb_index/dsb_processed/cell_idx.csv",
+    "empty_index": "../dsb_index/dsb_processed/empty_idx.csv",
+    "pseudocount": 10,
+    "denoise_counts": True,
+    "isotype_controls": None,
+    "add_layer": False,
+    "random_state": None
+}
+## VIASH END
+
+if par['data_raw'] is not None:
+    if par['data_raw'].endswith('h5ad'):
+        raw_data = sc.read_h5ad(par['data_raw'])
+    elif par['data_raw'].endswith('h5mu'):
+        raw_data = mu.read_h5mu(par['data_raw'])
+        if "prot" not in raw_data.mod:
+            raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
+    else:
+        raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
+else:
+    raise ValueError( "Raw data is not available.")
+
+if par['cell_index'] is None and par['empty_index'] is None:
+    raise ValueError( "Given the unfiltered object data_raw, at least one index file must be "
+                      "provided for foreground and background signals.")
+                      
+cells = None
+empty = None
+
+if par['empty_index'] is not None:
+    empty_idx = pd.read_csv(par["empty_index"], header=None).iloc[:, 0].tolist()
+    empty = raw_data[raw_data.obs_names.isin(empty_idx)]
+if par['cell_index'] is not None:
+    cell_idx = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
+    cells = raw_data[raw_data.obs_names.isin(cell_idx)]
+
+if empty is None:
+    empty = raw_data[~raw_data.obs_names.isin(cell_idx)]
+if cells is None:
+    cells = raw_data[~raw_data.obs_names.isin(empty_idx)]
+
+pt.pp.dsb(cells, empty, isotype_controls=par['isotype_controls'], pseudocount=par["pseudocount"],
+    denoise_counts=par["denoise_counts"], add_layer=par["add_layer"],
+    random_state=par["random_state"])
+
+if not os.path.exists(par["output"]):
+    os.makedirs(par["output"])
+if isinstance(cells, MuData):
+    cells.write(os.path.join(par["output"], "normalized.h5mu"))
+else:
+    cells.write(os.path.join(par["output"], "normalized.h5ad"))
+    
diff --git a/src/protein_processing/dsb_normalize/test.py b/src/protein_processing/dsb_normalize/test.py
new file mode 100644
index 00000000000..3c26f5f7498
--- /dev/null
+++ b/src/protein_processing/dsb_normalize/test.py
@@ -0,0 +1,48 @@
+import subprocess
+from os import path
+import muon as mu
+import logging
+from sys import stdout
+import pandas as pd
+
+## VIASH START
+meta = {
+    'functionality_name': 'dsb_normalize',
+    'executable': './target/dsb_normalize',
+    'resources_dir': 'resources_test/'
+    
+}
+## VIASH END
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler(stdout)
+logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+console_handler.setFormatter(logFormatter)
+logger.addHandler(console_handler)
+
+input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu"
+cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
+output = "dsb_output/normalized.h5mu"
+cmd_pars = [
+    meta["executable"],
+    "--data_raw", input,
+    "--output", "dsb_output",
+    "--cell_index", cell_index,
+    "--empty_counts_range", "1.5:2.8",
+    "--denoise_counts"
+]
+try:
+    subprocess.check_output(cmd_pars)
+except subprocess.CalledProcessError as e:
+    print(e.stdout.decode("utf-8"))
+    raise e
+
+logger.info("> Check if output was created.")
+assert path.exists(output), "No output was created."
+
+logger.info("> Check whether output has the samw shape as the cell_index input.")
+cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist()
+mdata= mu.read_h5mu(output)
+assert len(mdata.obs) == len(cell_index), 'Output cell index has the samw shape as the cell_index input.'
+

From 4d664aa4a67184287468c167b49d304968d2fb6f Mon Sep 17 00:00:00 2001
From: Xichen Wu <xichenwu@outlook.com>
Date: Tue, 17 Oct 2023 15:42:07 +0200
Subject: [PATCH 2/4] update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ee8e59d470..d088e056bc3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -123,6 +123,8 @@
 
 ## NEW FUNCTIONALITY
 
+* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` component.
+
 * Added `compression/compress_h5mu` component (PR #530).
 
 * Resource management: when a process exits with a status code between 137 and 140, retry the process with increased memory requirements. Memory scales by multiplying the base memory assigned to the process with the attempt number (PR #518 and PR #527).

From 251c8035c909b79a9ab334c7b5b26b7b08e55440 Mon Sep 17 00:00:00 2001
From: Xichen Wu <xichenwu@outlook.com>
Date: Tue, 17 Oct 2023 15:47:42 +0200
Subject: [PATCH 3/4] update changelog

---
 CHANGELOG.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d088e056bc3..59143228fff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,10 @@
 
 * `qc/calculate_qc_metrics`: fix calculating mitochondrial gene related QC metrics when only or no mitochondrial genes were found (PR #564).
 
+## NEW FUNCTIONALITY
+
+* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` components (PR #588).
+
 # openpipelines 0.10.1
 
 ## MINOR CHANGES
@@ -123,8 +127,6 @@
 
 ## NEW FUNCTIONALITY
 
-* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` component.
-
 * Added `compression/compress_h5mu` component (PR #530).
 
 * Resource management: when a process exits with a status code between 137 and 140, retry the process with increased memory requirements. Memory scales by multiplying the base memory assigned to the process with the attempt number (PR #518 and PR #527).

From 4b23e8c0101a68c96069aa6bc65a97bd5b718db2 Mon Sep 17 00:00:00 2001
From: Xichen Wu <xichenwu@outlook.com>
Date: Wed, 15 Nov 2023 23:40:35 +0100
Subject: [PATCH 4/4] fix test data

---
 src/protein_processing/dsb_index/script.py    | 19 +++++++------------
 src/protein_processing/dsb_index/test.py      |  2 +-
 .../dsb_normalize/script.py                   |  6 ++++--
 src/protein_processing/dsb_normalize/test.py  |  2 +-
 4 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/protein_processing/dsb_index/script.py b/src/protein_processing/dsb_index/script.py
index 6a961d2f99c..b2fbe6d5365 100644
--- a/src/protein_processing/dsb_index/script.py
+++ b/src/protein_processing/dsb_index/script.py
@@ -20,21 +20,20 @@
 
 if par['data_raw'].endswith('h5mu'):
     raw_data = mu.read_h5mu(par['data_raw'])
-    if "prot" not in raw_data.mod or "rna" not in raw_data.mod:
-        raise TypeError("Raw data does not contain 'prot' or 'rna' modalities")
-    if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs:
-        raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.")
+elif par['data_raw'].endswith('h5'):
+    raw_data = mu.read_10x_h5(par['data_raw'])
 else:
     raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities")
 
+if "prot" not in raw_data.mod or "rna" not in raw_data.mod:
+    raise TypeError("Raw data does not contain 'prot' or 'rna' modalities")
+if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs:
+    raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.")
+
 droplet_barcode = raw_data.mod["prot"].obs_names
 if par["cell_index"] is not None:
     cell_barcode = pd.read_csv(par["cell_index"],header=None).iloc[:, 0].tolist()
-    # cell_barcode = list(set(droplet_barcode) & set(cell_barcode))
-    # [idx for idx in droplet_barcode if idx in cell_index_list_given]
     empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
-    # [idx for idx in droplet_barcode if idx not in cell_index_list_given]
-
 else:
     cell_barcode = None
     empty_barcode = None
@@ -51,7 +50,6 @@
     empty_idx = droplet_barcode[empty_idx]
     if empty_barcode is not None:
         empty_barcode = list(set(empty_barcode) & set(empty_idx))
-        # [idx for idx in empty_barcode if idx in empty_idx]
     else:
         empty_barcode = empty_idx
 
@@ -64,7 +62,6 @@
     cell_idx = droplet_barcode[cell_idx]
     if cell_barcode is not None:
         cell_barcode = list(set(cell_barcode) & set(cell_idx))
-        #[idx for idx in cell_barcode if idx in cell_idx]
     else:
         cell_barcode = cell_idx
 
@@ -73,10 +70,8 @@
         raise ValueError("Neither cell_index nor counts ranges for empty droplets "
                          "or cells provided for filtering empty droplets.")
     empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
-    # [idx for idx in droplet_barcode if idx not in cell_barcode]
 elif cell_barcode is None:
     cell_barcode = list(set(droplet_barcode).difference(empty_barcode))
-    # [idx for idx in droplet_barcode if idx not in empty_barcode]
 
 if not os.path.exists(par["output"]):
     os.makedirs(par["output"])
diff --git a/src/protein_processing/dsb_index/test.py b/src/protein_processing/dsb_index/test.py
index 284871a808e..4c27bdf389d 100644
--- a/src/protein_processing/dsb_index/test.py
+++ b/src/protein_processing/dsb_index/test.py
@@ -21,7 +21,7 @@
 console_handler.setFormatter(logFormatter)
 logger.addHandler(console_handler)
 
-input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu"
+input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
 cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
 output_cell_idx = "dsb_output/cell_idx.csv"
 output_empty_idx = "dsb_output/empty_idx.csv"
diff --git a/src/protein_processing/dsb_normalize/script.py b/src/protein_processing/dsb_normalize/script.py
index 2e12b0fbf6f..bb47e17e2d0 100644
--- a/src/protein_processing/dsb_normalize/script.py
+++ b/src/protein_processing/dsb_normalize/script.py
@@ -27,10 +27,12 @@
         raw_data = sc.read_h5ad(par['data_raw'])
     elif par['data_raw'].endswith('h5mu'):
         raw_data = mu.read_h5mu(par['data_raw'])
-        if "prot" not in raw_data.mod:
-            raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
+    elif par['data_raw'].endswith('h5'):
+        raw_data = mu.read_10x_h5(par['data_raw'])
     else:
         raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
+    if "prot" not in raw_data.mod:
+            raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
 else:
     raise ValueError( "Raw data is not available.")
 
diff --git a/src/protein_processing/dsb_normalize/test.py b/src/protein_processing/dsb_normalize/test.py
index 3c26f5f7498..e84703581b0 100644
--- a/src/protein_processing/dsb_normalize/test.py
+++ b/src/protein_processing/dsb_normalize/test.py
@@ -21,7 +21,7 @@
 console_handler.setFormatter(logFormatter)
 logger.addHandler(console_handler)
 
-input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu"
+input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
 cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
 output = "dsb_output/normalized.h5mu"
 cmd_pars = [