From ba388ef1af9ad17307528a883c7515b80f111cac Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Tue, 17 Oct 2023 15:34:09 +0200 Subject: [PATCH 1/4] add dsb component --- .../dsb_index/config.vsh.yaml | 48 +++++++++++ src/protein_processing/dsb_index/script.py | 84 +++++++++++++++++++ src/protein_processing/dsb_index/test.py | 50 +++++++++++ .../dsb_normalize/config.vsh.yaml | 61 ++++++++++++++ .../dsb_normalize/script.py | 66 +++++++++++++++ src/protein_processing/dsb_normalize/test.py | 48 +++++++++++ 6 files changed, 357 insertions(+) create mode 100644 src/protein_processing/dsb_index/config.vsh.yaml create mode 100644 src/protein_processing/dsb_index/script.py create mode 100644 src/protein_processing/dsb_index/test.py create mode 100644 src/protein_processing/dsb_normalize/config.vsh.yaml create mode 100644 src/protein_processing/dsb_normalize/script.py create mode 100644 src/protein_processing/dsb_normalize/test.py diff --git a/src/protein_processing/dsb_index/config.vsh.yaml b/src/protein_processing/dsb_index/config.vsh.yaml new file mode 100644 index 00000000000..4a7c47f6fbb --- /dev/null +++ b/src/protein_processing/dsb_index/config.vsh.yaml @@ -0,0 +1,48 @@ +functionality: + name: dsb_index + namespace: protein_processing + description: "Filter background and foreground signals for normalising protein expression with DSB (Denoised and Scaled by Background)." + authors: + - name: Xichen Wu + arguments: + - name: "--data_raw" + type: file + required: true + description: "A ``MuData`` object containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities." + - name: "--cell_index" + type: file + description: "A csv file containing filtered cell barcodes." + - name: "--empty_counts_range" + type: double + description: "Specify the minimum and maximum log10-counts for a droplet to be considered empty." + multiple: true + - name: "--cell_counts_range" + type: double + description: "Specify the minimum and maximum log10-counts for a droplet to be considered not empty." + multiple: true + - name: "--output" + type: file + direction: output + description: dsb_index output directory + example: "dsb_output" + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + - path: ../../../resources_test/pbmc_1k_protein_v3 +platforms: + - type: docker + image: python:3.8 + setup: + - type: python + packages: + - scanpy~=1.9.1 + - muon + - numpy + - mudata~=0.2.0 + - anndata~=0.8.0 + - type: nextflow + directives: + label: lowcpu diff --git a/src/protein_processing/dsb_index/script.py b/src/protein_processing/dsb_index/script.py new file mode 100644 index 00000000000..6a961d2f99c --- /dev/null +++ b/src/protein_processing/dsb_index/script.py @@ -0,0 +1,84 @@ +from warnings import warn +import scanpy as sc +import muon as mu +import numpy as np +from mudata import MuData +import pandas as pd +import os +from anndata import AnnData + +## VIASH START +par = { + "data_raw": "mudata_raw.h5mu", + "output": "dsb_processed", + "cell_index": None, + "empty_counts_range": [1.5, 2.8], + "cell_counts_range": None +} + +## VIASH END + +if par['data_raw'].endswith('h5mu'): + raw_data = mu.read_h5mu(par['data_raw']) + if "prot" not in raw_data.mod or "rna" not in raw_data.mod: + raise TypeError("Raw data does not contain 'prot' or 'rna' modalities") + if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs: + raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.") +else: + raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities") + +droplet_barcode = raw_data.mod["prot"].obs_names +if par["cell_index"] is not None: + cell_barcode = pd.read_csv(par["cell_index"],header=None).iloc[:, 0].tolist() + # cell_barcode = list(set(droplet_barcode) & set(cell_barcode)) + # [idx for idx in droplet_barcode if idx in cell_index_list_given] + empty_barcode = list(set(droplet_barcode).difference(cell_barcode)) + # [idx for idx in droplet_barcode if idx not in cell_index_list_given] + +else: + cell_barcode = None + empty_barcode = None + +log10umi = np.log10(np.asarray(raw_data.mod["rna"].X.sum(axis=1)).squeeze() + 1) + +if par['empty_counts_range'] is not None: + if len(par['empty_counts_range']) != 2: + raise ValueError("Invalid count ranges provided for the empty droplets.") + if par['cell_counts_range'] is not None and max(*par['empty_counts_range']) > min(*par['cell_counts_range']): + raise ValueError("Overlapping count ranges") + empty_idx = np.where( + (log10umi >= min(*par['empty_counts_range'])) & (log10umi < max(*par['empty_counts_range'])))[0] + empty_idx = droplet_barcode[empty_idx] + if empty_barcode is not None: + empty_barcode = list(set(empty_barcode) & set(empty_idx)) + # [idx for idx in empty_barcode if idx in empty_idx] + else: + empty_barcode = empty_idx + + +if par['cell_counts_range'] is not None: + if len(par['cell_counts_range']) != 2: + raise ValueError("Invalid count ranges provided for true cells.") + cell_idx = np.where( + (log10umi >= min(*par['cell_counts_range'])) & (log10umi < max(*par['cell_counts_range'])))[0] + cell_idx = droplet_barcode[cell_idx] + if cell_barcode is not None: + cell_barcode = list(set(cell_barcode) & set(cell_idx)) + #[idx for idx in cell_barcode if idx in cell_idx] + else: + cell_barcode = cell_idx + +if empty_barcode is None: + if cell_barcode is None: + raise ValueError("Neither cell_index nor counts ranges for empty droplets " + "or cells provided for filtering empty droplets.") + empty_barcode = list(set(droplet_barcode).difference(cell_barcode)) + # [idx for idx in droplet_barcode if idx not in cell_barcode] +elif cell_barcode is None: + cell_barcode = list(set(droplet_barcode).difference(empty_barcode)) + # [idx for idx in droplet_barcode if idx not in empty_barcode] + +if not os.path.exists(par["output"]): + os.makedirs(par["output"]) +pd.DataFrame(cell_barcode).to_csv(os.path.join(par["output"], "cell_idx.csv"), header=None, index=None) +pd.DataFrame(empty_barcode).to_csv(os.path.join(par["output"], "empty_idx.csv"), header=None, index=None) diff --git a/src/protein_processing/dsb_index/test.py b/src/protein_processing/dsb_index/test.py new file mode 100644 index 00000000000..284871a808e --- /dev/null +++ b/src/protein_processing/dsb_index/test.py @@ -0,0 +1,50 @@ +import subprocess +from os import path +import muon as mu +import logging +from sys import stdout +import pandas as pd + +## VIASH START +meta = { + 'functionality_name': 'dsb_index', + 'executable': './target/dsb_index', + 'resources_dir': 'resources_test/' + +} +## VIASH END + +logger = logging.getLogger() +logger.setLevel(logging.INFO) +console_handler = logging.StreamHandler(stdout) +logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") +console_handler.setFormatter(logFormatter) +logger.addHandler(console_handler) + +input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu" +cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz" +output_cell_idx = "dsb_output/cell_idx.csv" +output_empty_idx = "dsb_output/empty_idx.csv" +cmd_pars = [ + meta["executable"], + "--data_raw", input, + "--output", "dsb_output", + "--cell_index", cell_index, + "--empty_counts_range", "1.5:2.8" +] +try: + subprocess.check_output(cmd_pars) +except subprocess.CalledProcessError as e: + print(e.stdout.decode("utf-8")) + raise e + +logger.info("> Check if output was created.") +assert path.exists(output_cell_idx), "No output for cell index was created." +assert path.exists(output_empty_idx), "No output for empty index was created." + + +logger.info("> Check whether output cell index has the samw shape as the cell_index input.") +cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist() +output_cell_index = pd.read_csv(output_cell_idx, header=None).iloc[:, 0].tolist() +assert len(output_cell_index) == len(cell_index), 'Output cell index has the samw shape as the cell_index input.' + diff --git a/src/protein_processing/dsb_normalize/config.vsh.yaml b/src/protein_processing/dsb_normalize/config.vsh.yaml new file mode 100644 index 00000000000..72d020abb95 --- /dev/null +++ b/src/protein_processing/dsb_normalize/config.vsh.yaml @@ -0,0 +1,61 @@ +functionality: + name: dsb_normalize + namespace: protein_processing + description: "Normalize protein expression with DSB (Denoised and Scaled by Background)." + authors: + - name: Xichen Wu + arguments: + - name: "--data_raw" + type: file + required: true + description: "AnnData object with protein expression counts or MuData object with 'prot' modality containing raw (unfiltered, including empty droplets) data." + - name: "--cell_index" + type: file + description: "A csv file containing filtered cell barcodes." + - name: "--empty_index" + type: file + description: "A csv file containing empty cell barcodes." + - name: "--pseudocount" + type: integer + default: 10 + description: "Pseudocount to add before log-transform." + - name: "--denoise_counts" + type: boolean_true + description: "Whether to perform denoising." + - name: "--isotype_controls" + type: string + multiple: true + description: "Names of the isotype controls. If ``None``, isotype controls will not be used." + - name: "--add_layer" + type: boolean_true + description: "Whether to add a ``'dsb'`` layer instead of assigning to the X matrix." + - name: "--random_state" + type: integer + default: 1 + description: "Random seed." + - name: "--output" + type: file + direction: output + description: dsb_normalize output directory + example: "dsb_output" + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + - path: ../../../resources_test/pbmc_1k_protein_v3 +platforms: + - type: docker + image: python:3.8 + setup: + - type: python + packages: + - scanpy~=1.9.1 + - muon + - numpy + - mudata~=0.2.0 + - anndata~=0.8.0 + - type: nextflow + directives: + label: midcpu diff --git a/src/protein_processing/dsb_normalize/script.py b/src/protein_processing/dsb_normalize/script.py new file mode 100644 index 00000000000..2e12b0fbf6f --- /dev/null +++ b/src/protein_processing/dsb_normalize/script.py @@ -0,0 +1,66 @@ +from warnings import warn +import os +import scanpy as sc +import muon as mu +import numpy as np +from mudata import MuData +import pandas as pd +from anndata import AnnData +from muon import prot as pt + +## VIASH START +par = { + "data_raw": "../dsb_index/mudata_raw.h5mu", + "output": "dsb_processed", + "cell_index": "../dsb_index/dsb_processed/cell_idx.csv", + "empty_index": "../dsb_index/dsb_processed/empty_idx.csv", + "pseudocount": 10, + "denoise_counts": True, + "isotype_controls": None, + "add_layer": False, + "random_state": None +} +## VIASH END + +if par['data_raw'] is not None: + if par['data_raw'].endswith('h5ad'): + raw_data = sc.read_h5ad(par['data_raw']) + elif par['data_raw'].endswith('h5mu'): + raw_data = mu.read_h5mu(par['data_raw']) + if "prot" not in raw_data.mod: + raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality") + else: + raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality") +else: + raise ValueError( "Raw data is not available.") + +if par['cell_index'] is None and par['empty_index'] is None: + raise ValueError( "Given the unfiltered object data_raw, at least one index file must be " + "provided for foreground and background signals.") + +cells = None +empty = None + +if par['empty_index'] is not None: + empty_idx = pd.read_csv(par["empty_index"], header=None).iloc[:, 0].tolist() + empty = raw_data[raw_data.obs_names.isin(empty_idx)] +if par['cell_index'] is not None: + cell_idx = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist() + cells = raw_data[raw_data.obs_names.isin(cell_idx)] + +if empty is None: + empty = raw_data[~raw_data.obs_names.isin(cell_idx)] +if cells is None: + cells = raw_data[~raw_data.obs_names.isin(empty_idx)] + +pt.pp.dsb(cells, empty, isotype_controls=par['isotype_controls'], pseudocount=par["pseudocount"], + denoise_counts=par["denoise_counts"], add_layer=par["add_layer"], + random_state=par["random_state"]) + +if not os.path.exists(par["output"]): + os.makedirs(par["output"]) +if isinstance(cells, MuData): + cells.write(os.path.join(par["output"], "normalized.h5mu")) +else: + cells.write(os.path.join(par["output"], "normalized.h5ad")) + diff --git a/src/protein_processing/dsb_normalize/test.py b/src/protein_processing/dsb_normalize/test.py new file mode 100644 index 00000000000..3c26f5f7498 --- /dev/null +++ b/src/protein_processing/dsb_normalize/test.py @@ -0,0 +1,48 @@ +import subprocess +from os import path +import muon as mu +import logging +from sys import stdout +import pandas as pd + +## VIASH START +meta = { + 'functionality_name': 'dsb_normalize', + 'executable': './target/dsb_normalize', + 'resources_dir': 'resources_test/' + +} +## VIASH END + +logger = logging.getLogger() +logger.setLevel(logging.INFO) +console_handler = logging.StreamHandler(stdout) +logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") +console_handler.setFormatter(logFormatter) +logger.addHandler(console_handler) + +input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu" +cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz" +output = "dsb_output/normalized.h5mu" +cmd_pars = [ + meta["executable"], + "--data_raw", input, + "--output", "dsb_output", + "--cell_index", cell_index, + "--empty_counts_range", "1.5:2.8", + "--denoise_counts" +] +try: + subprocess.check_output(cmd_pars) +except subprocess.CalledProcessError as e: + print(e.stdout.decode("utf-8")) + raise e + +logger.info("> Check if output was created.") +assert path.exists(output), "No output was created." + +logger.info("> Check whether output has the samw shape as the cell_index input.") +cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist() +mdata= mu.read_h5mu(output) +assert len(mdata.obs) == len(cell_index), 'Output cell index has the samw shape as the cell_index input.' + From 4d664aa4a67184287468c167b49d304968d2fb6f Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Tue, 17 Oct 2023 15:42:07 +0200 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ee8e59d470..d088e056bc3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,6 +123,8 @@ ## NEW FUNCTIONALITY +* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` component. + * Added `compression/compress_h5mu` component (PR #530). * Resource management: when a process exits with a status code between 137 and 140, retry the process with increased memory requirements. Memory scales by multiplying the base memory assigned to the process with the attempt number (PR #518 and PR #527). From 251c8035c909b79a9ab334c7b5b26b7b08e55440 Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Tue, 17 Oct 2023 15:47:42 +0200 Subject: [PATCH 3/4] update changelog --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d088e056bc3..59143228fff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,10 @@ * `qc/calculate_qc_metrics`: fix calculating mitochondrial gene related QC metrics when only or no mitochondrial genes were found (PR #564). +## NEW FUNCTIONALITY + +* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` components (PR #588). + # openpipelines 0.10.1 ## MINOR CHANGES @@ -123,8 +127,6 @@ ## NEW FUNCTIONALITY -* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` component. - * Added `compression/compress_h5mu` component (PR #530). * Resource management: when a process exits with a status code between 137 and 140, retry the process with increased memory requirements. Memory scales by multiplying the base memory assigned to the process with the attempt number (PR #518 and PR #527). From 4b23e8c0101a68c96069aa6bc65a97bd5b718db2 Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Wed, 15 Nov 2023 23:40:35 +0100 Subject: [PATCH 4/4] fix test data --- src/protein_processing/dsb_index/script.py | 19 +++++++------------ src/protein_processing/dsb_index/test.py | 2 +- .../dsb_normalize/script.py | 6 ++++-- src/protein_processing/dsb_normalize/test.py | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/protein_processing/dsb_index/script.py b/src/protein_processing/dsb_index/script.py index 6a961d2f99c..b2fbe6d5365 100644 --- a/src/protein_processing/dsb_index/script.py +++ b/src/protein_processing/dsb_index/script.py @@ -20,21 +20,20 @@ if par['data_raw'].endswith('h5mu'): raw_data = mu.read_h5mu(par['data_raw']) - if "prot" not in raw_data.mod or "rna" not in raw_data.mod: - raise TypeError("Raw data does not contain 'prot' or 'rna' modalities") - if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs: - raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.") +elif par['data_raw'].endswith('h5'): + raw_data = mu.read_10x_h5(par['data_raw']) else: raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities") +if "prot" not in raw_data.mod or "rna" not in raw_data.mod: + raise TypeError("Raw data does not contain 'prot' or 'rna' modalities") +if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs: + raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.") + droplet_barcode = raw_data.mod["prot"].obs_names if par["cell_index"] is not None: cell_barcode = pd.read_csv(par["cell_index"],header=None).iloc[:, 0].tolist() - # cell_barcode = list(set(droplet_barcode) & set(cell_barcode)) - # [idx for idx in droplet_barcode if idx in cell_index_list_given] empty_barcode = list(set(droplet_barcode).difference(cell_barcode)) - # [idx for idx in droplet_barcode if idx not in cell_index_list_given] - else: cell_barcode = None empty_barcode = None @@ -51,7 +50,6 @@ empty_idx = droplet_barcode[empty_idx] if empty_barcode is not None: empty_barcode = list(set(empty_barcode) & set(empty_idx)) - # [idx for idx in empty_barcode if idx in empty_idx] else: empty_barcode = empty_idx @@ -64,7 +62,6 @@ cell_idx = droplet_barcode[cell_idx] if cell_barcode is not None: cell_barcode = list(set(cell_barcode) & set(cell_idx)) - #[idx for idx in cell_barcode if idx in cell_idx] else: cell_barcode = cell_idx @@ -73,10 +70,8 @@ raise ValueError("Neither cell_index nor counts ranges for empty droplets " "or cells provided for filtering empty droplets.") empty_barcode = list(set(droplet_barcode).difference(cell_barcode)) - # [idx for idx in droplet_barcode if idx not in cell_barcode] elif cell_barcode is None: cell_barcode = list(set(droplet_barcode).difference(empty_barcode)) - # [idx for idx in droplet_barcode if idx not in empty_barcode] if not os.path.exists(par["output"]): os.makedirs(par["output"]) diff --git a/src/protein_processing/dsb_index/test.py b/src/protein_processing/dsb_index/test.py index 284871a808e..4c27bdf389d 100644 --- a/src/protein_processing/dsb_index/test.py +++ b/src/protein_processing/dsb_index/test.py @@ -21,7 +21,7 @@ console_handler.setFormatter(logFormatter) logger.addHandler(console_handler) -input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu" +input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz" output_cell_idx = "dsb_output/cell_idx.csv" output_empty_idx = "dsb_output/empty_idx.csv" diff --git a/src/protein_processing/dsb_normalize/script.py b/src/protein_processing/dsb_normalize/script.py index 2e12b0fbf6f..bb47e17e2d0 100644 --- a/src/protein_processing/dsb_normalize/script.py +++ b/src/protein_processing/dsb_normalize/script.py @@ -27,10 +27,12 @@ raw_data = sc.read_h5ad(par['data_raw']) elif par['data_raw'].endswith('h5mu'): raw_data = mu.read_h5mu(par['data_raw']) - if "prot" not in raw_data.mod: - raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality") + elif par['data_raw'].endswith('h5'): + raw_data = mu.read_10x_h5(par['data_raw']) else: raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality") + if "prot" not in raw_data.mod: + raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality") else: raise ValueError( "Raw data is not available.") diff --git a/src/protein_processing/dsb_normalize/test.py b/src/protein_processing/dsb_normalize/test.py index 3c26f5f7498..e84703581b0 100644 --- a/src/protein_processing/dsb_normalize/test.py +++ b/src/protein_processing/dsb_normalize/test.py @@ -21,7 +21,7 @@ console_handler.setFormatter(logFormatter) logger.addHandler(console_handler) -input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5mu" +input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz" output = "dsb_output/normalized.h5mu" cmd_pars = [