-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathconfig.vsh.yaml
More file actions
161 lines (154 loc) · 5.85 KB
/
Copy pathconfig.vsh.yaml
File metadata and controls
161 lines (154 loc) · 5.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Component identity and a one-paragraph summary of what it does.
name: celltypist
namespace: annotate
scope: public
description: >-
  Automated cell type annotation tool for scRNA-seq datasets on the basis of
  logistic regression classifiers optimised by the stochastic gradient descent
  algorithm.
# Each entry merges a shared author file and sets that person's role here.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
  roles: [author]
- __merge__: /src/authors/weiwei_schultz.yaml
  roles: [contributor]
argument_groups:
# Arguments describing the query dataset that is to be labeled.
- name: Inputs
  description: Input dataset (query) arguments
  arguments:
  # The only required argument of this group.
  - name: "--input"
    alternatives: [-i]
    type: file
    description: The input (query) data to be labeled. Should be a .h5mu file.
    direction: input
    required: true
    example: input.h5mu
  - name: "--modality"
    description: Which modality to process.
    type: string
    default: "rna"
    required: false
  # No default is declared; the description states .X is used otherwise.
  - name: "--input_layer"
    type: string
    description: >-
      The layer in the input data containing log normalized counts to be used
      for cell type annotation if .X is not to be used.
  # The description previously mentioned a `gene_name_layer` option that this
  # config does not define; it now describes this argument's own fallback.
  - name: "--input_var_gene_names"
    type: string
    required: false
    description: >
      The name of the adata var column in the input data containing gene
      names; when not provided, the var index will be used.
  # Lower bound of 1 is enforced through `min`.
  - name: "--input_reference_gene_overlap"
    type: integer
    default: 100
    min: 1
    description: |
      The minimum number of genes present in both the reference and query datasets.
# Arguments describing the optional reference dataset used for training.
- name: Reference
  description: Arguments related to the reference dataset.
  arguments:
  # Optional here; the description ties it to the absence of --model.
  - name: "--reference"
    type: file
    description: >-
      The reference data to train the CellTypist classifiers on. Only required
      if a pre-trained --model is not provided.
    example: reference.h5mu
    direction: input
    required: false
  - name: "--reference_layer"
    type: string
    description: >-
      The layer in the reference data to be used for cell type annotation if
      .X is not to be used. Data are expected to be processed in the same way
      as the --input query dataset.
    required: false
  - name: "--reference_obs_target"
    type: string
    description: The name of the adata obs column in the reference data containing cell type annotations.
    default: "cell_ontology_class"
  # The description previously mentioned a `gene_name_layer` option that this
  # config does not define; it now describes this argument's own fallback.
  - name: "--reference_var_gene_names"
    type: string
    required: false
    description: >
      The name of the adata var column in the reference data containing gene
      names; when not provided, the var index will be used.
  - name: "--reference_var_input"
    type: string
    required: false
    description: |
      .var column containing highly variable genes. By default, do not subset genes.
# Pre-trained model selection and the training / prediction options.
- name: Model arguments
  description: Model arguments.
  arguments:
  # Optional; the description ties its absence to --reference being supplied.
  - name: "--model"
    type: file
    description: >-
      Pretrained model in pkl format. If not provided, the model will be
      trained on the reference data and --reference should be provided.
    required: false
    example: pretrained_model.pkl
  - name: "--feature_selection"
    type: boolean
    description: "Whether to perform feature selection."
    default: false
  - name: "--majority_voting"
    type: boolean
    description: >-
      Whether to refine the predicted labels by running the majority voting
      classifier after over-clustering.
    default: false
  - name: "--C"
    type: double
    description: "Inverse of regularization strength in logistic regression."
    default: 1.0
  - name: "--max_iter"
    type: integer
    description: >-
      Maximum number of iterations before reaching the minimum of the cost
      function.
    default: 1000
  # Declared as `boolean_true`, unlike the `boolean` options above that carry
  # an explicit default.
  - name: "--use_SGD"
    type: boolean_true
    description: "Whether to use the stochastic gradient descent algorithm."
  # The text used to be wrapped in double quotes inside the block scalar, which
  # made the quote characters part of the description. The default is written
  # as a float to match the declared `double` type.
  - name: "--min_prop"
    type: double
    description: |
      For the dominant cell type within a subcluster, the minimum proportion of cells required to
      support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
      Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
    default: 0.0
# Output file plus the names of the `.obs` slots that receive the results.
- name: Outputs
  description: Output arguments.
  arguments:
  - name: '--output'
    type: file
    description: Output h5mu file.
    direction: output
    example: output.h5mu
  - name: '--output_obs_predictions'
    type: string
    default: celltypist_pred
    required: false
    description: >
      In which `.obs` slots to store the predicted information.
  - name: '--output_obs_probability'
    type: string
    default: celltypist_probability
    required: false
    description: >
      In which `.obs` slots to store the probability of the predictions.
  # Merges this group with the shared HDF5 compression argument file.
  # NOTE(review): that file is not visible here; presumably it contributes an
  # output compression option - confirm.
  __merge__: ['.', /src/base/h5_compression_argument.yaml]
# Files shipped with the component: the entry script declared as a Python
# script, followed by helper modules referenced by path.
resources:
- path: script.py
  type: python_script
- path: /src/utils/setup_logger.py
- path: /src/utils/cross_check_genes.py
- path: /src/utils/subset_vars.py
- path: /src/utils/set_var_index.py
# Test entry script and the two data directories listed alongside it.
test_resources:
- path: test.py
  type: python_script
- path: /resources_test/annotation_test_data/
- path: /resources_test/pbmc_1k_protein_v3/
# Docker engine definition: base image followed by ordered setup steps.
engines:
- type: docker
  image: "python:3.10-slim"
  setup:
  # System packages installed through apt.
  - type: apt
    packages: [libhdf5-dev, procps]
  # Python requirements merged from the shared scanpy requirements file.
  - type: python
    __merge__: [/src/base/requirements/scanpy.yaml, '.']
  # CellTypist pinned to an exact version.
  - type: python
    packages:
    - celltypist==1.6.3
  # Python requirements merged from the shared anndata/mudata requirements file.
  - type: python
    __merge__: [/src/base/requirements/anndata_mudata.yaml, '.']
  # Engine-level merge of the shared Python test setup file.
  # NOTE(review): its contents are not visible here - confirm what it adds.
  __merge__: [/src/base/requirements/python_test_setup.yaml, '.']
# Two runners are declared: a plain executable and a Nextflow runner that
# carries three resource labels.
runners:
- type: executable
- type: nextflow
  directives:
    label:
    - highcpu
    - highmem
    - highdisk