## Source: warp/pipelines/wdl/slideseq/SlideSeq.wdl at commit 1aeb3ca795fabcf3adeb5ccaf5c1173fc33c1f9b (broadinstitute/warp, GitHub)
version 1.0

import "../../../tasks/wdl/StarAlign.wdl" as StarAlign
import "../../../tasks/wdl/FastqProcessing.wdl" as FastqProcessing
import "../../../tasks/wdl/Metrics.wdl" as Metrics
import "../../../tasks/wdl/H5adUtils.wdl" as H5adUtils
import "../../../tasks/wdl/CheckInputs.wdl" as OptimusInputChecks
import "../../../tasks/wdl/MergeSortBam.wdl" as Merge
import "../../../tasks/wdl/Utilities.wdl" as utils


## Copyright Broad Institute, 2022
##
## This WDL pipeline implements data processing for RNA with UMIs
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script.


# SlideSeq workflow.
#
# Call graph, in dependency order:
#   1. ReferenceCheck         - STARGenomeRefVersion on the STAR reference tarball.
#   2. FastqMetrics           - FastqMetricsSlideSeq on the Read 1 FASTQs.
#   3. SplitFastq             - FastqProcessingSlidSeq; yields parallel arrays of R1/R2 FASTQs.
#   4. STARsoloFastqSlideSeq  - scattered, one call per R1/R2 pair produced by SplitFastq.
#   5. MergeBam               - merges the per-shard BAMs into one coordinate-sorted BAM.
#   6. GeneMetrics / UMIsMetrics / CellMetrics - computed from the merged BAM.
#   7. MergeStarOutputs (+ MergeStarOutputsExons when count_exons is true) - merges the
#      per-shard STARsolo barcodes/features/matrix outputs.
#   8. One of two h5ad generation tasks, selected by count_exons.
workflow SlideSeq {

    String pipeline_version = "3.6.5"

    input {
        Array[File] r1_fastq
        Array[File] r2_fastq
        Array[File]? i1_fastq
        String input_id
        String read_structure

        File tar_star_reference
        File annotations_gtf

        String output_bam_basename
        Boolean count_exons = true
        File bead_locations

        String cloud_provider

    }

    # docker images (image name and tag only; the registry prefix is prepended at each call site)
    # NOTE(review): pytools_docker is not referenced by any call in this workflow.
    String pytools_docker = "pytools:1.0.0-1661263730"
    String picard_cloud_docker = "picard-cloud:2.26.10"
    String warp_tools_docker = "warp-tools:2.6.1"
    String star_merge_docker = "star-merge-npz:1.3.0"

    # The ubuntu image is pinned by digest and has its own registry prefix per cloud provider.
    String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf"
    String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/"
    String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/"
    String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix

    String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
    String acr_docker_prefix = "dsppipelinedev.azurecr.io/"

    # Choose the docker prefix based on cloud provider. Any value other than "gcp" selects the
    # ACR prefix here; unsupported values are rejected by the check below.
    String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix

    # Make sure either "gcp" or "azure" is supplied as the cloud_provider input.
    if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
        call utils.ErrorWithMessage as ErrorMessageIncorrectInput {
            input:
                message = "cloud_provider must be supplied with either 'gcp' or 'azure'."
        }
    }

    parameter_meta {
        r1_fastq: "Array of Read 1 FASTQ files; forward read; contains cell barcodes and molecule barcodes"
        r2_fastq: "Array of Read 2 FASTQ files; reverse read; contains cDNA fragment generated from captured mRNA"
        i1_fastq: "Optional array of i1 FASTQ files; index read used for demultiplexing of multiple samples on one flow cell"
        input_id: "Name of sample matching this file; inserted into read group header"
        read_structure: "String used to specify the UMI (M) and Barcode (C) positions in the Read 1 FASTQ"
        tar_star_reference: "STAR reference tarball; passed to the reference version check and to STARsolo alignment"
        annotations_gtf: "GTF annotation file; passed to the gene, UMI, and cell metrics tasks and to h5ad generation"
        output_bam_basename: "Basename for output BAMs; per-shard BAMs are named '<output_bam_basename>_<shard index>' and the merged BAM is named '<output_bam_basename>.bam'"
        count_exons: "If true (default), the sn_rna STARsolo outputs are also merged and the h5ad is built by SingleNucleusSlideseqH5adOutput; if false, the h5ad is built by SlideseqH5adGeneration"
        bead_locations: "File passed as the barcode whitelist to FASTQ metrics, FASTQ splitting, and STARsolo alignment"
        cloud_provider: "Cloud provider the workflow runs on; must be 'gcp' or 'azure'; selects the docker registry prefixes"
    }

    call StarAlign.STARGenomeRefVersion as ReferenceCheck {
        input:
          tar_star_reference = tar_star_reference,
          ubuntu_docker_path = ubuntu_docker_prefix + ubuntu_docker
    }

    call Metrics.FastqMetricsSlideSeq as FastqMetrics {
        input:
            r1_fastq = r1_fastq,
            read_structure = read_structure,
            sample_id = input_id,
            whitelist = bead_locations
    }
    call FastqProcessing.FastqProcessingSlidSeq as SplitFastq {
        input:
            r1_fastq = r1_fastq,
            r2_fastq = r2_fastq,
            i1_fastq = i1_fastq,
            read_structure = read_structure,
            sample_id = input_id,
            whitelist = bead_locations
    }
    # One alignment call per R1 FASTQ produced by SplitFastq; the R2 array is indexed with the
    # same idx, so both arrays are assumed to have equal length.
    scatter(idx in range(length(SplitFastq.fastq_R1_output_array))) {
        call StarAlign.STARsoloFastqSlideSeq as STARsoloFastqSlideSeq {
            input:
                r1_fastq = [SplitFastq.fastq_R1_output_array[idx]],
                r2_fastq = [SplitFastq.fastq_R2_output_array[idx]],
                whitelist = bead_locations,
                tar_star_reference = tar_star_reference,
                # the shard index is appended so each shard gets a distinct BAM basename
                output_bam_basename = output_bam_basename + "_" + idx,
                read_structure = read_structure,
                count_exons = count_exons
        }
    }
    # Outside the scatter, STARsoloFastqSlideSeq.<output> is the array of per-shard values.
    call Merge.MergeSortBamFiles as MergeBam {
        input:
            bam_inputs = STARsoloFastqSlideSeq.bam_output,
            output_bam_filename = output_bam_basename + ".bam",
            sort_order = "coordinate",
            picard_cloud_docker_path = docker_prefix + picard_cloud_docker
    }
    call Metrics.CalculateGeneMetrics as GeneMetrics {
        input:
            bam_input = MergeBam.output_bam,
            original_gtf = annotations_gtf,
            input_id = input_id,
            warp_tools_docker_path = docker_prefix + warp_tools_docker
    }
    # NOTE(review): unlike GeneMetrics and CellMetrics, no warp_tools_docker_path is passed
    # here, so this call presumably falls back to the task's own docker default - confirm
    # that this is intended when cloud_provider is "azure".
    call Metrics.CalculateUMIsMetrics as UMIsMetrics {
        input:
            bam_input = MergeBam.output_bam,
            original_gtf = annotations_gtf,
            input_id = input_id
    }

    call Metrics.CalculateCellMetrics as CellMetrics {
        input:
            bam_input = MergeBam.output_bam,
            original_gtf = annotations_gtf,
            input_id = input_id,
            warp_tools_docker_path = docker_prefix + warp_tools_docker

    }

    # Always runs: its outputs feed the workflow outputs and both h5ad generation paths.
    call StarAlign.MergeStarOutput as MergeStarOutputs {
        input:
            barcodes = STARsoloFastqSlideSeq.barcodes,
            features = STARsoloFastqSlideSeq.features,
            matrix = STARsoloFastqSlideSeq.matrix,
            input_id = input_id,
            star_merge_docker_path = docker_prefix + star_merge_docker
    }
    # h5ad generation when count_exons is false.
    if ( !count_exons ) {
        call H5adUtils.SlideseqH5adGeneration as SlideseqH5adGeneration{
            input:
                input_id = input_id,
                annotation_file = annotations_gtf,
                cell_metrics = CellMetrics.cell_metrics,
                gene_metrics = GeneMetrics.gene_metrics,
                sparse_count_matrix = MergeStarOutputs.sparse_counts,
                cell_id = MergeStarOutputs.row_index,
                gene_id = MergeStarOutputs.col_index,
                add_emptydrops_data = "no",
                pipeline_version = "SlideSeq_v~{pipeline_version}",
                warp_tools_docker_path = docker_prefix + warp_tools_docker

        }
    }
    # h5ad generation when count_exons is true: the sn_rna STARsolo outputs are merged
    # separately and passed alongside the MergeStarOutputs matrices.
    if (count_exons) {
        call StarAlign.MergeStarOutput as MergeStarOutputsExons {
            input:
                barcodes = STARsoloFastqSlideSeq.barcodes_sn_rna,
                features = STARsoloFastqSlideSeq.features_sn_rna,
                matrix = STARsoloFastqSlideSeq.matrix_sn_rna,
                input_id = input_id,
                star_merge_docker_path = docker_prefix + star_merge_docker
        }
        call H5adUtils.SingleNucleusSlideseqH5adOutput as SlideseqH5adGenerationWithExons{
            input:
                input_id = input_id,
                annotation_file = annotations_gtf,
                cell_metrics = CellMetrics.cell_metrics,
                gene_metrics = GeneMetrics.gene_metrics,
                sparse_count_matrix = MergeStarOutputs.sparse_counts,
                cell_id = MergeStarOutputs.row_index,
                gene_id = MergeStarOutputs.col_index,
                sparse_count_matrix_exon = MergeStarOutputsExons.sparse_counts,
                cell_id_exon = MergeStarOutputsExons.row_index,
                gene_id_exon = MergeStarOutputsExons.col_index,
                pipeline_version = "SlideSeq_v~{pipeline_version}",
                warp_tools_docker_path = docker_prefix + warp_tools_docker
        }
    }

    # The two conditionals above are mutually exclusive and exhaustive, so exactly one of
    # these optional outputs is defined; select_first returns that one.
    File final_h5ad_output = select_first([SlideseqH5adGenerationWithExons.h5ad_output, SlideseqH5adGeneration.h5ad_output])

    output {
        String pipeline_version_out = pipeline_version
        File genomic_reference_version = ReferenceCheck.genomic_ref_version
        File bam = MergeBam.output_bam
        # sparse count matrix
        File matrix = MergeStarOutputs.sparse_counts
        File matrix_row_index = MergeStarOutputs.row_index
        File matrix_col_index = MergeStarOutputs.col_index

        File cell_metrics = CellMetrics.cell_metrics
        File gene_metrics = GeneMetrics.gene_metrics
        File umi_metrics =  UMIsMetrics.umi_metrics

        File fastq_barcode_distribution = FastqMetrics.barcode_distribution
        File fastq_umi_distribution = FastqMetrics.umi_distribution
        File fastq_reads_per_cell = FastqMetrics.numReads_perCell
        File fastq_reads_per_umi = FastqMetrics.numReads_perUMI

        # h5ad
        File? h5ad_output_file = final_h5ad_output
    }
}