|
| 1 | +version 1.0 |
| 2 | + |
| 3 | +## Copyright Broad Institute, 2018 |
| 4 | +## |
| 5 | +## This WDL pipeline implements data pre-processing and initial variant calling (GVCF |
| 6 | +## generation) according to the GATK Best Practices (June 2016) for germline SNP and |
| 7 | +## Indel discovery in human whole-genome data. |
| 8 | +## |
| 9 | +## Requirements/expectations : |
| 10 | +## - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format |
| 11 | +## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) |
| 12 | +## - Input uBAM files must additionally comply with the following requirements: |
| 13 | +## - - filenames all have the same suffix (we use ".unmapped.bam") |
| 14 | +## - - files must pass validation by ValidateSamFile |
| 15 | +## - - reads are provided in queryname-sorted order |
| 16 | +## - - all reads must have an RG tag |
| 17 | +## - GVCF output names must end in ".g.vcf.gz" |
| 18 | +## - Reference genome must be Hg38 with ALT contigs |
| 19 | +## |
| 20 | +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. |
| 21 | +## For program versions, see docker containers. |
| 22 | +## |
| 23 | +## LICENSING : |
| 24 | +## This script is released under the WDL source code license (BSD-3) (see LICENSE in |
| 25 | +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may |
| 26 | +## be subject to different licenses. Users are responsible for checking that they are |
| 27 | +## authorized to run all programs before running this script. Please see the docker |
| 28 | +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed |
| 29 | +## licensing information pertaining to the included programs. |
| 30 | +
|
| 31 | +# Local import |
| 32 | +#import "../../../../pipelines/dna_seq/UnmappedBamToAlignedBam.wdl" as ToBam |
| 33 | +#import "../../../../tasks/AggregatedBamQC.wdl" as AggregatedQC |
| 34 | +#import "../../../../tasks/GermlineVariantDiscovery.wdl" as Calling |
| 35 | +#import "../../../../tasks/Qc.wdl" as QC |
| 36 | +#import "../../../../tasks/Utilities.wdl" as Utils |
| 37 | +#import "../../../../tasks/BamToCram.wdl" as ToCram |
| 38 | +#import "../../../../tasks/VariantCalling.wdl" as ToGvcf |
| 39 | +#import "../../../../structs/dna_seq/germline/GermlineStructs.wdl" |
| 40 | +
|
| 41 | +# Git URL import |
| 42 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/UnmappedBamToAlignedBam.wdl" as ToBam |
| 43 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/AggregatedBamQC.wdl" as AggregatedQC |
| 44 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling |
| 45 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC |
| 46 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils |
| 47 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamToCram.wdl" as ToCram |
| 48 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/VariantCalling.wdl" as ToGvcf |
| 49 | +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" |
| 50 | + |
| 51 | +# WORKFLOW DEFINITION |
| 52 | +workflow WholeGenomeGermlineSingleSample { |
| 53 | + input { |
| 54 | + SampleAndUnmappedBams sample_and_unmapped_bams |
| 55 | + GermlineSingleSampleReferences references |
| 56 | + PapiSettings papi_settings |
| 57 | + File wgs_coverage_interval_list |
| 58 | + |
| 59 | + File? haplotype_database_file |
| 60 | + Boolean provide_bam_output = false |
| 61 | + Boolean use_gatk3_haplotype_caller = true |
| 62 | + } |
| 63 | + |
| 64 | + # Not overridable (declared outside the input block, so these are not workflow inputs): |
| 65 | + Int read_length = 250 |
| 66 | + Float lod_threshold = -20.0 |
| 67 | + String cross_check_fingerprints_by = "READGROUP" |
| 68 | + String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" |
| 69 | + |
| 70 | + call ToBam.UnmappedBamToAlignedBam { |
| 71 | + input: |
| 72 | + sample_and_unmapped_bams = sample_and_unmapped_bams, |
| 73 | + references = references, |
| 74 | + papi_settings = papi_settings, |
| 75 | +
|
| 76 | + cross_check_fingerprints_by = cross_check_fingerprints_by, |
| 77 | + haplotype_database_file = haplotype_database_file, |
| 78 | + lod_threshold = lod_threshold, |
| 79 | + recalibrated_bam_basename = recalibrated_bam_basename |
| 80 | + } |
| 81 | +
|
| 82 | + call AggregatedQC.AggregatedBamQC { |
| 83 | + input: |
| 84 | + base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, |
| 85 | + base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, |
| 86 | + base_name = sample_and_unmapped_bams.base_file_name, |
| 87 | + sample_name = sample_and_unmapped_bams.sample_name, |
| 88 | + recalibrated_bam_base_name = recalibrated_bam_basename, |
| 89 | + haplotype_database_file = haplotype_database_file, |
| 90 | + references = references, |
| 91 | + papi_settings = papi_settings |
| 92 | + } |
| 93 | +
|
| 94 | + call ToCram.BamToCram as BamToCram { |
| 95 | + input: |
| 96 | + input_bam = UnmappedBamToAlignedBam.output_bam, |
| 97 | + ref_fasta = references.reference_fasta.ref_fasta, |
| 98 | + ref_fasta_index = references.reference_fasta.ref_fasta_index, |
| 99 | + ref_dict = references.reference_fasta.ref_dict, |
| 100 | + duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, |
| 101 | + chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, |
| 102 | + base_file_name = sample_and_unmapped_bams.base_file_name, |
| 103 | + agg_preemptible_tries = papi_settings.agg_preemptible_tries |
| 104 | + } |
| 105 | +
|
| 106 | + # QC the sample WGS metrics (stringent thresholds) |
| 107 | + call QC.CollectWgsMetrics as CollectWgsMetrics { |
| 108 | + input: |
| 109 | + input_bam = UnmappedBamToAlignedBam.output_bam, |
| 110 | + input_bam_index = UnmappedBamToAlignedBam.output_bam_index, |
| 111 | + metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", |
| 112 | + ref_fasta = references.reference_fasta.ref_fasta, |
| 113 | + ref_fasta_index = references.reference_fasta.ref_fasta_index, |
| 114 | + wgs_coverage_interval_list = wgs_coverage_interval_list, |
| 115 | + read_length = read_length, |
| 116 | + preemptible_tries = papi_settings.agg_preemptible_tries |
| 117 | + } |
| 118 | +
|
| 119 | + # QC the sample raw WGS metrics (common thresholds) |
| 120 | + call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { |
| 121 | + input: |
| 122 | + input_bam = UnmappedBamToAlignedBam.output_bam, |
| 123 | + input_bam_index = UnmappedBamToAlignedBam.output_bam_index, |
| 124 | + metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", |
| 125 | + ref_fasta = references.reference_fasta.ref_fasta, |
| 126 | + ref_fasta_index = references.reference_fasta.ref_fasta_index, |
| 127 | + wgs_coverage_interval_list = wgs_coverage_interval_list, |
| 128 | + read_length = read_length, |
| 129 | + preemptible_tries = papi_settings.agg_preemptible_tries |
| 130 | + } |
| 131 | +
|
| 132 | + call ToGvcf.VariantCalling as BamToGvcf { |
| 133 | + input: |
| 134 | + calling_interval_list = references.calling_interval_list, |
| 135 | + evaluation_interval_list = references.evaluation_interval_list, |
| 136 | + haplotype_scatter_count = references.haplotype_scatter_count, |
| 137 | + break_bands_at_multiples_of = references.break_bands_at_multiples_of, |
| 138 | + contamination = UnmappedBamToAlignedBam.contamination, |
| 139 | + input_bam = UnmappedBamToAlignedBam.output_bam, |
| 140 | + ref_fasta = references.reference_fasta.ref_fasta, |
| 141 | + ref_fasta_index = references.reference_fasta.ref_fasta_index, |
| 142 | + ref_dict = references.reference_fasta.ref_dict, |
| 143 | + dbsnp_vcf = references.dbsnp_vcf, |
| 144 | + dbsnp_vcf_index = references.dbsnp_vcf_index, |
| 145 | + base_file_name = sample_and_unmapped_bams.base_file_name, |
| 146 | + final_vcf_base_name = sample_and_unmapped_bams.final_gvcf_base_name, |
| 147 | + agg_preemptible_tries = papi_settings.agg_preemptible_tries, |
| 148 | + use_gatk3_haplotype_caller = use_gatk3_haplotype_caller |
| 149 | + } |
| 150 | +
|
| 151 | + if (provide_bam_output) { |
| 152 | + File provided_output_bam = UnmappedBamToAlignedBam.output_bam |
| 153 | + File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index |
| 154 | + } |
| 155 | + |
| 156 | + # Outputs that will be retained when execution is complete |
| 157 | + output { |
| 158 | + Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics |
| 159 | + |
| 160 | + Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf |
| 161 | + Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics |
| 162 | + Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf |
| 163 | + Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics |
| 164 | + Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf |
| 165 | + Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics |
| 166 | + Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf |
| 167 | + Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics |
| 168 | + |
| 169 | + File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics |
| 170 | + File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics |
| 171 | + File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf |
| 172 | + File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics |
| 173 | + |
| 174 | + File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics |
| 175 | + |
| 176 | + File selfSM = UnmappedBamToAlignedBam.selfSM |
| 177 | + Float contamination = UnmappedBamToAlignedBam.contamination |
| 178 | + |
| 179 | + File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 |
| 180 | + |
| 181 | + File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics |
| 182 | + File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics |
| 183 | + File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics |
| 184 | + File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics |
| 185 | + File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf |
| 186 | + File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics |
| 187 | + File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf |
| 188 | + File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics |
| 189 | + File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics |
| 190 | + File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics |
| 191 | + File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf |
| 192 | + File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics |
| 193 | + File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics |
| 194 | + |
| 195 | + File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics |
| 196 | + File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics |
| 197 | + |
| 198 | + File wgs_metrics = CollectWgsMetrics.metrics |
| 199 | + File raw_wgs_metrics = CollectRawWgsMetrics.metrics |
| 200 | + |
| 201 | + File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics |
| 202 | + File output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports |
| 203 | + |
| 204 | + File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics |
| 205 | + File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics |
| 206 | + |
| 207 | + File? output_bam = provided_output_bam |
| 208 | + File? output_bam_index = provided_output_bam_index |
| 209 | + |
| 210 | + File output_cram = BamToCram.output_cram |
| 211 | + File output_cram_index = BamToCram.output_cram_index |
| 212 | + File output_cram_md5 = BamToCram.output_cram_md5 |
| 213 | + |
| 214 | + File validate_cram_file_report = BamToCram.validate_cram_file_report |
| 215 | + |
| 216 | + File output_vcf = BamToGvcf.output_vcf |
| 217 | + File output_vcf_index = BamToGvcf.output_vcf_index |
| 218 | + } |
| 219 | +} |
0 commit comments