Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conf/test_microbial.config
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ params {

// BAM filtering
deduplication_tool = "dedup"
deduplication_skipregionsplit = true
run_bamfiltering = true
bamfiltering_minreadlength = 30
bamfiltering_mappingquality = 37
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ params {

// Deduplication options
skip_deduplication = false
deduplication_skipregionsplit = false
deduplication_tool = 'markduplicates'

// Qualimap
Expand Down
6 changes: 6 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1002,6 +1002,12 @@
"description": "Specify to skip the removal of PCR duplicates.",
"fa_icon": "fas fa-forward"
},
"deduplication_skipregionsplit": {
"type": "boolean",
"description": "Specify to run deduplication without splitting bams by contig (splitting by contig is the default behavior).",
"fa_icon": "fas fa-forward",
"help_text": "Run deduplication steps bam-by-bam rather than contig-by-contig for each bam file. This reduces the total number of jobs submitted to a cluster, but increases the computational runtime. If you use a shared cluster with limited resources, running many low-resource jobs can slow down the overall runtime of eager due to scheduling constraints.\nAlso applicable for poor-quality reference genomes."
},
"deduplication_tool": {
"type": "string",
"default": "markduplicates",
Expand Down
143 changes: 83 additions & 60 deletions subworkflows/local/deduplicate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,39 +29,49 @@ workflow DEDUPLICATE {
addNewMetaFromAttributes( it, "id" , "reference" , false )
}

// Create genomic regions file for splitting the bam before deduplication
BUILD_INTERVALS( fasta_fai )
ch_versions = ch_versions.mix( BUILD_INTERVALS.out.versions.first() )
if ( params.deduplication_skipregionsplit ) {

// Prep regions for combining
ch_intervals_for_join = BUILD_INTERVALS.out.bed
.map {
// Replace meta with new meta that contains the meta.id value in the meta.reference attribute only
addNewMetaFromAttributes( it, "id" , "reference" , true )
}
// No splitting of .bam files by contig, deduplicate all in one
input_for_deduplication = ch_bam_bai

// Ensure input bam matches the regions file
ch_bam_for_split = ch_bam_bai
.map {
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
addNewMetaFromAttributes( it, "reference" , "reference" , false )
}
.combine(
by: 0,
ch_intervals_for_join
)
} else {

// Create genomic regions file for splitting the bam before deduplication
BUILD_INTERVALS( fasta_fai )
ch_versions = ch_versions.mix( BUILD_INTERVALS.out.versions.first() )

// Prep regions for combining
ch_intervals_for_join = BUILD_INTERVALS.out.bed
.map {
ignore_me, meta, bam, bai, regions ->
[ meta, bam, bai, regions ]
// Replace meta with new meta that contains the meta.id value in the meta.reference attribute only
addNewMetaFromAttributes( it, "id" , "reference" , true )
}

//Split input bam by region
BAM_SPLIT_BY_REGION( ch_bam_for_split )
ch_versions = ch_versions.mix( BAM_SPLIT_BY_REGION.out.versions )
// Ensure input bam matches the regions file
ch_bam_for_split = ch_bam_bai
.map {
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
addNewMetaFromAttributes( it, "reference" , "reference" , false )
}
.combine(
by: 0,
ch_intervals_for_join
)
.map {
ignore_me, meta, bam, bai, regions ->
[ meta, bam, bai, regions ]
}

//Split input bam by region
BAM_SPLIT_BY_REGION( ch_bam_for_split )
input_for_deduplication = BAM_SPLIT_BY_REGION.out.bam_bai
ch_versions = ch_versions.mix( BAM_SPLIT_BY_REGION.out.versions )

}

if ( params.deduplication_tool == 'markduplicates' ) {

ch_markduplicates_input = BAM_SPLIT_BY_REGION.out.bam_bai
ch_markduplicates_input = input_for_deduplication
.map {
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
addNewMetaFromAttributes( it, "reference" , "reference" , false )
Expand All @@ -83,70 +93,83 @@ workflow DEDUPLICATE {
ch_markduplicates_input.fasta,
ch_markduplicates_input.fasta_fai
)
ch_versions = ch_versions.mix( PICARD_MARKDUPLICATES.out.versions.first() )
ch_versions = ch_versions.mix( PICARD_MARKDUPLICATES.out.versions.first() )

ch_dedupped_region_bam = PICARD_MARKDUPLICATES.out.bam
ch_dedupped_bam = PICARD_MARKDUPLICATES.out.bam

} else if ( params.deduplication_tool == "dedup" ) {
ch_dedup_input = BAM_SPLIT_BY_REGION.out.bam_bai
ch_dedup_input = input_for_deduplication
.map {
meta, bam, bai ->
[ meta, bam ]
}

DEDUP( ch_dedup_input )
ch_versions = ch_versions.mix( DEDUP.out.versions.first() )
ch_versions = ch_versions.mix( DEDUP.out.versions.first() )

ch_dedupped_region_bam = DEDUP.out.bam
ch_dedupped_bam = DEDUP.out.bam
}

ch_input_for_samtools_merge = ch_dedupped_region_bam
.map {
meta, bam ->
meta2 = meta.clone().findAll{ it.key != 'genomic_region' }
[ meta2, bam ]
}
.groupTuple()
.map {
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
addNewMetaFromAttributes( it, "reference" , "reference" , false )
}
.combine(
by:0,
ch_refs
if ( params.deduplication_skipregionsplit ) {

// Bams were never split by region, so re-merging is bypassed
ch_input_for_samtools_sort_dedupped = ch_dedupped_bam

} else {

// Re-merging of bams-by-contig must take place after deduplication
ch_input_for_samtools_merge = ch_dedupped_bam
.map {
meta, bam ->
meta2 = meta.clone().findAll{ it.key != 'genomic_region' }
[ meta2, bam ]
}
.groupTuple()
.map {
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
addNewMetaFromAttributes( it, "reference" , "reference" , false )
}
.combine(
by:0,
ch_refs
)
.multiMap{
// bam here is a list of bams
ignore_me, meta, bam, meta2, fasta, fasta_fai ->
bam: [ meta, bam ]
fasta: [ meta2, fasta ]
fasta_fai: [ meta2, fasta_fai ]
}

// Merge the bams for each region into one bam
SAMTOOLS_MERGE_DEDUPPED(
ch_input_for_samtools_merge.bam,
ch_input_for_samtools_merge.fasta,
ch_input_for_samtools_merge.fasta_fai
)
.multiMap{
// bam here is a list of bams
ignore_me, meta, bam, meta2, fasta, fasta_fai ->
bam: [ meta, bam ]
fasta: [ meta2, fasta ]
fasta_fai: [ meta2, fasta_fai ]
}
ch_versions = ch_versions.mix( SAMTOOLS_MERGE_DEDUPPED.out.versions )

// Merge the bams for each region into one bam
SAMTOOLS_MERGE_DEDUPPED(
ch_input_for_samtools_merge.bam,
ch_input_for_samtools_merge.fasta,
ch_input_for_samtools_merge.fasta_fai
)
ch_versions = ch_versions.mix( SAMTOOLS_MERGE_DEDUPPED.out.versions )
ch_input_for_samtools_sort_dedupped = SAMTOOLS_MERGE_DEDUPPED.out.bam

}


// Sort the merged bam and index
SAMTOOLS_SORT_DEDUPPED ( SAMTOOLS_MERGE_DEDUPPED.out.bam )
SAMTOOLS_SORT_DEDUPPED ( ch_input_for_samtools_sort_dedupped )
ch_versions = ch_versions.mix( SAMTOOLS_SORT_DEDUPPED.out.versions )
ch_dedup_bam = SAMTOOLS_SORT_DEDUPPED.out.bam

SAMTOOLS_INDEX_DEDUPPED ( ch_dedup_bam )
ch_versions = ch_versions.mix( SAMTOOLS_INDEX_DEDUPPED.out.versions )
ch_dedup_bai = params.fasta_largeref ? SAMTOOLS_INDEX_DEDUPPED.out.csi : SAMTOOLS_INDEX_DEDUPPED.out.bai
ch_dedup_bai = params.fasta_largeref ? SAMTOOLS_INDEX_DEDUPPED.out.csi : SAMTOOLS_INDEX_DEDUPPED.out.bai

// Finally run flagstat on the dedupped bam
ch_input_for_samtools_flagstat = ch_dedup_bam.join( ch_dedup_bai )

SAMTOOLS_FLAGSTAT_DEDUPPED(
ch_input_for_samtools_flagstat
)

ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_DEDUPPED.out.versions )
ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_DEDUPPED.out.flagstat )
ch_dedup_flagstat = SAMTOOLS_FLAGSTAT_DEDUPPED.out.flagstat
Expand Down
16 changes: 2 additions & 14 deletions tests/test_microbial.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,6 @@
"BEDTOOLS_COVERAGE_DEPTH": {
"bedtools": "2.31.1)"
},
"BUILD_INTERVALS": {
"gawk": "5.1.0"
},
"BWA_ALN": {
"bwa": "0.7.18-r1243-dirty"
},
Expand Down Expand Up @@ -697,9 +694,6 @@
"SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES": {
"samtools": 1.18
},
"SAMTOOLS_INDEX": {
"samtools": 1.18
},
"SAMTOOLS_INDEX_DEDUPPED": {
"samtools": 1.18
},
Expand All @@ -712,9 +706,6 @@
"SAMTOOLS_LENGTH_FILTER_INDEX": {
"samtools": 1.18
},
"SAMTOOLS_MERGE_DEDUPPED": {
"samtools": 1.18
},
"SAMTOOLS_MERGE_LIBRARIES": {
"samtools": 1.18
},
Expand All @@ -724,9 +715,6 @@
"SAMTOOLS_SORT_MERGED_LIBRARIES": {
"samtools": 1.18
},
"SAMTOOLS_VIEW": {
"samtools": 1.18
},
"SAMTOOLS_VIEW_BAM_FILTERING": {
"samtools": 1.18
},
Expand All @@ -740,9 +728,9 @@
],
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.04.8"
"nextflow": "25.10.3"
},
"timestamp": "2025-11-07T11:08:47.368306"
"timestamp": "2026-02-13T09:52:11.74559929"
},
"authentication": {
"content": [
Expand Down
Loading