Skip to content

Commit 0427305

Browse files
pinin4fjords and claude committed
feat(orftable_fasta_gtf_buildorfcatalogue): build catalogue with optional smORF collapse
Lift the merged BED12 to an amino-acid FASTA (bedtools/getfasta -split -s -nameOnly, then seqkit/translate --trim), cluster it with mmseqs/easycluster, and feed the clusters to custom/orfcollapse. The collapse pass is gated by a val_collapse boolean and routed via branch/mix: when false, the merged catalogue is emitted unchanged and the clustering modules do not run.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 348b7cd commit 0427305

5 files changed

Lines changed: 202 additions & 22 deletions

File tree

subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/main.nf

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
// Build a multi-caller Ribo-seq ORF catalogue from per-sample, per-caller
33
// ORF prediction tables. Composes the normaliser (run once per caller-tagged
44
// input), the merger (one cohort-level invocation), and bedtools/getfasta +
5-
// seqkit/translate to produce the catalogue AA FASTA.
5+
// seqkit/translate to produce the catalogue AA FASTA. An optional amino-acid
6+
// clustering pass (controlled by `val_collapse`) folds duplicate small ORFs.
67
//
78

89
include { CUSTOM_ORFNORMALISE } from '../../../modules/nf-core/custom/orfnormalise/main'
910
include { CUSTOM_ORFMERGE } from '../../../modules/nf-core/custom/orfmerge/main'
1011
include { BEDTOOLS_GETFASTA } from '../../../modules/nf-core/bedtools/getfasta/main'
1112
include { SEQKIT_TRANSLATE } from '../../../modules/nf-core/seqkit/translate/main'
13+
include { MMSEQS_EASYCLUSTER } from '../../../modules/nf-core/mmseqs/easycluster/main'
14+
include { CUSTOM_ORFCOLLAPSE } from '../../../modules/nf-core/custom/orfcollapse/main'
1215

1316
workflow ORFTABLE_FASTA_GTF_BUILDORFCATALOGUE {
1417

@@ -20,6 +23,10 @@ workflow ORFTABLE_FASTA_GTF_BUILDORFCATALOGUE {
2023
ch_fasta // channel: [ val(meta), path(fasta) ] - reference genome FASTA
2124
ch_gtf // channel: [ val(meta), path(gtf) ] - reference GTF (used by
2225
// ribocode/ribotish normalisers; ignored by rpbp/price)
26+
val_collapse // boolean: cluster catalogue peptides by amino-acid identity
27+
// and fold duplicate small ORFs to one representative
28+
// each. When false the merged catalogue is emitted
29+
// unchanged and the clustering modules do not run.
2330

2431
main:
2532

@@ -56,16 +63,45 @@ workflow ORFTABLE_FASTA_GTF_BUILDORFCATALOGUE {
5663

5764
CUSTOM_ORFMERGE ( ch_merge_in )
5865

59-
// 3. Lift the merged BED12 into nucleotide then amino-acid FASTA.
60-
// `bedtools getfasta -split` walks BED12 blocks in mRNA order;
61-
// `seqkit translate --trim` drops trailing stops.
66+
// 3. Lift the merged BED12 into nucleotide then amino-acid FASTA, keyed by
67+
// orf_id. `bedtools getfasta -split -s -nameOnly` walks BED12 blocks in
68+
// mRNA order on the correct strand and names each sequence by the BED
69+
// name (orf_id); `seqkit translate --trim` drops trailing stops.
6270
BEDTOOLS_GETFASTA ( CUSTOM_ORFMERGE.out.bed12, ch_fasta.map { _meta, fa -> fa }.first() )
6371
SEQKIT_TRANSLATE ( BEDTOOLS_GETFASTA.out.fasta )
6472

73+
// 4. Assemble the full merged catalogue (BED12 + tables + AA FASTA) on one
74+
// cohort record, then route by `val_collapse`. The coordinate merge only
75+
// collapses genomically overlapping ORFs, so the same micropeptide
76+
// encoded at distinct loci survives as separate rows; the collapse route
77+
// clusters peptides by amino-acid identity and folds the small-ORF
78+
// (aa_length <= 100) clusters to one representative each (GENCODE
79+
// Ribo-seq ORF catalogue convention, Mudge et al. 2022). The keep route
80+
// emits the merged catalogue untouched.
81+
ch_routed = CUSTOM_ORFMERGE.out.bed12
82+
.join(CUSTOM_ORFMERGE.out.catalogue_tsv)
83+
.join(CUSTOM_ORFMERGE.out.orf_to_gene_tsv)
84+
.join(CUSTOM_ORFMERGE.out.multiqc)
85+
.join(SEQKIT_TRANSLATE.out.fastx)
86+
.branch { _meta, _bed, _tsv, _o2g, _mqc, _aa ->
87+
collapse: val_collapse
88+
keep : true
89+
}
90+
91+
MMSEQS_EASYCLUSTER ( ch_routed.collapse.map { meta, _bed, _tsv, _o2g, _mqc, aa -> [ meta, aa ] } )
92+
93+
CUSTOM_ORFCOLLAPSE (
94+
ch_routed.collapse
95+
.map { meta, bed, tsv, o2g, _mqc, aa -> [ meta, bed, tsv, o2g, aa ] }
96+
.join(MMSEQS_EASYCLUSTER.out.tsv)
97+
)
98+
99+
// For a given run exactly one route carries records, so `mix` yields a
100+
// single catalogue per output channel.
65101
emit:
66-
catalogue_bed12 = CUSTOM_ORFMERGE.out.bed12
67-
catalogue_tsv = CUSTOM_ORFMERGE.out.catalogue_tsv
68-
orf_to_gene_tsv = CUSTOM_ORFMERGE.out.orf_to_gene_tsv
69-
catalogue_aa_fasta = SEQKIT_TRANSLATE.out.fastx
70-
multiqc = CUSTOM_ORFMERGE.out.multiqc
102+
catalogue_bed12 = CUSTOM_ORFCOLLAPSE.out.bed12.mix(ch_routed.keep.map { meta, bed, _tsv, _o2g, _mqc, _aa -> [ meta, bed ] })
103+
catalogue_tsv = CUSTOM_ORFCOLLAPSE.out.catalogue_tsv.mix(ch_routed.keep.map { meta, _bed, tsv, _o2g, _mqc, _aa -> [ meta, tsv ] })
104+
orf_to_gene_tsv = CUSTOM_ORFCOLLAPSE.out.orf_to_gene_tsv.mix(ch_routed.keep.map { meta, _bed, _tsv, o2g, _mqc, _aa -> [ meta, o2g ] })
105+
catalogue_aa_fasta = CUSTOM_ORFCOLLAPSE.out.aa_fasta.mix(ch_routed.keep.map { meta, _bed, _tsv, _o2g, _mqc, aa -> [ meta, aa ] })
106+
multiqc = CUSTOM_ORFCOLLAPSE.out.multiqc.mix(ch_routed.keep.map { meta, _bed, _tsv, _o2g, mqc, _aa -> [ meta, mqc ] })
71107
}

subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/meta.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ description: |
1111
clustering of normalised calls across callers and samples.
1212
3. `bedtools/getfasta` + `seqkit/translate` - lifts the merged BED12
1313
into a catalogue AA FASTA.
14+
4. `mmseqs/easycluster` + `custom/orfcollapse` (optional, `val_collapse`) -
15+
clusters catalogue peptides by amino-acid identity and folds duplicate
16+
small ORFs to one representative each.
1417
1518
Supports five callers via the per-record `caller` val: ribocode, ribotish,
1619
ribotricer, rpbp, price. Any subset can be supplied. When `ch_orf_tables` is empty
@@ -32,6 +35,8 @@ components:
3235
- custom/orfmerge
3336
- bedtools/getfasta
3437
- seqkit/translate
38+
- mmseqs/easycluster
39+
- custom/orfcollapse
3540
input:
3641
- ch_orf_tables:
3742
description: |
@@ -53,6 +58,13 @@ input:
5358
(ribotricer, rpbp, price). Pass `[ [:], [] ]` if no GTF is
5459
available (caller subset must then exclude ribocode and ribotish).
5560
Structure: [ val(meta), path(gtf) ]
61+
- val_collapse:
62+
description: |
63+
Cluster catalogue peptides by amino-acid identity (mmseqs/easycluster)
64+
and fold each multi-member small-ORF cluster to one representative
65+
(custom/orfcollapse). When false the merged catalogue is emitted
66+
unchanged and the clustering modules do not run.
67+
Structure: val(collapse)
5668
output:
5769
- catalogue_bed12:
5870
description: |

subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/nextflow.config

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ process {
44
ext.args = '-split -s -nameOnly'
55
}
66
withName: 'ORFTABLE_FASTA_GTF_BUILDORFCATALOGUE:SEQKIT_TRANSLATE' {
7-
ext.prefix = { "${meta.id}.catalogue.aa" }
7+
ext.prefix = { "${meta.id}.catalogue" }
88
ext.args = '--trim'
99
}
10+
withName: 'ORFTABLE_FASTA_GTF_BUILDORFCATALOGUE:MMSEQS_EASYCLUSTER' {
11+
ext.args = '--min-seq-id 0.9 -c 0.8 --cov-mode 0'
12+
}
1013
}

subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ nextflow_workflow {
1212
tag "custom/orfmerge"
1313
tag "bedtools/getfasta"
1414
tag "seqkit/translate"
15+
tag "mmseqs/easycluster"
16+
tag "custom/orfcollapse"
1517
tag "gunzip"
1618

1719
setup {
@@ -36,12 +38,12 @@ nextflow_workflow {
3638
input[0] = channel.of(
3739
[
3840
[id: 'sample1'],
39-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
41+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
4042
'ribotish'
4143
],
4244
[
4345
[id: 'sample1'],
44-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
46+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
4547
'ribocode'
4648
]
4749
)
@@ -50,6 +52,42 @@ nextflow_workflow {
5052
[id: 'reference'],
5153
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/Homo_sapiens.GRCh38.111_chr20.gtf', checkIfExists: true)
5254
])
55+
input[3] = true
56+
"""
57+
}
58+
}
59+
60+
then {
61+
assertAll(
62+
{ assert workflow.success },
63+
{ assert snapshot(workflow.out).match() }
64+
)
65+
}
66+
}
67+
68+
test("homo_sapiens [chr20] - ribotish + ribocode - collapse disabled") {
69+
70+
when {
71+
workflow {
72+
"""
73+
input[0] = channel.of(
74+
[
75+
[id: 'sample1'],
76+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
77+
'ribotish'
78+
],
79+
[
80+
[id: 'sample1'],
81+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
82+
'ribocode'
83+
]
84+
)
85+
input[1] = GUNZIP.out.gunzip
86+
input[2] = channel.of([
87+
[id: 'reference'],
88+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/Homo_sapiens.GRCh38.111_chr20.gtf', checkIfExists: true)
89+
])
90+
input[3] = false
5391
"""
5492
}
5593
}

subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap

Lines changed: 101 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,31 @@
77
{
88
"id": "cohort"
99
},
10-
"cohort.catalogue.bed12:md5,d0de23019fbec9d96fd9302103a16278"
10+
"cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
1111
]
1212
],
1313
"1": [
1414
[
1515
{
1616
"id": "cohort"
1717
},
18-
"cohort.catalogue.tsv:md5,9b7e37df457221a36257c5a7772db5e7"
18+
"cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
1919
]
2020
],
2121
"2": [
2222
[
2323
{
2424
"id": "cohort"
2525
},
26-
"cohort.orf_to_gene.tsv:md5,79699c0188eadd2aaf912a31a2c14464"
26+
"cohort.catalogue.orf_to_gene.tsv:md5,eebb86a1c52a7418ee9dc68c53bd9866"
2727
]
2828
],
2929
"3": [
3030
[
3131
{
3232
"id": "cohort"
3333
},
34-
"cohort.catalogue.aa.fasta:md5,f268ea70badac0c99fd682b389d019b9"
34+
"cohort.catalogue.fasta:md5,362312506423e6861f84108f226a6ad3"
3535
]
3636
],
3737
"4": [
@@ -47,23 +47,23 @@
4747
{
4848
"id": "cohort"
4949
},
50-
"cohort.catalogue.aa.fasta:md5,f268ea70badac0c99fd682b389d019b9"
50+
"cohort.catalogue.fasta:md5,362312506423e6861f84108f226a6ad3"
5151
]
5252
],
5353
"catalogue_bed12": [
5454
[
5555
{
5656
"id": "cohort"
5757
},
58-
"cohort.catalogue.bed12:md5,d0de23019fbec9d96fd9302103a16278"
58+
"cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
5959
]
6060
],
6161
"catalogue_tsv": [
6262
[
6363
{
6464
"id": "cohort"
6565
},
66-
"cohort.catalogue.tsv:md5,9b7e37df457221a36257c5a7772db5e7"
66+
"cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
6767
]
6868
],
6969
"multiqc": [
@@ -79,15 +79,106 @@
7979
{
8080
"id": "cohort"
8181
},
82-
"cohort.orf_to_gene.tsv:md5,79699c0188eadd2aaf912a31a2c14464"
82+
"cohort.catalogue.orf_to_gene.tsv:md5,eebb86a1c52a7418ee9dc68c53bd9866"
8383
]
8484
]
8585
}
8686
],
87-
"timestamp": "2026-06-12T13:28:39.988782",
87+
"timestamp": "2026-06-16T12:00:23.161222511",
8888
"meta": {
8989
"nf-test": "0.9.5",
90-
"nextflow": "25.10.4"
90+
"nextflow": "26.04.1"
91+
}
92+
},
93+
"homo_sapiens [chr20] - ribotish + ribocode - collapse disabled": {
94+
"content": [
95+
{
96+
"0": [
97+
[
98+
{
99+
"id": "cohort"
100+
},
101+
"cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
102+
]
103+
],
104+
"1": [
105+
[
106+
{
107+
"id": "cohort"
108+
},
109+
"cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
110+
]
111+
],
112+
"2": [
113+
[
114+
{
115+
"id": "cohort"
116+
},
117+
"cohort.catalogue.orf_to_gene.tsv:md5,eebb86a1c52a7418ee9dc68c53bd9866"
118+
]
119+
],
120+
"3": [
121+
[
122+
{
123+
"id": "cohort"
124+
},
125+
"cohort.catalogue.fasta:md5,8cbe5d11cbc9cfe0ef67d875b2632fe0"
126+
]
127+
],
128+
"4": [
129+
[
130+
{
131+
"id": "cohort"
132+
},
133+
"cohort.catalogue.mqc.tsv:md5,d8e6da169d5d0a95b9bb4332290d1dd6"
134+
]
135+
],
136+
"catalogue_aa_fasta": [
137+
[
138+
{
139+
"id": "cohort"
140+
},
141+
"cohort.catalogue.fasta:md5,8cbe5d11cbc9cfe0ef67d875b2632fe0"
142+
]
143+
],
144+
"catalogue_bed12": [
145+
[
146+
{
147+
"id": "cohort"
148+
},
149+
"cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
150+
]
151+
],
152+
"catalogue_tsv": [
153+
[
154+
{
155+
"id": "cohort"
156+
},
157+
"cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
158+
]
159+
],
160+
"multiqc": [
161+
[
162+
{
163+
"id": "cohort"
164+
},
165+
"cohort.catalogue.mqc.tsv:md5,d8e6da169d5d0a95b9bb4332290d1dd6"
166+
]
167+
],
168+
"orf_to_gene_tsv": [
169+
[
170+
{
171+
"id": "cohort"
172+
},
173+
"cohort.catalogue.orf_to_gene.tsv:md5,eebb86a1c52a7418ee9dc68c53bd9866"
174+
]
175+
]
176+
}
177+
],
178+
"timestamp": "2026-06-16T12:00:30.555944584",
179+
"meta": {
180+
"nf-test": "0.9.5",
181+
"nextflow": "26.04.1"
91182
}
92183
}
93184
}

0 commit comments

Comments
 (0)