Skip to content

Commit 348b7cd

Browse files
pinin4fjords and claude committed
fix(custom/orfmerge): deterministic orf_id ordering and prefix-based output naming
Sort clusters by genomic coordinate (chrom, start, end, strand, gene_id, transcript_id, orf_class) before assigning orf_ids, so that numbering no longer follows the nondeterministic .collect() order. Move the "catalogue" descriptor out of the hard-coded path() output names and into the default prefix (${meta.id}.catalogue).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 7a05628 commit 348b7cd

5 files changed

Lines changed: 49 additions & 33 deletions

File tree

modules/nf-core/custom/orfmerge/main.nf

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,27 +11,27 @@ process CUSTOM_ORFMERGE {
1111
tuple val(meta), path(bed12s, arity: '1..*', stageAs: 'beds/*'), path(tsvs, arity: '1..*', stageAs: 'tsvs/*')
1212

1313
output:
14-
tuple val(meta), path("${prefix}.catalogue.bed12") , emit: bed12
15-
tuple val(meta), path("${prefix}.catalogue.tsv") , emit: catalogue_tsv
16-
tuple val(meta), path("${prefix}.orf_to_gene.tsv") , emit: orf_to_gene_tsv
17-
tuple val(meta), path("${prefix}.catalogue.mqc.tsv") , emit: multiqc
18-
path "versions.yml" , emit: versions, topic: versions
14+
tuple val(meta), path("${prefix}.bed12") , emit: bed12
15+
tuple val(meta), path("${prefix}.tsv") , emit: catalogue_tsv
16+
tuple val(meta), path("${prefix}.orf_to_gene.tsv") , emit: orf_to_gene_tsv
17+
tuple val(meta), path("${prefix}.mqc.tsv") , emit: multiqc
18+
path "versions.yml" , emit: versions, topic: versions
1919

2020
when:
2121
task.ext.when == null || task.ext.when
2222

2323
script:
24-
prefix = task.ext.prefix ?: "${meta.id}"
24+
prefix = task.ext.prefix ?: "${meta.id}.catalogue"
2525
args = task.ext.args ?: ''
2626
template 'orfmerge.py'
2727

2828
stub:
29-
prefix = task.ext.prefix ?: "${meta.id}"
29+
prefix = task.ext.prefix ?: "${meta.id}.catalogue"
3030
"""
31-
touch ${prefix}.catalogue.bed12
32-
touch ${prefix}.catalogue.tsv
31+
touch ${prefix}.bed12
32+
touch ${prefix}.tsv
3333
touch ${prefix}.orf_to_gene.tsv
34-
touch ${prefix}.catalogue.mqc.tsv
34+
touch ${prefix}.mqc.tsv
3535
3636
python -c "import platform, yaml; yaml.safe_dump({'${task.process}': {'python': platform.python_version()}}, open('versions.yml', 'w'), default_flow_style=False, sort_keys=False)"
3737
"""

modules/nf-core/custom/orfmerge/meta.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,25 +88,25 @@ output:
8888
type: map
8989
description: |
9090
Groovy Map matching the input meta.
91-
- ${prefix}.catalogue.bed12:
91+
- ${prefix}.bed12:
9292
type: file
9393
description: Merged ORF catalogue as BED12 (genomic blocks).
94-
pattern: "*.catalogue.bed12"
94+
pattern: "*.bed12"
9595
ontologies:
9696
- edam: http://edamontology.org/format_3586 # BED
9797
catalogue_tsv:
9898
- - meta:
9999
type: map
100100
description: |
101101
Groovy Map matching the input meta.
102-
- ${prefix}.catalogue.tsv:
102+
- ${prefix}.tsv:
103103
type: file
104104
description: |
105105
Per-ORF table with `called_by_<caller>` and `score_<caller>`
106106
columns for each supported caller, `n_samples` / `samples`
107107
cross-sample recurrence columns, plus orf_class, aa_length,
108108
and host gene_id / transcript_id.
109-
pattern: "*.catalogue.tsv"
109+
pattern: "*.tsv"
110110
ontologies:
111111
- edam: http://edamontology.org/format_3475 # TSV
112112
orf_to_gene_tsv:
@@ -129,11 +129,11 @@ output:
129129
type: map
130130
description: |
131131
Groovy Map matching the input meta.
132-
- ${prefix}.catalogue.mqc.tsv:
132+
- ${prefix}.mqc.tsv:
133133
type: file
134134
description: |
135135
MultiQC custom-content TSV (per-class ORF counts).
136-
pattern: "*.catalogue.mqc.tsv"
136+
pattern: "*.mqc.tsv"
137137
ontologies:
138138
- edam: http://edamontology.org/format_3475 # TSV
139139
versions:

modules/nf-core/custom/orfmerge/templates/orfmerge.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@
2626
2727
Outputs:
2828
29-
${prefix}.catalogue.bed12 merged catalogue (genomic blocks).
30-
${prefix}.catalogue.tsv per-ORF table with caller-tracking cols.
29+
${prefix}.bed12 merged catalogue (genomic blocks).
30+
${prefix}.tsv per-ORF table with caller-tracking cols.
3131
${prefix}.orf_to_gene.tsv one row per (orf_id, gene_id, transcript_id);
3232
an ORF can map to multiple host transcripts.
33-
${prefix}.catalogue.mqc.tsv MultiQC custom-content sidecar
33+
${prefix}.mqc.tsv MultiQC custom-content sidecar
3434
(per-class counts).
3535
"""
3636

@@ -183,10 +183,10 @@ def load_normalised(tsv_paths, bed_paths):
183183

184184

185185
def write_catalogue(prefix, clusters, bed_index):
186-
cat_bed = Path(f"{prefix}.catalogue.bed12")
187-
cat_tsv = Path(f"{prefix}.catalogue.tsv")
186+
cat_bed = Path(f"{prefix}.bed12")
187+
cat_tsv = Path(f"{prefix}.tsv")
188188
o2g_tsv = Path(f"{prefix}.orf_to_gene.tsv")
189-
mqc_tsv = Path(f"{prefix}.catalogue.mqc.tsv")
189+
mqc_tsv = Path(f"{prefix}.mqc.tsv")
190190

191191
catalogue_cols = (
192192
["orf_id", "chrom", "start", "end", "strand", "gene_id", "transcript_id", "orf_class", "aa_length"]
@@ -197,6 +197,22 @@ def write_catalogue(prefix, clusters, bed_index):
197197

198198
per_class_counts = defaultdict(int)
199199

200+
# orf_ids are assigned in iteration order below, so sort first to make the
201+
# numbering deterministic.
202+
def _sort_key(cluster):
    """Return the tuple used to order clusters before orf_ids are assigned.

    The key is built from the cluster's representative record, in this
    order: chrom, start, end, strand, gene_id, transcript_id, orf_class.
    Start and end are converted to int; the remaining fields are used as
    stored, with a missing or empty value replaced by an empty string.
    """
    r = representative(cluster)

    def _text(field):
        # Treat a missing key and a falsy stored value (None, "") alike, so
        # a None is never compared against a str while sorting.
        return r.get(field) or ""

    def _number(field):
        # A missing or empty coordinate sorts as 0.
        return int(r.get(field) or 0)

    return (
        _text("chrom"),
        _number("start"),
        _number("end"),
        _text("strand"),
        _text("gene_id"),
        _text("transcript_id"),
        _text("orf_class"),
    )
213+
214+
clusters = sorted(clusters, key=_sort_key)
215+
200216
with open(cat_bed, "w") as bh, open(cat_tsv, "w") as th, open(o2g_tsv, "w") as oh:
201217
th.write("\\t".join(catalogue_cols) + "\\n")
202218
oh.write("orf_id\\tgene_id\\ttranscript_id\\n")

modules/nf-core/custom/orfmerge/tests/main.nf.test

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ nextflow_process {
1818
"""
1919
input[0] = channel.of([
2020
[id: 'sample1', caller: 'ribotish'],
21-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
21+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
2222
'ribotish'
2323
])
2424
input[1] = channel.of([
@@ -35,7 +35,7 @@ nextflow_process {
3535
"""
3636
input[0] = channel.of([
3737
[id: 'sample1', caller: 'ribocode'],
38-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
38+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
3939
'ribocode'
4040
])
4141
input[1] = channel.of([
@@ -52,7 +52,7 @@ nextflow_process {
5252
"""
5353
input[0] = channel.of([
5454
[id: 'sample2', caller: 'ribotish'],
55-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
55+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
5656
'ribotish'
5757
])
5858
input[1] = channel.of([

modules/nf-core/custom/orfmerge/tests/main.nf.test.snap

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,23 @@
77
{
88
"id": "cohort"
99
},
10-
"cohort.catalogue.bed12:md5,d0de23019fbec9d96fd9302103a16278"
10+
"cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
1111
]
1212
],
1313
"1": [
1414
[
1515
{
1616
"id": "cohort"
1717
},
18-
"cohort.catalogue.tsv:md5,d2695c6f6eab75fd03a20d7441533d36"
18+
"cohort.catalogue.tsv:md5,d6af31aac0548c8ce0f447be02d06397"
1919
]
2020
],
2121
"2": [
2222
[
2323
{
2424
"id": "cohort"
2525
},
26-
"cohort.orf_to_gene.tsv:md5,79699c0188eadd2aaf912a31a2c14464"
26+
"cohort.catalogue.orf_to_gene.tsv:md5,eebb86a1c52a7418ee9dc68c53bd9866"
2727
]
2828
],
2929
"3": [
@@ -42,15 +42,15 @@
4242
{
4343
"id": "cohort"
4444
},
45-
"cohort.catalogue.bed12:md5,d0de23019fbec9d96fd9302103a16278"
45+
"cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
4646
]
4747
],
4848
"catalogue_tsv": [
4949
[
5050
{
5151
"id": "cohort"
5252
},
53-
"cohort.catalogue.tsv:md5,d2695c6f6eab75fd03a20d7441533d36"
53+
"cohort.catalogue.tsv:md5,d6af31aac0548c8ce0f447be02d06397"
5454
]
5555
],
5656
"multiqc": [
@@ -66,18 +66,18 @@
6666
{
6767
"id": "cohort"
6868
},
69-
"cohort.orf_to_gene.tsv:md5,79699c0188eadd2aaf912a31a2c14464"
69+
"cohort.catalogue.orf_to_gene.tsv:md5,eebb86a1c52a7418ee9dc68c53bd9866"
7070
]
7171
],
7272
"versions": [
7373
"versions.yml:md5,8e0b24bd9e91050a8715f1b1b61f0805"
7474
]
7575
}
7676
],
77-
"timestamp": "2026-06-12T13:28:01.081036",
77+
"timestamp": "2026-06-16T11:49:30.349632545",
7878
"meta": {
7979
"nf-test": "0.9.5",
80-
"nextflow": "25.10.4"
80+
"nextflow": "26.04.1"
8181
}
8282
}
8383
}

0 commit comments

Comments
 (0)