Skip to content

Commit ac8d113

Browse files
authored
Merge pull request #552 from nf-core/star-igenomes-compatibility
Star igenomes compatibility
2 parents e8070df + 8bb0082 commit ac8d113

12 files changed

Lines changed: 520 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1212
- Address [#512](https://github.com/nf-core/scrnaseq/issues/512), adding early validation of the cellranger multi barcode sheet ([#513](https://github.com/nf-core/scrnaseq/pull/513))
1313
- Update `nf-core/cellranger` modules to Cell Ranger `10.0.0`, including output channel handling for multiplexed experiments ([#508](https://github.com/nf-core/scrnaseq/pull/508))
1414
- Replace **alevinqc** with **qcatch** for simpleaf QC; add `--skip_qcatch` parameter ([#520](https://github.com/nf-core/scrnaseq/pull/520))
15+
- Add legacy STAR iGenomes index compatibility (rnaseq-style `star_legacy` handling and `STAR_GENOMEPARAMS_UPGRADE` to rewrite STAR 2.6.x `genomeParameters.txt` metadata for modern STAR) ([#552](https://github.com/nf-core/scrnaseq/pull/552))
1516

1617
### Chore
1718

conf/igenomes.config

Lines changed: 38 additions & 0 deletions
Large diffs are not rendered by default.

docs/usage.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,28 @@ sample,multiplexed_sample_id,probe_barcode_ids,cmo_ids,ocm_ids,description
378378

379379
> You must provide the barcodes CSV with the `--cellranger_multi_barcodes` parameter.
380380
381+
## Reference genome options
382+
383+
The pipeline can resolve reference files from `conf/igenomes.config` when you provide `--genome`, for example `--genome GRCh38`. These entries may include pre-built aligner indices such as STAR indices, depending on the selected genome.
384+
385+
Some AWS iGenomes STAR indices were generated with older STAR versions and contain legacy metadata. nf-core/scrnaseq includes a compatibility step for these configured iGenomes entries so that legacy STAR indices can run with the STAR version shipped by the pipeline. This support is intended to keep existing iGenomes usage working, not to make legacy indices the preferred reference for new analyses.
386+
387+
> [!WARNING]
388+
> For production runs, we recommend building fresh indices from current reference files instead of relying on legacy AWS iGenomes indices. The nf-core [reference genome documentation](https://nf-co.re/docs/running/reference-genomes) warns that AWS iGenomes annotations are significantly outdated (for example, the human annotations come from Ensembl release 75) and that the GRCh38 iGenomes entry uses the NCBI assembly rather than the masked Ensembl assembly.
389+
390+
To generate and keep a STAR index for future runs, provide current FASTA and GTF files and set `--save_reference`:
391+
392+
```bash
393+
nextflow run nf-core/scrnaseq \
394+
--input samplesheet.csv \
395+
--outdir results \
396+
--aligner star \
397+
--fasta reference.fa.gz \
398+
--gtf annotation.gtf.gz \
399+
--save_reference \
400+
-profile docker
401+
```
402+
381403
## Running the pipeline
382404

383405
The minimum typical command for running the pipeline is as follows:
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
- conda-forge::gawk=5.3.1
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Produce a `star/` directory that mirrors a pre-built STAR index, with
// `genomeParameters.txt` rewritten for legacy indices.
//
// Inputs:
//   meta  - Groovy map; `meta.id` is used for the task tag when set, otherwise the index name.
//   index - STAR index directory, staged under the fixed name `input_index`.
//
// Outputs:
//   index         - tuple of `meta` and the `star/` directory.
//   versions_gawk - process name, tool name and the awk version string, also sent to the `versions` topic.
process STAR_GENOMEPARAMS_UPGRADE {
    tag "${meta.id ?: index.name}"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a1/a125c778baf3865331101a104b60d249ee15fe1dca13bdafd888926cc5490a34/data' :
        'community.wave.seqera.io/library/gawk:5.3.1--e09efb5dfc4b8156' }"

    input:
    tuple val(meta), path(index, stageAs: 'input_index')

    output:
    tuple val(meta), path('star'), emit: index
    tuple val("${task.process}"), val('gawk'), eval("awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//'"), topic: versions, emit: versions_gawk

    when:
    task.ext.when == null || task.ext.when

    // Step 1: link every entry of the input index into `star/`, except
    //         `genomeParameters.txt`, which is regenerated in step 2.
    // Step 2: stream `genomeParameters.txt` through awk. A `versionGenome 20201`
    //         line is replaced by `versionGenome 2.7.4a`; every other line is
    //         printed unchanged. Only when that replacement happened, the keys
    //         `genomeType`, `genomeTransformType` and `genomeTransformVCF` are
    //         appended with the values `Full`, `None` and `-` if the file did
    //         not already define them.
    // NOTE(review): the links target the fully resolved (`readlink -f`) paths of
    // the staged inputs - confirm those paths remain reachable from downstream
    // task containers.
    script:
    """
    mkdir -p star
    for entry in input_index/*; do
        base=\$(basename "\$entry")
        if [ "\$base" != "genomeParameters.txt" ]; then
            ln -s "\$(readlink -f "\$entry")" "star/\$base"
        fi
    done

    awk -F'\\t' -v OFS='\\t' '
        {
            if (\$1 == "versionGenome" && \$2 == "20201") {
                print "versionGenome", "2.7.4a"
                upgraded = 1
                next
            }
            if (\$1 == "genomeType") has_type = 1
            if (\$1 == "genomeTransformType") has_transform_type = 1
            if (\$1 == "genomeTransformVCF") has_transform_vcf = 1
            print
        }
        END {
            if (upgraded) {
                if (!has_type) print "genomeType", "Full"
                if (!has_transform_type) print "genomeTransformType", "None"
                if (!has_transform_vcf) print "genomeTransformVCF", "-"
            }
        }
    ' "input_index/genomeParameters.txt" > star/genomeParameters.txt
    """

    // Stub: link every entry of the input index, including the unmodified
    // `genomeParameters.txt`, into `star/` without running awk.
    stub:
    """
    mkdir -p star
    for entry in input_index/*; do
        ln -s "\$(readlink -f "\$entry")" "star/\$(basename "\$entry")"
    done
    """
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
3+
name: "star_genomeparams_upgrade"
4+
description: |
5+
Upgrade STAR 2.6.x `genomeParameters.txt` metadata into the 2.7.4a-compatible schema. The
6+
on-disk binary index files (Genome, the SA suffix array, SAindex, etc.) did not change between
7+
STAR 2.6.x and 2.7.4a; only the metadata schema in `genomeParameters.txt` differs. This
8+
process symlinks every binary file from the input index unchanged and writes a patched copy
9+
of `genomeParameters.txt` next to them, so the upgraded index can be loaded by modern STAR.
10+
11+
Idempotent: indices already at `versionGenome 2.7.4a` (or any tag other than `20201`) stream
12+
through unmodified. Targets the AWS iGenomes STAR indices in particular, which all carry
13+
`versionGenome 20201` and lack the `genomeType` / `genomeTransformType` / `genomeTransformVCF`
14+
fields written by STAR 2.7.4a+.
15+
keywords:
16+
- star
17+
- igenomes
18+
- scrnaseq
19+
- index
20+
- compatibility
21+
tools:
22+
- "gawk":
23+
description: "GNU awk - used to rewrite genomeParameters.txt metadata"
24+
homepage: "https://www.gnu.org/software/gawk/"
25+
documentation: "https://www.gnu.org/software/gawk/manual/"
26+
licence:
27+
- "GPL v3"
28+
identifier: ""
29+
input:
30+
- - meta:
31+
type: map
32+
description: |
33+
Groovy Map containing index information
34+
e.g. [ id:'star_index' ]
35+
- index:
36+
type: directory
37+
description: |
38+
Pre-built STAR index directory (typically a STAR 2.6.x-built iGenomes index). Must
39+
contain a `genomeParameters.txt` file alongside the binary index files.
40+
pattern: "*"
41+
ontologies: []
42+
output:
43+
index:
44+
- - meta:
45+
type: map
46+
description: |
47+
Groovy Map containing index information
48+
e.g. [ id:'star_index' ]
49+
- star:
50+
type: directory
51+
description: |
52+
Upgraded STAR index directory. Contains symlinks to every binary file from the
53+
input index plus a patched `genomeParameters.txt` rewritten into the 2.7.4a schema.
54+
pattern: "star"
55+
ontologies: []
56+
versions_gawk:
57+
- - ${task.process}:
58+
type: string
59+
description: The name of the process
60+
- gawk:
61+
type: string
62+
description: The name of the tool
63+
- awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//':
64+
type: eval
65+
description: The expression to obtain the version of the tool
66+
topics:
67+
versions:
68+
- - ${task.process}:
69+
type: string
70+
description: The name of the process
71+
- gawk:
72+
type: string
73+
description: The name of the tool
74+
- awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//':
75+
type: eval
76+
description: The expression to obtain the version of the tool
77+
authors:
78+
- "@pinin4fjords"
79+
maintainers:
80+
- "@pinin4fjords"
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
// nf-test suite for STAR_GENOMEPARAMS_UPGRADE.
//
// Each test first builds a small STAR index with STAR_GENOMEGENERATE and then
// feeds it to the process under test:
//   1. legacy index     -> metadata must be rewritten to the 2.7.4a schema
//   2. modern index     -> metadata must pass through without added/duplicated keys
//   3. legacy index     -> stub run, outputs snapshotted
//
// In every `then` block `process.success` is asserted BEFORE any output is
// dereferenced, so a failed task is reported as such instead of as a
// null-dereference on an empty output channel.
nextflow_process {

    name "Test Process STAR_GENOMEPARAMS_UPGRADE"
    script "../main.nf"
    process "STAR_GENOMEPARAMS_UPGRADE"

    test("upgrade legacy 2.6.1d-style index") {

        config "./nextflow.legacy_index.config"

        setup {
            run("STAR_GENOMEGENERATE") {
                script "../../../../modules/nf-core/star/genomegenerate/main.nf"
                process {
                    """
                    input[0] = channel.of([
                        [ id:'test_fasta' ],
                        [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ]
                    ])
                    input[1] = channel.of([
                        [ id:'test_gtf' ],
                        [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ]
                    ])
                    """
                }
            }
        }

        when {
            process {
                """
                input[0] = STAR_GENOMEGENERATE.out.index.map { _meta, idx -> [ [ id:'star_index' ], idx ] }
                """
            }
        }

        then {
            // Check the task status before touching process.out.
            assert process.success

            def patched = path("${process.out.index[0][1]}/genomeParameters.txt").text
            assertAll(
                { assert patched.contains('versionGenome\t2.7.4a') },
                { assert patched.contains('genomeType\tFull') },
                { assert patched.contains('genomeTransformType\tNone') },
                { assert patched.contains('genomeTransformVCF\t-') },
                { assert !patched.contains('versionGenome\t20201') },
                { assert snapshot(
                    process.out.index.collect { meta, idx -> [ meta, file(idx).list().sort() ] }
                ).match() }
            )
        }

        cleanup {
            new File("${launchDir}").deleteDir()
        }
    }

    test("idempotent on modern 2.7.x index") {

        // STAR_GENOMEGENERATE here uses the default modern container (no legacy config),
        // so the produced index is already at versionGenome 2.7.4a; the upgrade process
        // should pass it through unchanged.

        setup {
            run("STAR_GENOMEGENERATE") {
                script "../../../../modules/nf-core/star/genomegenerate/main.nf"
                process {
                    """
                    input[0] = channel.of([
                        [ id:'test_fasta' ],
                        [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ]
                    ])
                    input[1] = channel.of([
                        [ id:'test_gtf' ],
                        [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ]
                    ])
                    """
                }
            }
        }

        when {
            process {
                """
                input[0] = STAR_GENOMEGENERATE.out.index.map { _meta, idx -> [ [ id:'star_index' ], idx ] }
                """
            }
        }

        then {
            // Check the task status before touching process.out.
            assert process.success

            def patched = path("${process.out.index[0][1]}/genomeParameters.txt").text
            def rows = patched.readLines()
            assertAll(
                { assert patched.contains('versionGenome\t2.7.4a') },
                { assert !patched.contains('versionGenome\t20201') },
                // A pass-through must neither rewrite nor append metadata keys.
                { assert rows.count { it.startsWith('versionGenome\t') } == 1 },
                { assert rows.count { it.startsWith('genomeType\t') } <= 1 },
                { assert rows.count { it.startsWith('genomeTransformType\t') } <= 1 },
                { assert rows.count { it.startsWith('genomeTransformVCF\t') } <= 1 }
            )
        }

        cleanup {
            new File("${launchDir}").deleteDir()
        }
    }

    test("upgrade legacy index - stub") {

        options "-stub"
        config "./nextflow.legacy_index.config"

        setup {
            run("STAR_GENOMEGENERATE") {
                script "../../../../modules/nf-core/star/genomegenerate/main.nf"
                process {
                    """
                    input[0] = channel.of([
                        [ id:'test_fasta' ],
                        [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ]
                    ])
                    input[1] = channel.of([
                        [ id:'test_gtf' ],
                        [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ]
                    ])
                    """
                }
            }
        }

        when {
            process {
                """
                input[0] = STAR_GENOMEGENERATE.out.index.map { _meta, idx -> [ [ id:'star_index' ], idx ] }
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }

        cleanup {
            new File("${launchDir}").deleteDir()
        }
    }
}

0 commit comments

Comments
 (0)