Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,10 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results

- name: Run pipeline with test data and precomputed BUSCOs
# You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_nobusco,docker --outdir ./results
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ jobs:

- name: Upload linting log file artifact
if: ${{ always() }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: linting-logs
path: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/sanger_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
"use_work_dir_as_temp": true,
}
profiles: test,sanger,singularity,cleanup
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: Tower debug log file
path: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/sanger_test_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
}
profiles: test_full,sanger,singularity,cleanup
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: Tower debug log file
path: |
Expand Down
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The pipeline is now considered to be a complete and suitable replacement for the
- Updated the Blastn settings to allow 7 days runtime at most, since that
covers 99.7% of the jobs.
- Allow database inputs to be optionally compressed (`.tar.gz`)
- Allow `BUSCO` run outputs to be optionally pre-computed and provided with `--busco_output`

### Software dependencies

Expand All @@ -22,11 +23,19 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i
| Dependency | Old version | New version |
| ----------- | ----------------- | --------------- |
| blast | 2.14.1 and 2.15.0 | only 2.15.0 |
| blobtoolkit | 4.3.9 | 4.4.0 |
| blobtoolkit | 4.3.9 | 4.4.4 |
| busco | 5.5.0 | 5.7.1 |
| multiqc | 1.20 and 1.21 | 1.20 and 1.25.1 |
| samtools | 1.18 and 1.19.2 | 1.20 and 1.21 |

### Parameters

| Old parameter | New parameter |
| ------------- | -------------- |
| | --busco_output |

> **NB:** Parameter has been **updated** if both old and new parameter information is present. </br> **NB:** Parameter has been **added** if just the new parameter information is present. </br> **NB:** Parameter has been **removed** if new parameter information isn't present.

## [[0.6.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.6.0)] – Bellsprout – [2024-09-13]

The pipeline has now been validated for draft (unpublished) assemblies.
Expand Down
26 changes: 21 additions & 5 deletions bin/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def parse_args(args=None):
parser.add_argument("--blastx", help="Path to the blastx database", required=True)
parser.add_argument("--blastn", help="Path to the blastn database", required=True)
parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True)
parser.add_argument("--busco_output", action="append", help="Path to BUSCO output directory", required=False)
parser.add_argument("--version", action="version", version="%(prog)s 2.0")
return parser.parse_args(args)

Expand Down Expand Up @@ -121,20 +122,33 @@ def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
return {r: ancestors[r] for r in RANKS if r in ancestors}


def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typing.Optional[str]) -> typing.List[str]:
def get_odb(
taxon_info: TaxonInfo,
lineage_tax_ids: str,
requested_buscos: typing.Optional[str],
pre_computed_buscos: typing.List[str],
) -> typing.List[str]:
# Read the mapping between the BUSCO lineages and their taxon_id
with open(lineage_tax_ids) as file_in:
lineage_tax_ids_dict: typing.Dict[int, str] = {}
for line in file_in:
arr = line.split()
lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10"

if requested_buscos:
valid_odbs = set(lineage_tax_ids_dict.values())

if pre_computed_buscos:
# Use pre-computed BUSCO lineages if available
odb_arr = pre_computed_buscos
for odb in odb_arr:
if odb not in valid_odbs:
print(f"Invalid pre-computed BUSCO lineage: {odb}", file=sys.stderr)
sys.exit(1)
elif requested_buscos:
odb_arr = requested_buscos.split(",")
valid_odbs = set(lineage_tax_ids_dict.values())
for odb in odb_arr:
if odb not in valid_odbs:
print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr)
print(f"Invalid requested BUSCO lineage: {odb}", file=sys.stderr)
sys.exit(1)
else:
# Do the intersection to find the ancestors that have a BUSCO lineage
Expand Down Expand Up @@ -327,7 +341,9 @@ def main(args=None):

taxon_info = fetch_taxon_info(args.taxon_query)
classification = get_classification(taxon_info)
odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco)

precomputed_busco = [os.path.basename(path).replace("run_", "") for path in (args.busco_output or [])]
odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco, precomputed_busco)
taxon_id = adjust_taxon_id(args.nt, taxon_info)

if sequence_report:
Expand Down
47 changes: 47 additions & 0 deletions conf/test_nobusco.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run sanger-tol/blobtoolkit -profile test_nobusco,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal aligned test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input test data
// Specify the paths to your test data
// Give any required params for the test so that command line flags are not needed
input = "${projectDir}/assets/test/samplesheet_s3.csv"

// Fasta references
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.phiXspike.fasta.gz"
accession = "GCA_922984935.2"
taxon = "Meles meles"

// Databases
taxdump = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"
busco = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/blobtoolkit.GCA_922984935.2.2023-08-03.tar.gz"
blastp = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/mMelMel3.1.buscogenes.dmnd.tar.gz"
blastx = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/mMelMel3.1.buscoregions.dmnd.tar.gz"
blastn = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/nt_mMelMel3.1.tar.gz"

// Precomputed BUSCO outputs
// busco_output_noArchaea.tar.gz deliberately leaves out archaea_odb10 to test the pipeline's detection and filling of missing lineages
// Switch to *_busco_output.tar.gz for fully precomputed BUSCOs
busco_output = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/GCA_922984935.2_busco_output_noArchaea.tar.gz"
//busco_output = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/resources/GCA_922984935.2_busco_output.tar.gz"

// Need to be set to avoid overfilling /tmp
use_work_dir_as_temp = true
}
22 changes: 21 additions & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,26 @@ An [example samplesheet](assets/test/samplesheet.csv) has been provided with the
The pipeline can also accept a samplesheet generated by the [nf-core/fetchngs](https://nf-co.re/fetchngs) pipeline (tested with version 1.11.0).
The pipeline then needs the `--fetchngs_samplesheet true` option _and_ `--align true`, since the data files would all be unaligned.

### Support for pre-computed `BUSCO` outputs

The pipeline may be optionally run with a set of pre-computed [`BUSCO`](https://busco.ezlab.org) runs, provided using the `--busco_output` parameter. These can be provided as either a directory path, or a `.tar.gz` compressed archive. The contents should be each `run_` output directory (directly from `BUSCO`) named as `run_[odb_database_name]`:

```
GCA_922984935.2_busco_output/
├── run_archaea_odb10
├── run_bacteria_odb10
├── run_carnivora_odb10
├── run_eukaryota_odb10
├── run_eutheria_odb10
├── run_laurasiatheria_odb10
├── run_mammalia_odb10
├── run_metazoa_odb10
├── run_tetrapoda_odb10
└── run_vertebrata_odb10
```

The pipeline minimally requires outputs for the 'basal' lineages (archaea, eukaryota, and bacteria) -- any of these which are not present in the pre-computed outputs will be automatically detected and run.

## Database parameters

Configure access to your local databases with the `--busco`, `--blastp`, `--blastx`, `--blastn`, and `--taxdump` parameters.
Expand Down Expand Up @@ -272,7 +292,7 @@ List of tools for any given dataset can be fetched from the API, for example htt

| Dependency | Snakemake | Nextflow |
| ----------------- | --------- | -------- |
| blobtoolkit | 4.3.2 | 4.4.0 |
| blobtoolkit | 4.3.2 | 4.4.4 |
| blast | 2.12.0 | 2.14.1 |
| blobtk | 0.5.0 | 0.5.1 |
| busco | 5.3.2 | 5.5.0 |
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/chunk.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CHUNK {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta) , path(fasta)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/countbuscos.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(table, stageAs: 'dir??/*')
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/createblobdir.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CREATEBLOBDIR {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(window, stageAs: 'windowstats/*')
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/extractbuscos.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(fasta)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/summary.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(blobdir)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/unchunk.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UNCHUNK {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(blast_table)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/updateblobdir.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(input, stageAs: "input_blobdir")
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/updatemeta.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(input)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/blobtoolkit/windowstats.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ process BLOBTOOLKIT_WINDOWSTATS {
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "BLOBTOOLKIT_WINDOWSTATS module does not support Conda. Please use Docker / Singularity / Podman instead."
}
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), path(tsv)
Expand Down
5 changes: 4 additions & 1 deletion modules/local/generate_config.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ process GENERATE_CONFIG {
label 'process_single'

conda "conda-forge::requests=2.28.1 conda-forge::pyyaml=6.0"
container "docker.io/genomehubs/blobtoolkit:4.4.0"
container "docker.io/genomehubs/blobtoolkit:4.4.4"

input:
tuple val(meta), val(fasta)
Expand All @@ -15,6 +15,7 @@ process GENERATE_CONFIG {
tuple val(meta3), path(blastx, stageAs: 'blastx/*')
tuple val(meta4), path(blastn, stageAs: 'blastn/*')
tuple val(meta5), path(taxdump)
val (busco_outputs)

output:
tuple val(meta), path("*.yaml") , emit: yaml
Expand All @@ -32,6 +33,7 @@ process GENERATE_CONFIG {
def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
def accession_params = params.accession ? "--accession ${params.accession}" : ""
def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_layout ${it[0].layout} --read_path ${it[1]}"}.join(' ')
def busco_output_param = busco_outputs.collect { meta, path -> "--busco_output ${path}" }.join(' ')
"""
generate_config.py \\
--fasta $fasta \\
Expand All @@ -45,6 +47,7 @@ process GENERATE_CONFIG {
--blastx ${blastx} \\
--blastn ${blastn} \\
--taxdump ${taxdump} \\
$busco_output_param \\
--output_prefix ${prefix}

cat <<-END_VERSIONS > versions.yml
Expand Down
25 changes: 12 additions & 13 deletions modules/local/restructurebuscodir.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ process RESTRUCTUREBUSCODIR {
'nf-core/ubuntu:20.04' }"

input:
tuple val(meta), val(lineage), path(batch_summary), path(short_summaries_txt), path(short_summaries_json), path(busco_dir)
tuple val(meta), val(lineage), path(batch_summary), path(short_summary_txt), path(short_summary_json), path(full_table), path(missing_busco_list), path(single_copy_busco_sequences), path(multi_copy_busco_sequences), path(fragmented_busco_sequences), path(hmmer_output)

output:
tuple val(meta), path("${lineage}"), emit: clean_busco_dir
Expand All @@ -21,24 +21,23 @@ process RESTRUCTUREBUSCODIR {
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${meta.id}"
"""
mkdir ${lineage}
mkdir -p ${lineage}

cp --dereference ${batch_summary} ${lineage}/short_summary.tsv
[ -n "${short_summaries_txt}" ] && cp --dereference ${short_summaries_txt} ${lineage}/short_summary.txt
[ -n "${short_summaries_json}" ] && cp --dereference ${short_summaries_json} ${lineage}/short_summary.json
cp --dereference ${batch_summary} ${lineage}/short_summary.tsv
[ -n "${short_summary_txt}" ] && cp --dereference ${short_summary_txt} ${lineage}/short_summary.txt
[ -n "${short_summary_json}" ] && cp --dereference ${short_summary_json} ${lineage}/short_summary.json

# Should we compress these ?
[ -e ${busco_dir}/*/run_*/full_table.tsv ] && cp ${busco_dir}/*/run_*/full_table.tsv ${lineage}/
[ -e ${busco_dir}/*/run_*/missing_busco_list.tsv ] && cp ${busco_dir}/*/run_*/missing_busco_list.tsv ${lineage}/
[ -e ${full_table} ] && cp ${full_table} ${lineage}/
[ -e ${missing_busco_list} ] && cp ${missing_busco_list} ${lineage}/

tar czf ${lineage}/single_copy_busco_sequences.tar.gz -C ${busco_dir}/*/run_*/busco_sequences single_copy_busco_sequences
tar czf ${lineage}/multi_copy_busco_sequences.tar.gz -C ${busco_dir}/*/run_*/busco_sequences multi_copy_busco_sequences
tar czf ${lineage}/fragmented_busco_sequences.tar.gz -C ${busco_dir}/*/run_*/busco_sequences fragmented_busco_sequences
tar czf ${lineage}/hmmer_output.tar.gz --exclude=.checkpoint -C ${busco_dir}/*/run_* hmmer_output
tar czf ${lineage}/single_copy_busco_sequences.tar.gz -C \$(dirname ${single_copy_busco_sequences}) \$(basename ${single_copy_busco_sequences})
tar czf ${lineage}/multi_copy_busco_sequences.tar.gz -C \$(dirname ${multi_copy_busco_sequences}) \$(basename ${multi_copy_busco_sequences})
tar czf ${lineage}/fragmented_busco_sequences.tar.gz -C \$(dirname ${fragmented_busco_sequences}) \$(basename ${fragmented_busco_sequences})
tar czf ${lineage}/hmmer_output.tar.gz --exclude=.checkpoint -C \$(dirname ${hmmer_output}) \$(basename ${hmmer_output})

cat <<-END_VERSIONS > versions.yml
"${task.process}":
tar: \$(tar --version| awk 'NR==1 {print \$3}' )
tar: \$(tar --version | awk 'NR==1 {print \$3}')
END_VERSIONS
"""
}
2 changes: 2 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ params {
mask = false
fetchngs_samplesheet = false
busco_lineages = null
busco_output = null

// Reference options
fasta = null
Expand Down Expand Up @@ -198,6 +199,7 @@ profiles {
test { includeConfig 'conf/test.config' }
test_raw { includeConfig 'conf/test_raw.config' }
test_full { includeConfig 'conf/test_full.config' }
test_nobusco { includeConfig 'conf/test_nobusco.config' }
}

// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
Expand Down
7 changes: 6 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -381,5 +381,10 @@
{
"$ref": "#/definitions/generic_options"
}
]
],
"properties": {
"busco_output": {
"type": "string"
}
}
}
Loading