diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index d7a0a69f..e3f6b58c 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -71,7 +71,7 @@ If you wish to contribute a new step, please use the following coding standards: 5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. ### Default values diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 00000000..34085279 --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" 
+ echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." + else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? $number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) + total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." + fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 00000000..bf44d961 --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,109 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: 
"3.13" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! 
-f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠️ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 583d8add..00000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,84 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - workflow_dispatch: - -env: - NXF_ANSI_LOG: false - NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity - NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})" - # Only run on 
push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/curationpretext') }}" - runs-on: [ubuntu-latest] - strategy: - matrix: - NXF_VER: - - "24.04.2" - - "latest-everything" - profile: - - "docker" - - "singularity" - test_name: - - "test" - isMaster: - - ${{ github.base_ref == 'master' }} - # Exclude conda and singularity on dev - exclude: - - isMaster: false - profile: "singularity" - - steps: - - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - with: - fetch-depth: 0 - - - name: Set up Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - uses: actions/download-artifact@v4 - with: - pattern: nextflow-* - merge-multiple: true - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - - name: Download test data - # Download A fungal test data set that is full enough to show some real output. 
- run: | - curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf - - - - name: Install nf-test - uses: nf-core/setup-nf-test@v1 - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" - continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} - run: | - nf-test test tests/main.nf.test --profile "${{ matrix.test_name }},${{ matrix.profile }}" --verbose --debug diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 0b6b1f27..ac030fd5 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." 
diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 551fbd2d..ea526d2d 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -12,14 +12,6 @@ on: required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - master - pull_request_target: branches: - main - master @@ -52,9 +44,9 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" architecture: "x64" - name: Setup Apptainer @@ -120,6 +112,7 @@ jobs: # echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT" # - name: Compare container image counts + # id: count_comparison # run: | # if [ "${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then # initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }} @@ -132,3 +125,10 @@ jobs: # else # echo "The pipeline can be downloaded successfully!" 
# fi + + # - name: Upload Nextflow logfile for debugging purposes + # uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + # with: + # name: nextflow_logfile.txt + # path: .nextflow.log* + # include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 96% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index 94c929ba..1c97b461 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" - name: Install pre-commit run: pip install pre-commit diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fd241fa..82c4f2f1 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,9 +3,6 @@ name: nf-core linting # It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. 
on: - push: - branches: - - dev pull_request: release: types: [published] @@ -16,10 +13,10 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - - name: Set up Python 3.12 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - name: Set up Python 3.13 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" - name: Install pre-commit run: pip install pre-commit @@ -36,13 +33,13 @@ jobs: - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" architecture: "x64" - name: read .nf-core.yml - uses: pietrobolcato/action-read-yaml@1.1.0 + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml @@ -74,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 95b6b6af..d43797d9 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: 
marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@52423e01640425a022ef5fd42c6fb5f633a02728 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 00000000..e113a611 --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,140 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.2" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use GitHub runners + - "ubuntu-latest" + outputs: + shard: ${{ steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 7 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] 
+ if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use GitHub runners + - "ubuntu-latest" + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # Exclude conda and singularity on dev + exclude: + # - isMain: false + # profile: "conda" + - isMain: false + profile: "singularity" + NXF_VER: + - "24.10.5" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. Please check if the error is expected or if it needs fixing." 
+ fi + if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then + exit 1 + fi + fi + + confirm-pass: + needs: [nf-test] + if: always() + runs-on: # use GitHub runners + - "ubuntu-latest" + steps: + - name: One or more tests failed (excluding latest-everything) + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: One or more tests cancelled + if: ${{ contains(needs.*.result, 'cancelled') }} + run: exit 1 + + - name: All tests ok + if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - name: debug-print + if: always() + run: | + echo "::group::DEBUG: `needs` Contents" + echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}" + echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" + echo "::endgroup::" diff --git a/.github/workflows/template_version_comment.yml b/.github/workflows/template-version-comment.yml similarity index 95% rename from .github/workflows/template_version_comment.yml rename to .github/workflows/template-version-comment.yml index 537529bc..beb5c77f 100644 --- a/.github/workflows/template_version_comment.yml +++ b/.github/workflows/template-version-comment.yml @@ -14,7 +14,7 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} - name: Read template version from .nf-core.yml - uses: nichmor/minimal-read-yaml@v0.0.2 + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml diff --git a/.nf-core.yml b/.nf-core.yml index e3d2362b..b9b9e6fe 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -30,7 +30,7 @@ lint: nextflow_config: - manifest.name - manifest.homePage -nf_core_version: 3.2.1 +nf_core_version: 3.3.2 repository_type: pipeline template: author: Damon-Lee B Pointon (@DLBPointon) @@ -48,4 +48,4 @@ template: - seqera_platform - multiqc - rocrate - version: 1.4.2 + version: 1.5.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dec8650..bb41beec 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -4,10 +4,24 @@ repos: hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 - - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "3.1.2" + - prettier@3.6.2 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 hooks: - - id: editorconfig-checker - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a76..07dbd8bb 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/CHANGELOG.md b/CHANGELOG.md index c0ef4bc2..77dbc52b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,55 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.5.0](https://github.com/sanger-tol/curationpretext/releases/tag/1.5.0)] - UNSC Punic - [2025-08-04] + +### Added and Fixed + +- Template update to 3.3.2. +- Addition of the `--split_telomere` boolean flag, this is false by default. + - When `true` the pipeline will split the telomere file into a 5 and 3 prime file. +- Update `ACCESSORY_FILES` subworkflow: + - Remove `GET_LARGEST_SCAFFOLD` as we no longer need it, this was needed for TABIX so that the correct index file was used. This was used by the `TELO_FINDER` and `GAP_FINDER` subworkflows. +- Update `TELO_FINDER` subworkflow: + - Remove `GAWK_MAP_TELO` as it is no longer needed. + - Remove `GAWK_CLEAN_TELOMERE` as it is no longer needed. The reason for its inclusion has been fixed. 
+ - Update `EXTRACT_TELO` to `EXTRACT_TELOMERE` which also removed the use of the `cat {file} | awk` pattern, replacing it with just `awk`. This was supposed to happen in `1.4.0`, but was forgotten with the files lying dormant in the repo. + - Refactor of the `TELO_FINDER` subworkflow, introducing the `TELO_EXTRACTION` subworkflow which is run per telo file. With the introduction of `split_telomere` this can be 3 files. +- Update `LONGREAD_COVERAGE` subworkflow: + - Remove `GRAPH_OVERALL_COVERAGE` as it is not in use. +- Better formatting in some files. +- Moved `GAWK_UPPER_SEQUENCE` from the `TELO_FINDER` subworkflow to the first step of the main `curationpretext` workflow, this simply makes more sense. +- Removed no longer needed scripts from bin. +- Added the module `GAWK_SPLIT_DIRECTIONS` module, a local copy of the nf-core `GAWK` module. +- Added the `gawk_split_directions.awk` script for split telomere. +- Addition of GUNZIP for the input reference genome. +- Update tests. +- Added an "AUTO" value to the `--aligner` arg. If a genome is >5Gb it will use minimap2 else bwamem2. +- Parity update for the base.config to match TreeVal. +- Minor Doc updates. +- Comment out the CONDA workflow requirement, pipeline does not support conda. + +### Paramters + +| Old Version | New Versions | +| ----------- | ---------------- | +| NA | --split_telomere | + +### Software Dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. 
+ +| Module | Old Version | New Versions | +| ------------------------ | ------------- | ------------- | +| `GRAPH_OVERALL_COVERAGE` | perl=5.26.2 | REMOVED | +| `EXTRACT_TELO` | coreutils=9.1 | REMOVED | +| `EXTRACT_TELOMERE` | NA | coreutils=9.1 | +| `GAWK_CLEAN_TELOMERE` | 5.3.0 | REMOVED | +| `GAWK_MAP_TELO` | 5.3.0 | REMOVED | +| `GET_LARGEST_SCAFF` | coreutils=9.1 | REMOVED | +| `GUNZIP` | NA | 1.13 | +| `GAWK_SPLIT_DIRECTIONS` | NA | 5.3.0 | + ## [[1.4.2](https://github.com/sanger-tol/curationpretext/releases/tag/1.4.2)] - UNSC Nereid (H2) - [2025-07-28] ### Added and Fixed @@ -81,11 +130,9 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i ### Added and Fixed - GRIT found a bug in `pretext_graph` ingestion code where null values were being introduced as the track name - - This has now need hardcoded, there was no need for dynamic naming anyway - GRIT found a bug in `pretext_graph` ingestion where gap and telomere tracks stopped being ingested correctly and would no longer display or be zeroed out. - - I'm not entirely sure of the cause of this but i think it is a mix of how pretext handles unnamed tracks, assuming their datatype so a null named gap track would be treated as a repeat track, and incorrect logic in the pretext_graph module. - Added GAWK module (as GAWK_CLEAN_TELOMERE) to remove "you screwed up" (this is a legacy error message which will be changed to something more informative and professional) error lines which can appear with some telo motifs or lower case motifs. These will otherwise cause the FIND_TELOMERE_WINDOWS process to crash. 
diff --git a/CITATION.cff b/CITATION.cff index 9d72b971..0abe02de 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -30,6 +30,6 @@ identifiers: value: 10.5281/zenodo.12773958 repository-code: "https://github.com/sanger-tol/curationpretext" license: MIT -version: 1.4.2 -date-released: "2025-07-28" +version: 1.5.0 +date-released: "2025-08-04" url: "https://pipelines.tol.sanger.ac.uk/curationpretext" diff --git a/README.md b/README.md index cb30b3cb..52b99db1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ # ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only) -[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/curationpretext/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/curationpretext/actions?query=workflow%3A%22nf-core+linting%22)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958) +[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml) +[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958) +[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) 
-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -29,35 +31,31 @@ This is intended as a supplementary pipeline for the [treeval](https://github.co Currently, the pipeline uses the following flags: - `--input` - - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa` -- `--reads` +- `--sample` + - Sample is the naming prefix of the output files, e.g. iyTipFemo +- `--reads` - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/` + - This folder _must_ contain files in a `.fasta.gz` format, or they will be skipped by the internal file search function. - `--read_type` - - The type of longread data you are utilising, e.g., ont, illumina, hifi. - `--aligner` - - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported. 
- `--cram` - - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/` - `--map_order` - - hic map scaffold order, input either `length` or `unsorted` - `--teloseq` - - A telomeric sequence, e.g., `TTAGGG` - `--all_output` - - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured. Now, you can run the pipeline using: diff --git a/assets/schema_input.json b/assets/schema_input.json index 20cfee4e..4c47266a 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -17,14 +17,14 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, "fastq_2": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" } }, diff --git a/bin/findHalfcoverage.py b/bin/findHalfcoverage.py deleted file mode 100755 index f83fdcc5..00000000 --- a/bin/findHalfcoverage.py +++ /dev/null @@ -1,177 +0,0 @@ -#! 
/usr/bin/env python3 - -import re -import sys -from optparse import OptionParser - - -def load_scafsize(file): - # example is my.genome file, "scaffold\tsize" - - scafkey = {} - scaffile = open(file, "r") - for line in scaffile: - line = line.replace("\n", "") - name, size = re.split("\t", line) - scafkey[name] = size - - scaffile.close() - return scafkey - - -def getTotallength_undercov(file, cov, wiggleroom): - # example is bed file of coverage, - # scaffold_100_arrow 0 2 18 - - coverage_cutoff = cov + wiggleroom - - myfile = open(file, "r") - - lowcoverage_sum = 0 - prev_scaf = "" - scaf_lc = {} - - for line in myfile: - line = line.replace("\n", "") - objContents = re.split("\t", line) - - if prev_scaf != objContents[0]: - scaf_lc[prev_scaf] = lowcoverage_sum - lowcoverage_sum = 0 - - if float(objContents[3]) < coverage_cutoff: - length = float(objContents[2]) - float(objContents[1]) - lowcoverage_sum += length - - prev_scaf = objContents[0] - - scaf_lc[prev_scaf] = lowcoverage_sum - myfile.close() - - return scaf_lc - - -def get_cov_peaks(file): - # example is depthgraph.txt, "coverage\tbasepair count" - - myPeakFile = open(file, "r") - - rows = [] - for line in myPeakFile: - line = line.replace("\n", "") - items = re.split("\t", line) - rows.append(items) - - myPeakFile.close() - # print(rows[0]) - peakCov = sorted(rows, key=lambda cov: int(cov[1]), reverse=1)[0][0] - - if int(peakCov) == 0: - peakCov = sorted(rows, key=lambda cov: int(cov[1]), reverse=1)[1][0] - - halfPeak = int(peakCov) / 2 - qrtPeak = int(peakCov) / 4 - - print("#Coverage Peak is %s, HalfPeak is %s, QuarterPeak is %s " % (peakCov, halfPeak, qrtPeak)) - - return (peakCov, halfPeak, qrtPeak) - - -def calc_coverage(scafsize, totallowcov): - # calculate the % for lowcov coverage over entire scaffold. 
- return totallowcov / scafsize * 100 - - -def getArguments(): - # get indivudual arguments from user - - parser = OptionParser(version="%prog 1.0") - parser.add_option( - "-c", "--coveragefile", action="store", type="string", dest="covfile", help="Scaffold Coverage filename" - ) - parser.add_option( - "-m", "--mygenome", action="store", type="string", dest="mygenome", help="mygenome file, scaffold - size file" - ) - parser.add_option( - "-d", - "--depthgraph", - action="store", - type="string", - dest="depth", - help="depthgraph file, bp count at each depth", - ) - parser.add_option( - "-w", - "--wiggle", - action="store", - type="float", - dest="wig", - default=5, - help="wiggle room to add to depth cutoff ie 30X + wiggleroom. Default is 5X", - ) - parser.add_option( - "--cut", - action="store", - type="float", - dest="covcut", - default=60, - help="%Number for coverage cutoff to include in results. ie 50% of scaffold needs to be under diploid peak etc. Default is 60%", - ) - parser.add_option( - "-t", - "--totalsize", - action="store", - type="int", - dest="totsize", - default=250000, - help="total size that determines max coverage boundary.", - ) - - (options, args) = parser.parse_args() - - if options.covfile == None or options.mygenome == None or options.depth == None: - print("Missing Options") - exit() - - return options - - -def main(): - # main program - - options = getArguments() - - scaffold_sizes = load_scafsize(options.mygenome) - (hapCov, dipCov, tetCov) = get_cov_peaks(options.depth) - scaffold_lowcovsum = getTotallength_undercov(options.covfile, dipCov, options.wig) - - for scaffoldName in scaffold_lowcovsum: - if scaffoldName == "": - continue - - # print("==" + scaffoldName) - totalSize = float(scaffold_sizes[scaffoldName]) - lowcovSize = float(scaffold_lowcovsum[scaffoldName]) - - coverage = calc_coverage(totalSize, lowcovSize) - - if coverage > options.covcut: - if totalSize > options.totsize: - print( - "**\t" - + "\t".join( - [str(i) for i in 
[scaffoldName, int(totalSize), int(lowcovSize), "{:.1f}".format(coverage)]] - ) - ) - else: - print( - "==\t" - + "\t".join( - [str(i) for i in [scaffoldName, int(totalSize), int(lowcovSize), "{:.1f}".format(coverage)]] - ) - ) - - -# -- script execuation -- # -if __name__ == "__main__": - main() diff --git a/bin/gawk_split_directions.awk b/bin/gawk_split_directions.awk new file mode 100644 index 00000000..df82aa10 --- /dev/null +++ b/bin/gawk_split_directions.awk @@ -0,0 +1,8 @@ +## Split telomere file based on column 4 contents +## Date: 03/07/2025 + +BEGIN { + FS="\t"; OFS="\t" +} { + print > "direction."$3".telomere" +} diff --git a/bin/get_avgcov.sh b/bin/get_avgcov.sh deleted file mode 100755 index 2eac5ca5..00000000 --- a/bin/get_avgcov.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# get_avgcov.sh -# ------------------- -# A shell script to calculate average coverage for each scaffold -# into bed format for use -# ------------------- -# Author = yy5 -# Modified = dp24 -# ------------------- -version='1.0.0' -if [ $1 == '-v' ]; -then - echo "$version" -else - awk '{OFS="\t"; $5=$4*($3-$2); print}' $1|awk '{OFS="\t"; sum[$1]+=$5} END {for (chrom in sum) print chrom, sum[chrom]}'|awk 'BEGIN {FS="\t"; OFS="\t"} NR==FNR {genome[$1]=$2; next} {if ($1 in genome) print $1, genome[$1], $2, $3; else print $1, "NA", $2, $3}' - $2| awk '{OFS="\t"; print $1,"0",$3,($2/$3)}' | awk 'BEGIN {FS="\t"; OFS="\t"} {printf "%s\t%s\t%s\t%.0f\n", $1, $2, $3, int($4 + 0.5)}'|sort -T $4 -k1,1 -k2,2n> $3 -fi diff --git a/bin/graph_overall_coverage.pl b/bin/graph_overall_coverage.pl deleted file mode 100755 index 174e61b7..00000000 --- a/bin/graph_overall_coverage.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env perl - -# Script originally developed by Yumi Sims (yy5@sanger.ac.uk) - -use warnings; - -# my $file = shift; - -my ($file) = @ARGV; - -if (!@ARGV || ($ARGV[0] eq '--version')) { - print "1.0\n"; - exit 0; -} - -open (FILE, $file) || die "can't open file $file\n"; - -my 
%depthcount; -while (my $line = ) { - chomp $line; - my ($id, $start, $end, $depth) = split ("\t", $line); - my $length = $end - $start; - - if ($depthcount{$depth}){ - $depthcount{$depth} += $length; - } - else { - $depthcount{$depth} = $length; - } -} - -foreach my $depth (sort {$a<=>$b} keys %depthcount){ - print join("\t", $depth, $depthcount{$depth}) ."\n"; -} diff --git a/bin/longread_cov_log.py b/bin/longread_cov_log.py deleted file mode 100755 index d5cc177c..00000000 --- a/bin/longread_cov_log.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -import optparse -import math - -# Script originally developed by Will Eagles (we3@sanger.ac.uk) - - -def process_line(line): - line_values = line.rsplit(None, 1) - - try: - cov_val = float(line_values[1]) - except: - cov_val = 0 - - if cov_val > 0: - log_cov_val = math.log(cov_val) - else: - log_cov_val = 0 - - return line_values[0] + "\t" + str(round(log_cov_val, 2)) - - -def main(): - parser = optparse.OptionParser(version="%prog 1.0") - parser.add_option( - "-i", - "--inputfile", - dest="inputfile", - default="default.input", - ) - - options, remainder = parser.parse_args() - - cov_bed = open(options.inputfile, "r") - - for line in cov_bed: - print(process_line(line)) - - -if __name__ == "__main__": - main() diff --git a/conf/base.config b/conf/base.config index 9add5450..9ca32fe1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -13,29 +13,49 @@ process { memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 2 + errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 
'retry' : 'finish' } + maxRetries = 1 maxErrors = '-1' // IN CASES WHERE THERE IS ONE HIC FILE THIS WILL NEED ALMOST NOTHING withName:SAMTOOLS_MERGE { cpus = { 16 } memory = { 50.GB * task.attempt } + time = { 30.h * task.attempt } } withName: '.*:.*:LONGREAD_COVERAGE:(MINIMAP2_ALIGN|MINIMAP2_ALIGN_SPLIT)' { - cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 )) } + cpus = { 20 * 1 } + memory = { + 1.GB * ( + reference.size() < 2e9 ? 30 : + (reference.size() < 5e9 ? 40 : + (reference.size() < 10e9 ? 60 : + Math.ceil((reference.size() / 1e9) * 3) + ) + ) + ) * Math.ceil(task.attempt * 1) + } + time = { 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 48) } } withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { - cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 80 : Math.ceil( ( reference.size() / 1e+9 ) * 30 ) * Math.ceil( task.attempt * 1 ) ) } + cpus = { 16 * 1 } + memory = { 1.GB * ( reference.size() < 2e9 ? 80 : Math.ceil( ( reference.size() / 1e+9 ) * 30 ) * Math.ceil( task.attempt * 1 ) ) } } withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { - cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 3 ) * Math.ceil( task.attempt * 1 ) ) } + cpus = { 16 * 1 } + memory = { + 1.GB * ( + reference.size() < 2e9 ? 30 : + (reference.size() < 5e9 ? 40 : + (reference.size() < 10e9 ? 60 : + Math.ceil((reference.size() / 1e9) * 3) + ) + ) + ) * Math.ceil(task.attempt * 1) + } } withName: PRETEXT_GRAPH { @@ -43,9 +63,9 @@ process { } withName: PRETEXTMAP_STANDRD{ - cpus = { 8 * task.attempt } + cpus = { 8 * 1 } memory = { 3.GB * task.attempt } - time = { 1.h * ( ( fasta.size() < 4e9 ? 24 : 48 ) * Math.ceil( task.attempt * 1 ) ) } + time = { 1.h * ( ( fasta.size() < 4e9 ? 
24 : 48 ) * task.attempt ) } } withName: PRETEXTMAP_HIGHRES { @@ -86,6 +106,11 @@ process { memory = { 1.GB * task.attempt } } + withName: BEDTOOLS_INTERSECT { + memory = { 10.GB * task.attempt } + time = { 20.h * task.attempt } + } + // Process-specific resource requirements // NOTE - Please try and reuse the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. @@ -125,4 +150,8 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withLabel: process_gpu { + ext.use_gpu = { workflow.profile.contains('gpu') } + accelerator = { workflow.profile.contains('gpu') ? 1 : null } + } } diff --git a/conf/modules.config b/conf/modules.config index 65623cdb..f867ecc2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -17,9 +17,18 @@ process { // withName: 'PRETEXT_INGEST_SNDRD|PRETEXT_INGEST_HIRES' { publishDir = [ - path: { "${params.outdir}/pretext_maps_processed" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + [ + path: { "${params.outdir}/pretext_maps_processed" }, + pattern: "*normal.pretext", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ], + [ + path: { "${params.outdir}/pretext_maps_processed" }, + pattern: "*hr.pretext", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ], ] } @@ -85,10 +94,9 @@ process { ext.suffix = 'fasta' } - withName: 'GAWK_CLEAN_TELOMERE' { - ext.args2 = "'/^>/'" - ext.prefix = { "${meta.id}_CLEAN" } - ext.suffix = 'telomere' + withName: 'GAWK_SPLIT_DIRECTIONS' { + ext.prefix = { "${input}_telo" } + ext.suffix = 'telomere' } // diff --git a/conf/test.config b/conf/test.config index 80d23e85..f98582dd 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,13 +22,14 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - input = "${baseDir}/TreeValTinyData/assembly/draft/grTriPseu1.fa" - reads = "${baseDir}/TreeValTinyData/genomic_data/pacbio/" - cram = "${baseDir}/TreeValTinyData/genomic_data/hic-arima/" - sample = "CurationPretextTest" - teloseq = "TTAGGG" - aligner = "bwamem2" - all_output = false - skip_tracks = "NONE" - run_hires = false + input = "${baseDir}/TreeValTinyData/assembly/draft/grTriPseu1.fa" + reads = "${baseDir}/TreeValTinyData/genomic_data/pacbio/" + cram = "${baseDir}/TreeValTinyData/genomic_data/hic-arima/" + sample = "CurationPretextTest" + teloseq = "TTAGGG" + aligner = "bwamem2" + all_output = false + skip_tracks = "NONE" + run_hires = false + split_telomere = true } diff --git a/conf/test_full.config b/conf/test_full.config index e164c0aa..3166bfd1 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -20,13 +20,15 @@ params { // Input data for full size test // Limit resources so that this can run on GitHub Actions - sample = "testing" - input = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa" - reads = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/pacbio/" - cram = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/" - sample = "CurationPretextTest" - teloseq = "TTAGGG" - aligner = "bwamem2" - all_output 
= true - skip_tracks = "NONE" + sample = "testing" + input = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa" + reads = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/pacbio/" + cram = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/" + sample = "CurationPretextTest" + teloseq = "TTAGGG" + aligner = "bwamem2" + all_output = true + skip_tracks = "NONE" + split_telomere = true + } diff --git a/docs/usage.md b/docs/usage.md index 842b4cdd..c723c594 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,6 +10,8 @@ Currently, the pipeline expects input data to be in a specific format. The `--input` should be `.fasta` or `.fa` (the same format but differing suffix). +The `--sample` is your chosen naming for the output files. + The `--cram` should point to the folder containing `.cram` files along with a `.crai` per `.cram`. The `--reads` should point to the folder containing `.fasta.gz` files. diff --git a/modules.json b/modules.json index 30d74ced..61f01451 100644 --- a/modules.json +++ b/modules.json @@ -45,6 +45,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "gunzip": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "minimap2/align": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", diff --git a/modules/local/extract/telomere/main.nf b/modules/local/extract/telomere/main.nf index a0ce237d..41022f00 100644 --- a/modules/local/extract/telomere/main.nf +++ b/modules/local/extract/telomere/main.nf @@ -1,6 +1,6 @@ process EXTRACT_TELOMERE { tag "${meta.id}" - label 'process_low' + label 'process_single' conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/gawk_split_directions/environment.yml b/modules/local/gawk_split_directions/environment.yml new file mode 100644 index 00000000..f52109e8 --- /dev/null +++ b/modules/local/gawk_split_directions/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::gawk=5.3.0 diff --git a/modules/local/gawk_split_directions/main.nf b/modules/local/gawk_split_directions/main.nf new file mode 100644 index 00000000..29b4af8a --- /dev/null +++ b/modules/local/gawk_split_directions/main.nf @@ -0,0 +1,58 @@ +process GAWK_SPLIT_DIRECTIONS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.3.0' : + 'biocontainers/gawk:5.3.0' }" + + input: + tuple val(meta), path(input) + path(program_file) + + output: + tuple val(meta), path("direction.0.${suffix}"), emit: prime5 + tuple val(meta), path("direction.1.${suffix}"), emit: prime3 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.collect{ it.getExtension()}.get(0)}" // use the first extension of the input files + + program = program_file ? "-f ${program_file}" : "${args2}" + + input.collect{ + assert it.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!" 
+ } + + """ + awk \\ + ${args} \\ + ${program} \\ + ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.getExtension()}" + + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/gawk_split_directions/meta.yml b/modules/local/gawk_split_directions/meta.yml new file mode 100644 index 00000000..34c50b12 --- /dev/null +++ b/modules/local/gawk_split_directions/meta.yml @@ -0,0 +1,63 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. +keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: ["GPL v3"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed on + this file on the `ext.args2` or in the program file. + If the files have a `.gz` extension, they will be unzipped using `zcat`. + pattern: "*" + - - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't + wish to use a file, you can use `ext.args2` to specify the logic. 
+ pattern: "*" + - - disable_redirect_output: + type: boolean + description: Disable the redirection of awk output to a given file. This is + useful if you want to use awk's built-in redirect to write files instead + of the shell's redirect. +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.${suffix}": + type: file + description: The output file - if using shell redirection, specify the name of this + file using `ext.prefix` and the extension using `ext.suffix`. Otherwise, ensure + the awk program produces files with the extension in `ext.suffix`. + pattern: "*" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/local/get/largest_scaffold/main.nf b/modules/local/get/largest_scaffold/main.nf deleted file mode 100644 index a496a800..00000000 --- a/modules/local/get/largest_scaffold/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process GET_LARGEST_SCAFFOLD { - - tag "$meta.id" - label 'process_low' - - conda "conda-forge::coreutils=9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'docker.io/ubuntu:20.04' }" - - input: - tuple val( meta ), path( file ) - - output: - env largest_scaff, emit: scaff_size - path "versions.yml", emit: versions - - script: - def LARGEST_SCAFF_VERSION = "2.0" - def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- """ - largest_scaff=\$(head -n 1 "${file}" | cut -d\$'\t' -f2) - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - get_largest_scaffold: $LARGEST_SCAFF_VERSION - coreutils: $VERSION - END_VERSIONS - """ - - stub: - def LARGEST_SCAFF_VERSION = "2.0" - def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - """ - largest_scaff=1000000 - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - get_largest_scaff: $LARGEST_SCAFF_VERSION - coreutils: $VERSION - END_VERSIONS - """ -} diff --git a/modules/local/graph/overall_coverage/main.nf b/modules/local/graph/overall_coverage/main.nf deleted file mode 100644 index 87892813..00000000 --- a/modules/local/graph/overall_coverage/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process GRAPH_OVERALL_COVERAGE { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::perl=5.26.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : - 'biocontainers/perl:5.26.2' }" - - input: - tuple val(meta), path(bed) - - output: - tuple val(meta), path("*.part") , emit: part - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - graph_overall_coverage.pl $bed > ${prefix}.part - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - perl: \$(echo \$(perl --version 2>&1) | awk '/This/ {print \$9}')) - graph_overall_coverage.pl: \$(graph_overall_coverage.pl --version) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.part - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - perl: \$(echo \$(perl --version 2>&1) | awk '/This/ {print \$9}')) graph_overall_coverage.pl: \$(graph_overall_coverage.pl --version) - END_VERSIONS - """ -} diff --git a/modules/local/pretext/graph/main.nf b/modules/local/pretext/graph/main.nf index 4e9c92ad..df351d1f 100644 --- a/modules/local/pretext/graph/main.nf +++ b/modules/local/pretext/graph/main.nf @@ -5,11 +5,12 @@ process PRETEXT_GRAPH { container "quay.io/sanger-tol/pretext:0.0.9-yy5-c2" input: - tuple val(meta), path(pretext_file) + tuple val(meta), path(pretext_file) path(gap_file, stageAs: 'gap_file.bed') path(coverage, stageAs: 'coverage.bw') - path(telomere_file, stageAs: 'telomere.bed') + path(telomere_file, stageAs: 'telomere/*') path(repeat_density, stageAs: 'repeat_density.bw') + val(split_telo_bool) output: tuple val(meta), path("*.pretext") , emit: pretext @@ -29,8 +30,21 @@ process PRETEXT_GRAPH { def UCSC_VERSION = '447' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
// Using single [ ] as nextflow will use sh where possible not bash + // + // Core Args must match the below (taken from PretextView), this allows + // the use of keyboard shortcuts for main tracks: + // + // data_type_dic{ // use this data_type + // {"default", 0, }, + // {"repeat_density", 1}, + // {"gap", 2}, + // {"coverage", 3}, + // {"coverage_avg", 4}, + // {"telomere", 5}, + // {"not_weighted", 6} + // }; + // """ - echo "PROCESSING ESSENTIAL FILES" if [ -s "${coverage}" ]; then @@ -50,20 +64,77 @@ process PRETEXT_GRAPH { fi echo "NOW PROCESSING NON-ESSENTIAL files" - input_file="repeat.pretext.part" - if [ -s "${gap_file}" ]; then echo "Processing GAP file..." cat "${gap_file}" | PretextGraph ${args} -i repeat.pretext.part -n "gap" -o gap.pretext.part input_file="gap.pretext.part" fi - if [ -s "${telomere_file}" ]; then - echo "Processing TELO file..." - cat "${telomere_file}" | PretextGraph ${args} -i "\$input_file" -n "telomere" -o "${prefix}.pretext" + # Check if telomere directory has any files + if [ "\$(ls -A telomere 2>/dev/null)" ]; then + file_telox="" + file_5p="" + file_3p="" + file_og="" + + for file in telomere/*.bedgraph; do + [ -e "\$file" ] || continue # skip if no match + fname=\$(basename "\$file") + + case "\$fname" in + *telox*) + echo + file_telox="\$file" + ;; + *5P*) + file_5p="\$file" + ;; + *3P*) + file_3p="\$file" + ;; + *) + file_og="\$file" + ;; + esac + done + + if [ -s "\$file_og" ]; then + echo "Processing OG_TELOMERE file: \$file_og" + + # Must be named "telomere" + PretextGraph $args -i "\$input_file" -n "telomere" -o telo_0.pretext < "\$file_og" + else + echo "OG TELOMERE file - Could be empty or missing" + cp "\$input_file" telo_0.pretext + fi + + if [ -s "\$file_telox" ]; then + echo "Processing TELOX_TELOMERE file: \$file_telox" + PretextGraph $args -i telo_0.pretext -n "telox_telomere" -o telo_1.pretext < "\$file_telox" + else + echo "TELOX file - Could be empty or missing" + cp telo_0.pretext telo_1.pretext + fi + + 
if [ -s "\$file_5p" ]; then + echo "Processing 5-Prime TELOMERE file: \$file_5p" + PretextGraph $args -i telo_1.pretext -n "5p_telomere" -o telo_2.pretext < "\$file_5p" + else + echo "5-Prime TELOMERE file - Could be empty or missing" + cp telo_1.pretext telo_2.pretext + fi + + if [ -s "\$file_3p" ]; then + echo "Processing 3-Prime TELOMERE file: \$file_3p" + PretextGraph $args -i telo_2.pretext -n "3p_telomere" -o "${prefix}.pretext" < "\$file_3p" + else + echo "3-Prime TELOMERE file - Could be empty or missing" + cp telo_2.pretext "${prefix}.pretext" + fi + else - mv "\$input_file" "${prefix}.pretext" + cp "\$input_file" "${prefix}.pretext" fi cat <<-END_VERSIONS > versions.yml @@ -84,7 +155,6 @@ process PRETEXT_GRAPH { def UCSC_VERSION = '448' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ touch ${prefix}.pretext - cat <<-END_VERSIONS > versions.yml "${task.process}": PretextGraph: \$(PretextGraph | grep "Version" | sed 's/Pretext* Version //;') diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 00000000..9b926b1f --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::coreutils=9.5 + - conda-forge::grep=3.11 + - conda-forge::gzip=1.13 + - conda-forge::lbzip2=2.5 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 00000000..3ffc8e92 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,55 @@ +process GUNZIP { + tag "${archive}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/52/52ccce28d2ab928ab862e25aae26314d69c8e38bd41ca9431c67ef05221348aa/data' + : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8'}" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${gunzip}"), emit: gunzip + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def extension = (archive.toString() - '.gz').tokenize('.')[-1] + def name = archive.toString() - '.gz' - ".${extension}" + def prefix = task.ext.prefix ?: name + gunzip = prefix + ".${extension}" + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + ${args} \\ + ${archive} \\ + > ${gunzip} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def extension = (archive.toString() - '.gz').tokenize('.')[-1] + def name = archive.toString() - '.gz' - ".${extension}" + def prefix = task.ext.prefix ?: name + gunzip = prefix + ".${extension}" + """ + touch ${gunzip} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 00000000..926bb22a --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,52 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. 
+ documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" + ontologies: [] +output: + gunzip: + - - meta: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + ontologies: [] + - ${gunzip}: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" + - "@gallvp" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 00000000..776211ad --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,121 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should run without failures - prefix") { + + config './nextflow.config' + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + 
""" + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should run without failures - stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should run without failures - prefix - stub") { + + options '-stub' + config './nextflow.config' + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 00000000..a0f0e67e --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,134 @@ +{ + "Should run without failures - prefix - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:22.080222697" + }, + "Should run without failures - stub": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + 
"versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:14.593020264" + }, + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:01.295397925" + }, + "Should run without failures - prefix": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:07.414271387" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/nextflow.config b/modules/nf-core/gunzip/tests/nextflow.config new file mode 100644 index 00000000..dec77642 --- /dev/null +++ b/modules/nf-core/gunzip/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GUNZIP { + ext.prefix = { "${meta.id}.xyz" } + } +} diff --git a/nextflow.config b/nextflow.config index f76a56fc..b86dba7b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,12 +11,13 @@ params { // Input options input = null + split_telomere = false skip_tracks = "NONE" sample = "pretext_rerun" teloseq = "TTAGGG" reads 
= null cram = null - aligner = "bwamem2" + aligner = "AUTO" read_type = "hifi" map_order = "unsorted" all_output = false @@ -161,12 +162,20 @@ profiles { ] } } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' + } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } } -// Load nf-core custom profiles from different Institutions -includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" +// Load nf-core custom profiles from different institutions + +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +// Load sanger-tol/curationpretext custom profiles from different institutions. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/nfcore_custom.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled @@ -222,7 +231,6 @@ dag { manifest { name = 'sanger-tol/curationpretext' - author = """Damon-Lee B Pointon (@DLBPointon)""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ [ name: 'Damon-Lee B Pointon', @@ -258,14 +266,14 @@ manifest { description = """A simple pipeline to generate pretext files for genomic curation.""" mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '!>=24.04.2' - version = '1.4.2' + nextflowVersion = '!>=24.10.5' + version = '1.5.0' doi = '10.5281/zenodo.12773958' } // Nextflow plugins plugins { - id 'nf-schema@2.3.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-schema@2.4.2' // Validation of pipeline parameters and creation of an input channel from a sample sheet } validation { diff --git a/nextflow_schema.json b/nextflow_schema.json index 307a510b..06b76aca 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -20,6 +20,13 @@ "help_text": "You need the input fasta file", "fa_icon": "fas fa-file-fasta" }, + "split_telomere": { + "type": "boolean", + "format": "boolean", + "description": "Split the telomere file into 5' and 3' files for seperate ingestion into the HiC maps", + "default": false, + "fa_icon": "fas fa-check" + }, "skip_tracks": { "type": "string", "description": "Skip generation for specified tracks", @@ -62,9 +69,9 @@ "aligner": { "type": "string", "description": "Aligner for use {minimap2, bwamem2} in generating map", - "help_text": "Pick between {minimap2, bwamem2}. Defaults to 'minimap2'", + "help_text": "Pick between {minimap2, bwamem2, AUTO}. 
Defaults to 'AUTO'", "fa_icon": "fas fa-file-signature", - "enum": ["bwamem2", "minimap2"] + "enum": ["bwamem2", "minimap2", "AUTO"] }, "run_hires": { "type": "boolean", diff --git a/nf-test.config b/nf-test.config index 78917975..3a1fff59 100644 --- a/nf-test.config +++ b/nf-test.config @@ -1,16 +1,24 @@ config { - testsDir "tests" - workDir ".nf-test" - libDir "tests/lib" - withTrace true - autoSort false - // Running as stub - stops any generation of of pretext files which are what we want to see - options "-dump-channels" + // location for all nf-test tests + testsDir "." + // nf-test directory including temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // location of an optional nextflow.config file specific for executing tests + configFile "tests/nextflow.config" + + // ignore tests coming from the nf-core/modules repo + ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' + + // run all test with defined profile(s) from the main nextflow.config + profile "test" + + // list of filenames or patterns that should trigger a full test run + triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + + // load the necessary plugins plugins { load "nft-utils@0.0.3" } - - configFile "conf/test.config" - profile "test" } diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 50ea4f24..c8de9e84 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "Stable", "datePublished": "2025-05-27T09:34:43+00:00", - "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI 
Status](https://github.com/sanger-tol/curationpretext/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/curationpretext/actions?query=workflow%3A%22nf-core+CI%22)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/curationpretext/actions?query=workflow%3A%22nf-core+linting%22)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. 
Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://pipelines.tol.sanger.ac.uk/curationpretext/1.4.0/usage) on how to set-up Nextflow. Make sure to [test your setup](https://pipelines.tol.sanger.ac.uk/curationpretext/1.4.0/usage) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--reads`\n\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n\n- `--read_type`\n\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--sample`\n - Sample is the naming prefix of the output files, e.g. 
iyTipFemo\n\n- `--reads`\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n - This folder _must_ contain files in a `.fasta.gz` format, or they will be skipped by the internal file search function.\n\n- `--read_type`\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" diff --git a/subworkflows/local/accessory_files/main.nf b/subworkflows/local/accessory_files/main.nf index 07121a29..1f04bb04 100644 --- a/subworkflows/local/accessory_files/main.nf +++ b/subworkflows/local/accessory_files/main.nf @@ -9,7 +9,6 @@ include { REPEAT_DENSITY } from '../repeat_density/main' include { LONGREAD_COVERAGE } from '../longread_coverage/main' include { GAWK as GAWK_GENERATE_GENOME_FILE } from '../../../modules/nf-core/gawk/main' -include { GET_LARGEST_SCAFFOLD } from '../../../modules/local/get/largest_scaffold/main' workflow ACCESSORY_FILES { take: @@ -42,17 +41,6 @@ workflow ACCESSORY_FILES { ch_versions = ch_versions.mix( GAWK_GENERATE_GENOME_FILE.out.versions ) - // - // MODULE: Cut out the largest scaffold size and use as comparator against 512MB - // This is the cut off for TABIX using tbi indexes - // TODO: Investigate this as a pure groovy function. 
- // - GET_LARGEST_SCAFFOLD ( - GAWK_GENERATE_GENOME_FILE.out.output - ) - ch_versions = ch_versions.mix( GET_LARGEST_SCAFFOLD.out.versions ) - - // // SUBWORKFLOW: GENERATES A GAP.BED FILE TO ID THE LOCATIONS OF GAPS // @@ -60,8 +48,7 @@ workflow ACCESSORY_FILES { gap_file = ch_empty_file } else { GAP_FINDER ( - reference_tuple, - GET_LARGEST_SCAFFOLD.out.scaff_size.map{it -> it[1].toInteger()} + reference_tuple ) ch_versions = ch_versions.mix(GAP_FINDER.out.versions) gap_file = GAP_FINDER.out.gap_file.map{ it -> it[1] } @@ -75,12 +62,11 @@ workflow ACCESSORY_FILES { telo_file = ch_empty_file } else { TELO_FINDER ( - GET_LARGEST_SCAFFOLD.out.scaff_size.map{it -> it[1].toInteger()}, reference_tuple, val_teloseq ) ch_versions = ch_versions.mix(TELO_FINDER.out.versions) - telo_file = TELO_FINDER.out.bedgraph_file.map{ it -> it[1] } + telo_file = TELO_FINDER.out.bedgraph_file } @@ -118,7 +104,7 @@ workflow ACCESSORY_FILES { emit: gap_file repeat_file - telo_file + telo_file // This is the possible collection of telomere files longread_output versions = ch_versions } diff --git a/subworkflows/local/gap_finder/main.nf b/subworkflows/local/gap_finder/main.nf index 406f7173..10ca907c 100644 --- a/subworkflows/local/gap_finder/main.nf +++ b/subworkflows/local/gap_finder/main.nf @@ -9,7 +9,6 @@ include { GAWK as GAWK_GAP_LENGTH } from '../../../modules/nf-core/gawk/main' workflow GAP_FINDER { take: reference_tuple // Channel [ val(meta), path(fasta) ] - max_scaff_size // val(size of largest scaffold in bp) main: ch_versions = Channel.empty() diff --git a/subworkflows/local/longread_coverage/main.nf b/subworkflows/local/longread_coverage/main.nf index e2e988e3..9fd1f927 100644 --- a/subworkflows/local/longread_coverage/main.nf +++ b/subworkflows/local/longread_coverage/main.nf @@ -11,7 +11,6 @@ include { SAMTOOLS_MERGE } from '../../../modules include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER_PRIMARY 
} from '../../../modules/nf-core/samtools/view/main' include { UCSC_BEDGRAPHTOBIGWIG } from '../../../modules/nf-core/ucsc/bedgraphtobigwig/main' -include { GRAPH_OVERALL_COVERAGE } from '../../../modules/local/graph/overall_coverage/main' workflow LONGREAD_COVERAGE { @@ -97,7 +96,9 @@ workflow LONGREAD_COVERAGE { // // MODULE: BAM TO PRIMARY BED // - BEDTOOLS_BAMTOBED(SAMTOOLS_VIEW_FILTER_PRIMARY.out.bam) + BEDTOOLS_BAMTOBED( + SAMTOOLS_VIEW_FILTER_PRIMARY.out.bam + ) ch_versions = ch_versions.mix(BEDTOOLS_BAMTOBED.out.versions) @@ -140,15 +141,6 @@ workflow LONGREAD_COVERAGE { ch_versions = ch_versions.mix( GNU_SORT.out.versions ) - // - // MODULE: GENERATE DEPTHGRAPH - // - GRAPH_OVERALL_COVERAGE( - GNU_SORT.out.sorted - ) - ch_versions = ch_versions.mix( GRAPH_OVERALL_COVERAGE.out.versions ) - - // // LOGIC: PREPARING NORMAL COVERAGE INPUT // diff --git a/subworkflows/local/repeat_density/main.nf b/subworkflows/local/repeat_density/main.nf index b691a5d0..ce4400d2 100644 --- a/subworkflows/local/repeat_density/main.nf +++ b/subworkflows/local/repeat_density/main.nf @@ -25,12 +25,15 @@ workflow REPEAT_DENSITY { main: ch_versions = Channel.empty() + + // // MODULE: MARK UP THE REPEAT REGIONS OF THE REFERENCE GENOME // WINDOWMASKER_MKCOUNTS ( reference_tuple ) ch_versions = ch_versions.mix( WINDOWMASKER_MKCOUNTS.out.versions ) + // // MODULE: CALCULATE THE STATISTICS OF THE MARKED UP REGIONS // @@ -38,18 +41,21 @@ workflow REPEAT_DENSITY { reference_tuple ) ch_versions = ch_versions.mix( WINDOWMASKER_USTAT.out.versions ) + // // MODULE: USE USTAT OUTPUT TO EXTRACT REPEATS FROM FASTA // EXTRACT_REPEAT( WINDOWMASKER_USTAT.out.intervals ) ch_versions = ch_versions.mix( EXTRACT_REPEAT.out.versions ) + // // MODULE: CREATE WINDOWS FROM .GENOME FILE // BEDTOOLS_MAKEWINDOWS( dot_genome ) ch_versions = ch_versions.mix( BEDTOOLS_MAKEWINDOWS.out.versions ) + // // LOGIC: COMBINE TWO CHANNELS AND OUTPUT tuple(meta, windows_file, repeat_file) // @@ -63,6 +69,7 @@ workflow 
REPEAT_DENSITY { } .set { intervals } + // // MODULE: GENERATES THE REPEAT FILE FROM THE WINDOW FILE AND GENOME FILE // @@ -72,6 +79,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( BEDTOOLS_INTERSECT.out.versions ) + // // MODULE: FIXES IDS FOR REPEATS // @@ -82,6 +90,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( GAWK_RENAME_IDS.out.versions ) + // // MODULE: SORTS THE ABOVE BED FILES // @@ -94,6 +103,7 @@ workflow REPEAT_DENSITY { GNU_SORT_C ( BEDTOOLS_MAKEWINDOWS.out.bed ) // windows file ch_versions = ch_versions.mix( GNU_SORT_C.out.versions ) + // // MODULE: ADDS 4TH COLUMN TO BED FILE USED IN THE REPEAT DENSITY GRAPH // @@ -104,6 +114,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( GAWK_REFORMAT_INTERSECT.out.versions ) + // // LOGIC: COMBINES THE REFORMATTED INTERSECT FILE AND WINDOWS FILE CHANNELS AND SORTS INTO // tuple(intersect_meta, windows file, intersect file) @@ -118,6 +129,7 @@ workflow REPEAT_DENSITY { } .set { for_mapping } + // // MODULE: MAPS THE REPEATS AGAINST THE REFERENCE GENOME // @@ -127,6 +139,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( BEDTOOLS_MAP.out.versions ) + // // MODULE: REPLACES . 
WITH 0 IN MAPPED FILE // @@ -137,6 +150,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( GAWK_REPLACE_DOTS.out.versions ) + // // MODULE: CONVERTS GENOME FILE AND BED INTO A BIGWIG FILE // @@ -146,6 +160,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( UCSC_BEDGRAPHTOBIGWIG.out.versions ) + emit: repeat_density = UCSC_BEDGRAPHTOBIGWIG.out.bigwig versions = ch_versions diff --git a/subworkflows/local/telo_extraction/main.nf b/subworkflows/local/telo_extraction/main.nf new file mode 100644 index 00000000..e5bfd667 --- /dev/null +++ b/subworkflows/local/telo_extraction/main.nf @@ -0,0 +1,37 @@ +include { FIND_TELOMERE_WINDOWS } from '../../../modules/local/find/telomere_windows/main' +include { EXTRACT_TELOMERE } from '../../../modules/local/extract/telomere/main' + +workflow TELO_EXTRACTION { + take: + telomere_file //tuple(meta, file) + + main: + ch_versions = Channel.empty() + + // + // MODULE: GENERATES A WINDOWS FILE FROM THE ABOVE + // + FIND_TELOMERE_WINDOWS ( + telomere_file + ) + ch_versions = ch_versions.mix( FIND_TELOMERE_WINDOWS.out.versions ) + + + def windows_file = FIND_TELOMERE_WINDOWS.out.windows + def safe_windows = windows_file.ifEmpty { Channel.empty() } + + // + // MODULE: Extract the telomere data from the FIND_TELOMERE + // file and reformat into bed + // + EXTRACT_TELOMERE( + safe_windows + ) + ch_versions = ch_versions.mix( EXTRACT_TELOMERE.out.versions ) + + + emit: + bedgraph_file = EXTRACT_TELOMERE.out.bedgraph + versions = ch_versions + +} diff --git a/subworkflows/local/telo_finder/main.nf b/subworkflows/local/telo_finder/main.nf index d0d52123..cdf0d223 100644 --- a/subworkflows/local/telo_finder/main.nf +++ b/subworkflows/local/telo_finder/main.nf @@ -3,16 +3,14 @@ // // MODULE IMPORT BLOCK // -include { GAWK as GAWK_UPPER_SEQUENCE } from '../../../modules/nf-core/gawk/main' include { FIND_TELOMERE_REGIONS } from '../../../modules/local/find/telomere_regions/main' -include { GAWK as GAWK_CLEAN_TELOMERE } 
from '../../../modules/nf-core/gawk/main' -include { FIND_TELOMERE_WINDOWS } from '../../../modules/local/find/telomere_windows/main' -include { EXTRACT_TELOMERE } from '../../../modules/local/extract/telomere/main' +include { GAWK_SPLIT_DIRECTIONS } from '../../../modules/local/gawk_split_directions/main' + +include { TELO_EXTRACTION } from '../../../subworkflows/local/telo_extraction/main' workflow TELO_FINDER { take: - max_scaff_size // val(size of largest scaffold in bp) reference_tuple // Channel [ val(meta), path(fasta) ] teloseq @@ -20,57 +18,69 @@ workflow TELO_FINDER { ch_versions = Channel.empty() - // - // MODULE: UPPERCASE THE REFERENCE SEQUENCE - // - GAWK_UPPER_SEQUENCE( - reference_tuple, - [], - false, - ) - ch_versions = ch_versions.mix( GAWK_UPPER_SEQUENCE.out.versions ) - // // MODULE: FINDS THE TELOMERIC SEQEUNCE IN REFERENCE // FIND_TELOMERE_REGIONS ( - GAWK_UPPER_SEQUENCE.out.output, + reference_tuple, teloseq ) ch_versions = ch_versions.mix( FIND_TELOMERE_REGIONS.out.versions ) // - // MODULE: CLEAN THE .TELOMERE FILE IF CONTAINS "you screwed up" ERROR MESSAGE - // (LIKELY WHEN USING LOWERCASE LETTERS OR BAD MOTIF) - // WORKS BE RETURNING LINES THAT START WITH '>' + // MODULE: SPLIT THE TELOMERE FILE INTO 5' and 3' FILES + // THIS IS RUNNING ON A LOCAL VERSION OF THE GAWK MODULE // - GAWK_CLEAN_TELOMERE ( - FIND_TELOMERE_REGIONS.out.telomere, - [], - false - ) - ch_versions = ch_versions.mix( GAWK_CLEAN_TELOMERE.out.versions ) + if (params.split_telomere) { + GAWK_SPLIT_DIRECTIONS ( + FIND_TELOMERE_REGIONS.out.telomere, + file("${projectDir}/bin/gawk_split_directions.awk") + ) + ch_versions = ch_versions.mix( GAWK_SPLIT_DIRECTIONS.out.versions ) + GAWK_SPLIT_DIRECTIONS.out.prime5 + .map { meta, file -> + tuple( [id: meta.id + "_5P"], file) + } + .set { prime5_telo } + + GAWK_SPLIT_DIRECTIONS.out.prime3 + .map { meta, file -> + tuple( [id: meta.id + "_3P"], file) + } + .set { prime3_telo } + + prime5_telo + .mix(prime3_telo) + 
.mix(FIND_TELOMERE_REGIONS.out.telomere) + .set { telo_for_extraction } + + } else { + telo_for_extraction = FIND_TELOMERE_REGIONS.out.telomere + } - // - // MODULE: GENERATES A WINDOWS FILE FROM THE ABOVE - // - FIND_TELOMERE_WINDOWS ( - GAWK_CLEAN_TELOMERE.out.output - ) - ch_versions = ch_versions.mix( FIND_TELOMERE_WINDOWS.out.versions ) // - // MODULE: EXTRACTS THE LOCATION OF TELOMERIC SEQUENCE BASED ON THE WINDOWS + // SUBWORKFLOW: TELO_EXTRACTION + // - The prime5.mix(prime3) creates a queue channel to execute + // TELO_EXTRACTION per item in channel // - EXTRACT_TELOMERE ( - FIND_TELOMERE_WINDOWS.out.windows + TELO_EXTRACTION ( + telo_for_extraction ) - ch_versions = ch_versions.mix( EXTRACT_TELOMERE.out.versions ) + ch_versions = ch_versions.mix( TELO_EXTRACTION.out.versions ) + + + TELO_EXTRACTION.out.bedgraph_file + .map{ _meta, bedgraph -> + bedgraph + } + .collect() + .set { telo_bedgraphs } + emit: - bed_file = EXTRACT_TELOMERE.out.bed - bedgraph_file = EXTRACT_TELOMERE.out.bedgraph + bedgraph_file = telo_bedgraphs // Used in pretext_graph versions = ch_versions } diff --git a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf index 059abb4d..f0882ce6 100644 --- a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf @@ -79,17 +79,24 @@ workflow PIPELINE_INITIALISATION { type: 'dir' ) - ch_reference = input_fasta.map { fasta -> + ch_reference = input_fasta.map { fasta -> + def fasta_size = fasta.size() + def selected_aligner = (params.aligner == "AUTO") ? + (fasta_size > 5e9 ? 
"minimap2" : "bwamem2") : + params.aligner + tuple( - [ id: params.sample, - aligner: params.aligner, + [ + id: params.sample, + aligner: selected_aligner, map_order: params.map_order, - ref_size: fasta.size(), + ref_size: fasta_size, ], fasta ) } + ch_cram_reads = cram_dir.map { dir -> tuple( [ id: params.sample ], @@ -244,4 +251,3 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } - diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index 0907ac58..443e828c 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,8 +1,8 @@ plugins { - id "nf-schema@2.1.0" + id "nf-schema@2.4.2" } validation { parametersSchema = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" monochromeLogs = true -} \ No newline at end of file +} diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 00000000..73eb92f7 --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,2 @@ +.DS_Store +pipeline_info/*.{html,json,txt,yml} diff --git a/tests/main.nf.test b/tests/default.nf.test similarity index 65% rename from tests/main.nf.test rename to tests/default.nf.test index a39a3713..9e2b2c1a 100644 --- a/tests/main.nf.test +++ b/tests/default.nf.test @@ -8,14 +8,28 @@ nextflow_pipeline { test("Full run") { + setup { + println "\nDownloading the test data..." 
+ def command = ['bash', '-c', "curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf - -C ${projectDir}/"] + def process = command.execute() + process.waitFor() + + if (process.exitValue() != 0) { + throw new RuntimeException("Error - failed to download ${dbKey}: ${process.err.text}") + } + } + when { params { outdir = "${outputDir}" all_output = true skip_tracks = "NONE" + split_telomere = true } } + // SETUP FOR THE TEST_DATA + then { def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ["pipeline_info/*.{html,json,txt}"]) @@ -30,25 +44,23 @@ nextflow_pipeline { assertAll( {assert workflow.success}, {assert snapshot( - // Test for number of successful processes - should be 29 for a full run - workflow.trace.succeeded().size(), + // Test for number of successful processes + workflow.trace.succeeded().size(), // 42 with out needing to gunzip the assembly removeNextflowVersion("$outputDir/pipeline_info/sanger-tol_curationpretext_software_versions.yml"), // Stable name with relative path stable_name, - stable_name.size(), + stable_name.size(), // 18 // Accessory files accessories, - accessories.size(), - - // The two pretext files - // Presence of files indicated presence of the raw_pretexts - // we expect this to be a list of two files - // we can't use their md5sum as they will be different everytime - // Then double check that there are two - // one is a hr and the other a normal variant + accessories.size(), // 9 + + // The pretext files + // We only expect 1 pretext file as when using the `test` profile + // we are skipping hr pretext file generation + // so size will be 1 and presence of hr file is false pretext_maps_raw.size(), pretext_maps_raw.any{it.toString().contains("_hr_pi.pretext".toString())}, pretext_maps_raw.any{it.toString().contains("_normal_pi.pretext".toString())}, diff --git a/tests/main.nf.test.snap b/tests/default.nf.test.snap similarity index 85% rename from 
tests/main.nf.test.snap rename to tests/default.nf.test.snap index 7815abe3..8d8f6ea3 100644 --- a/tests/main.nf.test.snap +++ b/tests/default.nf.test.snap @@ -1,7 +1,7 @@ { "Full run": { "content": [ - 40, + 42, { "BEDTOOLS_BAMTOBED": { "bedtools": "2.31.1" @@ -43,9 +43,6 @@ "FIND_TELOMERE_WINDOWS": { "telomere": 1.0 }, - "GAWK_CLEAN_TELOMERE": { - "gawk": "5.3.0" - }, "GAWK_GAP_LENGTH": { "gawk": "5.3.0" }, @@ -61,12 +58,11 @@ "GAWK_REPLACE_DOTS": { "gawk": "5.3.0" }, - "GAWK_UPPER_SEQUENCE": { + "GAWK_SPLIT_DIRECTIONS": { "gawk": "5.3.0" }, - "GET_LARGEST_SCAFFOLD": { - "get_largest_scaffold": 2.0, - "coreutils": 9.1 + "GAWK_UPPER_SEQUENCE": { + "gawk": "5.3.0" }, "GNU_SORT": { "coreutils": 9.3 @@ -80,10 +76,6 @@ "GNU_SORT_C": { "coreutils": 9.3 }, - "GRAPH_OVERALL_COVERAGE": { - "perl": "(v5.26.2))", - "graph_overall_coverage.pl": 1.0 - }, "MINIMAP2_ALIGN": { "minimap2": "2.28-r1209", "samtools": 1.2 @@ -125,13 +117,17 @@ "windowmasker": "1.0.0" }, "Workflow": { - "sanger-tol/curationpretext": "v1.4.2" + "sanger-tol/curationpretext": "v1.5.0" } }, [ "accessory_files", "accessory_files/CurationPretextTest.bigWig", "accessory_files/CurationPretextTest.gap.bedgraph", + "accessory_files/CurationPretextTest_3P_telomere.bed", + "accessory_files/CurationPretextTest_3P_telomere.bedgraph", + "accessory_files/CurationPretextTest_5P_telomere.bed", + "accessory_files/CurationPretextTest_5P_telomere.bedgraph", "accessory_files/CurationPretextTest_telomere.bed", "accessory_files/CurationPretextTest_telomere.bedgraph", "accessory_files/coverage.bigWig", @@ -144,15 +140,19 @@ "pretext_snapshot", "pretext_snapshot/CurationPretextTest_normalFullMap.png" ], - 14, + 18, [ "CurationPretextTest.bigWig:md5,3f66a9152d793a62f877b733c2336dfd", "CurationPretextTest.gap.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_3P_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_3P_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"CurationPretextTest_5P_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_5P_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", "CurationPretextTest_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", "CurationPretextTest_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", "coverage.bigWig:md5,2e474506c957152b231ac63c859f0b17" ], - 5, + 9, 1, false, true, @@ -164,8 +164,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.4" + "nextflow": "25.04.6" }, - "timestamp": "2025-04-16T11:23:34.556355" + "timestamp": "2025-08-21T21:25:49.92252227" } -} +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 00000000..e3be3550 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,14 @@ +/* +======================================================================================== + Nextflow config file for running nf-test tests +======================================================================================== +*/ + +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/curationpretext' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners diff --git a/workflows/curationpretext.nf b/workflows/curationpretext.nf index 874da2cb..69ef0a1d 100644 --- a/workflows/curationpretext.nf +++ b/workflows/curationpretext.nf @@ -4,12 +4,16 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { GAWK as GAWK_UPPER_SEQUENCE } from '../modules/nf-core/gawk/main' include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' -include { GENERATE_MAPS } from '../subworkflows/local/generate_maps/main' -include { ACCESSORY_FILES } from 
'../subworkflows/local/accessory_files/main' +include { GUNZIP } from '../modules/nf-core/gunzip/main' + include { PRETEXT_GRAPH as PRETEXT_INGEST_SNDRD } from '../modules/local/pretext/graph/main' include { PRETEXT_GRAPH as PRETEXT_INGEST_HIRES } from '../modules/local/pretext/graph/main' +include { GENERATE_MAPS } from '../subworkflows/local/generate_maps/main' +include { ACCESSORY_FILES } from '../subworkflows/local/accessory_files/main' + include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -32,11 +36,49 @@ workflow CURATIONPRETEXT { ch_empty_file = Channel.fromPath("${baseDir}/assets/EMPTY.txt") + ch_reference + .branch { meta, file -> + zipped: file.name.endsWith('.gz') + unzipped: !file.name.endsWith('.gz') + } + .set {ch_input} + + // + // MODULE: UNZIP INPUTS IF NEEDED + // + GUNZIP ( + ch_input.zipped + ) + ch_versions = ch_versions.mix(GUNZIP.out.versions) + + + // + // LOGIC: MIX CHANNELS WHICH MAY OR MAY NOT BE EMPTY INTO A SINGLE QUEUE CHANNEL + // + unzipped_input = Channel.empty() + + unzipped_input + .mix(ch_input.unzipped, GUNZIP.out.gunzip) + .set { unzipped_reference } + + + // + // MODULE: UPPERCASE THE REFERENCE SEQUENCE + // + GAWK_UPPER_SEQUENCE( + unzipped_reference, + [], + false, + ) + ch_upper_ref = GAWK_UPPER_SEQUENCE.out.output + ch_versions = ch_versions.mix( GAWK_UPPER_SEQUENCE.out.versions ) + + // // MODULE: GENERATE INDEX OF REFERENCE FASTA // SAMTOOLS_FAIDX ( - ch_reference, + ch_upper_ref, [[],[]], false ) @@ -76,7 +118,7 @@ workflow CURATIONPRETEXT { // SUBWORKFLOW: GENERATE SUPPLEMENTARY FILES FOR PRETEXT INGESTION // ACCESSORY_FILES ( - ch_reference, + ch_upper_ref, ch_reads, val_teloseq, SAMTOOLS_FAIDX.out.fai @@ -96,7 +138,7 @@ workflow CURATIONPRETEXT { // - GENERATE_MAPS IS THE MINIMAL OUTPUT EXPECTED FROM THIS PIPELLINE // GENERATE_MAPS (
ch_reference, + ch_upper_ref, ch_cram_reads, SAMTOOLS_FAIDX.out.fai ) @@ -115,6 +157,7 @@ workflow CURATIONPRETEXT { cove_file, telo_file, rept_file, + params.split_telomere ) ch_versions = ch_versions.mix( PRETEXT_INGEST_SNDRD.out.versions ) @@ -130,6 +173,7 @@ workflow CURATIONPRETEXT { cove_file, telo_file, rept_file, + params.split_telomere ) ch_versions = ch_versions.mix( PRETEXT_INGEST_SNDRD.out.versions ) }