nf-core · charles-plessy · May 25, 2025 · May 23, 2025 · May 23, 2025 · May 23, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - Support for export to BAM and CRAM formats ([#31](https://github.com/nf-core/pairgenomealign/issues/31)) ([#43](https://github.com/nf-core/pairgenomealign/issues/43)).
+- SAM/BAM/CRAM alignment files are sorted and their headers feature all sequences of the _target_ genome.
+- Report ungapped percent identity ([#46](https://github.com/nf-core/pairgenomealign/issues/46)).
+- Update full-size test genomes to feature more T2T assemblies ([#59](https://github.com/nf-core/pairgenomealign/issues/59)).
 
 ### `Dependencies`
 
@@ -20,6 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - Remove noisy tag in the `MULTIQC_ASSEMBLYSCAN_PLOT_DATA` local module ([#64](https://github.com/nf-core/pairgenomealign/issues/64)).
+- Restore BED format support ([#56](https://github.com/nf-core/pairgenomealign/issues/56)).
+- Document the `multiqc_train.txt` and `multiqc_last_o2o.txt` files aggregating alignment statistics ([#52](https://github.com/nf-core/pairgenomealign/issues/52)).
+- Point the test configs samplesheets to `nf-core/test-datasets` in order to run the AWS full tests ([#62](https://github.com/nf-core/pairgenomealign/issues/62)).
 
 ## [v2.1.0](https://github.com/nf-core/pairgenomealign/releases/tag/2.1.0) "Goya champuru" - [May 16th 2025]
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -55,8 +55,12 @@ custom_data:
         description: "target__query"
       TotalAlignmentLength:
         title: "Total alignment length"
-      PercentSimilarity:
-        title: "Percent similarity"
+      PercentIdentity:
+        title: "Percent identity"
+        description: "Matches / alignment length, including gaps"
+      PercentIdentityNoGaps:
+        title: "Ungapped percent identity"
+        description: "Matches / (Matches + Mismatches), thus excluding gaps"
 
 sp:
   last_o2o:

diff --git a/assets/samplesheet_full.csv b/assets/samplesheet_full.csv
diff --git a/assets/samplesheet_small.csv b/assets/samplesheet_small.csv
diff --git a/conf/modules.config b/conf/modules.config
@@ -141,6 +141,7 @@ process {
     }
 
     withName: 'SAMTOOLS_DICT' {
+        ext.args = { "-u ./${fasta} -a ${meta.id}" }
         publishDir = [
             enabled: false
         ]

diff --git a/conf/test.config b/conf/test.config
@@ -23,7 +23,8 @@ params {
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
     // Input data
-    input  = 'https://raw.githubusercontent.com/nf-core/test-datasets/pairgenomealign/tests/testsamplesheet.csv'
+//    input  = 'https://raw.githubusercontent.com/nf-core/test-datasets/pairgenomealign/tests/testsamplesheet.csv'
+    input  = params.pipelines_testdata_base_path + 'pairgenomealign/tests/testsamplesheet.csv'
 
     // Target
     target = params.pipelines_testdata_base_path + 'modules/data/genomics/sarscov2/genome/genome.fasta'

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -15,10 +15,10 @@ params {
     config_profile_description = 'Full test dataset to check pipeline function'
 
     // Input data
-    input  = 'assets/samplesheet_full.csv'
+    input  = params.pipelines_testdata_base_path + 'pairgenomealign/tests/testsamplesheet_full.csv'
 
     // Target
-    // Took  1h 32m 14s (257.8 CPU hours) to compute on OIST's HPC cluster.
+    // Took  1h 42m 18s (285.4 CPU hours) to compute on OIST's HPC cluster.
     target = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz'
     targetName = 'Homo_sapiens_GRCh38.p14'
 

diff --git a/conf/test_small.config b/conf/test_small.config
@@ -20,7 +20,7 @@ params {
     max_time   = '1.h'
 
     // Input data
-    input = 'assets/samplesheet_small.csv'
+    input  = params.pipelines_testdata_base_path + 'pairgenomealign/tests/testsamplesheet_small.csv'
 
     // Target
     target = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/025/258/505/GCA_025258505.1_ASM2525850v1/GCA_025258505.1_ASM2525850v1_genomic.fna.gz'

diff --git a/...apiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png b/...apiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png
diff --git a/docs/images/Homo_sapiens_GRCh38.p14___Macaca_mulatta_GCA_049350105.1.o2o_plt.png b/docs/images/Homo_sapiens_GRCh38.p14___Macaca_mulatta_GCA_049350105.1.o2o_plt.png
diff --git a/docs/images/mqc_base_content_summary-pct.png b/docs/images/mqc_base_content_summary-pct.png
diff --git a/docs/images/mqc_contigs_length_statistics.png b/docs/images/mqc_contigs_length_statistics.png
diff --git a/docs/images/mqc_last_o2o-stats.png b/docs/images/mqc_last_o2o-stats.png
diff --git a/docs/images/mqc_train-stats.png b/docs/images/mqc_train-stats.png
diff --git a/docs/output.md b/docs/output.md
@@ -37,11 +37,13 @@ Basic statistics on nucleotide content and contig length are collected for align
 
 - `alignment/`
   - `*.train` is the alignment parameters computed by `last-train` (optional)
+  - `*.train.tsv` reports some of the parameters computed by `last-train` for MultiQC (optional)
   - `*.m2m_aln.maf.gz` is the _**many-to-many**_ alignment between _target_ and _query_ genomes. (optional through the `--m2m` option)
   - `*.m2o_aln.maf.gz` is the _**many-to-one**_ alignment regions of the _target_ genome are matched at most once by the _query_ genome. (optional through the `--m2m` option)
   - `*.o2m_aln.maf.gz` is the _**one-to-many**_ alignment between the _target_ and _query_ genomes. (optional through the `--m2m` option)
   - `*.o2o_aln.maf.gz` is the _**one-to-one**_ alignment between the _target_ and _query_ genomes.
-  - For each _**one-to-one**_ alignment there will be an additional file in a format such as Axt, Chain, GFF or SAM/BAM/CRAM if you used the `--export_aln_to` parameter. These extra files are always compressed with gzip when their format is text-based.
+  - `*.o2o_aln.tsv` reports nucleotide percent identity of the _**one-to-one**_ alignment for MultiQC.
+  - For each _**one-to-one**_ alignment there will be an additional file in a format such as Axt, Chain, GFF or SAM/BAM/CRAM if you used the `--export_aln_to` parameter. These extra files are always compressed with gzip when their format is text-based. The SAM/BAM/CRAM files are always sorted. Their headers feature all sequences from the _target_ genome, including the ones that did not align to the _query_, so that alignment files can be merged without disturbing the sort order.
 
 </details>
 
@@ -60,9 +62,9 @@ Genomes are aligned witn [`lastal`](https://gitlab.com/mcfrith/last/-/blob/main/
 
 </details>
 
-Dot plots are representing the pairwise genome alignments and produced with the [`last-dotplot`](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-dotplot.rst) tool. By default, their maximal width is fixed to aproximately 1000 pixels, so that the _target_ genome is always represented at the same scale in all plots. In the one-to-one alignment example below, the `hg38` human genome (_target_) is represented on the horizontal axis and a monkey genopme (_Macaca mulatta_ accession number `GCA\_003339765.3`) on the vertical axis (_query_). Regions containing unknown (`N`) sequences are on pink background. Forward (+/+) alignments are plotted in red and reverse (+/– or –/+) in blue. _Target_ (human) contigs are displayed in their original order. _Query_ contigs (monkey) are reordered and possibly reverse-complemented to diagonalise the plot as much as possible. The names of reverse-complemented contigs are printed in blue.
+Dot plots represent the pairwise genome alignments and are produced with the [`last-dotplot`](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-dotplot.rst) tool. By default, their maximal width is fixed to approximately 1000 pixels, so that the _target_ genome is always represented at the same scale in all plots. In the one-to-one alignment example below, the `hg38` human genome (_target_) is represented on the horizontal axis and a monkey genome (_Macaca mulatta_ accession number `GCA_049350105.1`) on the vertical axis (_query_). Regions containing unknown (`N`) sequences are on pink background. Forward (+/+) alignments are plotted in red and reverse (+/– or –/+) in blue. _Target_ (human) contigs are displayed in their original order. _Query_ contigs (monkey) are reordered and possibly reverse-complemented to diagonalise the plot as much as possible. The names of reverse-complemented contigs are printed in blue.
 
-![Example of a dot-plot produced by the pipeline after aligning human and macaque genomes](images/Homo_sapiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png "Human–Monkey comparison")
+![Example of a dot-plot produced by the pipeline after aligning human and macaque genomes](images/Homo_sapiens_GRCh38.p14___Macaca_mulatta_GCA_049350105.1.o2o_plt.png "Human–Monkey comparison")
 
 ### `N` regions
 
@@ -85,6 +87,8 @@ The poly-N regions longer than 9 bases in each genome sequence often indicate co
 - `multiqc/`
   - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
   - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+    - `multiqc_data/multiqc_train.txt`: table reporting the alignment parameters chosen by `last-train`, for each sample.
+    - `multiqc_data/multiqc_last_o2o.txt`: table reporting the nucleotide percent identity in the alignments computed by `lastal`, for each sample.
   - `multiqc_plots/`: directory containing static images from the report in various formats.
   - `assemblyscan_plot_data`: GC content and contig length statistics parsed from `assemblyscan` for MultiQC with a local module.
 
@@ -108,15 +112,15 @@ Contig length statistics can be displayed by MultiQC as violin plots.
 
 ![Example of a contig length report for primate genomes](images/mqc_contigs_length_statistics.png "Contig length statistics")
 
-#### TRaining parameters
+#### Training parameters
 
 Alignment parameters computed by `last-train` can be displayed by MultiQC as violin plots.
 
 ![Example of alignment parameters for primate genomes aligned to the human genome](images/mqc_train-stats.png "Alignment parameters")
 
 #### Alignment
 
-Alignment statistics can be displayed by MultiQC as violin plots.
+Alignment statistics can be displayed by MultiQC as violin plots. There is no standard way to compute nucleotide identity ([May A. 2004](https://doi.org/10.1016/j.str.2004.04.001)), therefore the pipeline reports two alternatives, including or excluding gaps from the computation.
 
 ![Example of alignment statistics for primate genomes aligned to the human genome](images/mqc_last_o2o-stats.png "Alignment statistics")
 

diff --git a/modules.json b/modules.json
@@ -17,7 +17,7 @@
                     },
                     "last/lastal": {
                         "branch": "master",
-                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "git_sha": "90689d2e4b5a6a50dd0e0a5a29cb5f5b0fc6ab95",
                         "installed_by": ["modules"]
                     },
                     "last/lastdb": {
@@ -27,7 +27,7 @@
                     },
                     "last/mafconvert": {
                         "branch": "master",
-                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "git_sha": "299a50faa298a9cfd794b33d1fd19513ead43074",
                         "installed_by": ["modules"]
                     },
                     "last/mafswap": {
@@ -37,7 +37,7 @@
                     },
                     "last/split": {
                         "branch": "master",
-                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "git_sha": "8d93c521ea9887d6539fcd2e4d909d863eef6fec",
                         "installed_by": ["modules"]
                     },
                     "last/train": {

diff --git a/modules/nf-core/last/lastal/main.nf b/modules/nf-core/last/lastal/main.nf
diff --git a/modules/nf-core/last/lastal/meta.yml b/modules/nf-core/last/lastal/meta.yml