From 0a6939f7a06d2fd2d278033d86ebcfe5c7949b9e Mon Sep 17 00:00:00 2001 From: Saim Momin Date: Wed, 10 Jun 2026 14:38:20 +0200 Subject: [PATCH 1/4] Add KAT hist, comp, gcp --- tools/kat/kat_comp.xml | 171 +++++++++++++++++++++++++++++++++++++++++ tools/kat/kat_gcp.xml | 171 +++++++++++++++++++++++++++++++++++++++++ tools/kat/kat_hist.xml | 171 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 513 insertions(+) create mode 100644 tools/kat/kat_comp.xml create mode 100644 tools/kat/kat_gcp.xml create mode 100644 tools/kat/kat_hist.xml diff --git a/tools/kat/kat_comp.xml b/tools/kat/kat_comp.xml new file mode 100644 index 0000000000..52254c71e0 --- /dev/null +++ b/tools/kat/kat_comp.xml @@ -0,0 +1,171 @@ + + creates a histogram of k-mer occurrences + + macros.xml + + + + + + + + + + echo @TOOL_VERSION@ + + + +
+ + + +
+ +
+ + ^[0-9]+(,[0-9]+)*$|^$ + + +
+
+ + + + advanced_options['output_type'] == 'png' + + + advanced_options['output_type'] == 'pdf' + + + + + + +
+ + + +
+ + + + + + + + + + + + + +
+ + + +
+ +
+
+ +
+ + + + + + + + + + + +
+ + + +
+ +
+ + + + + + + + + + + + + +
+
+ = the last bucket's lower bound. + +This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret. + +----- + +**Input** + +- One or more **FASTA** or **FASTQ** files (gzipped accepted) +- Or a single pre-computed **Jellyfish hash** file + +----- + +**Outputs** + +1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`. +2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram. + +----- + +**Tips** + +- The default k-mer length of **27** works well for most short-read sequencing data. +- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers. +- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers. +- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`). + ]]> + + +
\ No newline at end of file diff --git a/tools/kat/kat_gcp.xml b/tools/kat/kat_gcp.xml new file mode 100644 index 0000000000..52254c71e0 --- /dev/null +++ b/tools/kat/kat_gcp.xml @@ -0,0 +1,171 @@ + + creates a histogram of k-mer occurrences + + macros.xml + + + + + + + + + + echo @TOOL_VERSION@ + + + +
+ + + +
+ +
+ + ^[0-9]+(,[0-9]+)*$|^$ + + +
+
+ + + + advanced_options['output_type'] == 'png' + + + advanced_options['output_type'] == 'pdf' + + + + + + +
+ + + +
+ + + + + + + + + + + + + +
+ + + +
+ +
+
+ +
+ + + + + + + + + + + +
+ + + +
+ +
+ + + + + + + + + + + + + +
+
+ = the last bucket's lower bound. + +This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret. + +----- + +**Input** + +- One or more **FASTA** or **FASTQ** files (gzipped accepted) +- Or a single pre-computed **Jellyfish hash** file + +----- + +**Outputs** + +1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`. +2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram. + +----- + +**Tips** + +- The default k-mer length of **27** works well for most short-read sequencing data. +- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers. +- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers. +- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`). + ]]> + + +
\ No newline at end of file diff --git a/tools/kat/kat_hist.xml b/tools/kat/kat_hist.xml new file mode 100644 index 0000000000..52254c71e0 --- /dev/null +++ b/tools/kat/kat_hist.xml @@ -0,0 +1,171 @@ + + creates a histogram of k-mer occurrences + + macros.xml + + + + + + + + + + echo @TOOL_VERSION@ + + + +
+ + + +
+ +
+ + ^[0-9]+(,[0-9]+)*$|^$ + + +
+
+ + + + advanced_options['output_type'] == 'png' + + + advanced_options['output_type'] == 'pdf' + + + + + + +
+ + + +
+ + + + + + + + + + + + + +
+ + + +
+ +
+
+ +
+ + + + + + + + + + + +
+ + + +
+ +
+ + + + + + + + + + + + + +
+
+ = the last bucket's lower bound. + +This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret. + +----- + +**Input** + +- One or more **FASTA** or **FASTQ** files (gzipped accepted) +- Or a single pre-computed **Jellyfish hash** file + +----- + +**Outputs** + +1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`. +2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram. + +----- + +**Tips** + +- The default k-mer length of **27** works well for most short-read sequencing data. +- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers. +- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers. +- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`). + ]]> + + +
\ No newline at end of file From a3cdee9b66a4e85f48e5d2069ddd14da27411c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gr=C3=BCning?= Date: Wed, 10 Jun 2026 22:01:43 +0200 Subject: [PATCH 2/4] Update .shed.yml --- tools/kat/.shed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kat/.shed.yml b/tools/kat/.shed.yml index 05a46ede9d..6a23a97d46 100644 --- a/tools/kat/.shed.yml +++ b/tools/kat/.shed.yml @@ -6,7 +6,7 @@ long_description: | using k-mer frequency analysis. It includes tools for generating k-mer histicity plots, comparing k-mer spectra between datasets, filtering sequences by k-mer content, and analysing GC/coverage distributions. -homepage_url: https://github.com/TGAC/KAT +homepage_url: https://github.com/EarlhamInst/KAT remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/kat type: unrestricted categories: From 8dfbe81013d542b0acdf5b984c0254279e77de36 Mon Sep 17 00:00:00 2001 From: Saim Momin Date: Thu, 11 Jun 2026 11:18:08 +0200 Subject: [PATCH 3/4] remove kat_hist --- tools/kat/kat_hist.xml | 171 ----------------------------------------- 1 file changed, 171 deletions(-) delete mode 100644 tools/kat/kat_hist.xml diff --git a/tools/kat/kat_hist.xml b/tools/kat/kat_hist.xml deleted file mode 100644 index 52254c71e0..0000000000 --- a/tools/kat/kat_hist.xml +++ /dev/null @@ -1,171 +0,0 @@ - - creates a histogram of k-mer occurrences - - macros.xml - - - - - - - - - - echo @TOOL_VERSION@ - - - -
- - - -
- -
- - ^[0-9]+(,[0-9]+)*$|^$ - - -
-
- - - - advanced_options['output_type'] == 'png' - - - advanced_options['output_type'] == 'pdf' - - - - - - -
- - - -
- - - - - - - - - - - - - -
- - - -
- -
-
- -
- - - - - - - - - - - -
- - - -
- -
- - - - - - - - - - - - - -
-
- = the last bucket's lower bound. - -This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret. - ------ - -**Input** - -- One or more **FASTA** or **FASTQ** files (gzipped accepted) -- Or a single pre-computed **Jellyfish hash** file - ------ - -**Outputs** - -1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`. -2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram. - ------ - -**Tips** - -- The default k-mer length of **27** works well for most short-read sequencing data. -- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers. -- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers. -- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`). - ]]> - - -
\ No newline at end of file From 785224de809446259fe6be679e509e8f55edbae9 Mon Sep 17 00:00:00 2001 From: Saim Momin Date: Thu, 11 Jun 2026 21:02:34 +0200 Subject: [PATCH 4/4] Fix overwritten tools --- tools/kat/kat_comp.xml | 501 ++++++++++++++++++++++++++++++++++------- tools/kat/kat_gcp.xml | 268 ++++++++++++++-------- 2 files changed, 597 insertions(+), 172 deletions(-) diff --git a/tools/kat/kat_comp.xml b/tools/kat/kat_comp.xml index 52254c71e0..bfd210b4fa 100644 --- a/tools/kat/kat_comp.xml +++ b/tools/kat/kat_comp.xml @@ -1,170 +1,505 @@ - - creates a histogram of k-mer occurrences + + compares k-mer count hashes macros.xml - + echo @TOOL_VERSION@ + #end if + ]]> - -
- - - + +
+ + +
+ + +
+ +
- -
- - ^[0-9]+(,[0-9]+)*$|^$ + + + + + + + + + + + + + + ^[0-9]+(,[0-9]+)*$|^$ + + + + + +
+ + + + + + ^[0-9]+(,[0-9]+)*$|^$ + + ^[0-9]+(,[0-9]+)*$|^$ + +
+ + +
+ + + + +
+ + +
+ +
+ - - - advanced_options['output_type'] == 'png' + + + + + + output_options['output_type'] == 'png' and not output_options['density_plot'] + + + + output_options['output_type'] == 'png' and output_options['density_plot'] + + + + output_options['output_type'] == 'pdf' and not output_options['density_plot'] + + + + output_options['output_type'] == 'pdf' and output_options['density_plot'] + + + + + + + + output_options['output_hists'] + + + output_options['output_hists'] and output_options['output_type'] == 'png' + + + output_options['output_hists'] and output_options['output_type'] == 'pdf' + + + + + output_options['output_hists'] - - advanced_options['output_type'] == 'pdf' + + output_options['output_hists'] and output_options['output_type'] == 'png' + + + output_options['output_hists'] and output_options['output_type'] == 'pdf' + + + + + input3['use_input3'] == 'yes' + + + + input3['use_input3'] == 'yes' + + + + input3['use_input3'] == 'yes' + - - - -
- - - + + +
+
- +
+ +
+ + + +
+ +
+
+ + +
+ - - + + + - + + + + + + + + +
- - - + + + +
+ +
+
+ +
+ + +
- +
-
+
+
- + + + + + + + + - - + - + - + + + + - - - -
- + + + + +
+
- +
+ +
+ + + +
+ +
+
+ + +
+ - - + + + - + + + + + + + + + +
+ + + +
+ +
+
+ +
+ + + +
+ +
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ +
+
+ +
+ + + + +
+ + + + +
+
+ +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ = the last bucket's lower bound. +2. **Reads vs assembly** – Compares k-mers from a read set (dataset 1) against those from an assembly (dataset 2). The matrix is visualised as a stacked spectra-cn histogram, revealing how well the assembly represents the read data and highlighting missing, duplicated, or collapsed content. -This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret. +3. **Filtered comparison** – A third dataset can be provided as a k-mer filter, restricting the comparison to only k-mers present in that set. ----- -**Input** +**Inputs** -- One or more **FASTA** or **FASTQ** files (gzipped accepted) -- Or a single pre-computed **Jellyfish hash** file +- **Dataset 1**: One or more FASTA/FASTQ files (gzip accepted) or a pre-counted Jellyfish hash. For the reads-vs-assembly use case, provide reads here. +- **Dataset 2**: One or more FASTA/FASTQ files (gzip accepted) or a pre-counted Jellyfish hash. For the reads-vs-assembly use case, provide the assembly here. +- **Dataset 3** *(optional)*: A third dataset used as a k-mer filter. Only k-mers present in this set are considered during comparison. ----- **Outputs** -1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`. -2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram. +- **Comparison matrix** (`.mx`): Tab-separated matrix of intersected k-mer counts, with rows corresponding to dataset 1 bins and columns to dataset 2 bins. +- **Plot**: Visualisation of the matrix in the selected format (PNG, PDF, or PostScript). A spectra-cn plot by default, or a density plot if selected. +- **Histogram data and plots** *(optional)*: Per-dataset k-mer frequency histograms and corresponding plots, enabled via *Output histogram data and plots*. ----- -**Tips** +**Notes** + +- For the reads-vs-assembly use case, dataset 1 must be the **reads** and dataset 2 the **assembly**. +- **Scaling factors** can be used to normalise coverage differences between datasets before comparison. +- **5' trim** values can be specified per file using comma-separated integers (e.g. `5,5` for two files). +- Use **Disable automatic hash growing** when working with large genomes or under strict memory limits. +- If inputs are already Jellyfish hashes, k-mer length and hash size settings are ignored for those inputs. + -- The default k-mer length of **27** works well for most short-read sequencing data. -- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers. -- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers. -- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`). ]]> diff --git a/tools/kat/kat_gcp.xml b/tools/kat/kat_gcp.xml index 52254c71e0..93756c439a 100644 --- a/tools/kat/kat_gcp.xml +++ b/tools/kat/kat_gcp.xml @@ -1,170 +1,260 @@ - - creates a histogram of k-mer occurrences + + GC vs Coverage analysis of k-mers macros.xml - + echo @TOOL_VERSION@ + #else: + '$input_type.jellyfish_hash' + #end if + ]]> - -
- - - -
- -
- - ^[0-9]+(,[0-9]+)*$|^$ + + + + + + + + + + + +
+ + + +
+
+ + + +
+
- - - advanced_options['output_type'] == 'png' + + + output_options['output_type'] == 'png' - - advanced_options['output_type'] == 'pdf' + + output_options['output_type'] == 'pdf' + - - - -
- - - + + + + + + +
+
- + - - + + + - + - - - + + + + + + + + + + +
- - - + + + + + +
-
- -
- + + + + - - + + + + + + + + + + + + + + +
+ + + + + + +
+ + +
+ + + + + + + + + + - + + + + + + + + + +
- - - -
- + + + + + + +
+ +
+
+
- +
+ +
+ + + + + + + + - - + - + - - - + + + + +
= the last bucket's lower bound. - -This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret. +- Distinguishing legitimate genomic content from contamination +- Identifying unexpected sequence content +- Analysing biological composition of a sequencing library ----- -**Input** +**Inputs** -- One or more **FASTA** or **FASTQ** files (gzipped accepted) -- Or a single pre-computed **Jellyfish hash** file +- **FastA/FastQ files**: One or more sequence files (optionally gzip-compressed). KAT will perform k-mer counting internally. +- **Jellyfish hash**: A pre-computed k-mer hash produced by Jellyfish. ----- **Outputs** -1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`. -2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram. +- **GCP matrix** (`.mx`): Tab-separated matrix of GC count vs k-mer coverage bins. Each cell contains the count of distinct k-mers with that GC/coverage combination. +- **GCP plot**: A 2D density plot visualising the matrix, useful for detecting contamination or ploidy signals. +- **Stats**: A JSON file with summary statistics for the run. ----- **Tips** -- The default k-mer length of **27** works well for most short-read sequencing data. -- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers. -- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers. -- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`). +- For contamination screening, look for distinct clusters in the GCP plot that sit away from the main genomic peak. +- Increase `--mer_len` for more specific k-mer discrimination (at the cost of memory). + ]]>