Skip to content

Commit 06b3609

Browse files
authored
Add kat_hist (#1903)
1 parent cc74c4f commit 06b3609

1 file changed

Lines changed: 171 additions & 0 deletions

File tree

tools/kat/kat_hist.xml

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
<tool id="kat_hist" name="KAT Histogram" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2+
<description>creates a histogram of k-mer occurrences</description>
3+
<macros>
4+
<import>macros.xml</import>
5+
</macros>
6+
<expand macro="xrefs"/>
7+
<expand macro="requirements"/>
8+
<stdio>
<!-- Job failure detection. The stderr regex reports the "corrupted double-linked list"
     message at warning level. Exit codes 2 to 133, 135 and above, and all negative codes
     are fatal. Codes 0, 1 and 134 are not matched by any rule in this block.
     NOTE(review): 134 is presumably the abort raised together with that heap corruption
     message, which is why it is skipped; exit code 1 is also left non-fatal. Confirm that
     both omissions are intentional. -->
9+
<regex match="corrupted double-linked list" source="stderr" level="warning" description="KAT C++ heap corruption - output files were still produced correctly"/>
10+
<exit_code range="2:133" level="fatal" description="Error occurred"/>
11+
<exit_code range="135:" level="fatal" description="Error occurred"/>
12+
<exit_code range=":-1" level="fatal" description="Error occurred"/>
13+
</stdio>
14+
<version_command>echo @TOOL_VERSION@</version_command>
15+
<command><![CDATA[
16+
## Stage every input in the job directory as input_<index><ext>. The ext variable is
## expected to be set by the SET_EXT token from the imported macros (TODO confirm).
## Inputs are symlinked rather than copied so large read files are not duplicated.
#for $i in range(len($input_files)):
17+
#set $f = $input_files[$i]
18+
@SET_EXT@
19+
ln -s '$f' 'input_${i}${ext}' &&
20+
#end for
21+
kat hist
22+
-o kat_hist_out
23+
-t "\${GALAXY_SLOTS:-1}"
24+
--low $histogram_options.low
25+
--high $histogram_options.high
26+
--inc $histogram_options.inc
27+
--mer_len $kmer_options.mer_len
28+
--hash_size $kmer_options.hash_size
29+
$kmer_options.non_canonical
30+
@CMD_5PTRIM@
31+
--output_type '$advanced_options.output_type'
32+
## Pass the staged names as positional arguments, in the same order as they were linked.
#for $i in range(len($input_files)):
33+
#set $f = $input_files[$i]
34+
@SET_EXT@
35+
'input_${i}${ext}'
36+
#end for
37+
]]></command>
38+
<inputs>
<!-- Input datasets. Only the FASTA/FASTQ formats listed in "format" can be selected, so the
     help text must not advertise other input kinds. -->
39+
<param name="input_files" type="data" format="fastqsanger,fastqsanger.gz,fasta,fasta.gz" multiple="true" label="Input files" help="One or more FastA/FastQ files (gzipped accepted)"/>
40+
<!-- Histogram binning: passed to kat hist as low, high and inc. -->
<section name="histogram_options" title="Histogram Options" expanded="true">
41+
<param argument="--low" type="integer" value="1" min="1" label="Low count" help="Low count value of histogram"/>
42+
<param argument="--high" type="integer" value="10000" min="1" label="High count" help="High count value of histogram"/>
43+
<param argument="--inc" type="integer" value="1" min="1" label="Increment" help="Increment for each bin"/>
44+
</section>
45+
<expand macro="kmer_params"/>
46+
<section name="advanced_options" title="Advanced Options" expanded="false">
47+
<!-- Free text so that several per-file values can be given; the validator accepts an empty
     value, a single number, or comma-separated numbers. -->
<param name="trim_5p" argument="--5ptrim" type="text" value="" label="5' trim" help="Ignore the first X bases from reads. For multiple input files, provide comma-separated values (e.g. 5,10,0).">
48+
<validator type="regex" message="Must be a number or comma-separated numbers">^[0-9]+(,[0-9]+)*$|^$</validator>
49+
</param>
50+
<expand macro="output_type_param"/>
51+
</section>
52+
</inputs>
53+
<outputs>
<!-- The histogram table is always collected from kat_hist_out, the prefix given to "-o" in
     the command. Each plot output is kept only when the selected output_type matches its
     format, so a given output_type yields at most one plot dataset. -->
54+
<data name="output_hist" format="tabular" from_work_dir="kat_hist_out" label="${tool.name} on ${on_string}: Histogram"/>
55+
<data name="output_plot" format="png" from_work_dir="kat_hist_out.png" label="${tool.name} on ${on_string}: PNG Plot">
56+
<filter>advanced_options['output_type'] == 'png'</filter>
57+
</data>
58+
<data name="output_pdf" format="pdf" from_work_dir="kat_hist_out.pdf" label="${tool.name} on ${on_string}: PDF Plot">
59+
<filter>advanced_options['output_type'] == 'pdf'</filter>
60+
</data>
61+
</outputs>
62+
<tests>
63+
<!-- Test 01: histogram options set explicitly to low=1, high=10000, inc=1 on one gzipped
     FASTQ input. Expects two outputs: the histogram table (compared with kat_hist.out and
     checked for the k-mer header line and line count) and the PNG plot (checked for channel
     count and pixel dimensions). output_type is not set here, so PNG is presumably the
     default of the output_type_param macro (TODO confirm). -->
64+
<test expect_num_outputs="2">
65+
<param name="input_files" location="https://zenodo.org/records/20347489/files/kat-input-subsampled.fastq.gz" ftype="fastqsanger.gz"/>
66+
<section name="histogram_options">
67+
<param name="low" value="1"/>
68+
<param name="high" value="10000"/>
69+
<param name="inc" value="1"/>
70+
</section>
71+
<output name="output_hist" file="kat_hist.out">
72+
<assert_contents>
73+
<has_text text="# Kmer value:27"/>
74+
<has_n_lines n="10007"/>
75+
</assert_contents>
76+
</output>
77+
<output name="output_plot" file="kat_hist.out.png" ftype="png">
78+
<assert_contents>
79+
<has_image_channels channels="4"/>
80+
<has_image_height height="1800"/>
81+
<has_image_width width="2400"/>
82+
</assert_contents>
83+
</output>
84+
</test>
85+
<!-- Test 02: k-mer length 21 with PDF output. Expects two outputs: the histogram table
     (header must report k=21) and the PDF plot. The PDF is checked by size only, with a
     tolerance, because the exact byte count of a generated PDF is not stable across
     plotting library versions. -->
86+
<test expect_num_outputs="2">
87+
<param name="input_files" location="https://zenodo.org/records/20347489/files/kat-input-subsampled.fastq.gz" ftype="fastqsanger.gz"/>
88+
<section name="kmer_options">
89+
<param name="mer_len" value="21"/>
90+
</section>
91+
<section name="advanced_options">
92+
<param name="output_type" value="pdf"/>
93+
</section>
94+
<output name="output_hist">
95+
<assert_contents>
96+
<has_text text="# Kmer value:21"/>
97+
<has_n_lines n="10007"/>
98+
</assert_contents>
99+
</output>
100+
<output name="output_pdf" ftype="pdf">
101+
<assert_contents>
102+
<has_size size="13956" delta="1000"/>
103+
</assert_contents>
104+
</output>
105+
</test>
106+
<!-- Test 03: 5' trim of 5 bases on one gzipped FASTQ input, all other options left at
     their defaults. Expects two outputs: the histogram table (header must report k=27)
     and the PNG plot (checked for channel count and pixel dimensions). -->
107+
<test expect_num_outputs="2">
108+
<param name="input_files" location="https://zenodo.org/records/20347489/files/kat-input-subsampled.fastq.gz" ftype="fastqsanger.gz"/>
109+
<section name="advanced_options">
110+
<param name="trim_5p" value="5"/>
111+
</section>
112+
<output name="output_hist">
113+
<assert_contents>
114+
<has_text text="# Kmer value:27"/>
115+
<has_n_lines n="10007"/>
116+
</assert_contents>
117+
</output>
118+
<output name="output_plot" ftype="png">
119+
<assert_contents>
120+
<has_image_channels channels="4"/>
121+
<has_image_height height="1800"/>
122+
<has_image_width width="2400"/>
123+
</assert_contents>
124+
</output>
125+
</test>
126+
</tests>
127+
<help format="markdown"><![CDATA[
128+
**KAT Histogram (kat hist)**
129+
130+
Creates a histogram of k-mer occurrences from sequencing data.
131+
132+
-----
133+
134+
**What it does**
135+
136+
The histogram shows how many k-mers appear at each frequency in the input data. Bucket *i* tallies k-mers with count *c* satisfying:
137+
138+
```
139+
low + i*inc <= c < low + (i+1)*inc
140+
```
141+
142+
The final bucket catches all k-mers with count >= the last bucket's lower bound.
143+
144+
This tool is similar to the `histo` command in Jellyfish, but the output includes metadata that makes the histogram easier to plot and interpret.
145+
146+
-----
147+
148+
**Input**
149+
150+
- One or more **FASTA** or **FASTQ** files (gzipped accepted)
151+
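- Pre-computed **Jellyfish hash** files, which `kat hist` accepts on the command line, cannot be selected in this wrapper: the input parameter is restricted to FASTA/FASTQ datasets.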
152+
153+
-----
154+
155+
**Outputs**
156+
157+
1. **Histogram** (tabular) - tab-separated count/frequency table with metadata header lines prefixed with `#`.
158+
2. **Plot** (PNG/PDF) - visual plot of the k-mer frequency histogram.
159+
160+
-----
161+
162+
**Tips**
163+
164+
- The default k-mer length of **27** works well for most short-read sequencing data.
165+
- For diploid or polyploid genomes, distinct peaks correspond to heterozygous and homozygous k-mers.
166+
- If the hash size is too small, KAT will automatically double it (increasing runtime and memory). Increase it if you expect many unique k-mers.
167+
- For multiple input files, provide different 5' trim values as comma-separated numbers (e.g. `5,10`).
168+
]]></help>
169+
<expand macro="citations"/>
170+
<expand macro="creator"/>
171+
</tool>

0 commit comments

Comments
 (0)