Polish split_file_to_collection and splitfasta help with diagrams (#1820)

nekrut · claude · bgruening · web-flow · commit 12b8a7fa3612 · 2026-03-25T22:09:14.000Z
* Polish split_file_to_collection and splitfasta help with diagrams - Add PNG diagrams via macros showing split operations - Rewrite help with structured Description/Examples format - Bump splitfasta version to 0.5.2 - All tests pass (20/20 + 2/2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Add examples to split_file_to_collection, fix help reference Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Use bullet list for splitting modes Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Restructure help: merge allocation into examples Drop abstract index table. Each allocation mode (alternating, batch, random) is now shown with the same FASTA input so the difference is immediately visible. Add plain-English annotations to each example. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Use emoji sequence names in examples Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Add regex substitution example for tabular column split Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Add .lint_skip for pre-existing TestsCaseValidation warnings Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Bump split_file_to_collection version to 0.5.3 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Apply suggestion from @bgruening * Delete tools/splitfasta/macros.xml * Update splitFasta.xml * Update split_file_to_collection.xml --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
diff --git a/tools/splitfasta/splitFasta.xml b/tools/splitfasta/splitFasta.xml
@@ -1,4 +1,4 @@
-<tool id="rbc_splitfasta" name="Split Fasta" version="0.5.1" profile="23.0">
+<tool id="rbc_splitfasta" name="Split Fasta" version="0.5.2" profile="23.0">
     <description>files into a collection</description>
     <requirements>
         <requirement type="package" version="1.76">biopython</requirement>
@@ -52,8 +52,34 @@
         </test>
     </tests>
     <help><![CDATA[
-        Takes an input FASTA file and writes entries (i.e. sequences) to separate datasets, which are organized in a dataset collection.
-        There are two modes: 1) each sequence is written to its own data set which is named by the ID of the sequence or 2) The file is split into a given number of chunks which are numbered.
+
+===========
+Description
+===========
+
+Splits a FASTA file into separate datasets organized in a collection. Two modes are available:
+
+- **Each sequence in its own dataset** — one output file per sequence, named by the sequence ID
+- **Split into chunks** — sequences are distributed across a specified number of output files
+
+.. image:: $PATH_TO_IMAGES/split_fasta.png
+  :alt: Split a FASTA file into a collection with one sequence per dataset
+  :width: 620
+
+========
+Examples
+========
+
+**One sequence per dataset**
+
+A FASTA file with 3 sequences produces a collection of 3 datasets named ``seq_A``, ``seq_B``, ``seq_C``.
+
+-------
+
+**Split into 2 chunks**
+
+The same file split into 2 chunks produces ``part1`` (2 sequences) and ``part2`` (1 sequence).
+
     ]]></help>
     <citations>
         <citation type="bibtex">
diff --git a/tools/splitfasta/static/images/split_fasta.png b/tools/splitfasta/static/images/split_fasta.png
diff --git a/tools/splitfasta/static/images/split_fasta.svg b/tools/splitfasta/static/images/split_fasta.svg
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 620 230" width="620" height="230"><!-- Help diagram (620x230): a three-record FASTA input on the left is split into a collection of three single-sequence datasets on the right -->
+  <defs><!-- Reusable definitions, referenced below through url(#shadow) and url(#arrowhead) -->
+    <filter id="shadow" x="-4%" y="-4%" width="110%" height="110%"><!-- Soft shadow for the two panels; the filter region is slightly larger than the shape so the blur is not clipped -->
+      <feGaussianBlur in="SourceAlpha" stdDeviation="2" result="blur"/><!-- Blur the shape's alpha channel only -->
+      <feComponentTransfer in="blur"><feFuncA type="linear" slope="0.3"/></feComponentTransfer><!-- Scale the blurred alpha to 30% so the shadow stays faint -->
+      <feMerge><feMergeNode/><feMergeNode in="SourceGraphic"/></feMerge><!-- First node has no "in", so it takes the previous primitive's result (the faded blur); the original graphic is painted on top. No offset is applied, so the shadow is centred on the shape -->
+    </filter>
+    <marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto"><!-- Triangular arrow tip attached to the "Split" line via marker-end; orient="auto" aligns it with the line direction -->
+      <polygon points="0 0, 10 3.5, 0 7" fill="#555"/>
+    </marker>
+  </defs>
+  <!-- Left panel: the single input FASTA file, listing three records (seq_A, seq_B, seq_C) as header/sequence line pairs plus a "3 sequences" caption -->
+  <rect x="10" y="20" width="170" height="170" rx="8" ry="8" fill="#fff" stroke="#aaa" stroke-width="1" filter="url(#shadow)"/>
+  <text x="95" y="38" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="bold" fill="#333">Input: FASTA</text>
+  <text x="25" y="58" font-family="monospace" font-size="9" fill="#555">&gt;seq_A</text>
+  <text x="25" y="70" font-family="monospace" font-size="9" fill="#888">ATCGATCG...</text>
+  <text x="25" y="88" font-family="monospace" font-size="9" fill="#555">&gt;seq_B</text>
+  <text x="25" y="100" font-family="monospace" font-size="9" fill="#888">GCTAGCTA...</text>
+  <text x="25" y="118" font-family="monospace" font-size="9" fill="#555">&gt;seq_C</text>
+  <text x="25" y="130" font-family="monospace" font-size="9" fill="#888">TTAACCGG...</text>
+  <text x="95" y="170" text-anchor="middle" font-family="sans-serif" font-size="9" fill="#888">3 sequences</text>
+  <!-- Arrow from the input panel to the output panel, labelled "Split" above the line and "one per sequence" below it -->
+  <line x1="195" y1="105" x2="305" y2="105" stroke="#555" stroke-width="2" marker-end="url(#arrowhead)"/>
+  <text x="250" y="95" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="bold" fill="#555">Split</text>
+  <text x="250" y="123" text-anchor="middle" font-family="sans-serif" font-size="9" fill="#888">one per sequence</text>
+  <!-- Right panel: green box standing for the output collection; each white card inside it is one dataset -->
+  <rect x="320" y="10" width="290" height="190" rx="8" ry="8" fill="#d9ead3" stroke="#888" stroke-width="1" filter="url(#shadow)"/>
+  <text x="465" y="28" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="bold" fill="#333">Output: A collection</text>
+  <!-- Dataset card for seq_A: the bold title is the dataset name (the sequence ID), the monospace line shows its single record -->
+  <rect x="332" y="38" width="266" height="40" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
+  <text x="465" y="52" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">seq_A</text>
+  <text x="345" y="68" font-family="monospace" font-size="9" fill="#888">&gt;seq_A  ATCGATCG...</text>
+  <!-- Dataset card for seq_B, same layout shifted down by 48 units -->
+  <rect x="332" y="86" width="266" height="40" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
+  <text x="465" y="100" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">seq_B</text>
+  <text x="345" y="116" font-family="monospace" font-size="9" fill="#888">&gt;seq_B  GCTAGCTA...</text>
+  <!-- Dataset card for seq_C, same layout shifted down by a further 48 units -->
+  <rect x="332" y="134" width="266" height="40" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
+  <text x="465" y="148" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">seq_C</text>
+  <text x="345" y="164" font-family="monospace" font-size="9" fill="#888">&gt;seq_C  TTAACCGG...</text>
+  <!-- Footer caption, centred at x=310 (the middle of the 620-wide canvas), summarising the diagram -->
+  <text x="310" y="223" text-anchor="middle" font-family="sans-serif" font-size="10" font-style="italic" fill="#666">each sequence becomes a separate dataset named by its ID</text>
+</svg>
diff --git a/tools/text_processing/split_file_to_collection/.lint_skip b/tools/text_processing/split_file_to_collection/.lint_skip
@@ -0,0 +1 @@
+TestsCaseValidation
diff --git a/tools/text_processing/split_file_to_collection/split_file_to_collection.xml b/tools/text_processing/split_file_to_collection/split_file_to_collection.xml
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.5.2">
+<tool id="split_file_to_collection" name="Split file" version="0.5.3">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">
@@ -30,7 +30,7 @@
             <param name="newfilenames" type="text" label="Base name for new files in collection"
                 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/>
             <conditional name="select_allocate">
-                <param name="allocate" type="select" label="Method to allocate records to new files" help="See the information section for a diagram">
+                <param name="allocate" type="select" label="Method to allocate records to new files" help="See the help section for a diagram">
                     <option value="random">At random</option>
                     <option value="batch">Maintain record order</option>
                     <option value="byrow" selected="true">Alternate output files</option>
@@ -527,58 +527,110 @@
         </test>
     </tests>
     <help><![CDATA[
-**Split file into a dataset collection**
-
-This tool splits a data set consisting of records into multiple data sets within a collection.
-A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
-(headers + sequence + qualities), etc. The important property is that the records either have a 
-specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record
-can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA.
-The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), SDF (records start with "^BEGIN IONS") and MGF (records end with "^$$$$").
-For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter. 
-If the generic splitter is used, an option is also available to split records either before or after the
-separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all
-others).
-
-If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be either done alternatingly, in original record order, or at random.
-
-If t records are to be distributed to n new data sets, then the i-th record goes to data set
-
-* floor(i / t * n) (for batch),
-* i % n (for alternating), or
-* a random data set
-
-For instance, t=5 records are distributed as follows on n=2 data sets
-
-= === === ====
-i bat alt rand
-= === === ====
-0 0   0   0
-1 0   1   1
-2 0   0   1
-3 1   1   0
-4 1   0   0
-= === === ====
-
-If the five records are distributed on n=3 data sets:
-
-= === === ====
-i bat alt rand
-= === === ====
-0 0   0   0
-1 0   1   1
-2 1   2   2
-3 1   0   0
-4 2   1   1
-= === === ====
-
-Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files.
-
-If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
-In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior.
-The default regular expression uses each value in the column without modifying it.
-
-Two modes are available for the tool. For the main mode, the number of output files is selected. In this case, records are shared out between this number of files. Alternatively, 'chunking mode' can be selected, which puts a fixed number of records (the 'chunk size') into each output file.
+
+===========
+Description
+===========
+
+Splits a dataset into multiple files organized as a dataset collection. Supports FASTA, FASTQ, tabular, text, MGF, SD-files, and generic record-based formats.
+
+Records can be defined by a fixed line count (e.g. 4 lines for FASTQ) or by a regular expression marking record boundaries (e.g. ``>.*`` for FASTA). Presets handle common formats automatically; the generic splitter allows custom separators.
+
+.. image:: $PATH_TO_IMAGES/split_file.png
+  :alt: Split a dataset into a collection of files
+  :width: 620
+
+You can control how many output files are created:
+
+- **Number of output files** — records are shared out between *n* files
+- **Chunk mode** — each file gets *k* records (the chunk size); only the last file may get fewer
+
+For tabular input, you can also split by a column value — a new file is created for each unique value in the chosen column, with optional regex substitution.
+
+========
+Examples
+========
+
+The following examples use a FASTA file with 4 sequences as input (the records are shown side by side to save space)::
+
+ >🍎        >🍊        >🍋        >🍇
+ ATCG       GCTA       TTAA       CCGG
+
+-------
+
+**Alternating** (default) — records are dealt out round-robin, like cards. Split into 2 files::
+
+ split_000000.fasta:       split_000001.fasta:
+   >🍎  (1st record)         >🍊  (2nd record)
+   ATCG                      GCTA
+   >🍋  (3rd record)         >🍇  (4th record)
+   TTAA                      CCGG
+
+Records alternate: 🍎→file0, 🍊→file1, 🍋→file0, 🍇→file1.
+
+-------
+
+**Batch** — records stay in original order, split into contiguous blocks. Split into 2 files::
+
+ split_000000.fasta:       split_000001.fasta:
+   >🍎  (1st record)         >🍋  (3rd record)
+   ATCG                      TTAA
+   >🍊  (2nd record)         >🍇  (4th record)
+   GCTA                      CCGG
+
+The first half of the records goes to the first file and the second half to the second, so the original record order is preserved.
+
+-------
+
+**Random** — each record is assigned to a random file (seeded for reproducibility). One possible result::
+
+ split_000000.fasta:       split_000001.fasta:
+   >🍎                       >🍊
+   ATCG                      GCTA
+   >🍇                       >🍋
+   CCGG                      TTAA
+
+.. class:: warningmark
+
+Random mode does not guarantee that every output file will be non-empty, so downstream tools should be able to handle empty datasets gracefully.
+
+-------
+
+**Chunk mode** — fixed number of records per file. With **chunk size** = 1::
+
+ split_000000.fasta:  split_000001.fasta:  split_000002.fasta:  split_000003.fasta:
+   >🍎                 >🍊                 >🍋                 >🍇
+   ATCG                GCTA                TTAA                CCGG
+
+-------
+
+**Split tabular by column value**
+
+A tabular file with a "group" column::
+
+ gene    group   score
+ geneA   wnt     0.9
+ geneB   notch   0.7
+ geneC   wnt     0.8
+ geneD   notch   0.6
+
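+Splitting by column 2, with the number of header lines set to 1 so that the header row is copied into every output file, produces one file per unique value::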
+
+ wnt.tabular:            notch.tabular:
+   gene   group  score     gene   group  score
+   geneA  wnt    0.9       geneB  notch  0.7
+   geneC  wnt    0.8       geneD  notch  0.6
+
+-------
+
+**Split tabular by column with regex substitution**
+
+Column values can be transformed before grouping using a regex match/replace pair. For example, if column 1 contains filenames like ``sample1.mgf``, ``sample2.mgf``, you can strip the extension::
+
+ Match regex:   (.*)\.mgf
+ Replace with:  \1
+
+This groups rows by the part before ``.mgf`` and names the output files accordingly (``sample1.tabular``, ``sample2.tabular`` instead of ``sample1.mgf.tabular``, ``sample2.mgf.tabular``).
 
     ]]></help>
     <citations>
diff --git a/tools/text_processing/split_file_to_collection/static/images/split_file.png b/tools/text_processing/split_file_to_collection/static/images/split_file.png
diff --git a/tools/text_processing/split_file_to_collection/static/images/split_file.svg b/tools/text_processing/split_file_to_collection/static/images/split_file.svg
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 620 280" width="620" height="280"><!-- Help diagram (620x280): a six-record input dataset on the left is split into a collection of three files on the right -->
+  <defs><!-- Reusable definitions, referenced below through url(#shadow) and url(#arrowhead) -->
+    <filter id="shadow" x="-4%" y="-4%" width="110%" height="110%"><!-- Soft shadow for the two panels; the filter region is slightly larger than the shape so the blur is not clipped -->
+      <feGaussianBlur in="SourceAlpha" stdDeviation="2" result="blur"/><!-- Blur the shape's alpha channel only -->
+      <feComponentTransfer in="blur"><feFuncA type="linear" slope="0.3"/></feComponentTransfer><!-- Scale the blurred alpha to 30% so the shadow stays faint -->
+      <feMerge><feMergeNode/><feMergeNode in="SourceGraphic"/></feMerge><!-- First node has no "in", so it takes the previous primitive's result (the faded blur); the original graphic is painted on top. No offset is applied, so the shadow is centred on the shape -->
+    </filter>
+    <marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto"><!-- Triangular arrow tip attached to the "Split" line via marker-end; orient="auto" aligns it with the line direction -->
+      <polygon points="0 0, 10 3.5, 0 7" fill="#555"/>
+    </marker>
+  </defs>
+  <!-- Left panel: the single input dataset with six records, each tagged with a fruit emoji so it can be traced into the output; thin dashed lines are drawn after records 2 and 4 -->
+  <rect x="10" y="40" width="170" height="180" rx="8" ry="8" fill="#fff" stroke="#aaa" stroke-width="1" filter="url(#shadow)"/>
+  <text x="95" y="58" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="bold" fill="#333">Input: A dataset</text>
+  <text x="25" y="80" font-family="monospace" font-size="9" fill="#555">record 1  🍎</text>
+  <text x="25" y="96" font-family="monospace" font-size="9" fill="#555">record 2  🍊</text>
+  <line x1="20" y1="102" x2="170" y2="102" stroke="#ddd" stroke-width="0.5" stroke-dasharray="4,3"/>
+  <text x="25" y="118" font-family="monospace" font-size="9" fill="#555">record 3  🍋</text>
+  <text x="25" y="134" font-family="monospace" font-size="9" fill="#555">record 4  🍇</text>
+  <line x1="20" y1="140" x2="170" y2="140" stroke="#ddd" stroke-width="0.5" stroke-dasharray="4,3"/>
+  <text x="25" y="156" font-family="monospace" font-size="9" fill="#555">record 5  🍓</text>
+  <text x="25" y="172" font-family="monospace" font-size="9" fill="#555">record 6  🫐</text>
+  <text x="95" y="208" text-anchor="middle" font-family="sans-serif" font-size="9" fill="#888">6 records</text>
+  <!-- Arrow from the input panel to the output panel, labelled "Split" above the line and "into 3 files" below it -->
+  <line x1="195" y1="130" x2="305" y2="130" stroke="#555" stroke-width="2" marker-end="url(#arrowhead)"/>
+  <text x="250" y="118" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="bold" fill="#555">Split</text>
+  <text x="250" y="148" text-anchor="middle" font-family="sans-serif" font-size="9" fill="#888">into 3 files</text>
+  <!-- Right panel: green box standing for the output collection; each white card inside it is one output file -->
+  <rect x="320" y="10" width="290" height="240" rx="8" ry="8" fill="#d9ead3" stroke="#888" stroke-width="1" filter="url(#shadow)"/>
+  <text x="465" y="28" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="bold" fill="#333">Output: A collection</text>
+  <!-- Output card split_000000: holds records 1 and 4 -->
+  <rect x="332" y="38" width="266" height="56" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
+  <text x="465" y="54" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">split_000000</text>
+  <text x="345" y="70" font-family="monospace" font-size="9" fill="#555">record 1  🍎</text>
+  <text x="345" y="84" font-family="monospace" font-size="9" fill="#555">record 4  🍇</text>
+  <!-- Output card split_000001: holds records 2 and 5 -->
+  <rect x="332" y="102" width="266" height="56" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
+  <text x="465" y="118" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">split_000001</text>
+  <text x="345" y="134" font-family="monospace" font-size="9" fill="#555">record 2  🍊</text>
+  <text x="345" y="148" font-family="monospace" font-size="9" fill="#555">record 5  🍓</text>
+  <!-- Output card split_000002: holds records 3 and 6 -->
+  <rect x="332" y="166" width="266" height="56" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
+  <text x="465" y="182" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">split_000002</text>
+  <text x="345" y="198" font-family="monospace" font-size="9" fill="#555">record 3  🍋</text>
+  <text x="345" y="212" font-family="monospace" font-size="9" fill="#555">record 6  🫐</text>
+  <!-- Footer caption, centred at x=310 (the middle of the 620-wide canvas); it states that the cards above show the alternating allocation, i.e. records dealt out to the three files in turn -->
+  <text x="310" y="273" text-anchor="middle" font-family="sans-serif" font-size="10" font-style="italic" fill="#666">records distributed across output files (alternating mode shown)</text>
+</svg>