split2: new flag -N, --seqid-as-filename

shenwei356 · shenwei356 · commit 2db06f7883eb · 2025-08-28T13:09:53.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+- [SeqKit v2.11.0](https://github.com/shenwei356/seqkit/releases/tag/v2.11.0) - 2025-xx-xx
+[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.11.0/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.11.0)
+    - `seqkit split2`:
+        - add a new flag `-N, --seqid-as-filename` to use the first sequence ID as the file name. 
+        E.g., using `-N -s 1` is equal to `seqkit split --by-id`, but it's much faster and uses less memory.
 - [SeqKit v2.10.1](https://github.com/shenwei356/seqkit/releases/tag/v2.10.1) - 2025-08-19
 [![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.10.1/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.10.1)
     - `seqkit seq`:
diff --git a/doc/docs/usage.md b/doc/docs/usage.md
@@ -2555,6 +2555,8 @@ part size or number of parts.
 If you just want to split by parts or sizes, please use "seqkit split2",
 which can apply to paired- and single-end FASTQ.
 
+If you want to split sequences by ID, please use "seqkit split2 -s 1 -N".
+
 If you want to cut a sequence into multiple segments.
   1. For cutting into even chunks, please use 'kmcp utils split-genomes'
      (https://bioinf.shenwei.me/kmcp/usage/#split-genomes).
@@ -2696,6 +2698,9 @@ The prefix of output files:
   1. For stdin: stdin
   2. Others: same to the input file
   3. Set via the options: --by-length-prefix, --by-part-prefix, or --by-size-prefix
+  4. Use the ID of the first sequence in each subset.
+     E.g., 'seqkit split2 --by-size 1 --seqid-as-filename' is equal to
+     'seqkit split --by-id', but it's much faster and uses less memory.
 
 The extension of output files:
   1. For stdin: .fast[aq]
@@ -2715,7 +2720,7 @@ If you want to cut a sequence into multiple segments.
         seqkit sliding -g -s 40 -W 40 input.fasta -o out.fasta
 
 Usage:
-  seqkit split2 [flags]
+  seqkit split2 [flags] 
 
 Flags:
   -l, --by-length string          split sequences into chunks of >=N bases, supports K/M/G suffix
@@ -2733,6 +2738,8 @@ Flags:
   -O, --out-dir string            output directory (default value is $infile.split)
   -1, --read1 string              (gzipped) read1 file
   -2, --read2 string              (gzipped) read2 file
+  -N, --seqid-as-filename         use the first sequence ID as the file name. E.g., using '-N -s 1' is
+                                  equal to 'seqkit split --by-id' but much faster and uses less memory.
 ```
 
 Examples
@@ -2835,6 +2842,17 @@ Examples
         [INFO] split into 2 parts
         [INFO] write 1250 sequences to file: out/reads_1.part_001.fq.gz
         [INFO] write 1250 sequences to file: out/reads_1.part_002.fq.gz
+        
+1. Splitting sequences into separate files, with the sequence ID as the file name.
+
+        $ seqkit head -n 5 ../tests/hairpin.fa | seqkit split2 -s 1 -N
+        [INFO] split seqs from stdin
+        [INFO] split into 1 seqs per file
+        [INFO] write 1 sequences to file: stdin.split/cel-let-7.fasta
+        [INFO] write 1 sequences to file: stdin.split/cel-lin-4.fasta
+        [INFO] write 1 sequences to file: stdin.split/cel-mir-1.fasta
+        [INFO] write 1 sequences to file: stdin.split/cel-mir-2.fasta
+        [INFO] write 1 sequences to file: stdin.split/cel-mir-34.fasta
 
 ## pair
 
diff --git a/seqkit/cmd/split.go b/seqkit/cmd/split.go
@@ -51,6 +51,8 @@ part size or number of parts.
 If you just want to split by parts or sizes, please use "seqkit split2",
 which can apply to paired- and single-end FASTQ.
 
+If you want to split sequences by ID, please use "seqkit split2 -s 1 -N".
+
 If you want to cut a sequence into multiple segments.
   1. For cutting into even chunks, please use 'kmcp utils split-genomes'
      (https://bioinf.shenwei.me/kmcp/usage/#split-genomes).
diff --git a/seqkit/cmd/split2.go b/seqkit/cmd/split2.go
@@ -54,6 +54,9 @@ The prefix of output files:
   1. For stdin: stdin
   2. Others: same to the input file
   3. Set via the options: --by-length-prefix, --by-part-prefix, or --by-size-prefix
+  4. Use the ID of the first sequence in each subset.
+     E.g, 'seqkit split2 --by-size 1 --seqid-as-filename' is equal to
+     'seqkit split --by-id', but it's much faster and uses less memory.
 
 The extension of output files:
   1. For stdin: .fast[aq]
@@ -125,6 +128,8 @@ If you want to cut a sequence into multiple segments.
 		prefixByPartSet := cmd.Flags().Lookup("by-part-prefix").Changed
 		prefixByLengthSet := cmd.Flags().Lookup("by-length-prefix").Changed
 
+		seqIDAsFileName := getFlagBool(cmd, "seqid-as-filename")
+
 		if size == 0 && parts == 0 && length == 0 {
 			checkError(fmt.Errorf(`one of flags should be given: -s/-p/-l. type "seqkit split2 -h" for help`))
 		}
@@ -184,6 +189,10 @@ If you want to cut a sequence into multiple segments.
 			}
 		}
 
+		if pairedEnd && seqIDAsFileName {
+			checkError(fmt.Errorf("the flag -N/--seqid-as-filename is not applicable for paired-end reads"))
+		}
+
 		if !quiet {
 			log.Infof("split seqs from %s", source)
 			if bySize {
@@ -313,15 +322,19 @@ If you want to cut a sequence into multiple segments.
 
 							i++
 
-							if prefixBySizeSet {
-								prefix = prefixBySize
-								if pairedEnd {
-									prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+							if !seqIDAsFileName {
+								if prefixBySizeSet {
+									prefix = prefixBySize
+									if pairedEnd {
+										prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+									}
+								} else {
+									prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
 								}
+								outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							} else {
-								prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
+								outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
 							}
-							outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							outfhPre, err = xopen.Wopen(outfilePre)
 							checkError(err)
 
@@ -332,15 +345,20 @@ If you want to cut a sequence into multiple segments.
 
 						if outfhPre == nil { // first record
 							var outfh2 *xopen.Writer
-							if prefixByLengthSet {
-								prefix = prefixByLength
-								if pairedEnd {
-									prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+							var outfile string
+							if !seqIDAsFileName {
+								if prefixByLengthSet {
+									prefix = prefixByLength
+									if pairedEnd {
+										prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+									}
+								} else {
+									prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
 								}
+								outfile = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							} else {
-								prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
+								outfile = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
 							}
-							outfile := filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							outfh2, err = xopen.Wopen(outfile)
 							checkError(err)
 
@@ -361,16 +379,20 @@ If you want to cut a sequence into multiple segments.
 							i++
 
 							var outfh2 *xopen.Writer
-
-							if prefixByLengthSet {
-								prefix = prefixByLength
-								if pairedEnd {
-									prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+							var outfile string
+							if !seqIDAsFileName {
+								if prefixByLengthSet {
+									prefix = prefixByLength
+									if pairedEnd {
+										prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+									}
+								} else {
+									prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
 								}
+								outfile = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							} else {
-								prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
+								outfile = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
 							}
-							outfile := filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							outfh2, err = xopen.Wopen(outfile)
 							checkError(err)
 
@@ -388,15 +410,19 @@ If you want to cut a sequence into multiple segments.
 					if bySize {
 						// first record, for bySize
 						if outfhPre == nil {
-							if prefixBySizeSet {
-								prefix = prefixBySize
-								if pairedEnd {
-									prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+							if !seqIDAsFileName {
+								if prefixBySizeSet {
+									prefix = prefixBySize
+									if pairedEnd {
+										prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+									}
+								} else {
+									prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
 								}
+								outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							} else {
-								prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
+								outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
 							}
-							outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							outfhPre, err = xopen.Wopen(outfilePre)
 							checkError(err)
 
@@ -416,15 +442,20 @@ If you want to cut a sequence into multiple segments.
 						// first record, for byParts
 						if i+1 > len(outfhs) {
 							var outfh2 *xopen.Writer
-							if prefixByPartSet {
-								prefix = prefixByPart
-								if pairedEnd {
-									prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+							var outfile string
+							if !seqIDAsFileName {
+								if prefixByPartSet {
+									prefix = prefixByPart
+									if pairedEnd {
+										prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
+									}
+								} else {
+									prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
 								}
+								outfile = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							} else {
-								prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
+								outfile = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
 							}
-							outfile := filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
 							outfh2, err = xopen.Wopen(outfile)
 							checkError(err)
 
@@ -490,5 +521,7 @@ func init() {
 	split2Cmd.Flags().StringP("by-part-prefix", "", "", `file prefix for --by-part. The placeholder "{read}" is needed for paired-end files.`)
 	split2Cmd.Flags().StringP("by-length-prefix", "", "", `file prefix for --by-length. The placeholder "{read}" is needed for paired-end files.`)
 
+	split2Cmd.Flags().BoolP("seqid-as-filename", "N", false, "use the first sequence ID as the file name. E.g., using '-N -s 1' is equal to 'seqkit split --by-id' but much faster and uses less memory.")
+
 	split2Cmd.Flags().StringP("extension", "e", "", `set output file extension, e.g., ".gz", ".xz", or ".zst"`)
 }
diff --git a/seqkit/cmd/version.go b/seqkit/cmd/version.go
@@ -29,7 +29,7 @@ import (
 )
 
 // VERSION of seqkit
-const VERSION = "2.10.1"
+const VERSION = "2.11.0"
 
 // versionCmd represents the version command
 var versionCmd = &cobra.Command{

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ import (`
`29`	`29`	`)`
`30`	`30`
`31`	`31`	`// VERSION of seqkit`
`32`		`-const VERSION = "2.10.1"`
	`32`	`+const VERSION = "2.11.0"`
`33`	`33`
`34`	`34`	`// versionCmd represents the version command`
`35`	`35`	`var versionCmd = &cobra.Command{`