Skip to content

Commit 2db06f7

Browse files
committed
split2: new flag -N, --seqid-as-filename
1 parent 74705d9 commit 2db06f7

File tree

5 files changed

+91
-33
lines changed

5 files changed

+91
-33
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
- [SeqKit v2.11.0](https://github.com/shenwei356/seqkit/releases/tag/v2.11.0) - 2025-xx-xx
2+
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.11.0/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.11.0)
3+
- `seqkit split2`:
4+
- add a new flag `-N, --seqid-as-filename` to use the first sequence ID as the file name.
5+
E.g., using `-N -s 1` is equal to `seqkit split --by-id`, but it's much faster and uses less memory.
16
- [SeqKit v2.10.1](https://github.com/shenwei356/seqkit/releases/tag/v2.10.1) - 2025-08-19
27
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.10.1/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.10.1)
38
- `seqkit seq`:

doc/docs/usage.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2555,6 +2555,8 @@ part size or number of parts.
25552555
If you just want to split by parts or sizes, please use "seqkit split2",
25562556
which can apply to paired- and single-end FASTQ.
25572557
2558+
If you want to split sequences by ID, please use "seqkit split2 -s 1 -N".
2559+
25582560
If you want to cut a sequence into multiple segments.
25592561
1. For cutting into even chunks, please use 'kmcp utils split-genomes'
25602562
(https://bioinf.shenwei.me/kmcp/usage/#split-genomes).
@@ -2696,6 +2698,9 @@ The prefix of output files:
26962698
1. For stdin: stdin
26972699
2. Others: same as the input file
26982700
3. Set via the options: --by-length-prefix, --by-part-prefix, or --by-size-prefix
2701+
4. Use the ID of the first sequence in each subset.
2702+
E.g., 'seqkit split2 --by-size 1 --seqid-as-filename' is equal to
2703+
'seqkit split --by-id', but it's much faster and uses less memory.
26992704
27002705
The extension of output files:
27012706
1. For stdin: .fast[aq]
@@ -2715,7 +2720,7 @@ If you want to cut a sequence into multiple segments.
27152720
seqkit sliding -g -s 40 -W 40 input.fasta -o out.fasta
27162721
27172722
Usage:
2718-
seqkit split2 [flags]
2723+
seqkit split2 [flags]
27192724
27202725
Flags:
27212726
-l, --by-length string split sequences into chunks of >=N bases, supports K/M/G suffix
@@ -2733,6 +2738,8 @@ Flags:
27332738
-O, --out-dir string output directory (default value is $infile.split)
27342739
-1, --read1 string (gzipped) read1 file
27352740
-2, --read2 string (gzipped) read2 file
2741+
-N, --seqid-as-filename use the first sequence ID as the file name. E.g., using '-N -s 1' is
2742+
equal to 'seqkit split --by-id' but much faster and uses less memory.
27362743
```
27372744

27382745
Examples
@@ -2835,6 +2842,17 @@ Examples
28352842
[INFO] split into 2 parts
28362843
[INFO] write 1250 sequences to file: out/reads_1.part_001.fq.gz
28372844
[INFO] write 1250 sequences to file: out/reads_1.part_002.fq.gz
2845+
2846+
1. Splitting sequences into separate files, with the sequence ID as the file name.
2847+
2848+
$ seqkit head -n 5 ../tests/hairpin.fa | seqkit split2 -s 1 -N
2849+
[INFO] split seqs from stdin
2850+
[INFO] split into 1 seqs per file
2851+
[INFO] write 1 sequences to file: stdin.split/cel-let-7.fasta
2852+
[INFO] write 1 sequences to file: stdin.split/cel-lin-4.fasta
2853+
[INFO] write 1 sequences to file: stdin.split/cel-mir-1.fasta
2854+
[INFO] write 1 sequences to file: stdin.split/cel-mir-2.fasta
2855+
[INFO] write 1 sequences to file: stdin.split/cel-mir-34.fasta
28382856

28392857
## pair
28402858

seqkit/cmd/split.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ part size or number of parts.
5151
If you just want to split by parts or sizes, please use "seqkit split2",
5252
which can apply to paired- and single-end FASTQ.
5353
54+
If you want to split sequences by ID, please use "seqkit split2 -s 1 -N".
55+
5456
If you want to cut a sequence into multiple segments.
5557
1. For cutting into even chunks, please use 'kmcp utils split-genomes'
5658
(https://bioinf.shenwei.me/kmcp/usage/#split-genomes).

seqkit/cmd/split2.go

Lines changed: 64 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ The prefix of output files:
5454
1. For stdin: stdin
5555
2. Others: same to the input file
5656
3. Set via the options: --by-length-prefix, --by-part-prefix, or --by-size-prefix
57+
4. Use the ID of the first sequence in each subset.
58+
E.g, 'seqkit split2 --by-size 1 --seqid-as-filename' is equal to
59+
'seqkit split --by-id', but it's much faster and uses less memory.
5760
5861
The extension of output files:
5962
1. For stdin: .fast[aq]
@@ -125,6 +128,8 @@ If you want to cut a sequence into multiple segments.
125128
prefixByPartSet := cmd.Flags().Lookup("by-part-prefix").Changed
126129
prefixByLengthSet := cmd.Flags().Lookup("by-length-prefix").Changed
127130

131+
seqIDAsFileName := getFlagBool(cmd, "seqid-as-filename")
132+
128133
if size == 0 && parts == 0 && length == 0 {
129134
checkError(fmt.Errorf(`one of flags should be given: -s/-p/-l. type "seqkit split2 -h" for help`))
130135
}
@@ -184,6 +189,10 @@ If you want to cut a sequence into multiple segments.
184189
}
185190
}
186191

192+
if pairedEnd && seqIDAsFileName {
193+
checkError(fmt.Errorf("the flag -N/--seqid-as-filename is not applicable for paired-end reads"))
194+
}
195+
187196
if !quiet {
188197
log.Infof("split seqs from %s", source)
189198
if bySize {
@@ -313,15 +322,19 @@ If you want to cut a sequence into multiple segments.
313322

314323
i++
315324

316-
if prefixBySizeSet {
317-
prefix = prefixBySize
318-
if pairedEnd {
319-
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
325+
if !seqIDAsFileName {
326+
if prefixBySizeSet {
327+
prefix = prefixBySize
328+
if pairedEnd {
329+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
330+
}
331+
} else {
332+
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
320333
}
334+
outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
321335
} else {
322-
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
336+
outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
323337
}
324-
outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
325338
outfhPre, err = xopen.Wopen(outfilePre)
326339
checkError(err)
327340

@@ -332,15 +345,20 @@ If you want to cut a sequence into multiple segments.
332345

333346
if outfhPre == nil { // first record
334347
var outfh2 *xopen.Writer
335-
if prefixByLengthSet {
336-
prefix = prefixByLength
337-
if pairedEnd {
338-
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
348+
var outfile string
349+
if !seqIDAsFileName {
350+
if prefixByLengthSet {
351+
prefix = prefixByLength
352+
if pairedEnd {
353+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
354+
}
355+
} else {
356+
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
339357
}
358+
outfile = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
340359
} else {
341-
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
360+
outfile = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
342361
}
343-
outfile := filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
344362
outfh2, err = xopen.Wopen(outfile)
345363
checkError(err)
346364

@@ -361,16 +379,20 @@ If you want to cut a sequence into multiple segments.
361379
i++
362380

363381
var outfh2 *xopen.Writer
364-
365-
if prefixByLengthSet {
366-
prefix = prefixByLength
367-
if pairedEnd {
368-
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
382+
var outfile string
383+
if !seqIDAsFileName {
384+
if prefixByLengthSet {
385+
prefix = prefixByLength
386+
if pairedEnd {
387+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
388+
}
389+
} else {
390+
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
369391
}
392+
outfile = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
370393
} else {
371-
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
394+
outfile = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
372395
}
373-
outfile := filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
374396
outfh2, err = xopen.Wopen(outfile)
375397
checkError(err)
376398

@@ -388,15 +410,19 @@ If you want to cut a sequence into multiple segments.
388410
if bySize {
389411
// first record, for bySize
390412
if outfhPre == nil {
391-
if prefixBySizeSet {
392-
prefix = prefixBySize
393-
if pairedEnd {
394-
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
413+
if !seqIDAsFileName {
414+
if prefixBySizeSet {
415+
prefix = prefixBySize
416+
if pairedEnd {
417+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
418+
}
419+
} else {
420+
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
395421
}
422+
outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
396423
} else {
397-
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
424+
outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
398425
}
399-
outfilePre = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
400426
outfhPre, err = xopen.Wopen(outfilePre)
401427
checkError(err)
402428

@@ -416,15 +442,20 @@ If you want to cut a sequence into multiple segments.
416442
// first record, for byParts
417443
if i+1 > len(outfhs) {
418444
var outfh2 *xopen.Writer
419-
if prefixByPartSet {
420-
prefix = prefixByPart
421-
if pairedEnd {
422-
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
445+
var outfile string
446+
if !seqIDAsFileName {
447+
if prefixByPartSet {
448+
prefix = prefixByPart
449+
if pairedEnd {
450+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
451+
}
452+
} else {
453+
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
423454
}
455+
outfile = filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
424456
} else {
425-
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
457+
outfile = filepath.Join(outdir, fmt.Sprintf("%s%s", pathutil.RemoveInvalidPathChars(string(record.ID), "__"), fileExt))
426458
}
427-
outfile := filepath.Join(outdir, fmt.Sprintf("%s%03d%s", prefix, i+1, fileExt))
428459
outfh2, err = xopen.Wopen(outfile)
429460
checkError(err)
430461

@@ -490,5 +521,7 @@ func init() {
490521
split2Cmd.Flags().StringP("by-part-prefix", "", "", `file prefix for --by-part. The placeholder "{read}" is needed for paired-end files.`)
491522
split2Cmd.Flags().StringP("by-length-prefix", "", "", `file prefix for --by-length. The placeholder "{read}" is needed for paired-end files.`)
492523

524+
split2Cmd.Flags().BoolP("seqid-as-filename", "N", false, "use the first sequence ID as the file name. E.g., using '-N -s 1' is equal to 'seqkit split --by-id' but much faster and uses less memory.")
525+
493526
split2Cmd.Flags().StringP("extension", "e", "", `set output file extension, e.g., ".gz", ".xz", or ".zst"`)
494527
}

seqkit/cmd/version.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import (
2929
)
3030

3131
// VERSION of seqkit
32-
const VERSION = "2.10.1"
32+
const VERSION = "2.11.0"
3333

3434
// versionCmd represents the version command
3535
var versionCmd = &cobra.Command{

0 commit comments

Comments
 (0)