Skip to content

Commit ad523f7

Browse files
committed
split2: fix prefix checking when paired-end files are given. #512
1 parent 9462133 commit ad523f7

File tree

3 files changed

+57
-10
lines changed

3 files changed

+57
-10
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
- [SeqKit v2.10.0](https://github.com/shenwei356/seqkit/releases/tag/v2.10.0) - 2025-xx-xx
22
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.10.0/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.10.0)
3+
- `seqkit split2`:
4+
- fix prefix checking when paired-end files are given. [#512](https://github.com/shenwei356/seqkit/issues/512)
35
- `seqkit stat`:
46
- do not compute GC content and N's for protein sequence. [#497](https://github.com/shenwei356/seqkit/issues/497)
57
- `seqkit grep`:

doc/docs/usage.md

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2714,11 +2714,14 @@ Usage:
27142714
27152715
Flags:
27162716
-l, --by-length string split sequences into chunks of >=N bases, supports K/M/G suffix
2717-
--by-length-prefix string file prefix for --by-length
2718-
-p, --by-part int split sequences into N parts
2719-
--by-part-prefix string file prefix for --by-part
2717+
--by-length-prefix string file prefix for --by-length. The placeholder "{read}" is needed for
2718+
paired-end files.
2719+
-p, --by-part int split sequences into N parts with the round robin distribution
2720+
--by-part-prefix string file prefix for --by-part. The placeholder "{read}" is needed for
2721+
paired-end files.
27202722
-s, --by-size int split sequences into multi parts with N sequences
2721-
--by-size-prefix string file prefix for --by-size
2723+
--by-size-prefix string file prefix for --by-size. The placeholder "{read}" is needed for
2724+
paired-end files.
27222725
-e, --extension string set output file extension, e.g., ".gz", ".xz", or ".zst"
27232726
-f, --force overwrite output directory
27242727
-h, --help help for split2
@@ -2801,6 +2804,17 @@ Examples
28012804
[INFO] write 1250 sequences to file: out/reads_2.part_002.fq.gz
28022805
[INFO] write 1250 sequences to file: out/reads_1.part_001.fq.gz
28032806
[INFO] write 1250 sequences to file: out/reads_1.part_002.fq.gz
2807+
2808+
Custom prefix
2809+
2810+
$ seqkit split2 -1 reads_1.fq.gz -2 reads_2.fq.gz -p 2 -O out -f --by-part-prefix "x_r{read}_"
2811+
[INFO] flag -1/--read1 and -2/--read2 given, ignore: -
2812+
[INFO] split seqs from reads_1.fq.gz and reads_2.fq.gz
2813+
[INFO] split into 2 parts
2814+
[INFO] write 1250 sequences to file: out/x_r2_001.fq.gz
2815+
[INFO] write 1250 sequences to file: out/x_r1_001.fq.gz
2816+
[INFO] write 1250 sequences to file: out/x_r2_002.fq.gz
2817+
[INFO] write 1250 sequences to file: out/x_r1_002.fq.gz
28042818

28052819
1. For FASTA files (single-end)
28062820

seqkit/cmd/split2.go

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ import (
2525
"io"
2626
"os"
2727
"path/filepath"
28+
"regexp"
2829
"runtime"
30+
"strconv"
2931
"strings"
3032
"sync"
3133

@@ -113,6 +115,8 @@ If you want to cut a sequence into multiple segments.
113115

114116
extension := getFlagString(cmd, "extension")
115117

118+
reRead := regexp.MustCompile(`\{read\}`)
119+
116120
prefixBySize := getFlagString(cmd, "by-size-prefix")
117121
prefixByPart := getFlagString(cmd, "by-part-prefix")
118122
prefixByLength := getFlagString(cmd, "by-length-prefix")
@@ -134,6 +138,7 @@ If you want to cut a sequence into multiple segments.
134138
}
135139

136140
var source string
141+
var pairedEnd bool
137142
if read1 == "" {
138143
if read2 == "" {
139144
// single end from file or stdin
@@ -165,6 +170,17 @@ If you want to cut a sequence into multiple segments.
165170
}
166171
files = []string{read1, read2}
167172
source = read1 + " and " + read2
173+
174+
pairedEnd = true
175+
if prefixBySizeSet && !reRead.MatchString(prefixBySize) {
176+
checkError(fmt.Errorf(`--by-size-prefix should contain the placeholder "{read}" when paired-end files are given, such as "sample_{read}.fq.gz"`))
177+
}
178+
if prefixByPartSet && !reRead.MatchString(prefixByPart) {
179+
checkError(fmt.Errorf(`--by-part-prefix should contain the placeholder "{read}" when paired-end files are given, such as "sample_{read}.fq.gz"`))
180+
}
181+
if prefixByLengthSet && !reRead.MatchString(prefixByLength) {
182+
checkError(fmt.Errorf(`--by-length-prefix should contain the placeholder "{read}" when paired-end files are given, such as "sample_{read}.fq.gz"`))
183+
}
168184
}
169185
}
170186

@@ -180,7 +196,7 @@ If you want to cut a sequence into multiple segments.
180196
}
181197

182198
var wg sync.WaitGroup
183-
for _, file := range files {
199+
for i, file := range files {
184200
isstdin := isStdin(file)
185201
var fileName, fileExt, fileExt2 string
186202
if isstdin {
@@ -226,7 +242,7 @@ If you want to cut a sequence into multiple segments.
226242
}
227243

228244
wg.Add(1)
229-
go func(file string) {
245+
go func(file string, pairedEnd bool, r int) {
230246
defer wg.Done()
231247

232248
renameFileExt := true
@@ -299,6 +315,9 @@ If you want to cut a sequence into multiple segments.
299315

300316
if prefixBySizeSet {
301317
prefix = prefixBySize
318+
if pairedEnd {
319+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
320+
}
302321
} else {
303322
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
304323
}
@@ -315,6 +334,9 @@ If you want to cut a sequence into multiple segments.
315334
var outfh2 *xopen.Writer
316335
if prefixByLengthSet {
317336
prefix = prefixByLength
337+
if pairedEnd {
338+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
339+
}
318340
} else {
319341
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
320342
}
@@ -342,6 +364,9 @@ If you want to cut a sequence into multiple segments.
342364

343365
if prefixByLengthSet {
344366
prefix = prefixByLength
367+
if pairedEnd {
368+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
369+
}
345370
} else {
346371
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
347372
}
@@ -365,6 +390,9 @@ If you want to cut a sequence into multiple segments.
365390
if outfhPre == nil {
366391
if prefixBySizeSet {
367392
prefix = prefixBySize
393+
if pairedEnd {
394+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
395+
}
368396
} else {
369397
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
370398
}
@@ -390,6 +418,9 @@ If you want to cut a sequence into multiple segments.
390418
var outfh2 *xopen.Writer
391419
if prefixByPartSet {
392420
prefix = prefixByPart
421+
if pairedEnd {
422+
prefix = reRead.ReplaceAllString(prefix, strconv.Itoa(r))
423+
}
393424
} else {
394425
prefix = fmt.Sprintf("%s.part_", filepath.Base(fileName))
395426
}
@@ -437,7 +468,7 @@ If you want to cut a sequence into multiple segments.
437468
}
438469
}
439470

440-
}(file)
471+
}(file, pairedEnd, i+1)
441472
}
442473

443474
wg.Wait()
@@ -455,9 +486,9 @@ func init() {
455486
split2Cmd.Flags().StringP("out-dir", "O", "", "output directory (default value is $infile.split)")
456487
split2Cmd.Flags().BoolP("force", "f", false, "overwrite output directory")
457488

458-
split2Cmd.Flags().StringP("by-size-prefix", "", "", "file prefix for --by-size")
459-
split2Cmd.Flags().StringP("by-part-prefix", "", "", "file prefix for --by-part")
460-
split2Cmd.Flags().StringP("by-length-prefix", "", "", "file prefix for --by-length")
489+
split2Cmd.Flags().StringP("by-size-prefix", "", "", `file prefix for --by-size. The placeholder "{read}" is needed for paired-end files.`)
490+
split2Cmd.Flags().StringP("by-part-prefix", "", "", `file prefix for --by-part. The placeholder "{read}" is needed for paired-end files.`)
491+
split2Cmd.Flags().StringP("by-length-prefix", "", "", `file prefix for --by-length. The placeholder "{read}" is needed for paired-end files.`)
461492

462493
split2Cmd.Flags().StringP("extension", "e", "", `set output file extension, e.g., ".gz", ".xz", or ".zst"`)
463494
}

0 commit comments

Comments
 (0)