@@ -54,6 +54,9 @@ The prefix of output files:
5454 1. For stdin: stdin
5555 2. Others: same as the input file
5656 3. Set via the options: --by-length-prefix, --by-part-prefix, or --by-size-prefix
57+ 4. Use the ID of the first sequence in each subset.
58+ E.g., 'seqkit split2 --by-size 1 --seqid-as-filename' is equivalent to
59+ 'seqkit split --by-id', but it's much faster and uses less memory.
5760
5861The extension of output files:
5962 1. For stdin: .fast[aq]
@@ -125,6 +128,8 @@ If you want to cut a sequence into multiple segments.
125128 prefixByPartSet := cmd .Flags ().Lookup ("by-part-prefix" ).Changed
126129 prefixByLengthSet := cmd .Flags ().Lookup ("by-length-prefix" ).Changed
127130
131+ seqIDAsFileName := getFlagBool (cmd , "seqid-as-filename" )
132+
128133 if size == 0 && parts == 0 && length == 0 {
129134 checkError (fmt .Errorf (`one of flags should be given: -s/-p/-l. type "seqkit split2 -h" for help` ))
130135 }
@@ -184,6 +189,10 @@ If you want to cut a sequence into multiple segments.
184189 }
185190 }
186191
192+ if pairedEnd && seqIDAsFileName {
193+ checkError (fmt .Errorf ("the flag -N/--seqid-as-filename is not applicable for paired-end reads" ))
194+ }
195+
187196 if ! quiet {
188197 log .Infof ("split seqs from %s" , source )
189198 if bySize {
@@ -313,15 +322,19 @@ If you want to cut a sequence into multiple segments.
313322
314323 i ++
315324
316- if prefixBySizeSet {
317- prefix = prefixBySize
318- if pairedEnd {
319- prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
325+ if ! seqIDAsFileName {
326+ if prefixBySizeSet {
327+ prefix = prefixBySize
328+ if pairedEnd {
329+ prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
330+ }
331+ } else {
332+ prefix = fmt .Sprintf ("%s.part_" , filepath .Base (fileName ))
320333 }
334+ outfilePre = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
321335 } else {
322- prefix = fmt .Sprintf ("%s.part_ " , filepath . Base ( fileName ))
336+ outfilePre = filepath . Join ( outdir , fmt .Sprintf ("%s%s " , pathutil . RemoveInvalidPathChars ( string ( record . ID ), "__" ), fileExt ))
323337 }
324- outfilePre = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
325338 outfhPre , err = xopen .Wopen (outfilePre )
326339 checkError (err )
327340
@@ -332,15 +345,20 @@ If you want to cut a sequence into multiple segments.
332345
333346 if outfhPre == nil { // first record
334347 var outfh2 * xopen.Writer
335- if prefixByLengthSet {
336- prefix = prefixByLength
337- if pairedEnd {
338- prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
348+ var outfile string
349+ if ! seqIDAsFileName {
350+ if prefixByLengthSet {
351+ prefix = prefixByLength
352+ if pairedEnd {
353+ prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
354+ }
355+ } else {
356+ prefix = fmt .Sprintf ("%s.part_" , filepath .Base (fileName ))
339357 }
358+ outfile = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
340359 } else {
341- prefix = fmt .Sprintf ("%s.part_ " , filepath . Base ( fileName ))
360+ outfile = filepath . Join ( outdir , fmt .Sprintf ("%s%s " , pathutil . RemoveInvalidPathChars ( string ( record . ID ), "__" ), fileExt ))
342361 }
343- outfile := filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
344362 outfh2 , err = xopen .Wopen (outfile )
345363 checkError (err )
346364
@@ -361,16 +379,20 @@ If you want to cut a sequence into multiple segments.
361379 i ++
362380
363381 var outfh2 * xopen.Writer
364-
365- if prefixByLengthSet {
366- prefix = prefixByLength
367- if pairedEnd {
368- prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
382+ var outfile string
383+ if ! seqIDAsFileName {
384+ if prefixByLengthSet {
385+ prefix = prefixByLength
386+ if pairedEnd {
387+ prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
388+ }
389+ } else {
390+ prefix = fmt .Sprintf ("%s.part_" , filepath .Base (fileName ))
369391 }
392+ outfile = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
370393 } else {
371- prefix = fmt .Sprintf ("%s.part_ " , filepath . Base ( fileName ))
394+ outfile = filepath . Join ( outdir , fmt .Sprintf ("%s%s " , pathutil . RemoveInvalidPathChars ( string ( record . ID ), "__" ), fileExt ))
372395 }
373- outfile := filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
374396 outfh2 , err = xopen .Wopen (outfile )
375397 checkError (err )
376398
@@ -388,15 +410,19 @@ If you want to cut a sequence into multiple segments.
388410 if bySize {
389411 // first record, for bySize
390412 if outfhPre == nil {
391- if prefixBySizeSet {
392- prefix = prefixBySize
393- if pairedEnd {
394- prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
413+ if ! seqIDAsFileName {
414+ if prefixBySizeSet {
415+ prefix = prefixBySize
416+ if pairedEnd {
417+ prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
418+ }
419+ } else {
420+ prefix = fmt .Sprintf ("%s.part_" , filepath .Base (fileName ))
395421 }
422+ outfilePre = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
396423 } else {
397- prefix = fmt .Sprintf ("%s.part_ " , filepath . Base ( fileName ))
424+ outfilePre = filepath . Join ( outdir , fmt .Sprintf ("%s%s " , pathutil . RemoveInvalidPathChars ( string ( record . ID ), "__" ), fileExt ))
398425 }
399- outfilePre = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
400426 outfhPre , err = xopen .Wopen (outfilePre )
401427 checkError (err )
402428
@@ -416,15 +442,20 @@ If you want to cut a sequence into multiple segments.
416442 // first record, for byParts
417443 if i + 1 > len (outfhs ) {
418444 var outfh2 * xopen.Writer
419- if prefixByPartSet {
420- prefix = prefixByPart
421- if pairedEnd {
422- prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
445+ var outfile string
446+ if ! seqIDAsFileName {
447+ if prefixByPartSet { // byParts branch: honor --by-part-prefix (not --by-length-prefix), as the replaced code did
448+ prefix = prefixByPart
449+ if pairedEnd {
450+ prefix = reRead .ReplaceAllString (prefix , strconv .Itoa (r ))
451+ }
452+ } else {
453+ prefix = fmt .Sprintf ("%s.part_" , filepath .Base (fileName ))
423454 }
455+ outfile = filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
424456 } else {
425- prefix = fmt .Sprintf ("%s.part_ " , filepath . Base ( fileName ))
457+ outfile = filepath . Join ( outdir , fmt .Sprintf ("%s%s " , pathutil . RemoveInvalidPathChars ( string ( record . ID ), "__" ), fileExt ))
426458 }
427- outfile := filepath .Join (outdir , fmt .Sprintf ("%s%03d%s" , prefix , i + 1 , fileExt ))
428459 outfh2 , err = xopen .Wopen (outfile )
429460 checkError (err )
430461
@@ -490,5 +521,7 @@ func init() {
490521 split2Cmd .Flags ().StringP ("by-part-prefix" , "" , "" , `file prefix for --by-part. The placeholder "{read}" is needed for paired-end files.` )
491522 split2Cmd .Flags ().StringP ("by-length-prefix" , "" , "" , `file prefix for --by-length. The placeholder "{read}" is needed for paired-end files.` )
492523
524+ split2Cmd .Flags ().BoolP ("seqid-as-filename" , "N" , false , "use the first sequence ID as the file name. E.g., using '-N -s 1' is equal to 'seqkit split --by-id' but much faster and uses less memory." )
525+
493526 split2Cmd .Flags ().StringP ("extension" , "e" , "" , `set output file extension, e.g., ".gz", ".xz", or ".zst"` )
494527}
0 commit comments