Skip to content

Commit 65bf54f

Browse files
committed
feat: implement chunking for FASTA records and enhance CLI options
1 parent 3815cc1 commit 65bf54f

File tree

3 files changed

+109
-48
lines changed

3 files changed

+109
-48
lines changed

cmd/radigest/main.go

Lines changed: 109 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1,35 +1,105 @@
11
package main
22

33
import (
4+
"encoding/json"
45
"flag"
56
"fmt"
67
"log"
8+
"os"
79
"runtime"
10+
"sort"
811
"strings"
912
"sync"
10-
"encoding/json"
11-
"os"
1213

1314
"radigest/internal/collector"
1415
"radigest/internal/digest"
1516
"radigest/internal/enzyme"
1617
"radigest/internal/fasta"
1718
)
1819

20+
// Build metadata printed by the -version flag. The values below are
// placeholders; presumably they are overridden at link time (e.g. with
// -ldflags "-X main.version=...") — confirm against the build scripts.
var (
21+
version = "dev"
22+
commit = "none"
23+
date = "unknown"
24+
)
25+
26+
func produceChunks(faCh <-chan fasta.Record, jobs chan<- fasta.Record, chunkSz int) {
27+
defer close(jobs)
28+
for rec := range faCh {
29+
seq := rec.Seq
30+
n := len(seq)
31+
for from := 0; from < n; from += chunkSz {
32+
to := from + chunkSz
33+
if to > n {
34+
to = n
35+
}
36+
jobs <- fasta.Record{
37+
ID: rec.ID,
38+
Seq: seq[from:to],
39+
}
40+
}
41+
}
42+
}
43+
1944
func main() {
2045
// ---- CLI flags ----------------------------------------------------------
2146
fastaPath := flag.String("fasta", "", "reference FASTA file (required)")
22-
enzFlag := flag.String("enzymes", "", "comma-separated enzyme names (≥2, first two form the AB pair)")
23-
minLen := flag.Int("min", 0, "minimum fragment length")
24-
maxLen := flag.Int("max", 1<<30, "maximum fragment length")
25-
gffPath := flag.String("gff", "fragments.gff3", "output GFF3")
26-
jsonPath := flag.String("json", "", "write run summary JSON")
27-
chunkSz := flag.Int("chunk", 8<<20, "chunk size (bp) sent to each worker")
28-
threads := flag.Int("threads", runtime.NumCPU(), "worker goroutines")
47+
enzFlag := flag.String("enzymes", "", "comma-separated enzyme names (≥2, first two form the AB pair)")
48+
minLen := flag.Int("min", 0, "minimum fragment length (bp)")
49+
maxLen := flag.Int("max", 1<<30, "maximum fragment length (bp)")
50+
gffPath := flag.String("gff", "fragments.gff3", "output GFF3 file")
51+
jsonPath := flag.String("json", "", "optional: write run summary JSON here")
52+
chunkSz := flag.Int("chunk", 8<<20, "chunk size (bp) sent to each worker")
53+
threads := flag.Int("threads", runtime.NumCPU(), "number of worker goroutines")
54+
showVer := flag.Bool("version", false, "print version and exit")
55+
listEns := flag.Bool("list-enzymes", false, "list available enzyme names and exit")
56+
57+
flag.Usage = func() {
58+
b := &strings.Builder{}
59+
fmt.Fprintln(b, "radigest — in-silico double-digest and GFF3 fragment export")
60+
fmt.Fprintln(b)
61+
fmt.Fprintln(b, "Usage:")
62+
fmt.Fprintln(b, " radigest -fasta <ref.fa> -enzymes <E1,E2[,E3...]> [options]")
63+
fmt.Fprintln(b)
64+
fmt.Fprintln(b, "Required flags:")
65+
fmt.Fprintln(b, " -fasta, -enzymes")
66+
fmt.Fprintln(b)
67+
fmt.Fprintln(b, "Options:")
68+
flag.CommandLine.SetOutput(b)
69+
flag.PrintDefaults()
70+
flag.CommandLine.SetOutput(os.Stderr)
71+
fmt.Fprintln(b)
72+
fmt.Fprintln(b, "Examples:")
73+
fmt.Fprintln(b, " # Basic EcoRI/MseI digest to GFF3")
74+
fmt.Fprintln(b, " radigest -fasta ref.fa -enzymes EcoRI,MseI -gff out.gff3")
75+
fmt.Fprintln(b, " # Restrict fragment size and emit JSON summary")
76+
fmt.Fprintln(b, " radigest -fasta ref.fa -enzymes EcoRI,MseI -min 100 -max 800 -json run.json")
77+
fmt.Fprintln(b, " # See supported enzymes")
78+
fmt.Fprintln(b, " radigest -list-enzymes")
79+
fmt.Fprintln(os.Stderr, b.String())
80+
}
81+
2982
flag.Parse()
3083

84+
if *showVer {
85+
fmt.Printf("radigest %s (commit %s, %s)\n", version, commit, date)
86+
return
87+
}
88+
if *listEns {
89+
names := make([]string, 0, len(enzyme.DB))
90+
for name := range enzyme.DB {
91+
names = append(names, name)
92+
}
93+
sort.Strings(names)
94+
for _, n := range names {
95+
fmt.Println(n)
96+
}
97+
return
98+
}
3199
if *fastaPath == "" || *enzFlag == "" {
32-
log.Fatal("flags --fasta and --enzymes are required")
100+
fmt.Fprintln(os.Stderr, "error: flags -fasta and -enzymes are required\n")
101+
flag.Usage()
102+
os.Exit(2)
33103
}
34104

35105
// ---- build enzyme slice -------------------------------------------------
@@ -68,53 +138,44 @@ func main() {
68138
}
69139

70140
// ---- stream FASTA into jobs --------------------------------------------
71-
faCh := make(chan fasta.Record, 2)
141+
faCh := make(chan fasta.Record)
72142
go func() {
73143
if err := fasta.Stream(*fastaPath, faCh); err != nil {
74144
log.Fatalf("fasta stream: %v", err)
75145
}
146+
// NOTE: assume fasta.Stream closes faCh when it returns.
76147
}()
77148

78-
for rec := range faCh {
79-
// split sequence into windows of *chunkSz bases
80-
for from := 0; from < len(rec.Seq); from += *chunkSz {
81-
to := from + *chunkSz
82-
if to > len(rec.Seq) { to = len(rec.Seq) }
83-
jobs <- fasta.Record{
84-
ID: rec.ID,
85-
Seq: rec.Seq[from:to],
86-
}
87-
}
88-
}
89-
90-
for rec := range faCh {
91-
jobs <- rec
92-
}
93-
close(jobs) // no more work
94-
wg.Wait() // workers done
95-
close(cIn) // tell collector to finish
149+
// single consumer / producer path
150+
go produceChunks(faCh, jobs, *chunkSz)
151+
152+
// wait for workers, finish collector
153+
wg.Wait() // jobs closed by produceChunks
154+
close(cIn) // tell collector to finish
96155

97156
// ---- summary ------------------------------------------------------------
98157
stats := <-done
99158
fmt.Printf("Fragments kept: %d\nBases covered: %d\nChromosomes: %d\n",
100-
stats.TotalFragments, stats.TotalBases, len(stats.PerChr))
159+
stats.TotalFragments, stats.TotalBases, len(stats.PerChr))
101160
if *jsonPath != "" {
102-
out := struct {
103-
Enzymes []string `json:"enzymes"`
104-
MinLength int `json:"min_length"`
105-
MaxLength int `json:"max_length"`
106-
collector.Stats
107-
}{
108-
Enzymes: strings.Split(*enzFlag, ","),
109-
MinLength: *minLen,
110-
MaxLength: *maxLen,
111-
Stats: stats,
112-
}
113-
f, err := os.Create(*jsonPath)
114-
if err != nil { log.Fatalf("write json: %v", err) }
115-
if err := json.NewEncoder(f).Encode(out); err != nil {
116-
log.Fatalf("encode json: %v", err)
117-
}
118-
f.Close()
161+
out := struct {
162+
Enzymes []string `json:"enzymes"`
163+
MinLength int `json:"min_length"`
164+
MaxLength int `json:"max_length"`
165+
collector.Stats
166+
}{
167+
Enzymes: strings.Split(*enzFlag, ","),
168+
MinLength: *minLen,
169+
MaxLength: *maxLen,
170+
Stats: stats,
171+
}
172+
f, err := os.Create(*jsonPath)
173+
if err != nil {
174+
log.Fatalf("write json: %v", err)
175+
}
176+
if err := json.NewEncoder(f).Encode(out); err != nil {
177+
log.Fatalf("encode json: %v", err)
178+
}
179+
_ = f.Close()
119180
}
120-
}
181+
}

main

-2.76 MB
Binary file not shown.

radigest

100644 → 100755
26.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)