Skip to content

Commit a9a40b0

Browse files
test(bench): benchmark suite for cluster, MMR, selector, and compress (#69)
Implements issue #24. Adds deterministic benchmarks using fixed-seed random embeddings so results are reproducible across runs: - BenchmarkCluster_{10,50,100,500}Chunks (pkg/contextlab) - BenchmarkMMR_{10,50}Chunks (pkg/contextlab) - BenchmarkSelector_{10,50}Clusters (pkg/contextlab) - BenchmarkCompress_{Short,Long}Text (pkg/compress) Run with: go test -bench=. -benchmem ./pkg/contextlab/... ./pkg/compress/... Co-authored-by: Ona <no-reply@ona.com>
1 parent 50d5a5a commit a9a40b0

2 files changed

Lines changed: 132 additions & 0 deletions

File tree

pkg/compress/bench_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package compress
2+
3+
import (
4+
"context"
5+
"strings"
6+
"testing"
7+
8+
"github.com/Siddhant-K-code/distill/pkg/types"
9+
)
10+
11+
func BenchmarkCompress_ShortText(b *testing.B) {
12+
c := NewExtractiveCompressor()
13+
ctx := context.Background()
14+
chunk := types.Chunk{ID: "bench", Text: "This is a short text for compression benchmarking."}
15+
opts := DefaultOptions()
16+
b.ResetTimer()
17+
for i := 0; i < b.N; i++ {
18+
_, _, _ = c.Compress(ctx, []types.Chunk{chunk}, opts)
19+
}
20+
}
21+
22+
func BenchmarkCompress_LongText(b *testing.B) {
23+
c := NewExtractiveCompressor()
24+
ctx := context.Background()
25+
long := strings.Repeat("This is a longer text with more content for compression benchmarking. ", 50)
26+
chunk := types.Chunk{ID: "bench", Text: long}
27+
opts := DefaultOptions()
28+
b.ResetTimer()
29+
for i := 0; i < b.N; i++ {
30+
_, _, _ = c.Compress(ctx, []types.Chunk{chunk}, opts)
31+
}
32+
}

pkg/contextlab/bench_test.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package contextlab
2+
3+
import (
	"math/rand"
	"strconv"
	"testing"

	"github.com/Siddhant-K-code/distill/pkg/types"
)
9+
10+
// deterministicEmbedding returns a vector of dims pseudo-random float32
// values drawn from a math/rand generator seeded with seed.
// The same (seed, dims) pair always yields the same vector, which keeps the
// benchmark inputs reproducible from run to run.
func deterministicEmbedding(seed int64, dims int) []float32 {
	gen := rand.New(rand.NewSource(seed))
	out := make([]float32, dims)
	for idx := 0; idx < dims; idx++ {
		out[idx] = gen.Float32()
	}
	return out
}
20+
21+
// makeBenchChunks builds n chunks with deterministic embeddings.
22+
func makeBenchChunks(n, dims int) []types.Chunk {
23+
chunks := make([]types.Chunk, n)
24+
for i := range chunks {
25+
chunks[i] = types.Chunk{
26+
ID: string(rune('A'+i%26)) + string(rune('0'+i/26%10)),
27+
Text: "benchmark chunk content for semantic deduplication testing",
28+
Embedding: deterministicEmbedding(int64(i), dims),
29+
}
30+
}
31+
return chunks
32+
}
33+
34+
func BenchmarkCluster_10Chunks(b *testing.B) {
35+
chunks := makeBenchChunks(10, 128)
36+
b.ResetTimer()
37+
for i := 0; i < b.N; i++ {
38+
_ = ClusterByThreshold(chunks, 0.15)
39+
}
40+
}
41+
42+
func BenchmarkCluster_50Chunks(b *testing.B) {
43+
chunks := makeBenchChunks(50, 128)
44+
b.ResetTimer()
45+
for i := 0; i < b.N; i++ {
46+
_ = ClusterByThreshold(chunks, 0.15)
47+
}
48+
}
49+
50+
func BenchmarkCluster_100Chunks(b *testing.B) {
51+
chunks := makeBenchChunks(100, 128)
52+
b.ResetTimer()
53+
for i := 0; i < b.N; i++ {
54+
_ = ClusterByThreshold(chunks, 0.15)
55+
}
56+
}
57+
58+
func BenchmarkCluster_500Chunks(b *testing.B) {
59+
chunks := makeBenchChunks(500, 128)
60+
b.ResetTimer()
61+
for i := 0; i < b.N; i++ {
62+
_ = ClusterByThreshold(chunks, 0.15)
63+
}
64+
}
65+
66+
func BenchmarkMMR_10Chunks(b *testing.B) {
67+
chunks := makeBenchChunks(10, 128)
68+
b.ResetTimer()
69+
for i := 0; i < b.N; i++ {
70+
_ = MMRRerank(chunks, 0.7, 5)
71+
}
72+
}
73+
74+
func BenchmarkMMR_50Chunks(b *testing.B) {
75+
chunks := makeBenchChunks(50, 128)
76+
b.ResetTimer()
77+
for i := 0; i < b.N; i++ {
78+
_ = MMRRerank(chunks, 0.7, 10)
79+
}
80+
}
81+
82+
func BenchmarkSelector_10Clusters(b *testing.B) {
83+
chunks := makeBenchChunks(10, 128)
84+
result := ClusterByThreshold(chunks, 0.15)
85+
sel := NewSelector(DefaultSelectorConfig())
86+
b.ResetTimer()
87+
for i := 0; i < b.N; i++ {
88+
_ = sel.Select(result)
89+
}
90+
}
91+
92+
func BenchmarkSelector_50Clusters(b *testing.B) {
93+
chunks := makeBenchChunks(50, 128)
94+
result := ClusterByThreshold(chunks, 0.15)
95+
sel := NewSelector(DefaultSelectorConfig())
96+
b.ResetTimer()
97+
for i := 0; i < b.N; i++ {
98+
_ = sel.Select(result)
99+
}
100+
}

0 commit comments

Comments
 (0)