diff --git a/pkg/compress/bench_test.go b/pkg/compress/bench_test.go
new file mode 100644
index 0000000..aee3142
--- /dev/null
+++ b/pkg/compress/bench_test.go
@@ -0,0 +1,42 @@
+package compress
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/Siddhant-K-code/distill/pkg/types"
+)
+
+// BenchmarkCompress_ShortText measures Compress on a single one-sentence chunk
+// using DefaultOptions.
+func BenchmarkCompress_ShortText(b *testing.B) {
+	c := NewExtractiveCompressor()
+	ctx := context.Background()
+	chunk := types.Chunk{ID: "bench", Text: "This is a short text for compression benchmarking."}
+	opts := DefaultOptions()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// Abort on failure so an error path is never reported as a timing.
+		if _, _, err := c.Compress(ctx, []types.Chunk{chunk}, opts); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkCompress_LongText measures Compress on a single chunk holding the
+// same sentence repeated 50 times, using DefaultOptions.
+func BenchmarkCompress_LongText(b *testing.B) {
+	c := NewExtractiveCompressor()
+	ctx := context.Background()
+	long := strings.Repeat("This is a longer text with more content for compression benchmarking. ", 50)
+	chunk := types.Chunk{ID: "bench", Text: long}
+	opts := DefaultOptions()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// Abort on failure so an error path is never reported as a timing.
+		if _, _, err := c.Compress(ctx, []types.Chunk{chunk}, opts); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
diff --git a/pkg/contextlab/bench_test.go b/pkg/contextlab/bench_test.go
new file mode 100644
index 0000000..36141b7
--- /dev/null
+++ b/pkg/contextlab/bench_test.go
@@ -0,0 +1,116 @@
+package contextlab
+
+import (
+	"math/rand"
+	"strconv"
+	"testing"
+
+	"github.com/Siddhant-K-code/distill/pkg/types"
+)
+
+// deterministicEmbedding generates a reproducible embedding for a given seed.
+// Each call seeds its own generator, so the same (seed, dims) pair always
+// yields the same vector and benchmark inputs are stable across runs.
+func deterministicEmbedding(seed int64, dims int) []float32 {
+	rng := rand.New(rand.NewSource(seed))
+	v := make([]float32, dims)
+	for i := range v {
+		v[i] = rng.Float32()
+	}
+	return v
+}
+
+// makeBenchChunks builds n chunks with deterministic embeddings of length dims.
+// IDs are derived from the chunk index so they are unique for any n.
+func makeBenchChunks(n, dims int) []types.Chunk {
+	chunks := make([]types.Chunk, n)
+	for i := range chunks {
+		chunks[i] = types.Chunk{
+			ID:        "chunk-" + strconv.Itoa(i),
+			Text:      "benchmark chunk content for semantic deduplication testing",
+			Embedding: deterministicEmbedding(int64(i), dims),
+		}
+	}
+	return chunks
+}
+
+// BenchmarkCluster_10Chunks measures ClusterByThreshold(chunks, 0.15) on 10
+// chunks with 128-dimensional embeddings; chunk construction is untimed.
+func BenchmarkCluster_10Chunks(b *testing.B) {
+	chunks := makeBenchChunks(10, 128)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = ClusterByThreshold(chunks, 0.15)
+	}
+}
+
+// BenchmarkCluster_50Chunks is the 50-chunk variant of BenchmarkCluster_10Chunks.
+func BenchmarkCluster_50Chunks(b *testing.B) {
+	chunks := makeBenchChunks(50, 128)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = ClusterByThreshold(chunks, 0.15)
+	}
+}
+
+// BenchmarkCluster_100Chunks is the 100-chunk variant of BenchmarkCluster_10Chunks.
+func BenchmarkCluster_100Chunks(b *testing.B) {
+	chunks := makeBenchChunks(100, 128)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = ClusterByThreshold(chunks, 0.15)
+	}
+}
+
+// BenchmarkCluster_500Chunks is the 500-chunk variant of BenchmarkCluster_10Chunks.
+func BenchmarkCluster_500Chunks(b *testing.B) {
+	chunks := makeBenchChunks(500, 128)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = ClusterByThreshold(chunks, 0.15)
+	}
+}
+
+// BenchmarkMMR_10Chunks measures MMRRerank(chunks, 0.7, 5) on 10 chunks with
+// 128-dimensional embeddings; chunk construction is untimed.
+func BenchmarkMMR_10Chunks(b *testing.B) {
+	chunks := makeBenchChunks(10, 128)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = MMRRerank(chunks, 0.7, 5)
+	}
+}
+
+// BenchmarkMMR_50Chunks measures MMRRerank(chunks, 0.7, 10) on 50 chunks.
+func BenchmarkMMR_50Chunks(b *testing.B) {
+	chunks := makeBenchChunks(50, 128)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = MMRRerank(chunks, 0.7, 10)
+	}
+}
+
+// BenchmarkSelector_10Clusters measures Select on the clustering result for 10
+// chunks, using a selector built from DefaultSelectorConfig. The number in the
+// name is the input chunk count; the cluster count is whatever
+// ClusterByThreshold produces for that input.
+func BenchmarkSelector_10Clusters(b *testing.B) {
+	chunks := makeBenchChunks(10, 128)
+	result := ClusterByThreshold(chunks, 0.15)
+	sel := NewSelector(DefaultSelectorConfig())
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = sel.Select(result)
+	}
+}
+
+// BenchmarkSelector_50Clusters is the 50-chunk variant of BenchmarkSelector_10Clusters.
+func BenchmarkSelector_50Clusters(b *testing.B) {
+	chunks := makeBenchChunks(50, 128)
+	result := ClusterByThreshold(chunks, 0.15)
+	sel := NewSelector(DefaultSelectorConfig())
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = sel.Select(result)
+	}
+}