Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -938,3 +938,103 @@ func BenchmarkRealWorldScenario(b *testing.B) {
}
})
}

// =============================================================================
// Compression Level Benchmarks
// =============================================================================

func BenchmarkCompressionLevels(b *testing.B) {
sizes := []int{100, 1000, 5000}

levels := []struct {
name string
level CompressionLevel
}{
{"None", CompressionNone},
{"Zstd-1", CompressionFastest},
{"Zstd-3", CompressionBalanced},
{"Zstd-9", CompressionBetter},
{"Zstd-15", CompressionBest},
{"Zstd-19", CompressionMax},
}

for _, size := range sizes {
idx := setupTestIndex(size)

for _, l := range levels {
b.Run(fmt.Sprintf("Encode/RGs=%d/%s", size, l.name), func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, _ = EncodeWithLevel(idx, l.level)
}
})
}

// Pre-encode for decode and size benchmarks
encoded := make(map[string][]byte)
for _, l := range levels {
encoded[l.name], _ = EncodeWithLevel(idx, l.level)
}

for _, l := range levels {
data := encoded[l.name]
b.Run(fmt.Sprintf("Decode/RGs=%d/%s", size, l.name), func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, _ = Decode(data)
}
})
}

// Report sizes
noneSize := len(encoded["None"])
for _, l := range levels {
data := encoded[l.name]
b.Run(fmt.Sprintf("Size/RGs=%d/%s", size, l.name), func(b *testing.B) {
b.ReportMetric(float64(len(data)), "bytes")
b.ReportMetric(float64(len(data))/1024, "KB")
if noneSize > 0 {
b.ReportMetric(float64(len(data))/float64(noneSize)*100, "ratio%")
}
})
}
}
}

func BenchmarkCompressionLevelsThroughput(b *testing.B) {
idx := setupTestIndex(5000)

levels := []struct {
name string
level CompressionLevel
}{
{"None", CompressionNone},
{"Zstd-1", CompressionFastest},
{"Zstd-3", CompressionBalanced},
{"Zstd-9", CompressionBetter},
{"Zstd-15", CompressionBest},
{"Zstd-19", CompressionMax},
}

for _, l := range levels {
data, _ := EncodeWithLevel(idx, l.level)
uncompressedData, _ := EncodeWithLevel(idx, CompressionNone)
uncompressedSize := len(uncompressedData) - 4 // subtract magic bytes

b.Run(fmt.Sprintf("EncodeMBps/%s", l.name), func(b *testing.B) {
b.SetBytes(int64(uncompressedSize))
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, _ = EncodeWithLevel(idx, l.level)
}
})

b.Run(fmt.Sprintf("DecodeMBps/%s", l.name), func(b *testing.B) {
b.SetBytes(int64(uncompressedSize))
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, _ = Decode(data)
}
})
}
}
148 changes: 148 additions & 0 deletions gin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,154 @@ func TestSerializeRoundTrip(t *testing.T) {
}
}

func TestCompressionLevels(t *testing.T) {
builder := mustNewBuilder(t, DefaultConfig(), 3)
builder.AddDocument(0, []byte(`{"name": "alice", "age": 30, "active": true}`))
builder.AddDocument(1, []byte(`{"name": "bob", "age": 25, "active": false}`))
builder.AddDocument(2, []byte(`{"name": "charlie", "age": 35}`))
idx := builder.Finalize()

levels := []struct {
name string
level CompressionLevel
}{
{"None", CompressionNone},
{"Fastest", CompressionFastest},
{"Balanced", CompressionBalanced},
{"Better", CompressionBetter},
{"Best", CompressionBest},
{"Max", CompressionMax},
}

for _, tc := range levels {
t.Run(tc.name, func(t *testing.T) {
encoded, err := EncodeWithLevel(idx, tc.level)
if err != nil {
t.Fatalf("encode with level %d failed: %v", tc.level, err)
}

decoded, err := Decode(encoded)
if err != nil {
t.Fatalf("decode failed: %v", err)
}

if decoded.Header.NumDocs != idx.Header.NumDocs {
t.Errorf("NumDocs mismatch: %d vs %d", decoded.Header.NumDocs, idx.Header.NumDocs)
}
if decoded.Header.NumRowGroups != idx.Header.NumRowGroups {
t.Errorf("NumRowGroups mismatch")
}

result := decoded.Evaluate([]Predicate{EQ("$.name", "bob")})
if !result.IsSet(1) {
t.Error("query on decoded index failed")
}
})
}
}

func TestCompressionUncompressedMagic(t *testing.T) {
builder := mustNewBuilder(t, DefaultConfig(), 2)
builder.AddDocument(0, []byte(`{"value": 1}`))
builder.AddDocument(1, []byte(`{"value": 2}`))
idx := builder.Finalize()

encoded, err := EncodeWithLevel(idx, CompressionNone)
if err != nil {
t.Fatalf("encode uncompressed failed: %v", err)
}

if len(encoded) < 4 {
t.Fatal("encoded data too short")
}
if string(encoded[:4]) != "GINu" {
t.Errorf("expected uncompressed magic 'GINu', got %q", string(encoded[:4]))
}

decoded, err := Decode(encoded)
if err != nil {
t.Fatalf("decode uncompressed failed: %v", err)
}

result := decoded.Evaluate([]Predicate{EQ("$.value", float64(2))})
if !result.IsSet(1) {
t.Error("query on decoded uncompressed index failed")
}
}

func TestCompressionCompressedMagic(t *testing.T) {
builder := mustNewBuilder(t, DefaultConfig(), 2)
builder.AddDocument(0, []byte(`{"value": 1}`))
builder.AddDocument(1, []byte(`{"value": 2}`))
idx := builder.Finalize()

encoded, err := EncodeWithLevel(idx, CompressionFastest)
if err != nil {
t.Fatalf("encode compressed failed: %v", err)
}

if len(encoded) < 4 {
t.Fatal("encoded data too short")
}
if string(encoded[:4]) != "GINc" {
t.Errorf("expected compressed magic 'GINc', got %q", string(encoded[:4]))
}

decoded, err := Decode(encoded)
if err != nil {
t.Fatalf("decode compressed failed: %v", err)
}

result := decoded.Evaluate([]Predicate{EQ("$.value", float64(2))})
if !result.IsSet(1) {
t.Error("query on decoded compressed index failed")
}
}

func TestCompressionSizeReduction(t *testing.T) {
builder := mustNewBuilder(t, DefaultConfig(), 100)
for i := 0; i < 100; i++ {
builder.AddDocument(DocID(i), []byte(fmt.Sprintf(`{"id": %d, "name": "user_%d", "value": %d}`, i, i%10, i*100)))
}
idx := builder.Finalize()

uncompressed, _ := EncodeWithLevel(idx, CompressionNone)
fastest, _ := EncodeWithLevel(idx, CompressionFastest)

if len(fastest) >= len(uncompressed) {
t.Errorf("zstd-1 should be smaller than uncompressed: %d >= %d", len(fastest), len(uncompressed))
}

// Note: For small datasets, higher compression levels may not always produce smaller output
// due to compression dictionary overhead. The benefit is more pronounced with larger data.
t.Logf("Sizes - Uncompressed: %d, Zstd-1: %d (%.1f%% of original)",
len(uncompressed), len(fastest), float64(len(fastest))/float64(len(uncompressed))*100)
}

func TestCompressionInvalidLevel(t *testing.T) {
builder := mustNewBuilder(t, DefaultConfig(), 2)
builder.AddDocument(0, []byte(`{"value": 1}`))
idx := builder.Finalize()

tests := []struct {
name string
level CompressionLevel
}{
{"negative", CompressionLevel(-1)},
{"too_high", CompressionLevel(20)},
{"way_too_high", CompressionLevel(100)},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
_, err := EncodeWithLevel(idx, tc.level)
if err == nil {
t.Errorf("expected error for compression level %d", tc.level)
}
})
}
}

func TestNestedJSON(t *testing.T) {
builder := mustNewBuilder(t, DefaultConfig(), 2)

Expand Down
76 changes: 67 additions & 9 deletions serialize.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@ import (

const maxConfigSize = 1 << 20 // 1MB max config size

// CompressionLevel specifies the compression level for index serialization.
type CompressionLevel int

const (
CompressionNone CompressionLevel = 0 // No compression
CompressionFastest CompressionLevel = 1 // zstd level 1
CompressionBalanced CompressionLevel = 3 // zstd level 3
CompressionBetter CompressionLevel = 9 // zstd level 9
CompressionBest CompressionLevel = 15 // zstd level 15 (recommended)
CompressionMax CompressionLevel = 19 // zstd level 19 (slow)
)

const (
uncompressedMagic = "GINu"
compressedMagic = "GINc"
)

type SerializedConfig struct {
BloomFilterSize uint32 `json:"bloom_filter_size"`
BloomFilterHashes uint8 `json:"bloom_filter_hashes"`
Expand Down Expand Up @@ -60,7 +77,18 @@ func readRGSet(r io.Reader) (*RGSet, error) {
return RGSetFromRoaring(bitmap, int(numRGs)), nil
}

// Encode serializes the index using zstd-15 compression (recommended default).
func Encode(idx *GINIndex) ([]byte, error) {
return EncodeWithLevel(idx, CompressionBest)
}

// EncodeWithLevel serializes the index with the specified compression level.
// Use CompressionNone (0) for no compression, or 1-19 for zstd compression levels.
func EncodeWithLevel(idx *GINIndex, level CompressionLevel) ([]byte, error) {
if level < 0 || level > 19 {
return nil, errors.Errorf("compression level must be 0-19, got %d", level)
}

var buf bytes.Buffer

if len(idx.DocIDMapping) > 0 {
Expand Down Expand Up @@ -113,25 +141,55 @@ func Encode(idx *GINIndex) ([]byte, error) {
return nil, errors.Wrap(err, "write config")
}

encoder, err := zstd.NewWriter(nil)
if level == CompressionNone {
return append([]byte(uncompressedMagic), buf.Bytes()...), nil
}

encoder, err := zstd.NewWriter(nil,
zstd.WithEncoderLevel(zstd.EncoderLevelFromZstd(int(level))))
if err != nil {
return nil, errors.Wrap(err, "create zstd encoder")
}
defer func() { _ = encoder.Close() }()

return encoder.EncodeAll(buf.Bytes(), nil), nil
compressed := encoder.EncodeAll(buf.Bytes(), nil)
return append([]byte(compressedMagic), compressed...), nil
}

func Decode(data []byte) (*GINIndex, error) {
decoder, err := zstd.NewReader(nil)
if err != nil {
return nil, errors.Wrap(err, "create zstd decoder")
if len(data) < 4 {
return nil, errors.New("data too short")
}
defer decoder.Close()

decompressed, err := decoder.DecodeAll(data, nil)
if err != nil {
return nil, errors.Wrap(err, "decompress data")
var decompressed []byte
magic := string(data[:4])

switch magic {
case uncompressedMagic:
decompressed = data[4:]
case compressedMagic:
decoder, err := zstd.NewReader(nil)
if err != nil {
return nil, errors.Wrap(err, "create zstd decoder")
}
defer decoder.Close()

decompressed, err = decoder.DecodeAll(data[4:], nil)
if err != nil {
return nil, errors.Wrap(err, "decompress data")
}
default:
// Legacy format: try zstd decompression without magic (backward compatibility)
decoder, err := zstd.NewReader(nil)
if err != nil {
return nil, errors.Wrap(err, "create zstd decoder")
}
defer decoder.Close()

decompressed, err = decoder.DecodeAll(data, nil)
if err != nil {
return nil, errors.Wrap(err, "decompress data")
}
}

buf := bytes.NewReader(decompressed)
Expand Down
Loading