diff --git a/common/types.go b/common/types.go index f54f80e..1c7763f 100644 --- a/common/types.go +++ b/common/types.go @@ -23,10 +23,10 @@ type ItemSketchHasher[C comparable] interface { Hash(item C) uint64 } -type ItemSketchSerde[C comparable] interface { - SizeOf(item C) int +type ItemSketchSerde[T any] interface { + SizeOf(item T) int SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) - SerializeManyToSlice(items []C) []byte - SerializeOneToSlice(item C) []byte - DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]C, error) + SerializeManyToSlice(items []T) []byte + SerializeOneToSlice(item T) []byte + DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]T, error) } diff --git a/sampling/reservoir_items_sketch.go b/sampling/reservoir_items_sketch.go index c9a0cfe..ffe90e2 100644 --- a/sampling/reservoir_items_sketch.go +++ b/sampling/reservoir_items_sketch.go @@ -308,10 +308,10 @@ func (s *ReservoirItemsSketch[T]) forceIncrementItemsSeen(delta int64) error { // Serialization constants const ( - preambleIntsEmpty = 1 - serVer = 2 - flagEmpty = 0x04 - resizeFactorMask = 0xC0 + preambleIntsEmpty = 1 + reservoirItemsSketchSerialVersion = 2 + flagEmpty = 0x04 + resizeFactorMask = 0xC0 ) func resizeFactorBitsFor(rf ResizeFactor) (byte, error) { @@ -358,7 +358,7 @@ func (s *ReservoirItemsSketch[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) { if s.isEmpty() { buf := make([]byte, 8) buf[0] = rfBits | preambleIntsEmpty - buf[1] = serVer + buf[1] = reservoirItemsSketchSerialVersion buf[2] = byte(internal.FamilyEnum.ReservoirItems.Id) buf[3] = flagEmpty binary.LittleEndian.PutUint32(buf[4:], uint32(s.k)) @@ -375,7 +375,7 @@ func (s *ReservoirItemsSketch[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) { buf := make([]byte, preBytes+len(itemsBytes)) buf[0] = rfBits | byte(preLongs) - buf[1] = serVer + buf[1] = reservoirItemsSketchSerialVersion buf[2] = byte(internal.FamilyEnum.ReservoirItems.Id) buf[3] = 0 
binary.LittleEndian.PutUint32(buf[4:], uint32(s.k)) @@ -445,7 +445,7 @@ func NewReservoirItemsSketchFromSlice[T any](data []byte, serde ItemsSerDe[T]) ( k := int(binary.LittleEndian.Uint32(data[4:])) - if ver != serVer { + if ver != reservoirItemsSketchSerialVersion { if ver == 1 { encK := binary.LittleEndian.Uint16(data[4:]) decodedK, err := decodeReservoirSize(encK) diff --git a/sampling/reservoir_items_sketch_serialization_test.go b/sampling/reservoir_items_sketch_serialization_test.go index e97df48..110bcc0 100644 --- a/sampling/reservoir_items_sketch_serialization_test.go +++ b/sampling/reservoir_items_sketch_serialization_test.go @@ -375,7 +375,7 @@ func TestReservoirItemsSketchDeserializationErrors(t *testing.T) { t.Run("BadFamily", func(t *testing.T) { data := make([]byte, 8) data[0] = 0xC0 | preambleIntsEmpty - data[1] = serVer + data[1] = reservoirItemsSketchSerialVersion data[2] = 99 // invalid family ID data[3] = flagEmpty binary.LittleEndian.PutUint32(data[4:], 100) @@ -388,7 +388,7 @@ func TestReservoirItemsSketchDeserializationErrors(t *testing.T) { t.Run("BadPreLongs", func(t *testing.T) { data := make([]byte, 8) data[0] = 0xC0 | 5 // invalid preamble longs - data[1] = serVer + data[1] = reservoirItemsSketchSerialVersion data[2] = byte(internal.FamilyEnum.ReservoirItems.Id) data[3] = flagEmpty binary.LittleEndian.PutUint32(data[4:], 100) diff --git a/sampling/varopt_items_sketch.go b/sampling/varopt_items_sketch.go index dd6e896..aa93a7d 100644 --- a/sampling/varopt_items_sketch.go +++ b/sampling/varopt_items_sketch.go @@ -26,6 +26,7 @@ import ( "slices" "strings" + "github.com/apache/datasketches-go/common" "github.com/apache/datasketches-go/internal" ) @@ -81,12 +82,15 @@ type varOptConfig struct { resizeFactor ResizeFactor } +// WithResizeFactor sets the resize factor in the VarOpt configuration. 
func WithResizeFactor(rf ResizeFactor) VarOptOption { return func(c *varOptConfig) { c.resizeFactor = rf } } +// NewVarOptItemsSketch creates a new VarOptItemsSketch with a specified maximum capacity `k` and optional configurations. +// It returns an error if `k` is less than 1 or exceeds the maximum allowed value (2^31 - 2). func NewVarOptItemsSketch[T any](k uint, opts ...VarOptOption) (*VarOptItemsSketch[T], error) { if k < 1 || k > varOptMaxK { return nil, errors.New("k must be at least 1 and less than 2^31 - 1") @@ -120,6 +124,42 @@ func NewVarOptItemsSketch[T any](k uint, opts ...VarOptOption) (*VarOptItemsSket }, nil } +func newVarOptItemsSketchFromState[T any]( + k int, rf ResizeFactor, isGadget bool, +) (*VarOptItemsSketch[T], error) { + if k == 0 || k > varOptMaxK { + return nil, errors.New("k must be at least 1 and less than 2^31 - 1") + } + + ceilingLgK := math.Log2(float64(common.CeilingPowerOf2(k))) + initialLgSize := startingSubMultiple(int(ceilingLgK), int(rf), minLgArrItems) + currItemsAlloc := adjustedSamplingAllocationSize(k, 1< 0 { + numBytes++ + } + } + + for sample := range s.All() { + numBytes += serde.SizeOf(sample.Item) + } + + return numBytes +} diff --git a/sampling/varopt_items_sketch_decoder.go b/sampling/varopt_items_sketch_decoder.go new file mode 100644 index 0000000..40d7283 --- /dev/null +++ b/sampling/varopt_items_sketch_decoder.go @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package sampling + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + "math" + + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/internal" +) + +// TODO: Support Stream I/O. + +// Decode reconstructs a VarOptItemsSketch from a byte slice using the provided ItemsSerDe implementation for deserialization. +// Returns the reconstructed VarOptItemsSketch or an error if deserialization fails. +func Decode[T any](buffer []byte, serde common.ItemSketchSerde[T]) (*VarOptItemsSketch[T], error) { + if len(buffer) < 8 { + return nil, errors.New("data too short") + } + + index := 0 + + fistBytes := buffer[index] + index++ + + preambleLongs := fistBytes & 0x3F + + rf := (fistBytes >> 6) & 0x03 + + serVer := buffer[index] + index++ + + familyID := buffer[index] + index++ + + flags := buffer[index] + index++ + + k := binary.LittleEndian.Uint32(buffer[index:]) + index += 4 + + isEmpty := (flags & emptyFlagMask) != 0 + if err := validateVarOptItemsSketchPreambleLongs(preambleLongs, isEmpty); err != nil { + return nil, err + } + + if err := validateVarOptItemsSketchFamilyAndSerVer(familyID, serVer); err != nil { + return nil, err + } + + isGadget := (flags & gadgetFlagMask) != 0 + + if isEmpty { + return newVarOptItemsSketchFromState[T](int(k), ResizeFactor(rf), isGadget) + } + + if err := validateBuffer(buffer, index+8); err != nil { + return nil, err + } + n := binary.LittleEndian.Uint64(buffer[index:]) + index += 8 + + if err := validateBuffer(buffer, index+4); err != nil { + return nil, err + } + h := 
binary.LittleEndian.Uint32(buffer[index:]) + index += 4 + + if err := validateBuffer(buffer, index+4); err != nil { + return nil, err + } + r := binary.LittleEndian.Uint32(buffer[index:]) + index += 4 + + allocSize, err := computeVarOptItemsSketchDataSize( + preambleLongs, k, n, h, r, ResizeFactor(rf), + ) + if err != nil { + return nil, err + } + + totalWeightR := float64(0) + // validate R region weight. + if preambleLongs == preambleLongsFull { + if err := validateBuffer(buffer, index+8); err != nil { + return nil, err + } + + totalWeightR = math.Float64frombits(binary.LittleEndian.Uint64(buffer[index:])) + index += 8 + + if math.IsNaN(totalWeightR) || r == 0 || totalWeightR <= 0 { + return nil, fmt.Errorf("data is corrupt in full mode: invalid R region weight: %f", totalWeightR) + } + } + + sliceLen := int(allocSize) + if r == 0 { + sliceLen = int(h) + } + + // read h weights, fill in rest of slice with -1.0 + weights := make([]float64, sliceLen, allocSize) + if err := validateBuffer(buffer, index+int(h)*8); err != nil { + return nil, err + } + for i := 0; i < int(h); i++ { + w := math.Float64frombits(binary.LittleEndian.Uint64(buffer[index:])) + index += 8 + + if w <= 0 { + return nil, fmt.Errorf("non-positive weight: %f", w) + } + + weights[i] = w + } + for i := h; i < uint32(len(weights)); i++ { + weights[i] = -1 + } + + var ( + marks []bool + numMarksInH uint32 + ) + if isGadget { + marks = make([]bool, sliceLen, allocSize) + val := uint8(0) + for i := 0; i < int(h); i++ { + if (i & 0x7) == 0 { + if err := validateBuffer(buffer, index+1); err != nil { + return nil, err + } + + val = buffer[index] + index++ + } + + marks[i] = (val>>(i&0x7))&0x1 == 1 + if marks[i] { + numMarksInH++ + } + } + } + + data := make([]T, sliceLen, allocSize) + + hBytes, err := serde.SizeOfMany(buffer, index, int(h)) + if err != nil { + return nil, err + } + if err := validateBuffer(buffer, index+hBytes); err != nil { + return nil, err + } + hRegionData, err := 
serde.DeserializeManyFromSlice(buffer, index, int(h)) + if err != nil { + return nil, err + } + index += hBytes + copy(data[:h], hRegionData) + + if r > 0 { + rBytes, err := serde.SizeOfMany(buffer, index, int(r)) + if err != nil { + return nil, err + } + if err := validateBuffer(buffer, index+rBytes); err != nil { + return nil, err + } + rData, err := serde.DeserializeManyFromSlice(buffer, index, int(r)) + if err != nil { + return nil, err + } + index += rBytes + + copy(data[h+1:h+1+r], rData) + } + + m := 0 + if r > 0 { + m = 1 + } + return &VarOptItemsSketch[T]{ + data: data, + weights: weights, + k: int(k), + h: int(h), + m: m, + r: int(r), + n: int64(n), + totalWeightR: totalWeightR, + rf: ResizeFactor(rf), + marks: marks, + numMarksInH: numMarksInH, + }, nil +} + +func validateVarOptItemsSketchPreambleLongs(preambleLongs uint8, isEmpty bool) error { + if isEmpty { + if preambleLongs != preambleLongsEmpty { + return fmt.Errorf("invalid preamble longs: expected empty, got %d", preambleLongs) + } + } else if preambleLongs != preambleLongsWarmup && preambleLongs != preambleLongsFull { + return fmt.Errorf("invalid preamble longs: expected warmup or full, got %d", preambleLongs) + } + return nil +} + +func validateVarOptItemsSketchFamilyAndSerVer(familyID, serVer uint8) error { + if int(familyID) == internal.FamilyEnum.VarOptItems.Id { + if serVer != varOptItemsSketchSerialVersion { + return fmt.Errorf("invalid serialization version: expected %d, got %d", varOptItemsSketchSerialVersion, serVer) + } + return nil + } + + return fmt.Errorf("invalid family ID: expected %d, got %d", internal.FamilyEnum.VarOptItems.Id, familyID) +} + +func computeVarOptItemsSketchDataSize( + preambleLongs uint8, k uint32, n uint64, h uint32, r uint32, rf ResizeFactor, +) (uint32, error) { + if k == 0 || k > varOptMaxK { + return 0, errors.New("k must be at least 1 and less than 2^31 - 1") + } + + allocSize := 0 + if n <= uint64(k) { + if preambleLongs != preambleLongsWarmup { + return 0, 
fmt.Errorf("invalid preamble longs: expected warmup because n<=k, got %d", preambleLongs) + } + + if n != uint64(h) { + return 0, fmt.Errorf("invalid state in warmup mode: expected n==h, got n=%d, h=%d", n, h) + } + + if r > 0 { + return 0, fmt.Errorf("invalid state in warmup mode: expected r==0, got r=%d", r) + } + + ceilingLgK := math.Log2(float64(common.CeilingPowerOf2(int(k)))) + minLgSize := math.Log2(float64(common.CeilingPowerOf2(int(h)))) + initialLgSize := startingSubMultiple(int(ceilingLgK), int(rf), int(minLgSize)) + allocSize = adjustedSamplingAllocationSize(int(k), 1<<initialLgSize) + } else { + if preambleLongs != preambleLongsFull { + return 0, fmt.Errorf("invalid preamble longs: expected full because n>k, got %d", preambleLongs) + } + + if h+r != k { + return 0, fmt.Errorf("invalid state in full mode: expected h+r==k, got h=%d, r=%d, k=%d", h, r, k) + } + + allocSize = int(k) + 1 + } + + return uint32(allocSize), nil +} + +func validateBuffer(buf []byte, endIndex int) error { + if len(buf) < endIndex { + return io.ErrUnexpectedEOF + } + return nil +} diff --git a/sampling/varopt_items_sketch_encoder.go b/sampling/varopt_items_sketch_encoder.go new file mode 100644 index 0000000..b85a428 --- /dev/null +++ b/sampling/varopt_items_sketch_encoder.go @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package sampling + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/internal" +) + +const ( + preambleLongsEmpty = uint8(1) + preambleLongsWarmup = uint8(3) + preambleLongsFull = uint8(4) + varOptItemsSketchSerialVersion = uint8(2) + gadgetFlagMask = uint8(128) + emptyFlagMask = uint8(4) +) + +// VarOptItemsSketchEncoder writes encoded data to the provided io.Writer and uses ItemsSerDe for custom serialization of items. +type VarOptItemsSketchEncoder[T any] struct { + w io.Writer + serde common.ItemSketchSerde[T] +} + +// NewVarOptItemsSketchEncoder creates a new VarOptItemsSketchEncoder. +func NewVarOptItemsSketchEncoder[T any]( + w io.Writer, + serde common.ItemSketchSerde[T], +) VarOptItemsSketchEncoder[T] { + return VarOptItemsSketchEncoder[T]{ + w: w, + serde: serde, + } +} + +// Encode writes the provided VarOptItemsSketch to the underlying io.Writer. +func (enc *VarOptItemsSketchEncoder[T]) Encode(sketch *VarOptItemsSketch[T]) error { + if sketch == nil { + return errors.New("cannot encode nil VarOptItemsSketch") + } + + isEmpty := sketch.n == 0 && sketch.r == 0 + + preambleLongs := preambleLongsFull + if isEmpty { + preambleLongs = preambleLongsEmpty + } else if sketch.r == 0 { + preambleLongs = preambleLongsWarmup + } + + firstByte := (preambleLongs & 0x3F) | (uint8(sketch.rf) << 6) + if err := binary.Write(enc.w, binary.LittleEndian, firstByte); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, varOptItemsSketchSerialVersion); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint8(internal.FamilyEnum.VarOptItems.Id)); err != nil { + return err + } + + flags := uint8(0) + if sketch.marks != nil { + flags |= gadgetFlagMask + } + if isEmpty { + flags |= emptyFlagMask + } + if err := binary.Write(enc.w, binary.LittleEndian, flags); err != nil { + return err + } + + if err := 
binary.Write(enc.w, binary.LittleEndian, uint32(sketch.k)); err != nil { + return err + } + + if isEmpty { + return nil + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint64(sketch.n)); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint32(sketch.h)); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint32(sketch.r)); err != nil { + return err + } + + if sketch.r > 0 { + if err := binary.Write(enc.w, binary.LittleEndian, sketch.totalWeightR); err != nil { + return err + } + } + + for i := 0; i < sketch.h; i++ { + if i >= len(sketch.weights) { + return fmt.Errorf("invalid weights array size: %d, h: %d", len(sketch.weights), sketch.h) + } + + if err := binary.Write(enc.w, binary.LittleEndian, sketch.weights[i]); err != nil { + return err + } + } + + if sketch.marks != nil { + val := uint8(0) + for i := 0; i < sketch.h; i++ { + if sketch.marks[i] { + val |= 0x1 << (i & 0x7) + } + + if i&0x7 == 0x7 { + if err := binary.Write(enc.w, binary.LittleEndian, val); err != nil { + return err + } + val = 0 + } + } + + // write out any remaining values. 
+ if sketch.h&0x7 > 0 { + if err := binary.Write(enc.w, binary.LittleEndian, val); err != nil { + return err + } + } + } + + expectedDataLen := sketch.h + if sketch.r > 0 { + expectedDataLen += sketch.r + 1 + } + if len(sketch.data) != expectedDataLen { + return fmt.Errorf("invalid data array size: %d, h: %d, r: %d", len(sketch.data), sketch.h, sketch.r) + } + + b := enc.serde.SerializeManyToSlice(sketch.data[:sketch.h]) + if _, err := enc.w.Write(b); err != nil { + return err + } + + if sketch.r > 0 { + rStart := sketch.h + 1 // skip gap + rEnd := rStart + sketch.r + b = enc.serde.SerializeManyToSlice(sketch.data[rStart:rEnd]) + if _, err := enc.w.Write(b); err != nil { + return err + } + } + + return nil +} diff --git a/sampling/varopt_items_sketch_serialization_test.go b/sampling/varopt_items_sketch_serialization_test.go new file mode 100644 index 0000000..5fb8d1e --- /dev/null +++ b/sampling/varopt_items_sketch_serialization_test.go @@ -0,0 +1,550 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package sampling + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/internal" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const varOptItemsSerializationEpsilon = 1e-13 + +func TestGenerateGoBinariesForCompatibilityTestingVarOptItemsSketch(t *testing.T) { + if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 { + t.Skipf("%s not set", internal.DSketchTestGenerateGo) + } + + err := os.MkdirAll(internal.GoPath, os.ModePerm) + require.NoError(t, err) + + t.Run("long generate", func(t *testing.T) { + for _, n := range []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} { + n := n + t.Run(fmt.Sprintf("n%d", n), func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](32) + require.NoError(t, err) + + for i := 1; i <= n; i++ { + require.NoError(t, sketch.Update(int64(i), 1.0)) + } + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + filename := filepath.Join(internal.GoPath, fmt.Sprintf("varopt_sketch_long_n%d_go.sk", n)) + require.NoError(t, os.WriteFile(filename, data, 0644)) + }) + } + }) + + t.Run("string exact", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](1024) + require.NoError(t, err) + + for i := 1; i <= 200; i++ { + require.NoError(t, sketch.Update(strconv.Itoa(i), 1000.0/float64(i))) + } + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + filename := filepath.Join(internal.GoPath, "varopt_sketch_string_exact_go.sk") + require.NoError(t, os.WriteFile(filename, data, 0644)) + }) + + t.Run("long sampling", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](1024) + require.NoError(t, err) + + for i := 0; i < 2000; i++ { + require.NoError(t, sketch.Update(int64(i), 1.0)) + } + require.NoError(t, sketch.Update(-1, 100000.0)) + require.NoError(t, sketch.Update(-2, 110000.0)) + 
require.NoError(t, sketch.Update(-3, 120000.0)) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + filename := filepath.Join(internal.GoPath, "varopt_sketch_long_sampling_go.sk") + require.NoError(t, os.WriteFile(filename, data, 0644)) + }) +} + +func TestVarOptItemsSketchJavaCompat(t *testing.T) { + t.Run("long", func(t *testing.T) { + for _, n := range []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} { + t.Run(fmt.Sprintf("n%d", n), func(t *testing.T) { + filename := filepath.Join(internal.JavaPath, fmt.Sprintf("varopt_sketch_long_n%d_java.sk", n)) + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("Java file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assert.Equal(t, n == 0, sketch.IsEmpty()) + assert.Equal(t, 32, sketch.K()) + assert.Equal(t, int64(n), sketch.N()) + if n > 10 { + assert.Equal(t, 32, sketch.NumSamples()) + } else { + assert.Equal(t, n, sketch.NumSamples()) + } + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, float64(n), summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, float64(n), summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + } + }) + + t.Run("string exact", func(t *testing.T) { + filename := filepath.Join(internal.JavaPath, "varopt_sketch_string_exact_java.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("Java file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(200), sketch.N()) + assert.Equal(t, 200, sketch.NumSamples()) + + expectedWeight := 0.0 + for i := 1; i <= 200; i++ { + expectedWeight += 1000.0 / float64(i) + } + + summary, err := 
sketch.EstimateSubsetSum(func(string) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, expectedWeight, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, expectedWeight, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + + t.Run("long sampling", func(t *testing.T) { + filename := filepath.Join(internal.JavaPath, "varopt_sketch_long_sampling_java.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("Java file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(2003), sketch.N()) + assert.Equal(t, sketch.K(), sketch.NumSamples()) + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, 332000.0, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, 332000.0, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x < 0 }) + require.NoError(t, err) + assert.InDelta(t, 330000.0, summary.Estimate, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x >= 0 }) + require.NoError(t, err) + assert.InDelta(t, 2000.0, summary.Estimate, varOptItemsSerializationEpsilon) + }) +} + +func TestVarOptItemsSketchCppCompat(t *testing.T) { + t.Run("long", func(t *testing.T) { + for _, n := range []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} { + n := n + t.Run(fmt.Sprintf("n%d", n), func(t *testing.T) { + filename := filepath.Join(internal.CppPath, fmt.Sprintf("varopt_sketch_long_n%d_cpp.sk", n)) + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + 
require.NoError(t, err) + assert.Equal(t, n == 0, sketch.IsEmpty()) + assert.Equal(t, 32, sketch.K()) + assert.Equal(t, int64(n), sketch.N()) + if n > 10 { + assert.Equal(t, 32, sketch.NumSamples()) + } else { + assert.Equal(t, n, sketch.NumSamples()) + } + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, float64(n), summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, float64(n), summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + } + }) + + t.Run("string exact", func(t *testing.T) { + filename := filepath.Join(internal.CppPath, "varopt_sketch_string_exact_cpp.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(200), sketch.N()) + assert.Equal(t, 200, sketch.NumSamples()) + + expectedWeight := 0.0 + for i := 1; i <= 200; i++ { + expectedWeight += 1000.0 / float64(i) + } + + summary, err := sketch.EstimateSubsetSum(func(string) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, expectedWeight, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, expectedWeight, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + + t.Run("long sampling", func(t *testing.T) { + filename := filepath.Join(internal.CppPath, "varopt_sketch_long_sampling_cpp.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(2003), sketch.N()) + assert.Equal(t, sketch.K(), 
sketch.NumSamples()) + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, 332000.0, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, 332000.0, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x < 0 }) + require.NoError(t, err) + assert.InDelta(t, 330000.0, summary.Estimate, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x >= 0 }) + require.NoError(t, err) + assert.InDelta(t, 2000.0, summary.Estimate, varOptItemsSerializationEpsilon) + }) +} + +func TestVarOptItemsSketchSerialization(t *testing.T) { + t.Run("nil sketch encode", func(t *testing.T) { + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder[int64](&buf, common.ItemSketchLongSerDe{}) + + err := encoder.Encode(nil) + require.ErrorContains(t, err, "cannot encode nil VarOptItemsSketch") + }) + + t.Run("bad serialization version", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 16, 16) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + data[1] = 0 + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid serialization version: expected 2, got 0") + }) + + t.Run("bad family", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 16, 16) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + data[2] = 0 + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid family ID: expected 13, got 0") + }) + + t.Run("bad prelongs", func(t *testing.T) { + for _, preLongs := range []byte{0, 2, 5} { + preLongs := preLongs + t.Run(string(rune('0'+preLongs)), func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 32, 33) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) 
+ data[0] = preLongs + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, fmt.Sprintf("invalid preamble longs: expected warmup or full, got %d", preLongs)) + }) + } + }) + + t.Run("malformed preamble", func(t *testing.T) { + source := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 50, 50), common.ItemSketchLongSerDe{}) + + t.Run("full preamble without R", func(t *testing.T) { + data := cloneBytes(source) + data[0] = preambleLongsFull + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid preamble longs: expected warmup because n<=k, got 4") + }) + + t.Run("zero k", func(t *testing.T) { + data := cloneBytes(source) + binary.LittleEndian.PutUint32(data[4:], 0) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "k must be at least 1 and less than 2^31 - 1") + }) + + t.Run("negative H count", func(t *testing.T) { + data := cloneBytes(source) + binary.LittleEndian.PutUint32(data[16:], math.MaxUint32) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid state in warmup mode: expected n==h, got n=50, h=4294967295") + }) + + t.Run("negative R count", func(t *testing.T) { + data := cloneBytes(source) + binary.LittleEndian.PutUint32(data[20:], uint32(0xffffff80)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid state in warmup mode: expected r==0, got r=4294967168") + }) + + t.Run("warmup preamble in full mode", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + data[0] = (data[0] & 0xc0) | preambleLongsWarmup + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid preamble longs: expected full because n>k, got 3") + }) + }) + + t.Run("empty sketch", func(t *testing.T) { + sketch, err 
:= NewVarOptItemsSketch[string](5) + require.NoError(t, err) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + require.Len(t, data, int(preambleLongsEmpty<<3)) + + loaded, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assert.Equal(t, int64(0), loaded.N()) + assert.Equal(t, 0, loaded.NumSamples()) + assert.True(t, loaded.IsEmpty()) + }) + + t.Run("non-empty degenerate sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](12, WithResizeFactor(ResizeX2)) + require.NoError(t, err) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + for len(data) < int(preambleLongsWarmup<<3) { + data = append(data, 0) + } + data[3] = 0 + + _, err = Decode[string](data, common.ItemSketchStringSerDe{}) + require.ErrorContains(t, err, "invalid preamble longs: expected warmup or full, got 1") + }) + + t.Run("invalid full mode H plus R count", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint32(data[20:], 0) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid state in full mode: expected h+r==k") + }) + + t.Run("corrupt serialized R weight", func(t *testing.T) { + t.Run("zero", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint64(data[24:], math.Float64bits(0)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "data is corrupt in full mode: invalid R region weight") + }) + + t.Run("negative", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint64(data[24:], math.Float64bits(-1.5)) + + _, err := Decode[int64](data, 
common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "data is corrupt in full mode: invalid R region weight") + }) + + t.Run("nan", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint64(data[24:], math.Float64bits(math.NaN())) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "data is corrupt in full mode: invalid R region weight") + }) + }) + + t.Run("corrupt serialized H weight", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 100, 20) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + preambleBytes := int(data[0]&0x3f) << 3 + binary.LittleEndian.PutUint64(data[preambleBytes:], math.Float64bits(-1.5)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "non-positive weight: -1.500000") + }) + + t.Run("round trip", func(t *testing.T) { + t.Run("under-full sketch", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 100, 10) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + + loaded, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[int64](data[:len(data)-1], common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "unexpected EOF") + }) + + t.Run("end-of-warmup sketch", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 2843, 2843) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + require.Equal(t, preambleLongsWarmup, data[0]&0x3f) + + loaded, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[int64](data[:len(data)-1000], common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "unexpected EOF") + }) + + 
t.Run("full sketch", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 32, 32) + require.NoError(t, sketch.Update(100, 100.0)) + require.NoError(t, sketch.Update(101, 101.0)) + + totalWeight, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + cumulativeWeight := 0.0 + for sample := range sketch.All() { + cumulativeWeight += sample.Weight + } + require.InDelta(t, 1.0, cumulativeWeight/totalWeight.TotalSketchWeight, varOptItemsSerializationEpsilon) + + samples := collectVarOptItemsSamples(sketch) + require.GreaterOrEqual(t, len(samples), 2) + require.InDelta(t, 100.0, samples[0].Weight, varOptItemsSerializationEpsilon) + require.InDelta(t, 101.0, samples[1].Weight, varOptItemsSerializationEpsilon) + require.Equal(t, int64(100), samples[0].Item) + require.Equal(t, int64(101), samples[1].Item) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + require.Equal(t, preambleLongsFull, data[0]&0x3f) + + loaded, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[int64](data[:len(data)-100], common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "unexpected EOF") + }) + + t.Run("string sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](5) + require.NoError(t, err) + for _, item := range []string{"a", "bc", "def", "ghij", "klmno"} { + require.NoError(t, sketch.Update(item, 1.0)) + } + require.NoError(t, sketch.Update("heavy item", 100.0)) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + loaded, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[string](data[:len(data)-12], common.ItemSketchStringSerDe{}) + require.ErrorContains(t, err, "offset out of bounds") + }) + }) +} + +func createUnweightedVarOptItemsSketch(t 
*testing.T, k int, n int) *VarOptItemsSketch[int64] { + t.Helper() + + sketch, err := NewVarOptItemsSketch[int64](uint(k)) + require.NoError(t, err) + for i := 0; i < n; i++ { + require.NoError(t, sketch.Update(int64(i), 1.0)) + } + return sketch +} + +func encodeVarOptItemsSketch[T any](t *testing.T, sketch *VarOptItemsSketch[T], serde common.ItemSketchSerde[T]) []byte { + t.Helper() + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, serde) + require.NoError(t, encoder.Encode(sketch)) + return buf.Bytes() +} + +func assertVarOptItemsSketchEqual[T comparable](t *testing.T, expected *VarOptItemsSketch[T], actual *VarOptItemsSketch[T]) { + t.Helper() + + require.Equal(t, expected.K(), actual.K()) + require.Equal(t, expected.N(), actual.N()) + require.Equal(t, expected.NumSamples(), actual.NumSamples()) + require.Equal(t, expected.H(), actual.H()) + require.Equal(t, expected.R(), actual.R()) + + expectedSamples := collectVarOptItemsSamples(expected) + actualSamples := collectVarOptItemsSamples(actual) + require.Len(t, actualSamples, len(expectedSamples)) + for i := range expectedSamples { + require.Equal(t, expectedSamples[i].Item, actualSamples[i].Item) + require.InDelta(t, expectedSamples[i].Weight, actualSamples[i].Weight, varOptItemsSerializationEpsilon) + } +} + +func collectVarOptItemsSamples[T any](sketch *VarOptItemsSketch[T]) []Sample[T] { + samples := make([]Sample[T], 0, sketch.NumSamples()) + for sample := range sketch.All() { + samples = append(samples, sample) + } + return samples +} + +func cloneBytes(src []byte) []byte { + dst := make([]byte, len(src)) + copy(dst, src) + return dst +} diff --git a/sampling/varopt_items_sketch_test.go b/sampling/varopt_items_sketch_test.go index f0a24a9..160d7c2 100644 --- a/sampling/varopt_items_sketch_test.go +++ b/sampling/varopt_items_sketch_test.go @@ -18,9 +18,11 @@ package sampling import ( + "bytes" "math" "testing" + "github.com/apache/datasketches-go/common" 
"github.com/stretchr/testify/assert" ) @@ -124,6 +126,80 @@ func TestVarOptItemsSketch_Reset(t *testing.T) { }) } +func TestVarOptItemsSketch_SerializedSizeBytes(t *testing.T) { + t.Run("empty sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](10) + assert.NoError(t, err) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchLongSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + assert.Equal(t, int(preambleLongsEmpty<<3), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + }) + + t.Run("warmup sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](10) + assert.NoError(t, err) + for i := int64(1); i <= 5; i++ { + err = sketch.Update(i, float64(i)) + assert.NoError(t, err) + } + assert.Equal(t, 0, sketch.R()) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchLongSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + assert.Equal(t, int(preambleLongsWarmup<<3)+sketch.H()*8+sketch.H()*8, sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + }) + + t.Run("warmup string sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](10) + assert.NoError(t, err) + for i, item := range []string{"a", "bc", "def"} { + err = sketch.Update(item, float64(i+1)) + assert.NoError(t, err) + } + assert.Equal(t, 0, sketch.R()) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchStringSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + expectedSize := int(preambleLongsWarmup << 3) + expectedSize += sketch.H() * 8 + expectedSize += common.ItemSketchStringSerDe{}.SizeOf("a") + expectedSize += common.ItemSketchStringSerDe{}.SizeOf("bc") + expectedSize += 
common.ItemSketchStringSerDe{}.SizeOf("def") + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchStringSerDe{})) + assert.Equal(t, expectedSize, sketch.SerializedSizeBytes(common.ItemSketchStringSerDe{})) + }) + + t.Run("full sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](10) + assert.NoError(t, err) + for i := int64(1); i <= 20; i++ { + err = sketch.Update(i, 1.0) + assert.NoError(t, err) + } + assert.Greater(t, sketch.R(), 0) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchLongSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + assert.Equal(t, int(preambleLongsFull<<3)+sketch.H()*8+(sketch.H()+sketch.R())*8, sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + }) +} + func TestVarOptItemsSketch_All(t *testing.T) { t.Run("empty sketch", func(t *testing.T) { sketch, err := NewVarOptItemsSketch[int](10) diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n0_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n0_cpp.sk new file mode 100644 index 0000000..e4505fe Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n0_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000000_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000000_cpp.sk new file mode 100644 index 0000000..3fd21b6 Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000000_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100000_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100000_cpp.sk new file mode 100644 index 0000000..a26b17a Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100000_cpp.sk differ diff 
--git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10000_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10000_cpp.sk new file mode 100644 index 0000000..4947cec Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10000_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000_cpp.sk new file mode 100644 index 0000000..c2dfc8f Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100_cpp.sk new file mode 100644 index 0000000..962473a Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10_cpp.sk new file mode 100644 index 0000000..f1ac8c0 Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1_cpp.sk new file mode 100644 index 0000000..86f6bc3 Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1_cpp.sk differ diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_sampling_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_sampling_cpp.sk new file mode 100644 index 0000000..a56dfc5 Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_long_sampling_cpp.sk differ diff --git 
a/serialization_test_data/cpp_generated_files/varopt_sketch_string_exact_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_string_exact_cpp.sk new file mode 100644 index 0000000..2da7e4e Binary files /dev/null and b/serialization_test_data/cpp_generated_files/varopt_sketch_string_exact_cpp.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n0_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n0_go.sk new file mode 100644 index 0000000..e4505fe Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n0_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n1000000_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n1000000_go.sk new file mode 100644 index 0000000..2df47b9 Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n1000000_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n100000_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n100000_go.sk new file mode 100644 index 0000000..ed77580 Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n100000_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n10000_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n10000_go.sk new file mode 100644 index 0000000..22ffba3 Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n10000_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n1000_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n1000_go.sk new file mode 100644 index 0000000..a85fa2e Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n1000_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n100_go.sk 
b/serialization_test_data/go_generated_files/varopt_sketch_long_n100_go.sk new file mode 100644 index 0000000..c273a9e Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n100_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n10_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n10_go.sk new file mode 100644 index 0000000..f1ac8c0 Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n10_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n1_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n1_go.sk new file mode 100644 index 0000000..86f6bc3 Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_n1_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_sampling_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_sampling_go.sk new file mode 100644 index 0000000..2f6aeab Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_long_sampling_go.sk differ diff --git a/serialization_test_data/go_generated_files/varopt_sketch_string_exact_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_string_exact_go.sk new file mode 100644 index 0000000..2da7e4e Binary files /dev/null and b/serialization_test_data/go_generated_files/varopt_sketch_string_exact_go.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n0_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n0_java.sk new file mode 100644 index 0000000..e4505fe Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n0_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n1000000_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n1000000_java.sk new 
file mode 100644 index 0000000..4c006b6 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n1000000_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n100000_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n100000_java.sk new file mode 100644 index 0000000..a7a0b79 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n100000_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n10000_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n10000_java.sk new file mode 100644 index 0000000..5cc9661 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n10000_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n1000_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n1000_java.sk new file mode 100644 index 0000000..460dd49 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n1000_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n100_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n100_java.sk new file mode 100644 index 0000000..c476f17 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n100_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n10_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n10_java.sk new file mode 100644 index 0000000..f1ac8c0 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n10_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n1_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n1_java.sk new file mode 100644 index 
0000000..86f6bc3 Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_n1_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_sampling_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_sampling_java.sk new file mode 100644 index 0000000..b9e7d4e Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_long_sampling_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_sketch_string_exact_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_string_exact_java.sk new file mode 100644 index 0000000..2da7e4e Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_sketch_string_exact_java.sk differ diff --git a/serialization_test_data/java_generated_files/varopt_union_double_sampling_java.sk b/serialization_test_data/java_generated_files/varopt_union_double_sampling_java.sk new file mode 100644 index 0000000..d176e0b Binary files /dev/null and b/serialization_test_data/java_generated_files/varopt_union_double_sampling_java.sk differ