From f03b5723607612ca3ef0e40adde7ede239811edc Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 9 May 2026 23:03:29 +0900 Subject: [PATCH 1/3] feat: varopt items sketch serialization --- common/types.go | 10 +- sampling/reservoir_items_sketch.go | 14 +- ...servoir_items_sketch_serialization_test.go | 4 +- sampling/varopt_items_sketch.go | 64 +++ sampling/varopt_items_sketch_decoder.go | 286 ++++++++++ sampling/varopt_items_sketch_encoder.go | 177 +++++++ .../varopt_items_sketch_serialization_test.go | 500 ++++++++++++++++++ .../varopt_sketch_long_n0_cpp.sk | Bin 0 -> 8 bytes .../varopt_sketch_long_n1000000_cpp.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n100000_cpp.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n10000_cpp.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n1000_cpp.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n100_cpp.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n10_cpp.sk | Bin 0 -> 184 bytes .../varopt_sketch_long_n1_cpp.sk | Bin 0 -> 40 bytes .../varopt_sketch_long_sampling_cpp.sk | Bin 0 -> 8248 bytes .../varopt_sketch_string_exact_cpp.sk | Bin 0 -> 2916 bytes .../varopt_sketch_long_n0_go.sk | Bin 0 -> 8 bytes .../varopt_sketch_long_n1000000_go.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n100000_go.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n10000_go.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n1000_go.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n100_go.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n10_go.sk | Bin 0 -> 184 bytes .../varopt_sketch_long_n1_go.sk | Bin 0 -> 40 bytes .../varopt_sketch_long_sampling_go.sk | Bin 0 -> 8248 bytes .../varopt_sketch_string_exact_go.sk | Bin 0 -> 2916 bytes .../varopt_sketch_long_n0_java.sk | Bin 0 -> 8 bytes .../varopt_sketch_long_n1000000_java.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n100000_java.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n10000_java.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n1000_java.sk | Bin 0 -> 288 bytes .../varopt_sketch_long_n100_java.sk | Bin 0 -> 
288 bytes .../varopt_sketch_long_n10_java.sk | Bin 0 -> 184 bytes .../varopt_sketch_long_n1_java.sk | Bin 0 -> 40 bytes .../varopt_sketch_long_sampling_java.sk | Bin 0 -> 8248 bytes .../varopt_sketch_string_exact_java.sk | Bin 0 -> 2916 bytes .../varopt_union_double_sampling_java.sk | Bin 0 -> 572 bytes 38 files changed, 1041 insertions(+), 14 deletions(-) create mode 100644 sampling/varopt_items_sketch_decoder.go create mode 100644 sampling/varopt_items_sketch_encoder.go create mode 100644 sampling/varopt_items_sketch_serialization_test.go create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n0_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000000_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n100000_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n10000_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n100_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n10_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_n1_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_long_sampling_cpp.sk create mode 100644 serialization_test_data/cpp_generated_files/varopt_sketch_string_exact_cpp.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n0_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n1000000_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n100000_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n10000_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n1000_go.sk create mode 100644 
serialization_test_data/go_generated_files/varopt_sketch_long_n100_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n10_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_n1_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_long_sampling_go.sk create mode 100644 serialization_test_data/go_generated_files/varopt_sketch_string_exact_go.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n0_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n1000000_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n100000_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n10000_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n1000_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n100_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n10_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_n1_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_long_sampling_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_sketch_string_exact_java.sk create mode 100644 serialization_test_data/java_generated_files/varopt_union_double_sampling_java.sk diff --git a/common/types.go b/common/types.go index f54f80e..1c7763f 100644 --- a/common/types.go +++ b/common/types.go @@ -23,10 +23,10 @@ type ItemSketchHasher[C comparable] interface { Hash(item C) uint64 } -type ItemSketchSerde[C comparable] interface { - SizeOf(item C) int +type ItemSketchSerde[T any] interface { + SizeOf(item T) int SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) - SerializeManyToSlice(items []C) []byte - SerializeOneToSlice(item C) 
[]byte - DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]C, error) + SerializeManyToSlice(items []T) []byte + SerializeOneToSlice(item T) []byte + DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]T, error) } diff --git a/sampling/reservoir_items_sketch.go b/sampling/reservoir_items_sketch.go index c9a0cfe..ffe90e2 100644 --- a/sampling/reservoir_items_sketch.go +++ b/sampling/reservoir_items_sketch.go @@ -308,10 +308,10 @@ func (s *ReservoirItemsSketch[T]) forceIncrementItemsSeen(delta int64) error { // Serialization constants const ( - preambleIntsEmpty = 1 - serVer = 2 - flagEmpty = 0x04 - resizeFactorMask = 0xC0 + preambleIntsEmpty = 1 + reservoirItemsSketchSerialVersion = 2 + flagEmpty = 0x04 + resizeFactorMask = 0xC0 ) func resizeFactorBitsFor(rf ResizeFactor) (byte, error) { @@ -358,7 +358,7 @@ func (s *ReservoirItemsSketch[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) { if s.isEmpty() { buf := make([]byte, 8) buf[0] = rfBits | preambleIntsEmpty - buf[1] = serVer + buf[1] = reservoirItemsSketchSerialVersion buf[2] = byte(internal.FamilyEnum.ReservoirItems.Id) buf[3] = flagEmpty binary.LittleEndian.PutUint32(buf[4:], uint32(s.k)) @@ -375,7 +375,7 @@ func (s *ReservoirItemsSketch[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) { buf := make([]byte, preBytes+len(itemsBytes)) buf[0] = rfBits | byte(preLongs) - buf[1] = serVer + buf[1] = reservoirItemsSketchSerialVersion buf[2] = byte(internal.FamilyEnum.ReservoirItems.Id) buf[3] = 0 binary.LittleEndian.PutUint32(buf[4:], uint32(s.k)) @@ -445,7 +445,7 @@ func NewReservoirItemsSketchFromSlice[T any](data []byte, serde ItemsSerDe[T]) ( k := int(binary.LittleEndian.Uint32(data[4:])) - if ver != serVer { + if ver != reservoirItemsSketchSerialVersion { if ver == 1 { encK := binary.LittleEndian.Uint16(data[4:]) decodedK, err := decodeReservoirSize(encK) diff --git a/sampling/reservoir_items_sketch_serialization_test.go 
b/sampling/reservoir_items_sketch_serialization_test.go index e97df48..110bcc0 100644 --- a/sampling/reservoir_items_sketch_serialization_test.go +++ b/sampling/reservoir_items_sketch_serialization_test.go @@ -375,7 +375,7 @@ func TestReservoirItemsSketchDeserializationErrors(t *testing.T) { t.Run("BadFamily", func(t *testing.T) { data := make([]byte, 8) data[0] = 0xC0 | preambleIntsEmpty - data[1] = serVer + data[1] = reservoirItemsSketchSerialVersion data[2] = 99 // invalid family ID data[3] = flagEmpty binary.LittleEndian.PutUint32(data[4:], 100) @@ -388,7 +388,7 @@ func TestReservoirItemsSketchDeserializationErrors(t *testing.T) { t.Run("BadPreLongs", func(t *testing.T) { data := make([]byte, 8) data[0] = 0xC0 | 5 // invalid preamble longs - data[1] = serVer + data[1] = reservoirItemsSketchSerialVersion data[2] = byte(internal.FamilyEnum.ReservoirItems.Id) data[3] = flagEmpty binary.LittleEndian.PutUint32(data[4:], 100) diff --git a/sampling/varopt_items_sketch.go b/sampling/varopt_items_sketch.go index dd6e896..a55db1a 100644 --- a/sampling/varopt_items_sketch.go +++ b/sampling/varopt_items_sketch.go @@ -26,6 +26,7 @@ import ( "slices" "strings" + "github.com/apache/datasketches-go/common" "github.com/apache/datasketches-go/internal" ) @@ -81,12 +82,15 @@ type varOptConfig struct { resizeFactor ResizeFactor } +// WithResizeFactor sets the resize factor in the VarOpt configuration. func WithResizeFactor(rf ResizeFactor) VarOptOption { return func(c *varOptConfig) { c.resizeFactor = rf } } +// NewVarOptItemsSketch creates a new VarOptItemsSketch with a specified maximum capacity `k` and optional configurations. +// It returns an error if `k` is less than 1 or exceeds the maximum allowed value (2^31 - 2). 
func NewVarOptItemsSketch[T any](k uint, opts ...VarOptOption) (*VarOptItemsSketch[T], error) { if k < 1 || k > varOptMaxK { return nil, errors.New("k must be at least 1 and less than 2^31 - 1") @@ -120,6 +124,42 @@ func NewVarOptItemsSketch[T any](k uint, opts ...VarOptOption) (*VarOptItemsSket }, nil } +func newVarOptItemsSketchFromState[T any]( + k int, rf ResizeFactor, isGadget bool, +) (*VarOptItemsSketch[T], error) { + if k == 0 || k > varOptMaxK { + return nil, errors.New("k must be at least 1 and less than 2^31 - 1") + } + + ceilingLgK := math.Log2(float64(common.CeilingPowerOf2(k))) + initialLgSize := startingSubMultiple(int(ceilingLgK), int(rf), minLgArrItems) + currItemsAlloc := adjustedSamplingAllocationSize(k, 1< 0 { + numBytes++ + } + } + + numBytes += serde.SizeOfItem() * (s.h + s.r) + return numBytes +} diff --git a/sampling/varopt_items_sketch_decoder.go b/sampling/varopt_items_sketch_decoder.go new file mode 100644 index 0000000..40d7283 --- /dev/null +++ b/sampling/varopt_items_sketch_decoder.go @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package sampling + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + "math" + + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/internal" +) + +// TODO: Support Stream I/O. + +// Decode reconstructs a VarOptItemsSketch from a byte slice using the provided ItemsSerDe implementation for deserialization. +// Returns the reconstructed VarOptItemsSketch or an error if deserialization fails. +func Decode[T any](buffer []byte, serde common.ItemSketchSerde[T]) (*VarOptItemsSketch[T], error) { + if len(buffer) < 8 { + return nil, errors.New("data too short") + } + + index := 0 + + fistBytes := buffer[index] + index++ + + preambleLongs := fistBytes & 0x3F + + rf := (fistBytes >> 6) & 0x03 + + serVer := buffer[index] + index++ + + familyID := buffer[index] + index++ + + flags := buffer[index] + index++ + + k := binary.LittleEndian.Uint32(buffer[index:]) + index += 4 + + isEmpty := (flags & emptyFlagMask) != 0 + if err := validateVarOptItemsSketchPreambleLongs(preambleLongs, isEmpty); err != nil { + return nil, err + } + + if err := validateVarOptItemsSketchFamilyAndSerVer(familyID, serVer); err != nil { + return nil, err + } + + isGadget := (flags & gadgetFlagMask) != 0 + + if isEmpty { + return newVarOptItemsSketchFromState[T](int(k), ResizeFactor(rf), isGadget) + } + + if err := validateBuffer(buffer, index+8); err != nil { + return nil, err + } + n := binary.LittleEndian.Uint64(buffer[index:]) + index += 8 + + if err := validateBuffer(buffer, index+4); err != nil { + return nil, err + } + h := binary.LittleEndian.Uint32(buffer[index:]) + index += 4 + + if err := validateBuffer(buffer, index+4); err != nil { + return nil, err + } + r := binary.LittleEndian.Uint32(buffer[index:]) + index += 4 + + allocSize, err := computeVarOptItemsSketchDataSize( + preambleLongs, k, n, h, r, ResizeFactor(rf), + ) + if err != nil { + return nil, err + } + + totalWeightR := float64(0) + // validate R region weight. 
+ if preambleLongs == preambleLongsFull { + if err := validateBuffer(buffer, index+8); err != nil { + return nil, err + } + + totalWeightR = math.Float64frombits(binary.LittleEndian.Uint64(buffer[index:])) + index += 8 + + if math.IsNaN(totalWeightR) || r == 0 || totalWeightR <= 0 { + return nil, fmt.Errorf("data is corrupt in full mode: invalid R region weight: %f", totalWeightR) + } + } + + sliceLen := int(allocSize) + if r == 0 { + sliceLen = int(h) + } + + // read h weights, fill in rest of slice with -1.0 + weights := make([]float64, sliceLen, allocSize) + if err := validateBuffer(buffer, index+int(h)*8); err != nil { + return nil, err + } + for i := 0; i < int(h); i++ { + w := math.Float64frombits(binary.LittleEndian.Uint64(buffer[index:])) + index += 8 + + if w <= 0 { + return nil, fmt.Errorf("non-positive weight: %f", w) + } + + weights[i] = w + } + for i := h; i < uint32(len(weights)); i++ { + weights[i] = -1 + } + + var ( + marks []bool + numMarksInH uint32 + ) + if isGadget { + marks = make([]bool, sliceLen, allocSize) + val := uint8(0) + for i := 0; i < int(h); i++ { + if (i & 0x7) == 0 { + if err := validateBuffer(buffer, index+1); err != nil { + return nil, err + } + + val = buffer[index] + index++ + } + + marks[i] = (val>>(i&0x7))&0x1 == 1 + if marks[i] { + numMarksInH++ + } + } + } + + data := make([]T, sliceLen, allocSize) + + hBytes, err := serde.SizeOfMany(buffer, index, int(h)) + if err != nil { + return nil, err + } + if err := validateBuffer(buffer, index+hBytes); err != nil { + return nil, err + } + hRegionData, err := serde.DeserializeManyFromSlice(buffer, index, int(h)) + if err != nil { + return nil, err + } + index += hBytes + copy(data[:h], hRegionData) + + if r > 0 { + rBytes, err := serde.SizeOfMany(buffer, index, int(r)) + if err != nil { + return nil, err + } + if err := validateBuffer(buffer, index+rBytes); err != nil { + return nil, err + } + rData, err := serde.DeserializeManyFromSlice(buffer, index, int(r)) + if err != nil { + 
return nil, err + } + index += rBytes + + copy(data[h+1:h+1+r], rData) + } + + m := 0 + if r > 0 { + m = 1 + } + return &VarOptItemsSketch[T]{ + data: data, + weights: weights, + k: int(k), + h: int(h), + m: m, + r: int(r), + n: int64(n), + totalWeightR: totalWeightR, + rf: ResizeFactor(rf), + marks: marks, + numMarksInH: numMarksInH, + }, nil +} + +func validateVarOptItemsSketchPreambleLongs(preambleLongs uint8, isEmpty bool) error { + if isEmpty { + if preambleLongs != preambleLongsEmpty { + return fmt.Errorf("invalid preamble longs: expected empty, got %d", preambleLongs) + } + } else if preambleLongs != preambleLongsWarmup && preambleLongs != preambleLongsFull { + return fmt.Errorf("invalid preamble longs: expected warmup or full, got %d", preambleLongs) + } + return nil +} + +func validateVarOptItemsSketchFamilyAndSerVer(familyID, serVer uint8) error { + if int(familyID) == internal.FamilyEnum.VarOptItems.Id { + if serVer != varOptItemsSketchSerialVersion { + return fmt.Errorf("invalid serialization version: expected %d, got %d", varOptItemsSketchSerialVersion, serVer) + } + return nil + } + + return fmt.Errorf("invalid family ID: expected %d, got %d", internal.FamilyEnum.VarOptItems.Id, familyID) +} + +func computeVarOptItemsSketchDataSize( + preambleLongs uint8, k uint32, n uint64, h uint32, r uint32, rf ResizeFactor, +) (uint32, error) { + if k == 0 || k > varOptMaxK { + return 0, errors.New("k must be at least 1 and less than 2^31 - 1") + } + + allocSize := 0 + if n <= uint64(k) { + if preambleLongs != preambleLongsWarmup { + return 0, fmt.Errorf("invalid preamble longs: expected warmup because n<=k, got %d", preambleLongs) + } + + if n != uint64(h) { + return 0, fmt.Errorf("invalid state in warmup mode: expected n==h, got n=%d, h=%d", n, h) + } + + if r > 0 { + return 0, fmt.Errorf("invalid state in warmup mode: expected r==0, got r=%d", r) + } + + ceilingLgK := math.Log2(float64(common.CeilingPowerOf2(int(k)))) + minLgSize := 
// validateBuffer reports whether buf holds at least endIndex bytes, returning
// io.ErrUnexpectedEOF when it does not.
//
// A negative endIndex is rejected as well: callers compute it as
// offset+length, so a negative value means the length was corrupt or the sum
// overflowed, and letting it through would make the next slice expression
// panic.
func validateBuffer(buf []byte, endIndex int) error {
	if endIndex < 0 || endIndex > len(buf) {
		return io.ErrUnexpectedEOF
	}
	return nil
}
+ */ + +package sampling + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/internal" +) + +const ( + preambleLongsEmpty = uint8(1) + preambleLongsWarmup = uint8(3) + preambleLongsFull = uint8(4) + varOptItemsSketchSerialVersion = uint8(2) + gadgetFlagMask = uint8(128) + emptyFlagMask = uint8(4) +) + +// VarOptItemsSketchEncoder writes encoded data to the provided io.Writer and uses ItemsSerDe for custom serialization of items. +type VarOptItemsSketchEncoder[T any] struct { + w io.Writer + serde common.ItemSketchSerde[T] +} + +// NewVarOptItemsSketchEncoder creates a new VarOptItemsSketchEncoder. +func NewVarOptItemsSketchEncoder[T any]( + w io.Writer, + serde common.ItemSketchSerde[T], +) VarOptItemsSketchEncoder[T] { + return VarOptItemsSketchEncoder[T]{ + w: w, + serde: serde, + } +} + +// Encode writes the provided VarOptItemsSketch to the underlying io.Writer. +func (enc *VarOptItemsSketchEncoder[T]) Encode(sketch *VarOptItemsSketch[T]) error { + if sketch == nil { + return errors.New("cannot encode nil VarOptItemsSketch") + } + + isEmpty := sketch.n == 0 && sketch.r == 0 + + preambleLongs := preambleLongsFull + if isEmpty { + preambleLongs = preambleLongsEmpty + } else if sketch.r == 0 { + preambleLongs = preambleLongsWarmup + } + + firstByte := (preambleLongs & 0x3F) | (uint8(sketch.rf) << 6) + if err := binary.Write(enc.w, binary.LittleEndian, firstByte); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, varOptItemsSketchSerialVersion); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint8(internal.FamilyEnum.VarOptItems.Id)); err != nil { + return err + } + + flags := uint8(0) + if sketch.marks != nil { + flags |= gadgetFlagMask + } + if isEmpty { + flags |= emptyFlagMask + } + if err := binary.Write(enc.w, binary.LittleEndian, flags); err != nil { + return err + } + + if err := 
binary.Write(enc.w, binary.LittleEndian, uint32(sketch.k)); err != nil { + return err + } + + if isEmpty { + return nil + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint64(sketch.n)); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint32(sketch.h)); err != nil { + return err + } + + if err := binary.Write(enc.w, binary.LittleEndian, uint32(sketch.r)); err != nil { + return err + } + + if sketch.r > 0 { + if err := binary.Write(enc.w, binary.LittleEndian, sketch.totalWeightR); err != nil { + return err + } + } + + for i := 0; i < sketch.h; i++ { + if i >= len(sketch.weights) { + return fmt.Errorf("invalid weights array size: %d, h: %d", len(sketch.weights), sketch.h) + } + + if err := binary.Write(enc.w, binary.LittleEndian, sketch.weights[i]); err != nil { + return err + } + } + + if sketch.marks != nil { + val := uint8(0) + for i := 0; i < sketch.h; i++ { + if sketch.marks[i] { + val |= 0x1 << (i & 0x7) + } + + if i&0x7 == 0x7 { + if err := binary.Write(enc.w, binary.LittleEndian, val); err != nil { + return err + } + val = 0 + } + } + + // write out any remaining values. 
+ if sketch.h&0x7 > 0 { + if err := binary.Write(enc.w, binary.LittleEndian, val); err != nil { + return err + } + } + } + + expectedDataLen := sketch.h + if sketch.r > 0 { + expectedDataLen += sketch.r + 1 + } + if len(sketch.data) != expectedDataLen { + return fmt.Errorf("invalid data array size: %d, h: %d, r: %d", len(sketch.data), sketch.h, sketch.r) + } + + b := enc.serde.SerializeManyToSlice(sketch.data[:sketch.h]) + if _, err := enc.w.Write(b); err != nil { + return err + } + + if sketch.r > 0 { + rStart := sketch.h + 1 // skip gap + rEnd := rStart + sketch.r + b = enc.serde.SerializeManyToSlice(sketch.data[rStart:rEnd]) + if _, err := enc.w.Write(b); err != nil { + return err + } + } + + return nil +} diff --git a/sampling/varopt_items_sketch_serialization_test.go b/sampling/varopt_items_sketch_serialization_test.go new file mode 100644 index 0000000..fe7a514 --- /dev/null +++ b/sampling/varopt_items_sketch_serialization_test.go @@ -0,0 +1,500 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package sampling + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/apache/datasketches-go/common" + "github.com/apache/datasketches-go/internal" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const varOptItemsSerializationEpsilon = 1e-13 + +func TestGenerateGoBinariesForCompatibilityTestingVarOptItemsSketch(t *testing.T) { + if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 { + t.Skipf("%s not set", internal.DSketchTestGenerateGo) + } + + err := os.MkdirAll(internal.GoPath, os.ModePerm) + require.NoError(t, err) + + t.Run("long generate", func(t *testing.T) { + for _, n := range []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} { + n := n + t.Run(fmt.Sprintf("n%d", n), func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](32) + require.NoError(t, err) + + for i := 1; i <= n; i++ { + require.NoError(t, sketch.Update(int64(i), 1.0)) + } + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + filename := filepath.Join(internal.GoPath, fmt.Sprintf("varopt_sketch_long_n%d_go.sk", n)) + require.NoError(t, os.WriteFile(filename, data, 0644)) + }) + } + }) + + t.Run("string exact", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](1024) + require.NoError(t, err) + + for i := 1; i <= 200; i++ { + require.NoError(t, sketch.Update(strconv.Itoa(i), 1000.0/float64(i))) + } + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + filename := filepath.Join(internal.GoPath, "varopt_sketch_string_exact_go.sk") + require.NoError(t, os.WriteFile(filename, data, 0644)) + }) + + t.Run("long sampling", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](1024) + require.NoError(t, err) + + for i := 0; i < 2000; i++ { + require.NoError(t, sketch.Update(int64(i), 1.0)) + } + require.NoError(t, sketch.Update(-1, 100000.0)) + require.NoError(t, sketch.Update(-2, 110000.0)) + 
require.NoError(t, sketch.Update(-3, 120000.0)) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + filename := filepath.Join(internal.GoPath, "varopt_sketch_long_sampling_go.sk") + require.NoError(t, os.WriteFile(filename, data, 0644)) + }) +} + +func TestVarOptItemsSketchJavaCompat(t *testing.T) { + t.Run("long", func(t *testing.T) { + for _, n := range []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} { + t.Run(fmt.Sprintf("n%d", n), func(t *testing.T) { + filename := filepath.Join(internal.JavaPath, fmt.Sprintf("varopt_sketch_long_n%d_java.sk", n)) + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("Java file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assert.Equal(t, n == 0, sketch.IsEmpty()) + assert.Equal(t, 32, sketch.K()) + assert.Equal(t, int64(n), sketch.N()) + if n > 10 { + assert.Equal(t, 32, sketch.NumSamples()) + } else { + assert.Equal(t, n, sketch.NumSamples()) + } + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, float64(n), summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, float64(n), summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + } + }) + + t.Run("string exact", func(t *testing.T) { + filename := filepath.Join(internal.JavaPath, "varopt_sketch_string_exact_java.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("Java file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(200), sketch.N()) + assert.Equal(t, 200, sketch.NumSamples()) + + expectedWeight := 0.0 + for i := 1; i <= 200; i++ { + expectedWeight += 1000.0 / float64(i) + } + + summary, err := 
sketch.EstimateSubsetSum(func(string) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, expectedWeight, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, expectedWeight, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + + t.Run("long sampling", func(t *testing.T) { + filename := filepath.Join(internal.JavaPath, "varopt_sketch_long_sampling_java.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("Java file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(2003), sketch.N()) + assert.Equal(t, sketch.K(), sketch.NumSamples()) + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, 332000.0, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, 332000.0, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x < 0 }) + require.NoError(t, err) + assert.InDelta(t, 330000.0, summary.Estimate, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x >= 0 }) + require.NoError(t, err) + assert.InDelta(t, 2000.0, summary.Estimate, varOptItemsSerializationEpsilon) + }) +} + +func TestVarOptItemsSketchCppCompat(t *testing.T) { + t.Run("long", func(t *testing.T) { + for _, n := range []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} { + n := n + t.Run(fmt.Sprintf("n%d", n), func(t *testing.T) { + filename := filepath.Join(internal.CppPath, fmt.Sprintf("varopt_sketch_long_n%d_cpp.sk", n)) + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + 
require.NoError(t, err) + assert.Equal(t, n == 0, sketch.IsEmpty()) + assert.Equal(t, 32, sketch.K()) + assert.Equal(t, int64(n), sketch.N()) + if n > 10 { + assert.Equal(t, 32, sketch.NumSamples()) + } else { + assert.Equal(t, n, sketch.NumSamples()) + } + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, float64(n), summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, float64(n), summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + } + }) + + t.Run("string exact", func(t *testing.T) { + filename := filepath.Join(internal.CppPath, "varopt_sketch_string_exact_cpp.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(200), sketch.N()) + assert.Equal(t, 200, sketch.NumSamples()) + + expectedWeight := 0.0 + for i := 1; i <= 200; i++ { + expectedWeight += 1000.0 / float64(i) + } + + summary, err := sketch.EstimateSubsetSum(func(string) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, expectedWeight, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, expectedWeight, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + }) + + t.Run("long sampling", func(t *testing.T) { + filename := filepath.Join(internal.CppPath, "varopt_sketch_long_sampling_cpp.sk") + data, err := os.ReadFile(filename) + if os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + } + require.NoError(t, err) + + sketch, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assert.False(t, sketch.IsEmpty()) + assert.Equal(t, 1024, sketch.K()) + assert.Equal(t, int64(2003), sketch.N()) + assert.Equal(t, sketch.K(), 
sketch.NumSamples()) + + summary, err := sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + assert.InDelta(t, 332000.0, summary.Estimate, varOptItemsSerializationEpsilon) + assert.InDelta(t, 332000.0, summary.TotalSketchWeight, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x < 0 }) + require.NoError(t, err) + assert.InDelta(t, 330000.0, summary.Estimate, varOptItemsSerializationEpsilon) + + summary, err = sketch.EstimateSubsetSum(func(x int64) bool { return x >= 0 }) + require.NoError(t, err) + assert.InDelta(t, 2000.0, summary.Estimate, varOptItemsSerializationEpsilon) + }) +} + +func TestVarOptItemsSketchSerialization(t *testing.T) { + t.Run("bad serialization version", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 16, 16) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + data[1] = 0 + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid serialization version: expected 2, got 0") + }) + + t.Run("bad family", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 16, 16) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + data[2] = 0 + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid family ID: expected 13, got 0") + }) + + t.Run("bad prelongs", func(t *testing.T) { + for _, preLongs := range []byte{0, 2, 5} { + preLongs := preLongs + t.Run(string(rune('0'+preLongs)), func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 32, 33) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + data[0] = preLongs + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, fmt.Sprintf("invalid preamble longs: expected warmup or full, got %d", preLongs)) + }) + } + }) + + t.Run("malformed preamble", func(t *testing.T) { 
+ source := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 50, 50), common.ItemSketchLongSerDe{}) + + t.Run("full preamble without R", func(t *testing.T) { + data := cloneBytes(source) + data[0] = preambleLongsFull + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid preamble longs: expected warmup because n<=k, got 4") + }) + + t.Run("zero k", func(t *testing.T) { + data := cloneBytes(source) + binary.LittleEndian.PutUint32(data[4:], 0) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "k must be at least 1 and less than 2^31 - 1") + }) + + t.Run("negative H count", func(t *testing.T) { + data := cloneBytes(source) + binary.LittleEndian.PutUint32(data[16:], math.MaxUint32) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid state in warmup mode: expected n==h, got n=50, h=4294967295") + }) + + t.Run("negative R count", func(t *testing.T) { + data := cloneBytes(source) + binary.LittleEndian.PutUint32(data[20:], uint32(0xffffff80)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid state in warmup mode: expected r==0, got r=4294967168") + }) + }) + + t.Run("empty sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](5) + require.NoError(t, err) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + require.Len(t, data, int(preambleLongsEmpty<<3)) + + loaded, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assert.Equal(t, int64(0), loaded.N()) + assert.Equal(t, 0, loaded.NumSamples()) + assert.True(t, loaded.IsEmpty()) + }) + + t.Run("non-empty degenerate sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](12, WithResizeFactor(ResizeX2)) + require.NoError(t, err) + + data := encodeVarOptItemsSketch(t, sketch, 
common.ItemSketchStringSerDe{}) + for len(data) < int(preambleLongsWarmup<<3) { + data = append(data, 0) + } + data[3] = 0 + + _, err = Decode[string](data, common.ItemSketchStringSerDe{}) + require.ErrorContains(t, err, "invalid preamble longs: expected warmup or full, got 1") + }) + + t.Run("corrupt serialized weight", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 100, 20) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + preambleBytes := int(data[0]&0x3f) << 3 + binary.LittleEndian.PutUint64(data[preambleBytes:], math.Float64bits(-1.5)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "non-positive weight: -1.500000") + }) + + t.Run("round trip", func(t *testing.T) { + t.Run("under-full sketch", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 100, 10) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + + loaded, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[int64](data[:len(data)-1], common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "unexpected EOF") + }) + + t.Run("end-of-warmup sketch", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 2843, 2843) + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + require.Equal(t, preambleLongsWarmup, data[0]&0x3f) + + loaded, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[int64](data[:len(data)-1000], common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "unexpected EOF") + }) + + t.Run("full sketch", func(t *testing.T) { + sketch := createUnweightedVarOptItemsSketch(t, 32, 32) + require.NoError(t, sketch.Update(100, 100.0)) + require.NoError(t, sketch.Update(101, 101.0)) + + totalWeight, err := 
sketch.EstimateSubsetSum(func(int64) bool { return true }) + require.NoError(t, err) + cumulativeWeight := 0.0 + for sample := range sketch.All() { + cumulativeWeight += sample.Weight + } + require.InDelta(t, 1.0, cumulativeWeight/totalWeight.TotalSketchWeight, varOptItemsSerializationEpsilon) + + samples := collectVarOptItemsSamples(sketch) + require.GreaterOrEqual(t, len(samples), 2) + require.InDelta(t, 100.0, samples[0].Weight, varOptItemsSerializationEpsilon) + require.InDelta(t, 101.0, samples[1].Weight, varOptItemsSerializationEpsilon) + require.Equal(t, int64(100), samples[0].Item) + require.Equal(t, int64(101), samples[1].Item) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) + require.Equal(t, preambleLongsFull, data[0]&0x3f) + + loaded, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[int64](data[:len(data)-100], common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "unexpected EOF") + }) + + t.Run("string sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](5) + require.NoError(t, err) + for _, item := range []string{"a", "bc", "def", "ghij", "klmno"} { + require.NoError(t, sketch.Update(item, 1.0)) + } + require.NoError(t, sketch.Update("heavy item", 100.0)) + + data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchStringSerDe{}) + loaded, err := Decode[string](data, common.ItemSketchStringSerDe{}) + require.NoError(t, err) + assertVarOptItemsSketchEqual(t, sketch, loaded) + + _, err = Decode[string](data[:len(data)-12], common.ItemSketchStringSerDe{}) + require.ErrorContains(t, err, "offset out of bounds") + }) + }) +} + +func createUnweightedVarOptItemsSketch(t *testing.T, k int, n int) *VarOptItemsSketch[int64] { + t.Helper() + + sketch, err := NewVarOptItemsSketch[int64](uint(k)) + require.NoError(t, err) + for i := 0; i < n; i++ { + require.NoError(t, sketch.Update(int64(i), 1.0)) 
+ } + return sketch +} + +func encodeVarOptItemsSketch[T any](t *testing.T, sketch *VarOptItemsSketch[T], serde common.ItemSketchSerde[T]) []byte { + t.Helper() + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, serde) + require.NoError(t, encoder.Encode(sketch)) + return buf.Bytes() +} + +func assertVarOptItemsSketchEqual[T comparable](t *testing.T, expected *VarOptItemsSketch[T], actual *VarOptItemsSketch[T]) { + t.Helper() + + require.Equal(t, expected.K(), actual.K()) + require.Equal(t, expected.N(), actual.N()) + require.Equal(t, expected.NumSamples(), actual.NumSamples()) + require.Equal(t, expected.H(), actual.H()) + require.Equal(t, expected.R(), actual.R()) + + expectedSamples := collectVarOptItemsSamples(expected) + actualSamples := collectVarOptItemsSamples(actual) + require.Len(t, actualSamples, len(expectedSamples)) + for i := range expectedSamples { + require.Equal(t, expectedSamples[i].Item, actualSamples[i].Item) + require.InDelta(t, expectedSamples[i].Weight, actualSamples[i].Weight, varOptItemsSerializationEpsilon) + } +} + +func collectVarOptItemsSamples[T any](sketch *VarOptItemsSketch[T]) []Sample[T] { + samples := make([]Sample[T], 0, sketch.NumSamples()) + for sample := range sketch.All() { + samples = append(samples, sample) + } + return samples +} + +func cloneBytes(src []byte) []byte { + dst := make([]byte, len(src)) + copy(dst, src) + return dst +} diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n0_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n0_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..e4505fe12adbd93fea1b01467102f5f2af82bf2b GIT binary patch literal 8 PcmX@e#LJ?Ebd(h=QE&_*uJMu)%>NwE4yM&k z@`CBRLJ<2oq2jA9L-fC{fY`^v3Q_m^Da5=4HBPX;!cYi*ek(-3zC6S|3+?#8@^kM& K^~1q60>at54MGbRK-9V1g770eA^ZeE2yL$l5my(6 z(8m`*=vm?r`l~fWUG8ZJU(yjmA997z{F@+j-~61U;d@?%(BGXQ;_tm6^a(GBc-|!l4FEXQAFTiY literal 0 HcmV?d00001 diff --git 
a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10000_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n10000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..4947cecbbcfbb0158d506492cff35821fcff6b1f GIT binary patch literal 288 zcmX@Y#LJ+-z`!7&4kVyJ0mkS!?2yg_;k<#;uKW=GNpT20NfAO%fyxWAK=_}9A#{ZV zgr3g}q3e|(^jjtf?Z*wF-^)ViwNQGx7=$khHUB%5wu7p_#txBpflv%pQ1>i?+M^G3 g&nu`qtf1y{i9j?+utDe=sDG58{0)2%@e-&y06v=)M*si- literal 0 HcmV?d00001 diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n1000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..c2dfc8f810bc8a57194d4ec65c559248ee68d98b GIT binary patch literal 288 zcmX@Y#LJ+-z`*c=8Aw2Z0*v9%?-0re;V41HqM`H$CI~|v%6|mq|Ax{AP<;oX=ITRf zXQ;RilP{%6`=Gts5)7w{&c9gFw~q(D19AD--6O9P`Uz2|AXp_ QgqrggYVKDkeHBUr07G~bL;wH) literal 0 HcmV?d00001 diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_long_n100_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..962473a4bf232e86e87393724b651db7f9fa5116 GIT binary patch literal 288 zcmXxWJqvFk~8ZHkUuKPR7B1OdK|7mH8 t7;baD;F1+vo_S@#k}GD+x#orqw;Xxlfp0#z{ACi^ujy&z`UM;q(5kuB)p{YEYg);E6!sQ1-y1pWwg#n(_Bv z{{D$v9QlZE-g(Sp_a3uFM&zG<9%2S&eKZgVZ1gv;+gpa`RL_OE@k#vannyp|vpwc@ zbJhR4@$a7kj}a({Ay^0tV-cKgPca^fbL9#4mynmlbLz9bTSnePf2ewxyc|x`AEj?J zmRJ8u{VRDTd1c%nPxE`LDv!Xo%%AptU7RN$WbQ0KfVIpIv%ieFI`X<$59?#PzDRz7 zpSQP>yo+}m^Cq}g{(*O2mjBCqKKWrjhM(ZA^}ir5>HV+O+hy*<_E<)JnD;x$@7t3o zFUq^Bx8mJ+us!MWD2-&o%;{vHm;82du?eSyp6BlRt?e>7jKegKQ$IQ8+k(mid|C&?#cF?*--^L!c~ zxO^aXx5o{Vd-l(}#CCi~a%^|%3FQa@^LlYBF7!L68r-{BndB{5aL z1FO6Defz8F+pV64YyDpK@V!{j{AlxM%umq2mLJkzKwedUZu5uL1M0uH_|p2k)B&+>D4#JejnM&EzckGpTM{5~(He+ORCe-*FcCf||Q{0;d{te}3I_pm=! 
ze@DEhUR3`BK8io&3(W_zJPxv85C-EYd$aO&z9T!&fjKdYdLMJ6y_-ipFK&?+)K@@$ z&|M+C5Pq(&2rr81@|xZ)F5fM0Yu2O;fL)T%b(Lfp4a0Oc^aS0r{Gll7-PJXP5*THL_U)b@cSzvf7aX_ z^;n#Xweylj^*-q^d;~Wn20O!OPp^$#axoR ze&xyf)?i2ZH+-r2*5(FdHGSXe+lZTRGj72I-;=_>!&de*$5iH_Rdr$K-coxs$uXr9W z;1ut~dH15cgL^LV%XkGd%w6?182x@@eO>s`8)Evcn|MmhC4ENS^g&v1o`hj z24OIk_uhK%4aDr~IWQ+C>7VGGC*{FBFQ3Zu^8#29L$I8Eg?SOYEHB1Gu{ge>-q1gn zlJZhm8p~jkzA6|d@8EkA-Mii1&GKw~Ij+G<=KJ$-UIj1O7r}qBuR8C}y2%6jdaL*0XK*t{y6-K0 zLH6|Lx7Cm6FK+)99-}W#-(Wrj!}Sm2%lL3U0yo+>S^W}@R3EB7nvcPca4b%6#|@ky z59X8jWSoMvy|)$HW3qRr>zjcyv8cXTyqw?P93G2vF~q)kd_FF~IQ2z*kNt7HitmW$ zOK>TEj-TpV&ZFFwzS|WbKlC7O3Rn>ScagttuBp6*JDTz4xLw|ohw}&aw2{ApZLuA0*VmqRz*KjYu&=Yc z3!ZXcSNQi}H^V9ez zxZZcn;4?FG^RxJDoP)7A7ypCva6U%czmU(gcQKE{vHIfqRo_|L-Ji=l%9rymFaeY8 zPvk2x%>Gq;HGYLjn2c-ixc9!{yX{%WZ>n$LFRO3ltJJT`V|`CgOwrd<-L9K{TQMDtiFTtZTe^0e@b8G|F#a2AH#;~N6qInf1IDd zlXwc#F-1QO1le~^&`JF~U*q1VFid}lzW?dFgj3{K_*J}yqwKlPpXR0Py(Qnz|KNA< Xy#9OqDepbt8Tb(Y#DIPNSRn9!0#6|o literal 0 HcmV?d00001 diff --git a/serialization_test_data/cpp_generated_files/varopt_sketch_string_exact_cpp.sk b/serialization_test_data/cpp_generated_files/varopt_sketch_string_exact_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..2da7e4ed24dcb8dd19a57c0596d540b71f14a439 GIT binary patch literal 2916 zcmXw&drXye7{`xyH*Xo|ez=^&9YHwn<@a9ncz6IYguqK>l1(Iz0V!E&Wjb<9m@Lyw zT{9y^XIkPP-iE1ZZMtkZz^R#tL!FsAb>6TPaO<4sd3yKm`R%~_`Mkg1^L*bE14mgb zgDsZdEOh<9;xG6DXk7uXMXo=V@b5*mTB`Z)Avjdi)mqb4&i^li528=KAAM>LKUWGz zZf#$kd1Mwp|0?W@ni#%cVQvPr?2cU1nV$yLconm!2u9ry8~>bM$iAu26XT_526OoT=L{NSCT6M0_~AlJPk$GyYk_nHuTFF5pOg~9vO;f$ka zNNtYJ`_*9Wbt8Ohyl(+dzFpJXK1J~URdC^C)m{aY3=aTjq1g zkcADF`Fs)_+|>Togw+zCQ-Xvl9N`@M5K_hUSqNqwjk_GfTkGN8Jlt!{F92KOf&=Cm&C+&@2^`;^Y_ z9mo9|2bDKcPL@7PR6i0eNKj*Wxd~(^(Jw@lb~*hefGS%B<_0>T%0~P z?~_qU-2X(#-&uaNbW0-7ArZRU*KInaB=Y_e;Y#Dc4JB;}JeLF*S^V+dEAJ%md=lV< ze`NSh;5o&^oYC+8Gh$yn&nq4_Che`xE{*58#Y3St^L*3bc%GjRHe`9!XZw9TM<3Yt zuWQH%`FNf_s2l4H-#(tJ7s?yj=QXr@xzAqsOD$`uT;=6Cdm*nmZSdw?FVEY{=kW5} 
z-8H0ryFBA53!(L>!{C;isf9yK-8t8@D~=t z`G|pv1I?Ef%Q2jjXc*UZbo8xXqd70p@G#}YlIpq9oSSGU_&#{4H8pl$6N_5H%bdGf$4d*#997u|ebH=KWRyLU;goAc#{WgUs(+s!$1LA!agX>q%Y z^X3Am**2hPnTvDhfYL)X+BoMn2#MwE z?^oD3?>1uMpi~vMpi~vMpi~vMpi~vMpi*qK~_OlK~_OlK~_OlK~_OlK~_ap zMOH;tMOH;tMOH;tMOH;tMOGjykQK-ZWCgMUS%IuTRv;^o)sWSY)sWSY)sWSY)sWSY z)sWSY)sfYa)sfYa)sfYa)sfYa)sfYaHIOxsHIOxsHIOxsHIOxsHIOxsHIX%uHIX%u zHIX%uHIX%uHIX&VL6{9cEfOsYUZ%zdrso}l@PDs{n6L03qF&VxV&0U!5b;~F5ZYrS4_HBp z4}?D74zXv(D<-h`yCewx!wEvaYJ|8ah#z9l+glLzOlSDO>OIduDE5dJ$S Lh(FKufQ1^}`{v5W?B%0g-P#1mQP_Lg*wBhkEW_x(-4MhCt*U8zA)BD-iL(g%J8|C`9~k9fY2f s1)(_&A@U~-A^h#~5IWNhB7P6*u4;aWxOo^vyfp#B|MVZiH=hKd0iw|&dH?_b literal 0 HcmV?d00001 diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n10000_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n10000_go.sk new file mode 100644 index 0000000000000000000000000000000000000000..22ffba36ca07208b7b380666849dffe55db54a3e GIT binary patch literal 288 zcmX@Y#LJ+-z`!7&4kVyJ0mkS!?2xJg;rJ>;Xmt(z_d`5>r!3gsc|S<==yv9}J}>p!!}w`CL%GD3mV=rN2PwGf?v$LDfk@&HV_a-$K=yLDl_$@^?V# PIH)=YD6IyiS3~6ieFzdE literal 0 HcmV?d00001 diff --git a/serialization_test_data/go_generated_files/varopt_sketch_long_n100_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_long_n100_go.sk new file mode 100644 index 0000000000000000000000000000000000000000..c273a9e2d86f36c592a9e8f5fdc100e5c9a8871c GIT binary patch literal 288 zcmXxWF$;oV7=_Wdh^WB^hp_ESaM4L{5NaqIqNRu?!6oRY_QPs8Ubvj+JI^9T#OMEM vX^I$5qh9dBCEr|e%`;op%(&r`58jyb$evej*|6Y_B@e9lV#hlZ_nh$uC24k)f~I zBkm1Fb6-AQ>7PNI|E>K7a3g^-1*xfz%kzPdFhRf24u^EJ#CV zQqPPv`B~{~I9Oi}x`@L*me0*!Cif1V2M^2TqkpBRx;K-)g6hT1FH9G~qF5{;hm+)k z=@R(6xuxjRxP{*qx4UC1T~;o=I}fT4!`pHd!|GTAYvKrf z|HeA}y4c)vcd#d3uA%w@^@{d3=7-3CicR^=66RuIeQC@ME6_IYI{r4Rn?dC!_Dc5=gq4jS4-al`9j!N{(HIp^lA0Q=Cn}%(VRhY zgYk~s5cOgFUHswn2poz19J| zpIrY;dKS*clJ4jtw?uy%oG14UMq(7!ky}77#6`Fmm*7%dhRbmUhPh`I-NpO5$B*W( z#ku@^?msHOQN5e`CVDe&!BOsisQ)&G^M~OM`JK26H`=#{4wc_W@5h4X9-!ZL*GN3f 
zkI_HG+!%heTr7PYr|=izN&YE3jbE921nbDpch@Ogg%|W)#QN@u)PIG)P=2!d_w+is z>vBQ-U+JOpztcDI7T(4?nAE(xbTjwgrz7Zx^nUvu(T{Pjy>awYY^eV^-9qjyemws# ztmU2}_5>w*Q`jkgPk&N=GE9#9^rfWx(uee==BL55m=6E4Cj-6FJyrF;FmF1YRW2K5 z$13J@)jvpI6@72ZwWY`4T+FAQ9}D2SSP%7yDsoTh)Tc|YJJ^-BqRa0^k#Rh<*&oN=53&l z$>-y5;y>d@;8uP`xovcay7K}h{IXO1sQhkv5AMZ%xF0i^cL~SnKcs#*A!pA~ItGtn zEEYBANBRVw#8Wufy#C(X8U9@Q9DN?|y6Xac5qryDrmx^Ecl=EMg4b}VdTVo!(^2N- z!|&XAOW$q0gDLgjrTdvvPG4(W%ilpil7Eb|`Ehhk`@8d>^Iu?bemwoe9VP8a6!hlr zeh_ZsC!v$#S^i~A!B2^A;Y#-;l}p1Upt*d-K!5^t*IHEQFu9zX)9vADBN|e+_$s)vxIrh^6>(a$WU*NH?TIcz^^R#A%`CeF!KSO^FdZ4==@N4tyU|p<-^|6!v$@Dkke}s*( z32v8bN>A6{oNlLH&K+U=Li|?rpZeO+ZE=9U_H+lV#DA=>GrtR7F~1v~pKk2E_27qN z0dqEEZ~jGnef9O>_r-p=NM8g!0K4dm#zFj8bLZetez52Ij2@07aEU$trPt`sZT`>n zSoLxEIgZB(I1xANJ7xbQ{$yNZ@00{zZW=uuXW&eng|l%Ep430t+^_lb@PGVBItu4w zTK6ub3)1uSE#aS_2YNrt`73ZG-gM7udJRV7D)WcQt>0T>Suj5ZB7TH1`PqD8^uAxmbF?dHyT~PVi6SEcLJP zkp463XR)LEyQ|017t|B+pSrKT`W5x7IMJTk>eu+!@doCVOOC6sH{OyfLD#nb5B^=e zhd1O7y05f+4R>7UkLEwo7l+&V&*)|HFX)#Tk0Zf0@op=fnKiM!l}* zXlG6#^}_hATv7U&Tygpz*wLI4bV)3QrSYHm9+trnEQ{r^JT|wtl=obb--KU@j#iJv zP=2`F7|(MI8}Vz%)x_QUi(wspYI;1@=QqI4<~5=N-pe2MHR0dT-;{2Kzv;gv-;y7O zpWrrgThqDa+tTfDnt7Y->&Ra({{z1Zzbkgb?ieG#)824?Pwa&~?E94dQoaveOZ}0( zqcB1}i9O%Sox?%ugK-FUF#ijEz0CVq-w3&p7$!f8<gC9*g<)kCh+KpMVqbbM-Ij zNw|jJ-JGlLo2vc`9fj%T_t0TDTRt5fA~!c-pZW9XV9ygthpNw~E2|&o$Iz{?HZGO_ zKyEp`0$1WH9Hg&``=a^X=ymjZ+<*i06_DG+U*~zZ(D&tz^5d}?zl}S0$R~V}J6nA> ze-Cbzk1(g2{r{3bAa@WCVPg3s^illIykm4MZZ)^HxrgZ!@_G2D<=3m9q0izuY_IPp z`T~x3*Co1}xoi1X`91l;{37OESHFQ#{NLy@?z%}YqtBajhu@T*iTC*T@c}->Kk*Uf d)mO~^IQ~<7hR^W@KGGLYFLGai@9TiT{{fOBAm9K1 literal 0 HcmV?d00001 diff --git a/serialization_test_data/go_generated_files/varopt_sketch_string_exact_go.sk b/serialization_test_data/go_generated_files/varopt_sketch_string_exact_go.sk new file mode 100644 index 0000000000000000000000000000000000000000..2da7e4ed24dcb8dd19a57c0596d540b71f14a439 GIT binary patch literal 2916 zcmXw&drXye7{`xyH*Xo|ez=^&9YHwn<@a9ncz6IYguqK>l1(Iz0V!E&Wjb<9m@Lyw 
zT{9y^XIkPP-iE1ZZMtkZz^R#tL!FsAb>6TPaO<4sd3yKm`R%~_`Mkg1^L*bE14mgb zgDsZdEOh<9;xG6DXk7uXMXo=V@b5*mTB`Z)Avjdi)mqb4&i^li528=KAAM>LKUWGz zZf#$kd1Mwp|0?W@ni#%cVQvPr?2cU1nV$yLconm!2u9ry8~>bM$iAu26XT_526OoT=L{NSCT6M0_~AlJPk$GyYk_nHuTFF5pOg~9vO;f$ka zNNtYJ`_*9Wbt8Ohyl(+dzFpJXK1J~URdC^C)m{aY3=aTjq1g zkcADF`Fs)_+|>Togw+zCQ-Xvl9N`@M5K_hUSqNqwjk_GfTkGN8Jlt!{F92KOf&=Cm&C+&@2^`;^Y_ z9mo9|2bDKcPL@7PR6i0eNKj*Wxd~(^(Jw@lb~*hefGS%B<_0>T%0~P z?~_qU-2X(#-&uaNbW0-7ArZRU*KInaB=Y_e;Y#Dc4JB;}JeLF*S^V+dEAJ%md=lV< ze`NSh;5o&^oYC+8Gh$yn&nq4_Che`xE{*58#Y3St^L*3bc%GjRHe`9!XZw9TM<3Yt zuWQH%`FNf_s2l4H-#(tJ7s?yj=QXr@xzAqsOD$`uT;=6Cdm*nmZSdw?FVEY{=kW5} z-8H0ryFBA53!(L>!{C;isf9yK-8t8@D~=t z`G|pv1I?Ef%Q2jjXc*UZbo8xXqd70p@G#}YlIpq9oSSGU_&#{4H8pl$6N_5H%bdGf$4d*#997u|ebH=KWRyLU;goAc#{WgUs(+s!$1LA!agX>q%Y z^X3Am**2hPnTvDhfYL)X+BoMn2#MwE z?^oD3?>1uMpi~vMpi~vMpi~vMpi~vMpi*qK~_OlK~_OlK~_OlK~_OlK~_ap zMOH;tMOH;tMOH;tMOH;tMOGjykQK-ZWCgMUS%IuTRv;^o)sWSY)sWSY)sWSY)sWSY z)sWSY)sfYa)sfYa)sfYa)sfYa)sfYaHIOxsHIOxsHIOxsHIOxsHIOxsHIX%uHIX%u zHIX%uHIX%uHIX&VL6{9cEfOss6y0Tk%X8xJsBd;wuTF=PG};8Hh%>%XYyZ& zJJPu!_B9-XsAnwW1FKI6fT&BB0CO2!!yxu3F+Mm}ug!@K_ OJ01l>=qp!P!Quc5A|hr0 literal 0 HcmV?d00001 diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n100000_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n100000_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..a7a0b7926c26713fabb206aaa32a5f995aa154be GIT binary patch literal 288 zcmX@Y#LJ+-z`(GejS)yffC7w>^}}I;EQHg18$vVbLFm(SA@tR45PIJP2<<2cp;I10 zXy?}udZ90bPX7m?%a|ed?c;&aCc+SUPYi@EFoV!4Q2AL*5Pen35OMiA5dMQH5PIQV s2rU%}k>AV$;oob5@ON#3@VDwf=rU)BcUB?O_%I3aX^6ohV;h0xca`edN)w}YAo0IKj70{{R3 literal 0 HcmV?d00001 diff --git a/serialization_test_data/java_generated_files/varopt_sketch_long_n1000_java.sk b/serialization_test_data/java_generated_files/varopt_sketch_long_n1000_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..460dd4979e1f584dcffb99f09ee7caf2a6de474a GIT binary 
patch literal 288 zcmX@Y#LJ+-z`*c=8Aw2Z0*v9%?~us|;cSJ<&t!t|qo8W)pnMmo{BtOu1xnjO)t!gR zOF+%Df{HJOs=E!P&q2inpnMl7e;!ny6O=ZD%3p)Zn?S{PLhZQE$E%T=jX_$b1xEfI0MM!k^{$>?>++i9CyYM$1p)3iB)V zt-{r~2G`;`T#p;DvHgcJ4{lN4isRk4o$tV%7>%vmx0`>)_ws!>-M*B#77wZ)!cgCN zgdfFx`j7JycoMI8ca85kqd%T~XY`%L{qih$L4MSpCFaZHH+WUwH9Vky2#&y;>bLNA z?3{XA`|ip=G3^}9^Lm--Uwd(HEB?xyQV-}3kpN(h7P4?&HeeFA`|El_Y`4o9x{n_MieP;oAL9D7?h!@5p7;b+tUL1$T z&e{8`zLM%=-BFsi(N~s7tC!~$Fvy-je#G1X?*+-LU{(AQTbQrTYv4+GExykCS?w)~ z_0;QQ1MFhHA+KycGakkl`g-AD_ct@QOizf)-V)2`AE-VETd5D`H{Ej%hsZ;*ul#!+hT)jp zzTrHV-(`(_D~?i+z>Dr1!;9)`ZN3}-ffsSd1pT$-k$j1LP4(^MQ`D#8G~BM<8E498 z;cT3Pb8(yf^LZ35z#w;**Y}nEzo;+9CAbuq;c{GoFYH^zTgCpqyb2%Uo%Qk!xDjjU zJLIk{@~xOmo(^}&cVaZ|!rizBTX_Gj`Tg<(xKzH*_bkK1>bKRKx$~I(TYiF{#6{*$ z@zc0R{~3N3zp(#2zkuQTFY(KG1$Uae#xwZN>wK2^oBS5$Hg|`Q(RY{M!~3|&-amN> z^A+Te<+XhG5c?9SKUY7>OJaO{ss0LIV>9(J-g_qxh#T|o|2Q~GUpyWkBi%7lJ(2ta zOpIgPos?JMo6N5=-y*ipJ*oAl!G!w0)|XEHTz_}<`aDWsIe7wkBCM?*M?I^3*)T*s z2d}5E27V&H#tYl~sk{l#$MfSF{RMd;ze6Ej&AUZ-Q7neV@pH_pzXUIdL*=FUEBBP; zKk)MW69343f%3HS$~*{j+rQHNU&^atb*zDN^_8@*w!98rm)GMR?GMF4>aE?^NMB=Y zg8S5`d+$ec59DuobNwx_C6+MPnzzBW*bduc2kdCBuKk_m!Po^es?YOX-Q_*7C-%a- z?(V~<^RC|OFaNK60MBIqWPKs>@A*6ZrSuI^55=KaQay}^<1ieK0!yokNgzCT($mwFPOQaz04jitVP7^trR zPpR*m@BGtyNqAEAp62?f7dQVoeu2mIB{o+|Uf*1Qd0BZmoG7orD`Fs4!fy5l@hTW8 z-)YZ!?@i-1^wq?1>J81+k=Mo6zCW412J+GNHRO%3F*d>M=KioJj(3`=mzB?uw~)8Q zR@fTb;9UJ#?Qbs+aaV15CwXTKj&<|%u$#O)_Q0Na%bmS>Z*zTlKkSbi-7$dIG(U)k z;9&d~zr#xUL-`Tklg!>Qc{mQk;rQBpE6t6R?=>I6N8=bAi{r4E{;KxPTH+xpd zlY3_sUyUR5ujT7-oA);G2Ie>M%{a~c*4R1Rjyv##zG%MCzP8@kBj1bra6e`=-$?&K zc^2P!m>jbLZ64@Q3O}+;vGk%={HzS3OXF0lcn$ z0~eTkrhZ#~2dBvI^85O~a?b<#Uim}*2p{7>d!F)Vn9H6%>M88`H|AG=rT;a)!M7Np N|NZ|X@c!=x{vT$4Cl1(Iz0V!E&Wjb<9m@Lyw zT{9y^XIkPP-iE1ZZMtkZz^R#tL!FsAb>6TPaO<4sd3yKm`R%~_`Mkg1^L*bE14mgb zgDsZdEOh<9;xG6DXk7uXMXo=V@b5*mTB`Z)Avjdi)mqb4&i^li528=KAAM>LKUWGz zZf#$kd1Mwp|0?W@ni#%cVQvPr?2cU1nV$yLconm!2u9ry8~>bM$iAu26XT_526OoT=L{NSCT6M0_~AlJPk$GyYk_nHuTFF5pOg~9vO;f$ka 
zNNtYJ`_*9Wbt8Ohyl(+dzFpJXK1J~URdC^C)m{aY3=aTjq1g zkcADF`Fs)_+|>Togw+zCQ-Xvl9N`@M5K_hUSqNqwjk_GfTkGN8Jlt!{F92KOf&=Cm&C+&@2^`;^Y_ z9mo9|2bDKcPL@7PR6i0eNKj*Wxd~(^(Jw@lb~*hefGS%B<_0>T%0~P z?~_qU-2X(#-&uaNbW0-7ArZRU*KInaB=Y_e;Y#Dc4JB;}JeLF*S^V+dEAJ%md=lV< ze`NSh;5o&^oYC+8Gh$yn&nq4_Che`xE{*58#Y3St^L*3bc%GjRHe`9!XZw9TM<3Yt zuWQH%`FNf_s2l4H-#(tJ7s?yj=QXr@xzAqsOD$`uT;=6Cdm*nmZSdw?FVEY{=kW5} z-8H0ryFBA53!(L>!{C;isf9yK-8t8@D~=t z`G|pv1I?Ef%Q2jjXc*UZbo8xXqd70p@G#}YlIpq9oSSGU_&#{4H8pl$6N_5H%bdGf$4d*#997u|ebH=KWRyLU;goAc#{WgUs(+s!$1LA!agX>q%Y z^X3Am**2hPnTvDhfYL)X+BoMn2#MwE z?^oD3?>1uMpi~vMpi~vMpi~vMpi~vMpi*qK~_OlK~_OlK~_OlK~_OlK~_ap zMOH;tMOH;tMOH;tMOH;tMOGjykQK-ZWCgMUS%IuTRv;^o)sWSY)sWSY)sWSY)sWSY z)sWSY)sfYa)sfYa)sfYa)sfYa)sfYaHIOxsHIOxsHIOxsHIOxsHIOxsHIX%uHIX%u zHIX%uHIX%uHIX&VL6{9cEfOsiDVgv|q5P)(HGx0Vw0NDyqK8%KNCpZX#0VCaL38;M#`vn{ZivIWi z|9^>TS945C~XX-HK4Q>l-7aLAM6>xz!u84hte>6 r{GfbTn84IILd89yv<;MYfzm2aS`A9W?01Foq3&z&h4Q_jGy?+wZJ;O0 literal 0 HcmV?d00001 From 064bb6297878cdee114bc804b603a12b791defa1 Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 9 May 2026 23:13:37 +0900 Subject: [PATCH 2/3] test : add serialize_size_bytes cases --- sampling/varopt_items_sketch.go | 7 ++- sampling/varopt_items_sketch_test.go | 76 ++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/sampling/varopt_items_sketch.go b/sampling/varopt_items_sketch.go index a55db1a..aa93a7d 100644 --- a/sampling/varopt_items_sketch.go +++ b/sampling/varopt_items_sketch.go @@ -782,7 +782,7 @@ func (s *VarOptItemsSketch[T]) EstimateSubsetSum(predicate func(T) bool) (Sample } // SerializedSizeBytes computes size needed to serialize the current state of the sketch. 
-func (s *VarOptItemsSketch[T]) SerializedSizeBytes(serde ItemsSerDe[T]) int { +func (s *VarOptItemsSketch[T]) SerializedSizeBytes(serde common.ItemSketchSerde[T]) int { if s.IsEmpty() { return preambleIntsEmpty << 3 } @@ -801,6 +801,9 @@ func (s *VarOptItemsSketch[T]) SerializedSizeBytes(serde ItemsSerDe[T]) int { } } - numBytes += serde.SizeOfItem() * (s.h + s.r) + for sample := range s.All() { + numBytes += serde.SizeOf(sample.Item) + } + return numBytes } diff --git a/sampling/varopt_items_sketch_test.go b/sampling/varopt_items_sketch_test.go index f0a24a9..160d7c2 100644 --- a/sampling/varopt_items_sketch_test.go +++ b/sampling/varopt_items_sketch_test.go @@ -18,9 +18,11 @@ package sampling import ( + "bytes" "math" "testing" + "github.com/apache/datasketches-go/common" "github.com/stretchr/testify/assert" ) @@ -124,6 +126,80 @@ func TestVarOptItemsSketch_Reset(t *testing.T) { }) } +func TestVarOptItemsSketch_SerializedSizeBytes(t *testing.T) { + t.Run("empty sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](10) + assert.NoError(t, err) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchLongSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + assert.Equal(t, int(preambleLongsEmpty<<3), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + }) + + t.Run("warmup sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](10) + assert.NoError(t, err) + for i := int64(1); i <= 5; i++ { + err = sketch.Update(i, float64(i)) + assert.NoError(t, err) + } + assert.Equal(t, 0, sketch.R()) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchLongSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + assert.Equal(t, 
int(preambleLongsWarmup<<3)+sketch.H()*8+sketch.H()*8, sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + }) + + t.Run("warmup string sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[string](10) + assert.NoError(t, err) + for i, item := range []string{"a", "bc", "def"} { + err = sketch.Update(item, float64(i+1)) + assert.NoError(t, err) + } + assert.Equal(t, 0, sketch.R()) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchStringSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + expectedSize := int(preambleLongsWarmup << 3) + expectedSize += sketch.H() * 8 + expectedSize += common.ItemSketchStringSerDe{}.SizeOf("a") + expectedSize += common.ItemSketchStringSerDe{}.SizeOf("bc") + expectedSize += common.ItemSketchStringSerDe{}.SizeOf("def") + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchStringSerDe{})) + assert.Equal(t, expectedSize, sketch.SerializedSizeBytes(common.ItemSketchStringSerDe{})) + }) + + t.Run("full sketch", func(t *testing.T) { + sketch, err := NewVarOptItemsSketch[int64](10) + assert.NoError(t, err) + for i := int64(1); i <= 20; i++ { + err = sketch.Update(i, 1.0) + assert.NoError(t, err) + } + assert.Greater(t, sketch.R(), 0) + + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder(&buf, common.ItemSketchLongSerDe{}) + err = encoder.Encode(sketch) + assert.NoError(t, err) + + assert.Equal(t, buf.Len(), sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + assert.Equal(t, int(preambleLongsFull<<3)+sketch.H()*8+(sketch.H()+sketch.R())*8, sketch.SerializedSizeBytes(common.ItemSketchLongSerDe{})) + }) +} + func TestVarOptItemsSketch_All(t *testing.T) { t.Run("empty sketch", func(t *testing.T) { sketch, err := NewVarOptItemsSketch[int](10) From fdd21ce459f5bb37a9d4ad05c926f940daca3430 Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 9 May 2026 23:38:36 +0900 Subject: [PATCH 3/3] test: more cover edge cases --- 
.../varopt_items_sketch_serialization_test.go | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/sampling/varopt_items_sketch_serialization_test.go b/sampling/varopt_items_sketch_serialization_test.go index fe7a514..5fb8d1e 100644 --- a/sampling/varopt_items_sketch_serialization_test.go +++ b/sampling/varopt_items_sketch_serialization_test.go @@ -265,6 +265,14 @@ func TestVarOptItemsSketchCppCompat(t *testing.T) { } func TestVarOptItemsSketchSerialization(t *testing.T) { + t.Run("nil sketch encode", func(t *testing.T) { + var buf bytes.Buffer + encoder := NewVarOptItemsSketchEncoder[int64](&buf, common.ItemSketchLongSerDe{}) + + err := encoder.Encode(nil) + require.ErrorContains(t, err, "cannot encode nil VarOptItemsSketch") + }) + t.Run("bad serialization version", func(t *testing.T) { sketch := createUnweightedVarOptItemsSketch(t, 16, 16) data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) @@ -331,6 +339,14 @@ func TestVarOptItemsSketchSerialization(t *testing.T) { _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) require.ErrorContains(t, err, "invalid state in warmup mode: expected r==0, got r=4294967168") }) + + t.Run("warmup preamble in full mode", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + data[0] = (data[0] & 0xc0) | preambleLongsWarmup + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid preamble longs: expected full because n>k, got 3") + }) }) t.Run("empty sketch", func(t *testing.T) { @@ -361,7 +377,41 @@ func TestVarOptItemsSketchSerialization(t *testing.T) { require.ErrorContains(t, err, "invalid preamble longs: expected warmup or full, got 1") }) - t.Run("corrupt serialized weight", func(t *testing.T) { + t.Run("invalid full mode H plus R count", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 
33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint32(data[20:], 0) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "invalid state in full mode: expected h+r==k") + }) + + t.Run("corrupt serialized R weight", func(t *testing.T) { + t.Run("zero", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint64(data[24:], math.Float64bits(0)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "data is corrupt in full mode: invalid R region weight") + }) + + t.Run("negative", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint64(data[24:], math.Float64bits(-1.5)) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "data is corrupt in full mode: invalid R region weight") + }) + + t.Run("nan", func(t *testing.T) { + data := encodeVarOptItemsSketch(t, createUnweightedVarOptItemsSketch(t, 32, 33), common.ItemSketchLongSerDe{}) + binary.LittleEndian.PutUint64(data[24:], math.Float64bits(math.NaN())) + + _, err := Decode[int64](data, common.ItemSketchLongSerDe{}) + require.ErrorContains(t, err, "data is corrupt in full mode: invalid R region weight") + }) + }) + + t.Run("corrupt serialized H weight", func(t *testing.T) { sketch := createUnweightedVarOptItemsSketch(t, 100, 20) data := encodeVarOptItemsSketch(t, sketch, common.ItemSketchLongSerDe{}) preambleBytes := int(data[0]&0x3f) << 3