Skip to content

Commit 572c4e5

Browse files
authored
Merge pull request #138 from proost/doc-utf8-compatibility
doc: utf8 compatibility
2 parents a39564d + 5df4bac commit 572c4e5

12 files changed

Lines changed: 177 additions & 14 deletions

common/item_sketch_string.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ func (f ItemSketchStringSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems
6767
return offset - offsetBytes, nil
6868
}
6969

70+
// SerializeOneToSlice writes the item to a byte slice.
71+
//
72+
// If the sketch contains string values and the caller cares about
73+
// cross-language compatibility, it is the caller's responsibility to ensure
74+
// that the input string is encoded as valid UTF-8.
7075
func (f ItemSketchStringSerDe) SerializeOneToSlice(item string) []byte {
7176
if len(item) == 0 {
7277
return []byte{}
@@ -78,6 +83,11 @@ func (f ItemSketchStringSerDe) SerializeOneToSlice(item string) []byte {
7883
return bytesOut
7984
}
8085

86+
// SerializeManyToSlice writes items to a byte slice.
87+
//
88+
// If the sketch contains string values and the caller cares about
89+
// cross-language compatibility, it is the caller's responsibility to ensure
90+
// that those strings are encoded as valid UTF-8.
8191
func (f ItemSketchStringSerDe) SerializeManyToSlice(items []string) []byte {
8292
if len(items) == 0 {
8393
return []byte{}
@@ -98,6 +108,11 @@ func (f ItemSketchStringSerDe) SerializeManyToSlice(items []string) []byte {
98108
return bytesOut
99109
}
100110

111+
// DeserializeManyFromSlice reconstructs strings from their serialized form.
112+
//
113+
// If the sketch contains string values and the caller cares about
114+
// cross-language compatibility, it is the caller's responsibility to ensure
115+
// that the serialized string data is encoded as valid UTF-8.
101116
func (f ItemSketchStringSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]string, error) {
102117
if numItems <= 0 {
103118
return []string{}, nil

frequencies/items_sketch.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ func NewFrequencyItemsSketchWithMaxMapSize[C comparable](maxMapSize int, hasher
105105
// sketch and must be a power of 2. The maximum capacity of this internal hash map is
106106
// 0.75 * maxMapSize. Both the ultimate accuracy and size of this sketch are a
107107
// function of maxMapSize.
108+
//
109+
// If the sketch contains string values and the caller cares about
110+
// cross-language compatibility, it is the caller's responsibility to ensure
111+
// that the serialized string data is encoded as valid UTF-8.
108112
func NewFrequencyItemsSketchFromSlice[C comparable](slc []byte, hasher common.ItemSketchHasher[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) {
109113
if serde == nil {
110114
return nil, errors.New("no SerDe provided")
@@ -327,6 +331,10 @@ func (i *ItemsSketch[C]) IsEmpty() bool {
327331
// Update this sketch with an item and a frequency count of one.
328332
//
329333
// item for which the frequency should be increased.
334+
//
335+
// If the sketch contains string values and the caller cares about
336+
// cross-language compatibility, it is the caller's responsibility to ensure
337+
// that the input string is encoded as valid UTF-8.
330338
func (i *ItemsSketch[C]) Update(item C) error {
331339
return i.UpdateMany(item, 1)
332340
}
@@ -337,6 +345,10 @@ func (i *ItemsSketch[C]) Update(item C) error {
337345
// and is only used by the sketch to determine uniqueness.
338346
// count the amount by which the frequency of the item should be increased.
339347
// A count of zero is a no-op, and a negative count returns an error.
348+
//
349+
// If the sketch contains string values and the caller cares about
350+
// cross-language compatibility, it is the caller's responsibility to ensure
351+
// that input strings are encoded as valid UTF-8.
340352
func (i *ItemsSketch[C]) UpdateMany(item C, count int64) error {
341353
if internal.IsNil(item) || count == 0 {
342354
return nil
@@ -374,6 +386,10 @@ func (i *ItemsSketch[C]) UpdateMany(item C, count int64) error {
374386
//
375387
// return a sketch whose estimates are within the guarantees of the largest error tolerance
376388
// of the two merged sketches.
389+
//
390+
// If the sketch contains string values and the caller cares about
391+
// cross-language compatibility, it is the caller's responsibility to ensure
392+
// that string values in both sketches are encoded as valid UTF-8.
377393
func (i *ItemsSketch[C]) Merge(other *ItemsSketch[C]) (*ItemsSketch[C], error) {
378394
if other == nil || other.IsEmpty() {
379395
return i, nil
@@ -412,7 +428,11 @@ func (i *ItemsSketch[C]) ToString() (string, error) {
412428
return sb.String(), nil
413429
}
414430

415-
// ToSlice returns a slice representation of this sketch
431+
// ToSlice returns a slice representation of this sketch.
432+
//
433+
// If the sketch contains string values and the caller cares about
434+
// cross-language compatibility, it is the caller's responsibility to ensure
435+
// that the serialized string data is encoded as valid UTF-8.
416436
func (i *ItemsSketch[C]) ToSlice() ([]byte, error) {
417437
if i.hashMap.serde == nil {
418438
return nil, errors.New("no SerDe provided")

kll/items_sketch.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ func NewKllItemsSketchWithDefault[C comparable](compareFn common.CompareFn[C], s
108108
}
109109

110110
// NewKllItemsSketchFromSlice creates a new ItemsSketch from the given byte slice (serialized sketch).
111+
//
112+
// If the sketch contains string values and the caller cares about
113+
// cross-language compatibility, it is the caller's responsibility to ensure
114+
// that the serialized string data is encoded as valid UTF-8.
111115
func NewKllItemsSketchFromSlice[C comparable](sl []byte, compareFn common.CompareFn[C], serde common.ItemSketchSerde[C]) (*ItemsSketch[C], error) {
112116
if serde == nil {
113117
return nil, fmt.Errorf("no SerDe provided")
@@ -447,12 +451,20 @@ func (s *ItemsSketch[C]) GetSortedView() (*ItemsSketchSortedView[C], error) {
447451
}
448452

449453
// Update this sketch with the given item.
454+
//
455+
// If the sketch contains string values and the caller cares about
456+
// cross-language compatibility, it is the caller's responsibility to ensure
457+
// that the input string is encoded as valid UTF-8.
450458
func (s *ItemsSketch[C]) Update(item C) {
451459
s.updateItem(item, s.compareFn)
452460
s.sortedView = nil
453461
}
454462

455463
// Merge the given sketch into this sketch.
464+
//
465+
// If the sketch contains string values and the caller cares about
466+
// cross-language compatibility, it is the caller's responsibility to ensure
467+
// that string values in both sketches are encoded as valid UTF-8.
456468
func (s *ItemsSketch[C]) Merge(other *ItemsSketch[C]) {
457469
if other.IsEmpty() {
458470
return
@@ -476,6 +488,10 @@ func (s *ItemsSketch[C]) Reset() {
476488
}
477489

478490
// ToSlice returns the serialized byte array of this sketch.
491+
//
492+
// If the sketch contains string values and the caller cares about
493+
// cross-language compatibility, it is the caller's responsibility to ensure
494+
// that the serialized string data is encoded as valid UTF-8.
479495
func (s *ItemsSketch[C]) ToSlice() ([]byte, error) {
480496
if s.serde == nil {
481497
return nil, fmt.Errorf("no SerDe provided")

sampling/reservoir_items_sketch.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ func NewReservoirItemsSketch[T any](
9595
}
9696

9797
// Update adds an item to the sketch using the reservoir sampling algorithm.
98+
//
99+
// If the sketch contains string values and the caller cares about
100+
// cross-language compatibility, it is the caller's responsibility to ensure
101+
// that the input string is encoded as valid UTF-8.
98102
func (s *ReservoirItemsSketch[T]) Update(item T) {
99103
if s.n < int64(s.k) {
100104
// Initial phase: store all items until reservoir is full
@@ -304,6 +308,10 @@ func resizeFactorFromHeaderByte(b byte) (ResizeFactor, error) {
304308
}
305309

306310
// ToSlice serializes the sketch to a byte slice.
311+
//
312+
// If the sketch contains string values and the caller cares about
313+
// cross-language compatibility, it is the caller's responsibility to ensure
314+
// that the serialized string data is encoded as valid UTF-8.
307315
func (s *ReservoirItemsSketch[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) {
308316
rfBits, err := resizeFactorBitsFor(s.rf)
309317
if err != nil {
@@ -365,6 +373,10 @@ func (s *ReservoirItemsSketch[T]) String() string {
365373
}
366374

367375
// NewReservoirItemsSketchFromSlice deserializes a sketch from a byte slice.
376+
//
377+
// If the sketch contains string values and the caller cares about
378+
// cross-language compatibility, it is the caller's responsibility to ensure
379+
// that the serialized string data is encoded as valid UTF-8.
368380
func NewReservoirItemsSketchFromSlice[T any](data []byte, serde ItemsSerDe[T]) (*ReservoirItemsSketch[T], error) {
369381
if len(data) < 8 {
370382
return nil, errors.New("data too short")

sampling/reservoir_items_union.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ func NewReservoirItemsUnion[T any](maxK int) (*ReservoirItemsUnion[T], error) {
5353
}
5454

5555
// Update adds a single item to the union.
56+
//
57+
// If the sketch contains string values and the caller cares about
58+
// cross-language compatibility, it is the caller's responsibility to ensure
59+
// that the input string is encoded as valid UTF-8.
5660
func (u *ReservoirItemsUnion[T]) Update(item T) {
5761
if u.gadget == nil {
5862
u.gadget, _ = NewReservoirItemsSketch[T](u.maxK)
@@ -62,6 +66,10 @@ func (u *ReservoirItemsUnion[T]) Update(item T) {
6266

6367
// UpdateSketch merges another sketch into the union.
6468
// This implements Java's update(ReservoirItemsSketch) with twoWayMergeInternal logic.
69+
//
70+
// If the sketch contains string values and the caller cares about
71+
// cross-language compatibility, it is the caller's responsibility to ensure
72+
// that string values in both sketches are encoded as valid UTF-8.
6573
func (u *ReservoirItemsUnion[T]) UpdateSketch(sketch *ReservoirItemsSketch[T]) error {
6674
if sketch == nil || sketch.IsEmpty() {
6775
return nil
@@ -91,6 +99,10 @@ func (u *ReservoirItemsUnion[T]) UpdateSketch(sketch *ReservoirItemsSketch[T]) e
9199

92100
// UpdateFromRaw creates a sketch from raw components and merges it.
93101
// Useful in distributed environments. Items slice is used directly, not copied.
102+
//
103+
// If the sketch contains string values and the caller cares about
104+
// cross-language compatibility, it is the caller's responsibility to ensure
105+
// that input strings are encoded as valid UTF-8.
94106
func (u *ReservoirItemsUnion[T]) UpdateFromRaw(n int64, k int, items []T) error {
95107
if len(items) == 0 {
96108
return nil
@@ -234,6 +246,10 @@ const (
234246
)
235247

236248
// ToSlice serializes the union to a byte slice.
249+
//
250+
// If the sketch contains string values and the caller cares about
251+
// cross-language compatibility, it is the caller's responsibility to ensure
252+
// that the serialized string data is encoded as valid UTF-8.
237253
func (u *ReservoirItemsUnion[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) {
238254
empty := u.gadget == nil || u.gadget.IsEmpty()
239255

@@ -267,6 +283,10 @@ func (u *ReservoirItemsUnion[T]) ToSlice(serde ItemsSerDe[T]) ([]byte, error) {
267283
}
268284

269285
// NewReservoirItemsUnionFromSlice deserializes a union from a byte slice.
286+
//
287+
// If the sketch contains string values and the caller cares about
288+
// cross-language compatibility, it is the caller's responsibility to ensure
289+
// that the serialized string data is encoded as valid UTF-8.
270290
func NewReservoirItemsUnionFromSlice[T any](data []byte, serde ItemsSerDe[T]) (*ReservoirItemsUnion[T], error) {
271291
if len(data) < 8 {
272292
return nil, errors.New("data too short")

sampling/varopt_items_sketch.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,10 @@ func (s *VarOptItemsSketch[T]) peekMin() (float64, error) {
203203

204204
// Update adds an item with the given weight to the sketch.
205205
// Weight must be positive and finite.
206+
//
207+
// If the sketch contains string values and the caller cares about
208+
// cross-language compatibility, it is the caller's responsibility to ensure
209+
// that the input string is encoded as valid UTF-8.
206210
func (s *VarOptItemsSketch[T]) Update(item T, weight float64) error {
207211
return s.update(item, weight, false)
208212
}

tuple/arrayofstrings_sketch.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ func (s *ArrayOfStringsSummary) Update(values []string) {
7878
// ArrayOfStringsSummaryWriter writes an ArrayOfStringsSummary to the provided io.Writer in binary format.
7979
// It validates the length of the string slice and computes total bytes for serialization.
8080
// Returns an error if the input exceeds the maximum allowed slice length or if any write operation fails.
81+
//
82+
// If the caller cares about cross-language compatibility,
83+
// it is the caller's responsibility to ensure that those strings are encoded as valid UTF-8.
8184
func ArrayOfStringsSummaryWriter(w io.Writer, summary *ArrayOfStringsSummary) error {
8285
return writeStrings(w, summary.values)
8386
}
@@ -121,6 +124,9 @@ func computeStringsTotalBytes(values []string) uint32 {
121124
// ArrayOfStringsSummaryReader reads an ArrayOfStringsSummary from the provided io.Reader in binary format.
122125
// It validates the length of the string slice and reads the total bytes for deserialization.
123126
// Returns an error if the input exceeds the maximum allowed slice length or if any read operation fails.
127+
//
128+
// If the caller cares about cross-language compatibility,
129+
// it is the caller's responsibility to ensure that the serialized string data is encoded as valid UTF-8.
124130
func ArrayOfStringsSummaryReader(r io.Reader) (*ArrayOfStringsSummary, error) {
125131
values, err := readStrings(r)
126132
if err != nil {
@@ -201,13 +207,19 @@ func ArrayOfStringsInlineSummaryUpdateFunc(s ArrayOfStringsInlineSummary, values
201207
// ArrayOfStringsInlineSummaryWriter writes an ArrayOfStringsInlineSummary to the provided io.Writer in binary format.
202208
// It validates the length of the string slice and computes total bytes for serialization.
203209
// Returns an error if the input exceeds the maximum allowed slice length or if any write operation fails.
210+
//
211+
// If the caller cares about cross-language compatibility,
212+
// it is the caller's responsibility to ensure that those strings are encoded as valid UTF-8.
204213
func ArrayOfStringsInlineSummaryWriter(w io.Writer, summary ArrayOfStringsInlineSummary) error {
205214
return writeStrings(w, summary.values)
206215
}
207216

208217
// ArrayOfStringsInlineSummaryReader reads an ArrayOfStringsInlineSummary from the provided io.Reader in binary format.
209218
// It validates the length of the string slice and reads the total bytes for deserialization.
210219
// Returns an error if the input exceeds the maximum allowed slice length or if any read operation fails.
220+
//
221+
// If the caller cares about cross-language compatibility,
222+
// it is the caller's responsibility to ensure that the serialized string data is encoded as valid UTF-8.
211223
func ArrayOfStringsInlineSummaryReader(r io.Reader) (ArrayOfStringsInlineSummary, error) {
212224
values, err := readStrings(r)
213225
if err != nil {

tuple/decoder.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ import (
2929

3030
// SummaryReader reads and returns a summary from the reader.
3131
// Implementations should read the format written by a corresponding SummaryWriter.
32+
//
33+
// If the summary contains string values and the caller cares about
34+
// cross-language compatibility, it is the caller's responsibility to ensure
35+
// that the serialized string data is encoded as valid UTF-8.
3236
type SummaryReader[S Summary] func(r io.Reader) (S, error)
3337

3438
// Decoder decodes a compact sketch from the given reader.

tuple/encoder.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ import (
2626

2727
// SummaryWriter writes a summary to the writer.
2828
// Implementations should write the summary in a format that can be read by a corresponding SummaryReader.
29+
//
30+
// If the summary contains string values and the caller cares about
31+
// cross-language compatibility, it is the caller's responsibility to ensure
32+
// that those strings are encoded as valid UTF-8.
2933
type SummaryWriter[S Summary] func(w io.Writer, s S) error
3034

3135
// Encoder encodes a compact tuple sketch to bytes.

tuple/intersection.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ func NewIntersectionWithSummaryMergeFunc[S Summary](
103103
}
104104

105105
// Update updates the intersection with a given sketch.
106+
//
107+
// If the summary contains string values and the caller cares about
108+
// cross-language compatibility, it is the caller's responsibility to ensure
109+
// that both sketches use string values encoded as valid UTF-8.
106110
func (i *Intersection[S]) Update(sketch Sketch[S]) error {
107111
if i.hashtable.isEmpty {
108112
return nil

0 commit comments

Comments
 (0)