Skip to content

Commit d79fbb8

Browse files
authored
Merge pull request apache#146 from proost/feat-quantile-sketch-sorted-view
feat: quantile sketch sorted view
2 parents a4c3a72 + 318f877 commit d79fbb8

12 files changed

Lines changed: 1449 additions & 46 deletions
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package quantiles
19+
20+
import (
21+
"errors"
22+
23+
"github.com/apache/datasketches-go/internal"
24+
quantilesutils "github.com/apache/datasketches-go/internal/quantiles"
25+
)
26+
27+
var ErrEmpty = errors.New("operation is undefined for an empty sketch")
28+
29+
// Number is a type constraint that permits numeric types used by quantile sketches.
30+
type Number interface {
31+
float32 | float64 | int64
32+
}
33+
34+
// The NumericSortedView provides a sorted view of the data retained by a numeric quantiles-type sketch that
35+
// would be cumbersome to get any other way.
36+
// One could use the sketch's iterator to iterate over the contents of the sketch,
37+
// but the result would not be sorted.
38+
//
39+
// The data from a NumericSortedView is an unbiased random sample of the input stream that can be used for other kinds of
40+
// analysis not directly provided by the sketch.
41+
type NumericSortedView[T Number] struct {
42+
quantiles []T
43+
cumWeights []int64
44+
n int64
45+
}
46+
47+
// NewNumericSortedView constructs a new NumericSortedView.
48+
func NewNumericSortedView[T Number](
49+
quantiles []T, cumWeights []int64, n int64, maxItem, minItem T,
50+
) *NumericSortedView[T] {
51+
res := quantilesutils.IncludeMinMax(quantiles, cumWeights, maxItem, minItem)
52+
return &NumericSortedView[T]{
53+
quantiles: res.Quantiles,
54+
cumWeights: res.CumWeights,
55+
n: n,
56+
}
57+
}
58+
59+
// CumulativeWeights returns a copy of the cumulative weights slice.
60+
// Also known as the natural ranks, which are the Natural Numbers on the interval [1, N].
61+
func (s *NumericSortedView[T]) CumulativeWeights() []int64 {
62+
copied := make([]int64, len(s.cumWeights))
63+
copy(copied, s.cumWeights)
64+
return copied
65+
}
66+
67+
// N returns the total number of items presented to the sourcing sketch.
68+
func (s *NumericSortedView[T]) N() int64 {
69+
return s.n
70+
}
71+
72+
// NumRetained returns the number of quantiles retained by this sorted view.
73+
// This may be slightly different from the function with the same name when called from the originating sketch.
74+
func (s *NumericSortedView[T]) NumRetained() int {
75+
return len(s.quantiles)
76+
}
77+
78+
// IsEmpty returns true if the view is empty.
79+
func (s *NumericSortedView[T]) IsEmpty() bool {
80+
return s.n == 0
81+
}
82+
83+
// Iterator creates and returns a new iterator.
84+
func (s *NumericSortedView[T]) Iterator() *NumericSortedViewIterator[T] {
85+
return NewNumericSortedViewIterator[T](s.quantiles, s.cumWeights)
86+
}
87+
88+
// Rank returns the normalized rank corresponding to the given a quantile.
89+
// If the sketch is empty, it returns an error.
90+
func (s *NumericSortedView[T]) Rank(quantile T, isInclusive bool) (float64, error) {
91+
if s.IsEmpty() {
92+
return 0, ErrEmpty
93+
}
94+
95+
crit := internal.InequalityLT
96+
if isInclusive {
97+
crit = internal.InequalityLE
98+
}
99+
100+
index, err := internal.FindWithInequality[T](s.quantiles, 0, len(s.quantiles)-1, quantile, crit, func(t1 T, t2 T) bool {
101+
return t1 < t2
102+
})
103+
if err != nil {
104+
return 0, err
105+
}
106+
if index == -1 {
107+
return 0, nil //LT case: quantile <= minQuantile; LE case: quantile < minQuantile
108+
}
109+
return float64(s.cumWeights[index]) / float64(s.n), nil
110+
}
111+
112+
// Quantiles return a copy of the quantiles slice.
113+
func (s *NumericSortedView[T]) Quantiles() []T {
114+
copied := make([]T, len(s.quantiles))
115+
copy(copied, s.quantiles)
116+
return copied
117+
}
118+
119+
// Quantile returns the approximate quantile of the given normalized rank.
120+
// If inclusive, the given rank includes all quantiles less than or equal to
121+
// the quantile directly corresponding to the given rank.
122+
// If not, the given rank includes all quantiles less than
123+
// the quantile directly corresponding to the given rank.
124+
// If the sketch is empty, it returns an error.
125+
func (s *NumericSortedView[T]) Quantile(rank float64, isInclusive bool) (T, error) {
126+
if s.IsEmpty() {
127+
return 0, ErrEmpty
128+
}
129+
130+
if err := quantilesutils.ValidateNormalizedRankBounds(rank); err != nil {
131+
return 0, err
132+
}
133+
134+
naturalRank := quantilesutils.ComputeNaturalRank(rank, uint64(s.n), isInclusive)
135+
crit := internal.InequalityGT
136+
if isInclusive {
137+
crit = internal.InequalityGE
138+
}
139+
length := len(s.cumWeights)
140+
index, err := internal.FindWithInequality(
141+
s.cumWeights, 0, length-1, naturalRank, crit, func(a, b int64) bool { return a < b },
142+
)
143+
if err != nil {
144+
return 0, err
145+
}
146+
if index == -1 {
147+
return s.quantiles[length-1], nil
148+
}
149+
return s.quantiles[index], nil
150+
}
151+
152+
// MinItem returns the minimum item in the sketch.
153+
// If the sketch is empty, it returns an error.
154+
// This may be distinct from the smallest item retained by the sketch algorithm.
155+
func (s *NumericSortedView[T]) MinItem() (T, error) {
156+
if s.IsEmpty() {
157+
return 0, ErrEmpty
158+
}
159+
return s.quantiles[0], nil
160+
}
161+
162+
// MaxItem returns the maximum item in the view.
163+
// If the sketch is empty, it returns an error.
164+
// This may be distinct from the largest item retained by the sketch algorithm.
165+
func (s *NumericSortedView[T]) MaxItem() (T, error) {
166+
if s.IsEmpty() {
167+
return 0, ErrEmpty
168+
}
169+
return s.quantiles[len(s.quantiles)-1], nil
170+
}
171+
172+
// CDF returns an approximation of the stream's cumulative distribution
173+
// function for the given split points.
174+
//
175+
// It returns a monotonically increasing slice of cumulative probabilities in
176+
// [0, 1]. The returned slice has len(splitPoints)+1 entries.
177+
//
178+
// The approximation has the probabilistic guarantee implied by
179+
// NormalizedRankError(false).
180+
//
181+
// splitPoints must be unique and strictly increasing. They divide the item
182+
// domain into len(splitPoints)+1 overlapping intervals. Each interval starts
183+
// below the lowest retained item, which corresponds to cumulative probability
184+
// 0, and ends at the cumulative probability of its split point.
185+
//
186+
// The final interval represents the rest of the distribution, so the last
187+
// returned value is always 1.
188+
//
189+
// If a split point is exactly equal to a retained item, isInclusive=true includes
190+
// that item's weight in the cumulative probability for that split point.
191+
//
192+
// Callers generally should not include the true minimum or maximum stream item
193+
// in splitPoints.
194+
func (s *NumericSortedView[T]) CDF(splitPoints []T, isInclusive bool) ([]float64, error) {
195+
if err := quantilesutils.ValidateSplitPoints(splitPoints); err != nil {
196+
return nil, err
197+
}
198+
199+
length := len(splitPoints) + 1
200+
buckets := make([]float64, length)
201+
for i := 0; i < length-1; i++ {
202+
rank, err := s.Rank(splitPoints[i], isInclusive)
203+
if err != nil {
204+
return nil, err
205+
}
206+
207+
buckets[i] = rank
208+
}
209+
buckets[length-1] = 1.0
210+
return buckets, nil
211+
}
212+
213+
// PMF returns an approximation of the stream's probability mass function
214+
// for the given split points.
215+
//
216+
// It returns len(splitPoints)+1 probability masses in [0, 1]. The returned
217+
// intervals are consecutive and non-overlapping, and their sum is always 1.
218+
//
219+
// The approximation has the probabilistic guarantee implied by
220+
// NormalizedRankError(true).
221+
//
222+
// splitPoints must be unique and strictly increasing. They divide the item
223+
// domain into len(splitPoints)+1 intervals. Each interior interval starts at
224+
// one split point and ends at the next. The first interval starts below the
225+
// lowest retained item and ends at the first split point. The last interval
226+
// starts at the last split point and extends past the largest retained item.
227+
//
228+
// If a split point is exactly equal to a retained item, the interval boundary
229+
// handling depends on searchCrit. With isInclusive=true, an interval includes an item
230+
// equal to its upper split point and excludes an item equal to its lower split
231+
// point. With isInclusive=false, an interval excludes an item equal to its upper split
232+
// point and includes an item equal to its lower split point.
233+
//
234+
// Callers generally should not include the true minimum or maximum stream item
235+
// in splitPoints.
236+
func (s *NumericSortedView[T]) PMF(splitPoints []T, isInclusive bool) ([]float64, error) {
237+
buckets, err := s.CDF(splitPoints, isInclusive)
238+
if err != nil {
239+
return nil, err
240+
}
241+
242+
for i := len(buckets) - 1; i > 0; i-- {
243+
buckets[i] -= buckets[i-1]
244+
}
245+
return buckets, nil
246+
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package quantiles
19+
20+
import "errors"
21+
22+
var ErrIndexOutOfValidRange = errors.New("index out of range")
23+
24+
// NumericSortedViewIterator is an iterator over sorted views of numeric quantile sketches of type.
25+
type NumericSortedViewIterator[T Number] struct {
26+
cumWeights []int64
27+
quantiles []T
28+
totalN int64
29+
index int
30+
}
31+
32+
// NewNumericSortedViewIterator constructs a new NumericSortedViewIterator.
33+
// The quantiles slice must be ordered and have the same length as cumWeights.
34+
// The cumWeights slice must be ordered, start with the value one, and the last
35+
// value must be equal to N, the total number of items updated to the sketch.
36+
func NewNumericSortedViewIterator[T Number](quantiles []T, cumWeights []int64) *NumericSortedViewIterator[T] {
37+
var totalN int64
38+
if len(cumWeights) > 0 {
39+
totalN = cumWeights[len(cumWeights)-1]
40+
}
41+
42+
return &NumericSortedViewIterator[T]{
43+
cumWeights: cumWeights,
44+
totalN: totalN,
45+
index: -1,
46+
quantiles: quantiles,
47+
}
48+
}
49+
50+
func (it *NumericSortedViewIterator[T]) validateIndex() error {
51+
if it.index < 0 || it.index >= len(it.cumWeights) {
52+
return ErrIndexOutOfValidRange
53+
}
54+
return nil
55+
}
56+
57+
// NaturalRank returns the natural rank at the current index.
58+
// This is equivalent to NaturalRankWithCriterion(Inclusive).
59+
// NOTE: Call Next() before calling this method.
60+
func (it *NumericSortedViewIterator[T]) NaturalRank() (int64, error) {
61+
if err := it.validateIndex(); err != nil {
62+
return 0, err
63+
}
64+
return it.cumWeights[it.index], nil
65+
}
66+
67+
// NaturalRankWithCriterion returns the natural rank at the current index (or previous index)
68+
// based on the chosen search criterion. The natural rank is a number in the range [1, N],
69+
// where N (N()) is the total number of items fed to the sketch.
70+
//
71+
// If inclusive, includes the weight of the item at the current index.
72+
// Otherwise, returns the natural rank of the previous index.
73+
// NOTE: Call Next() before calling this method.
74+
func (it *NumericSortedViewIterator[T]) NaturalRankWithCriterion(isInclusive bool) (int64, error) {
75+
if err := it.validateIndex(); err != nil {
76+
return 0, err
77+
}
78+
if isInclusive {
79+
return it.cumWeights[it.index], nil
80+
}
81+
if it.index == 0 {
82+
return 0, nil
83+
}
84+
return it.cumWeights[it.index-1], nil
85+
}
86+
87+
// N returns the total count of all items presented to the sketch.
88+
func (it *NumericSortedViewIterator[T]) N() int64 {
89+
return it.totalN
90+
}
91+
92+
// NormalizedRank returns the normalized rank at the current index.
93+
// This is equivalent to NormalizedRankWithCriterion(true).
94+
// NOTE: Call Next() before calling this method.
95+
func (it *NumericSortedViewIterator[T]) NormalizedRank() (float64, error) {
96+
nr, err := it.NaturalRank()
97+
if err != nil {
98+
return 0, err
99+
}
100+
return float64(nr) / float64(it.totalN), nil
101+
}
102+
103+
// NormalizedRankWithCriterion returns the normalized rank at the current index (or previous
104+
// index) based on the chosen search criterion. Normalized rank = natural rank / N (N())
105+
// and is a fraction in the range (0, 1.0].
106+
// NOTE: Call Next() before calling this method.
107+
func (it *NumericSortedViewIterator[T]) NormalizedRankWithCriterion(isInclusive bool) (float64, error) {
108+
nr, err := it.NaturalRankWithCriterion(isInclusive)
109+
if err != nil {
110+
return 0, err
111+
}
112+
return float64(nr) / float64(it.totalN), nil
113+
}
114+
115+
// Weight returns the weight contribution of the item at the current index.
116+
// NOTE: Call Next() before calling this method.
117+
func (it *NumericSortedViewIterator[T]) Weight() (int64, error) {
118+
if err := it.validateIndex(); err != nil {
119+
return 0, err
120+
}
121+
if it.index == 0 {
122+
return it.cumWeights[0], nil
123+
}
124+
return it.cumWeights[it.index] - it.cumWeights[it.index-1], nil
125+
}
126+
127+
// Next advances the index and checks if it is valid.
128+
// The state of the iterator is undefined before the first call of this method.
129+
func (it *NumericSortedViewIterator[T]) Next() bool {
130+
it.index++
131+
return it.index < len(it.cumWeights)
132+
}
133+
134+
// Quantile returns the quantile at the current index.
135+
// NOTE: Call Next() before calling this method.
136+
func (it *NumericSortedViewIterator[T]) Quantile() (T, error) {
137+
if err := it.validateIndex(); err != nil {
138+
var zero T
139+
return zero, err
140+
}
141+
return it.quantiles[it.index], nil
142+
}

0 commit comments

Comments
 (0)