|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | + |
| 18 | +package quantiles |
| 19 | + |
| 20 | +import ( |
| 21 | + "errors" |
| 22 | + |
| 23 | + "github.com/apache/datasketches-go/internal" |
| 24 | + quantilesutils "github.com/apache/datasketches-go/internal/quantiles" |
| 25 | +) |
| 26 | + |
| 27 | +var ErrEmpty = errors.New("operation is undefined for an empty sketch") |
| 28 | + |
| 29 | +// Number is a type constraint that permits numeric types used by quantile sketches. |
| 30 | +type Number interface { |
| 31 | + float32 | float64 | int64 |
| 32 | +} |
| 33 | + |
| 34 | +// The NumericSortedView provides a sorted view of the data retained by a numeric quantiles-type sketch that |
| 35 | +// would be cumbersome to get any other way. |
| 36 | +// One could use the sketch's iterator to iterate over the contents of the sketch, |
| 37 | +// but the result would not be sorted. |
| 38 | +// |
| 39 | +// The data from a NumericSortedView is an unbiased random sample of the input stream that can be used for other kinds of |
| 40 | +// analysis not directly provided by the sketch. |
| 41 | +type NumericSortedView[T Number] struct { |
| 42 | + quantiles []T |
| 43 | + cumWeights []int64 |
| 44 | + n int64 |
| 45 | +} |
| 46 | + |
| 47 | +// NewNumericSortedView constructs a new NumericSortedView. |
| 48 | +func NewNumericSortedView[T Number]( |
| 49 | + quantiles []T, cumWeights []int64, n int64, maxItem, minItem T, |
| 50 | +) *NumericSortedView[T] { |
| 51 | + res := quantilesutils.IncludeMinMax(quantiles, cumWeights, maxItem, minItem) |
| 52 | + return &NumericSortedView[T]{ |
| 53 | + quantiles: res.Quantiles, |
| 54 | + cumWeights: res.CumWeights, |
| 55 | + n: n, |
| 56 | + } |
| 57 | +} |
| 58 | + |
| 59 | +// CumulativeWeights returns a copy of the cumulative weights slice. |
| 60 | +// Also known as the natural ranks, which are the Natural Numbers on the interval [1, N]. |
| 61 | +func (s *NumericSortedView[T]) CumulativeWeights() []int64 { |
| 62 | + copied := make([]int64, len(s.cumWeights)) |
| 63 | + copy(copied, s.cumWeights) |
| 64 | + return copied |
| 65 | +} |
| 66 | + |
| 67 | +// N returns the total number of items presented to the sourcing sketch. |
| 68 | +func (s *NumericSortedView[T]) N() int64 { |
| 69 | + return s.n |
| 70 | +} |
| 71 | + |
| 72 | +// NumRetained returns the number of quantiles retained by this sorted view. |
| 73 | +// This may be slightly different from the function with the same name when called from the originating sketch. |
| 74 | +func (s *NumericSortedView[T]) NumRetained() int { |
| 75 | + return len(s.quantiles) |
| 76 | +} |
| 77 | + |
| 78 | +// IsEmpty returns true if the view is empty. |
| 79 | +func (s *NumericSortedView[T]) IsEmpty() bool { |
| 80 | + return s.n == 0 |
| 81 | +} |
| 82 | + |
| 83 | +// Iterator creates and returns a new iterator. |
| 84 | +func (s *NumericSortedView[T]) Iterator() *NumericSortedViewIterator[T] { |
| 85 | + return NewNumericSortedViewIterator[T](s.quantiles, s.cumWeights) |
| 86 | +} |
| 87 | + |
| 88 | +// Rank returns the normalized rank corresponding to the given a quantile. |
| 89 | +// If the sketch is empty, it returns an error. |
| 90 | +func (s *NumericSortedView[T]) Rank(quantile T, isInclusive bool) (float64, error) { |
| 91 | + if s.IsEmpty() { |
| 92 | + return 0, ErrEmpty |
| 93 | + } |
| 94 | + |
| 95 | + crit := internal.InequalityLT |
| 96 | + if isInclusive { |
| 97 | + crit = internal.InequalityLE |
| 98 | + } |
| 99 | + |
| 100 | + index, err := internal.FindWithInequality[T](s.quantiles, 0, len(s.quantiles)-1, quantile, crit, func(t1 T, t2 T) bool { |
| 101 | + return t1 < t2 |
| 102 | + }) |
| 103 | + if err != nil { |
| 104 | + return 0, err |
| 105 | + } |
| 106 | + if index == -1 { |
| 107 | + return 0, nil //LT case: quantile <= minQuantile; LE case: quantile < minQuantile |
| 108 | + } |
| 109 | + return float64(s.cumWeights[index]) / float64(s.n), nil |
| 110 | +} |
| 111 | + |
| 112 | +// Quantiles return a copy of the quantiles slice. |
| 113 | +func (s *NumericSortedView[T]) Quantiles() []T { |
| 114 | + copied := make([]T, len(s.quantiles)) |
| 115 | + copy(copied, s.quantiles) |
| 116 | + return copied |
| 117 | +} |
| 118 | + |
| 119 | +// Quantile returns the approximate quantile of the given normalized rank. |
| 120 | +// If inclusive, the given rank includes all quantiles less than or equal to |
| 121 | +// the quantile directly corresponding to the given rank. |
| 122 | +// If not, the given rank includes all quantiles less than |
| 123 | +// the quantile directly corresponding to the given rank. |
| 124 | +// If the sketch is empty, it returns an error. |
| 125 | +func (s *NumericSortedView[T]) Quantile(rank float64, isInclusive bool) (T, error) { |
| 126 | + if s.IsEmpty() { |
| 127 | + return 0, ErrEmpty |
| 128 | + } |
| 129 | + |
| 130 | + if err := quantilesutils.ValidateNormalizedRankBounds(rank); err != nil { |
| 131 | + return 0, err |
| 132 | + } |
| 133 | + |
| 134 | + naturalRank := quantilesutils.ComputeNaturalRank(rank, uint64(s.n), isInclusive) |
| 135 | + crit := internal.InequalityGT |
| 136 | + if isInclusive { |
| 137 | + crit = internal.InequalityGE |
| 138 | + } |
| 139 | + length := len(s.cumWeights) |
| 140 | + index, err := internal.FindWithInequality( |
| 141 | + s.cumWeights, 0, length-1, naturalRank, crit, func(a, b int64) bool { return a < b }, |
| 142 | + ) |
| 143 | + if err != nil { |
| 144 | + return 0, err |
| 145 | + } |
| 146 | + if index == -1 { |
| 147 | + return s.quantiles[length-1], nil |
| 148 | + } |
| 149 | + return s.quantiles[index], nil |
| 150 | +} |
| 151 | + |
| 152 | +// MinItem returns the minimum item in the sketch. |
| 153 | +// If the sketch is empty, it returns an error. |
| 154 | +// This may be distinct from the smallest item retained by the sketch algorithm. |
| 155 | +func (s *NumericSortedView[T]) MinItem() (T, error) { |
| 156 | + if s.IsEmpty() { |
| 157 | + return 0, ErrEmpty |
| 158 | + } |
| 159 | + return s.quantiles[0], nil |
| 160 | +} |
| 161 | + |
| 162 | +// MaxItem returns the maximum item in the view. |
| 163 | +// If the sketch is empty, it returns an error. |
| 164 | +// This may be distinct from the largest item retained by the sketch algorithm. |
| 165 | +func (s *NumericSortedView[T]) MaxItem() (T, error) { |
| 166 | + if s.IsEmpty() { |
| 167 | + return 0, ErrEmpty |
| 168 | + } |
| 169 | + return s.quantiles[len(s.quantiles)-1], nil |
| 170 | +} |
| 171 | + |
| 172 | +// CDF returns an approximation of the stream's cumulative distribution |
| 173 | +// function for the given split points. |
| 174 | +// |
| 175 | +// It returns a monotonically increasing slice of cumulative probabilities in |
| 176 | +// [0, 1]. The returned slice has len(splitPoints)+1 entries. |
| 177 | +// |
| 178 | +// The approximation has the probabilistic guarantee implied by |
| 179 | +// NormalizedRankError(false). |
| 180 | +// |
| 181 | +// splitPoints must be unique and strictly increasing. They divide the item |
| 182 | +// domain into len(splitPoints)+1 overlapping intervals. Each interval starts |
| 183 | +// below the lowest retained item, which corresponds to cumulative probability |
| 184 | +// 0, and ends at the cumulative probability of its split point. |
| 185 | +// |
| 186 | +// The final interval represents the rest of the distribution, so the last |
| 187 | +// returned value is always 1. |
| 188 | +// |
| 189 | +// If a split point is exactly equal to a retained item, isInclusive=true includes |
| 190 | +// that item's weight in the cumulative probability for that split point. |
| 191 | +// |
| 192 | +// Callers generally should not include the true minimum or maximum stream item |
| 193 | +// in splitPoints. |
| 194 | +func (s *NumericSortedView[T]) CDF(splitPoints []T, isInclusive bool) ([]float64, error) { |
| 195 | + if err := quantilesutils.ValidateSplitPoints(splitPoints); err != nil { |
| 196 | + return nil, err |
| 197 | + } |
| 198 | + |
| 199 | + length := len(splitPoints) + 1 |
| 200 | + buckets := make([]float64, length) |
| 201 | + for i := 0; i < length-1; i++ { |
| 202 | + rank, err := s.Rank(splitPoints[i], isInclusive) |
| 203 | + if err != nil { |
| 204 | + return nil, err |
| 205 | + } |
| 206 | + |
| 207 | + buckets[i] = rank |
| 208 | + } |
| 209 | + buckets[length-1] = 1.0 |
| 210 | + return buckets, nil |
| 211 | +} |
| 212 | + |
| 213 | +// PMF returns an approximation of the stream's probability mass function |
| 214 | +// for the given split points. |
| 215 | +// |
| 216 | +// It returns len(splitPoints)+1 probability masses in [0, 1]. The returned |
| 217 | +// intervals are consecutive and non-overlapping, and their sum is always 1. |
| 218 | +// |
| 219 | +// The approximation has the probabilistic guarantee implied by |
| 220 | +// NormalizedRankError(true). |
| 221 | +// |
| 222 | +// splitPoints must be unique and strictly increasing. They divide the item |
| 223 | +// domain into len(splitPoints)+1 intervals. Each interior interval starts at |
| 224 | +// one split point and ends at the next. The first interval starts below the |
| 225 | +// lowest retained item and ends at the first split point. The last interval |
| 226 | +// starts at the last split point and extends past the largest retained item. |
| 227 | +// |
| 228 | +// If a split point is exactly equal to a retained item, the interval boundary |
| 229 | +// handling depends on searchCrit. With isInclusive=true, an interval includes an item |
| 230 | +// equal to its upper split point and excludes an item equal to its lower split |
| 231 | +// point. With isInclusive=false, an interval excludes an item equal to its upper split |
| 232 | +// point and includes an item equal to its lower split point. |
| 233 | +// |
| 234 | +// Callers generally should not include the true minimum or maximum stream item |
| 235 | +// in splitPoints. |
| 236 | +func (s *NumericSortedView[T]) PMF(splitPoints []T, isInclusive bool) ([]float64, error) { |
| 237 | + buckets, err := s.CDF(splitPoints, isInclusive) |
| 238 | + if err != nil { |
| 239 | + return nil, err |
| 240 | + } |
| 241 | + |
| 242 | + for i := len(buckets) - 1; i > 0; i-- { |
| 243 | + buckets[i] -= buckets[i-1] |
| 244 | + } |
| 245 | + return buckets, nil |
| 246 | +} |
0 commit comments