Skip to content

Commit 7786747

Browse files
authored
Fix quantile calculation on small data points (#36)
1 parent 75a731a commit 7786747

File tree

6 files changed

+92
-5
lines changed

6 files changed

+92
-5
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(EXTENSION_SOURCES
2525
src/filesystem_ref_registry.cpp
2626
src/histogram.cpp
2727
src/metrics_collector.cpp
28+
src/numeric_utils.cpp
2829
src/observefs_extension.cpp
2930
src/observability_filesystem.cpp
3031
src/operation_latency_collector.cpp

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ include extension-ci-tools/makefiles/duckdb_extension.Makefile
99

1010
format-all: format
1111
find unit/ -iname *.hpp -o -iname *.cpp | xargs clang-format --sort-includes=0 -style=file -i
12-
find benchmark/ -iname *.hpp -o -iname *.cpp | xargs clang-format --sort-includes=0 -style=file -i
1312
cmake-format -i CMakeLists.txt
1413

1514
test_unit: all

src/include/numeric_utils.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#pragma once
2+
3+
namespace duckdb {
4+
5+
// Return whether the given number A is equal to number B, in a tolerable error range.
6+
bool IsDoubleEqual(double actual, double expected, bool print_if_unequal = false);
7+
8+
} // namespace duckdb

src/numeric_utils.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#include "numeric_utils.hpp"
2+
3+
#include <cmath>
4+
#include <iostream>
5+
6+
namespace duckdb {
7+
8+
namespace {
9+
constexpr double MAX_ERROR_RANGE = 0.01;
10+
} // namespace
11+
12+
bool IsDoubleEqual(double actual, double expected, bool print_if_unequal) {
13+
const bool is_equal = std::fabs(actual - expected) <= MAX_ERROR_RANGE;
14+
if (!is_equal && print_if_unequal) {
15+
std::cerr << "Actual value = " << actual << ", expected value = " << expected << std::endl;
16+
}
17+
return is_equal;
18+
}
19+
20+
} // namespace duckdb

src/quantilelite.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,30 @@
11
#include "quantilelite.hpp"
22

33
#include <algorithm>
4+
#include <cmath>
45

56
namespace duckdb {
67

78
float QuantileLite::Quantile(float q) const {
89
if (samples.empty()) {
910
return 0.0;
1011
}
11-
const size_t k = static_cast<size_t>(q * (samples.size() - 1));
12-
std::nth_element(samples.begin(), samples.begin() + k, samples.end());
13-
return samples[k];
12+
std::sort(samples.begin(), samples.end());
13+
14+
const size_t n = samples.size();
15+
if (n == 1) {
16+
return samples[0];
17+
}
18+
19+
const float pos = q * (n - 1);
20+
const size_t lower = static_cast<size_t>(std::floor(pos));
21+
const size_t upper = static_cast<size_t>(std::ceil(pos));
22+
const float frac = pos - lower;
23+
24+
if (upper == lower) {
25+
return samples[lower];
26+
}
27+
return samples[lower] + frac * (samples[upper] - samples[lower]);
1428
}
1529

1630
} // namespace duckdb

unit/test_quantile_estimator.cpp

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
#include <algorithm>
55
#include <random>
66

7-
#include "quantile_estimator.hpp"
87
#include "duckdb/common/vector.hpp"
8+
#include "numeric_utils.hpp"
9+
#include "quantile_estimator.hpp"
910

1011
using namespace duckdb; // NOLINT
1112

@@ -27,6 +28,50 @@ vector<int> GetRandomNumbers(int max_val) {
2728
}
2829
} // namespace
2930

31+
TEST_CASE("Insufficient number of data points", "[quantile test]") {
32+
QuantileEstimator qe {METRICS_NAME, METRICS_UNIT};
33+
34+
// --- One data point ---
35+
qe.Add(1);
36+
REQUIRE(IsDoubleEqual(qe.p50(), 1.0, /*print_if_unequal=*/true));
37+
REQUIRE(IsDoubleEqual(qe.p75(), 1.0, /*print_if_unequal=*/true));
38+
REQUIRE(IsDoubleEqual(qe.p90(), 1.0, /*print_if_unequal=*/true));
39+
REQUIRE(IsDoubleEqual(qe.p95(), 1.0, /*print_if_unequal=*/true));
40+
REQUIRE(IsDoubleEqual(qe.p99(), 1.0, /*print_if_unequal=*/true));
41+
42+
// --- Two data points ---
43+
qe.Add(2);
44+
REQUIRE(IsDoubleEqual(qe.p50(), 1.5, /*print_if_unequal=*/true));
45+
REQUIRE(IsDoubleEqual(qe.p75(), 1.75, /*print_if_unequal=*/true));
46+
REQUIRE(IsDoubleEqual(qe.p90(), 1.9, /*print_if_unequal=*/true));
47+
REQUIRE(IsDoubleEqual(qe.p95(), 1.95, /*print_if_unequal=*/true));
48+
REQUIRE(IsDoubleEqual(qe.p99(), 1.99, /*print_if_unequal=*/true));
49+
50+
// --- Three data points ---
51+
qe.Add(3);
52+
REQUIRE(IsDoubleEqual(qe.p50(), 2.0, /*print_if_unequal=*/true));
53+
REQUIRE(IsDoubleEqual(qe.p75(), 2.5, /*print_if_unequal=*/true));
54+
REQUIRE(IsDoubleEqual(qe.p90(), 2.8, /*print_if_unequal=*/true));
55+
REQUIRE(IsDoubleEqual(qe.p95(), 2.9, /*print_if_unequal=*/true));
56+
REQUIRE(IsDoubleEqual(qe.p99(), 2.98, /*print_if_unequal=*/true));
57+
58+
// --- Four data points ---
59+
qe.Add(4);
60+
REQUIRE(IsDoubleEqual(qe.p50(), 2.5, /*print_if_unequal=*/true));
61+
REQUIRE(IsDoubleEqual(qe.p75(), 3.25, /*print_if_unequal=*/true));
62+
REQUIRE(IsDoubleEqual(qe.p90(), 3.7, /*print_if_unequal=*/true));
63+
REQUIRE(IsDoubleEqual(qe.p95(), 3.85, /*print_if_unequal=*/true));
64+
REQUIRE(IsDoubleEqual(qe.p99(), 3.97, /*print_if_unequal=*/true));
65+
66+
// --- Five data points ---
67+
qe.Add(5);
68+
REQUIRE(IsDoubleEqual(qe.p50(), 3.0, /*print_if_unequal=*/true));
69+
REQUIRE(IsDoubleEqual(qe.p75(), 4.0, /*print_if_unequal=*/true));
70+
REQUIRE(IsDoubleEqual(qe.p90(), 4.6, /*print_if_unequal=*/true));
71+
REQUIRE(IsDoubleEqual(qe.p95(), 4.8, /*print_if_unequal=*/true));
72+
REQUIRE(IsDoubleEqual(qe.p99(), 4.96, /*print_if_unequal=*/true));
73+
}
74+
3075
TEST_CASE("Small scale quantile test", "[quantile test]") {
3176
constexpr size_t NUM_VALUE = 50;
3277
constexpr double MAX_TOLERABLE_DIFF = 1.0;

0 commit comments

Comments
 (0)