Skip to content

Commit 4b87a2d

Browse files
author
Mahesh Pai
committed
Bugfix: tdigest const_iterator returns dangling reference causing incorrect values
1 parent 5b1e968 commit 4b87a2d

File tree

3 files changed

+276
-1
lines changed

3 files changed

+276
-1
lines changed

tdigest/include/tdigest.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ template<typename T, typename A>
316316
class tdigest<T, A>::const_iterator {
317317
public:
318318
using iterator_category = std::input_iterator_tag;
319-
using value_type = std::pair<const T&, const W>;
319+
using value_type = std::pair<T, W>;
320320
using difference_type = void;
321321
using pointer = const return_value_holder<value_type>;
322322
using reference = const value_type;

tdigest/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ target_sources(tdigest_test
3939
PRIVATE
4040
tdigest_test.cpp
4141
tdigest_custom_allocator_test.cpp
42+
tdigest_iterator_test.cpp
4243
)
4344

4445
if (SERDE_COMPAT)
Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <catch2/catch.hpp>
21+
#include <memory>
22+
#include <map>
23+
#include <vector>
24+
#include <set>
25+
26+
#include "tdigest.hpp"
27+
28+
namespace datasketches {
29+
30+
TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") {
31+
tdigest_double td(100);
32+
33+
// Insert 10 distinct values
34+
for (int i = 0; i < 10; i++) {
35+
td.update(static_cast<double>(i));
36+
}
37+
38+
// Collect all centroids via iteration
39+
std::map<double, uint64_t> centroids;
40+
for (const auto&& centroid : td) {
41+
centroids[centroid.first] = centroid.second;
42+
}
43+
44+
// Should have collected all 10 distinct values
45+
REQUIRE(centroids.size() == 10);
46+
47+
// Verify each value was captured correctly
48+
for (int i = 0; i < 10; i++) {
49+
REQUIRE(centroids.count(static_cast<double>(i)) == 1);
50+
REQUIRE(centroids[static_cast<double>(i)] == 1);
51+
}
52+
}
53+
54+
TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") {
55+
// This test reproduces the bug scenario found in ClickHouse
56+
std::unique_ptr<tdigest_double> td(new tdigest_double(100));
57+
58+
// Insert distinct values
59+
for (int i = 0; i < 10; i++) {
60+
td->update(static_cast<double>(i));
61+
}
62+
63+
// Use explicit begin/end iterators
64+
auto it = td->begin();
65+
auto end_it = td->end();
66+
67+
std::vector<double> means;
68+
std::vector<uint64_t> weights;
69+
70+
while (it != end_it) {
71+
// Before the fix, accessing it->first would return garbage or same value repeatedly
72+
double mean = it->first;
73+
uint64_t weight = it->second;
74+
means.push_back(mean);
75+
weights.push_back(weight);
76+
++it;
77+
}
78+
79+
// Should have collected 10 centroids
80+
REQUIRE(means.size() == 10);
81+
REQUIRE(weights.size() == 10);
82+
83+
// All means should be distinct (not all zeros or garbage)
84+
std::set<double> unique_means(means.begin(), means.end());
85+
REQUIRE(unique_means.size() == 10);
86+
87+
// Verify all expected values are present
88+
for (int i = 0; i < 10; i++) {
89+
REQUIRE(unique_means.count(static_cast<double>(i)) == 1);
90+
}
91+
}
92+
93+
TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") {
94+
tdigest_double td(100);
95+
96+
for (int i = 0; i < 5; i++) {
97+
td.update(static_cast<double>(i * 10));
98+
}
99+
100+
std::vector<std::pair<double, uint64_t>> collected;
101+
102+
// Test structured bindings
103+
for (auto it = td.begin(); it != td.end(); ++it) {
104+
const auto& centroid = *it;
105+
collected.emplace_back(centroid.first, centroid.second);
106+
}
107+
108+
REQUIRE(collected.size() == 5);
109+
110+
// Verify distinct values were collected
111+
std::set<double> means;
112+
for (const auto& pair : collected) {
113+
means.insert(pair.first);
114+
REQUIRE(pair.second == 1); // Each value inserted once
115+
}
116+
117+
REQUIRE(means.size() == 5);
118+
for (int i = 0; i < 5; i++) {
119+
REQUIRE(means.count(static_cast<double>(i * 10)) == 1);
120+
}
121+
}
122+
123+
TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") {
124+
tdigest_double td(100);
125+
126+
// Insert values
127+
for (int i = 1; i <= 10; i++) {
128+
td.update(static_cast<double>(i * i)); // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100
129+
}
130+
131+
// Access via operator->
132+
std::map<double, uint64_t> centroids;
133+
auto end_it = td.end();
134+
for (auto it = td.begin(); it != end_it; ++it) {
135+
// operator-> should return valid values
136+
centroids[it->first] = it->second;
137+
}
138+
139+
REQUIRE(centroids.size() == 10);
140+
141+
// Verify the squared values
142+
for (int i = 1; i <= 10; i++) {
143+
double expected = static_cast<double>(i * i);
144+
REQUIRE(centroids.count(expected) == 1);
145+
}
146+
}
147+
148+
TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") {
149+
tdigest_double td(100);
150+
151+
// Insert values
152+
for (double d = 0.0; d < 10.0; d += 1.0) {
153+
td.update(d);
154+
}
155+
156+
size_t count = 0;
157+
std::set<double> seen_means;
158+
159+
// This pattern was working in simple tests but failing in optimized builds
160+
for (const auto&& centroid : td) {
161+
seen_means.insert(centroid.first);
162+
count++;
163+
}
164+
165+
REQUIRE(count == 10);
166+
REQUIRE(seen_means.size() == 10);
167+
168+
// Verify all values from 0 to 9 are present
169+
for (int i = 0; i < 10; i++) {
170+
REQUIRE(seen_means.count(static_cast<double>(i)) == 1);
171+
}
172+
}
173+
174+
TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") {
175+
tdigest_double td(100);
176+
177+
td.update(1.0);
178+
td.update(2.0);
179+
td.update(3.0);
180+
181+
auto it = td.begin();
182+
183+
// Store the pair
184+
auto pair1 = *it;
185+
double mean1 = pair1.first;
186+
187+
++it;
188+
189+
// Store another pair
190+
auto pair2 = *it;
191+
double mean2 = pair2.first;
192+
193+
++it;
194+
195+
auto pair3 = *it;
196+
double mean3 = pair3.first;
197+
198+
// All three means should be distinct
199+
REQUIRE(mean1 != mean2);
200+
REQUIRE(mean2 != mean3);
201+
REQUIRE(mean1 != mean3);
202+
203+
// And they should match our input values
204+
std::set<double> means = {mean1, mean2, mean3};
205+
REQUIRE(means.count(1.0) == 1);
206+
REQUIRE(means.count(2.0) == 1);
207+
REQUIRE(means.count(3.0) == 1);
208+
}
209+
210+
TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") {
211+
tdigest_double td(100);
212+
213+
// Empty sketch should have begin() == end()
214+
REQUIRE(td.begin() == td.end());
215+
216+
// Range-based for should not execute
217+
size_t count = 0;
218+
for (const auto&& centroid : td) {
219+
(void)centroid; // Silence unused warning
220+
count++;
221+
}
222+
REQUIRE(count == 0);
223+
}
224+
225+
TEST_CASE("tdigest iterator: single value", "[tdigest]") {
226+
tdigest_double td(100);
227+
td.update(42.0);
228+
229+
size_t count = 0;
230+
double captured_mean = 0.0;
231+
uint64_t captured_weight = 0;
232+
233+
for (const auto&& centroid : td) {
234+
captured_mean = centroid.first;
235+
captured_weight = centroid.second;
236+
count++;
237+
}
238+
239+
REQUIRE(count == 1);
240+
REQUIRE(captured_mean == 42.0);
241+
REQUIRE(captured_weight == 1);
242+
}
243+
244+
TEST_CASE("tdigest iterator: large dataset", "[tdigest]") {
245+
tdigest_double td(100);
246+
247+
// Insert 1000 distinct values
248+
for (int i = 0; i < 1000; i++) {
249+
td.update(static_cast<double>(i));
250+
}
251+
252+
// Iterator should provide compressed centroids (not all 1000)
253+
size_t centroid_count = 0;
254+
std::set<double> unique_means;
255+
uint64_t total_weight = 0;
256+
257+
for (const auto&& centroid : td) {
258+
unique_means.insert(centroid.first);
259+
total_weight += centroid.second;
260+
centroid_count++;
261+
}
262+
263+
// Should have fewer centroids than input values due to compression
264+
REQUIRE(centroid_count < 1000);
265+
REQUIRE(centroid_count > 0);
266+
267+
// Total weight should equal number of input values
268+
REQUIRE(total_weight == 1000);
269+
270+
// All means should be unique (no duplicates)
271+
REQUIRE(unique_means.size() == centroid_count);
272+
}
273+
274+
} // namespace datasketches

0 commit comments

Comments
 (0)