Skip to content

Commit c6e3945

Browse files
mcrumillerdongchao-1
authored andcommitted
fix: Ensure floating-point accuracy in hist (pola-rs#22245)
1 parent 40936a5 commit c6e3945

File tree

2 files changed

+26
-26
lines changed

2 files changed

+26
-26
lines changed

crates/polars-ops/src/chunked_array/hist.rs

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -26,30 +26,17 @@ where
2626
// User-supplied bins. Note these are actually bin edges. Check for monotonicity.
2727
// If we only have one edge, we have no bins.
2828
let bin_len = bins.len();
29-
// We also check for uniformity of bins. We declare uniformity if the difference
30-
// between the largest and smallest bin is < 0.00001 the average bin size.
3129
if bin_len > 1 {
32-
let mut smallest = bins[1] - bins[0];
33-
let mut largest = smallest;
34-
let mut avg_bin_size = smallest;
35-
for i in 1..bins.len() {
36-
let d = bins[i] - bins[i - 1];
37-
if d <= 0.0 {
30+
for i in 1..bin_len {
31+
if (bins[i] - bins[i - 1]) <= 0.0 {
3832
return Err(PolarsError::ComputeError(
3933
"bins must increase monotonically".into(),
4034
));
4135
}
42-
if d > largest {
43-
largest = d;
44-
} else if d < smallest {
45-
smallest = d;
46-
}
47-
avg_bin_size += d;
4836
}
49-
let uniform = (largest - smallest) / (avg_bin_size / bin_len as f64) < 0.00001;
50-
(bins.to_vec(), uniform)
37+
(bins.to_vec(), false)
5138
} else {
52-
(Vec::<f64>::new(), false) // uniformity doesn't matter here
39+
(Vec::<f64>::new(), false)
5340
}
5441
},
5542
(bin_count, None) => {
@@ -105,20 +92,22 @@ where
10592
let min_break: f64 = breaks[0];
10693
let max_break: f64 = breaks[num_bins];
10794
let scale = num_bins as f64 / (max_break - min_break);
95+
let max_idx = num_bins - 1;
10896

10997
for chunk in ca.downcast_iter() {
11098
for item in chunk.non_null_values_iter() {
11199
let item = item.to_f64().unwrap();
112100
if item > min_break && item <= max_break {
113-
let idx = scale * (item - min_break);
114-
let idx_floor = idx.floor();
115-
let idx = if idx == idx_floor {
116-
idx - 1.0
117-
} else {
118-
idx_floor
119-
};
120-
/* idx > (num_bins - 1) may happen due to floating point representation imprecision */
121-
let idx = cmp::min(idx as usize, num_bins - 1);
101+
// idx > (num_bins - 1) may happen due to floating point representation imprecision
102+
let mut idx = cmp::min((scale * (item - min_break)) as usize, max_idx);
103+
104+
// Adjust for float imprecision providing idx > 1 ULP of the breaks
105+
if item <= breaks[idx] {
106+
idx -= 1;
107+
} else if item > breaks[idx + 1] {
108+
idx += 1;
109+
}
110+
122111
count[idx] += 1;
123112
} else if item == min_break {
124113
count[0] += 1;

py-polars/tests/unit/operations/test_hist.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,3 +536,14 @@ def test_hist_include_lower_22056() -> None:
536536
}
537537
)
538538
assert_frame_equal(result, expected)
539+
540+
541+
def test_hist_ulp_edge_22234() -> None:
542+
# Uniform path
543+
s = pl.Series([1.0, 1e-16, 1.3e-16, -1.0])
544+
result = s.hist(bin_count=2)
545+
assert result["count"].to_list() == [1, 3]
546+
547+
# Manual path
548+
result = s.hist(bins=[-1, 0, 1])
549+
assert result["count"].to_list() == [1, 3]

0 commit comments

Comments
 (0)