@@ -51,67 +51,75 @@ def discretize(data, bin_edges, right_continuous=False):
5151 x_new = bin_edges [idx ]
5252 return x_new
5353
54+ def _get_tolerance (v ):
55+ """Determine numerical tolerance due to limited precision of floating-point values.
56+
57+ ... to account for roundoff error.
58+
59+ In other words, returns a maximum possible difference that can be considered negligible.
60+ Only relevant for floating-point values.
61+ """
62+ if issubclass (v .dtype .type , numpy .floating ):
63+ return numpy .abs (v ) * numpy .finfo (v .dtype ).eps
64+ return 0 # assuming it's an int
65+
5466def bin1d_vec (p , bins , tol = None , right_continuous = False ):
5567 """Efficient implementation of binning routine on 1D Cartesian Grid.
5668
57- Returns the indices of the points into bins. Bins are inclusive on the lower bound
69+ Bins are inclusive on the lower bound
5870 and exclusive on the upper bound. In the case where a point does not fall within the bins a -1
5971 will be returned. The last bin extends to infinity when right_continuous is set as true.
6072
73+ To correctly bin points that are practically on a bin edge, this function accounts for the
74+ limited precision of floating-point numbers (the roundoff error) with a numerical tolerance.
75+ If the provided points were subject to some floating-point operations after loading or
76+ generating them, the roundoff error increases (which is not accounted for) and requires
77+ overwriting the `tol` argument.
78+
6179 Args:
62- p (array-like): Point(s) to be placed into b
63- bins (array-like): bins to considering for binning, must be monotonically increasing
64- right_continuous (bool): if true, consider last bin extending to infinity
80+ p (array-like): Point(s) to be placed into bins.
81+ bins (array-like): bin edges; must be sorted (monotonically increasing)
82+ tol (float): overwrite numerical tolerance, by default determined automatically from the
83+ points' dtype to account for the roundoff error.
84+ right_continuous (bool): if true, consider last bin extending to infinity.
6585
6686 Returns:
67- idx (array-like): indexes hashed into grid
87+ numpy.ndarray: indices for the points corresponding to the bins.
6888
6989 Raises:
7090 ValueError:
7191 """
72- bins = numpy .array (bins )
73- p = numpy .array (p )
74- a0 = numpy .min (bins )
75- # if user supplies only a single bin, do 2 things: 1) fix right continuous to true, and use of h is arbitrary
92+ bins = numpy .asarray (bins )
93+ p = numpy .asarray (p )
94+
95+ # if not np.all(bins[:-1] <= bins[1:]): # check for sorted bins, which is a requirement
96+ # raise ValueError("Bin edges are not sorted.") # (pyCSEP always passes sorted bins)
97+ a0 = bins [0 ]
7698 if bins .size == 1 :
99+ # for a single bin, set `right_continuous` to true; h is now arbitrary
77100 right_continuous = True
78- h = 1
101+ h = 1.
79102 else :
80103 h = bins [1 ] - bins [0 ]
81104
82- a0_tol = numpy .abs (a0 ) * numpy .finfo (numpy .float64 ).eps
83- h_tol = numpy .abs (h ) * numpy .finfo (numpy .float64 ).eps
84- p_tol = numpy .abs (p ) * numpy .finfo (numpy .float64 ).eps
85-
86- # absolute tolerance
87- if tol is None :
88- idx = numpy .floor ((p + (p_tol + a0_tol ) - a0 ) / (h - h_tol ))
89- else :
90- idx = numpy .floor ((p + (tol + a0_tol ) - a0 ) / (h - h_tol ))
91105 if h < 0 :
92106 raise ValueError ("grid spacing must be positive and monotonically increasing." )
93- # account for floating point uncertainties by considering extreme case
107+
108+ # account for floating point precision
109+ a0_tol = _get_tolerance (a0 )
110+ h_tol = a0_tol # must be based on *involved* numbers
111+ p_tol = tol or _get_tolerance (p )
112+
113+ idx = numpy .floor ((p - a0 + p_tol + a0_tol ) / (h - h_tol ))
114+ idx = numpy .asarray (idx ) # assure idx is an array
94115
95116 if right_continuous :
96117 # set upper bin index to last
97- try :
98- idx [(idx < 0 )] = - 1
99- idx [(idx >= len (bins ) - 1 )] = len (bins ) - 1
100- except TypeError :
101- if idx >= len (bins ) - 1 :
102- idx = len (bins ) - 1
103- if idx < 0 :
104- idx = - 1
118+ idx [idx < 0 ] = - 1
119+ idx [idx >= len (bins ) - 1 ] = len (bins ) - 1
105120 else :
106- try :
107- idx [((idx < 0 ) | (idx >= len (bins )))] = - 1
108- except TypeError :
109- if idx < 0 or idx >= len (bins ):
110- idx = - 1
111- try :
112- idx = idx .astype (numpy .int64 )
113- except AttributeError :
114- idx = int (idx )
121+ idx [(idx < 0 ) | (idx >= len (bins ))] = - 1
122+ idx = idx .astype (numpy .int64 )
115123 return idx
116124
117125def _compute_likelihood (gridded_data , apprx_rate_density , expected_cond_count , n_obs ):
@@ -215,12 +223,13 @@ def cleaner_range(start, end, h):
215223 Returns:
216224 bin_edges (numpy.ndarray)
217225 """
218- # convert to integers to prevent accumulating floating point errors
219- const = 100000
220- start = numpy .floor (const * start )
221- end = numpy .floor (const * end )
222- d = const * h
223- return numpy .arange (start , end + d / 2 , d ) / const
226+ # determine required scaling to account for decimal places of bin edges and stepping
227+ num_decimals_bins = len (str (float (start )).split ('.' )[1 ])
228+ scale = max (10 ** num_decimals_bins , 1 / h )
229+ start = numpy .round (scale * start )
230+ end = numpy .round (scale * end )
231+ d = scale * h
232+ return numpy .arange (start , end + d / 2 , d ) / scale
224233
225234def first_nonnan (arr , axis = 0 , invalid_val = - 1 ):
226235 mask = arr == arr
0 commit comments