1+ from typing import List
2+ import numpy as np
3+
4+
def merge_small_bins(
    bin_count_observed: List[float],
    bin_count_fitted_data: List[float],
    *,
    min_expected: float = 5.0,
):
    """Merge small bins for goodness-of-fit tests (e.g., chi-square).

    Bins are scanned from the right-most bin to the left-most one. Every bin
    whose expected count is below ``min_expected`` is accumulated. When the
    next bin whose expected count is at or above ``min_expected`` is reached,
    the whole accumulation is folded into that bin (the accumulation is *not*
    closed early, even if its combined expected count already reached the
    threshold). If the left edge is reached while an accumulation is still
    pending, the accumulation is emitted as a bin of its own, even when its
    combined expected count is still below the threshold.

    After merging, the expected counts are rescaled so that their sum equals
    the total observed count (required by Pearson's chi-square test),
    preserving the expected proportions within the merged structure.

    Args:
        bin_count_observed (List[float]):
            Observed counts per original bin. Must be the same length as
            ``bin_count_fitted_data``. Values should be non-negative.
        bin_count_fitted_data (List[float]):
            Expected (model-fitted) counts per original bin. Must be the same
            length as ``bin_count_observed``. Values should be non-negative.
        min_expected (float):
            Keyword-only. A bin is considered "small" when its expected count
            is strictly below this value. Defaults to 5.

    Returns:
        Tuple[np.ndarray, np.ndarray]:
            Two 1D numpy arrays ``(merged_observed, merged_expected)`` in
            low-to-high bin order after merging and rescaling. The two arrays
            are the same length, and ``merged_expected.sum()`` equals
            ``merged_observed.sum()`` up to floating-point rounding. Two empty
            arrays are returned for empty input.

    Raises:
        ValueError: If the input sequences have different lengths.
        ZeroDivisionError: If there is at least one bin but the total expected
            count is 0, so the expected counts cannot be rescaled.

    Notes:
        - The function assumes a one-to-one correspondence of observed and
          expected bins. If lengths differ, only a partial zip would occur; to
          avoid silent truncation a ``ValueError`` is raised.
        - Merging proceeds from right to left and the result is then reversed
          back to low-to-high order.
        - Observed counts of small bins are never discarded, including bins
          whose expected count is exactly 0.
        - The "< 5" rule is a common heuristic for chi-square tests to ensure
          adequate expected counts per bin.

    Examples:
        - Merge tail small bins with the nearest large bin on the left

            ```python
            >>> from statista.utils import merge_small_bins
            >>> merge_small_bins([10, 3, 2], [10, 3, 2])
            (array([15]), array([15.]))

            ```

        - No merging when all expected counts are >= 5

            ```python
            >>> merge_small_bins([10, 20, 30], [12, 18, 30])
            (array([10, 20, 30]), array([12., 18., 30.]))

            ```

        - Accumulated leftmost small bins remain as their own bin if no large bin is found to the left

            ```python
            >>> merge_small_bins([10, 10], [4, 6])
            (array([10, 10]), array([ 8., 12.]))

            ```

        - Expected counts are rescaled to match the observed total while preserving proportions

            ```python
            >>> merge_small_bins([5, 5, 10], [2, 3, 5])
            (array([10, 10]), array([10., 10.]))

            ```
    """
    if len(bin_count_observed) != len(bin_count_fitted_data):
        raise ValueError("bin_count_observed and bin_count_fitted_data must have the same length.")

    merged_obs = []
    merged_exp = []
    accum_obs = 0
    accum_exp = 0
    # Track pending small bins explicitly. Testing ``accum_exp > 0`` instead
    # would silently drop the observed counts of bins whose expected count is 0.
    pending = False

    # zip() is not reversible, so materialise the pairs before walking them
    # from the right-most bin to the left-most one.
    for observed, expected in reversed(list(zip(bin_count_observed, bin_count_fitted_data))):
        if expected < min_expected:
            accum_obs += observed
            accum_exp += expected
            pending = True
        elif pending:
            # Fold the accumulated small bins into this large bin.
            merged_obs.append(accum_obs + observed)
            merged_exp.append(accum_exp + expected)
            accum_obs = accum_exp = 0
            pending = False
        else:
            # Large bin with nothing pending: keep it as is.
            merged_obs.append(observed)
            merged_exp.append(expected)

    # Small bins that reached the left edge without meeting a large bin
    # become a bin of their own.
    if pending:
        merged_obs.append(accum_obs)
        merged_exp.append(accum_exp)

    # Reverse the order back to low -> high.
    merged_obs = np.array(merged_obs[::-1])
    merged_exp = np.array(merged_exp[::-1]).astype(float)

    # Nothing to rescale for empty input; avoid a 0/0 division.
    if merged_exp.size == 0:
        return merged_obs, merged_exp

    total_expected = merged_exp.sum()
    if total_expected == 0:
        # NumPy scalar division would only warn and produce nan/inf here, so
        # raise the documented exception explicitly.
        raise ZeroDivisionError(
            "Total expected count is zero; expected counts cannot be rescaled."
        )

    # Rescale expected counts so they sum to the total number of observations.
    # This is required for Pearson's chi-square test.
    merged_exp *= merged_obs.sum() / total_expected
    return merged_obs, merged_exp
0 commit comments