Skip to content

Commit eb52d28

Browse files
authored
feat(stats): improve chisquare test with p-value and small bin merging (#119)
feat(stats): improve chisquare test with p-value and small bin merging - Added `merge_small_bins` utility with tests - Updated `chisquare` method to calculate p-value and merge small bins for more reliable results - Refactored method signature and simplified error handling - Integrated `chisquare` call into all `fit_model` methods - Added documentation for `utils` module and updated changelog and PR template ref: #110
1 parent 325dd88 commit eb52d28

File tree

10 files changed

+687
-394
lines changed

10 files changed

+687
-394
lines changed

.github/pull_request_template.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Check relevant points.
2929
# Checklist:
3030

3131
- [ ] updated version number in pyproject.toml
32-
- [ ] added changes to History.rst
32+
- [ ] added changes to docs/change-log.md
3333
- [ ] updated the latest version in README file
3434
- [ ] I have added tests that prove my fix is effective or that my feature works
3535
- [ ] New and existing unit tests pass locally with my changes

docs/change-log.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## 0.6.3 (2025-08-08)
4+
##### Distributions
5+
* fix the `chisquare` method for all distributions.
6+
37
## 0.6.2 (2025-07-31)
48
##### Docs
59
* add complete documentation for all modules.

docs/reference/utils-module.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##### utils module
2+
3+
::: statista.utils
4+
options:
5+
show_root_heading: true
6+
show_source: true
7+
heading_level: 3
8+
members_order: source

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ nav:
111111
- Plot: reference/plot-class.md
112112
- Sensitivity Analysis: reference/sensitivity-class.md
113113
- Tools: reference/tools-module.md
114+
- utils: reference/utils-module.md
114115
- Examples:
115116
- sensitivity-analysis:
116117
- Sensitivity Analysis: notebook/sensitivity-analysis/scs-cn.ipynb

poetry.lock

Lines changed: 391 additions & 369 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "statista"
3-
version = "0.6.2"
3+
version = "0.6.3"
44
description = "statistics package"
55
readme = "README.md"
66
requires-python = ">=3.11,<4"
@@ -76,7 +76,7 @@ Repository = "https://github.com/Serapieum-of-alex/statista"
7676

7777

7878
[tool.pytest.ini_options]
79-
# Anything you would have put in addopts/pytest.ini
79+
8080
addopts = [
8181
"--cov",
8282
"--cov-branch",
@@ -88,6 +88,9 @@ testpaths = ["tests"]
8888
markers = [
8989
"slow: mark test as slow.",
9090
"fast: mark test as fast.",
91+
"e2e: marks tests as e2e (deselect with '-m \"not e2e\"')",
92+
"unit: marks tests as unit (deselect with '-m \"not unit\"')",
93+
"integration: marks tests as integration (deselect with '-m \"not integration\"')",
9194
]
9295

9396
[tool.flake8]

src/statista/distributions.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from statista.confidence_interval import ConfidenceInterval
1616
from statista.parameters import Lmoments
1717
from statista.plot import Plot
18-
from statista.tools import Tools as st
18+
from statista.utils import merge_small_bins
19+
1920

2021
ninf = 1e-5
2122

@@ -538,17 +539,25 @@ def ks(self) -> tuple:
538539
return test.statistic, test.pvalue
539540

540541
@abstractmethod
541-
def chisquare(self) -> Union[tuple, None]:
542+
def chisquare(self) -> Tuple[float, float]:
542543
"""Perform the Chi-square test for goodness of fit.
543544
544-
This method tests whether the data follows the fitted distribution using
545-
the Chi-square test. The test compares the observed frequencies with the
545+
- `chisquare test` refers to Pearson’s chi square goodness of fit test. It is designed for
546+
categorical/count data: you observe how many points fall into each bin and compare those counts with the
547+
frequencies expected under some hypothesis
548+
549+
This method tests whether the data follows the fitted distribution using the Chi-square test.
550+
The test compares the observed frequencies (number of values in each category/histogram bin) with the
546551
expected frequencies under the fitted distribution.
547552
548553
Returns:
549554
Tuple containing:
550555
- Chi-square statistic: The test statistic measuring the difference between
551556
observed and expected frequencies.
557+
The χ² statistic is simply a measure of how far your observed counts deviate from the counts you would
558+
expect if the fitted distribution were correct. For each bin 𝑖 we compute the squared difference
559+
between the observed count 𝑂𝑖 and the expected count 𝐸𝑖, scaled by 𝐸𝑖, and then sum over all bins:
560+
552561
- p-value: The probability of observing a Chi-square statistic as extreme as the one calculated,
553562
assuming the null hypothesis is true (data follows the distribution).
554563
If p-value < significance level (typically 0.05), reject the null hypothesis.
@@ -562,15 +571,17 @@ def chisquare(self) -> Union[tuple, None]:
562571
"The Value of parameters is unknown. Please use 'fit_model' to estimate the distribution parameters"
563572
)
564573

565-
qth = self.inverse_cdf(self.cdf_weibul, self.parameters)
566-
try:
567-
test = chisquare(st.standardize(qth), st.standardize(self.data))
568-
print("-----chisquare Test-----")
569-
print("Statistic = " + str(test.statistic))
570-
print("P value = " + str(test.pvalue))
571-
return test.statistic, test.pvalue
572-
except Exception as e:
573-
print(e)
574+
bin_edges = np.histogram_bin_edges(self.data, bins="sturges")
575+
obs_counts, _ = np.histogram(self.data, bins=bin_edges)
576+
577+
expected_prob = np.diff(self._cdf_eq(bin_edges, self.parameters))
578+
expected_counts = expected_prob * len(self.data)
579+
580+
# Pearson’s χ² test assumes each expected count is sufficiently large (at least about 5); otherwise the asymptotic χ² approximation is unreliable
581+
merged_obs, merged_exp = merge_small_bins(obs_counts, expected_counts)
582+
583+
test = chisquare(merged_obs, f_exp=merged_exp, ddof=len(self.parameters))
584+
return test.statistic, test.pvalue
574585

575586
def confidence_interval(
576587
self,
@@ -1345,8 +1356,6 @@ def fit_model(
13451356
Statistic = 0.019
13461357
Accept Hypothesis
13471358
P value = 0.9937026761524456
1348-
1349-
13501359
>>> print(parameters)
13511360
{'loc': np.float64(0.010101355750222706), 'scale': 1.0313042643102108}
13521361
@@ -1419,7 +1428,7 @@ def fit_model(
14191428

14201429
if test:
14211430
self.ks()
1422-
# self.chisquare()
1431+
self.chisquare()
14231432

14241433
return param
14251434

@@ -1523,8 +1532,7 @@ def _inv_cdf(
15231532
scale = parameters.get("scale")
15241533
if scale <= 0:
15251534
raise ValueError(SCALE_PARAMETER_ERROR)
1526-
# the main equation from scipy
1527-
# Qth = loc - scale * (np.log(-np.log(cdf)))
1535+
15281536
qth = gumbel_r.ppf(cdf, loc=loc, scale=scale)
15291537

15301538
return qth
@@ -2330,6 +2338,7 @@ def fit_model(
23302338

23312339
if test:
23322340
self.ks()
2341+
self.chisquare()
23332342

23342343
return param
23352344

@@ -3049,6 +3058,7 @@ def fit_model(
30493058

30503059
if test:
30513060
self.ks()
3061+
self.chisquare()
30523062

30533063
return param
30543064

@@ -3370,6 +3380,7 @@ def fit_model(
33703380

33713381
if test:
33723382
self.ks()
3383+
self.chisquare()
33733384

33743385
return param
33753386

src/statista/utils.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
from typing import List, Tuple

import numpy as np


def merge_small_bins(
    bin_count_observed: List[float], bin_count_fitted_data: List[float]
) -> Tuple[np.ndarray, np.ndarray]:
    """Merge small bins for goodness-of-fit tests (e.g., chi-square).

    This utility merges adjacent "small" bins (those whose expected count is < 5)
    starting from the right-most bin and moving left, accumulating small bins
    until a large (>= 5) bin is encountered, at which point the accumulation is
    merged into that bin. If the left edge is reached with a remaining
    accumulation that was never merged into a large bin, the accumulation is
    appended as its own bin.

    After merging, the expected counts are rescaled so that their sum equals the
    total observed count (required by Pearson's chi-square test), preserving the
    expected proportions within the merged structure.

    Args:
        bin_count_observed (List[float]):
            Observed counts per original bin. Must be the same length as
            ``bin_count_fitted_data``. Values should be non-negative.
        bin_count_fitted_data (List[float]):
            Expected (model-fitted) counts per original bin. Must be the same
            length as ``bin_count_observed``. Values should be non-negative.

    Returns:
        Tuple[np.ndarray, np.ndarray]:
            Two 1D numpy arrays ``(merged_observed, merged_expected)`` in
            low-to-high bin order after merging and rescaling. The two arrays
            are the same length, and ``merged_expected.sum() ==
            merged_observed.sum()``.

    Raises:
        ZeroDivisionError: If the total expected count across merged bins is 0,
            rescaling cannot be performed. This can happen if all expected
            counts are zero. (Raised explicitly: numpy float division by zero
            would otherwise only emit a warning and produce nan/inf.)
        ValueError: If the input sequences have different lengths.

    Notes:
        - The function assumes a one-to-one correspondence of observed and
          expected bins. If lengths differ, only a partial zip would occur; to
          avoid silent truncation a ``ValueError`` is raised.
        - Merging proceeds from right to left and the result is then reversed
          back to low-to-high order.
        - The "< 5" rule is a common heuristic for chi-square tests to ensure
          adequate expected counts per bin.
        - Accumulated observed counts are never dropped: even when the
          accumulated *expected* count is exactly 0 (all-zero expected tail),
          the accumulation is still flushed or merged into the neighbouring
          large bin.

    Examples:
        - Merge tail small bins with the nearest large bin on the left

          ```python
          >>> from statista.utils import merge_small_bins
          >>> merge_small_bins([10, 3, 2], [10, 3, 2])
          (array([15]), array([15.]))

          ```

        - No merging when all expected counts are >= 5

          ```python
          >>> merge_small_bins([10, 20, 30], [12, 18, 30])
          (array([10, 20, 30]), array([12., 18., 30.]))

          ```

        - Accumulated leftmost small bins remain as their own bin if no large bin is found to the left

          ```python
          >>> merge_small_bins([10, 10], [4, 6])
          (array([10, 10]), array([ 8., 12.]))

          ```

        - Expected counts are rescaled to match the observed total while preserving proportions

          ```python
          >>> merge_small_bins([5, 5, 10], [2, 3, 5])
          (array([10, 10]), array([10., 10.]))

          ```
    """
    if len(bin_count_observed) != len(bin_count_fitted_data):
        raise ValueError("bin_count_observed and bin_count_fitted_data must have the same length.")

    # Merge tail bins whose expected counts are < 5
    merged_obs = []
    merged_exp = []
    accum_obs = 0
    accum_exp = 0

    # Work from the rightmost bin backwards, accumulating small bins until a
    # large (expected >= 5) bin is encountered.
    for observed, expected in reversed(list(zip(bin_count_observed, bin_count_fitted_data))):
        if expected < 5:
            accum_obs += observed
            accum_exp += expected
        elif accum_obs > 0 or accum_exp > 0:
            # Combine the accumulated small bins with this large bin.
            # The check includes accum_obs so that observed counts in bins with
            # an expected count of exactly 0 are not silently discarded.
            merged_obs.append(accum_obs + observed)
            merged_exp.append(accum_exp + expected)
            accum_obs = accum_exp = 0
        else:
            # keep this bin separate
            merged_obs.append(observed)
            merged_exp.append(expected)

    # Append any remaining accumulated bins (again checking observed counts so
    # a trailing all-zero-expected accumulation is not dropped).
    if accum_obs > 0 or accum_exp > 0:
        merged_obs.append(accum_obs)
        merged_exp.append(accum_exp)

    # Reverse the order back to low→high
    merged_obs = np.array(merged_obs[::-1])
    merged_exp = np.array(merged_exp[::-1]).astype(float)

    # Rescale expected counts so they sum to the total number of observations.
    # This is required for Pearson's χ² test. numpy would divide by a zero
    # total with only a RuntimeWarning, so raise explicitly as documented.
    total_expected = merged_exp.sum()
    if total_expected == 0:
        raise ZeroDivisionError(
            "Total expected count is zero; cannot rescale expected frequencies."
        )
    merged_exp *= merged_obs.sum() / total_expected
    return merged_obs, merged_exp

tests/distribution/test_distributions.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,9 @@ def test_chisquare(
140140
):
141141
param = gum_dist_parameters[dist_estimation_parameters_ks]
142142
dist = Gumbel(time_series2, param)
143-
dstatic, _ = dist.chisquare()
144-
assert dstatic == pytest.approx(-0.2813945052127964)
143+
dstatic, p_value = dist.chisquare()
144+
assert dstatic == pytest.approx(0.5768408126308443)
145+
assert p_value == pytest.approx(0.7494464539783021)
145146

146147
def test_pdf(
147148
self,
@@ -288,8 +289,8 @@ def test_gev_chisquare(
288289
):
289290
param = gev_dist_parameters[dist_estimation_parameters_ks]
290291
dist = GEV(time_series1, param)
291-
dstatic, _ = dist.chisquare()
292-
assert dstatic == pytest.approx(-22.906818156545253)
292+
dstatic, p_value = dist.chisquare()
293+
assert dstatic == pytest.approx(1.745019092902356)
293294

294295
def test_gev_pdf(
295296
self,

0 commit comments

Comments
 (0)