Merge pull request #32 from arviz-devs/ecdf_pit

aloctavodia · web-flow · commit 9f8b73a73cb8 · 2025-03-12T21:05:37.000+02:00
Remove plots, use arviz instead
diff --git a/docs/examples/gallery/sbc.md b/docs/examples/gallery/sbc.md
@@ -15,9 +15,10 @@ This example demonstrates how to use the `SBC` class for simulation-based calibr
 
 ```{jupyter-execute}
 
-import matplotlib.pyplot as plt
+from arviz_plots import plot_ecdf_pit, style
 import numpy as np
 import simuk
+style.use("arviz-variat")
 ```
 
 ::::::{tab-set}
@@ -42,24 +43,28 @@ with pm.Model() as centered_eight:
     y_obs = pm.Normal('y', mu=theta, sigma=sigma, observed=data)
 ```
 
-Pass the model to the SBC class, set the number of simulations to 100, and run the simulations. This process may take 
-some time since the model runs multiple times.
+Pass the model to the SBC class, set the number of simulations to 100, and run the simulations. This process may take
+some time since the model runs multiple times (100 in this example).
 
 ```{jupyter-execute}
 
 sbc = simuk.SBC(centered_eight,
     num_simulations=100,
     sample_kwargs={'draws': 25, 'tune': 50})
 
-sbc.run_simulations()
+sbc.run_simulations();
 ```
 
-To compare the prior and posterior distributions, we will plot the results. You can adjust the type of visualization 
-using the ``kind`` parameter. We use the empirical CDF to compare the differences between the prior and posterior.
+To compare the prior and posterior distributions, we will plot the results from the simulations,
+using the ArviZ function `plot_ecdf_pit`.
+We expect a uniform distribution; the gray envelope corresponds to the 94% credible interval.
 
 ```{jupyter-execute}
 
-sbc.plot_results(kind="ecdf")
+plot_ecdf_pit(sbc.simulations,
+              pc_kwargs={'col_wrap':4},
+              plot_kwargs={"xlabel":False},
+)
 ```
 
 :::::
@@ -80,7 +85,7 @@ df = pd.DataFrame({"x": x, "y": y})
 bmb_model = bmb.Model("y ~ x", df)
 ```
 
-Pass the model to the `SBC` class, set the number of simulations to 100, and run the simulations. 
+Pass the model to the `SBC` class, set the number of simulations to 100, and run the simulations.
 This process may take some time, as the model runs multiple times
 
 ```{jupyter-execute}
@@ -89,15 +94,14 @@ sbc = simuk.SBC(bmb_model,
     num_simulations=100,
     sample_kwargs={'draws': 25, 'tune': 50})
 
-sbc.run_simulations()
+sbc.run_simulations();
 ```
 
-To compare the prior and posterior distributions, we will plot the results. You can customize the visualization type 
-using the `kind` parameter. The example below displays a histogram.
+To compare the prior and posterior distributions, we will plot the results from the simulations.
+We expect a uniform distribution; the gray envelope corresponds to the 94% credible interval.
 
 ```{jupyter-execute}
-
-sbc.plot_results(kind="hist")
+plot_ecdf_pit(sbc.simulations)
 ```
 
 :::::
diff --git a/docs/examples/img/sbc.png b/docs/examples/img/sbc.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ dynamic = ["version"]
 description = "Simulation based calibration and generation of synthetic data."
 dependencies = [
   "pymc>=5.20",
-  "arviz>=0.20.0",
+  "arviz_base>=0.4.0",
   "tqdm"
 ]
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -6,5 +6,5 @@ pre-commit>=2.19
 ipytest==0.13.0
 pymc>=5.20.1
 bambi>=0.13.0
-arviz>=0.20.0
+arviz_base>=0.4.0
 ruff==0.9.1
diff --git a/requirements-docs.txt b/requirements-docs.txt
@@ -1,7 +1,8 @@
 pydata-sphinx-theme>=0.6.3
 myst-nb
-pymc @ git+https://github.com/pymc-devs/pymc@main
+pymc>=5.20.1
 bambi>=0.15.0
+arviz_plots @ git+https://github.com/arviz-devs/arviz-plots@main
 sphinx>=4
 sphinx-copybutton
 sphinx_tabs
diff --git a/simuk/plots.py b/simuk/plots.py
diff --git a/simuk/sbc.py b/simuk/sbc.py
@@ -2,14 +2,13 @@
 
 import logging
 from copy import copy
+from importlib.metadata import version
 
-import arviz as az
 import numpy as np
 import pymc as pm
+from arviz_base import extract, from_dict
 from tqdm import tqdm
 
-from simuk.plots import plot_results
-
 
 class quiet_logging:
     """Turn off logging for PyMC, Bambi and PyTensor."""
@@ -93,31 +92,48 @@ def __init__(
 
         self.simulations = {name: [] for name in self.var_names}
         self._simulations_complete = 0
-        self._seed = seed
+        self.seed = seed
+        self._seeds = self._get_seeds()
 
     def _get_seeds(self):
         """Set the random seed, and generate seeds for all the simulations."""
-        if self._seed is not None:
-            np.random.seed(self._seed)
-        return np.random.randint(2**30, size=self.num_simulations)
+        rng = np.random.default_rng(self.seed)
+        return rng.integers(0, 2**30, size=self.num_simulations)
 
     def _get_prior_predictive_samples(self):
         """Generate samples to use for the simulations."""
         with self.model:
-            idata = pm.sample_prior_predictive(samples=self.num_simulations)
-            prior_pred = az.extract(idata, group="prior_predictive")
-            prior = az.extract(idata, group="prior")
+            idata = pm.sample_prior_predictive(
+                samples=self.num_simulations, random_seed=self._seeds[0]
+            )
+            prior_pred = extract(idata, group="prior_predictive", keep_dataset=True)
+            prior = extract(idata, group="prior", keep_dataset=True)
         return prior, prior_pred
 
     def _get_posterior_samples(self, prior_predictive_draw):
         """Generate posterior samples conditioned to a prior predictive sample."""
         new_model = pm.observe(self.model, prior_predictive_draw)
         with new_model:
-            check = pm.sample(**self.sample_kwargs)
+            check = pm.sample(
+                **self.sample_kwargs, random_seed=self._seeds[self._simulations_complete]
+            )
 
-        posterior = az.extract(check, group="posterior")
+        posterior = extract(check, group="posterior", keep_dataset=True)
         return posterior
 
+    def _convert_to_datatree(self):
+        self.simulations = from_dict(
+            {"prior_sbc": self.simulations},
+            attrs={
+                "/": {
+                    "inferece_library": self.engine,
+                    "inferece_library_version": version(self.engine),
+                    "modeling_interface": "simuk",
+                    "modeling_interface_version": version("simuk"),
+                }
+            },
+        )
+
     @quiet_logging("pymc", "pytensor.gof.compilelock", "bambi")
     def run_simulations(self):
         """Run all the simulations.
@@ -127,7 +143,6 @@ def run_simulations(self):
         seed was passed initially, it will still be respected (that is, the resulting
         simulations will be identical to running without pausing in the middle).
         """
-        seeds = self._get_seeds()
         prior, prior_pred = self._get_prior_predictive_samples()
 
         progress = tqdm(
@@ -142,8 +157,6 @@ def run_simulations(self):
                     for var_name in self.observed_vars
                 }
 
-                np.random.seed(seeds[idx])
-
                 posterior = self._get_posterior_samples(prior_predictive_draw)
                 for name in self.var_names:
                     self.simulations[name].append(
@@ -153,34 +166,8 @@ def run_simulations(self):
                 progress.update()
         finally:
             self.simulations = {
-                k: v[: self._simulations_complete] for k, v in self.simulations.items()
+                k: np.stack(v[: self._simulations_complete])[None, :]
+                for k, v in self.simulations.items()
             }
+            self._convert_to_datatree()
             progress.close()
-
-    def plot_results(self, kind="ecdf", var_names=None, color="C0"):
-        """Visual diagnostic for SBC.
-
-        Currently it support two options: `ecdf` for the empirical CDF plots
-        of the difference between prior and posterior. `hist` for the rank
-        histogram.
-
-
-        Parameters
-        ----------
-        simulations : dict[str] -> listlike
-            The SBC.simulations dictionary.
-        kind : str
-            What kind of plot to make. Supported values are 'ecdf' (default) and 'hist'
-        var_names : list[str]
-            Variables to plot (defaults to all)
-        figsize : tuple
-            Figure size for the plot. If None, it will be defined automatically.
-        color : str
-            Color to use for the eCDF or histogram
-
-        Returns
-        -------
-        fig, axes
-            matplotlib figure and axes
-        """
-        return plot_results(self.simulations, kind=kind, var_names=var_names, color=color)
diff --git a/simuk/tests/test_sbc.py b/simuk/tests/test_sbc.py
@@ -17,18 +17,18 @@
     theta = pm.Normal("theta", mu=mu, sigma=tau, shape=8)
     y_obs = pm.Normal("y", mu=theta, sigma=sigma, observed=data)
 
-x = np.random.normal(0, 1, 200)
+x = np.random.normal(0, 1, 20)
 y = 2 + np.random.normal(x, 1)
 df = pd.DataFrame({"x": x, "y": y})
 bmb_model = bmb.Model("y ~ x", df)
 
 
-@pytest.mark.parametrize("model, kind", [(centered_eight, "ecdf"), (bmb_model, "hist")])
-def test_sbc(model, kind):
+@pytest.mark.parametrize("model", [centered_eight, bmb_model])
+def test_sbc(model):
     sbc = simuk.SBC(
         model,
         num_simulations=10,
-        sample_kwargs={"draws": 25, "tune": 50},
+        sample_kwargs={"draws": 5, "tune": 5},
     )
     sbc.run_simulations()
-    sbc.plot_results(kind=kind)
+    assert "prior_sbc" in sbc.simulations

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ dynamic = ["version"]`
`25`	`25`	`description = "Simulation based calibration and generation of synthetic data."`
`26`	`26`	`dependencies = [`
`27`	`27`	`"pymc>=5.20",`
`28`		`- "arviz>=0.20.0",`
	`28`	`+ "arviz_base>=0.4.0",`
`29`	`29`	`"tqdm"`
`30`	`30`	`]`
`31`	`31`