Skip to content

Commit edeea16

Browse files
authored
Merge pull request #897 from alan-turing-institute/calibration_plot
Add calibration plot
2 parents 243a0f9 + 8070772 commit edeea16

File tree

3 files changed

+254
-9
lines changed

3 files changed

+254
-9
lines changed

autoemulate/core/compare.py

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
calculate_subplot_layout,
1818
create_and_plot_slice,
1919
display_figure,
20+
plot_calibration_from_distributions,
2021
plot_xy,
2122
)
2223
from autoemulate.core.reinitialize import fit_from_reinitialized
@@ -27,6 +28,7 @@
2728
DeviceLike,
2829
InputLike,
2930
ModelParams,
31+
TensorLike,
3032
TransformedEmulatorParams,
3133
)
3234
from autoemulate.data.utils import ConversionMixin, set_random_seed
@@ -36,7 +38,7 @@
3638
PYTORCH_EMULATORS,
3739
get_emulator_class,
3840
)
39-
from autoemulate.emulators.base import Emulator
41+
from autoemulate.emulators.base import Emulator, ProbabilisticEmulator
4042
from autoemulate.emulators.transformed.base import TransformedEmulator
4143
from autoemulate.transforms.base import AutoEmulateTransform
4244
from autoemulate.transforms.standardize import StandardizeTransform
@@ -839,6 +841,78 @@ def plot_surface(
839841
fig.savefig(fname, bbox_inches="tight")
840842
return None
841843

844+
def plot_calibration(
    self,
    emulator: ProbabilisticEmulator,
    x_test: TensorLike | None = None,
    y_test: TensorLike | None = None,
    levels: np.ndarray | None = None,
    n_samples: int = 2000,
    joint: bool = False,
    title: str | None = None,
    legend: bool = True,
    fname: str | None = None,
    figsize: tuple[int, int] | None = None,
    **kwargs,
):
    """Plot calibration curve(s) for a given emulator.

    Draws empirical coverage (y-axis) against nominal coverage (x-axis)
    for the emulator's predictive distribution.

    Parameters
    ----------
    emulator: ProbabilisticEmulator
        Emulator that outputs a predictive distribution.
    x_test: TensorLike | None
        Optional test inputs. If None, the held out test data is used.
        Defaults to None.
    y_test: TensorLike | None
        Optional true test outputs. If None, the held out test data is used.
        Defaults to None.
    levels: array-like, optional
        Nominal coverage levels (between 0 and 1). If None, a default grid is
        used.
    n_samples: int
        Number of Monte-Carlo samples to draw from the predictive
        distribution to compute empirical intervals if analytical quantiles
        are not available.
    joint: bool
        If True and the predictive outputs are multivariate, compute joint
        coverage (i.e., the true vector must lie inside the interval for all
        dimensions). If False (default), compute marginal coverage per output
        dimension and return the mean across data points.
    title: str | None
        An optional title for the plot. Defaults to None (no title).
    legend: bool
        Whether to display a legend. Defaults to True.
    fname: str | None
        If provided, the figure will be saved to this file path. If None, the figure
        will be displayed. Defaults to None.
    figsize: tuple[int, int] | None
        The size of the figure to create. If None, a default size is used.
        Defaults to None.
    """
    # Supplying exactly one of x_test / y_test is an error: callers must pass
    # both, or neither (in which case the held out test split is used).
    if (x_test is None) != (y_test is None):
        msg = (
            "Both x_test and y_test must be provided, or neither to use held "
            "out test data."
        )
        raise ValueError(msg)
    if x_test is None and y_test is None:
        self.logger.info(
            "Using held out test data for calibration plot. "
            "To use different data, provide x_test and y_test."
        )
        x_test, y_test = self._convert_to_tensors(self.test)

    pred_dist = emulator.predict(x_test)
    fig, _ = plot_calibration_from_distributions(
        pred_dist, y_test, levels, n_samples, joint, title, legend, figsize
    )

    # Save to disk when a filename is given; otherwise display inline.
    if fname is not None:
        fig.savefig(fname, bbox_inches="tight")
        return None
    return display_figure(fig)
915+
842916
def save(
843917
self,
844918
model_obj: int | Emulator | Result,

autoemulate/core/plotting.py

Lines changed: 158 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from matplotlib.axes import Axes
66
from matplotlib.figure import Figure
77

8-
from autoemulate.core.types import NumpyLike, TensorLike
8+
from autoemulate.core.types import DistributionLike, GaussianLike, NumpyLike, TensorLike
99
from autoemulate.emulators.base import Emulator
1010

1111

@@ -236,7 +236,6 @@ def mean_and_var_surface(
236236
The predicted variance on the grid.
237237
grid: list[TensorLike]
238238
The grid of parameter values used for predictions.
239-
240239
"""
241240
# Determine which parameters to vary and which to fix
242241
grid_params = {}
@@ -412,8 +411,162 @@ def create_and_plot_slice(
412411
param_pair_names,
413412
vmin,
414413
vmax,
415-
fixed_params_info=f"{', '.join(fixed_params)} at {quantile:.1f} quantile"
416-
if len(fixed_params) > 0
417-
else "None",
414+
fixed_params_info=(
415+
f"{', '.join(fixed_params)} at {quantile:.1f} quantile"
416+
if len(fixed_params) > 0
417+
else "None"
418+
),
418419
)
419420
return fig, ax
421+
422+
423+
def coverage_from_distributions(
    y_pred: DistributionLike,
    y_true: TensorLike,
    levels: list[float] | NumpyLike | TensorLike | None = None,
    n_samples: int = 2000,
    joint: bool = False,
) -> tuple[NumpyLike, NumpyLike]:
    """Compute empirical coverage for a set of nominal confidence levels.

    Parameters
    ----------
    y_pred: DistributionLike
        The emulator predicted distribution.
    y_true: TensorLike
        The true values.
    levels: array-like, optional
        Nominal coverage levels (between 0 and 1). If None, a default grid is
        used. Defaults to None.
    n_samples: int
        Number of Monte-Carlo samples to draw from the predictive
        distribution to compute empirical intervals if analytical quantiles
        are not available.
    joint: bool
        If True and the predictive outputs are multivariate, compute joint
        coverage (i.e., the true vector must lie inside the interval for all
        dimensions). If False (default), compute marginal coverage per output
        dimension and return the mean across data points.

    Returns
    -------
    levels: np.ndarray
        Nominal coverage levels.
    empirical: np.ndarray
        Empirical coverages. Shape is (len(levels), output_dim) when
        `joint=False` and output_dim>1, or (len(levels),) when joint=True or
        output_dim==1.
    """
    if levels is None:
        levels = np.linspace(0.0, 1.0, 51)
    levels = np.asarray(levels)

    # Prefer analytical quantiles when the predictive distribution is Gaussian
    # (possibly wrapped in Independent); otherwise fall back to empirical
    # intervals computed from Monte-Carlo sample quantiles.
    samples = None
    y_dist = None
    if isinstance(y_pred, GaussianLike):
        y_dist = y_pred
    elif isinstance(y_pred, torch.distributions.Independent) and isinstance(
        y_pred.base_dist, GaussianLike
    ):
        y_dist = y_pred.base_dist
    else:
        samples = y_pred.sample((n_samples,))

    empirical_list = []
    for p in levels:
        # Central interval: e.g. p=0.9 -> quantiles at 0.05 and 0.95.
        lower_q = (1.0 - p) / 2.0
        upper_q = 1.0 - lower_q

        if y_dist is not None:
            # BUG FIX: torch distribution icdf implementations (e.g.
            # Normal.icdf, which calls torch.erfinv) require a Tensor
            # argument; passing the raw NumPy float raised a TypeError.
            # Convert on y_true's dtype/device (also avoids a CPU/GPU
            # device mismatch).
            lower = y_dist.icdf(
                torch.as_tensor(lower_q, dtype=y_true.dtype, device=y_true.device)
            )
            upper = y_dist.icdf(
                torch.as_tensor(upper_q, dtype=y_true.dtype, device=y_true.device)
            )
        else:
            assert samples is not None
            lower = torch.quantile(samples, float(lower_q), dim=0)
            upper = torch.quantile(samples, float(upper_q), dim=0)

        inside = (y_true >= lower) & (y_true <= upper)
        if joint:
            # Joint coverage: the whole output vector must be inside.
            inside_all = inside.all(dim=-1)
            empirical = inside_all.float().mean().item()
        else:
            # marginal per-dim coverage
            empirical = inside.float().mean(dim=0).cpu().numpy()
        empirical_list.append(empirical)

    empirical_arr = np.asarray(empirical_list)

    return levels, empirical_arr
501+
502+
503+
def plot_calibration_from_distributions(
    y_pred: DistributionLike,
    y_true: TensorLike,
    levels: np.ndarray | None = None,
    n_samples: int = 2000,
    joint: bool = False,
    title: str | None = None,
    legend: bool = True,
    figsize: tuple[int, int] | None = None,
):
    """Plot calibration curve(s) given predictive distributions and true values.

    This draws empirical coverage (y-axis) against nominal coverage (x-axis).

    When points lie above or below the diagonal, this indicates that uncertainty
    is respectively being overestimated or underestimated.

    Parameters
    ----------
    y_pred: DistributionLike
        The emulator predicted distribution.
    y_true: TensorLike
        The true values.
    levels: array-like, optional
        Nominal coverage levels (between 0 and 1). If None, a default grid is
        used.
    n_samples: int
        Number of Monte-Carlo samples to draw from the predictive
        distribution to compute empirical intervals.
    joint: bool
        If True and the predictive outputs are multivariate, compute joint
        coverage (i.e., the true vector must lie inside the interval for all
        dimensions). If False (default), compute marginal coverage per output
        dimension and return the mean across data points.
    title: str | None
        An optional title for the plot. Defaults to None (no title).
    legend: bool
        Whether to display a legend. Defaults to True.
    figsize: tuple[int, int] | None
        The size of the figure to create. If None, a default size is used.
    """
    nominal, observed = coverage_from_distributions(
        y_pred, y_true, levels=levels, n_samples=n_samples, joint=joint
    )

    fig, ax = plt.subplots(figsize=(6, 6) if figsize is None else figsize)

    # Single output (or joint coverage): one curve. Otherwise one curve per
    # output dimension.
    if observed.ndim == 1 or observed.shape[1] == 1:
        ax.plot(nominal, observed, marker="o", label="empirical")
    else:
        for i in range(observed.shape[1]):
            ax.plot(nominal, observed[:, i], marker="o", label=f"$y_{i}$")

    # Perfectly calibrated predictions lie on this diagonal.
    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="ideal")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Expected coverage")
    ax.set_ylabel("Observed coverage")

    if title:
        ax.set_title(title)
    ax.grid(alpha=0.3)
    if legend:
        ax.legend()

    return fig, ax

docs/tutorials/emulation/01_quickstart.ipynb

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,9 @@
258258
"cell_type": "markdown",
259259
"metadata": {},
260260
"source": [
261-
"As well as plotting the data, we can directly plot the predicted mean and variance of the emulator for a pair of variables while holding the other variables constant at a given quantile. API to support plotting for a subset of the parameter and output range is also supported."
261+
"As well as plotting the data, we can directly plot the predicted mean and variance of the emulator for a pair of variables while holding the other variables constant at a given quantile. API to support plotting for a subset of the parameter and output range is also supported.\n",
262+
"\n",
263+
"The emulator predicted mean captures the simulated data plotted at the top of the tutorial well. The predicted variance is low where we have data, and increases away from the data. "
262264
]
263265
},
264266
{
@@ -270,6 +272,22 @@
270272
"ae.plot_surface(best.model, projectile.parameters_range, quantile=0.5)\n"
271273
]
272274
},
275+
{
276+
"cell_type": "markdown",
277+
"metadata": {},
278+
"source": [
279+
"We can also visualise the calibration of the emulator's predicted uncertainty on the held out test data. The closer the line is to the diagonal, the better calibrated the uncertainty is. Line above the diagonal overestimates the uncertainty while line below the diagonal underestimates it."
280+
]
281+
},
282+
{
283+
"cell_type": "code",
284+
"execution_count": null,
285+
"metadata": {},
286+
"outputs": [],
287+
"source": [
288+
"ae.plot_calibration(best.model)"
289+
]
290+
},
273291
{
274292
"cell_type": "markdown",
275293
"metadata": {},
@@ -358,7 +376,7 @@
358376
],
359377
"metadata": {
360378
"kernelspec": {
361-
"display_name": ".venv",
379+
"display_name": "autoemulate",
362380
"language": "python",
363381
"name": "python3"
364382
},
@@ -372,7 +390,7 @@
372390
"name": "python",
373391
"nbconvert_exporter": "python",
374392
"pygments_lexer": "ipython3",
375-
"version": "3.12.11"
393+
"version": "3.12.7"
376394
}
377395
},
378396
"nbformat": 4,

0 commit comments

Comments
 (0)