Skip to content

Commit d644cfa

Browse files
committed
Update Metric and evaluate() API to support OutputLike
1 parent f25fe33 commit d644cfa

File tree

4 files changed

+94
-15
lines changed

4 files changed

+94
-15
lines changed

autoemulate/core/compare.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,7 @@
1212

1313
from autoemulate.core.device import TorchDeviceMixin
1414
from autoemulate.core.logging_config import get_configured_logger
15-
from autoemulate.core.metrics import (
16-
TorchMetrics,
17-
get_metric_config,
18-
get_metric_configs,
19-
)
15+
from autoemulate.core.metrics import TorchMetrics, get_metric_config, get_metric_configs
2016
from autoemulate.core.model_selection import bootstrap, evaluate
2117
from autoemulate.core.plotting import (
2218
calculate_subplot_layout,

autoemulate/core/metrics.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ def __repr__(self) -> str:
3737
return f"Metric(name={self.name}, maximize={self.maximize})"
3838

3939
@abstractmethod
40-
def __call__(self, y_pred: OutputLike, y_true: TensorLike) -> TensorLike:
40+
def __call__(
41+
self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
42+
) -> TensorLike:
4143
"""Calculate metric."""
4244

4345

@@ -64,13 +66,21 @@ def __init__(
6466
self.name = name
6567
self.maximize = maximize
6668

67-
def __call__(self, y_pred: OutputLike, y_true: TensorLike) -> TensorLike:
69+
def __call__(
70+
self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
71+
) -> TensorLike:
6872
"""Calculate metric."""
69-
if not isinstance(y_pred, TensorLike):
73+
if not isinstance(y_pred, OutputLike):
7074
raise ValueError(f"Metric not implemented for y_pred ({type(y_pred)})")
7175
if not isinstance(y_true, TensorLike):
7276
raise ValueError(f"Metric not implemented for y_true ({type(y_true)})")
7377

78+
# Handle probabilistic predictions
79+
if isinstance(y_pred, DistributionLike):
80+
try:
81+
y_pred = y_pred.mean
82+
except Exception:
83+
y_pred = y_pred.rsample((n_samples,)).mean(dim=0)
7484
metric = self.metric()
7585
metric.to(y_pred.device)
7686
# Assume first dim is a batch dim, flatten others for metric calculation
@@ -82,7 +92,9 @@ class ProbabilisticMetric(Metric):
8292
"""Base class for probabilistic metrics."""
8393

8494
@abstractmethod
85-
def __call__(self, y_pred: OutputLike, y_true: TensorLike) -> TensorLike:
95+
def __call__(
96+
self, y_pred: OutputLike, y_true: TensorLike, n_samples: int = 1000
97+
) -> TensorLike:
8698
"""Calculate metric."""
8799

88100

autoemulate/core/model_selection.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from autoemulate.core.types import (
1515
DeviceLike,
1616
ModelParams,
17+
OutputLike,
1718
TensorLike,
1819
TransformedEmulatorParams,
1920
)
@@ -25,27 +26,31 @@
2526

2627

2728
def evaluate(
28-
y_pred: TensorLike,
29+
y_pred: OutputLike,
2930
y_true: TensorLike,
3031
metric: Metric = R2,
32+
n_samples: int = 1000,
3133
) -> float:
3234
"""
3335
Evaluate Emulator prediction performance using a `torchmetrics.Metric`.
3436
3537
Parameters
3638
----------
39+
y_pred: OutputLike
40+
Predicted target values, as returned by an Emulator.
3741
y_true: TensorLike
3842
Ground truth target values.
39-
y_pred: TensorLike
40-
Predicted target values, as returned by an Emulator.
4143
metric: Metric
4244
Metric to use for evaluation. Defaults to R2.
45+
n_samples: int
46+
Number of samples to draw to estimate the mean when y_pred does not expose a
47+
mean directly. Defaults to 1000.
4348
4449
Returns
4550
-------
4651
float
4752
"""
48-
return metric(y_pred, y_true).item()
53+
return metric(y_pred, y_true, n_samples=n_samples).item()
4954

5055

5156
def cross_validate(
@@ -139,7 +144,7 @@ def cross_validate(
139144
transformed_emulator.fit(x, y)
140145

141146
# compute and save results
142-
y_pred = transformed_emulator.predict_mean(x_val)
147+
y_pred = transformed_emulator.predict(x_val)
143148
for metric in metrics:
144149
score = evaluate(y_pred, y_val, metric)
145150
cv_results[metric.name].append(score)
@@ -192,7 +197,7 @@ def bootstrap(
192197

193198
# If no bootstraps are specified, fall back to a single evaluation on given data
194199
if n_bootstraps is None:
195-
y_pred = model.predict_mean(x, n_samples=n_samples)
200+
y_pred = model.predict(x)
196201
results = {}
197202
for metric in metrics:
198203
score = evaluate(y_pred, y, metric)

tests/core/test_metrics.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,3 +528,69 @@ def test_crps_with_1d_targets():
528528
assert result.ndim == 0, "Result should be a scalar tensor"
529529
assert isinstance(result, torch.Tensor)
530530
assert result >= 0, "CRPS should be non-negative"
531+
532+
533+
# Tests for OutputLike support in TorchMetrics
534+
535+
536+
def test_torchmetrics_with_distribution_vs_mean():
537+
"""Test TorchMetrics with distribution gives same result as using mean."""
538+
batch_size, n_targets = 10, 3
539+
y_true = torch.randn(batch_size, n_targets)
540+
541+
# Create a Normal distribution
542+
mean = torch.randn(batch_size, n_targets)
543+
std = torch.ones(batch_size, n_targets) * 0.5
544+
y_pred_dist = Normal(mean, std)
545+
546+
# Get result with distribution
547+
result_dist = MSE(y_pred_dist, y_true)
548+
549+
# Get result with mean tensor
550+
result_mean = MSE(mean, y_true)
551+
552+
assert torch.isclose(result_dist, result_mean, rtol=1e-4), "Should be close"
553+
554+
555+
@pytest.mark.parametrize(
556+
"metric_instance",
557+
[
558+
metric
559+
for metric in AVAILABLE_METRICS.values()
560+
if isinstance(metric, TorchMetrics)
561+
],
562+
)
563+
def test_all_torchmetrics_support_distributions(metric_instance):
564+
"""Test all TorchMetrics instances support distribution inputs."""
565+
batch_size = 20
566+
y_true = torch.randn(batch_size, 2)
567+
568+
# Create a distribution
569+
mean = torch.randn(batch_size, 2)
570+
std = torch.ones(batch_size, 2) * 0.3
571+
y_pred_dist = Normal(mean, std)
572+
573+
# Should work without error
574+
result = metric_instance(y_pred_dist, y_true)
575+
576+
assert isinstance(result, torch.Tensor)
577+
assert result.ndim == 0
578+
assert torch.isfinite(result), "Result should be finite"
579+
580+
581+
def test_torchmetrics_distribution_multioutput():
582+
"""Test TorchMetrics with distribution for multioutput case."""
583+
batch_size, n_outputs = 50, 5
584+
y_true = torch.randn(batch_size, n_outputs)
585+
586+
# Create distribution with different means for different outputs
587+
mean = torch.randn(batch_size, n_outputs)
588+
std = torch.rand(batch_size, n_outputs) * 0.5 + 0.1 # Avoid zero std
589+
y_pred_dist = Normal(mean, std)
590+
591+
# Test with MAE
592+
result = MAE(y_pred_dist, y_true)
593+
594+
assert isinstance(result, torch.Tensor)
595+
assert result.ndim == 0
596+
assert result >= 0, "MAE should be non-negative"

0 commit comments

Comments
 (0)