Skip to content

Commit b326e5b

Browse files
IFentonCopilot
and authored
Adding plotting metrics to WandB (#216)
* 🚧 Adding plots of MAE by day * 🐛 Fixing error with plotting callback dates * 🚧 Adding a new way of calculating SIEError * 🚨 linting * 🚧 Calculating the SIEError metric by day * 🚧 Adding line plot to wandb * 🚧 automating table / line plot creation * 🚧 Log sieerror at each test step * 🚧 Calculating MAE / RMSE daily * 🎨 Remove duplicate code * 🐛 Removing code that was plotting the wrong value for the last epoch value * 🚧 Calculating the mean value across all days and plotting * 🚧 Tidying up the metrics to calculate * 🚨 linting * 🚨 linting * ✅ Adding tests for metrics * 🚨 linting * ✅ Checking logic of tests * 🚚 Rename SIE_error_new to SIE_error_abs * 🚨 linting * Correct typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Correcting doc string Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * 🥅 Make sure Table / line plot are only done with a WandB logger * ⚡ Only run the metric compute step at the end of each epoch * ✅ Adding a unit test for wandb.Table and wandb.plot.line * 🎨 better way of calculating RMSEDaily * 🎨 Remove unnecessary logging * 🎨 Remove batch size dependence for SIEError * ♻️ Create base metric class for MAE and RMSE * 🚨 linting * ♻️ Refactoring SIEError code to make it more robust * ♻️ Removing average_loss as a metric * 🚚 Move metrics / losses out of models folder * 🎨 Update metric names from Daily to PerForecastDay * 📝 Adding description of how SIEError is calculated * ♻️ Use get_wand_run * 🎨 Refactor use of get_wandb_run * ♻️ Improve check of test_metrics type --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent f72f4cc commit b326e5b

15 files changed

Lines changed: 515 additions & 55 deletions
Lines changed: 30 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,44 @@
11
import logging
2-
import statistics
3-
from collections.abc import Mapping
4-
from typing import Any
52

3+
import wandb
64
from lightning import LightningModule, Trainer
75
from lightning.pytorch import Callback
8-
from torch import Tensor
6+
from torchmetrics import MetricCollection
97

10-
from icenet_mp.types import ModelTestOutput
8+
from icenet_mp.utils import get_wandb_run
119

1210
logger = logging.getLogger(__name__)
1311

1412

1513
class MetricSummaryCallback(Callback):
    """A callback to summarise metrics during evaluation."""

    def on_test_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
        """Summarise and log the module's test metrics at the end of testing.

        Args:
            trainer: The trainer running the test loop; its loggers receive
                the per-metric mean values.
            pl_module: The module under test, expected to expose a
                ``test_metrics`` MetricCollection attribute.

        """
        # Use getattr so a module without `test_metrics` degrades to the
        # warning below instead of raising AttributeError.
        test_metrics = getattr(pl_module, "test_metrics", None)
        if not isinstance(test_metrics, MetricCollection):
            logger.warning("Could not load test metrics!")
            return

        for name, metric in test_metrics.items():
            # Compute the metric value (e.g., SIEError) across all batches and log it
            values = metric.compute()

            for logger_ in trainer.loggers:
                # Log the mean value of the metric across all days
                logger_.log_metrics({f"{name} (mean)": values.mean().item()})

            # check if WandB is being used as a logger and metrics are calculated for multiple days
            # if so, log the metric values as a table and plot
            if (
                isinstance(run := get_wandb_run(trainer), wandb.Run)
                and values.numel() > 1
            ):
                # Table rows are (day, value) pairs, days numbered from 1.
                table = wandb.Table(
                    data=list(enumerate(values.tolist(), start=1)),
                    columns=["day", name],
                )
                plot_name = name + " per day"
                run.log(
                    {plot_name: wandb.plot.line(table, "day", name, title=plot_name)}
                )

icenet_mp/callbacks/plotting_callback.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,12 @@ def on_test_batch_end(
8686

8787
# Get sequence dates for static and video plots
8888
batch_size = int(outputs.target.shape[0])
89-
n_timesteps = int(outputs.target.shape[1])
90-
dates = [
91-
datetime_from_npdatetime(dataset.dates[batch_size * batch_idx + tt])
92-
for tt in range(n_timesteps)
93-
]
9489

90+
start_date = dataset.dates[batch_size * batch_idx]
91+
92+
dates = list(
93+
map(datetime_from_npdatetime, dataset.get_forecast_steps(start_date))
94+
)
9595
# Set hemisphere for plotting based on dataset
9696
self.plotter.set_hemisphere(dataset.hemisphere)
9797

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
metric_summary:
22
_target_: icenet_mp.callbacks.MetricSummaryCallback
3-
average_loss: true
File renamed without changes.
File renamed without changes.
File renamed without changes.

icenet_mp/metrics/base_metrics.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""Calculating RMSE, MAE by forecast step."""
2+
3+
import torch
4+
from torchmetrics import Metric
5+
6+
7+
class BaseErrorMetricDaily(Metric):
    """Base class for per-timestep error metrics using sufficient statistics.

    Accumulates the running sum of element-wise errors and the element count
    for each forecast time step, so the per-day mean can be computed exactly
    across batches (and reduced across processes via ``dist_reduce_fx="sum"``).
    """

    def __init__(self) -> None:
        """Initialize the metric with empty per-timestep accumulators."""
        super().__init__()
        # Declared for type checkers; the actual buffers are registered below.
        self.sum_errors: torch.Tensor
        self.count: torch.Tensor
        self.add_state(
            "sum_errors",
            default=torch.tensor([], dtype=torch.float32),
            dist_reduce_fx="sum",
        )
        self.add_state(
            "count",
            default=torch.tensor([], dtype=torch.long),
            dist_reduce_fx="sum",
        )

    def _compute_errors(
        self, preds: torch.Tensor, targets: torch.Tensor
    ) -> torch.Tensor:
        """Compute element-wise errors. Override in subclasses."""
        raise NotImplementedError

    def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
        """Update metrics with a batch of predictions and targets.

        Args:
            preds: Tensor of shape (batch, time, channels, height, width)
            targets: Tensor of shape (batch, time, channels, height, width)

        Raises:
            ValueError: If the time dimension differs from previous updates.

        """
        # Compute errors: (batch, time, channels, height, width)
        errors = self._compute_errors(preds, targets)

        batch_size = errors.shape[0]
        # Product of all trailing (spatial) dims; generalizes beyond 5-D input.
        num_spatial = errors.shape[2:].numel()

        # Collapse to (batch, time, spatial), then sum over batch + spatial.
        # `reshape` (unlike `view`) also handles non-contiguous error tensors.
        # Cast to float32 so the accumulator never adopts a lower-precision
        # input dtype (e.g. float16) on the first update.
        errors_reshaped = errors.reshape(batch_size, errors.shape[1], num_spatial)
        batch_sum_errors = errors_reshaped.sum(dim=(0, 2)).to(torch.float32)

        # Count samples per time step
        batch_count = torch.full(
            (errors.shape[1],),
            batch_size * num_spatial,
            dtype=torch.long,
            device=errors.device,
        )

        # Initialize buffers on first update
        if self.sum_errors.numel() == 0:
            self.sum_errors = batch_sum_errors
            self.count = batch_count
        else:
            if self.sum_errors.shape[0] != batch_sum_errors.shape[0]:
                msg = f"Time dimension mismatch: expected {self.sum_errors.shape[0]}, got {batch_sum_errors.shape[0]}"
                raise ValueError(msg)
            self.sum_errors += batch_sum_errors
            self.count += batch_count

    def _finalize(self, mean_errors: torch.Tensor) -> torch.Tensor:
        """Apply final transformation to mean errors. Override in subclasses."""
        return mean_errors

    def compute(self) -> torch.Tensor:
        """Compute metric per lead time from accumulated sufficient statistics.

        Returns:
            Tensor of shape (T,) with metric value for each time step

        """
        if self.count.numel() == 0:
            # No updates yet: return an empty tensor rather than dividing by zero.
            return torch.tensor([], dtype=torch.float32, device=self.sum_errors.device)

        # Clamp guards the division; counts are always >= 1 after a valid update.
        count = torch.clamp(self.count, min=1)
        mean_errors = self.sum_errors / count.float()
        return self._finalize(mean_errors)
86+
87+
88+
class RMSEPerForecastDay(BaseErrorMetricDaily):
    """Root Mean Squared Error per forecast lead time."""

    def _compute_errors(
        self, preds: torch.Tensor, targets: torch.Tensor
    ) -> torch.Tensor:
        # Squared error per element; the base class averages these per day.
        return (preds - targets).pow(2)

    def _finalize(self, mean_errors: torch.Tensor) -> torch.Tensor:
        # RMSE is the square root of the mean squared error.
        return mean_errors.sqrt()
98+
99+
100+
class MAEPerForecastDay(BaseErrorMetricDaily):
    """Mean Absolute Error per forecast lead time."""

    def _compute_errors(
        self, preds: torch.Tensor, targets: torch.Tensor
    ) -> torch.Tensor:
        # Absolute error per element; the base class averages these per day.
        return (preds - targets).abs()
File renamed without changes.

0 commit comments

Comments
 (0)