Commit 47be238

Merge pull request #397 from alan-turing-institute/371-compare
Add compare for experimental
2 parents: 20f42e1 + c105b1d

12 files changed: +193 -18 lines

autoemulate/experimental/compare.py

+95
@@ -0,0 +1,95 @@
import logging
import warnings
from typing import Any

import numpy as np
from sklearn.model_selection import BaseCrossValidator, KFold

from autoemulate.experimental.data.utils import InputTypeMixin
from autoemulate.experimental.emulators import ALL_EMULATORS
from autoemulate.experimental.emulators.base import Emulator
from autoemulate.experimental.model_selection import cross_validate
from autoemulate.experimental.tuner import Tuner
from autoemulate.experimental.types import InputLike


class AutoEmulate(InputTypeMixin):
    def __init__(
        self,
        x: InputLike,
        y: InputLike,
        models: list[type[Emulator]] | None = None,
    ):
        # TODO: refactor in https://github.com/alan-turing-institute/autoemulate/issues/400
        x, y = self._convert_to_tensors(x, y)

        # Set default models if None
        updated_models = self.get_models(models)

        # Filter models to only be those that can handle multioutput data
        if y.shape[1] > 1:
            updated_models = self.filter_models_if_multioutput(
                updated_models, models is not None
            )

        self.models = updated_models
        self.train_val, self.test = self._random_split(self._convert_to_dataset(x, y))

    @staticmethod
    def all_emulators() -> list[type[Emulator]]:
        return ALL_EMULATORS

    def get_models(self, models: list[type[Emulator]] | None) -> list[type[Emulator]]:
        if models is None:
            return self.all_emulators()
        return models

    def filter_models_if_multioutput(
        self, models: list[type[Emulator]], warn: bool
    ) -> list[type[Emulator]]:
        updated_models = []
        for model in models:
            if not model.is_multioutput():
                if warn:
                    msg = (
                        f"Model ({model}) is not multioutput but the data is "
                        f"multioutput. Skipping model ({model})..."
                    )
                    warnings.warn(msg, stacklevel=2)
            else:
                updated_models.append(model)
        return updated_models

    def log_compare(self, model_cls, best_model_config, r2_score, rmse_score):
        logger = logging.getLogger(__name__)
        msg = (
            f"Model: {model_cls.__name__}, "
            f"Best params: {best_model_config}, "
            f"R2 score: {r2_score:.3f}, "
            f"RMSE score: {rmse_score:.3f}"
        )
        logger.info(msg)

    def compare(
        self, n_iter: int = 10, cv: type[BaseCrossValidator] = KFold
    ) -> dict[str, dict[str, Any]]:
        tuner = Tuner(self.train_val, y=None, n_iter=n_iter)
        models_evaluated = {}
        for model_cls in self.models:
            scores, configs = tuner.run(model_cls)
            best_score_idx = scores.index(max(scores))
            best_model_config = configs[best_score_idx]
            cv_results = cross_validate(
                cv(), self.train_val.dataset, model_cls, **best_model_config
            )
            r2_score, rmse_score = (
                np.mean(cv_results["r2"]),
                np.mean(cv_results["rmse"]),
            )
            models_evaluated[model_cls.__name__] = {
                "config": best_model_config,
                "r2_score": r2_score,
                "rmse_score": rmse_score,
            }
            self.log_compare(model_cls, best_model_config, r2_score, rmse_score)
        return models_evaluated
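
A minimal usage sketch of the new class (the array shapes, values, and n_iter below are illustrative, not taken from the diff):

import numpy as np

from autoemulate.experimental.compare import AutoEmulate

x = np.random.rand(50, 2)  # 50 samples, 2 input dimensions
y = np.random.rand(50, 3)  # 3 outputs, so single-output emulators are filtered out
ae = AutoEmulate(x, y)  # models=None defaults to ALL_EMULATORS
results = ae.compare(n_iter=5)  # tune each emulator, then cross-validate its best config
# results maps each emulator class name to {"config", "r2_score", "rmse_score"}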

autoemulate/experimental/data/utils.py

+20 -1
@@ -1,7 +1,9 @@
 import numpy as np
 import torch
+import torch.utils
+import torch.utils.data
 from autoemulate.experimental.types import InputLike
-from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
+from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset, random_split


 class InputTypeMixin:
@@ -31,6 +33,8 @@ def _convert_to_dataset(
             dataset = TensorDataset(x)
         elif isinstance(x, Dataset) and y is None:
             dataset = x
+        elif isinstance(x, DataLoader) and y is None:
+            dataset = x.dataset
         else:
             raise ValueError(
                 f"Unsupported type for x ({type(x)}). Must be numpy array or PyTorch "
@@ -69,6 +73,21 @@ def _convert_to_tensors(
         Convert InputLike x, y to Tensor or tuple of Tensors.
         """
         dataset = self._convert_to_dataset(x, y)
+
+        # Handle Subset of TensorDataset
+        if isinstance(dataset, Subset):
+            if isinstance(dataset.dataset, TensorDataset):
+                tensors = dataset.dataset.tensors
+                indices = dataset.indices
+
+                # Use indexing to get subset tensors
+                subset_tensors = tuple(tensor[indices] for tensor in tensors)
+                dataset = TensorDataset(*subset_tensors)
+            else:
+                raise ValueError(
+                    f"Subset must wrap a TensorDataset. Found {type(dataset.dataset)}."
+                )
+
         if isinstance(dataset, TensorDataset):
             if len(dataset.tensors) > 2:
                 raise ValueError(
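
For context, a small self-contained sketch of what the new Subset branch does (toy tensors; the variable names here are illustrative):

import torch
from torch.utils.data import TensorDataset, random_split

x = torch.arange(10, dtype=torch.float32).reshape(-1, 1)
y = 2 * x
train, test = random_split(TensorDataset(x, y), [8, 2])  # each split is a Subset

# Materialise the Subset back into a TensorDataset by indexing the wrapped
# tensors with the subset's indices, as _convert_to_tensors now does.
train_dataset = TensorDataset(*(t[train.indices] for t in train.dataset.tensors))
assert len(train_dataset) == 8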

autoemulate/experimental/emulators/__init__.py

+5
@@ -0,0 +1,5 @@
from .gaussian_process.exact import GaussianProcessExact
from .lightgbm import LightGBM
from .neural_processes.conditional_neural_process import CNPModule

ALL_EMULATORS = [GaussianProcessExact, LightGBM, CNPModule]

autoemulate/experimental/emulators/base.py

+6 -5
@@ -5,11 +5,7 @@

 from autoemulate.experimental.data.preprocessors import Preprocessor
 from autoemulate.experimental.data.utils import InputTypeMixin
-from autoemulate.experimental.types import (
-    InputLike,
-    OutputLike,
-    TuneConfig,
-)
+from autoemulate.experimental.types import InputLike, OutputLike, TuneConfig


 class Emulator(ABC):
@@ -35,6 +31,11 @@ def fit(self, x: InputLike, y: InputLike | None): ...
     def predict(self, x: InputLike) -> OutputLike:
         pass

+    @staticmethod
+    @abstractmethod
+    def is_multioutput() -> bool:
+        """Flag to indicate if the model is multioutput or not."""
+
     @staticmethod
     @abstractmethod
     def get_tune_config() -> TuneConfig:

autoemulate/experimental/emulators/gaussian_process/exact.py

+4
@@ -123,6 +123,10 @@ def __init__( # noqa: PLR0913 allow too many arguments since all currently requ
         self.batch_size = batch_size
         self.activation = activation

+    @staticmethod
+    def is_multioutput():
+        return True
+
     def preprocess(self, x: InputLike) -> InputLike:
         """Preprocess the input data using the preprocessor."""
         if self.preprocessor is not None:

autoemulate/experimental/emulators/lightgbm.py

+4
@@ -64,6 +64,10 @@ def __init__( # noqa: PLR0913 allow too many arguments since all currently requ
         self.importance_type = importance_type
         self.verbose = verbose

+    @staticmethod
+    def is_multioutput() -> bool:
+        return False
+
     def fit(self, x: InputLike, y: InputLike | None):
         """
         Fits the emulator to the data.

autoemulate/experimental/emulators/neural_processes/conditional_neural_process.py

+2 -1
@@ -478,6 +478,7 @@ def get_tune_config():
             "hidden_layers_dec": [1, 2, 4],
             "activation": [nn.ReLU],
             "min_context_points": [4, 5, 6],
-            "offset_context_points": [4, 6],
+            "offset_context_points": [4, 5],
+            # max_context_points must be less than n_episodes
             "n_episodes": [12, 13, 14],
         }

autoemulate/experimental/model_selection.py

+11 -4
@@ -1,3 +1,5 @@
+from typing import Any
+
 import numpy as np
 import torchmetrics
 from sklearn.model_selection import BaseCrossValidator
@@ -7,6 +9,7 @@
 from autoemulate.experimental.types import (
     DistributionLike,
     InputLike,
+    ModelConfig,
     OutputLike,
     TensorLike,
 )
@@ -60,8 +63,8 @@ def evaluate(
 def cross_validate(
     cv: BaseCrossValidator,
     dataset: Dataset,
-    model: Emulator,
-    batch_size: int = 16,
+    model: type[Emulator],
+    **kwargs: Any,
 ):
     """
     Cross validate model performance using the given `cv` strategy.
@@ -81,7 +84,9 @@ def cross_validate(
     dict[str, list[float]]
         Contains r2 and rmse scores computed for each cross validation fold.
     """
+    best_model_config: ModelConfig = kwargs
     cv_results = {"r2": [], "rmse": []}
+    batch_size = best_model_config.get("batch_size", 16)
     for train_idx, val_idx in cv.split(dataset):  # type: ignore TODO: identify type handling here
         # create train/val data subsets
         # convert idx to list to satisfy type checker
@@ -91,13 +96,15 @@ def cross_validate(
         val_loader = DataLoader(val_subset, batch_size=batch_size)

         # fit model
-        model.fit(train_loader, y=None)
+        x, y = next(iter(train_loader))
+        m = model(x, y, **best_model_config)
+        m.fit(x, y)

         # evaluate on batches
         r2_metric = torchmetrics.R2Score()
         mse_metric = torchmetrics.MeanSquaredError()
         for x_batch, y_batch in val_loader:
-            y_batch_pred = model.predict(x_batch)
+            y_batch_pred = m.predict(x_batch)
             _update(y_batch, y_batch_pred, r2_metric)
             _update(y_batch, y_batch_pred, mse_metric)
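
A brief sketch of the updated call pattern, mirroring how compare() uses it (the dataset and the batch_size value are illustrative; GaussianProcessExact is one of the emulators in ALL_EMULATORS and accepts batch_size in its constructor):

import torch
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset

from autoemulate.experimental.emulators import GaussianProcessExact
from autoemulate.experimental.model_selection import cross_validate

x = torch.rand(32, 2)
y = torch.rand(32, 1)
dataset = TensorDataset(x, y)

# The emulator is now passed as a class; tuned hyperparameters arrive as **kwargs,
# and "batch_size" (default 16) is read from them for the fold data loaders.
results = cross_validate(KFold(n_splits=2), dataset, GaussianProcessExact, batch_size=8)
# results["r2"] and results["rmse"] each hold one score per fold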

tests/experimental/test_experimental_base.py

+4
@@ -119,6 +119,10 @@ def get_tune_config():
             "batch_size": [16],
         }

+    @staticmethod
+    def is_multioutput():
+        return False
+
     def setup_method(self):
         """
         Define the PyTorchBackend instance.

tests/experimental/test_experimental_compare.py

+31
@@ -0,0 +1,31 @@
from autoemulate.experimental.compare import AutoEmulate
from autoemulate.experimental.emulators import ALL_EMULATORS


def test_compare(sample_data_y2d):
    x, y = sample_data_y2d
    ae = AutoEmulate(x, y)
    results = ae.compare(10)
    print(results)


def test_compare_user_models(sample_data_y2d, recwarn):
    x, y = sample_data_y2d
    ae = AutoEmulate(x, y, models=ALL_EMULATORS)
    results = ae.compare(1)
    print(results)
    assert len(recwarn) == 1
    assert str(recwarn.pop().message) == (
        "Model (<class 'autoemulate.experimental.emulators.lightgbm.Li"
        "ghtGBM'>) is not multioutput but the data is multioutput. Skipping model "
        "(<class 'autoemulate.experimental.emulators.lightgbm.LightGBM'>)..."
    )


def test_compare_y1d(sample_data_y1d):
    x, y = sample_data_y1d
    # TODO: add handling when 1D
    y = y.reshape(-1, 1)
    ae = AutoEmulate(x, y)
    results = ae.compare(10)
    print(results)

tests/experimental/test_experimental_conditional_neural_process.py

+3 -3
@@ -66,7 +66,7 @@ def test_cnp_module_predict_fails_with_calling_fit_first(sample_data_y1d):

 def test_tune_gp(sample_data_y1d):
     x, y = sample_data_y1d
-    tuner = Tuner(x, y, n_iter=5)
+    tuner = Tuner(x, y, n_iter=20)
     scores, configs = tuner.run(CNPModule)
-    assert len(scores) == 5
-    assert len(configs) == 5
+    assert len(scores) == 20
+    assert len(configs) == 20

tests/experimental/test_experimental_model_selection.py

+8 -4
@@ -11,7 +11,7 @@ def test_cross_validate():
     Test cross_validate can be called with any sklearn.model_selection class.
     """

-    class DummyEmulator(Emulator):
+    class DummyEmulator(Emulator, torch.nn.Module):
         def __init__(self, x=None, y=None, **kwargs):
             pass

@@ -25,22 +25,26 @@ def predict(self, x):
         def get_tune_config():
             return {}

+        @staticmethod
+        def is_multioutput():
+            return False
+
     x = torch.tensor(np.arange(32)).float()
     y = 2 * x
     dataset = TensorDataset(x, y)

-    emulator = DummyEmulator()
+    emulator_cls = DummyEmulator

     # KFold
-    results = cross_validate(KFold(n_splits=2), dataset, emulator)
+    results = cross_validate(KFold(n_splits=2), dataset, emulator_cls)
     assert "r2" in results
     assert "rmse" in results
     assert len(results["r2"]) == 2
     assert len(results["rmse"]) == 2

     # LeavePOut: LOO raised an error with torchmetrics R2Score since it requires at
     # least 2 samples
-    results = cross_validate(LeavePOut(p=2), dataset, emulator)
+    results = cross_validate(LeavePOut(p=2), dataset, emulator_cls)
     expected_n = (x.shape[0] * (x.shape[0] - 1)) / 2
     assert len(results["r2"]) == expected_n
     assert len(results["rmse"]) == expected_n
