import logging
import warnings
from typing import Any

import numpy as np
from sklearn.model_selection import BaseCrossValidator, KFold

from autoemulate.experimental.data.utils import InputTypeMixin
from autoemulate.experimental.emulators import ALL_EMULATORS
from autoemulate.experimental.emulators.base import Emulator
from autoemulate.experimental.model_selection import cross_validate
from autoemulate.experimental.tuner import Tuner
from autoemulate.experimental.types import InputLike


class AutoEmulate(InputTypeMixin):
    """Compare emulator models via hyperparameter tuning and cross-validation."""

    def __init__(
        self,
        x: InputLike,
        y: InputLike,
        models: list[type[Emulator]] | None = None,
    ):
        # TODO: refactor in https://github.com/alan-turing-institute/autoemulate/issues/400
        x, y = self._convert_to_tensors(x, y)

        # Set default models if None
        updated_models = self.get_models(models)

        # Filter models to only those that can handle multioutput data
        if y.shape[1] > 1:
            updated_models = self.filter_models_if_multioutput(
                updated_models, models is not None
            )

        self.models = updated_models
        self.train_val, self.test = self._random_split(self._convert_to_dataset(x, y))

    @staticmethod
    def all_emulators() -> list[type[Emulator]]:
        return ALL_EMULATORS

    def get_models(self, models: list[type[Emulator]] | None) -> list[type[Emulator]]:
        if models is None:
            return self.all_emulators()
        return models

    def filter_models_if_multioutput(
        self, models: list[type[Emulator]], warn: bool
    ) -> list[type[Emulator]]:
        """Return only the models that can handle multioutput data."""
        updated_models = []
        for model in models:
            if not model.is_multioutput():
                if warn:
                    msg = (
                        f"Model ({model}) is not multioutput but the data is "
                        f"multioutput. Skipping model ({model})..."
                    )
                    warnings.warn(msg, stacklevel=2)
            else:
                updated_models.append(model)
        return updated_models

    def log_compare(self, model_cls, best_model_config, r2_score, rmse_score):
        """Log the best configuration and cross-validation scores for a model."""
        logger = logging.getLogger(__name__)
        msg = (
            f"Model: {model_cls.__name__}, "
            f"Best params: {best_model_config}, "
            f"R2 score: {r2_score:.3f}, "
            f"RMSE score: {rmse_score:.3f}"
        )
        logger.info(msg)

    def compare(
        self, n_iter: int = 10, cv: type[BaseCrossValidator] = KFold
    ) -> dict[str, dict[str, Any]]:
        """Tune each model, cross-validate its best configuration, and return the scores."""
        tuner = Tuner(self.train_val, y=None, n_iter=n_iter)
        models_evaluated = {}
        for model_cls in self.models:
            scores, configs = tuner.run(model_cls)
            best_score_idx = scores.index(max(scores))
            best_model_config = configs[best_score_idx]
            cv_results = cross_validate(
                cv(), self.train_val.dataset, model_cls, **best_model_config
            )
            r2_score, rmse_score = (
                np.mean(cv_results["r2"]),
                np.mean(cv_results["rmse"]),
            )
            models_evaluated[model_cls.__name__] = {
                "config": best_model_config,
                "r2_score": r2_score,
                "rmse_score": rmse_score,
            }
            self.log_compare(model_cls, best_model_config, r2_score, rmse_score)
        return models_evaluated
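

# Example usage (an illustrative sketch: the toy data and settings below are
# hypothetical; only AutoEmulate and compare() come from the class above).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.uniform(size=(200, 2))  # 200 samples, 2 input dimensions
    y = np.column_stack([np.sin(x[:, 0]), np.cos(x[:, 1])])  # 2 outputs -> multioutput

    ae = AutoEmulate(x, y)  # defaults to all emulators that support multioutput data
    results = ae.compare(n_iter=5)
    # results maps each model name to its best config and mean cv r2/rmse scores
    print(results)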