alan-turing-institute · ContiPaolo · Apr 23, 2025 · Mar 21, 2025 · Mar 21, 2025 · Mar 21, 2025
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@ docs/_build/
 docs/generated/
 .sphinx-build-environment
 
+
 # Auto-generated documentation
 _autosummary/
 _autodoc/
@@ -32,6 +33,8 @@ Thumbs.db
 # Quarto
 README.html
 README_files/
+requirements.txt
+
 
 # Ignore pyrightconfig.json to enable custom venv to be set
 pyrightconfig.json
diff --git a/autoemulate/compare.py b/autoemulate/compare.py
diff --git a/autoemulate/cross_validate.py b/autoemulate/cross_validate.py
@@ -6,6 +6,7 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import cross_validate
 
+from autoemulate.utils import _ensure_1d_if_column_vec
 from autoemulate.utils import get_model_name
 from autoemulate.utils import get_model_params
 
@@ -46,6 +47,8 @@ def _run_cv(X, y, cv, model, metrics, n_jobs=None, logger=None):
     logger.info(f"Cross-validating {get_model_name(model)}...")
     logger.info(f"Parameters: {get_model_params(model)}")
 
+    y = _ensure_1d_if_column_vec(y)
+
     cv_results = None
     try:
         cv_results = cross_validate(

diff --git a/autoemulate/hyperparam_searching.py b/autoemulate/hyperparam_searching.py
@@ -4,6 +4,8 @@
 from sklearn.model_selection import RandomizedSearchCV
 
 from autoemulate.utils import _adjust_param_space
+from autoemulate.utils import _ensure_1d_if_column_vec
+from autoemulate.utils import _ensure_2d
 from autoemulate.utils import get_model_name
 from autoemulate.utils import get_model_param_space
 from autoemulate.utils import get_model_params
@@ -57,14 +59,28 @@ def _optimize_params(
     -------
     Refitted estimator on the whole dataset with best parameters.
     """
-    logger.info(f"Performing grid search for {get_model_name(model)}...")
-    param_space = _process_param_space(model, search_type, param_space)
+
+    if hasattr(model, "transformer"):
+        # If 'model' has a 'transformer' attribute, that transformer must be fitted before performing the search.
+
+        # Note: a model with a transformer is an InputOutputPipeline, where 'regressor' is the
+        # input Pipeline (containing the model) and 'transformer' is the output Pipeline.
+
+        # Fit the transformer to the output data and transform the output data
+        y = _ensure_2d(y)  # data expected to be 2D for transformer
+        y = model.transformer.fit_transform(y)
+        regressor = model.regressor
+    else:
+        regressor = model
+
+    logger.info(f"Performing grid search for {get_model_name(regressor)}...")
+    param_space = _process_param_space(regressor, search_type, param_space)
     search_type = search_type.lower()
 
     # random search
     if search_type == "random":
         searcher = RandomizedSearchCV(
-            model,
+            regressor,
             param_space,
             n_iter=niter,
             cv=cv,
@@ -78,7 +94,8 @@ def _optimize_params(
 
     # run hyperparameter search
     try:
-        searcher.fit(X, y)
+        searcher.fit(X, _ensure_1d_if_column_vec(y))
+
     except Exception:
         logger.exception(
             f"Failed to perform hyperparameter search on {get_model_name(model)}"

diff --git a/autoemulate/model_processing.py b/autoemulate/model_processing.py
@@ -1,96 +1,101 @@
 """Functions for getting and processing models."""
+from sklearn.decomposition import PCA
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.pipeline import Pipeline
 
-
-def _turn_models_into_multioutput(models, y):
-    """Turn single output models into multioutput models if y is 2D.
-
-    Parameters
-    ----------
-    models : dict
-        Dict of model instances.
-    y : array-like, shape (n_samples, n_outputs)
-        Simulation output.
-
-    Returns
-    -------
-    models_multi : dict
-        Dict with model instances, where single output models are now wrapped in MultiOutputRegressor.
-    """
-
-    models_multi = [
-        MultiOutputRegressor(model)
-        if not model._more_tags()["multioutput"] and (y.ndim > 1 and y.shape[1] > 1)
-        else model
-        for model in models
-    ]
-    return models_multi
-
-
-def _wrap_models_in_pipeline(models, scale, scaler, reduce_dim, dim_reducer):
-    """Wrap models in a pipeline if scale is True.
-
-    Parameters
-    ----------
-    models : dict
-        dict of model instances.
-    scale : bool
-        Whether to scale the data.
-    scaler : sklearn.preprocessing object
-        Scaler to use.
-    reduce_dim : bool
-        Whether to reduce the dimensionality of the data.
-    dim_reducer : sklearn.decomposition object
-        Dimensionality reduction method to use.
-
-    Returns
-    -------
-    models_scaled : dict
-        dict of model_names: model instances, with scaled models wrapped in a pipeline.
-    """
-
-    models_piped = []
-
-    for model in models:
-        steps = []
-        if scale:
-            steps.append(("scaler", scaler))
-        if reduce_dim:
-            steps.append(("dim_reducer", dim_reducer))
-        steps.append(("model", model))
-        # without scaling or dim reduction, the model is the only step
-        models_piped.append(Pipeline(steps))
-
-    return models_piped
-
-
-def _process_models(
-    model_registry, model_names, y, scale, scaler, reduce_dim, dim_reducer
-):
-    """Get and process models.
-
-    Parameters
-    ----------
-    model_registry : ModelRegistry
-        An instance of the ModelRegistry class.
-    model_names : list
-        List of model names.
-    y : array-like, shape (n_samples, n_outputs)
-        Simulation output.
-    scale : bool
-        Whether to scale the data.
-    scaler : sklearn.preprocessing object
-        Scaler to use.
-
-    Returns
-    -------
-    models : list
-        List of model instances.
-    """
-    models = model_registry.get_models(model_names)
-    models_multi = _turn_models_into_multioutput(models, y)
-    models_scaled = _wrap_models_in_pipeline(
-        models_multi, scale, scaler, reduce_dim, dim_reducer
-    )
-    return models_scaled
+from autoemulate.preprocess_target import get_dim_reducer
+from autoemulate.preprocess_target import InputOutputPipeline
+from autoemulate.preprocess_target import NoChangeTransformer
+from autoemulate.preprocess_target import TargetPCA
+from autoemulate.preprocess_target import TargetVAE
+
+
+class AutoEmulatePipeline:  # builds one preprocessing + model pipeline per requested model
+    def __init__(
+        self,
+        model_registry,
+        model_names,
+        y,
+        prep_config,
+        scale_input=False,
+        scaler_input=None,
+        reduce_dim_input=False,
+        dim_reducer_input=None,
+        scale_output=False,
+        scaler_output=None,
+        reduce_dim_output=False,
+    ):
+        self.model_piped = None  # NOTE(review): never read here; the list built below is 'models_piped' - possible typo, confirm
+        prep_name = prep_config["name"]  # "name" is looked up unconditionally; "params" below is optional
+        prep_params = prep_config.get("params", {})
+        self.dim_reducer_output = get_dim_reducer(prep_name, **prep_params)  # created even when reduce_dim_output is False
+
+        self.models = model_registry.get_models(model_names)
+
+        self._turn_models_into_multioutput(y)  # sets self.models_multi
+
+        # Store pipeline settings as instance attributes
+        self.scale_input = scale_input
+        self.scaler_input = scaler_input
+        self.reduce_dim_input = reduce_dim_input
+        self.dim_reducer_input = dim_reducer_input
+        self.scale_output = scale_output
+        self.scaler_output = scaler_output
+        self.reduce_dim_output = reduce_dim_output
+
+        # Build self.models_piped from self.models_multi and the settings stored above
+        self._wrap_model_reducer_in_pipeline()
+
+    def _wrap_model_reducer_in_pipeline(self):
+        """Build one pipeline per model, wrapped in InputOutputPipeline when output steps exist."""
+        self.models_piped = []
+
+        for model in self.models_multi:
+            input_steps = []
+            if self.scale_input:
+                input_steps.append(("scaler", self.scaler_input))
+            if self.reduce_dim_input:
+                input_steps.append(("dim_reducer", self.dim_reducer_input))
+            input_steps.append(("model", model))  # the model is always the last input step
+            input_pipeline = Pipeline(input_steps)
+
+            # Create output transformation pipeline
+            output_steps = []
+            if self.scale_output:
+                output_steps.append(("scaler_output", self.scaler_output))
+            if self.reduce_dim_output:
+                output_steps.append(("dim_reducer_output", self.dim_reducer_output))
+
+            if output_steps:  # non-empty only if scale_output or reduce_dim_output is truthy
+                output_pipeline = Pipeline(output_steps)
+                final_model = InputOutputPipeline(
+                    regressor=input_pipeline, transformer=output_pipeline
+                )
+                self.models_piped.append(final_model)
+            else:  # no output preprocessing: the plain input Pipeline is used as-is
+                self.models_piped.append(input_pipeline)
+        return self.models_piped
+
+    def _turn_models_into_multioutput(self, y):
+        """Wrap single-output models in MultiOutputRegressor if y has more than one column.
+
+        Reads ``self.models`` and stores the result in ``self.models_multi``.
+
+        Parameters
+        ----------
+        y : array-like, shape (n_samples, n_outputs)
+            Simulation output.
+
+        Returns
+        -------
+        models_multi : list
+            Model instances, where single output models are now wrapped in MultiOutputRegressor.
+        """
+        self.models_multi = [
+            MultiOutputRegressor(model)
+            if not model._more_tags().get("multioutput", False)
+            and (y.ndim > 1 and y.shape[1] > 1)
+            else model
+            for model in self.models
+        ]
+        return self.models_multi
diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py
@@ -6,6 +6,7 @@
 from sklearn.metrics import r2_score
 from sklearn.pipeline import Pipeline
 
+from autoemulate.preprocess_target import InputOutputPipeline
 from autoemulate.utils import _ensure_2d
 
 
@@ -44,21 +45,19 @@ def _check_multioutput(y, output_index):
 
 def _predict_with_optional_std(model, X_test):
     """Predicts the output of the model with or without uncertainty."""
-    # see whether the model is a pipeline or not
-    if isinstance(model, Pipeline):
-        predict_params = inspect.signature(
-            model.named_steps["model"].predict
-        ).parameters
+    # Locate the underlying estimator so its predict() signature can be inspected
+    if isinstance(model, InputOutputPipeline):
+        base_model = model.regressor_.named_steps["model"]  # NOTE(review): 'regressor_' presumably exists only after fit - confirm
+    elif isinstance(model, Pipeline):
+        base_model = model.named_steps["model"]
     else:
-        predict_params = inspect.signature(model.predict).parameters
-    # see whether the model has return_std in its predict parameters
-    if "return_std" in predict_params:
-        y_test_pred, y_test_std = model.predict(X_test, return_std=True)
-    else:
-        y_test_pred = model.predict(X_test)
-        y_test_std = None
+        base_model = model
 
-    return y_test_pred, y_test_std
+    predict_params = inspect.signature(base_model.predict).parameters
+    # Only request a std when the underlying estimator's predict() accepts return_std
+    if "return_std" in predict_params:
+        return model.predict(X_test, return_std=True)  # NOTE(review): assumes the outer wrapper's predict() forwards return_std - confirm
+    return model.predict(X_test), None
 
 
 def _calculate_subplot_layout(n_plots, n_cols=3):
@@ -412,6 +411,7 @@ def _plot_model(
     output_index : int or list of int, optional
         The index(es) of the output variable(s) to plot. If None, all outputs are used.
     """
+
     # Get predictions, with uncertainty if available
     y_pred, y_std = _predict_with_optional_std(model, X)