
Commit 66e1a88

Merge pull request #390 from alan-turing-institute/Preprocessing
Preprocessing
2 parents 46ca7a6 + 9d014cd commit 66e1a88

29 files changed: +5791 -336 lines

.gitignore (+8 -1)

@@ -2,13 +2,15 @@ notebooks/
 __pycache__/
 .pytest_cache/
 dist/
+.venv/
 
 # Ignore Sphinx build artifacts
 docs/build/
 docs/_build/
 docs/generated/
 .sphinx-build-environment
 
+
 # Auto-generated documentation
 _autosummary/
 _autodoc/
@@ -35,6 +37,11 @@ Thumbs.db
 # Quarto
 README.html
 README_files/
+requirements.txt
+
 
 # Ignore pyrightconfig.json to enable custom venv to be set
-pyrightconfig.json
+pyrightconfig.json
+docs/data/shallow_water/X.npy
+docs/data/shallow_water/Y.npy
+docs/tutorials/05_reaction_diffusion copy.ipynb

autoemulate/compare.py (+571 -124)
Large diffs are not rendered by default.

autoemulate/cross_validate.py (+3)

@@ -6,6 +6,7 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import cross_validate
 
+from autoemulate.utils import _ensure_1d_if_column_vec
 from autoemulate.utils import get_model_name
 from autoemulate.utils import get_model_params
 
@@ -46,6 +47,8 @@ def _run_cv(X, y, cv, model, metrics, n_jobs=None, logger=None):
     logger.info(f"Cross-validating {get_model_name(model)}...")
     logger.info(f"Parameters: {get_model_params(model)}")
 
+    y = _ensure_1d_if_column_vec(y)
+
     cv_results = None
     try:
        cv_results = cross_validate(
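Note: _ensure_1d_if_column_vec is imported from autoemulate.utils, but its body is not part of this diff. A minimal sketch of what such a helper plausibly does (flattening a column-vector target so scikit-learn's cross_validate does not warn about a (n_samples, 1) y) is shown below; this is an illustration under that assumption, not the code added by the commit.

import numpy as np


def _ensure_1d_if_column_vec(y):
    # Illustrative sketch only: flatten (n_samples, 1) targets to (n_samples,),
    # leaving 1D targets and true multi-output (n_samples, n_outputs > 1) targets unchanged.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        return y.ravel()
    return y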

autoemulate/hyperparam_searching.py (+21 -4)

@@ -4,6 +4,8 @@
 from sklearn.model_selection import RandomizedSearchCV
 
 from autoemulate.utils import _adjust_param_space
+from autoemulate.utils import _ensure_1d_if_column_vec
+from autoemulate.utils import _ensure_2d
 from autoemulate.utils import get_model_name
 from autoemulate.utils import get_model_param_space
 from autoemulate.utils import get_model_params
@@ -57,14 +59,28 @@ def _optimize_params(
     -------
     Refitted estimator on the whole dataset with best parameters.
     """
-    logger.info(f"Performing grid search for {get_model_name(model)}...")
-    param_space = _process_param_space(model, search_type, param_space)
+
+    if hasattr(model, "transformer"):
+        # If 'model' is a pipeline with a transformer, fit the transformer before performing the search.
+
+        # Note that if the pipeline has a transformer, 'model' is an InputOutputPipeline,
+        # where 'regressor' is the input pipeline (containing the model) and 'transformer' is the output pipeline.
+
+        # Fit the transformer to the output data and transform the output data
+        y = _ensure_2d(y)  # data expected to be 2D for transformer
+        y = model.transformer.fit_transform(y)
+        regressor = model.regressor
+    else:
+        regressor = model
+
+    logger.info(f"Performing grid search for {get_model_name(regressor)}...")
+    param_space = _process_param_space(regressor, search_type, param_space)
     search_type = search_type.lower()
 
     # random search
     if search_type == "random":
         searcher = RandomizedSearchCV(
-            model,
+            regressor,
             param_space,
             n_iter=niter,
             cv=cv,
@@ -78,7 +94,8 @@
 
     # run hyperparameter search
     try:
-        searcher.fit(X, y)
+        searcher.fit(X, _ensure_1d_if_column_vec(y))
+
     except Exception:
         logger.exception(
             f"Failed to perform hyperparameter search on {get_model_name(model)}"
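The control flow introduced above (fit the output transformer on y first, transform y, then search only over the inner regressor) can be exercised with stock scikit-learn pieces. The sketch below uses PCA as a stand-in output reducer and RandomForestRegressor as a stand-in model; it illustrates the pattern, not the commit's InputOutputPipeline.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
Y = rng.normal(size=(100, 8))  # multi-output simulation targets

# Fit the output transformer first and transform the targets,
# mirroring 'y = model.transformer.fit_transform(y)' above.
output_reducer = PCA(n_components=3)
Y_reduced = output_reducer.fit_transform(Y)

# Search only over the regressor, on the transformed targets,
# mirroring 'searcher = RandomizedSearchCV(regressor, ...)' above.
searcher = RandomizedSearchCV(
    RandomForestRegressor(random_state=0),
    {"n_estimators": [50, 100, 200]},
    n_iter=3,
    cv=3,
)
searcher.fit(X, Y_reduced)
print(searcher.best_params_)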

autoemulate/model_processing.py (+97 -92)

@@ -1,96 +1,101 @@
 """Functions for getting and processing models."""
+from sklearn.decomposition import PCA
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.pipeline import Pipeline
 
-
-def _turn_models_into_multioutput(models, y):
-    """Turn single output models into multioutput models if y is 2D.
-
-    Parameters
-    ----------
-    models : dict
-        Dict of model instances.
-    y : array-like, shape (n_samples, n_outputs)
-        Simulation output.
-
-    Returns
-    -------
-    models_multi : dict
-        Dict with model instances, where single output models are now wrapped in MultiOutputRegressor.
-    """
-
-    models_multi = [
-        MultiOutputRegressor(model)
-        if not model._more_tags()["multioutput"] and (y.ndim > 1 and y.shape[1] > 1)
-        else model
-        for model in models
-    ]
-    return models_multi
-
-
-def _wrap_models_in_pipeline(models, scale, scaler, reduce_dim, dim_reducer):
-    """Wrap models in a pipeline if scale is True.
-
-    Parameters
-    ----------
-    models : dict
-        dict of model instances.
-    scale : bool
-        Whether to scale the data.
-    scaler : sklearn.preprocessing object
-        Scaler to use.
-    reduce_dim : bool
-        Whether to reduce the dimensionality of the data.
-    dim_reducer : sklearn.decomposition object
-        Dimensionality reduction method to use.
-
-    Returns
-    -------
-    models_scaled : dict
-        dict of model_names: model instances, with scaled models wrapped in a pipeline.
-    """
-
-    models_piped = []
-
-    for model in models:
-        steps = []
-        if scale:
-            steps.append(("scaler", scaler))
-        if reduce_dim:
-            steps.append(("dim_reducer", dim_reducer))
-        steps.append(("model", model))
-        # without scaling or dim reduction, the model is the only step
-        models_piped.append(Pipeline(steps))
-
-    return models_piped
-
-
-def _process_models(
-    model_registry, model_names, y, scale, scaler, reduce_dim, dim_reducer
-):
-    """Get and process models.
-
-    Parameters
-    ----------
-    model_registry : ModelRegistry
-        An instance of the ModelRegistry class.
-    model_names : list
-        List of model names.
-    y : array-like, shape (n_samples, n_outputs)
-        Simulation output.
-    scale : bool
-        Whether to scale the data.
-    scaler : sklearn.preprocessing object
-        Scaler to use.
-
-    Returns
-    -------
-    models : list
-        List of model instances.
-    """
-    models = model_registry.get_models(model_names)
-    models_multi = _turn_models_into_multioutput(models, y)
-    models_scaled = _wrap_models_in_pipeline(
-        models_multi, scale, scaler, reduce_dim, dim_reducer
-    )
-    return models_scaled
+from autoemulate.preprocess_target import get_dim_reducer
+from autoemulate.preprocess_target import InputOutputPipeline
+from autoemulate.preprocess_target import NoChangeTransformer
+from autoemulate.preprocess_target import TargetPCA
+from autoemulate.preprocess_target import TargetVAE
+
+
+class AutoEmulatePipeline:
+    def __init__(
+        self,
+        model_registry,
+        model_names,
+        y,
+        prep_config,
+        scale_input=False,
+        scaler_input=None,
+        reduce_dim_input=False,
+        dim_reducer_input=None,
+        scale_output=False,
+        scaler_output=None,
+        reduce_dim_output=False,
+    ):
+        self.model_piped = None
+        prep_name = prep_config["name"]
+        prep_params = prep_config.get("params", {})
+        self.dim_reducer_output = get_dim_reducer(prep_name, **prep_params)
+
+        self.models = model_registry.get_models(model_names)
+
+        self._turn_models_into_multioutput(y)
+
+        # Store pipeline settings as instance attributes
+        self.scale_input = scale_input
+        self.scaler_input = scaler_input
+        self.reduce_dim_input = reduce_dim_input
+        self.dim_reducer_input = dim_reducer_input
+        self.scale_output = scale_output
+        self.scaler_output = scaler_output
+        self.reduce_dim_output = reduce_dim_output
+
+        # Wrap the model and reducer into a pipeline
+        self._wrap_model_reducer_in_pipeline()
+
+    def _wrap_model_reducer_in_pipeline(self):
+        """Wrap reducer in a pipeline if reduce_dim_output is True."""
+        self.models_piped = []
+
+        for model in self.models_multi:
+            input_steps = []
+            if self.scale_input:
+                input_steps.append(("scaler", self.scaler_input))
+            if self.reduce_dim_input:
+                input_steps.append(("dim_reducer", self.dim_reducer_input))
+            input_steps.append(("model", model))
+            input_pipeline = Pipeline(input_steps)
+
+            # Create output transformation pipeline
+            output_steps = []
+            if self.scale_output:
+                output_steps.append(("scaler_output", self.scaler_output))
+            if self.reduce_dim_output:
+                output_steps.append(("dim_reducer_output", self.dim_reducer_output))
+
+            if output_steps:
+                output_pipeline = Pipeline(output_steps)
+                final_model = InputOutputPipeline(
+                    regressor=input_pipeline, transformer=output_pipeline
+                )
+                self.models_piped.append(final_model)
+            else:
+                self.models_piped.append(input_pipeline)
+        return self.models_piped
+
+    def _turn_models_into_multioutput(self, y):
+        """Turn single output models into multioutput models if y is 2D.
+
+        Parameters
+        ----------
+        models : dict
+            Dict of model instances.
+        y : array-like, shape (n_samples, n_outputs)
+            Simulation output.
+
+        Returns
+        -------
+        models_multi : dict
+            Dict with model instances, where single output models are now wrapped in MultiOutputRegressor.
+        """
+        self.models_multi = [
+            MultiOutputRegressor(model)
+            if not model._more_tags().get("multioutput", False)
+            and (y.ndim > 1 and y.shape[1] > 1)
+            else model
+            for model in self.models
+        ]
+        return self.models_multi
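Judging from the constructor arguments and from how _wrap_model_reducer_in_pipeline combines them, InputOutputPipeline appears to play the same role as scikit-learn's TransformedTargetRegressor: the input pipeline (scaler, optional reducer, model) acts as the regressor, and the output pipeline (scaler, target reducer) acts as the transformer. The self-contained sketch below reproduces that wiring with stock scikit-learn components only; it is an analogy for the structure, not the autoemulate class itself.

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 6))
Y = rng.normal(size=(200, 10))  # multi-output simulation targets

# Input side: scaler -> model, as assembled in _wrap_model_reducer_in_pipeline
input_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])

# Output side: scale then reduce the targets
output_pipeline = Pipeline([
    ("scaler_output", StandardScaler()),
    ("dim_reducer_output", PCA(n_components=3)),
])

# TransformedTargetRegressor stands in for InputOutputPipeline(regressor=..., transformer=...)
final_model = TransformedTargetRegressor(
    regressor=input_pipeline,
    transformer=output_pipeline,
    check_inverse=False,  # PCA is lossy, so skip the strict round-trip check
)
final_model.fit(X, Y)
print(final_model.predict(X).shape)  # (200, 10): predictions mapped back to the original output space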

autoemulate/plotting.py (+13 -13)

@@ -6,6 +6,7 @@
 from sklearn.metrics import r2_score
 from sklearn.pipeline import Pipeline
 
+from autoemulate.preprocess_target import InputOutputPipeline
 from autoemulate.utils import _ensure_2d
 
 
@@ -44,21 +45,19 @@ def _check_multioutput(y, output_index):
 
 def _predict_with_optional_std(model, X_test):
     """Predicts the output of the model with or without uncertainty."""
-    # see whether the model is a pipeline or not
-    if isinstance(model, Pipeline):
-        predict_params = inspect.signature(
-            model.named_steps["model"].predict
-        ).parameters
+    # Get the base model's predict signature
+    if isinstance(model, InputOutputPipeline):
+        base_model = model.regressor_.named_steps["model"]
+    elif isinstance(model, Pipeline):
+        base_model = model.named_steps["model"]
     else:
-        predict_params = inspect.signature(model.predict).parameters
-    # see whether the model has return_std in its predict parameters
-    if "return_std" in predict_params:
-        y_test_pred, y_test_std = model.predict(X_test, return_std=True)
-    else:
-        y_test_pred = model.predict(X_test)
-        y_test_std = None
+        base_model = model
 
-    return y_test_pred, y_test_std
+    predict_params = inspect.signature(base_model.predict).parameters
+    # Only pass return_std if explicitly supported
+    if "return_std" in predict_params:
+        return model.predict(X_test, return_std=True)
+    return model.predict(X_test), None
 
 
 def _calculate_subplot_layout(n_plots, n_cols=3):
@@ -412,6 +411,7 @@ def _plot_model(
     output_index : int or list of int, optional
        The index(es) of the output variable(s) to plot. If None, all outputs are used.
     """
+
     # Get predictions, with uncertainty if available
     y_pred, y_std = _predict_with_optional_std(model, X)
 
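The dispatch in _predict_with_optional_std can be sanity-checked with plain scikit-learn estimators: GaussianProcessRegressor.predict accepts a return_std keyword, while LinearRegression.predict does not. The snippet below mirrors the plain-estimator branch of the function above, purely for illustration.

import inspect

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression


def predict_with_optional_std(model, X_test):
    # Mirror of the plain-estimator branch above: only pass return_std
    # when the estimator's predict signature advertises it.
    predict_params = inspect.signature(model.predict).parameters
    if "return_std" in predict_params:
        return model.predict(X_test, return_std=True)
    return model.predict(X_test), None


X = np.linspace(0, 1, 20).reshape(-1, 1)
y = np.sin(2 * np.pi * X).ravel()

_, std = predict_with_optional_std(GaussianProcessRegressor().fit(X, y), X)
print(std is not None)  # True: per-point standard deviations are returned
_, std = predict_with_optional_std(LinearRegression().fit(X, y), X)
print(std is None)      # True: no uncertainty available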
0 commit comments
