Skip to content
Merged
Show file tree
Hide file tree
Changes from 68 commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
a52f9f2
pre-process target
marjanfamili Mar 21, 2025
d391596
Added get_model function
ContiPaolo Mar 21, 2025
df31adc
Modified pipeline and add output dimensionality reducer
ContiPaolo Mar 21, 2025
1a4dbe1
Fixed issues compare.py
ContiPaolo Mar 21, 2025
fedfbbf
Fix issues in utils.py
ContiPaolo Mar 21, 2025
5f2c9e8
added test_notebook
marjanfamili Mar 21, 2025
21142b8
Add reaction-diffusion tutorial
ContiPaolo Mar 21, 2025
4c4265e
changing the pipeline to have UQ
marjanfamili Mar 21, 2025
f1152da
debugging and merge
marjanfamili Mar 21, 2025
8856054
pre-commit
marjanfamili Mar 21, 2025
239030d
Added reaction-diffusion simulator
ContiPaolo Mar 24, 2025
55c41c4
Added VAE
ContiPaolo Mar 24, 2025
9d5e349
Add reactiondiffusion data
ContiPaolo Mar 24, 2025
6d5dd3b
This is a large commit - I have implemented an outer loop around the …
marjanfamili Mar 24, 2025
919b639
merged from the origin
marjanfamili Mar 24, 2025
0cf2600
Implemented reconstruction of std
ContiPaolo Mar 25, 2025
4371c2d
improved reconstruction of std for PCA
ContiPaolo Mar 25, 2025
7342943
fixed VAE issues
ContiPaolo Mar 26, 2025
c9404f7
Add a non-trainable Reducer wrapper
ContiPaolo Mar 26, 2025
13d499f
Inserted pretrained reducers into the Pipeline
ContiPaolo Mar 27, 2025
9855b64
Fixed reconstruction of UQ
ContiPaolo Mar 27, 2025
cd16eff
added predict_with_std
ContiPaolo Mar 28, 2025
3e76721
comparison with multiple reducers
ContiPaolo Mar 28, 2025
dbad474
update notebook
marjanfamili Mar 29, 2025
1edb24c
changed the name of hidden_dims to hidden_layers, debugged plot_cv to…
marjanfamili Mar 31, 2025
008766d
Removed the unnecessary tqdm itterations, much faster now
marjanfamili Mar 31, 2025
adf50dc
Merge pull request #363 from alan-turing-institute/Preprocessing-Pipe…
marjanfamili Mar 31, 2025
a49504b
fixed variable names
ContiPaolo Mar 31, 2025
6dd391c
added ModelPrepPipeline to wrap all pipeline creation in one
marjanfamili Mar 31, 2025
096de43
fixed printing and added spatio-temporal tutorial
ContiPaolo Mar 31, 2025
a84ee77
some debugging for _plot_cv, misspelled function name and pre-commit…
marjanfamili Mar 31, 2025
c753c02
fixing optimiser
marjanfamili Apr 1, 2025
3173a36
fixing optimiser
marjanfamili Apr 1, 2025
13ac454
fixing tests
marjanfamili Apr 1, 2025
1fb2c8b
fixing tests
marjanfamili Apr 2, 2025
cc4b69b
Merge branch 'Preprocessing' of github.com:alan-turing-institute/auto…
marjanfamili Apr 2, 2025
ddb5c35
fixed output_pipeline and hyper_param search
ContiPaolo Apr 2, 2025
7325fa0
Merge branch 'Preprocessing' of github.com:alan-turing-institute/auto…
marjanfamili Apr 2, 2025
0ff7e7e
Merge branch 'Preprocessing' of github.com:alan-turing-institute/auto…
marjanfamili Apr 2, 2025
7e6d1cc
put scaler_output outside the loop
ContiPaolo Apr 2, 2025
905edf1
Merge remote-tracking branch 'origin/Preprocessing' into Preprocessing
ContiPaolo Apr 2, 2025
2cfd253
fixed the tests in test_compare and also access to the transformer
marjanfamili Apr 2, 2025
4dbc77d
fixing conflicts
marjanfamili Apr 2, 2025
9d76dec
minor changes
marjanfamili Apr 2, 2025
4b49167
use _predict_with_optional_std in history matching
marjanfamili Apr 2, 2025
fb69457
fixing tests in test_model_processing and added a new test
marjanfamili Apr 2, 2025
5111db1
added test for preprocess_target
marjanfamili Apr 2, 2025
5d8380c
end_to_end test passing
marjanfamili Apr 2, 2025
06d6460
Currently only 6 tests failing
marjanfamili Apr 3, 2025
08d1c3c
CV_results pass with no fail
marjanfamili Apr 3, 2025
fa1c1d6
test_ui fixed
marjanfamili Apr 3, 2025
1478794
pre-commit
marjanfamili Apr 3, 2025
e5b5418
fixed predict and predict_with_optional_std. all tests passed
ContiPaolo Apr 4, 2025
3568d7b
put "scaler_output" in the outer loop (with reducer) to have all trai…
ContiPaolo Apr 4, 2025
e49ca20
debugging plot_cv
marjanfamili Apr 4, 2025
5a258e1
fix pipeline bug
marjanfamili Apr 4, 2025
e18ceed
deleted unnecessary file
marjanfamili Apr 4, 2025
09cd26a
Fix warning for 2D column vec
sgreenbury Apr 15, 2025
a273831
Add _ensure_2d call to enable 1D or 2D data to be passed
sgreenbury Apr 15, 2025
78ce9cd
Apply suggested changes from PR review
ContiPaolo Apr 15, 2025
a154279
Merge remote-tracking branch 'origin/Preprocessing' into Preprocessing
ContiPaolo Apr 15, 2025
e8389b6
Remove _ensure_2d
sgreenbury Apr 15, 2025
09ddf6d
Merge remote-tracking branch 'origin/main' into Preprocessing
sgreenbury Apr 15, 2025
7c5d146
Fix pre-commit
sgreenbury Apr 15, 2025
fc9d7fd
Remove obsolete comments
sgreenbury Apr 15, 2025
8ce0464
Updated reaction_diffusion names
ContiPaolo Apr 15, 2025
5286b89
Update imports
sgreenbury Apr 15, 2025
4a5e6b2
Rename notebooks
sgreenbury Apr 15, 2025
e3a5fe0
Remove obsolete comments and fixed return object of compare
ContiPaolo Apr 16, 2025
168d9c1
added tutorial 5 to the documentation
ContiPaolo Apr 16, 2025
26961fd
Add more comments to the tutorial 5
ContiPaolo Apr 22, 2025
1d02f6f
fixed minor issues in testing preprocessing
ContiPaolo Apr 23, 2025
2af9f3e
fixed issues with loading in tests
ContiPaolo Apr 23, 2025
85cd6df
Merge branch 'main' into Preprocessing
ContiPaolo Apr 23, 2025
4bad82b
fixed issues with loading in tests
ContiPaolo Apr 23, 2025
9d014cd
Merge remote-tracking branch 'origin/Preprocessing' into Preprocessing
ContiPaolo Apr 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ docs/_build/
docs/generated/
.sphinx-build-environment


# Auto-generated documentation
_autosummary/
_autodoc/
Expand All @@ -32,6 +33,8 @@ Thumbs.db
# Quarto
README.html
README_files/
requirements.txt


# Ignore pyrightconfig.json to enable custom venv to be set
pyrightconfig.json
697 changes: 572 additions & 125 deletions autoemulate/compare.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions autoemulate/cross_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

from autoemulate.utils import _ensure_1d_if_column_vec
from autoemulate.utils import get_model_name
from autoemulate.utils import get_model_params

Expand Down Expand Up @@ -46,6 +47,8 @@ def _run_cv(X, y, cv, model, metrics, n_jobs=None, logger=None):
logger.info(f"Cross-validating {get_model_name(model)}...")
logger.info(f"Parameters: {get_model_params(model)}")

y = _ensure_1d_if_column_vec(y)

cv_results = None
try:
cv_results = cross_validate(
Expand Down
25 changes: 21 additions & 4 deletions autoemulate/hyperparam_searching.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from sklearn.model_selection import RandomizedSearchCV

from autoemulate.utils import _adjust_param_space
from autoemulate.utils import _ensure_1d_if_column_vec
from autoemulate.utils import _ensure_2d
from autoemulate.utils import get_model_name
from autoemulate.utils import get_model_param_space
from autoemulate.utils import get_model_params
Expand Down Expand Up @@ -57,14 +59,28 @@ def _optimize_params(
-------
Refitted estimator on the whole dataset with best parameters.
"""
logger.info(f"Performing grid search for {get_model_name(model)}...")
param_space = _process_param_space(model, search_type, param_space)

if hasattr(model, "transformer"):
# If 'model' exposes a 'transformer', that transformer must be fitted before performing the search.

# Note: when a transformer is present, 'model' is an InputOutputPipeline, where 'regressor'
# is the input pipeline (containing the model) and 'transformer' is the output pipeline.

# Fit the transformer to the output data and transform the output data
y = _ensure_2d(y) # data expected to be 2D for transformer
y = model.transformer.fit_transform(y)
regressor = model.regressor
else:
regressor = model

logger.info(f"Performing grid search for {get_model_name(regressor)}...")
param_space = _process_param_space(regressor, search_type, param_space)
search_type = search_type.lower()

# random search
if search_type == "random":
searcher = RandomizedSearchCV(
model,
regressor,
param_space,
n_iter=niter,
cv=cv,
Expand All @@ -78,7 +94,8 @@ def _optimize_params(

# run hyperparameter search
try:
searcher.fit(X, y)
searcher.fit(X, _ensure_1d_if_column_vec(y))

except Exception:
logger.exception(
f"Failed to perform hyperparameter search on {get_model_name(model)}"
Expand Down
189 changes: 97 additions & 92 deletions autoemulate/model_processing.py
Original file line number Diff line number Diff line change
@@ -1,96 +1,101 @@
"""Functions for getting and processing models."""
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline


def _turn_models_into_multioutput(models, y):
"""Turn single output models into multioutput models if y is 2D.

Parameters
----------
models : dict
Dict of model instances.
y : array-like, shape (n_samples, n_outputs)
Simulation output.

Returns
-------
models_multi : dict
Dict with model instances, where single output models are now wrapped in MultiOutputRegressor.
"""

models_multi = [
MultiOutputRegressor(model)
if not model._more_tags()["multioutput"] and (y.ndim > 1 and y.shape[1] > 1)
else model
for model in models
]
return models_multi


def _wrap_models_in_pipeline(models, scale, scaler, reduce_dim, dim_reducer):
"""Wrap models in a pipeline if scale is True.

Parameters
----------
models : dict
dict of model instances.
scale : bool
Whether to scale the data.
scaler : sklearn.preprocessing object
Scaler to use.
reduce_dim : bool
Whether to reduce the dimensionality of the data.
dim_reducer : sklearn.decomposition object
Dimensionality reduction method to use.

Returns
-------
models_scaled : dict
dict of model_names: model instances, with scaled models wrapped in a pipeline.
"""

models_piped = []

for model in models:
steps = []
if scale:
steps.append(("scaler", scaler))
if reduce_dim:
steps.append(("dim_reducer", dim_reducer))
steps.append(("model", model))
# without scaling or dim reduction, the model is the only step
models_piped.append(Pipeline(steps))

return models_piped


def _process_models(
    model_registry, model_names, y, scale, scaler, reduce_dim, dim_reducer
):
    """Fetch the requested models and prepare them for fitting.

    The models are looked up in the registry, passed through
    ``_turn_models_into_multioutput`` together with ``y``, and finally passed
    through ``_wrap_models_in_pipeline``.

    Parameters
    ----------
    model_registry : ModelRegistry
        Object providing ``get_models(model_names)``.
    model_names : list
        Names of the models to retrieve.
    y : array-like, shape (n_samples, n_outputs)
        Simulation output.
    scale : bool
        Whether to add a scaling step.
    scaler : sklearn.preprocessing object
        Scaler to use.
    reduce_dim : bool
        Whether to add a dimensionality-reduction step.
    dim_reducer : sklearn.decomposition object
        Dimensionality reducer to use.

    Returns
    -------
    models : list
        The processed model instances.
    """
    return _wrap_models_in_pipeline(
        _turn_models_into_multioutput(model_registry.get_models(model_names), y),
        scale,
        scaler,
        reduce_dim,
        dim_reducer,
    )
from autoemulate.preprocess_target import get_dim_reducer
from autoemulate.preprocess_target import InputOutputPipeline
from autoemulate.preprocess_target import NoChangeTransformer
from autoemulate.preprocess_target import TargetPCA
from autoemulate.preprocess_target import TargetVAE


class AutoEmulatePipeline:
    """Build one ready-to-fit estimator per requested model.

    On construction the models are fetched from the registry, passed through
    ``_turn_models_into_multioutput``, and wrapped in an input ``Pipeline``
    (optional scaler and reducer, then the model). If any output
    preprocessing is enabled, that pipeline is wrapped again in an
    ``InputOutputPipeline`` together with an output ``Pipeline``.

    Parameters
    ----------
    model_registry : object
        Must provide ``get_models(model_names)``.
    model_names : list
        Names forwarded to ``model_registry.get_models``.
    y : array-like, shape (n_samples,) or (n_samples, n_outputs)
        Simulation output; only ``ndim`` and ``shape`` are read.
    prep_config : dict or None
        Output-reducer configuration with a ``"name"`` entry and an optional
        ``"params"`` dict, forwarded to ``get_dim_reducer``. May be ``None``
        only when ``reduce_dim_output`` is false.
    scale_input, reduce_dim_input, scale_output, reduce_dim_output : bool
        Toggles for the respective preprocessing steps.
    scaler_input, dim_reducer_input, scaler_output : object
        Transformer instances used for the enabled steps.

    Raises
    ------
    ValueError
        If ``reduce_dim_output`` is true but ``prep_config`` is ``None``.
    """

    def __init__(
        self,
        model_registry,
        model_names,
        y,
        prep_config,
        scale_input=False,
        scaler_input=None,
        reduce_dim_input=False,
        dim_reducer_input=None,
        scale_output=False,
        scaler_output=None,
        reduce_dim_output=False,
    ):
        # NOTE(review): 'model_piped' looks like a misspelling of 'models_piped'.
        # It is never assigned anything but None here; kept only so that any
        # existing reader of the attribute does not break.
        self.model_piped = None
        self.models_piped = []

        if prep_config is None:
            # Without a config there is nothing to build a reducer from. That
            # is only acceptable if the reducer will not be used.
            if reduce_dim_output:
                raise ValueError(
                    "prep_config must be provided when reduce_dim_output is True"
                )
            self.dim_reducer_output = None
        else:
            prep_name = prep_config["name"]
            prep_params = prep_config.get("params", {})
            self.dim_reducer_output = get_dim_reducer(prep_name, **prep_params)

        self.models = model_registry.get_models(model_names)

        self._turn_models_into_multioutput(y)

        # Store pipeline settings as instance attributes
        self.scale_input = scale_input
        self.scaler_input = scaler_input
        self.reduce_dim_input = reduce_dim_input
        self.dim_reducer_input = dim_reducer_input
        self.scale_output = scale_output
        self.scaler_output = scaler_output
        self.reduce_dim_output = reduce_dim_output

        # Wrap the model and reducer into a pipeline
        self._wrap_model_reducer_in_pipeline()

    def _wrap_model_reducer_in_pipeline(self):
        """Wrap each model in its input pipeline and optional output pipeline.

        Reads ``self.models_multi`` and the preprocessing settings stored on
        the instance, and rebuilds ``self.models_piped`` from scratch.

        Returns
        -------
        models_piped : list
            One entry per model: an ``InputOutputPipeline`` when output
            scaling or output reduction is enabled, otherwise the input
            ``Pipeline`` itself.
        """
        self.models_piped = []

        for model in self.models_multi:
            input_steps = []
            if self.scale_input:
                input_steps.append(("scaler", self.scaler_input))
            if self.reduce_dim_input:
                input_steps.append(("dim_reducer", self.dim_reducer_input))
            input_steps.append(("model", model))
            input_pipeline = Pipeline(input_steps)

            # Create output transformation pipeline
            output_steps = []
            if self.scale_output:
                output_steps.append(("scaler_output", self.scaler_output))
            if self.reduce_dim_output:
                output_steps.append(("dim_reducer_output", self.dim_reducer_output))

            if output_steps:
                output_pipeline = Pipeline(output_steps)
                final_model = InputOutputPipeline(
                    regressor=input_pipeline, transformer=output_pipeline
                )
                self.models_piped.append(final_model)
            else:
                # No output preprocessing requested: use the input pipeline as is.
                self.models_piped.append(input_pipeline)
        return self.models_piped

    def _turn_models_into_multioutput(self, y):
        """Wrap single-output models in ``MultiOutputRegressor`` if ``y`` is multi-output.

        Reads ``self.models`` and stores the result in ``self.models_multi``.

        Parameters
        ----------
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            Simulation output; only ``ndim`` and ``shape`` are read.

        Returns
        -------
        models_multi : list
            One entry per model in ``self.models``, in order. A model is
            wrapped only when ``y`` has more than one output column and the
            model's ``_more_tags()`` does not report ``"multioutput"`` support.
        """
        # The target layout is the same for every model, so evaluate it once.
        y_is_multioutput = y.ndim > 1 and y.shape[1] > 1
        if not y_is_multioutput:
            self.models_multi = list(self.models)
        else:
            self.models_multi = [
                model
                if model._more_tags().get("multioutput", False)
                else MultiOutputRegressor(model)
                for model in self.models
            ]
        return self.models_multi
26 changes: 13 additions & 13 deletions autoemulate/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

from autoemulate.preprocess_target import InputOutputPipeline
from autoemulate.utils import _ensure_2d


Expand Down Expand Up @@ -44,21 +45,19 @@ def _check_multioutput(y, output_index):

def _predict_with_optional_std(model, X_test):
    """Predict with the model, returning the standard deviation when supported.

    Parameters
    ----------
    model : estimator, Pipeline or InputOutputPipeline
        Model to predict with. For a ``Pipeline`` the step named ``"model"``
        is inspected; for an ``InputOutputPipeline`` the ``"model"`` step of
        ``model.regressor_`` is inspected (presumably the fitted input
        pipeline; confirm against ``InputOutputPipeline``).
    X_test : array-like
        Inputs forwarded to ``model.predict``.

    Returns
    -------
    tuple
        ``model.predict(X_test, return_std=True)`` if the inspected
        estimator's ``predict`` accepts ``return_std``; otherwise
        ``(model.predict(X_test), None)``.
    """
    # Get the base model's predict signature: wrappers may not expose
    # 'return_std' themselves, so look at the innermost estimator.
    if isinstance(model, InputOutputPipeline):
        base_model = model.regressor_.named_steps["model"]
    elif isinstance(model, Pipeline):
        base_model = model.named_steps["model"]
    else:
        base_model = model

    predict_params = inspect.signature(base_model.predict).parameters
    # Only pass return_std if explicitly supported
    if "return_std" in predict_params:
        return model.predict(X_test, return_std=True)
    return model.predict(X_test), None


def _calculate_subplot_layout(n_plots, n_cols=3):
Expand Down Expand Up @@ -412,6 +411,7 @@ def _plot_model(
output_index : int or list of int, optional
The index(es) of the output variable(s) to plot. If None, all outputs are used.
"""

# Get predictions, with uncertainty if available
y_pred, y_std = _predict_with_optional_std(model, X)

Expand Down
Loading
Loading