|
| 1 | +""" |
| 2 | +==================================================== |
| 3 | +Hyperparameter Optimization Benchmark with OpenML |
| 4 | +==================================================== |
| 5 | +
|
| 6 | +In this tutorial, we walk through how to conduct hyperparameter optimization experiments using |
| 7 | +OpenML and OptunaHub. |
| 8 | +""" |
| 9 | + |
| 10 | +############################################################################ |
| 11 | +# Please make sure to install the dependencies with: |
| 12 | +# ``pip install -r requirements.txt`` |
| 13 | +# Then we import all the necessary modules. |
| 14 | + |
| 15 | +# License: MIT License |
| 16 | +import logging |
| 17 | + |
| 18 | +import optuna |
| 19 | + |
| 20 | +import openml |
| 21 | +from openml.extensions.sklearn import cat |
| 22 | +from openml.extensions.sklearn import cont |
| 23 | +from sklearn.compose import ColumnTransformer |
| 24 | +from sklearn.ensemble import RandomForestClassifier |
| 25 | +from sklearn.impute import SimpleImputer |
| 26 | +from sklearn.pipeline import Pipeline |
| 27 | +from sklearn.preprocessing import OneHotEncoder |
| 28 | + |
| 29 | + |
# Obtain the logger through the module factory: logging.getLogger() registers
# the logger in the logging hierarchy and returns the same instance on repeated
# calls, whereas instantiating logging.Logger directly is unsupported per the
# logging documentation. Level 1 keeps every log record visible.
logger = logging.getLogger("Experiment Logger")
logger.setLevel(1)

# Set your openml api key if you want to upload your results to OpenML (eg:
# https://openml.org/search?type=run&sort=date) . To get one, simply make an
# account (you don't need one for anything else, just to upload your results),
# go to your profile and select the API-KEY.
# Or log in, and navigate to https://www.openml.org/auth/api-key
openml.config.apikey = ""
| 38 | + |
| 39 | +############################################################################ |
| 40 | +# Prepare for preprocessors and an OpenML task |
| 41 | +# ============================================ |
| 42 | + |
| 43 | +# OpenML contains several key concepts which it needs to make machine learning research shareable. |
| 44 | +# A machine learning experiment consists of one or several runs, which describe the performance of |
| 45 | +# an algorithm (called a flow in OpenML), its hyperparameter settings (called a setup) on a task. |
# A Task is the combination of a dataset, a split and an evaluation metric. We choose a dataset from
| 47 | +# OpenML, (https://www.openml.org/d/1464) and a subsequent task (https://www.openml.org/t/10101) To |
| 48 | +# make your own dataset and task, please refer to |
| 49 | +# https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html |
| 50 | + |
| 51 | +# https://www.openml.org/search?type=study&study_type=task&id=218 |
# The OpenML task to optimize on (https://www.openml.org/t/10101) and the
# random seed used for every model fit below.
task_id = 10101
seed = 42

# Preprocessing: one-hot encode categorical columns (unseen categories are
# ignored rather than raising) and median-impute numerical columns. `cat` and
# `cont` are OpenML's sklearn column selectors for categorical / continuous
# features of the task's dataset.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
imputer = SimpleImputer(strategy="median")
categorical_preproc = ("categorical", encoder, cat)
numerical_preproc = ("numerical", imputer, cont)
preproc = ColumnTransformer([categorical_preproc, numerical_preproc])
| 61 | + |
| 62 | +############################################################################ |
# Define a pipeline for the hyperparameter optimization (this is standard for Optuna)
# ====================================================================================
| 65 | + |
| 66 | +# Optuna explanation |
| 67 | +# we follow the `Optuna <https://github.com/optuna/optuna/>`__ search space design. |
| 68 | + |
| 69 | +# OpenML runs |
| 70 | +# We can simply pass the parametrized classifier to `run_model_on_task` to obtain the performance |
| 71 | +# of the pipeline |
| 72 | +# on the specified OpenML task. |
# If you want to share your results along with an easily reproducible pipeline, you can set an API
# key and just upload your results.
| 75 | +# You can find more examples on https://www.openml.org/ |
| 76 | + |
| 77 | + |
def objective(trial: optuna.Trial) -> float:
    """Train and score one hyperparameter configuration on the OpenML task.

    Builds a preprocessing + RandomForest pipeline from the trial's suggested
    hyperparameters, evaluates it through ``openml.runs.run_model_on_task``,
    optionally publishes the run to OpenML, and returns the accuracy that the
    study maximizes.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The predictive accuracy achieved by the pipeline on the task.
    """
    clf = RandomForestClassifier(
        max_depth=trial.suggest_int("max_depth", 2, 32, log=True),
        min_samples_leaf=trial.suggest_float("min_samples_leaf", 0.0, 1.0),
        random_state=seed,
    )
    pipe = Pipeline(steps=[("preproc", preproc), ("model", clf)])
    logger.log(1, f"Running pipeline - {pipe}")
    # avoid_duplicate_runs=False: always execute locally, even when an
    # identical setup already exists on the OpenML server.
    run = openml.runs.run_model_on_task(pipe, task=task_id, avoid_duplicate_runs=False)

    logger.log(1, f"Model has been trained - {run}")
    if openml.config.apikey != "":
        try:
            run.publish()

            logger.log(1, f"Run was uploaded to - {run.openml_url}")
        except Exception as e:
            # Publishing is best-effort: a failed upload must not abort the
            # optimization loop, so log the error and continue.
            logger.log(1, f"Could not publish run - {e}")
    else:
        logger.log(
            0,
            "If you want to publish your results to OpenML, please set an apikey",
        )
    # fold_evaluations maps metric -> repeat -> fold -> value; this takes the
    # best fold of repeat 0. NOTE(review): max over folds is an optimistic
    # estimate — the mean over folds may be the intended aggregate; confirm.
    accuracy = max(run.fold_evaluations["predictive_accuracy"][0].values())
    logger.log(0, f"Accuracy {accuracy}")

    return accuracy
| 105 | + |
| 106 | + |
| 107 | +############################################################################ |
| 108 | +# Optimize the pipeline |
| 109 | +# ===================== |
# Create a study that maximizes the objective and run a fixed trial budget.
n_trials = 15
study = optuna.create_study(direction="maximize")
logger.log(0, f"Study {study}")
study.optimize(objective, n_trials=n_trials)

############################################################################
# Visualize the optimization history
# ==================================
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
0 commit comments