|
1 | 1 | """Training a generative model on the dataset.""" |
2 | 2 |
|
3 | 3 | import logging |
| 4 | +from functools import partial |
4 | 5 | from pathlib import Path |
5 | 6 |
|
6 | 7 | import joblib |
| 8 | +import numpy as np |
7 | 9 | import pandas as pd |
| 10 | +import scipy.optimize as opt |
| 11 | +from scipy.special import expit as sigmoid |
| 12 | +from scipy.special import logit as inverse_sigmoid |
8 | 13 | from sklearn.model_selection import GridSearchCV |
9 | 14 | from sklearn.neighbors import KernelDensity |
10 | | -from sklearn.pipeline import Pipeline |
| 15 | +from sklearn.pipeline import Pipeline, check_is_fitted |
11 | 16 | from sklearn.preprocessing import MinMaxScaler |
12 | 17 |
|
13 | 18 | logger = logging.getLogger(__name__) |
@@ -37,70 +42,171 @@ def train_generative_model( |
37 | 42 | # Split data by country |
38 | 43 | logger.info("Splitting data into train/test sets...") |
39 | 44 | train_dfs: list[pd.DataFrame] = [] |
| 45 | + val_dfs: list[pd.DataFrame] = [] |
40 | 46 | test_dfs: list[pd.DataFrame] = [] |
41 | 47 | for country in eu_df["country_code"].unique(): |
42 | 48 | country_data = eu_df.query("country_code == @country").sample( |
43 | 49 | frac=1, random_state=seed |
44 | 50 | ) |
45 | 51 | n_test = min(test_samples_per_country, len(country_data) // 5) |
46 | 52 | test_dfs.append(country_data.iloc[:n_test]) |
47 | | - train_dfs.append(country_data.iloc[n_test:]) |
| 53 | + val_dfs.append(country_data.iloc[n_test : 2 * n_test]) |
| 54 | + train_dfs.append(country_data.iloc[2 * n_test :]) |
48 | 55 |
|
49 | 56 | # Set up the data as NumPy arrays |
50 | 57 | train_matrix = scaler.transform(pd.concat(train_dfs)[question_columns].values) |
| 58 | + val_matrix = scaler.transform(pd.concat(val_dfs)[question_columns].values) |
51 | 59 | test_matrix = scaler.transform(pd.concat(test_dfs)[question_columns].values) |
52 | 60 | logger.info( |
53 | | - f"There are {len(train_matrix):,} training samples and {len(test_matrix):,} " |
54 | | - "test samples." |
| 61 | + f"There are {len(train_matrix):,} training samples, " |
| 62 | + f"{len(val_matrix):,} validation samples, " |
| 63 | + f"and {len(test_matrix):,} test samples." |
55 | 64 | ) |
56 | 65 |
|
57 | 66 | # Initialise the model |
58 | 67 | grid = GridSearchCV( |
59 | 68 | estimator=KernelDensity(), |
60 | 69 | param_grid=dict( |
61 | | - bandwidth=[0.1, 0.2, 0.3, 0.4, 0.5, 1.0, "scott", "silverman"], |
62 | | - leaf_size=[10, 20, 30, 40, 50], |
| 70 | + bandwidth=[0.1, 0.2, 0.3, 0.4, 0.5], leaf_size=[10, 20, 30, 40, 50] |
63 | 71 | ), |
64 | 72 | n_jobs=-1, |
65 | 73 | ) |
66 | 74 |
|
67 | 75 | # Fit the model |
68 | 76 | logger.info("Training the model on the training data...") |
69 | 77 | grid.fit(train_matrix) |
70 | | - model = grid.best_estimator_ |
| 78 | + model: KernelDensity = grid.best_estimator_ |
71 | 79 | logger.info(f"Best model found with the parameters {grid.best_params_}.") |
72 | 80 |
|
| 81 | + # Set the `transform` method of the model to the score_samples method, as this will |
| 82 | + # allow us to use the scaler, model and scorer in the same pipeline |
| 83 | + model.transform = model.score_samples.__get__(model) |
| 84 | + |
| 85 | + # logger.info("Computing the log-likelihoods for the training data...") |
| 86 | + train_log_likelihoods = model.transform(train_matrix) |
| 87 | + |
| 88 | + logger.info("Computing the log-likelihoods for the validation data...") |
| 89 | + val_log_likelihoods = model.transform(val_matrix) |
| 90 | + |
| 91 | + logger.info("Computing the log-likelihoods for the test data...") |
| 92 | + test_log_likelihoods = model.transform(test_matrix) |
| 93 | + |
| 94 | + # Fit the log-likelihood transform |
| 95 | + logger.info("Fitting the sigmoid transform on the validation data...") |
| 96 | + scorer = SigmoidTransformer().fit(val_log_likelihoods) |
| 97 | + |
73 | 98 | # Evaluate the model |
74 | | - logger.info("Evaluating the model on the training and test data...") |
75 | | - train_log_likelihoods = model.score_samples(train_matrix) |
| 99 | + logger.info("Evaluating the model on the training, validation and test data...") |
76 | 100 | logger.info( |
77 | 101 | f"Log-likelihoods for train:\n" |
78 | 102 | f"\t- Mean: {train_log_likelihoods.mean():.4f}\n" |
79 | 103 | f"\t- Std: {train_log_likelihoods.std():.4f}\n" |
80 | 104 | f"\t- Min: {train_log_likelihoods.min():.4f}\n" |
81 | 105 | f"\t- 10% quantile: {pd.Series(train_log_likelihoods).quantile(q=0.1):.4f}\n" |
82 | 106 | f"\t- 90% quantile: {pd.Series(train_log_likelihoods).quantile(q=0.9):.4f}\n" |
83 | | - f"\t- Max: {train_log_likelihoods.max():.4f}" |
| 107 | + f"\t- Max: {train_log_likelihoods.max():.4f}\n" |
| 108 | + f"Mean score for train: {scorer.transform(train_log_likelihoods).mean():.0%}" |
| 109 | + ) |
| 110 | + logger.info( |
| 111 | + f"Log-likelihoods for validation:\n" |
| 112 | + f"\t- Mean: {val_log_likelihoods.mean():.4f}\n" |
| 113 | + f"\t- Std: {val_log_likelihoods.std():.4f}\n" |
| 114 | + f"\t- Min: {val_log_likelihoods.min():.4f}\n" |
| 115 | + f"\t- 10% quantile: {pd.Series(val_log_likelihoods).quantile(q=0.1):.4f}\n" |
| 116 | + f"\t- 90% quantile: {pd.Series(val_log_likelihoods).quantile(q=0.9):.4f}\n" |
| 117 | + f"\t- Max: {val_log_likelihoods.max():.4f}\n" |
| 118 | + f"Mean score for validation: {scorer.transform(val_log_likelihoods).mean():.0%}" |
84 | 119 | ) |
85 | | - test_log_likelihoods = model.score_samples(test_matrix) |
86 | 120 | logger.info( |
87 | 121 | f"Log-likelihoods for test:\n" |
88 | 122 | f"\t- Mean: {test_log_likelihoods.mean():.4f}\n" |
89 | 123 | f"\t- Std: {test_log_likelihoods.std():.4f}\n" |
90 | 124 | f"\t- Min: {test_log_likelihoods.min():.4f}\n" |
91 | 125 | f"\t- 10% quantile: {pd.Series(test_log_likelihoods).quantile(q=0.1):.4f}\n" |
92 | 126 | f"\t- 90% quantile: {pd.Series(test_log_likelihoods).quantile(q=0.9):.4f}\n" |
93 | | - f"\t- Max: {test_log_likelihoods.max():.4f}" |
| 127 | + f"\t- Max: {test_log_likelihoods.max():.4f}\n" |
| 128 | + f"Mean score for test: {scorer.transform(test_log_likelihoods).mean():.0%}" |
94 | 129 | ) |
95 | 130 |
|
96 | 131 | # Train final model on all data |
97 | 132 | logger.info("Training final model on entire EU dataset...") |
98 | 133 | full_matrix = scaler.transform(eu_df[question_columns].values) |
99 | 134 | model.fit(full_matrix) |
100 | | - pipeline = Pipeline([("scaler", scaler), ("model", model)]) |
| 135 | + pipeline = Pipeline([("scaler", scaler), ("model", model), ("scorer", scorer)]) |
101 | 136 |
|
102 | 137 | # Save the complete pipeline |
103 | 138 | model_path = Path("models", "model.pkl") |
104 | 139 | model_path.parent.mkdir(exist_ok=True) |
105 | 140 | joblib.dump(pipeline, model_path) |
106 | 141 | logger.info(f"Pipeline saved to {model_path.resolve()}") |
| 142 | + |
| 143 | + |
class SigmoidTransformer:
    """Transformer mapping log-likelihoods to scores in (0, 1) with a sigmoid.

    The fitted transform is `sigmoid(alpha_ * (X - center_))`, where `alpha_` is the
    steepness of the curve and `center_` is the log-likelihood mapped to a score of
    0.5. Both attributes are set by `fit`.

    Args:
        target:
            The score, strictly between 0 and 1, that the fitting data should be
            mapped to (in the least-squares sense described in `fit`). Defaults to
            0.99.
        span:
            The length that the 5%-95% quantile range of the fitting data is
            rescaled to before the sigmoid is applied. Defaults to 10, which gives
            a smooth sigmoid curve that is not too flat.
    """

    def __init__(self, target: float = 0.99, span: float = 10.0) -> None:
        self.target = target
        self.span = span

    def fit(self, X: np.ndarray, y: object = None) -> "SigmoidTransformer":
        """Fit the steepness and center of the sigmoid to the data.

        Args:
            X:
                The input array of log-likelihoods.
            y:
                Ignored. Accepted so that callers following the `fit(X, y)` calling
                convention can use the transformer.

        Returns:
            The fitted transformer.

        Raises:
            ValueError:
                If `target` is not strictly between 0 and 1, if `span` is not
                positive, if `X` is empty or contains non-finite values, or if the
                5% and 95% quantiles of `X` coincide (the steepness would be
                undefined).
        """
        if not 0.0 < self.target < 1.0:
            raise ValueError(
                f"The target must be strictly between 0 and 1, got {self.target!r}."
            )
        if not self.span > 0.0:
            raise ValueError(f"The span must be positive, got {self.span!r}.")

        values = np.asarray(X, dtype=float)
        if values.size == 0:
            raise ValueError("Cannot fit the sigmoid transformer on an empty array.")
        if not np.isfinite(values).all():
            raise ValueError(
                "Cannot fit the sigmoid transformer on non-finite log-likelihoods."
            )

        # We choose the alpha parameter such that the 5%-95% quantile range is shrunk
        # down to a length of `span`
        lower, upper = np.quantile(values, q=[0.05, 0.95])
        spread = float(upper - lower)
        if spread <= 0.0:
            raise ValueError(
                "The 5% and 95% quantiles of the log-likelihoods coincide, so the "
                "steepness of the sigmoid cannot be determined."
            )
        self.alpha_ = self.span / spread

        # The objective in `_loss` is mean((alpha * (x - center) - logit(target))^2),
        # a quadratic in `center`. Setting its derivative to zero gives the exact
        # minimiser below, so no numerical optimisation is needed.
        self.center_ = float(
            values.mean() - inverse_sigmoid(self.target) / self.alpha_
        )
        logger.info(
            f"Fitted sigmoid transformer with alpha={self.alpha_:.2f} and "
            f"center={self.center_:.2f}."
        )
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Transform the input data using the fitted sigmoid function.

        Args:
            X:
                The input array of log-likelihoods.

        Returns:
            The transformed values between 0 and 1.

        Raises:
            sklearn.exceptions.NotFittedError:
                If `fit` has not been called yet.
        """
        # This class is a plain Python class rather than a scikit-learn estimator, so
        # the fitted state is checked explicitly. The import is local and only on
        # the error path, so that applying a fitted transformer needs nothing beyond
        # SciPy.
        missing = [name for name in ("alpha_", "center_") if not hasattr(self, name)]
        if missing:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. Call 'fit' "
                "with appropriate arguments before using this estimator."
            )
        return sigmoid(self.alpha_ * (X - self.center_))

    @staticmethod
    def _loss(
        center: np.ndarray, array: np.ndarray, target: float, alpha: float
    ) -> float:
        """Calculate the loss for the sigmoid transformation.

        The loss is measured before the sigmoid is applied: it is the mean squared
        difference between `alpha * (array - center)` and `logit(target)`. It is
        zero exactly when every value is mapped to `target` by the sigmoid.

        `fit` minimises this objective in closed form; the function is kept as the
        reference definition of that objective.

        Args:
            center:
                The center of the sigmoid curve.
            array:
                The input array of log-likelihoods.
            target:
                The target value for the sigmoid transformation.
            alpha:
                The steepness of the sigmoid curve.

        Returns:
            The l2 loss between the scaled values and the logit of the target.
        """
        target_logit = inverse_sigmoid(target)
        errors = (alpha * (array - center) - target_logit) ** 2
        return np.mean(errors).item()
0 commit comments