Skip to content

Commit baa666d

Browse files
committed
feat: Optimise sigmoid log-likelihood transformation from data
1 parent 8a379aa commit baa666d

4 files changed

Lines changed: 136 additions & 49 deletions

File tree

config/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ focus: null
1414
seed: 4242
1515

1616
# Using a subset of the questions
17-
subset_csv: null
17+
subset_csv: data/processed/optimisation-davies-bouldin-penalty10/davies-bouldin-penalty10-eufocus-1000it.csv
1818
top_num_questions_in_subset: null
1919

2020
plotting:

src/european_values/generative_training.py

Lines changed: 119 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
"""Training generative on the dataset."""
22

33
import logging
4+
from functools import partial
45
from pathlib import Path
56

67
import joblib
8+
import numpy as np
79
import pandas as pd
10+
import scipy.optimize as opt
11+
from scipy.special import expit as sigmoid
12+
from scipy.special import logit as inverse_sigmoid
813
from sklearn.model_selection import GridSearchCV
914
from sklearn.neighbors import KernelDensity
10-
from sklearn.pipeline import Pipeline
15+
from sklearn.pipeline import Pipeline, check_is_fitted
1116
from sklearn.preprocessing import MinMaxScaler
1217

1318
logger = logging.getLogger(__name__)
@@ -37,70 +42,171 @@ def train_generative_model(
3742
# Split data by country
3843
logger.info("Splitting data into train/test sets...")
3944
train_dfs: list[pd.DataFrame] = []
45+
val_dfs: list[pd.DataFrame] = []
4046
test_dfs: list[pd.DataFrame] = []
4147
for country in eu_df["country_code"].unique():
4248
country_data = eu_df.query("country_code == @country").sample(
4349
frac=1, random_state=seed
4450
)
4551
n_test = min(test_samples_per_country, len(country_data) // 5)
4652
test_dfs.append(country_data.iloc[:n_test])
47-
train_dfs.append(country_data.iloc[n_test:])
53+
val_dfs.append(country_data.iloc[n_test : 2 * n_test])
54+
train_dfs.append(country_data.iloc[2 * n_test :])
4855

4956
# Set up the data as NumPy arrays
5057
train_matrix = scaler.transform(pd.concat(train_dfs)[question_columns].values)
58+
val_matrix = scaler.transform(pd.concat(val_dfs)[question_columns].values)
5159
test_matrix = scaler.transform(pd.concat(test_dfs)[question_columns].values)
5260
logger.info(
53-
f"There are {len(train_matrix):,} training samples and {len(test_matrix):,} "
54-
"test samples."
61+
f"There are {len(train_matrix):,} training samples, "
62+
f"{len(val_matrix):,} validation samples, "
63+
f"and {len(test_matrix):,} test samples."
5564
)
5665

5766
# Initialise the model
5867
grid = GridSearchCV(
5968
estimator=KernelDensity(),
6069
param_grid=dict(
61-
bandwidth=[0.1, 0.2, 0.3, 0.4, 0.5, 1.0, "scott", "silverman"],
62-
leaf_size=[10, 20, 30, 40, 50],
70+
bandwidth=[0.1, 0.2, 0.3, 0.4, 0.5], leaf_size=[10, 20, 30, 40, 50]
6371
),
6472
n_jobs=-1,
6573
)
6674

6775
# Fit the model
6876
logger.info("Training the model on the training data...")
6977
grid.fit(train_matrix)
70-
model = grid.best_estimator_
78+
model: KernelDensity = grid.best_estimator_
7179
logger.info(f"Best model found with the parameters {grid.best_params_}.")
7280

81+
# Set the `transform` method of the model to the score_samples method, as this will
82+
# allow us to use the scaler, model and scorer in the same pipeline
83+
model.transform = model.score_samples.__get__(model)
84+
85+
# logger.info("Computing the log-likelihoods for the training data...")
86+
train_log_likelihoods = model.transform(train_matrix)
87+
88+
logger.info("Computing the log-likelihoods for the validation data...")
89+
val_log_likelihoods = model.transform(val_matrix)
90+
91+
logger.info("Computing the log-likelihoods for the test data...")
92+
test_log_likelihoods = model.transform(test_matrix)
93+
94+
# Fit the log-likelihood transform
95+
logger.info("Fitting the sigmoid transform on the validation data...")
96+
scorer = SigmoidTransformer().fit(val_log_likelihoods)
97+
7398
# Evaluate the model
74-
logger.info("Evaluating the model on the training and test data...")
75-
train_log_likelihoods = model.score_samples(train_matrix)
99+
logger.info("Evaluating the model on the training, validation and test data...")
76100
logger.info(
77101
f"Log-likelihoods for train:\n"
78102
f"\t- Mean: {train_log_likelihoods.mean():.4f}\n"
79103
f"\t- Std: {train_log_likelihoods.std():.4f}\n"
80104
f"\t- Min: {train_log_likelihoods.min():.4f}\n"
81105
f"\t- 10% quantile: {pd.Series(train_log_likelihoods).quantile(q=0.1):.4f}\n"
82106
f"\t- 90% quantile: {pd.Series(train_log_likelihoods).quantile(q=0.9):.4f}\n"
83-
f"\t- Max: {train_log_likelihoods.max():.4f}"
107+
f"\t- Max: {train_log_likelihoods.max():.4f}\n"
108+
f"Mean score for train: {scorer.transform(train_log_likelihoods).mean():.0%}"
109+
)
110+
logger.info(
111+
f"Log-likelihoods for validation:\n"
112+
f"\t- Mean: {val_log_likelihoods.mean():.4f}\n"
113+
f"\t- Std: {val_log_likelihoods.std():.4f}\n"
114+
f"\t- Min: {val_log_likelihoods.min():.4f}\n"
115+
f"\t- 10% quantile: {pd.Series(val_log_likelihoods).quantile(q=0.1):.4f}\n"
116+
f"\t- 90% quantile: {pd.Series(val_log_likelihoods).quantile(q=0.9):.4f}\n"
117+
f"\t- Max: {val_log_likelihoods.max():.4f}\n"
118+
f"Mean score for validation: {scorer.transform(val_log_likelihoods).mean():.0%}"
84119
)
85-
test_log_likelihoods = model.score_samples(test_matrix)
86120
logger.info(
87121
f"Log-likelihoods for test:\n"
88122
f"\t- Mean: {test_log_likelihoods.mean():.4f}\n"
89123
f"\t- Std: {test_log_likelihoods.std():.4f}\n"
90124
f"\t- Min: {test_log_likelihoods.min():.4f}\n"
91125
f"\t- 10% quantile: {pd.Series(test_log_likelihoods).quantile(q=0.1):.4f}\n"
92126
f"\t- 90% quantile: {pd.Series(test_log_likelihoods).quantile(q=0.9):.4f}\n"
93-
f"\t- Max: {test_log_likelihoods.max():.4f}"
127+
f"\t- Max: {test_log_likelihoods.max():.4f}\n"
128+
f"Mean score for test: {scorer.transform(test_log_likelihoods).mean():.0%}"
94129
)
95130

96131
# Train final model on all data
97132
logger.info("Training final model on entire EU dataset...")
98133
full_matrix = scaler.transform(eu_df[question_columns].values)
99134
model.fit(full_matrix)
100-
pipeline = Pipeline([("scaler", scaler), ("model", model)])
135+
pipeline = Pipeline([("scaler", scaler), ("model", model), ("scorer", scorer)])
101136

102137
# Save the complete pipeline
103138
model_path = Path("models", "model.pkl")
104139
model_path.parent.mkdir(exist_ok=True)
105140
joblib.dump(pipeline, model_path)
106141
logger.info(f"Pipeline saved to {model_path.resolve()}")
142+
143+
144+
class SigmoidTransformer:
    """Map log-likelihoods to scores in (0, 1) with a fitted sigmoid curve.

    After fitting, a log-likelihood ``x`` is scored as
    ``sigmoid(alpha_ * (x - center_))``.

    Attributes set by ``fit``:
        alpha_:
            The steepness of the sigmoid curve.
        center_:
            The log-likelihood that is mapped to a score of 0.5.
    """

    def fit(self, X: np.ndarray, y: object = None) -> "SigmoidTransformer":
        """Fit the steepness and center of the sigmoid to the data.

        Args:
            X:
                The input array of log-likelihoods.
            y:
                Ignored. Accepted so that the transformer can be fitted through the
                ``fit(X, y)`` calling convention.

        Returns:
            The fitted transformer.

        Raises:
            ValueError:
                If `X` is empty, contains non-finite values, or has identical 5% and
                95% quantiles, as the steepness is undefined in that case.
        """
        values = np.asarray(X, dtype=float).ravel()
        if values.size == 0:
            raise ValueError("Cannot fit the sigmoid transformer on an empty array.")
        if not np.all(np.isfinite(values)):
            raise ValueError(
                "Cannot fit the sigmoid transformer on non-finite log-likelihoods."
            )

        # We choose the alpha parameter such that the range between the 5% and 95%
        # quantiles is rescaled to a length of 10, as that gives a smooth sigmoid
        # curve that is not too flat
        lower, upper = np.quantile(values, q=[0.05, 0.95])
        spread = upper.item() - lower.item()
        if spread <= 0.0:
            raise ValueError(
                "Cannot fit the sigmoid transformer as the 5% and 95% quantiles of "
                "the log-likelihoods are identical."
            )
        self.alpha_ = 10 / spread

        # The loss in `_loss` is quadratic in the center, so its minimiser is known
        # in closed form: setting the derivative to zero gives
        # center = mean(X) - inverse_sigmoid(target) / alpha. This is exact and does
        # not depend on the convergence of a numerical optimiser.
        target = 0.99
        target_logit = float(inverse_sigmoid(target))
        self.center_ = values.mean().item() - target_logit / self.alpha_
        logger.info(
            f"Fitted sigmoid transformer with alpha={self.alpha_:.2f} and "
            f"center={self.center_:.2f}."
        )
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Transform the input data using the fitted sigmoid function.

        Args:
            X:
                The input array of log-likelihoods.

        Returns:
            The transformed values between 0 and 1, with the same shape as `X`.

        Raises:
            sklearn.exceptions.NotFittedError:
                If the transformer has not been fitted yet.
        """
        # Explicit attribute check, as this plain class does not implement the
        # scikit-learn estimator interface beyond `fit` and `transform`
        if not all(hasattr(self, name) for name in ("alpha_", "center_")):
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. Call 'fit' "
                "with appropriate arguments before using this estimator."
            )
        return sigmoid(self.alpha_ * (np.asarray(X, dtype=float) - self.center_))

    @staticmethod
    def _loss(
        center: np.ndarray, array: np.ndarray, target: float, alpha: float
    ) -> float:
        """Calculate the loss for the sigmoid transformation.

        The loss aims to get the sigmoid values of the array as close to a given target
        value as possible. It is measured before the sigmoid is applied, i.e., between
        ``alpha * (array - center)`` and the inverse sigmoid of the target.

        Args:
            center:
                The center of the sigmoid curve.
            array:
                The input array of log-likelihoods.
            target:
                The target value for the sigmoid transformation.
            alpha:
                The steepness of the sigmoid curve.

        Returns:
            The l2 loss between the transformed values and the target sigmoid values.
        """
        target_logit = inverse_sigmoid(target)
        errors = (alpha * (array - center) - target_logit) ** 2
        return np.mean(errors).item()

src/scripts/evaluate_llm_benchmark.py

Lines changed: 12 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,13 @@
77
import numpy as np
88
import pandas as pd
99
from omegaconf import DictConfig
10-
from sklearn.preprocessing import FunctionTransformer
1110

1211
from european_values.data_loading import load_evs_trend_data, load_evs_wvs_data
1312
from european_values.data_processing import process_data
1413

1514
logger = logging.getLogger("evaluate_llm")
1615

1716

18-
def sigmoid_transform(log_likelihoods, alpha=0.05, center=-50.0):
    """Apply sigmoid transformation to log-likelihood values.

    Computes ``1 / (1 + exp(-alpha * (log_likelihoods - center)))`` in a form that
    does not overflow for very negative log-likelihoods.

    Args:
        log_likelihoods: Array (or sequence) of log-likelihood values
        alpha: Scaling parameter for sigmoid steepness (default 0.05)
        center: Center point of the sigmoid (default -50.0)

    Returns:
        Transformed values between 0 and 1
    """
    scaled = alpha * (np.asarray(log_likelihoods, dtype=float) - center)
    # sigmoid(z) = exp(-log(1 + exp(-z))); logaddexp evaluates the inner logarithm
    # without ever forming the potentially overflowing exp(-z) on its own
    return np.exp(-np.logaddexp(0.0, -scaled))
30-
31-
3217
@hydra.main(config_path="../../config", config_name="config", version_base=None)
3318
def main(config: DictConfig) -> None:
3419
"""Main evaluation function."""
@@ -63,7 +48,10 @@ def main(config: DictConfig) -> None:
6348
if col.startswith("question_") and col not in question_subset
6449
]
6550
df.drop(columns=question_cols_to_remove, inplace=True)
66-
logger.info(f"Using {len(question_subset)} questions from subset")
51+
logger.info(
52+
f"Using {len(question_subset)} questions from the subset "
53+
f"{config.subset_csv!r}."
54+
)
6755

6856
# Process data without normalization (let pipeline handle it)
6957
logger.info("Processing the data WITHOUT normalization...")
@@ -76,25 +64,15 @@ def main(config: DictConfig) -> None:
7664
for country_group in df.country_group.unique():
7765
group_df = df.query("country_group == @country_group")
7866
responses = group_df[question_cols].values
79-
log_likelihoods = pipeline.score_samples(responses)
80-
81-
# Apply sigmoid transformation using FunctionTransformer
82-
# Ensures EU countries (around -31 mean) stay above 99%
83-
sigmoid_transformer = FunctionTransformer(
84-
func=sigmoid_transform,
85-
validate=False
86-
)
87-
normalised_scores = sigmoid_transformer.transform(log_likelihoods.reshape(-1, 1)).flatten()
88-
67+
scores = pipeline.transform(responses)
8968
logger.info(
90-
f"Log-likelihoods for {country_group}:\n"
91-
f"\t- Mean: {log_likelihoods.mean():.2f}\n"
92-
f"\t- Std: {log_likelihoods.std():.2f}\n"
93-
f"\t- Min: {log_likelihoods.min():.2f}\n"
94-
f"\t- 10% quantile: {np.quantile(log_likelihoods, q=0.1):.2f}\n"
95-
f"\t- 90% quantile: {np.quantile(log_likelihoods, q=0.9):.2f}\n"
96-
f"\t- Max: {log_likelihoods.max():.2f}\n"
97-
f"\t- Mean normalised score: {normalised_scores.mean():.2%} "
69+
f"Scores for {country_group}:\n"
70+
f"\t- Mean: {scores.mean():.0%}\n"
71+
f"\t- Std: {scores.std():.0%}\n"
72+
f"\t- Min: {scores.min():.0%}\n"
73+
f"\t- 10% quantile: {np.quantile(scores, q=0.1):.0%}\n"
74+
f"\t- 90% quantile: {np.quantile(scores, q=0.9):.0%}\n"
75+
f"\t- Max: {scores.max():.0%}\n"
9876
)
9977

10078

src/scripts/train_generative_model.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,10 @@ def main(config: DictConfig) -> None:
4747
if col.startswith("question_") and col not in question_subset
4848
]
4949
df.drop(columns=question_cols_to_remove, inplace=True)
50-
logger.info(f"Using {len(question_subset)} questions from subset")
50+
logger.info(
51+
f"Using {len(question_subset)} questions from the subset "
52+
f"{config.subset_csv!r}."
53+
)
5154

5255
# Process data but SKIP normalization (let pipeline handle it)
5356
logger.info("Processing the data WITHOUT normalization...")

0 commit comments

Comments
 (0)