Skip to content
Merged
5 changes: 2 additions & 3 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ focus: null
seed: 4242

# Using a subset of the questions
subset_csv: null
top_num_questions_in_subset: null
subset_csv: data/processed/optimisation-davies-bouldin-penalty10/davies-bouldin-penalty10-eufocus-1000it.csv

plotting:
umap_neighbours: 100
Expand Down Expand Up @@ -48,4 +47,4 @@ generative_training:
test_samples_per_country: 100

evaluation:
model_path: models/model.pkl
model_path: models/pipeline.pkl
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ maintainers = [
requires-python = ">=3.11,<4.0"
dependencies = [
"beautifulsoup4>=4.13.4",
"cloudpickle>=3.1.1",
"datasets>=3.6.0",
"feature-engine>=1.8.3",
"hydra-core>=1.3.2",
Expand Down
2 changes: 1 addition & 1 deletion src/european_values/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def process_data(

# Always fit the scaler (so we can save it), but only apply if requested
logger.info("Fitting scaler...")
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
scaler.fit(embedding_matrix)

if normalize:
Expand Down
81 changes: 52 additions & 29 deletions src/european_values/generative_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import logging
from pathlib import Path

import joblib
import cloudpickle
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from . import sigmoid_transformer

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -37,70 +38,92 @@ def train_generative_model(
# Split data by country
logger.info("Splitting data into train/test sets...")
train_dfs: list[pd.DataFrame] = []
val_dfs: list[pd.DataFrame] = []
test_dfs: list[pd.DataFrame] = []
for country in eu_df["country_code"].unique():
country_data = eu_df.query("country_code == @country").sample(
frac=1, random_state=seed
)
n_test = min(test_samples_per_country, len(country_data) // 5)
test_dfs.append(country_data.iloc[:n_test])
train_dfs.append(country_data.iloc[n_test:])
val_dfs.append(country_data.iloc[n_test : 2 * n_test])
train_dfs.append(country_data.iloc[2 * n_test :])

# Set up the data as NumPy arrays
train_matrix = scaler.transform(pd.concat(train_dfs)[question_columns].values)
val_matrix = scaler.transform(pd.concat(val_dfs)[question_columns].values)
test_matrix = scaler.transform(pd.concat(test_dfs)[question_columns].values)
logger.info(
f"There are {len(train_matrix):,} training samples and {len(test_matrix):,} "
"test samples."
)

# Initialise the model
grid = GridSearchCV(
estimator=KernelDensity(),
param_grid=dict(
bandwidth=[0.1, 0.2, 0.3, 0.4, 0.5, 1.0, "scott", "silverman"],
leaf_size=[10, 20, 30, 40, 50],
),
n_jobs=-1,
f"There are {len(train_matrix):,} training samples, "
f"{len(val_matrix):,} validation samples, "
f"and {len(test_matrix):,} test samples."
)

# Fit the model
# Fit the model. We select a small bandwidth to ensure that the model fits the data
# well (lower bandwidth means more sensitivity to the data, i.e., higher variance)
logger.info("Training the model on the training data...")
grid.fit(train_matrix)
model = grid.best_estimator_
logger.info(f"Best model found with the parameters {grid.best_params_}.")
model = KernelDensity(bandwidth=0.1).fit(train_matrix)

# Set the `transform` method of the model to the score_samples method, as this will
# allow us to use the scaler, model and scorer in the same pipeline
model.transform = model.score_samples.__get__(model)

# logger.info("Computing the log-likelihoods for the training data...")
logger.info("Computing the log-likelihoods for the training data...")
train_log_likelihoods = model.transform(train_matrix)

# Evaluate the model
logger.info("Evaluating the model on the training and test data...")
train_log_likelihoods = model.score_samples(train_matrix)
logger.info("Computing the log-likelihoods for the validation data...")
val_log_likelihoods = model.transform(val_matrix)

logger.info("Computing the log-likelihoods for the test data...")
test_log_likelihoods = model.transform(test_matrix)

# Fit the log-likelihood transform
logger.info("Fitting the sigmoid transform on the validation data...")
scorer = sigmoid_transformer.SigmoidTransformer().fit(val_log_likelihoods)

logger.info("Evaluating the model on the training, validation and test data...")
logger.info(
f"Log-likelihoods for train:\n"
f"\t- Mean: {train_log_likelihoods.mean():.4f}\n"
f"\t- Std: {train_log_likelihoods.std():.4f}\n"
f"\t- Min: {train_log_likelihoods.min():.4f}\n"
f"\t- 10% quantile: {pd.Series(train_log_likelihoods).quantile(q=0.1):.4f}\n"
f"\t- 90% quantile: {pd.Series(train_log_likelihoods).quantile(q=0.9):.4f}\n"
f"\t- Max: {train_log_likelihoods.max():.4f}"
f"\t- Max: {train_log_likelihoods.max():.4f}\n"
f"Mean score for train: {scorer.transform(train_log_likelihoods).mean():.0%}"
)
logger.info(
f"Log-likelihoods for validation:\n"
f"\t- Mean: {val_log_likelihoods.mean():.4f}\n"
f"\t- Std: {val_log_likelihoods.std():.4f}\n"
f"\t- Min: {val_log_likelihoods.min():.4f}\n"
f"\t- 10% quantile: {pd.Series(val_log_likelihoods).quantile(q=0.1):.4f}\n"
f"\t- 90% quantile: {pd.Series(val_log_likelihoods).quantile(q=0.9):.4f}\n"
f"\t- Max: {val_log_likelihoods.max():.4f}\n"
f"Mean score for validation: {scorer.transform(val_log_likelihoods).mean():.0%}"
)
test_log_likelihoods = model.score_samples(test_matrix)
logger.info(
f"Log-likelihoods for test:\n"
f"\t- Mean: {test_log_likelihoods.mean():.4f}\n"
f"\t- Std: {test_log_likelihoods.std():.4f}\n"
f"\t- Min: {test_log_likelihoods.min():.4f}\n"
f"\t- 10% quantile: {pd.Series(test_log_likelihoods).quantile(q=0.1):.4f}\n"
f"\t- 90% quantile: {pd.Series(test_log_likelihoods).quantile(q=0.9):.4f}\n"
f"\t- Max: {test_log_likelihoods.max():.4f}"
f"\t- Max: {test_log_likelihoods.max():.4f}\n"
f"Mean score for test: {scorer.transform(test_log_likelihoods).mean():.0%}"
)

# Train final model on all data
logger.info("Training final model on entire EU dataset...")
full_matrix = scaler.transform(eu_df[question_columns].values)
model.fit(full_matrix)
pipeline = Pipeline([("scaler", scaler), ("model", model)])
pipeline = Pipeline([("scaler", scaler), ("model", model), ("scorer", scorer)])

# Save the complete pipeline
model_path = Path("models", "model.pkl")
model_path = Path("models", "pipeline.pkl")
model_path.parent.mkdir(exist_ok=True)
joblib.dump(pipeline, model_path)
logger.info(f"Pipeline saved to {model_path.resolve()}")
cloudpickle.register_pickle_by_value(module=sigmoid_transformer)
with model_path.open("wb") as f:
cloudpickle.dump(obj=pipeline, file=f)
logger.info(f"Pipeline saved to {model_path.as_posix()}")
89 changes: 89 additions & 0 deletions src/european_values/sigmoid_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Custom transformer used to transform log-likelihoods into scores."""

import logging
import typing as t
from functools import partial

import scipy.optimize as opt
from numpy import mean, quantile, zeros
from scipy.special import expit as sigmoid
from scipy.special import logit as inverse_sigmoid
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted

if t.TYPE_CHECKING:
from numpy import ndarray

logger = logging.getLogger(__name__)


class SigmoidTransformer(TransformerMixin):
    """Transformer mapping log-likelihoods to scores in (0, 1) via a sigmoid.

    Fitting sets two attributes:

    - ``alpha_``: the steepness of the sigmoid, derived from the spread between
      the 1% and 99% quantiles of the fitted log-likelihoods.
    - ``center_``: the location of the sigmoid, chosen so that the mean of the
      fitted log-likelihoods is mapped to the target score of 90%.

    NOTE(review): recent scikit-learn releases expect custom estimators to inherit
    from ``BaseEstimator`` as well as the mixin - confirm against the pinned
    version before upgrading.
    """

    # Score that the mean of the fitted log-likelihoods is mapped to.
    _TARGET_SCORE = 0.9

    def fit(self, X: "ndarray", y: object = None) -> "SigmoidTransformer":
        """Fit the transformer to the data.

        Args:
            X:
                The input array of log-likelihoods.
            y:
                Ignored. Accepted so that the transformer follows the scikit-learn
                ``fit(X, y)`` convention and can be fitted as a pipeline step.

        Returns:
            The fitted transformer.

        Raises:
            ValueError:
                If the 1%-99% quantile range of ``X`` is not strictly positive
                (e.g. constant input), as the steepness would be undefined.
        """
        # We choose the alpha parameter to fit the range of the log-likelihoods. An
        # alpha of 0.1 has an effective range of 100, and scales inversely with the
        # range of the data: with alpha being 0.05 we get an effective range of 200,
        # and so on.
        lower, upper = quantile(X, q=[0.01, 0.99])
        spread = upper - lower
        # `not spread > 0` also rejects a NaN spread, not only zero/negative ones
        if not spread > 0:
            raise ValueError(
                "Cannot fit the sigmoid transformer: the 1%-99% quantile range of "
                f"the log-likelihoods must be positive, but got {spread}."
            )
        self.alpha_ = 0.1 / (spread / 100)

        # The loss in `_loss` is quadratic in the center, so its minimiser has the
        # closed form `mean(X) - logit(target) / alpha`. Using it directly avoids
        # depending on the convergence tolerance of a numerical optimiser.
        target_logit = inverse_sigmoid(self._TARGET_SCORE)
        self.center_ = float(mean(X) - target_logit / self.alpha_)
        logger.info(
            f"Fitted sigmoid transformer with alpha={self.alpha_:.2f} and "
            f"center={self.center_:.2f}."
        )
        return self

    def transform(self, X: "ndarray") -> "ndarray":
        """Transform the input data using the fitted sigmoid function.

        Args:
            X:
                The input array of log-likelihoods.

        Returns:
            The transformed values between 0 and 1.
        """
        check_is_fitted(estimator=self, attributes=["alpha_", "center_"])
        return sigmoid(self.alpha_ * (X - self.center_))

    @staticmethod
    def _loss(
        center: "ndarray", array: "ndarray", target: float, alpha: float
    ) -> float:
        """Calculate the loss for the sigmoid transformation.

        The loss aims to get the sigmoid values of the array as close to a given target
        value as possible. It is measured in logit space, i.e., before the sigmoid is
        applied, which makes it quadratic in the center.

        Args:
            center:
                The center of the sigmoid curve.
            array:
                The input array of log-likelihoods.
            target:
                The target value for the sigmoid transformation.
            alpha:
                The steepness of the sigmoid curve.

        Returns:
            The l2 loss between the transformed values and the target sigmoid values.
        """
        target = inverse_sigmoid(target)
        errors = (alpha * (array - center) - target) ** 2
        l2_loss = mean(errors).item()
        return l2_loss
33 changes: 33 additions & 0 deletions src/european_values/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,36 @@ def df_has_column_with_only_nans(df: pd.DataFrame) -> bool:
True if there is at least one column with only NaN values, False otherwise.
"""
return any(df[col].isna().all() for col in df.columns)


def apply_subset_filtering(
df: pd.DataFrame, subset_csv_path: str | None
) -> pd.DataFrame:
"""Apply subset filtering to a DataFrame based on a CSV file.

Args:
df:
The DataFrame to filter.
subset_csv_path:
Path to the CSV file containing the subset of questions. Can be None, in
which case no filtering is applied.

Returns:
The filtered DataFrame.
"""
if subset_csv_path is None:
return df

subset_df = pd.read_csv(subset_csv_path)
question_subset = (
subset_df.question.unique().tolist()
if "question" in subset_df.columns
else list({line.split(":")[0] for line in subset_df.index.tolist()})
)
df = df[
[col for col in df.columns if not col.startswith("question_")] + question_subset
]
logger.info(
f"Using {len(question_subset)} questions from the subset {subset_csv_path!r}."
)
return df
24 changes: 2 additions & 22 deletions src/scripts/create_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from european_values.data_loading import load_evs_trend_data, load_evs_wvs_data
from european_values.data_processing import process_data
from european_values.plotting import create_scatter
from european_values.utils import apply_subset_filtering

logger = logging.getLogger("create_plot")

Expand Down Expand Up @@ -42,28 +43,7 @@ def main(config: DictConfig) -> None:
"At least one of `include_evs_trend` or `include_evs_wvs` must be True."
)

# Only use a subset of questions if specified
if config.subset_csv is not None:
subset_df = pd.read_csv(config.subset_csv)
if "question" in subset_df.columns:
question_subset = subset_df.question.unique().tolist()
if config.top_num_questions_in_subset is not None:
question_subset = question_subset[: config.top_num_questions_in_subset]
else:
question_subset = list(
{line.split(":")[0] for line in subset_df.index.tolist()}
)
question_columns_to_remove = [
col
for col in df.columns
if col.startswith("question_") and col not in question_subset
]
df.drop(columns=question_columns_to_remove, inplace=True)
logger.info(
f"Removed {len(question_columns_to_remove):,} questions not in the "
f"specified subset CSV file {config.subset_csv}."
)
logger.info(f"Shape of the data after filtering: {df.shape}")
df = apply_subset_filtering(df=df, subset_csv_path=config.subset_csv)

logger.info("Processing the data...")
df, _ = process_data(df, config)
Expand Down
40 changes: 10 additions & 30 deletions src/scripts/evaluate_llm_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from european_values.data_loading import load_evs_trend_data, load_evs_wvs_data
from european_values.data_processing import process_data
from european_values.utils import apply_subset_filtering

logger = logging.getLogger("evaluate_llm")

Expand All @@ -34,21 +35,7 @@ def main(config: DictConfig) -> None:
"At least one of `include_evs_trend` or `include_evs_wvs` must be True."
)

# Apply subset filtering
if config.subset_csv is not None:
subset_df = pd.read_csv(config.subset_csv)
question_subset = (
subset_df.question.unique().tolist()
if "question" in subset_df.columns
else list({line.split(":")[0] for line in subset_df.index.tolist()})
)
question_cols_to_remove = [
col
for col in df.columns
if col.startswith("question_") and col not in question_subset
]
df.drop(columns=question_cols_to_remove, inplace=True)
logger.info(f"Using {len(question_subset)} questions from subset")
df = apply_subset_filtering(df=df, subset_csv_path=config.subset_csv)

# Process data without normalization (let pipeline handle it)
logger.info("Processing the data WITHOUT normalization...")
Expand All @@ -61,22 +48,15 @@ def main(config: DictConfig) -> None:
for country_group in df.country_group.unique():
group_df = df.query("country_group == @country_group")
responses = group_df[question_cols].values
log_likelihoods = pipeline.score_samples(responses)

# We normalise so that anything below -100 is 0% and anything above 0 is 100%,
# with a linear scale in between.
normalised_scores = (log_likelihoods + 100) / 100
normalised_scores = np.clip(normalised_scores, 0, 1)

scores = pipeline.transform(responses)
logger.info(
f"Log-likelihoods for {country_group}:\n"
f"\t- Mean: {log_likelihoods.mean():.2f}\n"
f"\t- Std: {log_likelihoods.std():.2f}\n"
f"\t- Min: {log_likelihoods.min():.2f}\n"
f"\t- 10% quantile: {np.quantile(log_likelihoods, q=0.1):.2f}\n"
f"\t- 90% quantile: {np.quantile(log_likelihoods, q=0.9):.2f}\n"
f"\t- Max: {log_likelihoods.max():.2f}\n"
f"\t- Mean normalised score: {normalised_scores.mean():.2%} "
f"Scores for {country_group}:\n"
f"\t- Mean: {scores.mean():.0%}\n"
f"\t- Std: {scores.std():.0%}\n"
f"\t- Min: {scores.min():.0%}\n"
f"\t- 10% quantile: {np.quantile(scores, q=0.1):.0%}\n"
f"\t- 90% quantile: {np.quantile(scores, q=0.9):.0%}\n"
f"\t- Max: {scores.max():.0%}\n"
)


Expand Down
Loading
Loading