Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b156496
Fix remaining line length issues in print statements
AnnikaSimonsen Aug 4, 2025
33700f1
Added print statements for probabilities without .max(axis=1), made v…
AnnikaSimonsen Aug 5, 2025
5261120
Update src/european_values/data_processing.py
AnnikaSimonsen Aug 6, 2025
437ef04
Update src/european_values/generative_training.py
AnnikaSimonsen Aug 6, 2025
a666c21
Update src/european_values/generative_training.py
AnnikaSimonsen Aug 6, 2025
58f0c6f
Update src/european_values/generative_training.py
AnnikaSimonsen Aug 6, 2025
205725b
Update src/scripts/evaluate_llm_benchmark.py
AnnikaSimonsen Aug 6, 2025
2def0f6
Update src/european_values/generative_training.py
AnnikaSimonsen Aug 6, 2025
96021dd
Update src/scripts/evaluate_llm_benchmark.py
AnnikaSimonsen Aug 6, 2025
3136d90
Update src/scripts/train_generative_model.py
AnnikaSimonsen Aug 6, 2025
803b806
Update src/scripts/evaluate_llm_benchmark.py
AnnikaSimonsen Aug 6, 2025
a9aba95
Address review feedback: flexible data loading, evaluation fixes, and…
AnnikaSimonsen Aug 6, 2025
0bab4dc
Update src/european_values/generative_training.py
AnnikaSimonsen Aug 7, 2025
6aae6a8
Update src/european_values/generative_training.py
AnnikaSimonsen Aug 7, 2025
429d343
Fix process_data tuple unpacking in classifier, survey, and plot scripts
AnnikaSimonsen Aug 7, 2025
ae00d74
Apply ruff formatting
AnnikaSimonsen Aug 7, 2025
243177f
Clean up llm_evaluation.py and add flexible data loading to evaluate_…
AnnikaSimonsen Aug 7, 2025
012ded9
Update data loading patterns and fix tuple unpacking
AnnikaSimonsen Aug 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ discriminative_training:
generative_training:
max_components: 30
samples_per_country_val_test: 100

evaluation:
region: EU
gmm_model_path: data/processed/gmm_model/gmm_n4_seed4242.pkl
Binary file modified data/processed/gmm_model/gmm_n4_seed4242.pkl
Binary file not shown.
26 changes: 18 additions & 8 deletions src/european_values/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,22 @@
logger = logging.getLogger(__name__)


def process_data(df: pd.DataFrame, config: DictConfig) -> pd.DataFrame:
def process_data(
df: pd.DataFrame, config: DictConfig, normalize: bool = True
) -> tuple[pd.DataFrame, MinMaxScaler]:
"""Process the survey data.

Args:
df:
The survey data.
config:
The Hydra config.
normalize (optional):
Whether to apply normalization. If False, scaler is fitted but not applied.
Defaults to True.

Returns:
The processed DataFrame.
The processed DataFrame and the fitted scaler.

Raises:
ValueError:
Expand Down Expand Up @@ -211,13 +216,18 @@ def process_data(df: pd.DataFrame, config: DictConfig) -> pd.DataFrame:
f"{np.isnan(embedding_matrix).sum():,} missing values in the embedding matrix."
)

# Normalise the data
logger.info("Normalising the data...")
embedding_matrix = MinMaxScaler(feature_range=(0, 1)).fit_transform(
X=embedding_matrix
)
# Always fit the scaler (so we can save it), but only apply if requested
logger.info("Fitting scaler...")
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(embedding_matrix) # Always fit

if normalize:
logger.info("Applying normalization...")
embedding_matrix = scaler.transform(embedding_matrix)
else:
logger.info("Skipping normalization (but scaler is fitted and available)...")

# Update the survey DataFrame with the processed values
df[question_columns] = embedding_matrix

return df
return df, scaler
88 changes: 67 additions & 21 deletions src/european_values/generative_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

logger = logging.getLogger(__name__)

Expand All @@ -19,7 +22,7 @@ def train_generative_model(
"""Train a Gaussian Mixture Model on EU survey data.

Args:
survey_df: Survey data DataFrame
survey_df: A non-normalised dataframe containing the survey responses.
max_components: Maximum number of components to try in model selection
samples_per_country_val_test: Number of samples per country for
validation/test sets
Expand Down Expand Up @@ -54,6 +57,7 @@ def train_generative_model(
val_dfs.append(country_data.iloc[n_test : n_test + n_val])
train_dfs.append(country_data.iloc[n_test + n_val :])

# Set up the data as NumPy arrays
train_matrix = pd.concat(train_dfs)[question_columns].values
val_matrix = pd.concat(val_dfs)[question_columns].values
test_matrix = pd.concat(test_dfs)[question_columns].values
Expand All @@ -68,39 +72,81 @@ def train_generative_model(
n_components_range = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30]
n_components_range = [n for n in n_components_range if n <= max_components]

# Create GMM instance once
gmm = GaussianMixture(random_state=seed)

bic_scores = {}
for n_comp in n_components_range:
gmm.n_components = n_comp
gmm.fit(train_matrix)
bic_scores[n_comp] = gmm.bic(val_matrix)
# Create pipeline with scaler + GMM
pipeline = make_pipeline(
MinMaxScaler(feature_range=(0, 1)),
GaussianMixture(n_components=n_comp, random_state=seed),
)

# Fit the entire pipeline on training data
pipeline.fit(train_matrix)
bic_scores[n_comp] = pipeline.named_steps["gaussianmixture"].bic(
pipeline.named_steps["minmaxscaler"].transform(val_matrix)
)
logger.info(f"n_components={n_comp}: BIC={bic_scores[n_comp]:.2f}")

best_n = min(bic_scores.keys(), key=lambda x: bic_scores[x])
logger.info(f"Best n_components: {best_n}")

# Evaluate on test set - reuse the same GMM instance
logger.info("Evaluating on test set...")
gmm.n_components = best_n
gmm.fit(train_matrix)
test_bic = gmm.bic(test_matrix)
logger.info(f"Test BIC: {test_bic:.2f}")
# Create final pipeline with best number of components
final_pipeline = make_pipeline(
MinMaxScaler(feature_range=(0, 1)),
GaussianMixture(n_components=best_n, random_state=seed),
)

# Train final model on all data - reuse the same GMM instance
# Train final model on all data
logger.info("Training final model on entire EU dataset...")
full_matrix = eu_df[question_columns].values
gmm.fit(full_matrix)
final_pipeline.fit(full_matrix)

# Evaluate on test set
logger.info("Evaluating on test set...")
test_bic = final_pipeline.named_steps["gaussianmixture"].bic(
final_pipeline.named_steps["minmaxscaler"].transform(test_matrix)
)
logger.info(f"Test BIC: {test_bic:.2f}")

gmm = final_pipeline.named_steps["gaussianmixture"]
scaler = final_pipeline.named_steps["minmaxscaler"]
scaled_full_matrix = scaler.transform(full_matrix)

logger.info(
f"Final model - BIC: {gmm.bic(full_matrix):.2f}, "
f"Final model - BIC: {gmm.bic(scaled_full_matrix):.2f}, "
f"Converged: {gmm.converged_}, Iterations: {gmm.n_iter_}"
)

# Save model
model_dir = Path("models")
model_dir.mkdir(exist_ok=True)
logger.info("Testing probabilities on training data...")
train_probabilities = gmm.predict_proba(scaled_full_matrix)
print(f"Training data - Component weights: {gmm.weights_}")
print(f"Training data - First 5 full probabilities:\n{train_probabilities[:5]}")
Comment thread
AnnikaSimonsen marked this conversation as resolved.
Comment thread
AnnikaSimonsen marked this conversation as resolved.

# Try weighted approach
weighted_probs = np.sum(
train_probabilities * np.expand_dims(gmm.weights_, axis=0), axis=1
)
max_probs = train_probabilities.max(axis=1)

logger.info(f"Training data max average probability: {max_probs.mean():.4f}")
logger.info(
f"Training data max probability range: [{max_probs.min():.4f}, "
f"{max_probs.max():.4f}]"
)
logger.info(f"Training data max probability std: {max_probs.std():.4f}")

logger.info(
f"Training data weighted average probability: {weighted_probs.mean():.4f}"
)
logger.info(
f"Training data weighted probability range: [{weighted_probs.min():.4f}, "
f"{weighted_probs.max():.4f}]"
)
logger.info(f"Training data weighted probability std: {weighted_probs.std():.4f}")

# Save the complete pipeline (as requested)
model_dir = Path("data/processed/gmm_model")
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / f"gmm_n{best_n}_seed{seed}.pkl"
joblib.dump(gmm, model_path)
logger.info(f"Model saved to {model_path.resolve()!r}")
joblib.dump(final_pipeline, model_path)
logger.info(f"Pipeline saved to {model_path.resolve()!r}")
73 changes: 73 additions & 0 deletions src/european_values/llm_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""LLM evaluation using trained GMM."""

from typing import Any, Dict, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline


def evaluate_with_gmm(
responses: np.ndarray, gmm_pipeline: Pipeline
) -> Tuple[float, np.ndarray]:
"""Evaluate responses using GMM pipeline."""
# Try using the pipeline directly first
try:
full_probabilities = gmm_pipeline.predict_proba(responses)
except AttributeError:
# Fallback: Pipeline doesn't have predict_proba, access GMM component
gmm_model = gmm_pipeline.named_steps["gaussianmixture"]
scaled_responses = gmm_pipeline.named_steps["minmaxscaler"].transform(responses)
full_probabilities = gmm_model.predict_proba(scaled_responses)

print("First 5 rows of full probabilities:")
print(full_probabilities[:5])
Comment thread
AnnikaSimonsen marked this conversation as resolved.
Comment thread
AnnikaSimonsen marked this conversation as resolved.

probabilities = full_probabilities.max(axis=1)
avg_probability = np.mean(probabilities)
return avg_probability, probabilities


def evaluate_survey_data(
survey_df: pd.DataFrame, gmm_model_path: str, region: str = "EU"
) -> Dict[str, Any]:
"""Evaluate survey data with GMM."""
# Load pipeline directly (no need for wrapper function)
gmm_pipeline = joblib.load(gmm_model_path)

# Try filtering by country_group first, then by country_code
data = survey_df[survey_df["country_group"] == region]
if len(data) == 0:
data = survey_df[survey_df["country_code"] == region]
if len(data) == 0:
available_groups = survey_df["country_group"].unique()
available_countries = survey_df["country_code"].unique()
raise ValueError(
f"No data found for region '{region}'. "
f"Available groups: {list(available_groups)}, "
f"Available countries: {list(available_countries)}"
)

question_cols = [col for col in data.columns if col.startswith("question_")]
responses = data[question_cols].values

# Since data should already be processed and imputed, we shouldn't need
# additional NaN handling. But keep minimal safety check:
if np.isnan(responses).any():
nan_count = np.isnan(responses).sum()
print(f"Warning: Found {nan_count} NaN values in processed data")
# Simple fallback: replace NaN with column means
col_means = np.nanmean(responses, axis=0)
nan_mask = np.isnan(responses)
responses = np.where(nan_mask, col_means, responses)

# Evaluate
avg_probability, sample_probabilities = evaluate_with_gmm(responses, gmm_pipeline)

return {
"avg_probability": avg_probability,
"sample_probabilities": sample_probabilities,
"n_samples": len(data),
"n_questions": len(question_cols),
}
2 changes: 1 addition & 1 deletion src/scripts/create_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def main(config: DictConfig) -> None:
)

logger.info("Processing the data...")
df = process_data(df=df, config=config)
df, scaler = process_data(df, config)
logger.info(f"Shape of the data after processing: {df.shape}")

# Only use a subset of questions if specified
Expand Down
75 changes: 75 additions & 0 deletions src/scripts/evaluate_llm_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Script to evaluate LLM benchmark."""

import logging

import hydra
import pandas as pd
from omegaconf import DictConfig

from european_values.data_loading import load_evs_trend_data, load_evs_wvs_data
from european_values.data_processing import process_data
from european_values.llm_evaluation import evaluate_survey_data

logger = logging.getLogger("evaluate_llm")


@hydra.main(config_path="../../config", config_name="config", version_base=None)
def main(config: DictConfig) -> None:
"""Main evaluation function."""
# Load data - now supports both datasets like other scripts
match (config.include_evs_trend, config.include_evs_wvs):
case (True, True):
logger.info("Loading EVS trend and EVS/WVS data...")
evs_trend_df = load_evs_trend_data()
evs_wvs_df = load_evs_wvs_data()
df = pd.concat([evs_trend_df, evs_wvs_df], ignore_index=True)
case (True, False):
logger.info("Loading only EVS trend data...")
df = load_evs_trend_data()
case (False, True):
logger.info("Loading only EVS/WVS data...")
df = load_evs_wvs_data()
case _:
raise ValueError(
"At least one of `include_evs_trend` or `include_evs_wvs` must be True."
)
# Process data but SKIP normalization (let pipeline handle it)
df, _ = process_data(df=df, config=config, normalize=False)
# Apply subset filtering
if config.subset_csv is not None:
subset_df = pd.read_csv(config.subset_csv)
question_subset = (
subset_df.question.unique().tolist()
if "question" in subset_df.columns
else list({line.split(":")[0] for line in subset_df.index.tolist()})
)
question_cols_to_remove = [
col
for col in df.columns
if col.startswith("question_") and col not in question_subset
]
df.drop(columns=question_cols_to_remove, inplace=True)
logger.info(f"Using {len(question_subset)} questions from subset")
# Set evaluation parameters
region = config.evaluation.region
model_path = config.evaluation.gmm_model_path
# Run evaluation
logger.info(f"Evaluating {region} data...")
results = evaluate_survey_data(df, model_path, region)
# Print results
print(f"\n{'=' * 50}")
print(f"EVALUATION RESULTS FOR {region}")
print(f"{'=' * 50}")
print(f"Samples: {results['n_samples']:,}")
print(f"Questions: {results['n_questions']}")
print(f"Average probability: {results['avg_probability']:.4f}")
print(
f"Probability range: [{results['sample_probabilities'].min():.4f}, "
f"{results['sample_probabilities'].max():.4f}]"
)
print(f"Probability mean: {results['sample_probabilities'].mean():.4f}")
print(f"Probability std: {results['sample_probabilities'].std():.4f}")


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion src/scripts/optimise_survey.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def main(config: DictConfig) -> None:
)

logger.info("Processing the data...")
df = process_data(df=df, config=config)
df, scaler = process_data(df, config)
logger.info(f"Shape of the data after processing: {df.shape}")

df = optimise_survey(survey_df=df, config=config)
Expand Down
2 changes: 1 addition & 1 deletion src/scripts/train_discriminative_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def main(config: DictConfig) -> None:
)

logger.info("Processing the data...")
df = process_data(df=df, config=config)
df, scaler = process_data(df, config)
logger.info(f"Shape of the data after processing: {df.shape}")

# Only use a subset of questions if specified
Expand Down
Loading
Loading