-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathevaluate_llm_benchmark.py
More file actions
75 lines (67 loc) · 2.88 KB
/
Copy pathevaluate_llm_benchmark.py
File metadata and controls
75 lines (67 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Script to evaluate LLM benchmark."""
import logging
import hydra
import pandas as pd
from omegaconf import DictConfig
from european_values.data_loading import load_evs_trend_data, load_evs_wvs_data
from european_values.data_processing import process_data
from european_values.llm_evaluation import evaluate_survey_data
# Logger shared by this script; it is registered under the fixed name
# "evaluate_llm" rather than the module's ``__name__``.
logger = logging.getLogger("evaluate_llm")
@hydra.main(config_path="../../config", config_name="config", version_base=None)
def main(config: DictConfig) -> None:
    """Run the survey-data evaluation and print a summary of the results.

    The data source is chosen from the ``include_evs_trend`` and
    ``include_evs_wvs`` flags. The loaded frame is passed through
    ``process_data`` with ``normalize=False``, optionally restricted to the
    questions named in ``config.subset_csv``, and then handed to
    ``evaluate_survey_data`` together with ``config.evaluation.gmm_model_path``
    and ``config.evaluation.region``. The returned results are printed to
    stdout.

    Args:
        config: Configuration object supplied by the ``hydra.main`` decorator.

    Raises:
        ValueError: If the two ``include_*`` flags are not one of the
            combinations (True, True), (True, False) or (False, True).
    """
    use_trend = config.include_evs_trend
    use_wvs = config.include_evs_wvs

    # Identity checks against the bool singletons: any value that is not
    # literally True/False falls through to the error branch.
    if use_trend is True and use_wvs is True:
        logger.info("Loading EVS trend and EVS/WVS data...")
        trend_frame = load_evs_trend_data()
        wvs_frame = load_evs_wvs_data()
        survey_df = pd.concat([trend_frame, wvs_frame], ignore_index=True)
    elif use_trend is True and use_wvs is False:
        logger.info("Loading only EVS trend data...")
        survey_df = load_evs_trend_data()
    elif use_trend is False and use_wvs is True:
        logger.info("Loading only EVS/WVS data...")
        survey_df = load_evs_wvs_data()
    else:
        raise ValueError(
            "At least one of `include_evs_trend` or `include_evs_wvs` must be True."
        )

    # Normalisation is switched off at this stage; per the original note it is
    # left to the downstream pipeline. The second return value is not used.
    survey_df, _ = process_data(df=survey_df, config=config, normalize=False)

    # Optionally keep only the question columns listed in the subset CSV.
    subset_path = config.subset_csv
    if subset_path is not None:
        subset_df = pd.read_csv(subset_path)
        if "question" in subset_df.columns:
            question_subset = subset_df.question.unique().tolist()
        else:
            # Fallback: take the text before the first ":" of each index entry.
            # NOTE(review): read_csv is called without index_col, so the index
            # entries may not be strings here -- confirm the intended CSV
            # layout for this branch.
            question_subset = list(
                {entry.split(":")[0] for entry in subset_df.index.tolist()}
            )

        # Only columns prefixed with "question_" are candidates for removal;
        # every other column is kept regardless of the subset.
        dropped_columns = [
            name
            for name in survey_df.columns
            if name.startswith("question_") and name not in question_subset
        ]
        survey_df.drop(columns=dropped_columns, inplace=True)
        logger.info(f"Using {len(question_subset)} questions from subset")

    # Evaluation parameters come from the ``evaluation`` config group.
    evaluation_cfg = config.evaluation
    region = evaluation_cfg.region
    model_path = evaluation_cfg.gmm_model_path

    logger.info(f"Evaluating {region} data...")
    results = evaluate_survey_data(survey_df, model_path, region)

    # Report the results on stdout.
    rule = "=" * 50
    print(f"\n{rule}")
    print(f"EVALUATION RESULTS FOR {region}")
    print(rule)
    print(f"Samples: {results['n_samples']:,}")
    print(f"Questions: {results['n_questions']}")
    print(f"Average probability: {results['avg_probability']:.4f}")

    # Looked up only after the scalar lines above have been printed, so the
    # output order on a missing key matches the line-by-line lookups.
    probabilities = results["sample_probabilities"]
    print(
        f"Probability range: [{probabilities.min():.4f}, "
        f"{probabilities.max():.4f}]"
    )
    print(f"Probability mean: {probabilities.mean():.4f}")
    print(f"Probability std: {probabilities.std():.4f}")
# Invoke the Hydra-decorated entry point only when the file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()