-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathevaluate_llm_benchmark.py
More file actions
64 lines (53 loc) · 2.33 KB
/
Copy pathevaluate_llm_benchmark.py
File metadata and controls
64 lines (53 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Script to evaluate LLM benchmark."""
import logging
import hydra
import joblib
import numpy as np
import pandas as pd
from omegaconf import DictConfig
from european_values.data_loading import load_evs_trend_data, load_evs_wvs_data
from european_values.data_processing import process_data
from european_values.utils import apply_subset_filtering
# Module-level logger used by `main`; registered under the fixed name "evaluate_llm".
logger = logging.getLogger("evaluate_llm")
@hydra.main(config_path="../../config", config_name="config", version_base=None)
def main(config: DictConfig) -> None:
"""Main evaluation function."""
match (config.include_evs_trend, config.include_evs_wvs):
case (True, True):
logger.info("Loading EVS trend and EVS/WVS data...")
evs_trend_df = load_evs_trend_data()
evs_wvs_df = load_evs_wvs_data()
df = pd.concat([evs_trend_df, evs_wvs_df], ignore_index=True)
case (True, False):
logger.info("Loading only EVS trend data...")
df = load_evs_trend_data()
case (False, True):
logger.info("Loading only EVS/WVS data...")
df = load_evs_wvs_data()
case _:
raise ValueError(
"At least one of `include_evs_trend` or `include_evs_wvs` must be True."
)
df = apply_subset_filtering(df=df, subset_csv_path=config.subset_csv)
# Process data without normalization (let pipeline handle it)
logger.info("Processing the data WITHOUT normalization...")
df, _ = process_data(df=df, config=config, normalize=False)
# Run evaluation
logger.info("Running evaluation...")
pipeline = joblib.load(config.evaluation.model_path)
question_cols = [col for col in df.columns if col.startswith("question_")]
for country_group in df.country_group.unique():
group_df = df.query("country_group == @country_group")
responses = group_df[question_cols].values
scores = pipeline.transform(responses)
logger.info(
f"Scores for {country_group}:\n"
f"\t- Mean: {scores.mean():.0%}\n"
f"\t- Std: {scores.std():.0%}\n"
f"\t- Min: {scores.min():.0%}\n"
f"\t- 10% quantile: {np.quantile(scores, q=0.1):.0%}\n"
f"\t- 90% quantile: {np.quantile(scores, q=0.9):.0%}\n"
f"\t- Max: {scores.max():.0%}\n"
)
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()