-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathOLDevaluate.py
More file actions
134 lines (111 loc) · 4.46 KB
/
OLDevaluate.py
File metadata and controls
134 lines (111 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import pandas as pd
import sys
sys.path.append("../lm-evaluation-harness")
from lm_eval import evaluator
from lm_eval.tasks import TaskManager
import logging
import json
# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _collect_task_metrics(task, task_results):
    """Flatten one task's result dict into ``{column_name: value}`` entries.

    Metric keys may be bare (``"acc"``) or carry a filter suffix after a comma
    (``"acc,none"``); the matching standard error is then stored under
    ``"acc_stderr"`` or ``"acc_stderr,none"`` respectively.

    Args:
        task: Task name, used as the column prefix.
        task_results: Mapping of metric key to value for that task.

    Returns:
        A dict with ``"<task>_<metric>_mean"`` for every metric and
        ``"<task>_<metric>_stderr"`` whenever a usable stderr is present.
    """
    row = {}
    for metric, value in task_results.items():
        base_name, comma, filter_name = metric.partition(",")
        # Skip bookkeeping fields and the stderr entries themselves; stderr
        # values are attached to their metric below instead.
        if (
            metric in ("samples", "alias")
            or metric.endswith("_stderr")
            or base_name.endswith("_stderr")
        ):
            continue

        # Rebuild the stderr key in the same shape as the metric key, so both
        # "acc" -> "acc_stderr" and "acc,none" -> "acc_stderr,none" resolve.
        stderr_value = task_results.get(f"{base_name}_stderr{comma}{filter_name}")
        if stderr_value == "N/A":
            stderr_value = None

        row[f"{task}_{metric}_mean"] = value
        if stderr_value is not None:
            row[f"{task}_{metric}_stderr"] = stderr_value
    return row


def evaluate_models(model_names, num_gpus=3, tasks=None):
    """Evaluate several models on the given tasks with the lm-eval harness.

    Args:
        model_names: Iterable of model identifiers; each one is passed to the
            harness as ``pretrained=<name>`` with the ``hf`` model type.
        num_gpus: Accepted for backward compatibility; this function does not
            use it (every run is started with ``device="cuda"``).
        tasks: Task names to run. Defaults to mmlu, hellaswag, xwinograd,
            winogrande, truthfulqa_mc1, arc_challenge and gsm8k.

    Returns:
        A ``pandas.DataFrame`` with one row per evaluated model (indexed by
        model name) and ``<task>_<metric>_mean`` / ``<task>_<metric>_stderr``
        columns. An empty ``model_names`` yields an empty DataFrame.
    """
    if tasks is None:
        tasks = ["mmlu", "hellaswag", "xwinograd", "winogrande",
                 "truthfulqa_mc1", "arc_challenge", "gsm8k"]

    results_data = {}
    for model_name in model_names:
        logger.info("Evaluating %s", model_name)

        # Run evaluation using lm-eval's simple_evaluate
        results = evaluator.simple_evaluate(
            model="hf",  # Use huggingface model type
            model_args=f"pretrained={model_name}",
            tasks=tasks,
            batch_size="auto",
            device="cuda",
            num_fewshot=None,
            limit=None,
            bootstrap_iters=100000,
        )

        # NOTE(review): simple_evaluate appears to return None on some
        # processes (main() in this file guards for it too) -- confirm.
        if results is None:
            logger.warning("No results returned for %s; skipping", model_name)
            continue

        model_metrics = {}
        for task, task_results in results["results"].items():
            # Skip subtask results. The original check looked at the task key
            # only. NOTE(review): the harness is understood to put the
            # indented "- " marker on the "alias" field instead -- confirm
            # against the installed lm-eval version.
            alias = str(task_results.get("alias", ""))
            if task.startswith(" -") or alias.lstrip(" ").startswith("- "):
                continue
            model_metrics.update(_collect_task_metrics(task, task_results))

        results_data[model_name] = model_metrics

    # One row per model, one column per collected metric.
    return pd.DataFrame.from_dict(results_data, orient="index")
# TODO: write the driver code below and add more models to evaluate.
# Reference: multi-GPU command-line invocation of lm-eval for the same tasks.
# accelerate launch --multi_gpu \
# --num_processes=3 \
# --dynamo_backend inductor \
# -m lm_eval --model hf \
# --model_args pretrained=EleutherAI/pythia-2.8b \
# --tasks mmlu,hellaswag,xwinograd,winogrande,truthfulqa_mc1,arc_challenge,gsm8k \
# --batch_size auto
def _json_default(obj):
    """Convert values ``json`` cannot encode natively into encodable ones.

    Sets become lists, objects exposing ``tolist()`` (e.g. numpy scalars and
    arrays) are converted through it, and anything else is stringified so a
    single odd value does not abort the whole dump.
    """
    if isinstance(obj, (set, frozenset)):
        return list(obj)
    to_list = getattr(obj, "tolist", None)
    if callable(to_list):
        return to_list()
    return str(obj)


def _numeric_metrics_row(results):
    """Flatten ``results["results"]`` into ``{"<task>_<metric key>": value}``.

    Only real-number values are kept (booleans excluded), so string fields
    such as ``"alias"`` or an ``"N/A"`` placeholder are dropped.
    """
    import numbers

    row = {}
    for task, task_results in results.get("results", {}).items():
        for metric, value in task_results.items():
            if isinstance(value, numbers.Real) and not isinstance(value, bool):
                row[f"{task}_{metric}"] = value
    return row


def main():
    """Evaluate each configured model and save the outcome.

    For every model the full result dict is written to
    ``results/<model name>.json``. After all models have run, the numeric
    metrics are collected into a table (one row per model), written to
    ``model_evaluation_results.csv`` and printed.
    """
    from pathlib import Path

    # Models to evaluate
    models = [
        "EleutherAI/pythia-2.8b",
        "EleutherAI/pythia-1.4b",
        # Add more models here
    ]
    tasks = ["mmlu", "hellaswag", "xwinograd", "winogrande",
             "truthfulqa_mc1", "arc_challenge", "gsm8k"]

    verbosity = "INFO"
    task_manager = TaskManager(verbosity, include_path=None)

    # Create the output directory up front so the first dump cannot fail on it.
    results_dir = Path("results")
    results_dir.mkdir(parents=True, exist_ok=True)

    summary_rows = {}
    for model in models:
        results = evaluator.simple_evaluate(
            model="hf",
            model_args=f"pretrained={model}",
            tasks=tasks,
            batch_size="auto",
            num_fewshot=None,
            max_batch_size=None,
            device=None,
            use_cache=None,
            limit=None,
            check_integrity=False,
            write_out=False,
            log_samples=False,
            evaluation_tracker=None,  # no tracker is configured in this script
            system_instruction=None,
            apply_chat_template=False,
            fewshot_as_multiturn=False,
            gen_kwargs=None,
            task_manager=task_manager,
            verbosity=verbosity,
            predict_only=False,
            random_seed=0,
            numpy_random_seed=1234,
            torch_random_seed=1234,
            fewshot_random_seed=1234,
        )

        if results is None:
            logger.warning("No results returned for %s; nothing saved", model)
            continue

        # Use the last path component so ids without an "org/" prefix work too.
        short_name = model.rstrip("/").split("/")[-1]
        out_path = results_dir / f"{short_name}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            # The fallback encoder is deliberate: the result dict may hold
            # values that json cannot encode natively.
            json.dump(results, f, indent=2, default=_json_default)

        summary_rows[model] = _numeric_metrics_row(results)
        print(f"{model} done")

    # Save results
    results_df = pd.DataFrame.from_dict(summary_rows, orient="index")
    results_df.to_csv("model_evaluation_results.csv")
    print("\nResults Summary:")
    print(results_df)


if __name__ == "__main__":
    main()