
Commit f51f960

feat: enable evaluating from json

Author: María Grandury
1 parent a22bd58

6 files changed: +171 -1 lines changed

README.md

Lines changed: 3 additions & 0 deletions

@@ -1 +1,4 @@
 # Backend de "La Leaderboard"
+
+- To evaluate the models in the requests dataset, run `python3 -m main_eval_queue`
+- To evaluate the combination of models and tasks in tasks_todo.json, run `python3 -m main_eval_json`
internal_queue/model_precision.json

Lines changed: 22 additions & 0 deletions

{
    "flax-community/gpt-2-spanish": "float32",
    "google/gemma-2-9b": "float32",
    "google/gemma-2-2b": "float32",
    "bertin-project/bertin-gpt-j-6B": "float32",
    "gplsi/Aitana-6.3B": "bfloat16",
    "projecte-aina/aguila-7b": "float16",
    "01-ai/Yi-1.5-9B": "bfloat16",
    "microsoft/phi-1_5": "float16",
    "occiglot/occiglot-7b-es-en": "float32",
    "tiiuae/falcon-7b": "bfloat16",
    "HiTZ/latxa-7b-v1.2": "bfloat16",
    "meta-llama/Meta-Llama-3.1-8B": "bfloat16",
    "mistralai/Mistral-7B-v0.3": "bfloat16",
    "projecte-aina/FLOR-6.3B": "float16",
    "proxectonos/Carballo-bloom-1.3B": "float16",
    "BSC-LT/salamandra-2b": "bfloat16",
    "BSC-LT/salamandra-7b": "bfloat16",
    "meta-llama/Llama-3.2-1B": "bfloat16",
    "meta-llama/Llama-3.2-3B": "bfloat16",
    "bertin-project/Gromenauer-7B": "float32"
}
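Since main_eval_json.py (below) indexes this mapping directly, a malformed precision string would only surface once the model is being loaded. A minimal pre-flight check, assuming the three dtype strings used in this file are the only valid ones (this helper is illustrative, not part of the commit):

```python
import json

# The only precision strings that appear in this commit's config.
ALLOWED_PRECISIONS = {"float32", "float16", "bfloat16"}

with open("internal_queue/model_precision.json") as f:
    model_precision = json.load(f)

for model, precision in model_precision.items():
    if precision not in ALLOWED_PRECISIONS:
        raise ValueError(f"{model}: unexpected precision {precision!r}")
```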

internal_queue/tasks_todo.json

Lines changed: 98 additions & 0 deletions

{
    "google/gemma-2-2b": [
        "escola",
        "catcola",
        "parafraseja",
        "paws_ca",
        "xnli_ca"
    ],
    "HiTZ/latxa-7b-v1.2": [],
    "proxectonos/Carballo-bloom-1.3B": [
        "mgsm_direct_es",
        "mgsm_direct_ca",
        "mgsm_direct_gl"
    ],
    "projecte-aina/aguila-7b": [
        "escola",
        "mgsm_direct_gl"
    ],
    "projecte-aina/FLOR-6.3B": [],
    "gplsi/Aitana-6.3B": [
        "escola",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "occiglot/occiglot-7b-es-en": [
        "eus_reading",
        "summarization_gl"
    ],
    "bertin-project/bertin-gpt-j-6B": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "mgsm_direct_ca",
        "paws_ca",
        "xnli_ca",
        "mgsm_direct_gl"
    ],
    "meta-llama/Meta-Llama-3.1-8B": [],
    "mistralai/Mistral-7B-v0.3": [],
    "01-ai/Yi-1.5-9B": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "microsoft/phi-1_5": [
        "escola",
        "catcola",
        "mgsm_direct_ca",
        "paws_ca",
        "xnli_ca",
        "mgsm_direct_gl"
    ],
    "tiiuae/falcon-7b": [
        "escola",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "bertin-project/Gromenauer-7B": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "BSC-LT/salamandra-2b": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "BSC-LT/salamandra-7b": [
        "escola",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "meta-llama/Llama-3.2-1B": [
        "escola",
        "wnli_es",
        "catcola",
        "paws_ca",
        "wnli_ca",
        "xnli_ca"
    ],
    "meta-llama/Llama-3.2-3B": [
        "escola",
        "wnli_es",
        "catcola",
        "paws_ca",
        "wnli_ca",
        "xnli_ca"
    ]
}
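Every key in this file must also exist in internal_queue/model_precision.json, because main_eval_json.py (next file) looks up model_precision[model] without a fallback, so a missing entry only fails with a KeyError after earlier models have already run. A quick consistency check (an illustrative sketch, not part of the commit):

```python
import json

with open("internal_queue/tasks_todo.json") as f:
    tasks_todo = json.load(f)
with open("internal_queue/model_precision.json") as f:
    model_precision = json.load(f)

# Fail fast instead of mid-run inside the evaluation loop.
missing = sorted(set(tasks_todo) - set(model_precision))
if missing:
    raise KeyError(f"models without a precision entry: {missing}")
```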

main_eval_json.py

Lines changed: 45 additions & 0 deletions

import json

from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite_harness import run_evaluation
from src.envs import (
    BATCH_SIZE,
    DEVICE,
    EVAL_RESULTS_PATH_BACKEND,
    LEADERBOARD_GROUP,
    LIMIT,
    LOGS_REPO,
    NUM_FEWSHOT,
    RESULTS_REPO,
)

if __name__ == "__main__":
    with open("internal_queue/tasks_todo.json", "r") as f:
        tasks_todo = json.load(f)
    with open("internal_queue/model_precision.json", "r") as f:
        model_precision = json.load(f)

    for model in tasks_todo:
        MODEL = model
        TASKS_HARNESS = tasks_todo[model]
        PRECISION = model_precision[model]
        EVAL_REQUEST = EvalRequest(
            model=MODEL,
            precision=PRECISION,
            base_model="",  # TODO: Review arg
            status="",  # TODO: Review arg
            json_filepath="",  # TODO: Review arg
        )

        run_evaluation(
            eval_request=EVAL_REQUEST,
            task_names=TASKS_HARNESS,
            leaderboard_group=LEADERBOARD_GROUP,
            num_fewshot=NUM_FEWSHOT,
            batch_size=BATCH_SIZE,
            device=DEVICE,
            local_dir=EVAL_RESULTS_PATH_BACKEND,
            results_repo=RESULTS_REPO,
            logs_repo=LOGS_REPO,
            limit=LIMIT,
        )
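One thing worth noting: several models in tasks_todo.json map to an empty list (for example "HiTZ/latxa-7b-v1.2": []), and the loop above still builds an EvalRequest and calls run_evaluation for them. If run_evaluation does not already short-circuit on an empty task list, filtering those models out up front avoids loading them at all (an illustrative tweak, not part of the commit):

```python
import json

with open("internal_queue/tasks_todo.json", "r") as f:
    tasks_todo = json.load(f)

# Keep only models that still have pending tasks; the commit's loop
# iterates every key, including those with nothing left to evaluate.
pending = {model: tasks for model, tasks in tasks_todo.items() if tasks}
print(f"{len(pending)} of {len(tasks_todo)} models have pending tasks")
```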

main.py renamed to main_eval_queue.py

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@
 from src.backend.sort_queue import sort_models_by_priority
 from src.envs import (
     API,
+    BATCH_SIZE,
     DEVICE,
     EVAL_REQUESTS_PATH_BACKEND,
     EVAL_RESULTS_PATH_BACKEND,

@@ -97,7 +98,7 @@ def run_auto_eval():
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
         logs_repo=LOGS_REPO,
-        batch_size=1,
+        batch_size=BATCH_SIZE,
         device=DEVICE,
         leaderboard_group=LEADERBOARD_GROUP,
         limit=LIMIT,

src/envs.py

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@
 NUM_FEWSHOT = 5  # TODO: Remove to use each task's default number of few-shots
 LEADERBOARD_GROUP = None  # TODO: Update leaderboard group name
 PARALLELIZE = True
+BATCH_SIZE = 1

 # Cache setup
 CACHE_PATH = os.getenv("HF_HOME", ".")  # /data/.huggingface
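Replacing the hard-coded batch_size=1 in main.py with a single BATCH_SIZE constant gives both entry points one shared knob. If it ever needs to vary per machine without editing the file, the same os.getenv pattern envs.py already uses for CACHE_PATH would work (a hypothetical variant, not in this commit; the variable name EVAL_BATCH_SIZE is made up for illustration):

```python
import os

# Hypothetical: override the batch size per machine via an environment variable.
BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "1"))
```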
