
Commit f51f960

feat: enable evaluating from json

Author: María Grandury
1 parent a22bd58

6 files changed: +171 -1 lines changed

README.md

Lines changed: 3 additions & 0 deletions

@@ -1 +1,4 @@
 # Backend de "La Leaderboard"
+
+- To evaluate the models in the requests dataset, run `python3 -m main_eval_queue`
+- To evaluate the combination of models and tasks in tasks_todo.json, run `python3 -m main_eval_json`
internal_queue/model_precision.json

Lines changed: 22 additions & 0 deletions

{
    "flax-community/gpt-2-spanish": "float32",
    "google/gemma-2-9b": "float32",
    "google/gemma-2-2b": "float32",
    "bertin-project/bertin-gpt-j-6B": "float32",
    "gplsi/Aitana-6.3B": "bfloat16",
    "projecte-aina/aguila-7b": "float16",
    "01-ai/Yi-1.5-9B": "bfloat16",
    "microsoft/phi-1_5": "float16",
    "occiglot/occiglot-7b-es-en": "float32",
    "tiiuae/falcon-7b": "bfloat16",
    "HiTZ/latxa-7b-v1.2": "bfloat16",
    "meta-llama/Meta-Llama-3.1-8B": "bfloat16",
    "mistralai/Mistral-7B-v0.3": "bfloat16",
    "projecte-aina/FLOR-6.3B": "float16",
    "proxectonos/Carballo-bloom-1.3B": "float16",
    "BSC-LT/salamandra-2b": "bfloat16",
    "BSC-LT/salamandra-7b": "bfloat16",
    "meta-llama/Llama-3.2-1B": "bfloat16",
    "meta-llama/Llama-3.2-3B": "bfloat16",
    "bertin-project/Gromenauer-7B": "float32"
}
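Since main_eval_json.py (below) indexes this mapping directly, a malformed precision string would only surface once the model is being loaded. A minimal pre-flight check, assuming the three dtype strings used in this file are the only valid ones (this helper is illustrative, not part of the commit):

```python
import json

# The only precision strings that appear in this commit's config.
ALLOWED_PRECISIONS = {"float32", "float16", "bfloat16"}

with open("internal_queue/model_precision.json") as f:
    model_precision = json.load(f)

for model, precision in model_precision.items():
    if precision not in ALLOWED_PRECISIONS:
        raise ValueError(f"{model}: unexpected precision {precision!r}")
```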

internal_queue/tasks_todo.json

Lines changed: 98 additions & 0 deletions

{
    "google/gemma-2-2b": [
        "escola",
        "catcola",
        "parafraseja",
        "paws_ca",
        "xnli_ca"
    ],
    "HiTZ/latxa-7b-v1.2": [],
    "proxectonos/Carballo-bloom-1.3B": [
        "mgsm_direct_es",
        "mgsm_direct_ca",
        "mgsm_direct_gl"
    ],
    "projecte-aina/aguila-7b": [
        "escola",
        "mgsm_direct_gl"
    ],
    "projecte-aina/FLOR-6.3B": [],
    "gplsi/Aitana-6.3B": [
        "escola",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "occiglot/occiglot-7b-es-en": [
        "eus_reading",
        "summarization_gl"
    ],
    "bertin-project/bertin-gpt-j-6B": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "mgsm_direct_ca",
        "paws_ca",
        "xnli_ca",
        "mgsm_direct_gl"
    ],
    "meta-llama/Meta-Llama-3.1-8B": [],
    "mistralai/Mistral-7B-v0.3": [],
    "01-ai/Yi-1.5-9B": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "microsoft/phi-1_5": [
        "escola",
        "catcola",
        "mgsm_direct_ca",
        "paws_ca",
        "xnli_ca",
        "mgsm_direct_gl"
    ],
    "tiiuae/falcon-7b": [
        "escola",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "bertin-project/Gromenauer-7B": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "BSC-LT/salamandra-2b": [
        "escola",
        "mgsm_direct_es",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "BSC-LT/salamandra-7b": [
        "escola",
        "catcola",
        "paws_ca",
        "xnli_ca"
    ],
    "meta-llama/Llama-3.2-1B": [
        "escola",
        "wnli_es",
        "catcola",
        "paws_ca",
        "wnli_ca",
        "xnli_ca"
    ],
    "meta-llama/Llama-3.2-3B": [
        "escola",
        "wnli_es",
        "catcola",
        "paws_ca",
        "wnli_ca",
        "xnli_ca"
    ]
}
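Every key in this file must also exist in internal_queue/model_precision.json, because main_eval_json.py (next file) looks up model_precision[model] without a fallback, so a missing entry only fails with a KeyError after earlier models have already run. A quick consistency check (an illustrative sketch, not part of the commit):

```python
import json

with open("internal_queue/tasks_todo.json") as f:
    tasks_todo = json.load(f)
with open("internal_queue/model_precision.json") as f:
    model_precision = json.load(f)

# Fail fast instead of mid-run inside the evaluation loop.
missing = sorted(set(tasks_todo) - set(model_precision))
if missing:
    raise KeyError(f"models without a precision entry: {missing}")
```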

main_eval_json.py

Lines changed: 45 additions & 0 deletions

import json

from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite_harness import run_evaluation
from src.envs import (
    BATCH_SIZE,
    DEVICE,
    EVAL_RESULTS_PATH_BACKEND,
    LEADERBOARD_GROUP,
    LIMIT,
    LOGS_REPO,
    NUM_FEWSHOT,
    RESULTS_REPO,
)

if __name__ == "__main__":
    with open("internal_queue/tasks_todo.json", "r") as f:
        tasks_todo = json.load(f)
    with open("internal_queue/model_precision.json", "r") as f:
        model_precision = json.load(f)

    for model in tasks_todo:
        MODEL = model
        TASKS_HARNESS = tasks_todo[model]
        PRECISION = model_precision[model]
        EVAL_REQUEST = EvalRequest(
            model=MODEL,
            precision=PRECISION,
            base_model="",  # TODO: Review arg
            status="",  # TODO: Review arg
            json_filepath="",  # TODO: Review arg
        )

        run_evaluation(
            eval_request=EVAL_REQUEST,
            task_names=TASKS_HARNESS,
            leaderboard_group=LEADERBOARD_GROUP,
            num_fewshot=NUM_FEWSHOT,
            batch_size=BATCH_SIZE,
            device=DEVICE,
            local_dir=EVAL_RESULTS_PATH_BACKEND,
            results_repo=RESULTS_REPO,
            logs_repo=LOGS_REPO,
            limit=LIMIT,
        )
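One thing worth noting: several models in tasks_todo.json map to an empty list (for example "HiTZ/latxa-7b-v1.2": []), and the loop above still builds an EvalRequest and calls run_evaluation for them. If run_evaluation does not already short-circuit on an empty task list, filtering those models out up front avoids loading them at all (an illustrative tweak, not part of the commit):

```python
import json

with open("internal_queue/tasks_todo.json", "r") as f:
    tasks_todo = json.load(f)

# Keep only models that still have pending tasks; the commit's loop
# iterates every key, including those with nothing left to evaluate.
pending = {model: tasks for model, tasks in tasks_todo.items() if tasks}
print(f"{len(pending)} of {len(tasks_todo)} models have pending tasks")
```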

main.py renamed to main_eval_queue.py

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@
 from src.backend.sort_queue import sort_models_by_priority
 from src.envs import (
     API,
+    BATCH_SIZE,
     DEVICE,
     EVAL_REQUESTS_PATH_BACKEND,
     EVAL_RESULTS_PATH_BACKEND,

@@ -97,7 +98,7 @@ def run_auto_eval():
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
         logs_repo=LOGS_REPO,
-        batch_size=1,
+        batch_size=BATCH_SIZE,
         device=DEVICE,
         leaderboard_group=LEADERBOARD_GROUP,
         limit=LIMIT,

src/envs.py

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@
 NUM_FEWSHOT = 5  # TODO: Remove to use each task's default number of few-shots
 LEADERBOARD_GROUP = None  # TODO: Update leaderboard group name
 PARALLELIZE = True
+BATCH_SIZE = 1

 # Cache setup
 CACHE_PATH = os.getenv("HF_HOME", ".")  # /data/.huggingface
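Replacing the hard-coded batch_size=1 in main.py with a single BATCH_SIZE constant gives both entry points one shared knob. If it ever needs to vary per machine without editing the file, the same os.getenv pattern envs.py already uses for CACHE_PATH would work (a hypothetical variant, not in this commit; the variable name EVAL_BATCH_SIZE is made up for illustration):

```python
import os

# Hypothetical: override the batch size per machine via an environment variable.
BATCH_SIZE = int(os.getenv("EVAL_BATCH_SIZE", "1"))
```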
