feat: add new LMEval tests with chat template (#322)

adolfo-ab · web-flow · commit 5b3dd09906ce · 2025-06-10T18:54:20.000+02:00
* feat: improve lmeval tests

* fix: remove unused function
diff --git a/tests/model_explainability/lm_eval/conftest.py b/tests/model_explainability/lm_eval/conftest.py
@@ -25,42 +25,26 @@
 
 @pytest.fixture(scope="function")
 def lmevaljob_hf(
-    admin_client: DynamicClient, model_namespace: Namespace, patched_trustyai_operator_configmap_allow_online: ConfigMap
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    patched_trustyai_operator_configmap_allow_online: ConfigMap,
 ) -> Generator[LMEvalJob, None, None]:
     with LMEvalJob(
         client=admin_client,
-        name="test-job",
+        name=LMEVALJOB_NAME,
         namespace=model_namespace.name,
         model="hf",
-        model_args=[{"name": "pretrained", "value": "google/flan-t5-base"}],
-        task_list={
-            "custom": {
-                "systemPrompts": [
-                    {"name": "sp_0", "value": "Be concise. At every point give the shortest acceptable answer."}
-                ],
-                "templates": [
-                    {
-                        "name": "tp_0",
-                        "value": '{ "__type__": "input_output_template", '
-                        '"input_format": "{text_a_type}: {text_a}\\n'
-                        '{text_b_type}: {text_b}", '
-                        '"output_format": "{label}", '
-                        '"target_prefix": '
-                        '"The {type_of_relation} class is ", '
-                        '"instruction": "Given a {text_a_type} and {text_b_type} '
-                        'classify the {type_of_relation} of the {text_b_type} to one of {classes}.",'
-                        ' "postprocessors": [ "processors.take_first_non_empty_line",'
-                        ' "processors.lower_case_till_punc" ] }',
-                    }
-                ],
-            },
-            "taskRecipes": [
-                {"card": {"name": "cards.wnli"}, "systemPrompt": {"ref": "sp_0"}, "template": {"ref": "tp_0"}}
-            ],
-        },
+        model_args=[{"name": "pretrained", "value": "Qwen/Qwen2.5-0.5B"}],
+        task_list=request.param.get("task_list"),
         log_samples=True,
         allow_online=True,
         allow_code_execution=True,
+        system_instruction="Be concise. At every point give the shortest acceptable answer.",
+        chat_template={
+            "enabled": True,
+        },
+        limit="0.01",
     ) as job:
         yield job
 
@@ -80,6 +64,7 @@ def lmevaljob_local_offline(
         model="hf",
         model_args=[{"name": "pretrained", "value": "/opt/app-root/src/hf_home/flan"}],
         task_list=request.param.get("task_list"),
+        limit="0.01",
         log_samples=True,
         offline={"storage": {"pvcName": "lmeval-data"}},
         pod={
@@ -402,6 +387,13 @@ def lmevaljob_hf_pod(admin_client: DynamicClient, lmevaljob_hf: LMEvalJob) -> Ge
     yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_hf)
 
 
+@pytest.fixture(scope="function")
+def lmevaljob_local_offline_pod(
+    admin_client: DynamicClient, lmevaljob_local_offline: LMEvalJob
+) -> Generator[Pod, Any, Any]:
+    yield get_lmevaljob_pod(client=admin_client, lmevaljob=lmevaljob_local_offline)
+
+
 @pytest.fixture(scope="function")
 def lmevaljob_vllm_emulator_pod(
     admin_client: DynamicClient, lmevaljob_vllm_emulator: LMEvalJob
diff --git a/tests/model_explainability/lm_eval/test_lm_eval.py b/tests/model_explainability/lm_eval/test_lm_eval.py
@@ -1,22 +1,65 @@
 import pytest
 
-from tests.model_explainability.lm_eval.utils import verify_lmevaljob_running
 from utilities.constants import Timeout
 
 LMEVALJOB_COMPLETE_STATE: str = "Complete"
 
 
 @pytest.mark.parametrize(
-    "model_namespace",
+    "model_namespace, lmevaljob_hf",
     [
         pytest.param(
-            {"name": "test-lmeval-huggingface"},
-        )
+            {"name": "test-lmeval-hf-arc"}, {"task_list": {"taskNames": ["arc_challenge"]}}, id="arc_challenge"
+        ),
+        pytest.param(
+            {"name": "test-lmeval-hf-mmlu"},
+            {"task_list": {"taskNames": ["mmlu_astronomy_generative"]}},
+            id="mmlu_astronomy_generative",
+        ),
+        pytest.param({"name": "test-lmeval-hf-hellaswag"}, {"task_list": {"taskNames": ["hellaswag"]}}, id="hellaswag"),
+        pytest.param(
+            {"name": "test-lmeval-hf-truthfulqa"}, {"task_list": {"taskNames": ["truthfulqa_gen"]}}, id="truthfulqa_gen"
+        ),
+        pytest.param(
+            {"name": "test-lmeval-hf-winogrande"}, {"task_list": {"taskNames": ["winogrande"]}}, id="winogrande"
+        ),
+        pytest.param(
+            {"name": "test-lmeval-hf-custom-task"},
+            {
+                "task_list": {
+                    "custom": {
+                        "systemPrompts": [
+                            {"name": "sp_0", "value": "Be concise. At every point give the shortest acceptable answer."}
+                        ],
+                        "templates": [
+                            {
+                                "name": "tp_0",
+                                "value": '{ "__type__": "input_output_template", '
+                                '"input_format": "{text_a_type}: {text_a}\\n'
+                                '{text_b_type}: {text_b}", '
+                                '"output_format": "{label}", '
+                                '"target_prefix": '
+                                '"The {type_of_relation} class is ", '
+                                '"instruction": "Given a {text_a_type} and {text_b_type} '
+                                'classify the {type_of_relation} of the {text_b_type} to one of {classes}.",'
+                                ' "postprocessors": [ "processors.take_first_non_empty_line",'
+                                ' "processors.lower_case_till_punc" ] }',
+                            }
+                        ],
+                    },
+                    "taskRecipes": [
+                        {"card": {"name": "cards.wnli"}, "systemPrompt": {"ref": "sp_0"}, "template": {"ref": "tp_0"}}
+                    ],
+                }
+            },
+            id="custom_task",
+        ),
     ],
     indirect=True,
 )
 def test_lmeval_huggingface_model(admin_client, model_namespace, lmevaljob_hf_pod):
-    """Basic test that verifies that LMEval can run successfully pulling a model from HuggingFace."""
+    """Tests that verify running common evaluations (and a custom one) on a model pulled directly from HuggingFace.
+    On each test we run a different evaluation task, limiting it to 1% of the questions on each eval."""
     lmevaljob_hf_pod.wait_for_status(status=lmevaljob_hf_pod.Status.SUCCEEDED, timeout=Timeout.TIMEOUT_20MIN)
 
 
@@ -39,10 +82,12 @@ def test_lmeval_local_offline_builtin_tasks_flan_arceasy(
     admin_client,
     model_namespace,
     lmeval_data_downloader_pod,
-    lmevaljob_local_offline,
+    lmevaljob_local_offline_pod,
 ):
     """Test that verifies that LMEval can run successfully in local, offline mode using builtin tasks"""
-    verify_lmevaljob_running(client=admin_client, lmevaljob=lmevaljob_local_offline)
+    lmevaljob_local_offline_pod.wait_for_status(
+        status=lmevaljob_local_offline_pod.Status.SUCCEEDED, timeout=Timeout.TIMEOUT_20MIN
+    )
 
 
 @pytest.mark.parametrize(
@@ -72,10 +117,12 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups(
     admin_client,
     model_namespace,
     lmeval_data_downloader_pod,
-    lmevaljob_local_offline,
+    lmevaljob_local_offline_pod,
 ):
     """Test that verifies that LMEval can run successfully in local, offline mode using unitxt"""
-    verify_lmevaljob_running(client=admin_client, lmevaljob=lmevaljob_local_offline)
+    lmevaljob_local_offline_pod.wait_for_status(
+        status=lmevaljob_local_offline_pod.Status.SUCCEEDED, timeout=Timeout.TIMEOUT_20MIN
+    )
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
@@ -3,33 +3,12 @@
 from ocp_resources.pod import Pod
 
 from utilities.constants import Timeout
-from utilities.infra import check_pod_status_in_time
 from simple_logger.logger import get_logger
 
 
 LOGGER = get_logger(name=__name__)
 
 
-def verify_lmevaljob_running(client: DynamicClient, lmevaljob: LMEvalJob) -> None:
-    """
-    Verifies that an LMEvalJob Pod reaches Running state and maintains Running/Succeeded state.
-    Waits for Pod to enter Running state, then checks it stays Running or Succeeded for 2 minutes.
-
-    Args:
-        client: DynamicClient instance for interacting with Kubernetes
-        lmevaljob: LMEvalJob object representing the job to verify
-
-    Raises:
-        TimeoutError: If Pod doesn't reach Running state within 10 minutes
-        AssertionError: If Pod doesn't stay in one of the desired states for 2 minutes
-    """
-
-    lmevaljob_pod = Pod(client=client, name=lmevaljob.name, namespace=lmevaljob.namespace, wait_for_resource=True)
-    lmevaljob_pod.wait_for_status(status=lmevaljob_pod.Status.RUNNING, timeout=Timeout.TIMEOUT_20MIN)
-
-    check_pod_status_in_time(pod=lmevaljob_pod, status={lmevaljob_pod.Status.RUNNING, lmevaljob_pod.Status.SUCCEEDED})
-
-
 def get_lmevaljob_pod(client: DynamicClient, lmevaljob: LMEvalJob, timeout: int = Timeout.TIMEOUT_2MIN) -> Pod:
     """
     Gets the pod corresponding to a given LMEvalJob and waits for it to be ready.