11import pytest
22
3- from tests .model_explainability .lm_eval .utils import verify_lmevaljob_running
43from utilities .constants import Timeout
54
65LMEVALJOB_COMPLETE_STATE : str = "Complete"
76
87
# Each case pairs a dedicated namespace with the task configuration for one
# evaluation. ``indirect=True`` hands both values to the fixtures named
# ``model_namespace`` and ``lmevaljob_hf`` instead of passing them to the test.
# NOTE(review): ``lmevaljob_hf_pod`` presumably builds on ``lmevaljob_hf``;
# confirm in the fixture definitions.
@pytest.mark.parametrize(
    "model_namespace, lmevaljob_hf ",
    [
        pytest.param(
            {"name": "test-lmeval-hf-arc"}, {"task_list": {"taskNames": ["arc_challenge"]}}, id="arc_challenge"
        ),
        pytest.param(
            {"name": "test-lmeval-hf-mmlu"},
            {"task_list": {"taskNames": ["mmlu_astronomy_generative"]}},
            id="mmlu_astronomy_generative",
        ),
        pytest.param({"name": "test-lmeval-hf-hellaswag"}, {"task_list": {"taskNames": ["hellaswag"]}}, id="hellaswag"),
        pytest.param(
            {"name": "test-lmeval-hf-truthfulqa"}, {"task_list": {"taskNames": ["truthfulqa_gen"]}}, id="truthfulqa_gen"
        ),
        pytest.param(
            {"name": "test-lmeval-hf-winogrande"}, {"task_list": {"taskNames": ["winogrande"]}}, id="winogrande"
        ),
        pytest.param(
            {"name": "test-lmeval-hf-custom-task"},
            {
                "task_list": {
                    "custom": {
                        "systemPrompts": [
                            {"name": "sp_0", "value": "Be concise. At every point give the shortest acceptable answer."}
                        ],
                        "templates": [
                            {
                                "name": "tp_0",
                                # The value is a JSON document split across adjacent string
                                # literals; "\\n" yields the JSON escape \n between the two
                                # input lines.
                                "value": '{ "__type__": "input_output_template", '
                                '"input_format": "{text_a_type}: {text_a}\\n'
                                '{text_b_type}: {text_b}", '
                                '"output_format": "{label}", '
                                '"target_prefix": '
                                '"The {type_of_relation} class is ", '
                                '"instruction": "Given a {text_a_type} and {text_b_type} '
                                'classify the {type_of_relation} of the {text_b_type} to one of {classes}.",'
                                ' "postprocessors": [ "processors.take_first_non_empty_line",'
                                ' "processors.lower_case_till_punc" ] }',
                            }
                        ],
                    },
                    "taskRecipes": [
                        {"card": {"name": "cards.wnli"}, "systemPrompt": {"ref": "sp_0"}, "template": {"ref": "tp_0"}}
                    ],
                }
            },
            id="custom_task",
        ),
    ],
    indirect=True,
)
def test_lmeval_huggingface_model(admin_client, model_namespace, lmevaljob_hf_pod):
    """Verify that common evaluations (and a custom one) run on a model pulled directly from HuggingFace.

    Each parametrized case runs a different evaluation task. Per the original
    author, every run is limited to 1% of the questions of its eval; that limit
    is configured outside this test.

    The test passes when the LMEval job pod reaches the ``SUCCEEDED`` status
    within ``Timeout.TIMEOUT_20MIN``.
    """
    lmevaljob_hf_pod.wait_for_status(status=lmevaljob_hf_pod.Status.SUCCEEDED, timeout=Timeout.TIMEOUT_20MIN)
2164
2265
@@ -39,10 +82,12 @@ def test_lmeval_local_offline_builtin_tasks_flan_arceasy(
3982 admin_client ,
4083 model_namespace ,
4184 lmeval_data_downloader_pod ,
42- lmevaljob_local_offline ,
85+ lmevaljob_local_offline_pod ,
4386):
4487 """Test that verifies that LMEval can run successfully in local, offline mode using builtin tasks"""
45- verify_lmevaljob_running (client = admin_client , lmevaljob = lmevaljob_local_offline )
88+ lmevaljob_local_offline_pod .wait_for_status (
89+ status = lmevaljob_local_offline_pod .Status .SUCCEEDED , timeout = Timeout .TIMEOUT_20MIN
90+ )
4691
4792
4893@pytest .mark .parametrize (
@@ -72,10 +117,12 @@ def test_lmeval_local_offline_unitxt_tasks_flan_20newsgroups(
72117 admin_client ,
73118 model_namespace ,
74119 lmeval_data_downloader_pod ,
75- lmevaljob_local_offline ,
120+ lmevaljob_local_offline_pod ,
76121):
77122 """Test that verifies that LMEval can run successfully in local, offline mode using unitxt"""
78- verify_lmevaljob_running (client = admin_client , lmevaljob = lmevaljob_local_offline )
123+ lmevaljob_local_offline_pod .wait_for_status (
124+ status = lmevaljob_local_offline_pod .Status .SUCCEEDED , timeout = Timeout .TIMEOUT_20MIN
125+ )
79126
80127
81128@pytest .mark .parametrize (
0 commit comments