Skip to content

Commit d660729

Browse files
feat: Add systematic testing of Tier 2 LMEval tasks
1 parent 2a29882 commit d660729

2 files changed

Lines changed: 30 additions & 9 deletions

File tree

tests/model_explainability/lm_eval/test_lm_eval.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,20 @@
99

1010
LMEVALJOB_COMPLETE_STATE: str = "Complete"

# Tier 1: tasks whose HF dataset has at least 10k downloads.
TIER1_LMEVAL_TASKS: List[str] = get_lmeval_tasks(min_downloads=10000)

# Tier 2: tasks between the 70th download percentile and 10k downloads, minus Tier 1.
# sorted() makes the value match the List[str] annotation (a raw set difference is a
# set, not a list) and keeps pytest.mark.parametrize IDs deterministic across runs,
# which pytest-xdist requires for collection to agree between workers.
TIER2_LMEVAL_TASKS: List[str] = sorted(set(get_lmeval_tasks(min_downloads=0.70, max_downloads=10000)) - set(TIER1_LMEVAL_TASKS))
1415

1516
@pytest.mark.parametrize(
1617
"model_namespace, lmevaljob_hf",
1718
[
1819
pytest.param(
19-
{"name": "test-lmeval-hf"},
20-
{"task_list": {"taskNames": LMEVAL_TASKS}},
20+
{"name": "test-lmeval-hf-tier1"},
21+
{"task_list": {"taskNames": TIER1_LMEVAL_TASKS}},
22+
),
23+
pytest.param(
24+
{"name": "test-lmeval-hf-tier2"},
25+
{"task_list": {"taskNames": TIER2_LMEVAL_TASKS}},
2126
),
2227
pytest.param(
2328
{"name": "test-lmeval-hf-custom-task"},

tests/model_explainability/lm_eval/utils.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List
1+
from typing import List, Union
22

33
from kubernetes.dynamic import DynamicClient
44
from ocp_resources.lm_eval_job import LMEvalJob
@@ -36,30 +36,46 @@ def get_lmevaljob_pod(client: DynamicClient, lmevaljob: LMEvalJob, timeout: int
3636
return lmeval_pod
3737

3838

39-
def get_lmeval_tasks(min_downloads: Union[int, float] = 10000, max_downloads: Union[int, float, None] = None) -> List[str]:
    """
    Gets the list of supported LM-Eval tasks that have above a certain number of minimum downloads on HuggingFace.

    Args:
        min_downloads: The minimum number of downloads (int), or a float in (0, 1]
            interpreted as the percentile of downloads to use as a minimum.
            Defaults to 10000, preserving the previous call signature.
        max_downloads: The maximum number of downloads (int), or a float in (0, 1]
            interpreted as the percentile of downloads to use as a maximum.
            When None, no upper bound is applied.

    Returns:
        List of LM-Eval task names

    Raises:
        ValueError: If min_downloads or max_downloads is outside its valid range.
    """
    if min_downloads <= 0:
        raise ValueError("Minimum downloads must be greater than 0")

    lmeval_tasks = pd.read_csv(filepath_or_buffer="tests/model_explainability/lm_eval/data/new_task_list.csv")

    # A float threshold is a percentile: convert it to an absolute download count.
    if isinstance(min_downloads, float):
        if not 0 <= min_downloads <= 1:
            raise ValueError("Minimum downloads as a percentile must be between 0 and 1")
        min_downloads = lmeval_tasks["HF dataset downloads"].quantile(q=min_downloads)

    # filter for tasks that either exceed min_downloads OR exist on the OpenLLM leaderboard
    # AND exist on LMEval AND do not include image data
    filtered_df = lmeval_tasks[
        lmeval_tasks["Exists"]
        & (lmeval_tasks["Dataset"] != "MMMU/MMMU")
        & ((lmeval_tasks["HF dataset downloads"] >= min_downloads) | (lmeval_tasks["OpenLLM leaderboard"]))
    ]

    # if max_downloads is provided, filter for tasks that have less than
    # or equal to the maximum number of downloads
    if max_downloads is not None:
        # Fix: must be logical `or`. The original bitwise `|` binds tighter than the
        # comparisons, so the expression parsed as the chained comparison
        # `max_downloads <= (0 | max_downloads) > max(...)`, and `0 | float`
        # raises TypeError for percentile inputs.
        if max_downloads <= 0 or max_downloads > max(lmeval_tasks["HF dataset downloads"]):
            raise ValueError("Maximum downloads must be greater than 0 and less than the maximum number of downloads")
        if isinstance(max_downloads, float):
            # Fix: validate max_downloads here — the original re-checked
            # min_downloads (copy-paste), leaving max_downloads unvalidated.
            if not 0 <= max_downloads <= 1:
                raise ValueError("Maximum downloads as a percentile must be between 0 and 1")
            max_downloads = lmeval_tasks["HF dataset downloads"].quantile(q=max_downloads)
        filtered_df = filtered_df[filtered_df["HF dataset downloads"] <= max_downloads]

    # group tasks by dataset and extract the task with shortest name in the group
    unique_tasks = filtered_df.loc[filtered_df.groupby("Dataset")["Name"].apply(lambda x: x.str.len().idxmin())]

0 commit comments

Comments
 (0)