feat: Add systematic testing of Tier 2 LMEval tasks

christinaexyou · christinaexyou · commit 0a5cc63eae5c · 2025-08-04T10:40:34.000-04:00
diff --git a/tests/model_explainability/lm_eval/test_lm_eval.py b/tests/model_explainability/lm_eval/test_lm_eval.py
@@ -9,15 +9,20 @@
 
 LMEVALJOB_COMPLETE_STATE: str = "Complete"
 
-LMEVAL_TASKS: List[str] = get_lmeval_tasks(min_downloads=10000)
+TIER1_LMEVAL_TASKS: List[str] = get_lmeval_tasks(min_downloads=10000)
 
+TIER2_LMEVAL_TASKS: List[str] = list(set(get_lmeval_tasks(min_downloads=.70, max_downloads=10000)) - set(TIER1_LMEVAL_TASKS))
 
 @pytest.mark.parametrize(
     "model_namespace, lmevaljob_hf",
     [
         pytest.param(
-            {"name": "test-lmeval-hf"},
-            {"task_list": {"taskNames": LMEVAL_TASKS}},
+            {"name": "test-lmeval-hf-tier1"},
+            {"task_list": {"taskNames": TIER1_LMEVAL_TASKS}},
+        ),
+        pytest.param(
+            {"name": "test-lmeval-hf-tier2"},
+            {"task_list": {"taskNames": TIER2_LMEVAL_TASKS}},
         ),
         pytest.param(
             {"name": "test-lmeval-hf-custom-task"},
diff --git a/tests/model_explainability/lm_eval/utils.py b/tests/model_explainability/lm_eval/utils.py
@@ -36,30 +36,46 @@ def get_lmevaljob_pod(client: DynamicClient, lmevaljob: LMEvalJob, timeout: int
     return lmeval_pod
 
 
-def get_lmeval_tasks(min_downloads: int = 10000) -> List[str]:
+def get_lmeval_tasks(min_downloads: int|float, max_downloads: int|float|None = None) -> List[str]:
     """
     Gets the list of supported LM-Eval tasks that have above a certain number of minimum downloads on HuggingFace.
 
     Args:
-        min_downloads: The minimum number of downloads
+        min_downloads: The minimum number of downloads or the percentile of downloads to use as a minimum
+        max_downloads: The maximum number of downloads or the percentile of downloads to use as a maximum
 
     Returns:
         List of LM-Eval task names
     """
-    if min_downloads < 1:
+    if min_downloads <= 0:
         raise ValueError("Minimum downloads must be greater than 0")
 
     lmeval_tasks = pd.read_csv(filepath_or_buffer="tests/model_explainability/lm_eval/data/new_task_list.csv")
 
-    # filter for tasks that either exceed (min_downloads OR exist on the OpenLLM leaderboard)
-    # AND exist on LMEval AND do not include image data
+    if isinstance(min_downloads, float):
+        if not 0 <= min_downloads <= 1:
+            raise ValueError("Minimum downloads as a percentile must be between 0 and 1")
+        min_downloads = lmeval_tasks["HF dataset downloads"].quantile(q=min_downloads)
 
+    # filter for tasks that either exceed min_downloads OR exist on the OpenLLM leaderboard
+    # AND exist on LMEval AND do not include image data
     filtered_df = lmeval_tasks[
         lmeval_tasks["Exists"]
         & (lmeval_tasks["Dataset"] != "MMMU/MMMU")
         & ((lmeval_tasks["HF dataset downloads"] >= min_downloads) | (lmeval_tasks["OpenLLM leaderboard"]))
     ]
 
+    # if max_downloads is provided, filter for tasks that have less than
+    # or equal to the maximum number of downloads
+    if max_downloads is not None:
+        if (max_downloads <= 0 or max_downloads > max(lmeval_tasks["HF dataset downloads"])):
+            raise ValueError("Maximum downloads must be greater than 0 and less than the maximum number of downloads")
+        if isinstance(max_downloads, float):
+            if not 0 <= min_downloads <= 1:
+                raise ValueError("Maximum downloads as a percentile must be between 0 and 1")
+            max_downloads = lmeval_tasks["HF dataset downloads"].quantile(q=max_downloads)
+        filtered_df = filtered_df[filtered_df["HF dataset downloads"] <= max_downloads]
+
     # group tasks by dataset and extract the task with shortest name in the group
     unique_tasks = filtered_df.loc[filtered_df.groupby("Dataset")["Name"].apply(lambda x: x.str.len().idxmin())]