use public prompt sets instead of prompt sets that require auth (#865)

rogthefrog · web-flow · commit 18e16c1aeb8c · 2025-03-11T18:25:38.000-04:00
* use public prompt sets instead of prompt sets that require auth

* disable a noisy test until we fix it properly

* find the public demo prompt set that corresponds to a non-public prompt set

* a still less janky way to turn a token-protected prompt set file into a publicly accessible one

* download public files instead of private or protected

* fixed test to reflect new function behavior

* appease mypy

* remove test removed elsewhere; remove unneeded tmp_path fixture

* remove unsupported model

* clear cache and lock file to try to prevent errors like '  Hash for pydantic-core (2.27.2) from archive pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl not found in known hashes (was: sha256:ed4964723e97cdf8c70abebd1495001f511491b8eeac817b033db1af28a86bb5)'

* refresh lock file with the right version of Poetry

* restore gemini 1.0 pro, which is really supposed to be supported.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -73,7 +73,10 @@ jobs:
       run: pipx install "poetry == 1.8.5"
 
     - name: Install dependencies
-      run: poetry lock && poetry install --no-interaction --with dev --extras all_plugins
+      run: |
+        poetry cache clear --all .
+        rm -f poetry.lock
+        poetry install --no-interaction --with dev --extras all_plugins
 
     - name: Lint formatting
       run: poetry run black --check .
diff --git a/plugins/google/modelgauge/suts/google_genai_client.py b/plugins/google/modelgauge/suts/google_genai_client.py
@@ -2,18 +2,23 @@
 from typing import Dict, List, Optional
 
 import google.generativeai as genai  # type: ignore
-from google.api_core.exceptions import InternalServerError, ResourceExhausted, RetryError, TooManyRequests
-from google.generativeai.types import HarmCategory, HarmBlockThreshold  # type: ignore
-from pydantic import BaseModel
+from google.api_core.exceptions import (
+    InternalServerError,
+    ResourceExhausted,
+    RetryError,
+    TooManyRequests,
+)
+from google.generativeai.types import HarmBlockThreshold, HarmCategory  # type: ignore
 
 from modelgauge.general import APIException
 from modelgauge.prompt import TextPrompt
 from modelgauge.retry_decorator import retry
 from modelgauge.secret_values import InjectSecret, RequiredSecret, SecretDescription
-from modelgauge.sut import REFUSAL_RESPONSE, PromptResponseSUT, SUTResponse
+from modelgauge.sut import REFUSAL_RESPONSE, PromptResponseSUT, SUTResponse  # usort: skip
 from modelgauge.sut_capabilities import AcceptsTextPrompt
 from modelgauge.sut_decorator import modelgauge_sut
 from modelgauge.sut_registry import SUTS
+from pydantic import BaseModel
 
 FinishReason = genai.protos.Candidate.FinishReason
 GEMINI_HARM_CATEGORIES = [
@@ -191,7 +196,7 @@ def safety_settings(self) -> Optional[Dict[HarmCategory, HarmBlockThreshold]]:
         return {harm: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE for harm in GEMINI_HARM_CATEGORIES}
 
 
-gemini_models = ["gemini-1.5-flash", "gemini-1.0-pro", "gemini-1.5-pro"]
+gemini_models = ["gemini-1.0-pro", "gemini-1.5-flash", "gemini-1.5-pro"]
 for model in gemini_models:
     SUTS.register(GoogleGenAiDefaultSUT, model, model, InjectSecret(GoogleAiApiKey))
     SUTS.register(
diff --git a/plugins/validation_tests/test_object_creation.py b/plugins/validation_tests/test_object_creation.py
@@ -1,14 +1,17 @@
 import os
+import re
 
 import pytest
 from flaky import flaky  # type: ignore
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.caching import SqlDictCache
 from modelgauge.config import load_secrets_from_config
 from modelgauge.dependency_helper import FromSourceDependencyHelper
+from modelgauge.external_data import WebData
 from modelgauge.load_plugins import load_plugins
 from modelgauge.locales import EN_US  # see "workaround" below
 from modelgauge.prompt import SUTOptions, TextPrompt
+from modelgauge.prompt_sets import demo_prompt_set_url
 from modelgauge.record_init import InitializationRecord
 from modelgauge.sut import PromptResponseSUT, SUTResponse
 from modelgauge.sut_capabilities import AcceptsTextPrompt
@@ -22,15 +25,18 @@
 
 # Ensure all the plugins are available during testing.
 load_plugins()
-# Some tests need to download a file from modellab, which requires a real auth token
+
 _FAKE_SECRETS = fake_all_secrets()
 
 
-@pytest.mark.parametrize("test_name", [key for key, _ in TESTS.items()])
-def test_all_tests_construct_and_record_init(test_name):
-    test = TESTS.make_instance(test_name, secrets=_FAKE_SECRETS)
-    assert hasattr(test, "initialization_record"), "Test is probably missing @modelgauge_test() decorator."
-    assert isinstance(test.initialization_record, InitializationRecord)
+def ensure_public_dependencies(dependencies):
+    """Replace, in place, every WebData dependency (which may need an auth token to download)
+    with a header-less WebData pointing at the matching public demo file; return the mapping."""
+    for name, dep in dependencies.items():
+        if not isinstance(dep, WebData):
+            continue
+        dependencies[name] = WebData(source_url=demo_prompt_set_url(dep.source_url), headers=None)
+    return dependencies
 
 
 @pytest.fixture(scope="session")
@@ -59,9 +65,10 @@ def test_all_tests_make_test_items(test_name, shared_run_dir):
 
     if isinstance(test, PromptResponseTest):
         test_data_path = os.path.join(shared_run_dir, test.__class__.__name__)
+        dependencies = ensure_public_dependencies(test.get_dependencies())
         dependency_helper = FromSourceDependencyHelper(
             test_data_path,
-            test.get_dependencies(),
+            dependencies,
             required_versions={},
         )
 
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/modelgauge/prompt_sets.py b/src/modelgauge/prompt_sets.py
@@ -1,4 +1,6 @@
+from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 from modelgauge.locales import EN_US
 from modelgauge.secret_values import OptionalSecret, SecretDescription
@@ -73,3 +75,38 @@ def validate_token_requirement(prompt_set: str, token=None) -> bool:
     if token:
         return True
     raise ValueError(f"Prompt set {prompt_set} requires a token from MLCommons.")
+
+
+def demo_prompt_set_from_private_prompt_set(prompt_set: str) -> str:
+    """Return the public demo prompt set matching a practice or official prompt set.
+
+    In a test environment the prompt sets that require auth are replaced with demo
+    prompt sets, which are public. The locale of the first PROMPT_SETS entry whose
+    file base name equals `prompt_set` selects the demo set to return.
+
+    The input is returned unchanged when it is not a known prompt set, or when no
+    demo prompt set exists for its locale."""
+    for prompt_sets in PROMPT_SETS.values():
+        for locale, base_name in prompt_sets.items():
+            if base_name == prompt_set:
+                # Return on the first match: only its locale is needed.
+                return PROMPT_SETS["demo"].get(locale, prompt_set)
+    return prompt_set
+
+
+def prompt_set_from_url(source_url) -> str:
+    """Extract the bare prompt set name from the source_url of a WebData object.
+
+    The name is the last component of the URL path with its final extension removed
+    (no hostname, no directories). If parsing raises, source_url is returned as is."""
+    try:
+        return Path(urlparse(source_url).path).stem
+    except Exception:
+        return source_url
+
+
+def demo_prompt_set_url(url: str) -> str:
+    """Rewrite a prompt set URL so every occurrence of its prompt set name
+    is replaced by the name of the corresponding demo prompt set."""
+    private_name = prompt_set_from_url(url)
+    return url.replace(private_name, demo_prompt_set_from_private_prompt_set(private_name))
diff --git a/tests/modelgauge_tests/test_prompt_sets.py b/tests/modelgauge_tests/test_prompt_sets.py
@@ -1,7 +1,10 @@
 import pytest
 from modelgauge.prompt_sets import (
     PROMPT_SETS,
+    demo_prompt_set_from_private_prompt_set,
+    demo_prompt_set_url,
     prompt_set_file_base_name,
+    prompt_set_from_url,
     validate_prompt_set,
 )  # usort: skip
 
@@ -33,3 +36,29 @@ def test_validate_prompt_set():
         assert validate_prompt_set(s, "en_us", PROMPT_SETS)
     with pytest.raises(ValueError):
         validate_prompt_set("should raise")
+
+
+def test_demo_prompt_set_from_private_prompt_set():
+    # Every known prompt set, including the demo ones, maps to the demo set of its locale.
+    for prompt_set_type in ("practice", "official", "demo"):
+        for locale in ("en_us", "fr_fr"):
+            known = PROMPT_SETS[prompt_set_type][locale]
+            assert demo_prompt_set_from_private_prompt_set(known) == PROMPT_SETS["demo"][locale]
+    # Unknown names pass through untouched.
+    assert demo_prompt_set_from_private_prompt_set("bogus") == "bogus"
+
+
+def test_prompt_set_from_url():
+    # Maps each input to the bare name expected from it; a URL without a file name yields "".
+    base = "https://www.example.com"
+    cases = {f"{base}/path/to/file.csv": "file", f"{base}/thing.css": "thing", base: "", f"{base}/": ""}
+    cases["degenerate string"] = "degenerate string"
+    assert {url: prompt_set_from_url(url) for url in cases} == cases
+
+
+def test_demo_prompt_set_url():
+    # Practice and official URLs must map to the demo URL of the same locale.
+    base = "https://www.example.com/path/to/"
+    for locale in ("en_us", "fr_fr"):
+        for private in (PROMPT_SETS["practice"][locale], PROMPT_SETS["official"][locale]):
+            assert demo_prompt_set_url(f"{base}{private}.csv") == f"{base}{PROMPT_SETS['demo'][locale]}.csv"