Skip to content

Commit 589601a

Browse files
authored
tests(maas-billing): add TinyLlama LLMD model (#861)
* Add TinyLlama LLMD
* test-maas-billing: TinyLlama s3 deployment
* tests-maas-billing - review comments implemented
* tests(maas-billing): added fixtures
1 parent 0440830 commit 589601a

File tree

4 files changed

+253
-74
lines changed

4 files changed

+253
-74
lines changed

tests/model_serving/model_server/maas_billing/conftest.py

Lines changed: 91 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,16 @@
55
import requests
66
from simple_logger.logger import get_logger
77
from utilities.plugins.constant import OpenAIEnpoints
8+
from ocp_resources.service_account import ServiceAccount
89

910
from kubernetes.dynamic import DynamicClient
11+
from ocp_resources.namespace import Namespace
12+
from ocp_resources.llm_inference_service import LLMInferenceService
13+
14+
from utilities.llmd_utils import create_llmisvc
15+
from utilities.llmd_constants import ModelStorage, ContainerImages
16+
from utilities.constants import Timeout
17+
1018
from ocp_resources.infrastructure import Infrastructure
1119
from ocp_resources.oauth import OAuth
1220
from ocp_resources.resource import ResourceEditor
@@ -15,11 +23,14 @@
1523
from utilities.infra import login_with_user_password, get_openshift_token
1624
from utilities.general import wait_for_oauth_openshift_deployment
1725
from ocp_resources.secret import Secret
26+
27+
1828
from tests.model_serving.model_server.maas_billing.utils import (
1929
detect_scheme_via_llmisvc,
2030
host_from_ingress_domain,
2131
mint_token,
2232
llmis_name,
33+
patch_llmisvc_with_maas_router,
2334
create_maas_group,
2435
build_maas_headers,
2536
get_maas_models_response,
@@ -60,22 +71,19 @@ def minted_token(request_session_http, base_url: str, current_client_token: str)
6071
return token
6172

6273

63-
@pytest.fixture(scope="module")
64-
def base_url(admin_client) -> str:
65-
scheme = detect_scheme_via_llmisvc(client=admin_client)
66-
host = host_from_ingress_domain(client=admin_client)
67-
return f"{scheme}://{host}/maas-api"
74+
@pytest.fixture(scope="class")
75+
def base_url(maas_scheme: str, maas_host: str) -> str:
76+
return f"{maas_scheme}://{maas_host}/maas-api"
6877

6978

70-
@pytest.fixture(scope="session")
71-
def model_url(admin_client) -> str:
72-
"""
73-
MODEL_URL:http(s)://<host>/llm/<deployment>/v1/chat/completions
74-
"""
75-
scheme = detect_scheme_via_llmisvc(client=admin_client)
76-
host = host_from_ingress_domain(client=admin_client)
79+
@pytest.fixture(scope="class")
80+
def model_url(
81+
maas_scheme: str,
82+
maas_host: str,
83+
admin_client: DynamicClient,
84+
) -> str:
7785
deployment = llmis_name(client=admin_client)
78-
return f"{scheme}://{host}/llm/{deployment}{CHAT_COMPLETIONS}"
86+
return f"{maas_scheme}://{maas_host}/llm/{deployment}{CHAT_COMPLETIONS}"
7987

8088

8189
@pytest.fixture
@@ -85,9 +93,10 @@ def maas_headers(minted_token: str) -> dict:
8593

8694
@pytest.fixture
8795
def maas_models(
88-
request_session_http,
89-
base_url,
90-
maas_headers,
96+
request_session_http: requests.Session,
97+
base_url: str,
98+
maas_headers: dict,
99+
maas_inference_service_tinyllama: LLMInferenceService,
91100
):
92101
resp = get_maas_models_response(
93102
session=request_session_http,
@@ -458,3 +467,69 @@ def maas_models_response_for_actor(
458467
base_url=base_url,
459468
headers=maas_headers_for_actor,
460469
)
470+
471+
472+
@pytest.fixture(scope="class")
473+
def maas_inference_service_tinyllama(
474+
admin_client: DynamicClient,
475+
unprivileged_model_namespace: Namespace,
476+
model_service_account: ServiceAccount,
477+
) -> Generator[LLMInferenceService, None, None]:
478+
"""
479+
TinyLlama S3-backed LLMInferenceService wired through MaaS for tests.
480+
"""
481+
with (
482+
create_llmisvc(
483+
client=admin_client,
484+
name="llm-s3-tinyllama",
485+
namespace=unprivileged_model_namespace.name,
486+
storage_uri=ModelStorage.TINYLLAMA_S3,
487+
container_image=ContainerImages.VLLM_CPU,
488+
container_resources={
489+
"limits": {"cpu": "2", "memory": "12Gi"},
490+
"requests": {"cpu": "1", "memory": "8Gi"},
491+
},
492+
service_account=model_service_account.name,
493+
wait=True,
494+
timeout=Timeout.TIMEOUT_15MIN,
495+
) as llm_service,
496+
patch_llmisvc_with_maas_router(llm_service=llm_service),
497+
):
498+
llmd_instance = llm_service.instance
499+
model_spec = llmd_instance.spec.model
500+
501+
storage_uri = model_spec.uri
502+
assert storage_uri == ModelStorage.TINYLLAMA_S3, (
503+
f"Unexpected storage_uri on TinyLlama LLMInferenceService: {storage_uri}"
504+
)
505+
506+
status = llmd_instance.status
507+
conditions = {condition.type: condition.status for condition in status.conditions}
508+
assert conditions.get("Ready") == "True", f"TinyLlama LLMInferenceService not Ready, conditions={conditions}"
509+
510+
LOGGER.info(
511+
f"MaaS: TinyLlama S3 LLMInferenceService "
512+
f"{llm_service.namespace}/{llm_service.name} "
513+
f"is Ready with storage_uri={storage_uri}"
514+
)
515+
516+
yield llm_service
517+
518+
LOGGER.info(
519+
f"MaaS: TinyLlama S3 LLMInferenceService "
520+
f"{llm_service.namespace}/{llm_service.name} "
521+
f"will be deleted at teardown"
522+
)
523+
524+
525+
@pytest.fixture(scope="class")
526+
def maas_scheme(admin_client: DynamicClient, unprivileged_model_namespace: Namespace) -> str:
527+
return detect_scheme_via_llmisvc(
528+
client=admin_client,
529+
namespace=unprivileged_model_namespace.name,
530+
)
531+
532+
533+
@pytest.fixture(scope="session")
534+
def maas_host(admin_client):
535+
return host_from_ingress_domain(client=admin_client)
Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
1-
from utilities.plugins.constant import OpenAIEnpoints
21
from simple_logger.logger import get_logger
32
import requests
3+
import pytest
4+
from tests.model_serving.model_server.maas_billing.utils import verify_chat_completions
45

56
LOGGER = get_logger(name=__name__)
6-
MODELS_INFO = OpenAIEnpoints.MODELS_INFO
7-
CHAT_COMPLETIONS = OpenAIEnpoints.CHAT_COMPLETIONS
87

98

9+
@pytest.mark.parametrize(
10+
"unprivileged_model_namespace",
11+
[
12+
pytest.param(
13+
{
14+
"name": "llm",
15+
"modelmesh-enabled": False,
16+
},
17+
id="maas-billing-namespace",
18+
),
19+
],
20+
indirect=True,
21+
)
1022
class TestMaasEndpoints:
1123
def test_model(
1224
self,
@@ -26,32 +38,11 @@ def test_chat_completions(
2638
maas_headers: dict,
2739
maas_models: list,
2840
) -> None:
29-
"""
30-
Verify /llm/<deployment>/v1/chat/completions responds to a simple prompt.
31-
"""
32-
model_id = maas_models[0].get("id", "")
33-
LOGGER.info("Using model_id=%s", model_id)
34-
assert model_id, "first model from /v1/models has no 'id'"
35-
36-
payload = {"model": model_id, "prompt": "Hello", "max_tokens": 50}
37-
LOGGER.info(f"POST {model_url} with keys={list(payload.keys())}")
38-
39-
resp = request_session_http.post(
40-
url=model_url,
41+
"""Verify /llm/<deployment>/v1/chat/completions responds to a simple prompt."""
42+
verify_chat_completions(
43+
request_session_http=request_session_http,
44+
model_url=model_url,
4145
headers=maas_headers,
42-
json=payload,
43-
timeout=60,
44-
)
45-
LOGGER.info(f"POST {model_url} -> {resp.status_code}")
46-
47-
assert resp.status_code == 200, (
48-
f"/v1/chat/completions failed: {resp.status_code} {resp.text[:200]} (url={model_url})"
46+
models_list=maas_models,
47+
log_prefix="MaaS Endpoint Test",
4948
)
50-
51-
body = resp.json()
52-
choices = body.get("choices", [])
53-
assert isinstance(choices, list) and choices, "'choices' missing or empty"
54-
55-
msg = choices[0].get("message", {}) or {}
56-
text = msg.get("content") or choices[0].get("text", "")
57-
assert isinstance(text, str) and text.strip(), "first choice has no text content"

tests/model_serving/model_server/maas_billing/test_maas_rbac_e2e.py

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import pytest
22
from simple_logger.logger import get_logger
33
from utilities.plugins.constant import OpenAIEnpoints
4+
from tests.model_serving.model_server.maas_billing.utils import (
5+
verify_chat_completions,
6+
)
47

58
LOGGER = get_logger(name=__name__)
69

@@ -14,6 +17,19 @@
1417
]
1518

1619

20+
@pytest.mark.parametrize(
21+
"unprivileged_model_namespace",
22+
[
23+
pytest.param(
24+
{
25+
"name": "llm",
26+
"modelmesh-enabled": False,
27+
},
28+
id="maas-billing-namespace",
29+
),
30+
],
31+
indirect=True,
32+
)
1733
@pytest.mark.usefixtures("maas_free_group", "maas_premium_group")
1834
@pytest.mark.parametrize(
1935
"ocp_token_for_actor",
@@ -37,6 +53,7 @@ def test_mint_token_for_actors(
3753

3854
def test_models_visible_for_actors(
3955
self,
56+
model_url: str,
4057
maas_models_response_for_actor,
4158
) -> None:
4259
"""Use fixture for /v1/models response."""
@@ -50,34 +67,24 @@ def test_chat_completions_for_actors(
5067
model_url: str,
5168
maas_headers_for_actor: dict,
5269
maas_models_response_for_actor,
70+
ocp_token_for_actor,
5371
) -> None:
5472
"""
5573
Reuse the models fixture instead of duplicating the /v1/models logic,
56-
then call /v1/chat/completions with the first model id.
74+
then call /v1/chat/completions with the first model id using the
75+
common verify_chat_completions helper.
5776
"""
5877
models_response = maas_models_response_for_actor
59-
models = models_response.json().get("data", [])
60-
assert models, "no models returned from /v1/models"
61-
model_id = models[0].get("id", "")
62-
assert model_id, "first model from /v1/models has no 'id'"
78+
models_list = models_response.json().get("data", [])
79+
assert models_list, "no models returned from /v1/models"
6380

64-
payload = {"model": model_id, "prompt": "Hello", "max_tokens": 16}
65-
66-
LOGGER.info(f"MaaS RBAC: POST {model_url} with payload keys={list(payload.keys())}")
67-
68-
chat_response = request_session_http.post(
69-
url=model_url,
81+
verify_chat_completions(
82+
request_session_http=request_session_http,
83+
model_url=model_url,
7084
headers=maas_headers_for_actor,
71-
json=payload,
72-
timeout=60,
73-
)
74-
75-
LOGGER.info(f"MaaS RBAC: POST {model_url} -> {chat_response.status_code}")
76-
77-
assert chat_response.status_code == 200, (
78-
f"/v1/chat/completions failed: {chat_response.status_code} {chat_response.text[:200]} (url={model_url})"
85+
models_list=models_list,
86+
prompt_text="Hello",
87+
max_tokens=16,
88+
request_timeout_seconds=60,
89+
log_prefix="MaaS RBAC",
7990
)
80-
81-
chat_body = chat_response.json()
82-
choices = chat_body.get("choices", [])
83-
assert isinstance(choices, list) and choices, "'choices' missing or empty"

0 commit comments

Comments (0)