Skip to content

Commit 4f8c7cf

Browse files
committed
Add TinyLlama LLMD
1 parent eea334a commit 4f8c7cf

File tree

3 files changed

+138
-2
lines changed

3 files changed

+138
-2
lines changed

tests/model_serving/model_server/maas_billing/conftest.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,20 @@
44
import requests
55
from simple_logger.logger import get_logger
66
from utilities.plugins.constant import RestHeader, OpenAIEnpoints
7+
from kubernetes.dynamic import DynamicClient
8+
from ocp_resources.namespace import Namespace
9+
from ocp_resources.llm_inference_service import LLMInferenceService
710

11+
from utilities.llmd_utils import create_llmisvc
12+
from utilities.llmd_constants import ModelStorage as LLMDModelStorage, ContainerImages
13+
from utilities.constants import Timeout
814

915
from tests.model_serving.model_server.maas_billing.utils import (
1016
detect_scheme_via_llmisvc,
1117
host_from_ingress_domain,
1218
mint_token,
1319
llmis_name,
20+
patch_llmisvc_with_maas_router,
1421
)
1522

1623
LOGGER = get_logger(name=__name__)
@@ -52,9 +59,13 @@ def base_url(admin_client) -> str:
5259

5360

5461
@pytest.fixture(scope="session")
55-
def model_url(admin_client) -> str:
62+
def model_url(
63+
admin_client: DynamicClient,
64+
llmd_inference_service_tinyllama: LLMInferenceService,
65+
) -> str:
5666
"""
5767
MODEL_URL:http(s)://<host>/llm/<deployment>/v1/chat/completions
68+
5869
"""
5970
scheme = detect_scheme_via_llmisvc(client=admin_client)
6071
host = host_from_ingress_domain(client=admin_client)
@@ -73,6 +84,7 @@ def maas_models(
7384
request_session_http: requests.Session,
7485
base_url: str,
7586
maas_headers: dict,
87+
llmd_inference_service_tinyllama: LLMInferenceService,
7688
):
7789
"""
7890
Call /v1/models once and return the list of models.
@@ -86,3 +98,54 @@ def maas_models(
8698
models = resp.json().get("data", [])
8799
assert models, "no models available"
88100
return models
101+
102+
103+
@pytest.fixture(scope="session")
def llmd_inference_service_tinyllama(
    admin_client: DynamicClient,
) -> Generator[LLMInferenceService, None, None]:
    """
    Session-scoped fixture deploying a TinyLlama (HF) LLMInferenceService.

    Creates the service in the 'llm' namespace for MaaS Billing tests, wires
    it into the MaaS gateway, then yields it; the create_llmisvc context
    manager deletes the service when the session ends.
    """
    target_namespace = "llm"

    # ensure_exists=True makes resource construction fail fast when the
    # namespace is absent — presumably pre-created by the deployment; confirm.
    Namespace(
        client=admin_client,
        name=target_namespace,
        ensure_exists=True,
    )

    with create_llmisvc(
        client=admin_client,
        name="llm-hf-tinyllama",
        namespace=target_namespace,
        storage_uri=LLMDModelStorage.HF_TINYLLAMA,
        container_image=ContainerImages.VLLM_CPU,
        container_resources={
            "limits": {"cpu": "2", "memory": "16Gi"},
            "requests": {"cpu": "1", "memory": "12Gi"},
        },
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        LOGGER.info(
            f"MaaS LLMD: created LLMInferenceService {llm_service.namespace}/{llm_service.name} for TinyLlama HF"
        )

        # Attach the service to the MaaS gateway before any test touches it.
        patch_llmisvc_with_maas_router(
            llm_service=llm_service,
            client=admin_client,
        )

        yield llm_service

        LOGGER.info(
            f"MaaS LLMD: finished tests; LLMInferenceService "
            f"{llm_service.namespace}/{llm_service.name} will be deleted "
            "by context manager"
        )

tests/model_serving/model_server/maas_billing/test_maas_endpoints.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,11 @@ def test_chat_completions(
3333
LOGGER.info("Using model_id=%s", model_id)
3434
assert model_id, "first model from /v1/models has no 'id'"
3535

36-
payload = {"model": model_id, "prompt": "Hello", "max_tokens": 50}
36+
payload = {
37+
"model": model_id,
38+
"messages": [{"role": "user", "content": "Hello from MaaS billing test!"}],
39+
"max_tokens": 50,
40+
}
3741
LOGGER.info(f"POST {model_url} with keys={list(payload.keys())}")
3842

3943
resp = request_session_http.post(

tests/model_serving/model_server/maas_billing/utils.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
from urllib.parse import urlparse
99
from ocp_resources.llm_inference_service import LLMInferenceService
1010
from utilities.llmd_utils import get_llm_inference_url
11+
from simple_logger.logger import get_logger
12+
from kubernetes.dynamic import DynamicClient
13+
from kubernetes.dynamic.exceptions import DynamicApiError
14+
15+
LOGGER = get_logger(name=__name__)
1116

1217

1318
def host_from_ingress_domain(client) -> str:
@@ -104,3 +109,67 @@ def llmis_name(client, namespace: str = "llm", label_selector: str | None = None
104109
raise RuntimeError("No Ready LLMInferenceService found")
105110

106111
return service.name
112+
113+
114+
def patch_llmisvc_with_maas_router(
    llm_service: LLMInferenceService,
    client: DynamicClient,
) -> None:
    """
    Patch an existing LLMInferenceService with MaaS router wiring and annotations.

    Attaches the service to the maas-default-gateway (openshift-ingress
    namespace) via a JSON merge-patch so the model — TinyLlama here — is
    reachable through the MaaS gateway and participates in MaaS flows.

    Raises:
        DynamicApiError: if the API server rejects the patch (logged first).
    """
    gateway_router = {
        "gateway": {
            "refs": [
                {
                    "name": "maas-default-gateway",
                    "namespace": "openshift-ingress",
                }
            ]
        },
        "route": {},
    }

    LOGGER.info(
        f"MaaS LLMD: patching LLMInferenceService "
        f"{llm_service.namespace}/{llm_service.name} "
        f"with MaaS router spec: {gateway_router}"
    )

    # NOTE(review): "[]" looks like an empty tier list opting the service into
    # MaaS without tier restrictions — confirm against the MaaS controller.
    merge_patch = {
        "metadata": {"annotations": {"alpha.maas.opendatahub.io/tiers": "[]"}},
        "spec": {"router": gateway_router},
    }

    api_resource = client.resources.get(
        api_version="serving.kserve.io/v1alpha1",
        kind="LLMInferenceService",
    )

    try:
        api_resource.patch(
            name=llm_service.name,
            namespace=llm_service.namespace,
            body=merge_patch,
            content_type="application/merge-patch+json",
        )
    except DynamicApiError as exc:
        LOGGER.error(
            f"MaaS LLMD: failed to patch LLMInferenceService {llm_service.namespace}/{llm_service.name}: {exc}"
        )
        raise

    LOGGER.info(
        f"MaaS LLMD: successfully patched LLMInferenceService "
        f"{llm_service.namespace}/{llm_service.name} for MaaS routing"
    )

0 commit comments

Comments
 (0)