opendatahub-io
diff --git a/‎tests/model_serving/model_server/maas_billing/conftest.py‎
Lines changed: 82 additions & 9 deletions b/‎tests/model_serving/model_server/maas_billing/conftest.py‎
Lines changed: 82 additions & 9 deletions
diff --git a/‎tests/model_serving/model_server/maas_billing/test_maas_request_rate_limits.py‎
Lines changed: 74 additions & 0 deletions b/‎tests/model_serving/model_server/maas_billing/test_maas_request_rate_limits.py‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎tests/model_serving/model_server/maas_billing/test_maas_token_rate_limits.py‎
Lines changed: 77 additions & 0 deletions b/‎tests/model_serving/model_server/maas_billing/test_maas_token_rate_limits.py‎
Lines changed: 77 additions & 0 deletions
@@ -1,5 +1,4 @@
-from typing import Generator
-
+from typing import Generator, Dict, List
 import base64
 import pytest
 import requests
@@ -23,8 +22,7 @@
 from utilities.infra import login_with_user_password, get_openshift_token
 from utilities.general import wait_for_oauth_openshift_deployment
 from ocp_resources.secret import Secret
-
-
+from tests.model_serving.model_server.maas_billing.utils import get_total_tokens
 from tests.model_serving.model_server.maas_billing.utils import (
     detect_scheme_via_llmisvc,
     host_from_ingress_domain,
@@ -34,6 +32,8 @@
     create_maas_group,
     build_maas_headers,
     get_maas_models_response,
+    verify_chat_completions,
+    maas_gateway_rate_limits_patched,
 )
 
 
@@ -81,8 +81,10 @@ def model_url(
     maas_scheme: str,
     maas_host: str,
     admin_client: DynamicClient,
+    maas_inference_service_tinyllama: LLMInferenceService,
 ) -> str:
     deployment = llmis_name(client=admin_client)
+    # deployment = maas_inference_service_tinyllama.name
     return f"{maas_scheme}://{maas_host}/llm/{deployment}{CHAT_COMPLETIONS}"
 
 
@@ -374,7 +376,7 @@ def maas_premium_group(
         yield group.name
 
 
-@pytest.fixture
+@pytest.fixture(scope="class")
 def ocp_token_for_actor(
     request,
     maas_api_server_url: str,
@@ -421,7 +423,7 @@ def ocp_token_for_actor(
             assert original_login_successful, f"Failed to log back in as original user '{original_user}'"
 
 
-@pytest.fixture
+@pytest.fixture(scope="class")
 def maas_token_for_actor(
     request_session_http: requests.Session,
     base_url: str,
@@ -449,13 +451,13 @@ def maas_token_for_actor(
     return token
 
 
-@pytest.fixture
+@pytest.fixture(scope="class")
 def maas_headers_for_actor(maas_token_for_actor: str) -> dict:
     """Headers for the current actor (admin/free/premium)."""
     return build_maas_headers(token=maas_token_for_actor)
 
 
-@pytest.fixture
+@pytest.fixture(scope="class")
 def maas_models_response_for_actor(
     request_session_http: requests.Session,
     base_url: str,
@@ -469,6 +471,60 @@ def maas_models_response_for_actor(
     )
 
 
+@pytest.fixture(scope="class")
+def maas_models_for_actor(
+    maas_models_response_for_actor: requests.Response,
+) -> List[Dict]:
+    """
+    Return the model entries listed for the current actor.
+
+    Reads the "data" list from the JSON body of the actor's models response
+    and fails the fixture when that list is missing or empty.
+    """
+    response_payload = maas_models_response_for_actor.json()
+    available_models = response_payload.get("data", [])
+    assert available_models, "no models returned from /v1/models"
+    return available_models
+
+
+@pytest.fixture(scope="class")
+def exercise_rate_limiter(
+    actor_label: str,
+    scenario: dict,
+    request_session_http: requests.Session,
+    model_url: str,
+    maas_headers_for_actor: Dict[str, str],
+    maas_models_for_actor: List[Dict],
+) -> List[int]:
+    """
+    Send a burst of chat-completion requests and collect their HTTP status codes.
+
+    Args:
+        actor_label: Label of the actor issuing the requests; used in log messages.
+        scenario: Scenario settings. The keys read here are "id", "max_requests",
+            "max_tokens" and "log_prefix".
+        request_session_http: HTTP session used for every request.
+        model_url: Chat-completions URL of the model under test.
+        maas_headers_for_actor: Request headers for the current actor.
+        maas_models_for_actor: Model entries forwarded to verify_chat_completions.
+
+    Returns:
+        One status code per request, in the order the requests were sent.
+    """
+    max_requests = scenario["max_requests"]
+    max_tokens = scenario["max_tokens"]
+    log_prefix = scenario["log_prefix"]
+    actor_log_prefix = f"{log_prefix}[{actor_label}]"
+
+    # Token usage is only looked up for the token-rate scenario, and only on
+    # successful responses (see the check inside the loop).
+    is_token_rate_scenario = scenario["id"] == "token-rate"
+
+    # NOTE(review): scenarios may also carry "sleep_between_seconds", but nothing
+    # in this fixture reads it, so requests are sent back to back - confirm
+    # whether pacing between attempts was intended.
+
+    status_codes_list: List[int] = []
+
+    for attempt_number in range(1, max_requests + 1):
+        LOGGER.info(f"{actor_log_prefix}: attempt {attempt_number}/{max_requests}")
+
+        response = verify_chat_completions(
+            request_session_http=request_session_http,
+            model_url=model_url,
+            headers=maas_headers_for_actor,
+            models_list=maas_models_for_actor,
+            prompt_text="Repeat the word 'token' 60 times, separated by spaces. No extra text.",
+            max_tokens=max_tokens,
+            request_timeout_seconds=60,
+            log_prefix=actor_log_prefix,
+            # Both outcomes are passed as expected: a served request and a rate-limited one.
+            expected_status_codes=(200, 429),
+        )
+
+        status_codes_list.append(response.status_code)
+
+        if is_token_rate_scenario and response.status_code == 200:
+            total_tokens = get_total_tokens(resp=response, fail_if_missing=True)
+            LOGGER.info(f"{actor_log_prefix}: total_tokens={total_tokens}")
+
+    LOGGER.info(f"{actor_log_prefix}: status_codes={status_codes_list}")
+    return status_codes_list
+
+
 @pytest.fixture(scope="class")
 def maas_inference_service_tinyllama(
     admin_client: DynamicClient,
@@ -530,6 +586,23 @@ def maas_scheme(admin_client: DynamicClient, unprivileged_model_namespace: Names
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="class")
 def maas_host(admin_client):
+    """Return the host produced by host_from_ingress_domain for the admin client."""
     return host_from_ingress_domain(client=admin_client)
+
+
+@pytest.fixture(scope="class")
+def maas_gateway_rate_limits(
+    admin_client: DynamicClient,
+) -> Generator[None, None, None]:
+    """
+    Hold the maas_gateway_rate_limits_patched context open around a test class.
+
+    The token and request policies are addressed by name in the
+    "openshift-ingress" namespace; the fixture yields nothing.
+    """
+    with maas_gateway_rate_limits_patched(
+        admin_client=admin_client,
+        namespace="openshift-ingress",
+        token_policy_name="gateway-token-rate-limits",
+        request_policy_name="gateway-rate-limits",
+    ):
+        yield
@@ -0,0 +1,74 @@
+from typing import List
+import pytest
+from simple_logger.logger import get_logger
+from tests.model_serving.model_server.maas_billing.utils import (
+    assert_mixed_200_and_429,
+)
+
+# Module-level logger.
+LOGGER = get_logger(name=__name__)
+
+# Number of requests sent in one request-rate burst.
+REQUEST_RATE_MAX_REQUESTS = 10
+
+# One entry per actor: the dict is passed to the indirect ocp_token_for_actor
+# fixture, the string becomes actor_label (see the parametrize decorator below).
+ACTORS = [
+    pytest.param({"type": "free"}, "free", id="free"),
+    pytest.param({"type": "premium"}, "premium", id="premium"),
+]
+
+# Scenario settings exposed to fixtures through the class-level `scenario` fixture.
+# "context" is read by the test in this module.
+# NOTE(review): "sleep_between_seconds" is not read anywhere in this module -
+# confirm whether the consuming fixture is expected to honor it.
+SCENARIO_REQUEST_RATE = {
+    "id": "request-rate",
+    "max_requests": REQUEST_RATE_MAX_REQUESTS,
+    "max_tokens": 5,
+    "sleep_between_seconds": 0.1,
+    "log_prefix": "MaaS request-rate",
+    "context": "request-rate burst",
+}
+
+
+@pytest.mark.usefixtures(
+    "maas_inference_service_tinyllama",
+    "maas_free_group",
+    "maas_premium_group",
+    "maas_gateway_rate_limits",
+)
+@pytest.mark.parametrize(
+    "unprivileged_model_namespace",
+    [pytest.param({"name": "llm", "modelmesh-enabled": False}, id="maas-billing-namespace")],
+    indirect=True,
+)
+@pytest.mark.parametrize(
+    "ocp_token_for_actor, actor_label",
+    ACTORS,
+    indirect=["ocp_token_for_actor"],
+    scope="class",
+)
+class TestMaasRequestRateLimits:
+    """Request-rate limit tests for MaaS Billing against TinyLlama, parametrized per actor."""
+
+    @pytest.fixture(scope="class")
+    def scenario(self) -> dict:
+        """Provide the request-rate scenario settings for this class."""
+        return SCENARIO_REQUEST_RATE
+
+    def test_request_rate_limits(
+        self,
+        ocp_token_for_actor: str,
+        actor_label: str,
+        scenario: dict,
+        exercise_rate_limiter: List[int],
+    ) -> None:
+        """Check the collected status codes with assert_mixed_200_and_429, requiring a 429."""
+        # The token value itself is not used by the test body.
+        _ = ocp_token_for_actor
+
+        assert_mixed_200_and_429(
+            actor_label=actor_label,
+            status_codes_list=exercise_rate_limiter,
+            context=scenario["context"],
+            require_429=True,
+        )
@@ -0,0 +1,77 @@
+from typing import List
+import pytest
+from simple_logger.logger import get_logger
+from tests.model_serving.model_server.maas_billing.utils import (
+    assert_mixed_200_and_429,
+)
+
+# Module-level logger; used by the test below to log the final status codes.
+LOGGER = get_logger(name=__name__)
+
+# Number of requests sent in one token-rate run.
+TOKEN_RATE_MAX_REQUESTS = 8
+# Value placed under the scenario's "max_tokens" key.
+LARGE_MAX_TOKENS = 80
+
+# One entry per actor: the dict is passed to the indirect ocp_token_for_actor
+# fixture, the string becomes actor_label (see the parametrize decorator below).
+ACTORS = [
+    pytest.param({"type": "free"}, "free", id="free"),
+    pytest.param({"type": "premium"}, "premium", id="premium"),
+]
+
+# Scenario settings exposed to fixtures through the class-level `scenario` fixture.
+# "context" is read by the test in this module.
+# NOTE(review): "sleep_between_seconds" is not read anywhere in this module -
+# confirm whether the consuming fixture is expected to honor it.
+SCENARIO_TOKEN_RATE = {
+    "id": "token-rate",
+    "max_requests": TOKEN_RATE_MAX_REQUESTS,
+    "max_tokens": LARGE_MAX_TOKENS,
+    "sleep_between_seconds": 0.2,
+    "log_prefix": "MaaS token-rate",
+    "context": "token-rate tests",
+}
+
+
+@pytest.mark.usefixtures(
+    "maas_inference_service_tinyllama",
+    "maas_free_group",
+    "maas_premium_group",
+    "maas_gateway_rate_limits",
+)
+@pytest.mark.parametrize(
+    "unprivileged_model_namespace",
+    [pytest.param({"name": "llm", "modelmesh-enabled": False}, id="maas-billing-namespace")],
+    indirect=True,
+)
+@pytest.mark.parametrize(
+    "ocp_token_for_actor, actor_label",
+    ACTORS,
+    indirect=["ocp_token_for_actor"],
+    scope="class",
+)
+class TestMaasTokenRateLimits:
+    """Token-rate limit tests for MaaS Billing against TinyLlama, parametrized per actor."""
+
+    @pytest.fixture(scope="class")
+    def scenario(self) -> dict:
+        """Provide the token-rate scenario settings for this class."""
+        return SCENARIO_TOKEN_RATE
+
+    def test_token_rate_limits(
+        self,
+        ocp_token_for_actor: str,
+        actor_label: str,
+        scenario: dict,
+        exercise_rate_limiter: List[int],
+    ) -> None:
+        """Check the collected status codes with assert_mixed_200_and_429; a 429 is not required."""
+        # The token value itself is not used by the test body.
+        _ = ocp_token_for_actor
+        status_codes_list = exercise_rate_limiter
+
+        assert_mixed_200_and_429(
+            actor_label=actor_label,
+            status_codes_list=status_codes_list,
+            context=scenario["context"],
+            require_429=False,
+        )
+
+        # Logged only after the assertion above has passed.
+        LOGGER.info(f"MaaS token-rate[{actor_label}]: final status_codes={status_codes_list}")