Skip to content

Commit 2472b3a

Browse files
authored
[RHOAIENG-34594] Add test case for singlenode with estimated prefix cache (#907)
1 parent bb0b4d0 commit 2472b3a

File tree

4 files changed

+519
-1
lines changed

4 files changed

+519
-1
lines changed

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,24 @@
22
from typing import Generator
33

44
import pytest
5+
import yaml
56
from _pytest.fixtures import FixtureRequest
67
from kubernetes.dynamic import DynamicClient
8+
from ocp_resources.gateway import Gateway
79
from ocp_resources.llm_inference_service import LLMInferenceService
810
from ocp_resources.namespace import Namespace
911
from ocp_resources.role import Role
1012
from ocp_resources.role_binding import RoleBinding
1113
from ocp_resources.secret import Secret
1214
from ocp_resources.service_account import ServiceAccount
1315

16+
from tests.model_serving.model_server.llmd.constants import (
17+
LLMD_LIVENESS_PROBE,
18+
PREFIX_CACHE_BLOCK_SIZE,
19+
PREFIX_CACHE_HASH_ALGO,
20+
PREFIX_CACHE_HASH_SEED,
21+
ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE,
22+
)
1423
from utilities.constants import Timeout, ResourceLimits
1524
from utilities.infra import s3_endpoint_secret, create_inference_token
1625
from utilities.logger import RedactedString
@@ -330,3 +339,132 @@ def _create_llmd_auth_service(
330339
return (llm_service, sa)
331340

332341
yield _create_llmd_auth_service
342+
343+
344+
@pytest.fixture(scope="class")
def singlenode_estimated_prefix_cache(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    llmd_s3_secret: Secret,
    llmd_s3_service_account: ServiceAccount,
    llmd_gateway: Gateway,
) -> Generator[LLMInferenceService, None, None]:
    """
    LLMInferenceService fixture for single-node estimated prefix cache test.

    Deploys a two-replica TinyLlama service with the router scheduler enabled
    and configured for estimated prefix-cache scoring. ``llmd_s3_secret`` and
    ``llmd_gateway`` are dependency fixtures: they are not referenced in the
    body but guarantee the S3 secret and gateway exist before deployment.
    """
    # Extra vLLM CLI flags: deterministic prefix-cache hashing (algo/block size
    # shared with the scheduler via constants) plus KV-event publishing over
    # ZMQ so the endpoint picker can observe per-replica cache state.
    vllm_extra_args = (
        f"--prefix-caching-hash-algo {PREFIX_CACHE_HASH_ALGO} --block-size {PREFIX_CACHE_BLOCK_SIZE} "
        '--kv_transfer_config \'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\' '
        '--kv-events-config \'{"enable_kv_cache_events":true,"publisher":"zmq",'
        '"endpoint":"tcp://{{ ChildName .ObjectMeta.Name `-epp-service` }}:5557",'
        '"topic":"kv@${POD_IP}@${MODEL_NAME}"}\''
    )

    # Container environment: debug logging, the extra args above, and the
    # POD_IP / MODEL_NAME / PYTHONHASHSEED values those args reference.
    serving_env = [
        {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"},
        {"name": "VLLM_ADDITIONAL_ARGS", "value": vllm_extra_args},
        {
            "name": "POD_IP",
            "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "status.podIP"}},
        },
        {"name": "MODEL_NAME", "value": ModelNames.TINYLLAMA},
        {"name": "PYTHONHASHSEED", "value": PREFIX_CACHE_HASH_SEED},
    ]

    # One GPU per replica with matching CPU/memory requests and limits.
    serving_resources = {
        "limits": {
            "cpu": ResourceLimits.GPU.CPU_LIMIT,
            "memory": ResourceLimits.GPU.MEMORY_LIMIT,
            "nvidia.com/gpu": ResourceLimits.GPU.LIMIT,
        },
        "requests": {
            "cpu": ResourceLimits.GPU.CPU_REQUEST,
            "memory": ResourceLimits.GPU.MEMORY_REQUEST,
            "nvidia.com/gpu": ResourceLimits.GPU.REQUEST,
        },
    }

    # Endpoint-picker (scheduler) container arguments; --config-text carries
    # the estimated prefix-cache plugin configuration rendered as YAML.
    scheduler_args = [
        "--v=4",
        "--pool-name",
        "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
        "--pool-namespace",
        "{{ .ObjectMeta.Namespace }}",
        "--pool-group",
        "inference.networking.x-k8s.io",
        "--zap-encoder",
        "json",
        "--grpc-port",
        "9002",
        "--grpc-health-port",
        "9003",
        "--secure-serving",
        "--model-server-metrics-scheme",
        "https",
        "--cert-path",
        "/var/run/kserve/tls",
        "--config-text",
        yaml.dump(ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE),
    ]

    router_config = {
        "scheduler": {
            "template": {
                "volumes": [{"name": "tokenizers", "emptyDir": {}}],
                "containers": [
                    {
                        "name": "main",
                        "volumeMounts": [
                            {
                                "name": "tokenizers",
                                "mountPath": "/mnt/tokenizers",
                                "readOnly": False,
                            }
                        ],
                        "args": scheduler_args,
                    }
                ],
            }
        },
        "route": {},
        "gateway": {},
    }

    with create_llmisvc(
        client=admin_client,
        name="singlenode-prefix-cache-test",
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.TINYLLAMA_S3,
        model_name=ModelNames.TINYLLAMA,
        replicas=2,
        annotations={
            "prometheus.io/port": "8000",
            "prometheus.io/path": "/metrics",
        },
        container_resources=serving_resources,
        container_env=serving_env,
        liveness_probe=LLMD_LIVENESS_PROBE,
        service_account=llmd_s3_service_account.name,
        enable_auth=True,
        router_config=router_config,
        disable_scheduler=False,
        enable_prefill_decode=False,
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        yield llm_service
449+
450+
@pytest.fixture(scope="class")
def authenticated_llmisvc_token(
    request: FixtureRequest,
    llmisvc_auth_token,
    llmisvc_auth_view_role,
    llmisvc_auth_role_binding,
) -> str:
    """
    Return an auth token for a parametrized service-account / LLMInferenceService pair.

    Expects ``request.param`` to be a dict with ``service_account_fixture`` and
    ``llmisvc_fixture`` keys naming the fixtures to resolve at runtime, so one
    token fixture can serve many deployment variants.
    """
    sa_fixture_name = request.param["service_account_fixture"]
    isvc_fixture_name = request.param["llmisvc_fixture"]

    # Resolve the named fixtures lazily via the pytest request object.
    resolved_sa = request.getfixturevalue(argname=sa_fixture_name)
    resolved_isvc = request.getfixturevalue(argname=isvc_fixture_name)

    # Mint the token using the shared auth helper fixtures.
    return llmisvc_auth_token(
        service_account=resolved_sa,
        llmisvc=resolved_isvc,
        view_role_factory=llmisvc_auth_view_role,
        role_binding_factory=llmisvc_auth_role_binding,
    )
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Liveness probe for single-node configurations.
# The long initial delay (120s) allows for model download/load before the
# first health check; the probe targets vLLM's /health endpoint over TLS.
LLMD_LIVENESS_PROBE = {
    "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
    "initialDelaySeconds": 120,
    "periodSeconds": 30,
    "timeoutSeconds": 30,
    "failureThreshold": 5,
}

# Common parameters for vLLM and llm-d scheduler.
# These are shared so the scheduler's prefix-cache index uses the same block
# size, hash algorithm, and seed as the vLLM servers it scores.
PREFIX_CACHE_BLOCK_SIZE = 64
PREFIX_CACHE_HASH_ALGO = "sha256"
PREFIX_CACHE_HASH_SEED = "42"

# Scheduler configuration for single-node with estimated prefix cache.
# Serialized with yaml.dump and passed to the endpoint picker via
# --config-text; registers the prefix-cache-scorer plugin and wires it into
# the default scheduling profile.
ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE = {
    "apiVersion": "inference.networking.x-k8s.io/v1alpha1",
    "kind": "EndpointPickerConfig",
    "plugins": [
        {
            "type": "prefix-cache-scorer",
            "parameters": {
                "indexerConfig": {
                    "tokenProcessorConfig": {
                        # Must mirror the vLLM-side values above.
                        "blockSize": PREFIX_CACHE_BLOCK_SIZE,
                        "hashAlgo": PREFIX_CACHE_HASH_ALGO,
                        "hashSeed": PREFIX_CACHE_HASH_SEED,
                    }
                }
            },
        }
    ],
    "schedulingProfiles": [
        {
            "name": "default",
            "plugins": [
                {
                    "pluginRef": "prefix-cache-scorer",
                    "weight": 5.0,
                }
            ],
        }
    ],
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
Test Single-Node Estimated Prefix Caching.
3+
4+
This test verifies that the LLM-D router correctly routes inference requests
5+
based on cache state, maximizing prefix cache hits.
6+
7+
Test configuration:
8+
- LLMInferenceService with 2 replicas and router enabled
9+
- Authentication enabled
10+
- Verify router pod and vLLM pods are running
11+
- Send multiple requests with shared prefixes and size greater than PREFIX_CACHE_BLOCK_SIZE
12+
"""
13+
14+
import pytest
15+
from kubernetes.dynamic import DynamicClient
16+
from ocp_resources.gateway import Gateway
17+
from ocp_resources.llm_inference_service import LLMInferenceService
18+
from ocp_resources.prometheus import Prometheus
19+
20+
from tests.model_serving.model_server.llmd.utils import (
21+
get_llmd_router_scheduler_pod,
22+
get_llmd_workload_pods,
23+
send_prefix_cache_test_requests,
24+
verify_estimated_prefix_cache_metrics,
25+
verify_gateway_status,
26+
verify_llm_service_status,
27+
)
28+
from simple_logger.logger import get_logger
29+
30+
# Module-level logger for this test module.
LOGGER = get_logger(name=__name__)

# Number of requests to send for prefix cache testing
NUM_REQUESTS = 20

# All tests in this module require the llmd_gpu marker (GPU-backed llm-d runs).
pytestmark = [pytest.mark.llmd_gpu]
36+
37+
38+
@pytest.mark.parametrize(
    "unprivileged_model_namespace, authenticated_llmisvc_token",
    [
        pytest.param(
            {"name": "llmd-singlenode-prefix-cache-test"},
            {
                "service_account_fixture": "llmd_s3_service_account",
                "llmisvc_fixture": "singlenode_estimated_prefix_cache",
            },
        )
    ],
    indirect=True,
)
@pytest.mark.usefixtures("valid_aws_config", "user_workload_monitoring_config_map")
class TestSingleNodeEstimatedPrefixCache:
    """Test class for singlenode estimated prefix cache routing."""

    def test_singlenode_estimated_prefix_cache(
        self,
        unprivileged_client: DynamicClient,
        llmd_gateway: Gateway,
        singlenode_estimated_prefix_cache: LLMInferenceService,
        authenticated_llmisvc_token: str,
        gpu_count_on_cluster: int,
        prometheus: Prometheus,
    ):
        """Test single-node estimated prefix cache routing."""
        # Two single-GPU replicas are deployed, so anything less than two GPUs
        # cannot exercise cache-aware routing between replicas.
        if gpu_count_on_cluster < 2:
            pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")

        # Infrastructure readiness checks before any routing assertions.
        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
        assert verify_llm_service_status(singlenode_estimated_prefix_cache), "LLMInferenceService should be ready"

        # The endpoint-picker (router-scheduler) pod must exist and be running.
        scheduler_pod = get_llmd_router_scheduler_pod(
            client=unprivileged_client,
            llmisvc=singlenode_estimated_prefix_cache,
        )
        assert scheduler_pod is not None, "Router-scheduler pod should exist"
        assert scheduler_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

        # Both vLLM replicas must be present for routing to have a choice.
        workload_pods = get_llmd_workload_pods(
            client=unprivileged_client,
            llmisvc=singlenode_estimated_prefix_cache,
        )
        assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"

        # Fire the shared-prefix request batch through the authenticated route.
        successful_requests = send_prefix_cache_test_requests(
            llmisvc=singlenode_estimated_prefix_cache,
            token=authenticated_llmisvc_token,
            num_requests=NUM_REQUESTS,
        )

        # Confirm via Prometheus that routing favored prefix-cache hits.
        verify_estimated_prefix_cache_metrics(
            prometheus=prometheus,
            llmisvc=singlenode_estimated_prefix_cache,
            workload_pods=workload_pods,
            expected_requests=successful_requests,
        )

0 commit comments

Comments
 (0)