Skip to content

Commit 2472b3a

Browse files
authored
[RHOAIENG-34594] Add test case for singlenode with estimated prefix cache (#907)
1 parent bb0b4d0 commit 2472b3a

File tree

4 files changed

+519
-1
lines changed

4 files changed

+519
-1
lines changed

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,24 @@
22
from typing import Generator
33

44
import pytest
5+
import yaml
56
from _pytest.fixtures import FixtureRequest
67
from kubernetes.dynamic import DynamicClient
8+
from ocp_resources.gateway import Gateway
79
from ocp_resources.llm_inference_service import LLMInferenceService
810
from ocp_resources.namespace import Namespace
911
from ocp_resources.role import Role
1012
from ocp_resources.role_binding import RoleBinding
1113
from ocp_resources.secret import Secret
1214
from ocp_resources.service_account import ServiceAccount
1315

16+
from tests.model_serving.model_server.llmd.constants import (
17+
LLMD_LIVENESS_PROBE,
18+
PREFIX_CACHE_BLOCK_SIZE,
19+
PREFIX_CACHE_HASH_ALGO,
20+
PREFIX_CACHE_HASH_SEED,
21+
ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE,
22+
)
1423
from utilities.constants import Timeout, ResourceLimits
1524
from utilities.infra import s3_endpoint_secret, create_inference_token
1625
from utilities.logger import RedactedString
@@ -330,3 +339,132 @@ def _create_llmd_auth_service(
330339
return (llm_service, sa)
331340

332341
yield _create_llmd_auth_service
342+
343+
344+
@pytest.fixture(scope="class")
def singlenode_estimated_prefix_cache(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    llmd_s3_secret: Secret,
    llmd_s3_service_account: ServiceAccount,
    llmd_gateway: Gateway,
) -> Generator[LLMInferenceService, None, None]:
    """
    LLMInferenceService fixture for single-node estimated prefix cache test.

    Deploys a two-replica TinyLlama service with the router scheduler enabled
    and configured for estimated prefix-cache scoring. ``llmd_s3_secret`` and
    ``llmd_gateway`` are dependency fixtures: they are not referenced in the
    body but guarantee the S3 secret and gateway exist before deployment.
    """
    # Extra vLLM CLI flags: deterministic prefix-cache hashing (algo/block size
    # shared with the scheduler via constants) plus KV-event publishing over
    # ZMQ so the endpoint picker can observe per-replica cache state.
    vllm_extra_args = (
        f"--prefix-caching-hash-algo {PREFIX_CACHE_HASH_ALGO} --block-size {PREFIX_CACHE_BLOCK_SIZE} "
        '--kv_transfer_config \'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\' '
        '--kv-events-config \'{"enable_kv_cache_events":true,"publisher":"zmq",'
        '"endpoint":"tcp://{{ ChildName .ObjectMeta.Name `-epp-service` }}:5557",'
        '"topic":"kv@${POD_IP}@${MODEL_NAME}"}\''
    )

    # Container environment: debug logging, the extra args above, and the
    # POD_IP / MODEL_NAME / PYTHONHASHSEED values those args reference.
    serving_env = [
        {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"},
        {"name": "VLLM_ADDITIONAL_ARGS", "value": vllm_extra_args},
        {
            "name": "POD_IP",
            "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "status.podIP"}},
        },
        {"name": "MODEL_NAME", "value": ModelNames.TINYLLAMA},
        {"name": "PYTHONHASHSEED", "value": PREFIX_CACHE_HASH_SEED},
    ]

    # One GPU per replica with matching CPU/memory requests and limits.
    serving_resources = {
        "limits": {
            "cpu": ResourceLimits.GPU.CPU_LIMIT,
            "memory": ResourceLimits.GPU.MEMORY_LIMIT,
            "nvidia.com/gpu": ResourceLimits.GPU.LIMIT,
        },
        "requests": {
            "cpu": ResourceLimits.GPU.CPU_REQUEST,
            "memory": ResourceLimits.GPU.MEMORY_REQUEST,
            "nvidia.com/gpu": ResourceLimits.GPU.REQUEST,
        },
    }

    # Endpoint-picker (scheduler) container arguments; --config-text carries
    # the estimated prefix-cache plugin configuration rendered as YAML.
    scheduler_args = [
        "--v=4",
        "--pool-name",
        "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
        "--pool-namespace",
        "{{ .ObjectMeta.Namespace }}",
        "--pool-group",
        "inference.networking.x-k8s.io",
        "--zap-encoder",
        "json",
        "--grpc-port",
        "9002",
        "--grpc-health-port",
        "9003",
        "--secure-serving",
        "--model-server-metrics-scheme",
        "https",
        "--cert-path",
        "/var/run/kserve/tls",
        "--config-text",
        yaml.dump(ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE),
    ]

    router_config = {
        "scheduler": {
            "template": {
                "volumes": [{"name": "tokenizers", "emptyDir": {}}],
                "containers": [
                    {
                        "name": "main",
                        "volumeMounts": [
                            {
                                "name": "tokenizers",
                                "mountPath": "/mnt/tokenizers",
                                "readOnly": False,
                            }
                        ],
                        "args": scheduler_args,
                    }
                ],
            }
        },
        "route": {},
        "gateway": {},
    }

    with create_llmisvc(
        client=admin_client,
        name="singlenode-prefix-cache-test",
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.TINYLLAMA_S3,
        model_name=ModelNames.TINYLLAMA,
        replicas=2,
        annotations={
            "prometheus.io/port": "8000",
            "prometheus.io/path": "/metrics",
        },
        container_resources=serving_resources,
        container_env=serving_env,
        liveness_probe=LLMD_LIVENESS_PROBE,
        service_account=llmd_s3_service_account.name,
        enable_auth=True,
        router_config=router_config,
        disable_scheduler=False,
        enable_prefill_decode=False,
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        yield llm_service
449+
450+
@pytest.fixture(scope="class")
def authenticated_llmisvc_token(
    request: FixtureRequest,
    llmisvc_auth_token,
    llmisvc_auth_view_role,
    llmisvc_auth_role_binding,
) -> str:
    """
    Return an auth token for a parametrized service-account / LLMInferenceService pair.

    Expects ``request.param`` to be a dict with ``service_account_fixture`` and
    ``llmisvc_fixture`` keys naming the fixtures to resolve at runtime, so one
    token fixture can serve many deployment variants.
    """
    sa_fixture_name = request.param["service_account_fixture"]
    isvc_fixture_name = request.param["llmisvc_fixture"]

    # Resolve the named fixtures lazily via the pytest request object.
    resolved_sa = request.getfixturevalue(argname=sa_fixture_name)
    resolved_isvc = request.getfixturevalue(argname=isvc_fixture_name)

    # Mint the token using the shared auth helper fixtures.
    return llmisvc_auth_token(
        service_account=resolved_sa,
        llmisvc=resolved_isvc,
        view_role_factory=llmisvc_auth_view_role,
        role_binding_factory=llmisvc_auth_role_binding,
    )
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Liveness probe for single-node configurations.
# The long initial delay (120s) allows for model download/load before the
# first health check; the probe targets vLLM's /health endpoint over TLS.
LLMD_LIVENESS_PROBE = {
    "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
    "initialDelaySeconds": 120,
    "periodSeconds": 30,
    "timeoutSeconds": 30,
    "failureThreshold": 5,
}

# Common parameters for vLLM and llm-d scheduler.
# These are shared so the scheduler's prefix-cache index uses the same block
# size, hash algorithm, and seed as the vLLM servers it scores.
PREFIX_CACHE_BLOCK_SIZE = 64
PREFIX_CACHE_HASH_ALGO = "sha256"
PREFIX_CACHE_HASH_SEED = "42"

# Scheduler configuration for single-node with estimated prefix cache.
# Serialized with yaml.dump and passed to the endpoint picker via
# --config-text; registers the prefix-cache-scorer plugin and wires it into
# the default scheduling profile.
ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE = {
    "apiVersion": "inference.networking.x-k8s.io/v1alpha1",
    "kind": "EndpointPickerConfig",
    "plugins": [
        {
            "type": "prefix-cache-scorer",
            "parameters": {
                "indexerConfig": {
                    "tokenProcessorConfig": {
                        # Must mirror the vLLM-side values above.
                        "blockSize": PREFIX_CACHE_BLOCK_SIZE,
                        "hashAlgo": PREFIX_CACHE_HASH_ALGO,
                        "hashSeed": PREFIX_CACHE_HASH_SEED,
                    }
                }
            },
        }
    ],
    "schedulingProfiles": [
        {
            "name": "default",
            "plugins": [
                {
                    "pluginRef": "prefix-cache-scorer",
                    "weight": 5.0,
                }
            ],
        }
    ],
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
Test Single-Node Estimated Prefix Caching.
3+
4+
This test verifies that the LLM-D router correctly routes inference requests
5+
based on cache state, maximizing prefix cache hits.
6+
7+
Test configuration:
8+
- LLMInferenceService with 2 replicas and router enabled
9+
- Authentication enabled
10+
- Verify router pod and vLLM pods are running
11+
- Send multiple requests with shared prefixes and size greater than PREFIX_CACHE_BLOCK_SIZE
12+
"""
13+
14+
import pytest
15+
from kubernetes.dynamic import DynamicClient
16+
from ocp_resources.gateway import Gateway
17+
from ocp_resources.llm_inference_service import LLMInferenceService
18+
from ocp_resources.prometheus import Prometheus
19+
20+
from tests.model_serving.model_server.llmd.utils import (
21+
get_llmd_router_scheduler_pod,
22+
get_llmd_workload_pods,
23+
send_prefix_cache_test_requests,
24+
verify_estimated_prefix_cache_metrics,
25+
verify_gateway_status,
26+
verify_llm_service_status,
27+
)
28+
from simple_logger.logger import get_logger
29+
30+
# Module-level logger for this test module.
LOGGER = get_logger(name=__name__)

# Number of requests to send for prefix cache testing
NUM_REQUESTS = 20

# All tests in this module require the llmd_gpu marker (GPU-backed llm-d runs).
pytestmark = [pytest.mark.llmd_gpu]
36+
37+
38+
@pytest.mark.parametrize(
    "unprivileged_model_namespace, authenticated_llmisvc_token",
    [
        pytest.param(
            {"name": "llmd-singlenode-prefix-cache-test"},
            {
                "service_account_fixture": "llmd_s3_service_account",
                "llmisvc_fixture": "singlenode_estimated_prefix_cache",
            },
        )
    ],
    indirect=True,
)
@pytest.mark.usefixtures("valid_aws_config", "user_workload_monitoring_config_map")
class TestSingleNodeEstimatedPrefixCache:
    """Test class for singlenode estimated prefix cache routing."""

    def test_singlenode_estimated_prefix_cache(
        self,
        unprivileged_client: DynamicClient,
        llmd_gateway: Gateway,
        singlenode_estimated_prefix_cache: LLMInferenceService,
        authenticated_llmisvc_token: str,
        gpu_count_on_cluster: int,
        prometheus: Prometheus,
    ):
        """Test single-node estimated prefix cache routing."""
        # Two single-GPU replicas are deployed, so anything less than two GPUs
        # cannot exercise cache-aware routing between replicas.
        if gpu_count_on_cluster < 2:
            pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")

        # Infrastructure readiness checks before any routing assertions.
        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
        assert verify_llm_service_status(singlenode_estimated_prefix_cache), "LLMInferenceService should be ready"

        # The endpoint-picker (router-scheduler) pod must exist and be running.
        scheduler_pod = get_llmd_router_scheduler_pod(
            client=unprivileged_client,
            llmisvc=singlenode_estimated_prefix_cache,
        )
        assert scheduler_pod is not None, "Router-scheduler pod should exist"
        assert scheduler_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

        # Both vLLM replicas must be present for routing to have a choice.
        workload_pods = get_llmd_workload_pods(
            client=unprivileged_client,
            llmisvc=singlenode_estimated_prefix_cache,
        )
        assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"

        # Fire the shared-prefix request batch through the authenticated route.
        successful_requests = send_prefix_cache_test_requests(
            llmisvc=singlenode_estimated_prefix_cache,
            token=authenticated_llmisvc_token,
            num_requests=NUM_REQUESTS,
        )

        # Confirm via Prometheus that routing favored prefix-cache hits.
        verify_estimated_prefix_cache_metrics(
            prometheus=prometheus,
            llmisvc=singlenode_estimated_prefix_cache,
            workload_pods=workload_pods,
            expected_requests=successful_requests,
        )

0 commit comments

Comments
 (0)