Skip to content

Commit 724334f

Browse files
committed
Add new test for llm-d
1 parent 23e2790 commit 724334f

File tree

5 files changed

+1395
-814
lines changed

5 files changed

+1395
-814
lines changed

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Generator
33

44
import pytest
5+
import yaml
56
from _pytest.fixtures import FixtureRequest
67
from kubernetes.dynamic import DynamicClient
78
from ocp_resources.llm_inference_service import LLMInferenceService
@@ -11,6 +12,13 @@
1112
from ocp_resources.secret import Secret
1213
from ocp_resources.service_account import ServiceAccount
1314

15+
from tests.model_serving.model_server.llmd.constants import (
16+
LLMD_LIVENESS_PROBE,
17+
PREFIX_CACHE_BLOCK_SIZE,
18+
PREFIX_CACHE_HASH_ALGO,
19+
PREFIX_CACHE_HASH_SEED,
20+
SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE,
21+
)
1422
from utilities.constants import Timeout, ResourceLimits
1523
from utilities.infra import s3_endpoint_secret, create_inference_token
1624
from utilities.logger import RedactedString
@@ -330,3 +338,105 @@ def _create_llmd_auth_service(
330338
return (llm_service, sa)
331339

332340
yield _create_llmd_auth_service
341+
342+
343+
@pytest.fixture(scope="class")
def singlenode_precise_prefix_cache(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    llmd_s3_secret: Secret,
    llmd_s3_service_account: ServiceAccount,
    llmd_gateway,
) -> Generator[LLMInferenceService, None, None]:
    """Class-scoped LLMInferenceService for the single-node precise prefix cache test.

    Deploys a 2-replica TinyLlama service with auth enabled and a router
    scheduler configured for prefix-cache-aware routing, then yields the
    service for the lifetime of the test class (torn down on context exit).

    NOTE(review): ``llmd_s3_secret`` and ``llmd_gateway`` are never referenced
    in the body — presumably they are requested only to guarantee creation
    order (secret and gateway exist before the service); confirm against the
    fixture definitions in this package.
    """

    with create_llmisvc(
        client=admin_client,
        name="singlenode-prefix-cache-test",
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.TINYLLAMA_S3,
        model_name=ModelNames.TINYLLAMA,
        # Two replicas so the scheduler has an actual routing choice to make.
        replicas=2,
        container_resources={
            "limits": {
                "cpu": ResourceLimits.GPU.CPU_LIMIT,
                "memory": ResourceLimits.GPU.MEMORY_LIMIT,
                "nvidia.com/gpu": ResourceLimits.GPU.LIMIT,
            },
            "requests": {
                "cpu": ResourceLimits.GPU.CPU_REQUEST,
                "memory": ResourceLimits.GPU.MEMORY_REQUEST,
                "nvidia.com/gpu": ResourceLimits.GPU.REQUEST,
            },
        },
        container_env=[
            {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"},
            {
                # Extra vLLM flags.  The hash algo and block size deliberately
                # reuse the same constants as the scheduler's indexer config
                # (SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE) so both
                # sides compute identical prefix-cache block hashes.
                "name": "VLLM_ADDITIONAL_ARGS",
                "value": (
                    f"--prefix-caching-hash-algo {PREFIX_CACHE_HASH_ALGO} --block-size {PREFIX_CACHE_BLOCK_SIZE} "
                    "--kv_transfer_config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' "
                    # KV-cache events published over ZMQ to the EPP service.
                    # NOTE(review): `{{ ChildName ... }}` looks like a
                    # controller-side Go template expanded at deploy time —
                    # confirm the operator performs this substitution.
                    "--kv-events-config '{\"enable_kv_cache_events\":true,\"publisher\":\"zmq\","
                    "\"endpoint\":\"tcp://{{ ChildName .ObjectMeta.Name `-epp-service` }}:5557\","
                    "\"topic\":\"kv@${POD_IP}@${MODEL_NAME}\"}'"
                ),
            },
            {
                # Downward-API injection of the pod IP, referenced in the
                # kv-events topic above.
                "name": "POD_IP",
                "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "status.podIP"}},
            },
            {"name": "MODEL_NAME", "value": "TinyLlama"},
            # Pin Python's hash seed inside the server container so hashing is
            # deterministic across both replicas (env values must be strings).
            {"name": "PYTHONHASHSEED", "value": PREFIX_CACHE_HASH_SEED},
        ],
        liveness_probe=LLMD_LIVENESS_PROBE,
        service_account=llmd_s3_service_account.name,
        enable_auth=True,
        router_config={
            "scheduler": {
                "template": {
                    "volumes": [{"name": "tokenizers", "emptyDir": {}}],
                    "containers": [
                        {
                            "name": "main",
                            "volumeMounts": [
                                {
                                    "name": "tokenizers",
                                    "mountPath": "/mnt/tokenizers",
                                    "readOnly": False,
                                }
                            ],
                            "args": [
                                "--v=4",
                                "--pool-name",
                                "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
                                "--pool-namespace",
                                "{{ .ObjectMeta.Namespace }}",
                                "--pool-group",
                                "inference.networking.x-k8s.io",
                                "--zap-encoder",
                                "json",
                                "--grpc-port",
                                "9002",
                                "--grpc-health-port",
                                "9003",
                                "--secure-serving",
                                "--model-server-metrics-scheme",
                                "https",
                                "--cert-path",
                                "/var/run/kserve/tls",
                                # Inline EndpointPickerConfig, serialized to
                                # YAML for the scheduler's --config-text flag.
                                "--config-text",
                                yaml.dump(SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE),
                            ],
                        }
                    ],
                }
            },
            "route": {},
            "gateway": {},
        },
        disable_scheduler=False,
        enable_prefill_decode=False,
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        yield llm_service
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Liveness probe for single-node configurations.
# NOTE(review): the 120s initial delay presumably covers model pull/load time
# before the first /health check can fail the pod — confirm it is sufficient
# for larger models.
LLMD_LIVENESS_PROBE = {
    "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
    "initialDelaySeconds": 120,
    "periodSeconds": 30,
    "timeoutSeconds": 30,
    "failureThreshold": 5,
}

# Common parameters for vLLM and llm-d scheduler.  These are shared between
# the vLLM server flags (VLLM_ADDITIONAL_ARGS in conftest) and the scheduler's
# indexer config below so both sides compute identical prefix-cache hashes.
PREFIX_CACHE_BLOCK_SIZE = 64
PREFIX_CACHE_HASH_ALGO = "sha256"
# Kept as a string: it is also injected verbatim as the PYTHONHASHSEED
# environment variable value, and env var values must be strings.
PREFIX_CACHE_HASH_SEED = "42"

# Scheduler configuration for single-node with precise prefix cache.
# Serialized to YAML in conftest and passed to the endpoint picker via its
# --config-text flag.
SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE = {
    "apiVersion": "inference.networking.x-k8s.io/v1alpha1",
    "kind": "EndpointPickerConfig",
    "plugins": [
        {
            "type": "prefix-cache-scorer",
            "parameters": {
                "indexerConfig": {
                    "tokenProcessorConfig": {
                        # Must mirror the vLLM flags above.
                        "blockSize": PREFIX_CACHE_BLOCK_SIZE,
                        "hashAlgo": PREFIX_CACHE_HASH_ALGO,
                        "hashSeed": PREFIX_CACHE_HASH_SEED,
                    }
                }
            },
        },
    ],
    "schedulingProfiles": [
        {
            "name": "default",
            "plugins": [
                {
                    "pluginRef": "prefix-cache-scorer",
                    "weight": 5.0,
                }
            ],
        }
    ],
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import pytest
2+
from kubernetes.dynamic import DynamicClient
3+
from ocp_resources.llm_inference_service import LLMInferenceService
4+
5+
from tests.model_serving.model_server.llmd.utils import (
6+
get_llmd_router_scheduler_pod,
7+
get_llmd_workload_pods,
8+
verify_gateway_status,
9+
verify_llm_service_status,
10+
verify_singlenode_prefix_cache_routing,
11+
)
12+
from simple_logger.logger import get_logger
13+
14+
"""
15+
Test Single-Node Precise Prefix Caching.
16+
17+
This test verifies that the LLM-D router correctly routes inference requests
18+
based on cache state, maximizing prefix cache hits.
19+
20+
Test configuration:
21+
- LLMInferenceService with 2 replicas and router enabled
22+
- Authentication enabled
23+
- Verify router pod and vLLM pods are running
24+
- Send multiple requests with shared prefixes and size greater than PREFIX_CACHE_BLOCK_SIZE
25+
"""
26+
27+
LOGGER = get_logger(name=__name__)
28+
29+
pytestmark = [pytest.mark.llmd_gpu]
30+
31+
@pytest.mark.parametrize(
    "unprivileged_model_namespace",
    [pytest.param({"name": "singlenode-prefix-cache-test"})],
    indirect=True,
)
@pytest.mark.usefixtures("valid_aws_config")
class TestSingleNodePrecisePrefixCache:
    """Test class for singlenode precise prefix cache routing."""

    # Set by setup_auth before any test runs; read via self.auth_token below.
    @pytest.fixture(scope="class", autouse=True)
    def setup_auth(
        self,
        llmd_gateway,
        singlenode_precise_prefix_cache,
        llmd_s3_service_account,
        llmisvc_auth_token,
        llmisvc_auth_view_role,
        llmisvc_auth_role_binding,
    ):
        """Set up authentication for single-node prefix cache test.

        Autouse + class scope: runs once before the first test in this class,
        after the gateway and service fixtures it depends on are created.
        """
        # Create token with RBAC resources using factory fixtures
        token = llmisvc_auth_token(
            service_account=llmd_s3_service_account,
            llmisvc=singlenode_precise_prefix_cache,
            view_role_factory=llmisvc_auth_view_role,
            role_binding_factory=llmisvc_auth_role_binding,
        )

        # Store token as class attribute for use in tests.  Set on the class,
        # not self: pytest creates a fresh instance per test, so an instance
        # attribute would not survive to the test methods.
        TestSingleNodePrecisePrefixCache.auth_token = token

    def test_singlenode_precise_prefix_cache(
        self,
        unprivileged_client: DynamicClient,
        llmd_gateway,
        singlenode_precise_prefix_cache: LLMInferenceService,
        gpu_count_on_cluster: int,
    ):
        """Test single-node precise prefix cache routing."""
        # The service is deployed with 2 replicas, each requesting a GPU.
        if gpu_count_on_cluster < 2:
            pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")

        # Verify gateway and service are ready
        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
        assert verify_llm_service_status(singlenode_precise_prefix_cache), "LLMInferenceService should be ready"

        # Verify router-scheduler pod exists and is running
        router_scheduler_pod = get_llmd_router_scheduler_pod(unprivileged_client, singlenode_precise_prefix_cache)
        assert router_scheduler_pod is not None, "Router-scheduler pod should exist"
        assert router_scheduler_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

        # Verify workload pods (one per replica)
        workload_pods = get_llmd_workload_pods(unprivileged_client, singlenode_precise_prefix_cache)
        assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"

        # Test prefix cache routing (includes assertions for routing affinity)
        verify_singlenode_prefix_cache_routing(
            llmisvc=singlenode_precise_prefix_cache,
            token=self.auth_token,
            workload_pods=workload_pods,
        )

0 commit comments

Comments
 (0)