Skip to content

Commit 008d197

Browse files
committed
create test case for precise-prefix-cache routing
1 parent 1fd485b commit 008d197

1 file changed

Lines changed: 113 additions & 0 deletions

File tree

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""
2+
Test Single-Node Precise Prefix Caching.
3+
4+
This test verifies that the LLM-D router correctly routes inference requests
5+
based on precise KV cache tracking, maximizing prefix cache hits using the
6+
cache_tracking mode.
7+
8+
Test configuration:
9+
- LLMInferenceService with 2 replicas and router enabled
10+
- Authentication enabled
11+
- Precise prefix cache mode with KV block index tracking
12+
- HuggingFace model storage (TinyLlama)
13+
- Verify router pod and vLLM pods are running
14+
- Send multiple requests with shared prefixes and size greater than PREFIX_CACHE_BLOCK_SIZE
15+
- Validate prefix cache metrics
16+
"""
17+
18+
import pytest
19+
from kubernetes.dynamic import DynamicClient
20+
from ocp_resources.gateway import Gateway
21+
from ocp_resources.llm_inference_service import LLMInferenceService
22+
from ocp_resources.prometheus import Prometheus
23+
24+
from tests.model_serving.model_server.llmd.utils import (
25+
get_llmd_router_scheduler_pod,
26+
get_llmd_workload_pods,
27+
send_prefix_cache_test_requests,
28+
verify_precise_prefix_cache,
29+
verify_gateway_status,
30+
verify_llm_service_status,
31+
)
32+
33+
# Number of requests to send for prefix cache testing
34+
NUM_REQUESTS = 20
35+
36+
pytestmark = [pytest.mark.llmd_gpu]
37+
38+
39+
@pytest.mark.parametrize(
    "unprivileged_model_namespace, authenticated_llmisvc_token",
    [
        pytest.param(
            {"name": "llmd-test-singlenode-precise-prefix-cache"},
            {
                "service_account_fixture": "llmd_s3_service_account",
                "llmisvc_fixture": "singlenode_precise_prefix_cache",
            },
        )
    ],
    indirect=True,
)
@pytest.mark.usefixtures("valid_aws_config", "user_workload_monitoring_config_map")
class TestSingleNodePrecisePrefixCache:
    """Test class for singlenode precise prefix cache routing."""

    def test_singlenode_precise_prefix_cache(
        self,
        unprivileged_client: DynamicClient,
        llmd_gateway: Gateway,
        singlenode_precise_prefix_cache: LLMInferenceService,
        authenticated_llmisvc_token: str,
        gpu_count_on_cluster: int,
        prometheus: Prometheus,
    ) -> None:
        """
        Test single-node precise prefix cache routing with KV block index tracking.

        This test validates:
        1. Scheduler deployment succeeds with precise mode configuration
        2. KV-cache routing correctly enabled for precise mode
        3. Requests with shared prefixes properly reuse cached responses
        4. KV block index metrics are exposed and show cache activity
        5. No redundant computation for repeated prefixes
        """
        # Two vLLM replicas (one per GPU) are required so the router has a
        # real routing decision to make; skip rather than fail on small clusters.
        if gpu_count_on_cluster < 2:
            pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")

        # Verify infrastructure is ready before testing routing
        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
        assert verify_llm_service_status(singlenode_precise_prefix_cache), "LLMInferenceService should be ready"

        # The router-scheduler pod performs the precise (cache_tracking) routing;
        # it must exist and be Running before requests are sent.
        router_scheduler_pod = get_llmd_router_scheduler_pod(
            client=unprivileged_client, llmisvc=singlenode_precise_prefix_cache
        )
        assert router_scheduler_pod is not None, "Router-scheduler pod should exist"
        assert router_scheduler_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

        workload_pods = get_llmd_workload_pods(client=unprivileged_client, llmisvc=singlenode_precise_prefix_cache)
        assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"

        # Send N identical requests to test prefix cache; only successfully
        # completed requests count toward the expected metric totals.
        num_successful_requests = send_prefix_cache_test_requests(
            llmisvc=singlenode_precise_prefix_cache,
            token=authenticated_llmisvc_token,
            num_requests=NUM_REQUESTS,
        )

        # Verify precise prefix cache routing via Prometheus / pod metrics.
        verify_precise_prefix_cache(
            prometheus=prometheus,
            llmisvc=singlenode_precise_prefix_cache,
            workload_pods=workload_pods,
            router_scheduler_pod=router_scheduler_pod,
            expected_requests=num_successful_requests,
        )

0 commit comments

Comments
 (0)