Skip to content

Commit 93b5e2e

Browse files
committed
create functions to verify precise-prefix-cache routing
1 parent 008d197 commit 93b5e2e

1 file changed

Lines changed: 119 additions & 0 deletions

File tree

  • tests/model_serving/model_server/llmd

tests/model_serving/model_server/llmd/utils.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
Follows the established model server utils pattern for consistency.
66
"""
77

8+
import json
89
from typing import Any
910

1011
from kubernetes.dynamic import DynamicClient
@@ -447,3 +448,121 @@ def verify_estimated_prefix_cache(
447448
)
448449

449450
return True
451+
452+
453+
def get_scheduler_decision_logs(
    router_scheduler_pod: Pod,
    lookback_seconds: int = 600,
) -> list[dict[str, Any]]:
    """
    Retrieve scheduling decision logs from the router-scheduler pod.

    Only log lines containing the max-score-picker decision message are
    returned, each parsed from its JSON representation.

    Args:
        router_scheduler_pod: The router-scheduler Pod object
        lookback_seconds: How far back to look in logs (default: 600s = 10 minutes)

    Returns:
        list[dict[str, Any]]: List of parsed JSON log entries containing scheduler decisions

    Raises:
        json.JSONDecodeError: If a matching log line is not valid JSON.
    """
    LOGGER.info(f"Retrieving logs from scheduler pod {router_scheduler_pod.name}")

    # Get all logs from the scheduler pod
    # Note: The router-scheduler container is the default/main container
    # TODO(review): lookback_seconds is currently unused -- Pod.log() is called
    # without a time filter, so the entire log history is scanned. Wire it
    # through (e.g. a since-seconds option) if log volume becomes a problem.
    raw_logs = router_scheduler_pod.log()

    # Target decision message
    target_decision_msg = "Selecting pods from candidates sorted by max score"

    # Filter to decision lines and parse each as JSON in a single pass
    # (avoids the redundant join-then-splitlines round trip).
    json_logs = [json.loads(line) for line in raw_logs.splitlines() if target_decision_msg in line]

    LOGGER.info(f"Retrieved {len(json_logs)} logs from router-scheduler pod")
    return json_logs
484+
485+
486+
@retry(wait_timeout=90, sleep=30, exceptions_dict={AssertionError: []}, print_log=False)
def verify_precise_prefix_cache(
    prometheus: Prometheus,
    llmisvc: LLMInferenceService,
    workload_pods: list[Pod],
    router_scheduler_pod: Pod,
    expected_requests: int,
) -> bool:
    """
    Verify that the Precise Prefix Cache is working correctly via metric assertions.

    This function polls Prometheus to assess these behaviors:
    1. all traffic was routed to a single pod
    2. the number of prefix cache hits matches
    3. the scheduler used max-score-picker logic.

    Retries for up to 90s to allow for metric scraping latency.

    Args:
        prometheus: Prometheus client.
        llmisvc: Target Inference Service.
        workload_pods: List of serving pods.
        router_scheduler_pod: The router-scheduler Pod object.
        expected_requests: Total expected request count.

    Returns:
        bool: True if verification succeeds (required by @retry decorator).

    Raises:
        AssertionError: If validation fails after the retry timeout.
    """
    LOGGER.info("Checking Precise Prefix Cache logic...")

    # 1. Verify all traffic is routed to a single pod
    request_counts = get_successful_requests_by_pod(
        prometheus=prometheus,
        llmisvc=llmisvc,
        pods=workload_pods,
    )
    LOGGER.info(f"Request count by pod: {request_counts}")

    # All requests must be routed to exactly one pod (prefix cache affinity).
    # This assertion works regardless of the number of pods in the deployment.
    pods_with_traffic = [pod for pod, count in request_counts.items() if count > 0]
    assert len(pods_with_traffic) == 1, (
        f"Expected all traffic to be routed to exactly 1 pod, but {len(pods_with_traffic)} pods received traffic. "
        f"Distribution: {request_counts}"
    )

    active_pod = pods_with_traffic[0]
    assert request_counts[active_pod] == expected_requests, (
        f"Expected {expected_requests} requests on the active pod '{active_pod}', but got {request_counts[active_pod]}"
    )

    # 2. Verify Prefix Cache Hits on the active pod
    # The first request warms the cache, subsequent requests should hit it.
    cache_hit_counts = get_prefix_cache_hits_by_pod(
        prometheus=prometheus,
        llmisvc=llmisvc,
        pods=workload_pods,
    )
    LOGGER.info(f"Prefix cache hits by pod: {cache_hit_counts}")

    # Logic: (N-1) requests * Block Size
    expected_hits = (expected_requests - 1) * PREFIX_CACHE_BLOCK_SIZE

    # Use .get() so a pod whose cache-hit metric has not been scraped yet
    # raises AssertionError (which @retry retries) rather than KeyError
    # (which @retry does NOT catch and would fail the check immediately,
    # defeating the scrape-latency retry this function relies on).
    actual_hits = cache_hit_counts.get(active_pod, 0)
    assert actual_hits == expected_hits, (
        f"Cache hit mismatch on active pod '{active_pod}'. "
        f"Expected {expected_hits} hits, got {actual_hits}"
    )

    # 3. Verify the scheduler used max-score-picker logic at least as many times
    # as the number of requests sent
    decision_logs = get_scheduler_decision_logs(router_scheduler_pod=router_scheduler_pod)
    assert len(decision_logs) >= expected_requests, (
        f"Scheduler did not use max-score-picker logic enough times. "
        f"Expected: >= {expected_requests}, Found: {len(decision_logs)}. "
        f"Log sample: {decision_logs[:3] if decision_logs else 'No logs found'}"
    )

    LOGGER.info("✓ Max-score-picker logic confirmed in scheduler logs")

    return True

0 commit comments

Comments
 (0)