Follows the established model server utils pattern for consistency.
"""

import json
from typing import Any

from kubernetes.dynamic import DynamicClient
@@ -447,3 +448,121 @@ def verify_estimated_prefix_cache(
447448 )
448449
449450 return True
451+
452+
def get_scheduler_decision_logs(
    router_scheduler_pod: Pod,
    lookback_seconds: int = 600,
) -> list[dict]:
    """
    Retrieve scheduling decision logs from the router-scheduler pod.

    Args:
        router_scheduler_pod: The router-scheduler Pod object
        lookback_seconds: How far back to look in logs (default: 600s = 10 minutes)

    Returns:
        list[dict]: List of parsed JSON log entries containing scheduler decisions
    """
    LOGGER.info(f"Retrieving logs from scheduler pod {router_scheduler_pod.name}")

    # Get logs from the scheduler pod, limited to the lookback window.
    # Note: The router-scheduler container is the default/main container.
    # NOTE(review): `lookback_seconds` was previously accepted but never used;
    # it is now forwarded as `since_seconds` to the pod-log API so the
    # docstring contract holds — confirm Pod.log forwards kwargs to
    # read_namespaced_pod_log in this wrapper version.
    raw_logs = router_scheduler_pod.log(since_seconds=lookback_seconds)

    # Only structured log lines carrying the max-score-picker decision
    # message are relevant.
    target_decision_msg = "Selecting pods from candidates sorted by max score"

    # Filter and JSON-parse in a single pass; the previous implementation
    # joined the filtered lines into one string only to split them again.
    json_logs = [json.loads(line) for line in raw_logs.splitlines() if target_decision_msg in line]

    LOGGER.info(f"Retrieved {len(json_logs)} logs from router-scheduler pod")
    return json_logs
484+
485+
@retry(wait_timeout=90, sleep=30, exceptions_dict={AssertionError: []}, print_log=False)
def verify_precise_prefix_cache(
    prometheus: Prometheus,
    llmisvc: LLMInferenceService,
    workload_pods: list[Pod],
    router_scheduler_pod: Pod,
    expected_requests: int,
) -> bool:
    """
    Verify that the Precise Prefix Cache is working correctly via metric assertions.

    This function polls Prometheus to assess these behaviors:
    1. all traffic was routed to a single pod
    2. the number of prefix cache hits matches
    3. the scheduler used max-score-picker logic.

    Retries for up to 90s to allow for metric scraping latency.

    Args:
        prometheus: Prometheus client.
        llmisvc: Target Inference Service.
        workload_pods: List of serving pods.
        router_scheduler_pod: The router-scheduler Pod object.
        expected_requests: Total expected request count.

    Returns:
        bool: True if verification succeeds (required by @retry decorator).

    Raises:
        AssertionError: If validation fails after the retry timeout.
    """
    LOGGER.info("Checking Precise Prefix Cache logic...")

    # 1. Verify all traffic is routed to a single pod
    request_counts = get_successful_requests_by_pod(
        prometheus=prometheus,
        llmisvc=llmisvc,
        pods=workload_pods,
    )
    LOGGER.info(f"Request count by pod: {request_counts}")

    # All requests must be routed to exactly one pod (prefix cache affinity).
    # This assertion works regardless of the number of pods in the deployment.
    pods_with_traffic = [pod for pod, count in request_counts.items() if count > 0]
    assert len(pods_with_traffic) == 1, (
        f"Expected all traffic to be routed to exactly 1 pod, but {len(pods_with_traffic)} pods received traffic. "
        f"Distribution: {request_counts}"
    )

    # Safe to index: active_pod is drawn from request_counts' own keys.
    active_pod = pods_with_traffic[0]
    assert request_counts[active_pod] == expected_requests, (
        f"Expected {expected_requests} requests on the active pod '{active_pod}', but got {request_counts[active_pod]}"
    )

    # 2. Verify Prefix Cache Hits on the active pod
    # The first request warms the cache, subsequent requests should hit it.
    cache_hit_counts = get_prefix_cache_hits_by_pod(
        prometheus=prometheus,
        llmisvc=llmisvc,
        pods=workload_pods,
    )
    LOGGER.info(f"Prefix cache hits by pod: {cache_hit_counts}")

    # Logic: (N-1) requests * Block Size
    expected_hits = (expected_requests - 1) * PREFIX_CACHE_BLOCK_SIZE

    # Use .get() so a pod absent from the metric results (e.g. not scraped
    # yet) fails the assertion — which @retry retries — instead of raising a
    # KeyError that would escape the retry loop (only AssertionError is
    # listed in exceptions_dict).
    actual_hits = cache_hit_counts.get(active_pod, 0)
    assert actual_hits == expected_hits, (
        f"Cache hit mismatch on active pod '{active_pod}'. "
        f"Expected {expected_hits} hits, got {actual_hits}"
    )

    # 3. Verify the scheduler used max-score-picker logic at least as many times
    # as the number of requests sent
    decision_logs = get_scheduler_decision_logs(router_scheduler_pod=router_scheduler_pod)
    assert len(decision_logs) >= expected_requests, (
        f"Scheduler did not use max-score-picker logic enough times. "
        f"Expected: >= {expected_requests}, Found: {len(decision_logs)}. "
        f"Log sample: {decision_logs[:3] if decision_logs else 'No logs found'}"
    )

    LOGGER.info("✓ Max-score-picker logic confirmed in scheduler logs")

    return True
0 commit comments