3131 [({"name" : NAMESPACE }, PrecisePrefixCacheConfig )],
3232 indirect = True ,
3333)
34- @pytest .mark .usefixtures ("valid_aws_config" )
34+ @pytest .mark .usefixtures ("valid_aws_config" , "skip_if_less_than_2_gpus" )
3535class TestSingleNodePrecisePrefixCache :
3636 """Deploy Qwen on GPU with 2 replicas and precise prefix cache routing,
3737 then verify cache hits via Prometheus metrics.
@@ -42,21 +42,16 @@ def test_singlenode_precise_prefix_cache(
4242 unprivileged_client : DynamicClient ,
4343 llmisvc : LLMInferenceService ,
4444 llmisvc_token : str ,
45- gpu_count_on_cluster : int ,
4645 prometheus : Prometheus ,
4746 ):
4847 """Test steps:
4948
50- 1. Skip if fewer than 2 GPU nodes are available on the cluster.
51- 2. Assert the router-scheduler pod exists and is Running.
52- 3. Assert exactly 2 workload pods are found.
53- 4. Send 20 chat completion requests with a shared long prompt.
54- 5. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
55- 6. Assert the scheduler made at least the expected number of routing decisions.
49+ 1. Assert the router-scheduler pod exists and is Running.
50+ 2. Assert exactly 2 workload pods are found.
51+ 3. Send 20 chat completion requests with a shared long prompt.
52+ 4. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
53+ 5. Assert the scheduler made at least the expected number of routing decisions.
5654 """
57- if gpu_count_on_cluster < 2 :
58- pytest .skip (f"Test requires at least 2 GPUs (found { gpu_count_on_cluster } )" )
59-
6055 router_pod = get_llmd_router_scheduler_pod (client = unprivileged_client , llmisvc = llmisvc )
6156 assert router_pod is not None , "Router-scheduler pod should exist"
6257 assert router_pod .instance .status .phase == "Running" , "Router-scheduler pod should be running"
0 commit comments