Skip to content

Commit 96b2a43

Browse files
committed
Use session-scoped pytest fixtures to check GPU availability before deploying an LLMInferenceService, replacing the inline skip checks previously duplicated in each test body
1 parent 3cbd728 commit 96b2a43

6 files changed

Lines changed: 33 additions & 48 deletions

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,16 @@ def s3_service_account(
8282
yield sa.name
8383

8484

85+
# ===========================================
86+
# GPU guards
87+
# ===========================================
88+
@pytest.fixture(scope="session")
89+
def skip_if_less_than_2_gpus(gpu_count_on_cluster: int) -> None:
90+
"""Skip test if fewer than 2 GPUs are available on the cluster."""
91+
if gpu_count_on_cluster < 2:
92+
pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")
93+
94+
8595
# ===========================================
8696
# LLMInferenceService creation
8797
# ===========================================

tests/model_serving/model_server/llmd/test_llmd_connection_gpu.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,25 +21,20 @@
2121
],
2222
indirect=True,
2323
)
24-
@pytest.mark.usefixtures("valid_aws_config")
24+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
2525
class TestLlmdConnectionGpu:
2626
"""Deploy Qwen on GPU via S3 and HuggingFace and verify chat completions."""
2727

2828
def test_llmd_connection_gpu(
2929
self,
3030
llmisvc: LLMInferenceService,
31-
gpu_count_on_cluster: int,
3231
):
3332
"""Test steps:
3433
35-
1. Skip if no GPU nodes are available on the cluster.
36-
2. Send a chat completion request to /v1/chat/completions.
37-
3. Assert the response status is 200.
38-
4. Assert the completion text contains the expected answer.
34+
1. Send a chat completion request to /v1/chat/completions.
35+
2. Assert the response status is 200.
36+
3. Assert the completion text contains the expected answer.
3937
"""
40-
if gpu_count_on_cluster < 1:
41-
pytest.skip("No GPUs available on cluster, skipping GPU test")
42-
4338
prompt = "What is the capital of Italy?"
4439
expected = "rome"
4540

tests/model_serving/model_server/llmd/test_llmd_no_scheduler.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,25 +26,20 @@ def router_config(cls):
2626
[({"name": NAMESPACE}, S3GpuNoSchedulerConfig)],
2727
indirect=True,
2828
)
29-
@pytest.mark.usefixtures("valid_aws_config")
29+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
3030
class TestLlmdNoScheduler:
3131
"""Deploy Qwen on GPU with the scheduler disabled and verify chat completions."""
3232

3333
def test_llmd_no_scheduler(
3434
self,
3535
llmisvc: LLMInferenceService,
36-
gpu_count_on_cluster: int,
3736
):
3837
"""Test steps:
3938
40-
1. Skip if no GPU nodes are available on the cluster.
41-
2. Send a chat completion request to /v1/chat/completions.
42-
3. Assert the response status is 200.
43-
4. Assert the completion text contains the expected answer.
39+
1. Send a chat completion request to /v1/chat/completions.
40+
2. Assert the response status is 200.
41+
3. Assert the completion text contains the expected answer.
4442
"""
45-
if gpu_count_on_cluster < 1:
46-
pytest.skip("No GPUs available on cluster, skipping GPU test")
47-
4843
prompt = "What is the capital of Italy?"
4944
expected = "rome"
5045

tests/model_serving/model_server/llmd/test_llmd_prefill_decode.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,20 @@
1818
[({"name": NAMESPACE}, PrefillDecodeConfig)],
1919
indirect=True,
2020
)
21-
@pytest.mark.usefixtures("valid_aws_config")
21+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
2222
class TestLlmdPrefillDecode:
2323
"""Deploy Qwen on GPU with prefill-decode disaggregation and verify chat completions."""
2424

2525
def test_llmd_prefill_decode(
2626
self,
2727
llmisvc: LLMInferenceService,
28-
gpu_count_on_cluster: int,
2928
):
3029
"""Test steps:
3130
32-
1. Skip if no GPU nodes are available on the cluster.
33-
2. Send a chat completion request to /v1/chat/completions.
34-
3. Assert the response status is 200.
35-
4. Assert the completion text contains the expected answer.
31+
1. Send a chat completion request to /v1/chat/completions.
32+
2. Assert the response status is 200.
33+
3. Assert the completion text contains the expected answer.
3634
"""
37-
if gpu_count_on_cluster < 1:
38-
pytest.skip("No GPUs available on cluster, skipping GPU test")
39-
4035
prompt = "What is the capital of Italy?"
4136
expected = "rome"
4237

tests/model_serving/model_server/llmd/test_llmd_singlenode_estimated_prefix_cache.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
[({"name": NAMESPACE}, EstimatedPrefixCacheConfig)],
3131
indirect=True,
3232
)
33-
@pytest.mark.usefixtures("valid_aws_config")
33+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
3434
class TestSingleNodeEstimatedPrefixCache:
3535
"""Deploy Qwen on GPU with 2 replicas and estimated prefix cache routing,
3636
then verify cache hits via Prometheus metrics.
@@ -41,20 +41,15 @@ def test_singlenode_estimated_prefix_cache(
4141
unprivileged_client: DynamicClient,
4242
llmisvc: LLMInferenceService,
4343
llmisvc_token: str,
44-
gpu_count_on_cluster: int,
4544
prometheus: Prometheus,
4645
):
4746
"""Test steps:
4847
49-
1. Skip if fewer than 2 GPU nodes are available on the cluster.
50-
2. Assert the router-scheduler pod exists and is Running.
51-
3. Assert exactly 2 workload pods are found.
52-
4. Send 20 chat completion requests with a shared long prompt.
53-
5. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
48+
1. Assert the router-scheduler pod exists and is Running.
49+
2. Assert exactly 2 workload pods are found.
50+
3. Send 20 chat completion requests with a shared long prompt.
51+
4. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
5452
"""
55-
if gpu_count_on_cluster < 2:
56-
pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")
57-
5853
router_pod = get_llmd_router_scheduler_pod(client=unprivileged_client, llmisvc=llmisvc)
5954
assert router_pod is not None, "Router-scheduler pod should exist"
6055
assert router_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

tests/model_serving/model_server/llmd/test_llmd_singlenode_precise_prefix_cache.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
[({"name": NAMESPACE}, PrecisePrefixCacheConfig)],
3232
indirect=True,
3333
)
34-
@pytest.mark.usefixtures("valid_aws_config")
34+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
3535
class TestSingleNodePrecisePrefixCache:
3636
"""Deploy Qwen on GPU with 2 replicas and precise prefix cache routing,
3737
then verify cache hits via Prometheus metrics.
@@ -42,21 +42,16 @@ def test_singlenode_precise_prefix_cache(
4242
unprivileged_client: DynamicClient,
4343
llmisvc: LLMInferenceService,
4444
llmisvc_token: str,
45-
gpu_count_on_cluster: int,
4645
prometheus: Prometheus,
4746
):
4847
"""Test steps:
4948
50-
1. Skip if fewer than 2 GPU nodes are available on the cluster.
51-
2. Assert the router-scheduler pod exists and is Running.
52-
3. Assert exactly 2 workload pods are found.
53-
4. Send 20 chat completion requests with a shared long prompt.
54-
5. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
55-
6. Assert the scheduler made at least the expected number of routing decisions.
49+
1. Assert the router-scheduler pod exists and is Running.
50+
2. Assert exactly 2 workload pods are found.
51+
3. Send 20 chat completion requests with a shared long prompt.
52+
4. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
53+
5. Assert the scheduler made at least the expected number of routing decisions.
5654
"""
57-
if gpu_count_on_cluster < 2:
58-
pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")
59-
6055
router_pod = get_llmd_router_scheduler_pod(client=unprivileged_client, llmisvc=llmisvc)
6156
assert router_pod is not None, "Router-scheduler pod should exist"
6257
assert router_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

0 commit comments

Comments
 (0)