Skip to content

Commit 96b2a43

Browse files
committed
Use session-scoped pytest fixtures to check GPU availability before deploying an LLMInferenceService, replacing the inline skip checks previously duplicated in each test body
1 parent 3cbd728 commit 96b2a43

6 files changed

Lines changed: 33 additions & 48 deletions

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,16 @@ def s3_service_account(
8282
yield sa.name
8383

8484

85+
# ===========================================
86+
# GPU guards
87+
# ===========================================
88+
@pytest.fixture(scope="session")
89+
def skip_if_less_than_2_gpus(gpu_count_on_cluster: int) -> None:
90+
"""Skip test if fewer than 2 GPUs are available on the cluster."""
91+
if gpu_count_on_cluster < 2:
92+
pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")
93+
94+
8595
# ===========================================
8696
# LLMInferenceService creation
8797
# ===========================================

tests/model_serving/model_server/llmd/test_llmd_connection_gpu.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,25 +21,20 @@
2121
],
2222
indirect=True,
2323
)
24-
@pytest.mark.usefixtures("valid_aws_config")
24+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
2525
class TestLlmdConnectionGpu:
2626
"""Deploy Qwen on GPU via S3 and HuggingFace and verify chat completions."""
2727

2828
def test_llmd_connection_gpu(
2929
self,
3030
llmisvc: LLMInferenceService,
31-
gpu_count_on_cluster: int,
3231
):
3332
"""Test steps:
3433
35-
1. Skip if no GPU nodes are available on the cluster.
36-
2. Send a chat completion request to /v1/chat/completions.
37-
3. Assert the response status is 200.
38-
4. Assert the completion text contains the expected answer.
34+
1. Send a chat completion request to /v1/chat/completions.
35+
2. Assert the response status is 200.
36+
3. Assert the completion text contains the expected answer.
3937
"""
40-
if gpu_count_on_cluster < 1:
41-
pytest.skip("No GPUs available on cluster, skipping GPU test")
42-
4338
prompt = "What is the capital of Italy?"
4439
expected = "rome"
4540

tests/model_serving/model_server/llmd/test_llmd_no_scheduler.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,25 +26,20 @@ def router_config(cls):
2626
[({"name": NAMESPACE}, S3GpuNoSchedulerConfig)],
2727
indirect=True,
2828
)
29-
@pytest.mark.usefixtures("valid_aws_config")
29+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
3030
class TestLlmdNoScheduler:
3131
"""Deploy Qwen on GPU with the scheduler disabled and verify chat completions."""
3232

3333
def test_llmd_no_scheduler(
3434
self,
3535
llmisvc: LLMInferenceService,
36-
gpu_count_on_cluster: int,
3736
):
3837
"""Test steps:
3938
40-
1. Skip if no GPU nodes are available on the cluster.
41-
2. Send a chat completion request to /v1/chat/completions.
42-
3. Assert the response status is 200.
43-
4. Assert the completion text contains the expected answer.
39+
1. Send a chat completion request to /v1/chat/completions.
40+
2. Assert the response status is 200.
41+
3. Assert the completion text contains the expected answer.
4442
"""
45-
if gpu_count_on_cluster < 1:
46-
pytest.skip("No GPUs available on cluster, skipping GPU test")
47-
4843
prompt = "What is the capital of Italy?"
4944
expected = "rome"
5045

tests/model_serving/model_server/llmd/test_llmd_prefill_decode.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,20 @@
1818
[({"name": NAMESPACE}, PrefillDecodeConfig)],
1919
indirect=True,
2020
)
21-
@pytest.mark.usefixtures("valid_aws_config")
21+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
2222
class TestLlmdPrefillDecode:
2323
"""Deploy Qwen on GPU with prefill-decode disaggregation and verify chat completions."""
2424

2525
def test_llmd_prefill_decode(
2626
self,
2727
llmisvc: LLMInferenceService,
28-
gpu_count_on_cluster: int,
2928
):
3029
"""Test steps:
3130
32-
1. Skip if no GPU nodes are available on the cluster.
33-
2. Send a chat completion request to /v1/chat/completions.
34-
3. Assert the response status is 200.
35-
4. Assert the completion text contains the expected answer.
31+
1. Send a chat completion request to /v1/chat/completions.
32+
2. Assert the response status is 200.
33+
3. Assert the completion text contains the expected answer.
3634
"""
37-
if gpu_count_on_cluster < 1:
38-
pytest.skip("No GPUs available on cluster, skipping GPU test")
39-
4035
prompt = "What is the capital of Italy?"
4136
expected = "rome"
4237

tests/model_serving/model_server/llmd/test_llmd_singlenode_estimated_prefix_cache.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
[({"name": NAMESPACE}, EstimatedPrefixCacheConfig)],
3131
indirect=True,
3232
)
33-
@pytest.mark.usefixtures("valid_aws_config")
33+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
3434
class TestSingleNodeEstimatedPrefixCache:
3535
"""Deploy Qwen on GPU with 2 replicas and estimated prefix cache routing,
3636
then verify cache hits via Prometheus metrics.
@@ -41,20 +41,15 @@ def test_singlenode_estimated_prefix_cache(
4141
unprivileged_client: DynamicClient,
4242
llmisvc: LLMInferenceService,
4343
llmisvc_token: str,
44-
gpu_count_on_cluster: int,
4544
prometheus: Prometheus,
4645
):
4746
"""Test steps:
4847
49-
1. Skip if fewer than 2 GPU nodes are available on the cluster.
50-
2. Assert the router-scheduler pod exists and is Running.
51-
3. Assert exactly 2 workload pods are found.
52-
4. Send 20 chat completion requests with a shared long prompt.
53-
5. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
48+
1. Assert the router-scheduler pod exists and is Running.
49+
2. Assert exactly 2 workload pods are found.
50+
3. Send 20 chat completion requests with a shared long prompt.
51+
4. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
5452
"""
55-
if gpu_count_on_cluster < 2:
56-
pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")
57-
5853
router_pod = get_llmd_router_scheduler_pod(client=unprivileged_client, llmisvc=llmisvc)
5954
assert router_pod is not None, "Router-scheduler pod should exist"
6055
assert router_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

tests/model_serving/model_server/llmd/test_llmd_singlenode_precise_prefix_cache.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
[({"name": NAMESPACE}, PrecisePrefixCacheConfig)],
3232
indirect=True,
3333
)
34-
@pytest.mark.usefixtures("valid_aws_config")
34+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
3535
class TestSingleNodePrecisePrefixCache:
3636
"""Deploy Qwen on GPU with 2 replicas and precise prefix cache routing,
3737
then verify cache hits via Prometheus metrics.
@@ -42,21 +42,16 @@ def test_singlenode_precise_prefix_cache(
4242
unprivileged_client: DynamicClient,
4343
llmisvc: LLMInferenceService,
4444
llmisvc_token: str,
45-
gpu_count_on_cluster: int,
4645
prometheus: Prometheus,
4746
):
4847
"""Test steps:
4948
50-
1. Skip if fewer than 2 GPU nodes are available on the cluster.
51-
2. Assert the router-scheduler pod exists and is Running.
52-
3. Assert exactly 2 workload pods are found.
53-
4. Send 20 chat completion requests with a shared long prompt.
54-
5. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
55-
6. Assert the scheduler made at least the expected number of routing decisions.
49+
1. Assert the router-scheduler pod exists and is Running.
50+
2. Assert exactly 2 workload pods are found.
51+
3. Send 20 chat completion requests with a shared long prompt.
52+
4. Query Prometheus and assert all traffic was routed to a single pod with correct prefix cache hit counts.
53+
5. Assert the scheduler made at least the expected number of routing decisions.
5654
"""
57-
if gpu_count_on_cluster < 2:
58-
pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")
59-
6055
router_pod = get_llmd_router_scheduler_pod(client=unprivileged_client, llmisvc=llmisvc)
6156
assert router_pod is not None, "Router-scheduler pod should exist"
6257
assert router_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

0 commit comments

Comments
 (0)