Skip to content

Commit 913f12f

Browse files
threcc and mwaykole authored
[RHOAIENG-46495] Implement llm-d CI configuration for disconnected cluster (#1316)
* add function to detect disconnected clusters
* update llm-d gateway fixture to reuse existing gateway when already present
* add fixture to skip s3 or HF models in disconnected
* remove repeated params
* handle URL for inference when running on disconnected cluster
* skip tests conditionally + linting
* add guard on empty host
* pr comments
* pr comments

---------

Signed-off-by: threcc <trecchiu@redhat.com>
Co-authored-by: Milind waykole <mwaykole@redhat.com>
1 parent 48088ee commit 913f12f

14 files changed

+127
-57
lines changed

tests/model_serving/model_server/conftest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
)
3737
from utilities.inference_utils import create_isvc
3838
from utilities.infra import (
39+
is_disconnected_cluster,
3940
s3_endpoint_secret,
4041
update_configmap_data,
4142
)
@@ -375,6 +376,13 @@ def model_car_inference_service(
375376
yield isvc
376377

377378

379+
@pytest.fixture(scope="session")
380+
def skip_if_disconnected(admin_client: DynamicClient) -> None:
381+
"""Skip test if running on a disconnected (air-gapped) cluster."""
382+
if is_disconnected_cluster(client=admin_client):
383+
pytest.skip("S3/HuggingFace storage not available on disconnected clusters")
384+
385+
378386
@pytest.fixture(scope="session")
379387
def skip_if_no_gpu_available(gpu_count_on_cluster: int) -> None:
380388
"""Skip test if no GPUs are available on the cluster."""

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from tests.model_serving.model_server.llmd.utils import wait_for_llmisvc, wait_for_llmisvc_pods_ready
2222
from utilities.constants import Timeout
2323
from utilities.infra import create_inference_token, s3_endpoint_secret, update_configmap_data
24-
from utilities.llmd_constants import LLMDGateway
2524
from utilities.llmd_utils import create_llmd_gateway
2625
from utilities.logger import RedactedString
2726

@@ -39,11 +38,7 @@ def shared_llmd_gateway(admin_client: DynamicClient) -> Generator[Gateway]:
3938
"""Shared LLMD gateway for all tests."""
4039
with create_llmd_gateway(
4140
client=admin_client,
42-
namespace=LLMDGateway.DEFAULT_NAMESPACE,
43-
gateway_class_name=LLMDGateway.DEFAULT_CLASS,
44-
wait_for_condition=True,
4541
timeout=Timeout.TIMEOUT_1MIN,
46-
teardown=True,
4742
) as gateway:
4843
yield gateway
4944

tests/model_serving/model_server/llmd/test_llmd_auth.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ def test_llmisvc_authorized(self, llmisvc_auth_pair):
3333

3434
for entry in [entry_a, entry_b]:
3535
status, body = send_chat_completions(
36-
llmisvc=entry.service, prompt=prompt, token=entry.token, insecure=False
36+
llmisvc=entry.service,
37+
prompt=prompt,
38+
token=entry.token,
39+
insecure=False,
3740
)
3841
assert status == 200, f"Authorized request failed with {status}: {body}"
3942
completion = parse_completion_text(response_body=body)

tests/model_serving/model_server/llmd/test_llmd_connection_cpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
],
2222
indirect=True,
2323
)
24-
@pytest.mark.usefixtures("valid_aws_config")
24+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_disconnected")
2525
class TestLlmdConnectionCpu:
2626
"""Deploy TinyLlama on CPU via S3 and HuggingFace and verify chat completions."""
2727

tests/model_serving/model_server/llmd/test_llmd_connection_gpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
],
2222
indirect=True,
2323
)
24-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
24+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available", "skip_if_disconnected")
2525
class TestLlmdConnectionGpu:
2626
"""Deploy Qwen on GPU via S3 and HuggingFace and verify chat completions."""
2727

tests/model_serving/model_server/llmd/test_llmd_no_scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def router_config(cls):
2626
[({"name": NAMESPACE}, S3GpuNoSchedulerConfig)],
2727
indirect=True,
2828
)
29-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
29+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available", "skip_if_disconnected")
3030
class TestLlmdNoScheduler:
3131
"""Deploy Qwen on GPU with the scheduler disabled and verify chat completions."""
3232

tests/model_serving/model_server/llmd/test_llmd_prefill_decode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
[({"name": NAMESPACE}, PrefillDecodeConfig)],
1919
indirect=True,
2020
)
21-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
21+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available", "skip_if_disconnected")
2222
class TestLlmdPrefillDecode:
2323
"""Deploy Qwen on GPU with prefill-decode disaggregation and verify chat completions."""
2424

tests/model_serving/model_server/llmd/test_llmd_singlenode_estimated_prefix_cache.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
[({"name": NAMESPACE}, EstimatedPrefixCacheConfig)],
3131
indirect=True,
3232
)
33-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
33+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus", "skip_if_disconnected")
3434
class TestSingleNodeEstimatedPrefixCache:
3535
"""Deploy Qwen on GPU with 2 replicas and estimated prefix cache routing,
3636
then verify cache hits via Prometheus metrics.
@@ -58,7 +58,10 @@ def test_singlenode_estimated_prefix_cache(
5858
assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"
5959

6060
successful = send_prefix_cache_requests(
61-
llmisvc=llmisvc, prompt=PREFIX_CACHE_PROMPT, token=llmisvc_token, count=NUM_REQUESTS
61+
llmisvc=llmisvc,
62+
prompt=PREFIX_CACHE_PROMPT,
63+
token=llmisvc_token,
64+
count=NUM_REQUESTS,
6265
)
6366
assert successful == NUM_REQUESTS, f"Expected all {NUM_REQUESTS} requests to succeed, got {successful}"
6467

tests/model_serving/model_server/llmd/test_llmd_singlenode_precise_prefix_cache.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
[({"name": NAMESPACE}, PrecisePrefixCacheConfig)],
3232
indirect=True,
3333
)
34-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
34+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus", "skip_if_disconnected")
3535
class TestSingleNodePrecisePrefixCache:
3636
"""Deploy Qwen on GPU with 2 replicas and precise prefix cache routing,
3737
then verify cache hits via Prometheus metrics.
@@ -60,7 +60,10 @@ def test_singlenode_precise_prefix_cache(
6060
assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"
6161

6262
successful = send_prefix_cache_requests(
63-
llmisvc=llmisvc, prompt=PREFIX_CACHE_PROMPT, token=llmisvc_token, count=NUM_REQUESTS
63+
llmisvc=llmisvc,
64+
prompt=PREFIX_CACHE_PROMPT,
65+
token=llmisvc_token,
66+
count=NUM_REQUESTS,
6467
)
6568
assert successful == NUM_REQUESTS, f"Expected all {NUM_REQUESTS} requests to succeed, got {successful}"
6669

tests/model_serving/model_server/llmd/utils.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,14 @@
1313
from ocp_resources.llm_inference_service import LLMInferenceService
1414
from ocp_resources.pod import Pod
1515
from ocp_resources.prometheus import Prometheus
16+
from ocp_resources.route import Route
1617
from pyhelper_utils.shell import run_command
1718
from timeout_sampler import retry
1819

1920
from utilities.certificates_utils import get_ca_bundle
2021
from utilities.constants import Timeout
22+
from utilities.infra import is_disconnected_cluster
23+
from utilities.llmd_constants import LLMDGateway, LLMEndpoint
2124
from utilities.monitoring import get_metrics_value
2225

2326
LOGGER = structlog.get_logger(name=__name__)
@@ -75,6 +78,32 @@ def _get_inference_url(llmisvc: LLMInferenceService) -> str:
7578
return f"http://{llmisvc.name}.{llmisvc.namespace}.svc.cluster.local"
7679

7780

81+
def _get_disconnected_inference_url(llmisvc: LLMInferenceService) -> str:
82+
"""Build inference URL using the gateway Route for disconnected clusters.
83+
84+
On disconnected clusters the gateway uses ClusterIP instead of LoadBalancer,
85+
so the internal service URL from LLMISVC status is not reachable from outside
86+
the cluster. This function resolves the URL via the gateway Route instead.
87+
"""
88+
route = Route(
89+
client=llmisvc.client,
90+
name=LLMDGateway.DEFAULT_NAME,
91+
namespace=LLMDGateway.DEFAULT_NAMESPACE,
92+
)
93+
if not route.exists:
94+
raise RuntimeError(
95+
f"Gateway Route {LLMDGateway.DEFAULT_NAME} not found in {LLMDGateway.DEFAULT_NAMESPACE}. "
96+
"Disconnected clusters require the gateway Route to be configured."
97+
)
98+
host = route.instance.spec.host
99+
if not host:
100+
raise RuntimeError(
101+
f"Gateway Route {LLMDGateway.DEFAULT_NAME} in {LLMDGateway.DEFAULT_NAMESPACE} "
102+
"has no host set. Ensure the Route is fully configured."
103+
)
104+
return f"https://{host}/{llmisvc.namespace}/{llmisvc.name}"
105+
106+
78107
def _build_chat_body(model_name: str, prompt: str, max_tokens: int = 50) -> str:
79108
"""Build OpenAI chat completion request body."""
80109
return json.dumps({
@@ -163,7 +192,12 @@ def send_chat_completions(
163192
insecure: bool = True,
164193
) -> tuple[int, str]:
165194
"""Send a chat completion request. Returns (status_code, response_body)."""
166-
url = _get_inference_url(llmisvc) + "/v1/chat/completions"
195+
base_url = (
196+
_get_disconnected_inference_url(llmisvc)
197+
if is_disconnected_cluster(llmisvc.client)
198+
else _get_inference_url(llmisvc)
199+
)
200+
url = base_url + LLMEndpoint.CHAT_COMPLETIONS
167201
model_name = _get_model_name(llmisvc=llmisvc)
168202
body = _build_chat_body(model_name=model_name, prompt=prompt)
169203
ca_cert = None if insecure else _resolve_ca_cert(llmisvc.client)
@@ -314,7 +348,12 @@ def send_prefix_cache_requests(
314348
successful = 0
315349
for i in range(count):
316350
try:
317-
status, _ = send_chat_completions(llmisvc=llmisvc, prompt=prompt, token=token, insecure=False)
351+
status, _ = send_chat_completions(
352+
llmisvc=llmisvc,
353+
prompt=prompt,
354+
token=token,
355+
insecure=False,
356+
)
318357
if status == 200:
319358
successful += 1
320359
except Exception:

0 commit comments

Comments (0)