Skip to content

Commit 913f12f

Browse files
threcc and mwaykole authored
[RHOAIENG-46495] Implement llm-d CI configuration for disconnected cluster (#1316)
* add function to detect disconnected clusters
* update llm-d gateway fixture to reuse existing gateway when already present
* add fixture to skip s3 or HF models in disconnected
* remove repeated params
* handle URL for inference when running on disconnected cluster
* skip tests conditionally + linting
* add guard on empty host
* pr comments
* pr comments

---------

Signed-off-by: threcc <trecchiu@redhat.com>
Co-authored-by: Milind waykole <mwaykole@redhat.com>
1 parent 48088ee commit 913f12f

14 files changed

+127
-57
lines changed

tests/model_serving/model_server/conftest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
)
3737
from utilities.inference_utils import create_isvc
3838
from utilities.infra import (
39+
is_disconnected_cluster,
3940
s3_endpoint_secret,
4041
update_configmap_data,
4142
)
@@ -375,6 +376,13 @@ def model_car_inference_service(
375376
yield isvc
376377

377378

379+
@pytest.fixture(scope="session")
380+
def skip_if_disconnected(admin_client: DynamicClient) -> None:
381+
"""Skip test if running on a disconnected (air-gapped) cluster."""
382+
if is_disconnected_cluster(client=admin_client):
383+
pytest.skip("S3/HuggingFace storage not available on disconnected clusters")
384+
385+
378386
@pytest.fixture(scope="session")
379387
def skip_if_no_gpu_available(gpu_count_on_cluster: int) -> None:
380388
"""Skip test if no GPUs are available on the cluster."""

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from tests.model_serving.model_server.llmd.utils import wait_for_llmisvc, wait_for_llmisvc_pods_ready
2222
from utilities.constants import Timeout
2323
from utilities.infra import create_inference_token, s3_endpoint_secret, update_configmap_data
24-
from utilities.llmd_constants import LLMDGateway
2524
from utilities.llmd_utils import create_llmd_gateway
2625
from utilities.logger import RedactedString
2726

@@ -39,11 +38,7 @@ def shared_llmd_gateway(admin_client: DynamicClient) -> Generator[Gateway]:
3938
"""Shared LLMD gateway for all tests."""
4039
with create_llmd_gateway(
4140
client=admin_client,
42-
namespace=LLMDGateway.DEFAULT_NAMESPACE,
43-
gateway_class_name=LLMDGateway.DEFAULT_CLASS,
44-
wait_for_condition=True,
4541
timeout=Timeout.TIMEOUT_1MIN,
46-
teardown=True,
4742
) as gateway:
4843
yield gateway
4944

tests/model_serving/model_server/llmd/test_llmd_auth.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ def test_llmisvc_authorized(self, llmisvc_auth_pair):
3333

3434
for entry in [entry_a, entry_b]:
3535
status, body = send_chat_completions(
36-
llmisvc=entry.service, prompt=prompt, token=entry.token, insecure=False
36+
llmisvc=entry.service,
37+
prompt=prompt,
38+
token=entry.token,
39+
insecure=False,
3740
)
3841
assert status == 200, f"Authorized request failed with {status}: {body}"
3942
completion = parse_completion_text(response_body=body)

tests/model_serving/model_server/llmd/test_llmd_connection_cpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
],
2222
indirect=True,
2323
)
24-
@pytest.mark.usefixtures("valid_aws_config")
24+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_disconnected")
2525
class TestLlmdConnectionCpu:
2626
"""Deploy TinyLlama on CPU via S3 and HuggingFace and verify chat completions."""
2727

tests/model_serving/model_server/llmd/test_llmd_connection_gpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
],
2222
indirect=True,
2323
)
24-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
24+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available", "skip_if_disconnected")
2525
class TestLlmdConnectionGpu:
2626
"""Deploy Qwen on GPU via S3 and HuggingFace and verify chat completions."""
2727

tests/model_serving/model_server/llmd/test_llmd_no_scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def router_config(cls):
2626
[({"name": NAMESPACE}, S3GpuNoSchedulerConfig)],
2727
indirect=True,
2828
)
29-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
29+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available", "skip_if_disconnected")
3030
class TestLlmdNoScheduler:
3131
"""Deploy Qwen on GPU with the scheduler disabled and verify chat completions."""
3232

tests/model_serving/model_server/llmd/test_llmd_prefill_decode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
[({"name": NAMESPACE}, PrefillDecodeConfig)],
1919
indirect=True,
2020
)
21-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available")
21+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_no_gpu_available", "skip_if_disconnected")
2222
class TestLlmdPrefillDecode:
2323
"""Deploy Qwen on GPU with prefill-decode disaggregation and verify chat completions."""
2424

tests/model_serving/model_server/llmd/test_llmd_singlenode_estimated_prefix_cache.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
[({"name": NAMESPACE}, EstimatedPrefixCacheConfig)],
3131
indirect=True,
3232
)
33-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
33+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus", "skip_if_disconnected")
3434
class TestSingleNodeEstimatedPrefixCache:
3535
"""Deploy Qwen on GPU with 2 replicas and estimated prefix cache routing,
3636
then verify cache hits via Prometheus metrics.
@@ -58,7 +58,10 @@ def test_singlenode_estimated_prefix_cache(
5858
assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"
5959

6060
successful = send_prefix_cache_requests(
61-
llmisvc=llmisvc, prompt=PREFIX_CACHE_PROMPT, token=llmisvc_token, count=NUM_REQUESTS
61+
llmisvc=llmisvc,
62+
prompt=PREFIX_CACHE_PROMPT,
63+
token=llmisvc_token,
64+
count=NUM_REQUESTS,
6265
)
6366
assert successful == NUM_REQUESTS, f"Expected all {NUM_REQUESTS} requests to succeed, got {successful}"
6467

tests/model_serving/model_server/llmd/test_llmd_singlenode_precise_prefix_cache.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
[({"name": NAMESPACE}, PrecisePrefixCacheConfig)],
3232
indirect=True,
3333
)
34-
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus")
34+
@pytest.mark.usefixtures("valid_aws_config", "skip_if_less_than_2_gpus", "skip_if_disconnected")
3535
class TestSingleNodePrecisePrefixCache:
3636
"""Deploy Qwen on GPU with 2 replicas and precise prefix cache routing,
3737
then verify cache hits via Prometheus metrics.
@@ -60,7 +60,10 @@ def test_singlenode_precise_prefix_cache(
6060
assert len(workload_pods) == 2, f"Expected 2 workload pods, found {len(workload_pods)}"
6161

6262
successful = send_prefix_cache_requests(
63-
llmisvc=llmisvc, prompt=PREFIX_CACHE_PROMPT, token=llmisvc_token, count=NUM_REQUESTS
63+
llmisvc=llmisvc,
64+
prompt=PREFIX_CACHE_PROMPT,
65+
token=llmisvc_token,
66+
count=NUM_REQUESTS,
6467
)
6568
assert successful == NUM_REQUESTS, f"Expected all {NUM_REQUESTS} requests to succeed, got {successful}"
6669

tests/model_serving/model_server/llmd/utils.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,14 @@
1313
from ocp_resources.llm_inference_service import LLMInferenceService
1414
from ocp_resources.pod import Pod
1515
from ocp_resources.prometheus import Prometheus
16+
from ocp_resources.route import Route
1617
from pyhelper_utils.shell import run_command
1718
from timeout_sampler import retry
1819

1920
from utilities.certificates_utils import get_ca_bundle
2021
from utilities.constants import Timeout
22+
from utilities.infra import is_disconnected_cluster
23+
from utilities.llmd_constants import LLMDGateway, LLMEndpoint
2124
from utilities.monitoring import get_metrics_value
2225

2326
LOGGER = structlog.get_logger(name=__name__)
@@ -75,6 +78,32 @@ def _get_inference_url(llmisvc: LLMInferenceService) -> str:
7578
return f"http://{llmisvc.name}.{llmisvc.namespace}.svc.cluster.local"
7679

7780

81+
def _get_disconnected_inference_url(llmisvc: LLMInferenceService) -> str:
82+
"""Build inference URL using the gateway Route for disconnected clusters.
83+
84+
On disconnected clusters the gateway uses ClusterIP instead of LoadBalancer,
85+
so the internal service URL from LLMISVC status is not reachable from outside
86+
the cluster. This function resolves the URL via the gateway Route instead.
87+
"""
88+
route = Route(
89+
client=llmisvc.client,
90+
name=LLMDGateway.DEFAULT_NAME,
91+
namespace=LLMDGateway.DEFAULT_NAMESPACE,
92+
)
93+
if not route.exists:
94+
raise RuntimeError(
95+
f"Gateway Route {LLMDGateway.DEFAULT_NAME} not found in {LLMDGateway.DEFAULT_NAMESPACE}. "
96+
"Disconnected clusters require the gateway Route to be configured."
97+
)
98+
host = route.instance.spec.host
99+
if not host:
100+
raise RuntimeError(
101+
f"Gateway Route {LLMDGateway.DEFAULT_NAME} in {LLMDGateway.DEFAULT_NAMESPACE} "
102+
"has no host set. Ensure the Route is fully configured."
103+
)
104+
return f"https://{host}/{llmisvc.namespace}/{llmisvc.name}"
105+
106+
78107
def _build_chat_body(model_name: str, prompt: str, max_tokens: int = 50) -> str:
79108
"""Build OpenAI chat completion request body."""
80109
return json.dumps({
@@ -163,7 +192,12 @@ def send_chat_completions(
163192
insecure: bool = True,
164193
) -> tuple[int, str]:
165194
"""Send a chat completion request. Returns (status_code, response_body)."""
166-
url = _get_inference_url(llmisvc) + "/v1/chat/completions"
195+
base_url = (
196+
_get_disconnected_inference_url(llmisvc)
197+
if is_disconnected_cluster(llmisvc.client)
198+
else _get_inference_url(llmisvc)
199+
)
200+
url = base_url + LLMEndpoint.CHAT_COMPLETIONS
167201
model_name = _get_model_name(llmisvc=llmisvc)
168202
body = _build_chat_body(model_name=model_name, prompt=prompt)
169203
ca_cert = None if insecure else _resolve_ca_cert(llmisvc.client)
@@ -314,7 +348,12 @@ def send_prefix_cache_requests(
314348
successful = 0
315349
for i in range(count):
316350
try:
317-
status, _ = send_chat_completions(llmisvc=llmisvc, prompt=prompt, token=token, insecure=False)
351+
status, _ = send_chat_completions(
352+
llmisvc=llmisvc,
353+
prompt=prompt,
354+
token=token,
355+
insecure=False,
356+
)
318357
if status == 200:
319358
successful += 1
320359
except Exception:

0 commit comments

Comments (0)