|
13 | 13 | from ocp_resources.llm_inference_service import LLMInferenceService |
14 | 14 | from ocp_resources.pod import Pod |
15 | 15 | from ocp_resources.prometheus import Prometheus |
| 16 | +from ocp_resources.route import Route |
16 | 17 | from pyhelper_utils.shell import run_command |
17 | 18 | from timeout_sampler import retry |
18 | 19 |
|
19 | 20 | from utilities.certificates_utils import get_ca_bundle |
20 | 21 | from utilities.constants import Timeout |
| 22 | +from utilities.infra import is_disconnected_cluster |
| 23 | +from utilities.llmd_constants import LLMDGateway, LLMEndpoint |
21 | 24 | from utilities.monitoring import get_metrics_value |
22 | 25 |
|
23 | 26 | LOGGER = structlog.get_logger(name=__name__) |
@@ -75,6 +78,27 @@ def _get_inference_url(llmisvc: LLMInferenceService) -> str: |
75 | 78 | return f"http://{llmisvc.name}.{llmisvc.namespace}.svc.cluster.local" |
76 | 79 |
|
77 | 80 |
|
def _get_disconnected_inference_url(llmisvc: LLMInferenceService) -> str:
    """Resolve the externally reachable inference URL via the gateway Route.

    Disconnected clusters expose the gateway through a ClusterIP service
    rather than a LoadBalancer, so the in-cluster URL reported in the
    LLMISVC status cannot be reached from outside the cluster. The gateway
    Route's host is used to build the external URL instead.

    Raises:
        RuntimeError: if the expected gateway Route does not exist.
    """
    gateway_route = Route(
        client=llmisvc.client,
        name=LLMDGateway.DEFAULT_NAME,
        namespace=LLMDGateway.DEFAULT_NAMESPACE,
    )
    if gateway_route.exists:
        route_host = gateway_route.instance.spec.host
        return f"https://{route_host}/{llmisvc.namespace}/{llmisvc.name}"
    raise RuntimeError(
        f"Gateway Route {LLMDGateway.DEFAULT_NAME} not found in {LLMDGateway.DEFAULT_NAMESPACE}. "
        "Disconnected clusters require the gateway Route to be configured."
    )
| 101 | + |
78 | 102 | def _build_chat_body(model_name: str, prompt: str, max_tokens: int = 50) -> str: |
79 | 103 | """Build OpenAI chat completion request body.""" |
80 | 104 | return json.dumps({ |
@@ -163,7 +187,12 @@ def send_chat_completions( |
163 | 187 | insecure: bool = True, |
164 | 188 | ) -> tuple[int, str]: |
165 | 189 | """Send a chat completion request. Returns (status_code, response_body).""" |
166 | | - url = _get_inference_url(llmisvc) + "/v1/chat/completions" |
| 190 | + base_url = ( |
| 191 | + _get_disconnected_inference_url(llmisvc) |
| 192 | + if is_disconnected_cluster(llmisvc.client) |
| 193 | + else _get_inference_url(llmisvc) |
| 194 | + ) |
| 195 | + url = base_url + LLMEndpoint.CHAT_COMPLETIONS |
167 | 196 | model_name = _get_model_name(llmisvc=llmisvc) |
168 | 197 | body = _build_chat_body(model_name=model_name, prompt=prompt) |
169 | 198 | ca_cert = None if insecure else _resolve_ca_cert(llmisvc.client) |
@@ -314,7 +343,12 @@ def send_prefix_cache_requests( |
314 | 343 | successful = 0 |
315 | 344 | for i in range(count): |
316 | 345 | try: |
317 | | - status, _ = send_chat_completions(llmisvc=llmisvc, prompt=prompt, token=token, insecure=False) |
| 346 | + status, _ = send_chat_completions( |
| 347 | + llmisvc=llmisvc, |
| 348 | + prompt=prompt, |
| 349 | + token=token, |
| 350 | + insecure=False, |
| 351 | + ) |
318 | 352 | if status == 200: |
319 | 353 | successful += 1 |
320 | 354 | except Exception: |
|
0 commit comments