|
13 | 13 | from ocp_resources.llm_inference_service import LLMInferenceService |
14 | 14 | from ocp_resources.pod import Pod |
15 | 15 | from ocp_resources.prometheus import Prometheus |
| 16 | +from ocp_resources.route import Route |
16 | 17 | from pyhelper_utils.shell import run_command |
17 | 18 | from timeout_sampler import retry |
18 | 19 |
|
19 | 20 | from utilities.certificates_utils import get_ca_bundle |
20 | 21 | from utilities.constants import Timeout |
| 22 | +from utilities.infra import is_disconnected_cluster |
| 23 | +from utilities.llmd_constants import LLMDGateway, LLMEndpoint |
21 | 24 | from utilities.monitoring import get_metrics_value |
22 | 25 |
|
23 | 26 | LOGGER = structlog.get_logger(name=__name__) |
@@ -75,6 +78,27 @@ def _get_inference_url(llmisvc: LLMInferenceService) -> str: |
75 | 78 | return f"http://{llmisvc.name}.{llmisvc.namespace}.svc.cluster.local" |
76 | 79 |
|
77 | 80 |
|
def _get_disconnected_inference_url(llmisvc: LLMInferenceService) -> str:
    """Resolve the externally reachable inference URL via the gateway Route.

    Disconnected clusters expose the gateway through a ClusterIP service
    rather than a LoadBalancer, so the in-cluster URL reported in the
    LLMISVC status cannot be reached from outside the cluster. The gateway
    Route's host is used to build the external URL instead.

    Raises:
        RuntimeError: if the expected gateway Route does not exist.
    """
    gateway_route = Route(
        client=llmisvc.client,
        name=LLMDGateway.DEFAULT_NAME,
        namespace=LLMDGateway.DEFAULT_NAMESPACE,
    )
    if gateway_route.exists:
        route_host = gateway_route.instance.spec.host
        return f"https://{route_host}/{llmisvc.namespace}/{llmisvc.name}"
    raise RuntimeError(
        f"Gateway Route {LLMDGateway.DEFAULT_NAME} not found in {LLMDGateway.DEFAULT_NAMESPACE}. "
        "Disconnected clusters require the gateway Route to be configured."
    )
| 101 | + |
78 | 102 | def _build_chat_body(model_name: str, prompt: str, max_tokens: int = 50) -> str: |
79 | 103 | """Build OpenAI chat completion request body.""" |
80 | 104 | return json.dumps({ |
@@ -163,7 +187,12 @@ def send_chat_completions( |
163 | 187 | insecure: bool = True, |
164 | 188 | ) -> tuple[int, str]: |
165 | 189 | """Send a chat completion request. Returns (status_code, response_body).""" |
166 | | - url = _get_inference_url(llmisvc) + "/v1/chat/completions" |
| 190 | + base_url = ( |
| 191 | + _get_disconnected_inference_url(llmisvc) |
| 192 | + if is_disconnected_cluster(llmisvc.client) |
| 193 | + else _get_inference_url(llmisvc) |
| 194 | + ) |
| 195 | + url = base_url + LLMEndpoint.CHAT_COMPLETIONS |
167 | 196 | model_name = _get_model_name(llmisvc=llmisvc) |
168 | 197 | body = _build_chat_body(model_name=model_name, prompt=prompt) |
169 | 198 | ca_cert = None if insecure else _resolve_ca_cert(llmisvc.client) |
@@ -314,7 +343,12 @@ def send_prefix_cache_requests( |
314 | 343 | successful = 0 |
315 | 344 | for i in range(count): |
316 | 345 | try: |
317 | | - status, _ = send_chat_completions(llmisvc=llmisvc, prompt=prompt, token=token, insecure=False) |
| 346 | + status, _ = send_chat_completions( |
| 347 | + llmisvc=llmisvc, |
| 348 | + prompt=prompt, |
| 349 | + token=token, |
| 350 | + insecure=False, |
| 351 | + ) |
318 | 352 | if status == 200: |
319 | 353 | successful += 1 |
320 | 354 | except Exception: |
|
0 commit comments