opendatahub-io · jgarciao · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Apr 2, 2026
@@ -113,10 +113,6 @@ def _factory(provider_name: str) -> list[dict[str, Any]]:
             env_vars.append({"name": "MILVUS_CONSISTENCY_LEVEL", "value": "Bounded"})
         elif provider_name == "faiss":
             env_vars.append({"name": "ENABLE_FAISS", "value": "faiss"})
-            env_vars.append({
-                "name": "FAISS_KVSTORE_DB_PATH",
-                "value": "/opt/app-root/src/.llama/distributions/rh/sqlite_vec.db",
-            })
         elif provider_name == "pgvector":
             request.getfixturevalue(argname="pgvector_service")
             env_vars.append({"name": "ENABLE_PGVECTOR", "value": "true"})

@@ -1,4 +1,3 @@
-import os
 from collections.abc import Callable, Generator
 from typing import Any
 
@@ -18,8 +17,11 @@
 from ocp_resources.service import Service
 from semver import Version
 
+import utilities
 from tests.llama_stack.constants import (
+    HTTPS_PROXY,
     LLAMA_STACK_DISTRIBUTION_SECRET_DATA,
+    LLS_CLIENT_VERIFY_SSL,
     LLS_CORE_EMBEDDING_MODEL,
     LLS_CORE_EMBEDDING_PROVIDER_MODEL_ID,
     LLS_CORE_INFERENCE_MODEL,
@@ -71,13 +73,20 @@ def enabled_llama_stack_operator(dsc_resource: DataScienceCluster) -> Generator[
         yield dsc
 
 
+@pytest.fixture(scope="class")
+def is_disconnected_cluster(admin_client: DynamicClient) -> bool:
+    """Whether the target cluster is disconnected (air-gapped)."""
+    return utilities.infra.is_disconnected_cluster(client=admin_client)
+
+
 @pytest.fixture(scope="class")
 def llama_stack_server_config(
     request: FixtureRequest,
     pytestconfig: pytest.Config,
     distribution_name: str,
     vector_io_provider_deployment_config_factory: Callable[[str], list[dict[str, str]]],
     files_provider_config_factory: Callable[[str], list[dict[str, str]]],
+    is_disconnected_cluster: bool,
 ) -> dict[str, Any]:
     """
     Generate server configuration for LlamaStack distribution deployment and deploy vector I/O provider resources.
@@ -94,6 +103,7 @@ def llama_stack_server_config(
             and return their configuration environment variables
         files_provider_config_factory: Factory function to configure files storage providers
             and return their configuration environment variables
+        is_disconnected_cluster: Whether the target cluster is disconnected (air-gapped)
 
     Returns:
         Dict containing server configuration with the following structure:
@@ -141,7 +151,10 @@ def test_with_remote_milvus(llama_stack_server_config):
     """
 
     env_vars = []
+    tls_config: dict[str, Any] | None = None
     params = getattr(request, "param", {})
+    cpu_requests = "2"
+    cpu_limits = "4"
 
     # INFERENCE_MODEL
     if params.get("inference_model"):
@@ -191,8 +204,21 @@ def test_with_remote_milvus(llama_stack_server_config):
         env_vars.append({"name": "VLLM_EMBEDDING_MAX_TOKENS", "value": LLS_CORE_VLLM_EMBEDDING_MAX_TOKENS})
         env_vars.append({"name": "VLLM_EMBEDDING_TLS_VERIFY", "value": LLS_CORE_VLLM_EMBEDDING_TLS_VERIFY})
     elif embedding_provider == "sentence-transformers":
+        # Increase CPU limits to prevent timeouts when inserting files into vector stores
+        cpu_requests = "4"
+        cpu_limits = "8"
+
+        # Enable sentence-transformers embedding model
         env_vars.append({"name": "ENABLE_SENTENCE_TRANSFORMERS", "value": "true"})
         env_vars.append({"name": "EMBEDDING_PROVIDER", "value": "sentence-transformers"})
+
+        if is_disconnected_cluster:
+            # Workaround to fix sentence-transformer embeddings on disconnected (RHAIENG-1624)
+            env_vars.append({"name": "SENTENCE_TRANSFORMERS_HOME", "value": "/opt/app-root/src/.cache/huggingface/hub"})
+            env_vars.append({"name": "HF_HUB_OFFLINE", "value": "1"})
+            env_vars.append({"name": "TRANSFORMERS_OFFLINE", "value": "1"})
+            env_vars.append({"name": "HF_DATASETS_OFFLINE", "value": "1"})
+
     else:
         raise ValueError(f"Unsupported embeddings provider: {embedding_provider}")
 
@@ -229,11 +255,35 @@ def test_with_remote_milvus(llama_stack_server_config):
     env_vars_vector_io = vector_io_provider_deployment_config_factory(provider_name=vector_io_provider)
     env_vars.extend(env_vars_vector_io)
 
+    if is_disconnected_cluster and HTTPS_PROXY:
+        LOGGER.info(f"Setting proxy and tlsconfig configuration (https_proxy:{HTTPS_PROXY})")
+        env_vars.append({"name": "HTTPS_PROXY", "value": HTTPS_PROXY})
+
+        # The operator sets SSL_CERT_FILE automatically when tlsConfig.caBundle is
+        # configured, but the `requests` library (used by tiktoken to download
+        # tokenizer data) ignores SSL_CERT_FILE and only checks REQUESTS_CA_BUNDLE.
+        # Without this, tiktoken fails with SSL CERTIFICATE_VERIFY_FAILED when the
+        # proxy uses a self-signed certificate (e.g. in disconnected clusters).
+        env_vars.append({
+            "name": "REQUESTS_CA_BUNDLE",
+            "value": "/etc/ssl/certs/ca-bundle/ca-bundle.crt",
+        })
+
+        tls_config = {
+            "caBundle": {
+                "configMapName": "odh-trusted-ca-bundle",
+                "configMapKeys": [
+                    "ca-bundle.crt",  # CNO-injected cluster CAs
+                    "odh-ca-bundle.crt",  # User-specified custom CAs
+                ],
+            },
+        }
+
     server_config: dict[str, Any] = {
         "containerSpec": {
             "resources": {
-                "requests": {"cpu": "1", "memory": "3Gi"},
-                "limits": {"cpu": "3", "memory": "6Gi"},
+                "requests": {"cpu": cpu_requests, "memory": "3Gi"},
+                "limits": {"cpu": cpu_limits, "memory": "6Gi"},
             },
             "env": env_vars,
             "name": "llama-stack",
@@ -242,9 +292,15 @@ def test_with_remote_milvus(llama_stack_server_config):
         "distribution": {"name": "rh-dev"},
     }
 
+    if tls_config:
+        server_config["tlsConfig"] = tls_config
+
     if params.get("llama_stack_storage_size"):
-        storage_size = params.get("llama_stack_storage_size")
-        server_config["storage"] = {"size": storage_size}
+        if is_disconnected_cluster:
+            LOGGER.warning("Skipping storage_size configuration on disconnected clusters due to known bug RHAIENG-1819")
+        else:
+            storage_size = params.get("llama_stack_storage_size")
+            server_config["storage"] = {"size": storage_size}
 
     return server_config
 
@@ -593,14 +649,13 @@ def llama_stack_test_route(
 def _create_llama_stack_client(
     route: Route,
 ) -> Generator[LlamaStackClient, Any, Any]:
-    # LLS_CLIENT_VERIFY_SSL is false by default to be able to test with Self-Signed certificates
-    verifySSL = os.getenv("LLS_CLIENT_VERIFY_SSL", "false").lower() == "true"
-    http_client = httpx.Client(verify=verifySSL, timeout=240)
+    http_client = httpx.Client(verify=LLS_CLIENT_VERIFY_SSL, timeout=300)
     try:
         client = LlamaStackClient(
             base_url=f"https://{route.host}",
             max_retries=3,
             http_client=http_client,
+            timeout=300,
         )
         wait_for_llama_stack_client_ready(client=client)
         existing_file_ids = {f.id for f in client.files.list().data}

@@ -28,6 +28,10 @@ class ModelInfo(NamedTuple):
     embedding_dimension: int  # API returns integer (e.g., 768)
 
 
+# Proxy URL read from the SQUID_HTTPS_PROXY environment variable; empty string when unset
+HTTPS_PROXY: str = os.getenv("SQUID_HTTPS_PROXY", "")
+
+# LLS_CLIENT_VERIFY_SSL is false by default so tests can run against self-signed certificates;
+# only the value "true" (case-insensitive) enables verification
+LLS_CLIENT_VERIFY_SSL: bool = os.getenv("LLS_CLIENT_VERIFY_SSL", "false").lower() == "true"
 LLS_CORE_POD_FILTER: str = "app=llama-stack"
 LLS_OPENSHIFT_MINIMAL_VERSION: VersionInfo = semver.VersionInfo.parse("4.17.0")
 

@@ -103,7 +103,10 @@ def vector_store_create_and_poll(
         TimeoutError: If wait_timeout is reached while status is still in_progress.
     """
     vs_file = llama_stack_client.vector_stores.files.create(
-        vector_store_id=vector_store_id, file_id=file_id, attributes=attributes
+        vector_store_id=vector_store_id,
+        file_id=file_id,
+        timeout=240,  # Increased timeout for slow processing (e.g., sentence-transformers)
+        attributes=dict(attributes) if attributes else attributes,
     )
-    vs_file = llama_stack_client.vector_stores.files.create(
-        vector_store_id=vector_store_id, file_id=file_id, attributes=attributes
-        vector_store_id=vector_store_id,
-        file_id=file_id,
-        timeout=240,  # Increased timeout for slow processing (e.g., sentence-transformers)
-        attributes=dict(attributes) if attributes else attributes,
-    )
+    def vector_store_create_and_poll(
+        llama_stack_client: LlamaStackClient,
+        vector_store_id: str,
+        file_id: str,
+        *,
+        attributes: dict[str, str | int | float | bool] | None = None,
+        poll_interval_sec: float = 5.0,
+        wait_timeout: float = 240.0,
+        request_timeout: float | None = None,
+    ) -> VectorStoreFile:
+        create_timeout = wait_timeout if request_timeout is None else min(request_timeout, wait_timeout)
+        vs_file = llama_stack_client.vector_stores.files.create(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+            timeout=create_timeout,
+            attributes=dict(attributes) if attributes else attributes,
+        )
-    vs_file = llama_stack_client.vector_stores.files.create(
-        vector_store_id=vector_store_id, file_id=file_id, attributes=attributes
-        vector_store_id=vector_store_id,
-        file_id=file_id,
-        timeout=240,  # Increased timeout for slow processing (e.g., sentence-transformers)
-        attributes=dict(attributes) if attributes else attributes,
-    )
+    def vector_store_create_and_poll(
+        llama_stack_client: LlamaStackClient,
+        vector_store_id: str,
+        file_id: str,
+        *,
+        attributes: dict[str, str | int | float | bool] | None = None,
+        poll_interval_sec: float = 5.0,
+        wait_timeout: float = 240.0,
+        request_timeout: float | None = None,
+    ) -> VectorStoreFile:
+        create_timeout = wait_timeout if request_timeout is None else min(request_timeout, wait_timeout)
+        vs_file = llama_stack_client.vector_stores.files.create(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+            timeout=create_timeout,
+            attributes=dict(attributes) if attributes else attributes,
+        )
     terminal_statuses = ("completed", "failed", "cancelled")
     deadline = time.monotonic() + wait_timeout
@@ -155,7 +158,7 @@ def create_llama_stack_distribution(
 
 
 @retry(
-    wait_timeout=60,
+    wait_timeout=240,
     sleep=5,
     exceptions_dict={ResourceNotFoundError: [], UnexpectedResourceCountError: []},
 )