Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions tests/cluster-deployment/Containerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# This Containerfile is used to build the llama-stack-provider-ragas-distro-image for the e2e tests.
#
# Layer ordering is deliberate: heavy, rarely-changing layers (torch,
# model download) come first so that routine source-code changes only
# invalidate the cheap COPY/install layers at the bottom.

FROM python:3.12-slim

WORKDIR /app

# Install uv by copying the static binaries from the official image
# (version pinned for reproducible builds).
COPY --from=ghcr.io/astral-sh/uv:0.9.21 /uv /uvx /bin/

# Create a venv and make it the default Python for subsequent steps.
# Putting the venv's bin dir first on PATH makes every later RUN and the
# container ENTRYPOINT resolve `python` to the venv interpreter.
RUN uv venv /app/.venv
ENV VIRTUAL_ENV=/app/.venv
ENV PATH="/app/.venv/bin:${PATH}"

# Install sentence-transformers + torch (cached layer — these rarely change).
# The CPU wheel index avoids pulling large CUDA builds of torch.
RUN uv pip install --python /app/.venv/bin/python \
--extra-index-url https://download.pytorch.org/whl/cpu \
torch sentence-transformers einops tokenizers safetensors

# Pre-download the embedding model so no HF fetch is needed at runtime.
# Pass --build-arg HF_TOKEN=hf_... to avoid rate limits.
# NOTE(review): build ARG values are recorded in image metadata/history;
# acceptable for a short-lived read token, but consider a BuildKit secret
# mount (--mount=type=secret) if the token must stay private — confirm.
ARG HF_TOKEN=""
RUN HF_TOKEN=${HF_TOKEN} python -c "from huggingface_hub import snapshot_download; snapshot_download('nomic-ai/nomic-embed-text-v1.5')"

# Copy code (changes frequently — kept after heavy layers for caching).
COPY src /app/src
COPY distribution /app/distribution
COPY pyproject.toml /app/pyproject.toml
COPY uv.lock /app/uv.lock
COPY README.md /app/README.md

# Install the project into the venv.
RUN uv pip install --python /app/.venv/bin/python -e ".[remote,distro]"

EXPOSE 8321

# --no-sync: run against the already-populated venv instead of resolving
# and installing dependencies at container startup.
ENTRYPOINT ["uv", "run", "--no-sync", "llama", "stack", "run", "distribution/run.yaml"]
297 changes: 297 additions & 0 deletions tests/cluster-deployment/deploy-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
#!/usr/bin/env bash
#
# Deploy the llama-stack-provider-ragas e2e test environment on an OpenShift cluster.
#
# Usage:
#   ./deploy-e2e.sh --build
#   ./deploy-e2e.sh --image <image-ref>
#
# Reads credentials from ../../.env (repo root) and creates a single
# 'ragas-env' k8s secret from it.
#
# Prerequisites:
#   - oc CLI installed and logged into an OpenShift cluster
#   - podman (only required for --build mode)
#

# -e: abort on the first unhandled failure.
# pipefail: make a pipeline fail if ANY stage fails — with plain `set -e`
# only the last stage is checked, so e.g. `sed ... | oc apply -f -` would
# silently succeed even when sed itself failed.
set -eo pipefail

# Absolute directory containing this script, so it can be invoked from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${SCRIPT_DIR}/../.."
readonly SCRIPT_DIR REPO_ROOT

# Image name pushed to the registry and the namespace all e2e resources use.
IMAGE_NAME="llama-stack-provider-ragas-distro-image"
NAMESPACE="ragas-test"
readonly IMAGE_NAME NAMESPACE

# ---------------------------------------------------------------------------
# Parse arguments
# ---------------------------------------------------------------------------
# Exactly one mode is required: --build (build+push locally) or
# --image <ref> (use a pre-built image).
MODE=""
IMAGE_REF=""

while [[ $# -gt 0 ]]; do
  arg="$1"
  shift
  case "${arg}" in
    --build)
      MODE="build"
      ;;
    --image)
      MODE="image"
      IMAGE_REF="${1:-}"
      if [[ -z "${IMAGE_REF}" ]]; then
        echo "Error: --image requires an image reference argument."
        exit 1
      fi
      shift
      ;;
    *)
      echo "Unknown option: ${arg}"
      echo "Usage: $0 --build | --image <image-ref>"
      exit 1
      ;;
  esac
done

if [[ -z "${MODE}" ]]; then
  echo "Usage: $0 --build | --image <image-ref>"
  exit 1
fi

# ---------------------------------------------------------------------------
# Prerequisites
# ---------------------------------------------------------------------------
# Fail fast with guard clauses: oc must exist and an active login is required.
echo "Checking prerequisites..."

command -v oc &> /dev/null || {
  echo "Error: oc is not installed."
  exit 1
}

oc whoami &> /dev/null || {
  echo "Error: Not logged into an OpenShift cluster. Run 'oc login' first."
  exit 1
}

echo " Logged in as: $(oc whoami)"
echo " Cluster: $(oc whoami --show-server)"

# ---------------------------------------------------------------------------
# Resolve image
# ---------------------------------------------------------------------------
# In --build mode: build the image for the CLUSTER's architecture, expose the
# internal registry, push, and compute the in-cluster pull reference.
# In --image mode: trust the caller-supplied reference as-is.
if [[ "${MODE}" == "build" ]]; then
  if ! command -v podman &> /dev/null; then
    echo "Error: podman is not installed (required for --build)."
    exit 1
  fi

  echo ""
  echo "=== Building image from Containerfile ==="

  # Detect cluster node architecture (not local host arch) so the image can
  # actually run on the nodes; fall back to amd64 if the lookup fails.
  NODE_ARCH=$(oc get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}' 2>/dev/null || echo "amd64")
  case "${NODE_ARCH}" in
    amd64) PLATFORM="linux/amd64" ;;
    arm64) PLATFORM="linux/arm64" ;;
    *) echo "Warning: unknown cluster architecture ${NODE_ARCH}, defaulting to linux/amd64"; PLATFORM="linux/amd64" ;;
  esac
  echo " Cluster node architecture: ${NODE_ARCH} -> ${PLATFORM}"

  # Build the image from the repo root so COPY src/distribution/... resolve.
  LOCAL_TAG="${IMAGE_NAME}:latest"
  echo " Building ${LOCAL_TAG}..."
  podman build --platform "${PLATFORM}" \
    --build-arg HF_TOKEN="${HF_TOKEN:-}" \
    -t "${LOCAL_TAG}" \
    -f "${SCRIPT_DIR}/Containerfile" "${REPO_ROOT}"

  # Expose the OpenShift internal registry route (idempotent; errors are
  # ignored because the route may already be enabled).
  echo " Exposing OpenShift internal registry..."
  oc patch configs.imageregistry.operator.openshift.io/cluster \
    --type=merge --patch '{"spec":{"defaultRoute":true}}' 2>/dev/null || true

  # Wait briefly for the route to appear. REGISTRY_ROUTE is initialized first
  # so a value inherited from the caller's environment cannot slip past the
  # emptiness check below when every lookup attempt fails (bug fix).
  REGISTRY_ROUTE=""
  for _ in $(seq 1 12); do
    REGISTRY_ROUTE=$(oc get route default-route -n openshift-image-registry \
      --template='{{ .spec.host }}' 2>/dev/null) && break
    sleep 5
  done

  if [[ -z "${REGISTRY_ROUTE}" ]]; then
    echo "Error: Could not determine the OpenShift internal registry route."
    exit 1
  fi
  echo " Registry route: ${REGISTRY_ROUTE}"

  # Login to the registry with the current session token.
  echo " Logging into registry..."
  podman login --tls-verify=false -u "$(oc whoami)" -p "$(oc whoami -t)" "${REGISTRY_ROUTE}"

  # Ensure the namespace exists before pushing (registry needs the namespace/project)
  oc create namespace "${NAMESPACE}" 2>/dev/null || true

  # Tag and push via the external route.
  REMOTE_TAG="${REGISTRY_ROUTE}/${NAMESPACE}/${IMAGE_NAME}:latest"
  echo " Tagging ${LOCAL_TAG} -> ${REMOTE_TAG}"
  podman tag "${LOCAL_TAG}" "${REMOTE_TAG}"

  echo " Pushing to internal registry..."
  podman push --tls-verify=false "${REMOTE_TAG}"

  # The in-cluster image reference uses the internal service address, not the
  # external route, so pods pull over the cluster network.
  IMAGE_REF="image-registry.openshift-image-registry.svc:5000/${NAMESPACE}/${IMAGE_NAME}:latest"
  echo " In-cluster image ref: ${IMAGE_REF}"

elif [[ "${MODE}" == "image" ]]; then
  echo ""
  echo "=== Using pre-built image ==="
  echo " Image: ${IMAGE_REF}"
fi

# ---------------------------------------------------------------------------
# Install operators
# ---------------------------------------------------------------------------
# Three cluster-level installs, in dependency order: ODH operator first, then
# the DataScienceCluster CR it reconciles, then the LlamaStack operator.
echo ""
echo "=== Installing Open Data Hub operator ==="
oc apply -f "${SCRIPT_DIR}/manifests/operators/opendatahub-operator.yaml"

echo "Waiting for ODH operator to be ready..."
# Poll the CSV phase (up to 60 x 10s = 10 minutes); OLM reports "Succeeded"
# when the operator install completes. The grep runs inside the `if`
# condition, so a transient oc failure just means "not ready yet".
for i in $(seq 1 60); do
if oc get csv -n openshift-operators 2>/dev/null | grep -q "opendatahub-operator.*Succeeded"; then
echo " ODH operator is ready."
break
fi
if [ "$i" -eq 60 ]; then
echo "Error: Timed out waiting for ODH operator to install."
exit 1
fi
sleep 10
done

echo ""
echo "=== Configuring DataScienceCluster ==="
oc apply -f "${SCRIPT_DIR}/manifests/operators/datasciencecluster.yaml"

echo "Waiting for DataScienceCluster to be ready..."
# Same 10-minute polling pattern, keyed on the DSC's reported phase.
for i in $(seq 1 60); do
if oc get dsc default-dsc -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Ready"; then
echo " DataScienceCluster is ready."
break
fi
if [ "$i" -eq 60 ]; then
echo "Error: Timed out waiting for DataScienceCluster to become ready."
exit 1
fi
sleep 10
done

echo ""
echo "=== Installing LlamaStack operator ==="
# NOTE(review): applies the operator straight from the upstream `main` branch,
# so e2e runs are not pinned to a released operator version — confirm this is
# intentional.
oc apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/release/operator.yaml

echo "Waiting for LlamaStack operator to be ready..."
oc wait --for=condition=available deployment/llama-stack-k8s-operator-controller-manager \
-n llama-stack-k8s-operator-system --timeout=120s

# ---------------------------------------------------------------------------
# Create namespace and apply manifests
# ---------------------------------------------------------------------------
echo ""
echo "=== Setting up ${NAMESPACE} namespace ==="
# `|| true` keeps this idempotent when the namespace already exists
# (e.g. created during the --build push). It also hides other errors;
# a genuine failure will surface at the first `oc apply` below.
oc create namespace "${NAMESPACE}" 2>/dev/null || true

echo "Applying configmaps and secrets..."
oc apply -f "${SCRIPT_DIR}/manifests/configmap-and-secrets.yaml"

echo "Creating ragas-env secret from .env..."
# The .env file at the repo root supplies per-developer credentials
# (e.g. LITELLM_API_URL/KEY) and is required.
ENV_FILE="${REPO_ROOT}/.env"
if [[ ! -f "${ENV_FILE}" ]]; then
echo "Error: ${ENV_FILE} not found."
exit 1
fi
# dry-run + apply (instead of plain `create`) makes the secret creation
# idempotent: re-runs update the existing secret rather than failing.
oc create secret generic ragas-env -n "${NAMESPACE}" \
--from-env-file="${ENV_FILE}" \
--dry-run=client -o yaml | oc apply -f -

echo "Applying MinIO (results storage)..."
oc apply -f "${SCRIPT_DIR}/manifests/minio.yaml"

echo "Applying Kubeflow pipeline resources (aws-credentials)..."
oc apply -f "${SCRIPT_DIR}/manifests/kubeflow-pipeline-resources.yaml"

echo "Applying DataSciencePipelinesApplication..."
oc apply -f "${SCRIPT_DIR}/manifests/datasciencepipelinesapplication.yaml"

echo "Applying LlamaStackDistribution CR (image: ${IMAGE_REF})..."
# The manifest carries a __LLAMA_STACK_IMAGE__ placeholder; substitute the
# resolved image ref at apply time rather than templating on disk.
sed "s|__LLAMA_STACK_IMAGE__|${IMAGE_REF}|g" \
"${SCRIPT_DIR}/manifests/llama-stack-distribution.yaml" | oc apply -f -

# ---------------------------------------------------------------------------
# Wait for MinIO (results storage)
# ---------------------------------------------------------------------------
# Both waits block (up to 120s each) and abort the script on timeout
# because `oc wait` exits non-zero under `set -e`.
echo ""
echo "=== Waiting for MinIO ==="
echo "Waiting for MinIO deployment..."
oc wait --for=condition=available deployment/ragas-results-minio -n "${NAMESPACE}" --timeout=120s

echo "Waiting for MinIO bucket creation job..."
oc wait --for=condition=complete job/minio-create-bucket -n "${NAMESPACE}" --timeout=120s

# ---------------------------------------------------------------------------
# Wait for Data Science Pipelines
# ---------------------------------------------------------------------------
# Poll the DSPA's Ready condition (up to 60 x 10s = 10 minutes).
echo ""
echo "=== Waiting for Data Science Pipelines ==="
echo "Waiting for DSPA to be ready..."
for i in $(seq 1 60); do
  # `|| true` guards the assignment: under `set -e` a failing command
  # substitution — the normal case while the DSPA resource does not exist
  # yet — would otherwise abort the whole script instead of retrying
  # (bug fix: the bare assignment previously killed the script here).
  DSPA_READY=$(oc get dspa ragas-e2e-dspa -n "${NAMESPACE}" \
    -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
  if [ "${DSPA_READY}" = "True" ]; then
    echo " DSPA is ready."
    break
  fi
  if [ "$i" -eq 60 ]; then
    echo "Error: Timed out waiting for DSPA to become ready."
    exit 1
  fi
  sleep 10
done

# ---------------------------------------------------------------------------
# Wait for operator reconciliation and deployments
# ---------------------------------------------------------------------------
echo ""
echo "=== Waiting for deployments ==="

echo "Waiting for operator to reconcile LlamaStackDistribution..."
# First wait for the Deployment OBJECT to exist (created asynchronously by
# the LlamaStack operator), up to 30 x 5s = 2.5 minutes. `oc get` runs in
# the `if` condition, so its failures do not trip `set -e`.
for i in $(seq 1 30); do
if oc get deployment/lsd-ragas-test -n "${NAMESPACE}" &>/dev/null; then
echo " Deployment created."
break
fi
if [ "$i" -eq 30 ]; then
echo "Error: Timed out waiting for deployment/lsd-ragas-test to be created by the operator."
exit 1
fi
sleep 5
done

echo "Waiting for llama-stack deployment..."
# Then wait (up to 5 minutes) for its pods to actually become available.
oc wait --for=condition=available deployment/lsd-ragas-test -n "${NAMESPACE}" --timeout=300s

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# One here-doc instead of repeated echo calls; the emitted text is identical.
cat <<EOF

=========================================
 E2E deployment complete!
=========================================

 Namespace: ${NAMESPACE}
 Image: ${IMAGE_REF}
 Env file: ${ENV_FILE}

Next steps:
 1. Verify pods: oc get pods -n ${NAMESPACE}
 2. Port forward: oc port-forward -n ${NAMESPACE} svc/lsd-ragas-test-service 8321:8321 &
 3. Test API: curl http://localhost:8321/v1/models

To tear down:
 ./teardown-e2e.sh
EOF
37 changes: 37 additions & 0 deletions tests/cluster-deployment/manifests/configmap-and-secrets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Default configuration for the e2e test environment.
#
# All values here can be overridden by the ragas-env secret (created from .env).
# The ragas-env secret is loaded AFTER this ConfigMap, so .env values take precedence.
#
# The following keys MUST be provided in .env (they are left blank here):
# LITELLM_API_URL, LITELLM_API_KEY
#
apiVersion: v1
kind: ConfigMap
metadata:
name: kubeflow-ragas-config
namespace: ragas-test
data:
# Inference
INFERENCE_MODEL: "Mistral-Small-24B-W8A8"
# Intentionally blank — deployment fails without .env overrides (see header).
LITELLM_API_URL: ""
LITELLM_API_KEY: ""

# Embedding (inline sentence-transformers, model downloaded at startup)
# Matches the model pre-baked into the Containerfile image.
EMBEDDING_MODEL: "nomic-ai/nomic-embed-text-v1.5"

# Kubeflow pipelines
# In-cluster service DNS names; the llama-stack service name must match the
# LlamaStackDistribution the deploy script creates.
KUBEFLOW_LLAMA_STACK_URL: "http://lsd-ragas-test-service.ragas-test.svc.cluster.local:8321"
KUBEFLOW_PIPELINES_ENDPOINT: "http://ds-pipeline-ragas-e2e-dspa.ragas-test.svc.cluster.local:8888"
KUBEFLOW_PIPELINES_TOKEN: ""
KUBEFLOW_NAMESPACE: "ragas-test"
KUBEFLOW_BASE_IMAGE: "quay.io/sclorg/python-312-minimal-c9s:latest"

# S3 / MinIO results storage
KUBEFLOW_RESULTS_S3_PREFIX: "s3://ragas-results/evaluations"
KUBEFLOW_S3_CREDENTIALS_SECRET_NAME: "aws-credentials"
RESULTS_S3_ENDPOINT: "http://ragas-results-minio.ragas-test.svc.cluster.local:9000"
# Path-style addressing is required by MinIO (no virtual-host buckets).
RESULTS_S3_PATH_STYLE: "true"
# NOTE: default MinIO credentials — acceptable only for this ephemeral
# in-cluster test instance; never reuse outside the e2e environment.
AWS_ACCESS_KEY_ID: "minioadmin"
AWS_SECRET_ACCESS_KEY: "minioadmin"
AWS_DEFAULT_REGION: "us-east-1"
Loading