Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions tests/cluster-deployment/Containerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# This Containerfile is used to build the llama-stack-provider-ragas-distro-image for the e2e tests.
#
# Layer ordering is deliberate: heavy, rarely-changing layers (torch,
# model download) come first so that routine source-code changes only
# invalidate the cheap COPY/install layers at the bottom.

FROM python:3.12-slim

WORKDIR /app

# Install uv by copying the static binaries from the official image
# (version pinned for reproducible builds).
COPY --from=ghcr.io/astral-sh/uv:0.9.21 /uv /uvx /bin/

# Create a venv and make it the default Python for subsequent steps.
# Putting the venv's bin dir first on PATH makes every later RUN and the
# container ENTRYPOINT resolve `python` to the venv interpreter.
RUN uv venv /app/.venv
ENV VIRTUAL_ENV=/app/.venv
ENV PATH="/app/.venv/bin:${PATH}"

# Install sentence-transformers + torch (cached layer — these rarely change).
# The CPU wheel index avoids pulling large CUDA builds of torch.
RUN uv pip install --python /app/.venv/bin/python \
--extra-index-url https://download.pytorch.org/whl/cpu \
torch sentence-transformers einops tokenizers safetensors

# Pre-download the embedding model so no HF fetch is needed at runtime.
# Pass --build-arg HF_TOKEN=hf_... to avoid rate limits.
# NOTE(review): build ARG values are recorded in image metadata/history;
# acceptable for a short-lived read token, but consider a BuildKit secret
# mount (--mount=type=secret) if the token must stay private — confirm.
ARG HF_TOKEN=""
RUN HF_TOKEN=${HF_TOKEN} python -c "from huggingface_hub import snapshot_download; snapshot_download('nomic-ai/nomic-embed-text-v1.5')"

# Copy code (changes frequently — kept after heavy layers for caching).
COPY src /app/src
COPY distribution /app/distribution
COPY pyproject.toml /app/pyproject.toml
COPY uv.lock /app/uv.lock
COPY README.md /app/README.md

# Install the project into the venv.
RUN uv pip install --python /app/.venv/bin/python -e ".[remote,distro]"

EXPOSE 8321

# --no-sync: run against the already-populated venv instead of resolving
# and installing dependencies at container startup.
ENTRYPOINT ["uv", "run", "--no-sync", "llama", "stack", "run", "distribution/run.yaml"]
297 changes: 297 additions & 0 deletions tests/cluster-deployment/deploy-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
#!/usr/bin/env bash
#
# Deploy the llama-stack-provider-ragas e2e test environment on an OpenShift cluster.
#
# Usage:
#   ./deploy-e2e.sh --build
#   ./deploy-e2e.sh --image <image-ref>
#
# Reads credentials from ../../.env (repo root) and creates a single
# 'ragas-env' k8s secret from it.
#
# Prerequisites:
#   - oc CLI installed and logged into an OpenShift cluster
#   - podman (only required for --build mode)
#

# -e: abort on the first unhandled failure.
# pipefail: make a pipeline fail if ANY stage fails — with plain `set -e`
# only the last stage is checked, so e.g. `sed ... | oc apply -f -` would
# silently succeed even when sed itself failed.
set -eo pipefail

# Absolute directory containing this script, so it can be invoked from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${SCRIPT_DIR}/../.."
readonly SCRIPT_DIR REPO_ROOT

# Image name pushed to the registry and the namespace all e2e resources use.
IMAGE_NAME="llama-stack-provider-ragas-distro-image"
NAMESPACE="ragas-test"
readonly IMAGE_NAME NAMESPACE

# ---------------------------------------------------------------------------
# Parse arguments
# ---------------------------------------------------------------------------
# Exactly one mode is required: --build (build+push locally) or
# --image <ref> (use a pre-built image).
MODE=""
IMAGE_REF=""

while [[ $# -gt 0 ]]; do
  arg="$1"
  shift
  case "${arg}" in
    --build)
      MODE="build"
      ;;
    --image)
      MODE="image"
      IMAGE_REF="${1:-}"
      if [[ -z "${IMAGE_REF}" ]]; then
        echo "Error: --image requires an image reference argument."
        exit 1
      fi
      shift
      ;;
    *)
      echo "Unknown option: ${arg}"
      echo "Usage: $0 --build | --image <image-ref>"
      exit 1
      ;;
  esac
done

if [[ -z "${MODE}" ]]; then
  echo "Usage: $0 --build | --image <image-ref>"
  exit 1
fi

# ---------------------------------------------------------------------------
# Prerequisites
# ---------------------------------------------------------------------------
# Fail fast with guard clauses: oc must exist and an active login is required.
echo "Checking prerequisites..."

command -v oc &> /dev/null || {
  echo "Error: oc is not installed."
  exit 1
}

oc whoami &> /dev/null || {
  echo "Error: Not logged into an OpenShift cluster. Run 'oc login' first."
  exit 1
}

echo " Logged in as: $(oc whoami)"
echo " Cluster: $(oc whoami --show-server)"

# ---------------------------------------------------------------------------
# Resolve image
# ---------------------------------------------------------------------------
# In --build mode: build the image for the CLUSTER's architecture, expose the
# internal registry, push, and compute the in-cluster pull reference.
# In --image mode: trust the caller-supplied reference as-is.
if [[ "${MODE}" == "build" ]]; then
  if ! command -v podman &> /dev/null; then
    echo "Error: podman is not installed (required for --build)."
    exit 1
  fi

  echo ""
  echo "=== Building image from Containerfile ==="

  # Detect cluster node architecture (not local host arch) so the image can
  # actually run on the nodes; fall back to amd64 if the lookup fails.
  NODE_ARCH=$(oc get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}' 2>/dev/null || echo "amd64")
  case "${NODE_ARCH}" in
    amd64) PLATFORM="linux/amd64" ;;
    arm64) PLATFORM="linux/arm64" ;;
    *) echo "Warning: unknown cluster architecture ${NODE_ARCH}, defaulting to linux/amd64"; PLATFORM="linux/amd64" ;;
  esac
  echo " Cluster node architecture: ${NODE_ARCH} -> ${PLATFORM}"

  # Build the image from the repo root so COPY src/distribution/... resolve.
  LOCAL_TAG="${IMAGE_NAME}:latest"
  echo " Building ${LOCAL_TAG}..."
  podman build --platform "${PLATFORM}" \
    --build-arg HF_TOKEN="${HF_TOKEN:-}" \
    -t "${LOCAL_TAG}" \
    -f "${SCRIPT_DIR}/Containerfile" "${REPO_ROOT}"

  # Expose the OpenShift internal registry route (idempotent; errors are
  # ignored because the route may already be enabled).
  echo " Exposing OpenShift internal registry..."
  oc patch configs.imageregistry.operator.openshift.io/cluster \
    --type=merge --patch '{"spec":{"defaultRoute":true}}' 2>/dev/null || true

  # Wait briefly for the route to appear. REGISTRY_ROUTE is initialized first
  # so a value inherited from the caller's environment cannot slip past the
  # emptiness check below when every lookup attempt fails (bug fix).
  REGISTRY_ROUTE=""
  for _ in $(seq 1 12); do
    REGISTRY_ROUTE=$(oc get route default-route -n openshift-image-registry \
      --template='{{ .spec.host }}' 2>/dev/null) && break
    sleep 5
  done

  if [[ -z "${REGISTRY_ROUTE}" ]]; then
    echo "Error: Could not determine the OpenShift internal registry route."
    exit 1
  fi
  echo " Registry route: ${REGISTRY_ROUTE}"

  # Login to the registry with the current session token.
  echo " Logging into registry..."
  podman login --tls-verify=false -u "$(oc whoami)" -p "$(oc whoami -t)" "${REGISTRY_ROUTE}"

  # Ensure the namespace exists before pushing (registry needs the namespace/project)
  oc create namespace "${NAMESPACE}" 2>/dev/null || true

  # Tag and push via the external route.
  REMOTE_TAG="${REGISTRY_ROUTE}/${NAMESPACE}/${IMAGE_NAME}:latest"
  echo " Tagging ${LOCAL_TAG} -> ${REMOTE_TAG}"
  podman tag "${LOCAL_TAG}" "${REMOTE_TAG}"

  echo " Pushing to internal registry..."
  podman push --tls-verify=false "${REMOTE_TAG}"

  # The in-cluster image reference uses the internal service address, not the
  # external route, so pods pull over the cluster network.
  IMAGE_REF="image-registry.openshift-image-registry.svc:5000/${NAMESPACE}/${IMAGE_NAME}:latest"
  echo " In-cluster image ref: ${IMAGE_REF}"

elif [[ "${MODE}" == "image" ]]; then
  echo ""
  echo "=== Using pre-built image ==="
  echo " Image: ${IMAGE_REF}"
fi

# ---------------------------------------------------------------------------
# Install operators
# ---------------------------------------------------------------------------
# Three cluster-level installs, in dependency order: ODH operator first, then
# the DataScienceCluster CR it reconciles, then the LlamaStack operator.
echo ""
echo "=== Installing Open Data Hub operator ==="
oc apply -f "${SCRIPT_DIR}/manifests/operators/opendatahub-operator.yaml"

echo "Waiting for ODH operator to be ready..."
# Poll the CSV phase (up to 60 x 10s = 10 minutes); OLM reports "Succeeded"
# when the operator install completes. The grep runs inside the `if`
# condition, so a transient oc failure just means "not ready yet".
for i in $(seq 1 60); do
if oc get csv -n openshift-operators 2>/dev/null | grep -q "opendatahub-operator.*Succeeded"; then
echo " ODH operator is ready."
break
fi
if [ "$i" -eq 60 ]; then
echo "Error: Timed out waiting for ODH operator to install."
exit 1
fi
sleep 10
done

echo ""
echo "=== Configuring DataScienceCluster ==="
oc apply -f "${SCRIPT_DIR}/manifests/operators/datasciencecluster.yaml"

echo "Waiting for DataScienceCluster to be ready..."
# Same 10-minute polling pattern, keyed on the DSC's reported phase.
for i in $(seq 1 60); do
if oc get dsc default-dsc -o jsonpath='{.status.phase}' 2>/dev/null | grep -q "Ready"; then
echo " DataScienceCluster is ready."
break
fi
if [ "$i" -eq 60 ]; then
echo "Error: Timed out waiting for DataScienceCluster to become ready."
exit 1
fi
sleep 10
done

echo ""
echo "=== Installing LlamaStack operator ==="
# NOTE(review): applies the operator straight from the upstream `main` branch,
# so e2e runs are not pinned to a released operator version — confirm this is
# intentional.
oc apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/release/operator.yaml

echo "Waiting for LlamaStack operator to be ready..."
oc wait --for=condition=available deployment/llama-stack-k8s-operator-controller-manager \
-n llama-stack-k8s-operator-system --timeout=120s

# ---------------------------------------------------------------------------
# Create namespace and apply manifests
# ---------------------------------------------------------------------------
echo ""
echo "=== Setting up ${NAMESPACE} namespace ==="
# `|| true` keeps this idempotent when the namespace already exists
# (e.g. created during the --build push). It also hides other errors;
# a genuine failure will surface at the first `oc apply` below.
oc create namespace "${NAMESPACE}" 2>/dev/null || true

echo "Applying configmaps and secrets..."
oc apply -f "${SCRIPT_DIR}/manifests/configmap-and-secrets.yaml"

echo "Creating ragas-env secret from .env..."
# The .env file at the repo root supplies per-developer credentials
# (e.g. LITELLM_API_URL/KEY) and is required.
ENV_FILE="${REPO_ROOT}/.env"
if [[ ! -f "${ENV_FILE}" ]]; then
echo "Error: ${ENV_FILE} not found."
exit 1
fi
# dry-run + apply (instead of plain `create`) makes the secret creation
# idempotent: re-runs update the existing secret rather than failing.
oc create secret generic ragas-env -n "${NAMESPACE}" \
--from-env-file="${ENV_FILE}" \
--dry-run=client -o yaml | oc apply -f -

echo "Applying MinIO (results storage)..."
oc apply -f "${SCRIPT_DIR}/manifests/minio.yaml"

echo "Applying Kubeflow pipeline resources (aws-credentials)..."
oc apply -f "${SCRIPT_DIR}/manifests/kubeflow-pipeline-resources.yaml"

echo "Applying DataSciencePipelinesApplication..."
oc apply -f "${SCRIPT_DIR}/manifests/datasciencepipelinesapplication.yaml"

echo "Applying LlamaStackDistribution CR (image: ${IMAGE_REF})..."
# The manifest carries a __LLAMA_STACK_IMAGE__ placeholder; substitute the
# resolved image ref at apply time rather than templating on disk.
sed "s|__LLAMA_STACK_IMAGE__|${IMAGE_REF}|g" \
"${SCRIPT_DIR}/manifests/llama-stack-distribution.yaml" | oc apply -f -

# ---------------------------------------------------------------------------
# Wait for MinIO (results storage)
# ---------------------------------------------------------------------------
# Both waits block (up to 120s each) and abort the script on timeout
# because `oc wait` exits non-zero under `set -e`.
echo ""
echo "=== Waiting for MinIO ==="
echo "Waiting for MinIO deployment..."
oc wait --for=condition=available deployment/ragas-results-minio -n "${NAMESPACE}" --timeout=120s

echo "Waiting for MinIO bucket creation job..."
oc wait --for=condition=complete job/minio-create-bucket -n "${NAMESPACE}" --timeout=120s

# ---------------------------------------------------------------------------
# Wait for Data Science Pipelines
# ---------------------------------------------------------------------------
# Poll the DSPA's Ready condition (up to 60 x 10s = 10 minutes).
echo ""
echo "=== Waiting for Data Science Pipelines ==="
echo "Waiting for DSPA to be ready..."
for i in $(seq 1 60); do
  # `|| true` guards the assignment: under `set -e` a failing command
  # substitution — the normal case while the DSPA resource does not exist
  # yet — would otherwise abort the whole script instead of retrying
  # (bug fix: the bare assignment previously killed the script here).
  DSPA_READY=$(oc get dspa ragas-e2e-dspa -n "${NAMESPACE}" \
    -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
  if [ "${DSPA_READY}" = "True" ]; then
    echo " DSPA is ready."
    break
  fi
  if [ "$i" -eq 60 ]; then
    echo "Error: Timed out waiting for DSPA to become ready."
    exit 1
  fi
  sleep 10
done

# ---------------------------------------------------------------------------
# Wait for operator reconciliation and deployments
# ---------------------------------------------------------------------------
echo ""
echo "=== Waiting for deployments ==="

echo "Waiting for operator to reconcile LlamaStackDistribution..."
# First wait for the Deployment OBJECT to exist (created asynchronously by
# the LlamaStack operator), up to 30 x 5s = 2.5 minutes. `oc get` runs in
# the `if` condition, so its failures do not trip `set -e`.
for i in $(seq 1 30); do
if oc get deployment/lsd-ragas-test -n "${NAMESPACE}" &>/dev/null; then
echo " Deployment created."
break
fi
if [ "$i" -eq 30 ]; then
echo "Error: Timed out waiting for deployment/lsd-ragas-test to be created by the operator."
exit 1
fi
sleep 5
done

echo "Waiting for llama-stack deployment..."
# Then wait (up to 5 minutes) for its pods to actually become available.
oc wait --for=condition=available deployment/lsd-ragas-test -n "${NAMESPACE}" --timeout=300s

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# One here-doc instead of repeated echo calls; the emitted text is identical.
cat <<EOF

=========================================
 E2E deployment complete!
=========================================

 Namespace: ${NAMESPACE}
 Image: ${IMAGE_REF}
 Env file: ${ENV_FILE}

Next steps:
 1. Verify pods: oc get pods -n ${NAMESPACE}
 2. Port forward: oc port-forward -n ${NAMESPACE} svc/lsd-ragas-test-service 8321:8321 &
 3. Test API: curl http://localhost:8321/v1/models

To tear down:
 ./teardown-e2e.sh
EOF
37 changes: 37 additions & 0 deletions tests/cluster-deployment/manifests/configmap-and-secrets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Default configuration for the e2e test environment.
#
# All values here can be overridden by the ragas-env secret (created from .env).
# The ragas-env secret is loaded AFTER this ConfigMap, so .env values take precedence.
#
# The following keys MUST be provided in .env (they are left blank here):
# LITELLM_API_URL, LITELLM_API_KEY
#
apiVersion: v1
kind: ConfigMap
metadata:
name: kubeflow-ragas-config
namespace: ragas-test
data:
# Inference
INFERENCE_MODEL: "Mistral-Small-24B-W8A8"
# Intentionally blank — deployment fails without .env overrides (see header).
LITELLM_API_URL: ""
LITELLM_API_KEY: ""

# Embedding (inline sentence-transformers, model downloaded at startup)
# Matches the model pre-baked into the Containerfile image.
EMBEDDING_MODEL: "nomic-ai/nomic-embed-text-v1.5"

# Kubeflow pipelines
# In-cluster service DNS names; the llama-stack service name must match the
# LlamaStackDistribution the deploy script creates.
KUBEFLOW_LLAMA_STACK_URL: "http://lsd-ragas-test-service.ragas-test.svc.cluster.local:8321"
KUBEFLOW_PIPELINES_ENDPOINT: "http://ds-pipeline-ragas-e2e-dspa.ragas-test.svc.cluster.local:8888"
KUBEFLOW_PIPELINES_TOKEN: ""
KUBEFLOW_NAMESPACE: "ragas-test"
KUBEFLOW_BASE_IMAGE: "quay.io/sclorg/python-312-minimal-c9s:latest"

# S3 / MinIO results storage
KUBEFLOW_RESULTS_S3_PREFIX: "s3://ragas-results/evaluations"
KUBEFLOW_S3_CREDENTIALS_SECRET_NAME: "aws-credentials"
RESULTS_S3_ENDPOINT: "http://ragas-results-minio.ragas-test.svc.cluster.local:9000"
# Path-style addressing is required by MinIO (no virtual-host buckets).
RESULTS_S3_PATH_STYLE: "true"
# NOTE: default MinIO credentials — acceptable only for this ephemeral
# in-cluster test instance; never reuse outside the e2e environment.
AWS_ACCESS_KEY_ID: "minioadmin"
AWS_SECRET_ACCESS_KEY: "minioadmin"
AWS_DEFAULT_REGION: "us-east-1"
Loading