From 38bc9fc695a7177e3679bd25c7cb252b06f2957a Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Fri, 20 Feb 2026 23:29:05 -0500
Subject: [PATCH 1/7] Add end-to-end testing setup for
 llama-stack-provider-ragas

This commit introduces a comprehensive end-to-end testing environment for the llama-stack-provider-ragas distribution on OpenShift. It includes a new Containerfile for building the test image, deployment and teardown scripts, configuration manifests, and a test suite using pytest. The setup ensures that the necessary resources are created and validated, facilitating robust testing of the provider's functionality.
---
 tests/e2e/Containerfile                       |  35 +++
 tests/e2e/deploy-e2e.sh                       | 243 ++++++++++++++++++
 .../e2e/manifests/configmap-and-secrets.yaml  |  37 +++
 .../kubeflow-pipeline-resources.yaml          |  10 +
 .../manifests/llama-stack-distribution.yaml   | 239 +++++++++++++++++
 tests/e2e/manifests/minio.yaml                |  85 ++++++
 tests/e2e/teardown-e2e.sh                     |  13 +
 tests/e2e/test_e2e.py                         | 181 +++++++++++++
 8 files changed, 843 insertions(+)
 create mode 100644 tests/e2e/Containerfile
 create mode 100755 tests/e2e/deploy-e2e.sh
 create mode 100644 tests/e2e/manifests/configmap-and-secrets.yaml
 create mode 100644 tests/e2e/manifests/kubeflow-pipeline-resources.yaml
 create mode 100644 tests/e2e/manifests/llama-stack-distribution.yaml
 create mode 100644 tests/e2e/manifests/minio.yaml
 create mode 100755 tests/e2e/teardown-e2e.sh
 create mode 100644 tests/e2e/test_e2e.py
diff --git a/tests/e2e/Containerfile b/tests/e2e/Containerfile
new file mode 100644
index 00000000..69759f86
--- /dev/null
+++ b/tests/e2e/Containerfile
@@ -0,0 +1,35 @@
+# This Containerfile is used to build the llama-stack-provider-ragas-distro-image for the e2e tests.
+
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# Install uv by copying the static binaries from the official image
+COPY --from=ghcr.io/astral-sh/uv:0.9.21 /uv /uvx /bin/
+
+# Create a venv and make it the default Python for subsequent steps.
+RUN uv venv /app/.venv
+ENV VIRTUAL_ENV=/app/.venv
+ENV PATH="/app/.venv/bin:${PATH}"
+
+# Install sentence-transformers + torch (cached layer — these rarely change).
+RUN uv pip install --python /app/.venv/bin/python \
+    --extra-index-url https://download.pytorch.org/whl/cpu \
+    torch sentence-transformers einops tokenizers safetensors
+
+# Pre-download the embedding model so no HF fetch is needed at runtime.
+RUN python -c "from huggingface_hub import snapshot_download; snapshot_download('nomic-ai/nomic-embed-text-v1.5')"
+
+# Copy code (changes frequently — kept after heavy layers for caching).
+COPY src /app/src
+COPY distribution /app/distribution
+COPY pyproject.toml /app/pyproject.toml
+COPY uv.lock /app/uv.lock
+COPY README.md /app/README.md
+
+# Install the project into the venv.
+RUN uv pip install --python /app/.venv/bin/python -e ".[remote,distro]"
+
+EXPOSE 8321
+
+ENTRYPOINT ["uv", "run", "--no-sync", "llama", "stack", "run", "distribution/run.yaml"]
diff --git a/tests/e2e/deploy-e2e.sh b/tests/e2e/deploy-e2e.sh
new file mode 100755
index 00000000..bb38dbfd
--- /dev/null
+++ b/tests/e2e/deploy-e2e.sh
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+#
+# Deploy the llama-stack-provider-ragas e2e test environment on an OpenShift cluster.
+#
+# Usage:
+#   ./deploy-e2e.sh --build
+#   ./deploy-e2e.sh --image <image-ref>
+#
+# Reads credentials from ../../.env (repo root) and creates a single
+# 'ragas-env' k8s secret from it.
+#
+# Prerequisites:
+#   - oc CLI installed and logged into an OpenShift cluster
+#   - podman (only required for --build mode)
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${SCRIPT_DIR}/../.."
+IMAGE_NAME="llama-stack-provider-ragas-distro-image"
+NAMESPACE="ragas-test"
+
+# ---------------------------------------------------------------------------
+# Parse arguments
+# ---------------------------------------------------------------------------
+MODE=""
+IMAGE_REF=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --build)
+            MODE="build"
+            shift
+            ;;
+        --image)
+            MODE="image"
+            IMAGE_REF="$2"
+            if [[ -z "${IMAGE_REF}" ]]; then
+                echo "Error: --image requires an image reference argument."
+                exit 1
+            fi
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Usage: $0 --build | --image <image-ref>"
+            exit 1
+            ;;
+    esac
+done
+
+if [[ -z "${MODE}" ]]; then
+    echo "Usage: $0 --build | --image <image-ref>"
+    exit 1
+fi
+
+# ---------------------------------------------------------------------------
+# Prerequisites
+# ---------------------------------------------------------------------------
+echo "Checking prerequisites..."
+
+if ! command -v oc &> /dev/null; then
+    echo "Error: oc is not installed."
+    exit 1
+fi
+
+if ! oc whoami &> /dev/null; then
+    echo "Error: Not logged into an OpenShift cluster. Run 'oc login' first."
+    exit 1
+fi
+
+echo "  Logged in as: $(oc whoami)"
+echo "  Cluster: $(oc whoami --show-server)"
+
+# ---------------------------------------------------------------------------
+# Resolve image
+# ---------------------------------------------------------------------------
+if [[ "${MODE}" == "build" ]]; then
+    if ! command -v podman &> /dev/null; then
+        echo "Error: podman is not installed (required for --build)."
+        exit 1
+    fi
+
+    echo ""
+    echo "=== Building image from Containerfile ==="
+
+    # Detect cluster node architecture (not local host arch)
+    NODE_ARCH=$(oc get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}' 2>/dev/null || echo "amd64")
+    case "${NODE_ARCH}" in
+        amd64)  PLATFORM="linux/amd64" ;;
+        arm64)  PLATFORM="linux/arm64" ;;
+        *)      echo "Warning: unknown cluster architecture ${NODE_ARCH}, defaulting to linux/amd64"; PLATFORM="linux/amd64" ;;
+    esac
+    echo "  Cluster node architecture: ${NODE_ARCH} -> ${PLATFORM}"
+
+    # Build the image
+    LOCAL_TAG="${IMAGE_NAME}:latest"
+    echo "  Building ${LOCAL_TAG}..."
+    podman build --no-cache --platform "${PLATFORM}" \
+        -t "${LOCAL_TAG}" \
+        -f "${SCRIPT_DIR}/Containerfile" "${REPO_ROOT}"
+
+    # Expose the OpenShift internal registry route (idempotent)
+    echo "  Exposing OpenShift internal registry..."
+    oc patch configs.imageregistry.operator.openshift.io/cluster \
+        --type=merge --patch '{"spec":{"defaultRoute":true}}' 2>/dev/null || true
+
+    # Wait briefly for the route to appear
+    for i in $(seq 1 12); do
+        REGISTRY_ROUTE=$(oc get route default-route -n openshift-image-registry \
+            --template='{{ .spec.host }}' 2>/dev/null) && break
+        sleep 5
+    done
+
+    if [[ -z "${REGISTRY_ROUTE}" ]]; then
+        echo "Error: Could not determine the OpenShift internal registry route."
+        exit 1
+    fi
+    echo "  Registry route: ${REGISTRY_ROUTE}"
+
+    # Login to the registry (token is piped via stdin so it never shows up in the process list)
+    echo "  Logging into registry..."
+    oc whoami -t | podman login --tls-verify=false -u "$(oc whoami)" --password-stdin "${REGISTRY_ROUTE}"
+
+    # Ensure the namespace exists before pushing (registry needs the namespace/project)
+    oc create namespace "${NAMESPACE}" 2>/dev/null || true
+
+    # Tag and push
+    REMOTE_TAG="${REGISTRY_ROUTE}/${NAMESPACE}/${IMAGE_NAME}:latest"
+    echo "  Tagging ${LOCAL_TAG} -> ${REMOTE_TAG}"
+    podman tag "${LOCAL_TAG}" "${REMOTE_TAG}"
+
+    echo "  Pushing to internal registry..."
+    podman push --tls-verify=false "${REMOTE_TAG}"
+
+    # The in-cluster image reference uses the internal service address
+    IMAGE_REF="image-registry.openshift-image-registry.svc:5000/${NAMESPACE}/${IMAGE_NAME}:latest"
+    echo "  In-cluster image ref: ${IMAGE_REF}"
+
+elif [[ "${MODE}" == "image" ]]; then
+    echo ""
+    echo "=== Using pre-built image ==="
+    echo "  Image: ${IMAGE_REF}"
+fi
+
+# ---------------------------------------------------------------------------
+# Install LlamaStack operator
+# ---------------------------------------------------------------------------
+echo ""
+echo "=== Installing LlamaStack operator ==="
+oc apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/release/operator.yaml
+
+echo "Waiting for LlamaStack operator to be ready..."
+oc wait --for=condition=available deployment/llama-stack-k8s-operator-controller-manager \
+    -n llama-stack-k8s-operator-system --timeout=120s
+
+# ---------------------------------------------------------------------------
+# Create namespace and apply manifests
+# ---------------------------------------------------------------------------
+echo ""
+echo "=== Setting up ${NAMESPACE} namespace ==="
+oc create namespace "${NAMESPACE}" 2>/dev/null || true
+
+echo "Applying configmaps and secrets..."
+oc apply -f "${SCRIPT_DIR}/manifests/configmap-and-secrets.yaml"
+
+echo "Creating ragas-env secret from .env..."
+ENV_FILE="${REPO_ROOT}/.env"
+if [[ ! -f "${ENV_FILE}" ]]; then
+    echo "Error: ${ENV_FILE} not found."
+    exit 1
+fi
+oc create secret generic ragas-env -n "${NAMESPACE}" \
+    --from-env-file="${ENV_FILE}" \
+    --dry-run=client -o yaml | oc apply -f -
+
+echo "Applying MinIO..."
+oc apply -f "${SCRIPT_DIR}/manifests/minio.yaml"
+
+echo "Applying LlamaStackDistribution CR (image: ${IMAGE_REF})..."
+sed "s|__LLAMA_STACK_IMAGE__|${IMAGE_REF}|g" \
+    "${SCRIPT_DIR}/manifests/llama-stack-distribution.yaml" | oc apply -f -
+
+# ---------------------------------------------------------------------------
+# Wait for MinIO
+# ---------------------------------------------------------------------------
+echo ""
+echo "=== Waiting for MinIO ==="
+echo "Waiting for MinIO deployment..."
+oc wait --for=condition=available deployment/minio -n "${NAMESPACE}" --timeout=120s
+
+echo "Waiting for MinIO bucket creation job..."
+oc wait --for=condition=complete job/minio-create-bucket -n "${NAMESPACE}" --timeout=120s
+
+# ---------------------------------------------------------------------------
+# Kubeflow pipeline resources (aws-credentials in ragas-test namespace)
+# ---------------------------------------------------------------------------
+echo ""
+echo "=== Applying Kubeflow pipeline resources ==="
+oc apply -f "${SCRIPT_DIR}/manifests/kubeflow-pipeline-resources.yaml"
+
+# ---------------------------------------------------------------------------
+# Wait for operator reconciliation and deployments
+# ---------------------------------------------------------------------------
+echo ""
+echo "=== Waiting for deployments ==="
+
+echo "Waiting for operator to reconcile LlamaStackDistribution..."
+for i in $(seq 1 30); do
+    if oc get deployment/lsd-ragas-test -n "${NAMESPACE}" &>/dev/null; then
+        echo "  Deployment created."
+        break
+    fi
+    if [ "$i" -eq 30 ]; then
+        echo "Error: Timed out waiting for deployment/lsd-ragas-test to be created by the operator."
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "Waiting for llama-stack deployment..."
+oc wait --for=condition=available deployment/lsd-ragas-test -n "${NAMESPACE}" --timeout=300s
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo ""
+echo "========================================="
+echo " E2E deployment complete!"
+echo "========================================="
+echo ""
+echo "  Namespace: ${NAMESPACE}"
+echo "  Image:     ${IMAGE_REF}"
+echo "  Env file:  ${ENV_FILE}"
+echo ""
+echo "Next steps:"
+echo "  1. Verify pods:    oc get pods -n ${NAMESPACE}"
+echo "  2. Port forward:   oc port-forward -n ${NAMESPACE} svc/lsd-ragas-test-service 8321:8321 &"
+echo "  3. Test API:       curl http://localhost:8321/v1/models"
+echo ""
+echo "To tear down:"
+echo "  ./teardown-e2e.sh"
diff --git a/tests/e2e/manifests/configmap-and-secrets.yaml b/tests/e2e/manifests/configmap-and-secrets.yaml
new file mode 100644
index 00000000..b625f3f8
--- /dev/null
+++ b/tests/e2e/manifests/configmap-and-secrets.yaml
@@ -0,0 +1,37 @@
+# Default configuration for the e2e test environment.
+#
+# llama-stack-distribution.yaml wires each variable from a single source (no envFrom, no override order):
+# ragas-env (from .env) supplies LITELLM_API_URL, LITELLM_API_KEY, KUBEFLOW_PIPELINES_TOKEN; the rest come from here.
+#
+# The following keys MUST be provided in .env (they are left blank here):
+#   LITELLM_API_URL, LITELLM_API_KEY
+#
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kubeflow-ragas-config
+  namespace: ragas-test
+data:
+  # Inference
+  INFERENCE_MODEL: "Mistral-Small-24B-W8A8"
+  LITELLM_API_URL: ""
+  LITELLM_API_KEY: ""
+
+  # Embedding (inline sentence-transformers; the e2e Containerfile pre-downloads this model at image build time)
+  EMBEDDING_MODEL: "nomic-ai/nomic-embed-text-v1.5"
+
+  # Kubeflow pipelines
+  KUBEFLOW_LLAMA_STACK_URL: "http://lsd-ragas-test-service.ragas-test.svc.cluster.local:8321"
+  KUBEFLOW_PIPELINES_ENDPOINT: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
+  KUBEFLOW_PIPELINES_TOKEN: ""
+  KUBEFLOW_NAMESPACE: "ragas-test"
+  KUBEFLOW_BASE_IMAGE: "python:3.12-slim"
+
+  # S3 / MinIO results storage
+  KUBEFLOW_RESULTS_S3_PREFIX: "s3://ragas-results/evaluations"
+  KUBEFLOW_S3_CREDENTIALS_SECRET_NAME: "aws-credentials"
+  RESULTS_S3_ENDPOINT: "http://minio-service.ragas-test.svc.cluster.local:9000"
+  RESULTS_S3_PATH_STYLE: "true"
+  AWS_ACCESS_KEY_ID: "minioadmin"
+  AWS_SECRET_ACCESS_KEY: "minioadmin"
+  AWS_DEFAULT_REGION: "us-east-1"
diff --git a/tests/e2e/manifests/kubeflow-pipeline-resources.yaml b/tests/e2e/manifests/kubeflow-pipeline-resources.yaml
new file mode 100644
index 00000000..d58bc96a
--- /dev/null
+++ b/tests/e2e/manifests/kubeflow-pipeline-resources.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: aws-credentials
+  namespace: ragas-test
+type: Opaque
+stringData:
+  AWS_ACCESS_KEY_ID: "minioadmin"
+  AWS_SECRET_ACCESS_KEY: "minioadmin"
+  AWS_DEFAULT_REGION: "us-east-1"
diff --git a/tests/e2e/manifests/llama-stack-distribution.yaml b/tests/e2e/manifests/llama-stack-distribution.yaml
new file mode 100644
index 00000000..d4f41d32
--- /dev/null
+++ b/tests/e2e/manifests/llama-stack-distribution.yaml
@@ -0,0 +1,239 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: llama-stack-e2e-config
+  namespace: ragas-test
+data:
+  config.yaml: |
+    version: 2
+    image_name: trustyai_ragas_distro
+    apis:
+    - eval
+    - inference
+    - files
+    - datasetio
+    providers:
+      eval:
+        - provider_id: ${env.KUBEFLOW_LLAMA_STACK_URL:+trustyai_ragas_remote}
+          provider_type: remote::trustyai_ragas
+          module: llama_stack_provider_ragas.remote
+          config:
+            embedding_model: embedding/${env.EMBEDDING_MODEL}
+            kubeflow_config:
+              results_s3_prefix: ${env.KUBEFLOW_RESULTS_S3_PREFIX}
+              s3_credentials_secret_name: ${env.KUBEFLOW_S3_CREDENTIALS_SECRET_NAME}
+              pipelines_endpoint: ${env.KUBEFLOW_PIPELINES_ENDPOINT}
+              namespace: ${env.KUBEFLOW_NAMESPACE}
+              llama_stack_url: ${env.KUBEFLOW_LLAMA_STACK_URL}
+              base_image: ${env.KUBEFLOW_BASE_IMAGE}
+              pipelines_api_token: ${env.KUBEFLOW_PIPELINES_TOKEN:=}
+              results_s3_endpoint: ${env.RESULTS_S3_ENDPOINT}
+              results_s3_path_style: ${env.RESULTS_S3_PATH_STYLE}
+            kvstore:
+              namespace: ragas
+              backend: kv_default
+        - provider_id: ${env.EMBEDDING_MODEL:+trustyai_ragas_inline}
+          provider_type: inline::trustyai_ragas
+          module: llama_stack_provider_ragas.inline
+          config:
+            embedding_model: embedding/${env.EMBEDDING_MODEL}
+            kvstore:
+              namespace: ragas
+              backend: kv_default
+      datasetio:
+      - provider_id: localfs
+        provider_type: inline::localfs
+        config:
+          kvstore:
+            namespace: datasetio::localfs
+            backend: kv_default
+      - provider_id: huggingface
+        provider_type: remote::huggingface
+        config:
+          kvstore:
+            namespace: datasetio::huggingface
+            backend: kv_default
+      inference:
+        - provider_id: litellm
+          provider_type: "remote::openai"
+          config:
+            base_url: "${env.LITELLM_API_URL}"
+            api_key: "${env.LITELLM_API_KEY}"
+        - provider_id: embedding
+          provider_type: "inline::sentence-transformers"
+          config: {}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/trustyai_ragas_distro/files}
+          metadata_store:
+            table_name: files_metadata
+            backend: sql_default
+    storage:
+      backends:
+        kv_default:
+          type: kv_sqlite
+          db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/trustyai_ragas_distro}/kvstore.db
+        sql_default:
+          type: sql_sqlite
+          db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/trustyai_ragas_distro}/sql_store.db
+      stores:
+        metadata:
+          namespace: registry
+          backend: kv_default
+        inference:
+          table_name: inference_store
+          backend: sql_default
+          max_write_queue_size: 10000
+          num_writers: 4
+        conversations:
+          table_name: openai_conversations
+          backend: sql_default
+        connectors:
+          namespace: connectors
+          backend: kv_default
+        prompts:
+          namespace: prompts
+          backend: kv_default
+    registered_resources:
+      models:
+      - metadata: {}
+        model_id: "${env.INFERENCE_MODEL:=Mistral-Small-24B-W8A8}"
+        provider_id: litellm
+        model_type: llm
+      - metadata:
+          embedding_dimension: 768
+        model_id: "${env.EMBEDDING_MODEL}"
+        provider_id: embedding
+        provider_model_id: "${env.EMBEDDING_MODEL}"
+        model_type: embedding
+      shields: []
+      vector_dbs: []
+      datasets: []
+      scoring_fns: []
+      benchmarks:
+      - benchmark_id: hf-doc-qa-ragas-inline-benchmark
+        dataset_id: hf_doc_qa_ragas_eval
+        scoring_functions:
+          - semantic_similarity
+        provider_id: trustyai_ragas_inline
+        metadata: {}
+      - benchmark_id: hf-doc-qa-ragas-remote-benchmark
+        dataset_id: hf_doc_qa_ragas_eval
+        scoring_functions:
+          - semantic_similarity
+        provider_id: trustyai_ragas_remote
+        metadata: {}
+      tool_groups: []
+    server:
+      port: 8321
+---
+apiVersion: llamastack.io/v1alpha1
+kind: LlamaStackDistribution
+metadata:
+  name: lsd-ragas-test
+  namespace: ragas-test
+spec:
+  replicas: 1
+  server:
+    containerSpec:
+      resources:
+        requests:
+          cpu: 1
+          memory: "2Gi"
+        limits:
+          cpu: 2
+          memory: "4Gi"
+      env:
+        # Inference (defaults in configmap, credentials in ragas-env from .env)
+        - name: INFERENCE_MODEL
+          valueFrom:
+            configMapKeyRef:
+              key: INFERENCE_MODEL
+              name: kubeflow-ragas-config
+        - name: LITELLM_API_URL
+          valueFrom:
+            secretKeyRef:
+              key: LITELLM_API_URL
+              name: ragas-env
+        - name: LITELLM_API_KEY
+          valueFrom:
+            secretKeyRef:
+              key: LITELLM_API_KEY
+              name: ragas-env
+        # Embedding
+        - name: EMBEDDING_MODEL
+          valueFrom:
+            configMapKeyRef:
+              key: EMBEDDING_MODEL
+              name: kubeflow-ragas-config
+        # Kubeflow pipelines
+        - name: KUBEFLOW_PIPELINES_ENDPOINT
+          valueFrom:
+            configMapKeyRef:
+              key: KUBEFLOW_PIPELINES_ENDPOINT
+              name: kubeflow-ragas-config
+        - name: KUBEFLOW_NAMESPACE
+          valueFrom:
+            configMapKeyRef:
+              key: KUBEFLOW_NAMESPACE
+              name: kubeflow-ragas-config
+        - name: KUBEFLOW_BASE_IMAGE
+          valueFrom:
+            configMapKeyRef:
+              key: KUBEFLOW_BASE_IMAGE
+              name: kubeflow-ragas-config
+        - name: KUBEFLOW_LLAMA_STACK_URL
+          valueFrom:
+            configMapKeyRef:
+              key: KUBEFLOW_LLAMA_STACK_URL
+              name: kubeflow-ragas-config
+        - name: KUBEFLOW_RESULTS_S3_PREFIX
+          valueFrom:
+            configMapKeyRef:
+              key: KUBEFLOW_RESULTS_S3_PREFIX
+              name: kubeflow-ragas-config
+        - name: KUBEFLOW_S3_CREDENTIALS_SECRET_NAME
+          valueFrom:
+            configMapKeyRef:
+              key: KUBEFLOW_S3_CREDENTIALS_SECRET_NAME
+              name: kubeflow-ragas-config
+        - name: KUBEFLOW_PIPELINES_TOKEN
+          valueFrom:
+            secretKeyRef:
+              key: KUBEFLOW_PIPELINES_TOKEN
+              name: ragas-env
+              optional: true
+        # S3 / MinIO
+        - name: RESULTS_S3_ENDPOINT
+          valueFrom:
+            configMapKeyRef:
+              key: RESULTS_S3_ENDPOINT
+              name: kubeflow-ragas-config
+        - name: RESULTS_S3_PATH_STYLE
+          valueFrom:
+            configMapKeyRef:
+              key: RESULTS_S3_PATH_STYLE
+              name: kubeflow-ragas-config
+        - name: AWS_ACCESS_KEY_ID
+          valueFrom:
+            configMapKeyRef:
+              key: AWS_ACCESS_KEY_ID
+              name: kubeflow-ragas-config
+        - name: AWS_SECRET_ACCESS_KEY
+          valueFrom:
+            configMapKeyRef:
+              key: AWS_SECRET_ACCESS_KEY
+              name: kubeflow-ragas-config
+        - name: AWS_DEFAULT_REGION
+          valueFrom:
+            configMapKeyRef:
+              key: AWS_DEFAULT_REGION
+              name: kubeflow-ragas-config
+      name: llama-stack
+      port: 8321
+    distribution:
+      image: __LLAMA_STACK_IMAGE__
+    userConfig:
+      configMapName: llama-stack-e2e-config
diff --git a/tests/e2e/manifests/minio.yaml b/tests/e2e/manifests/minio.yaml
new file mode 100644
index 00000000..fd0df634
--- /dev/null
+++ b/tests/e2e/manifests/minio.yaml
@@ -0,0 +1,85 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: minio
+  namespace: ragas-test
+  labels:
+    app: minio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: minio
+  template:
+    metadata:
+      labels:
+        app: minio
+    spec:
+      containers:
+        - name: minio
+          image: quay.io/minio/minio:latest
+          args:
+            - server
+            - /data
+            - --console-address
+            - ":9001"
+          env:
+            - name: MINIO_ROOT_USER
+              value: "minioadmin"
+            - name: MINIO_ROOT_PASSWORD
+              value: "minioadmin"
+          ports:
+            - containerPort: 9000
+              name: s3
+            - containerPort: 9001
+              name: console
+          readinessProbe:
+            httpGet:
+              path: /minio/health/ready
+              port: 9000
+            initialDelaySeconds: 5
+            periodSeconds: 5
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: minio-service
+  namespace: ragas-test
+spec:
+  selector:
+    app: minio
+  ports:
+    - name: s3
+      port: 9000
+      targetPort: 9000
+    - name: console
+      port: 9001
+      targetPort: 9001
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: minio-create-bucket
+  namespace: ragas-test
+spec:
+  backoffLimit: 3
+  template:
+    spec:
+      restartPolicy: OnFailure
+      containers:
+        - name: mc
+          image: quay.io/minio/mc:latest
+          env:
+            - name: HOME
+              value: /tmp
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for MinIO to be ready..."
+              until mc alias set local http://minio-service:9000 minioadmin minioadmin 2>/dev/null; do
+                sleep 2
+              done
+              echo "MinIO is ready."
+              mc mb --ignore-existing local/ragas-results
+              echo "Bucket 'ragas-results' is ready."
diff --git a/tests/e2e/teardown-e2e.sh b/tests/e2e/teardown-e2e.sh
new file mode 100755
index 00000000..c6fc3c74
--- /dev/null
+++ b/tests/e2e/teardown-e2e.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+#
+# Tear down the llama-stack-provider-ragas e2e test environment.
+#
+
+set -e
+
+echo "Tearing down e2e test environment..."
+
+oc delete namespace ragas-test --ignore-not-found
+
+echo ""
+echo "Teardown complete."
diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py
new file mode 100644
index 00000000..41f00ad3
--- /dev/null
+++ b/tests/e2e/test_e2e.py
@@ -0,0 +1,181 @@
+"""End-to-end tests for the llama-stack-provider-ragas distribution on OpenShift.
+
+Prerequisites:
+    - OpenShift cluster with the e2e environment deployed (see deploy-e2e.sh)
+    - Port-forward active:
+        oc port-forward -n ragas-test svc/lsd-ragas-test-service 8321:8321
+
+Environment variables:
+    LLAMA_STACK_BASE_URL  - Llama Stack server URL (default: http://localhost:8321)
+    INFERENCE_MODEL       - Model ID for eval candidate (default: Mistral-Small-24B-W8A8)
+"""
+
+import os
+import time
+
+import pytest
+from llama_stack_client import LlamaStackClient
+from rich import print as pprint
+
+# Pre-registered resource IDs (must match llama-stack-distribution.yaml)
+INLINE_BENCHMARK_ID = "hf-doc-qa-ragas-inline-benchmark"
+REMOTE_BENCHMARK_ID = "hf-doc-qa-ragas-remote-benchmark"
+DATASET_ID = "hf_doc_qa_ragas_eval"
+
+POLL_INTERVAL = 5  # seconds
+POLL_TIMEOUT = 300  # seconds
+REMOTE_POLL_TIMEOUT = 600  # seconds – pipeline pods need to pull images and install packages
+
+RAW_EVALUATION_DATA = [
+    {
+        "user_input": "What is the capital of France?",
+        "response": "The capital of France is Paris.",
+        "retrieved_contexts": [
+            "Paris is the capital and most populous city of France."
+        ],
+        "reference": "Paris",
+    },
+    {
+        "user_input": "Who invented the telephone?",
+        "response": "Alexander Graham Bell invented the telephone in 1876.",
+        "retrieved_contexts": [
+            "Alexander Graham Bell was a Scottish-American inventor who patented the first practical telephone."
+        ],
+        "reference": "Alexander Graham Bell",
+    },
+    {
+        "user_input": "What is photosynthesis?",
+        "response": "Photosynthesis is the process by which plants convert sunlight into energy.",
+        "retrieved_contexts": [
+            "Photosynthesis is a process used by plants to convert light energy into chemical energy."
+        ],
+        "reference": "Photosynthesis is the process by which plants and other organisms convert light energy into chemical energy.",
+    },
+]
+
+
+@pytest.fixture(scope="module")
+def client():
+    base_url = os.getenv("LLAMA_STACK_BASE_URL", "http://localhost:8321")
+    return LlamaStackClient(base_url=base_url)
+
+
+@pytest.fixture(scope="module")
+def inference_model():
+    return os.getenv("INFERENCE_MODEL", "Mistral-Small-24B-W8A8")
+
+
+@pytest.fixture(scope="module", autouse=True)
+def register_dataset(client):
+    """Register the evaluation dataset with inline rows."""
+    client.beta.datasets.register(
+        dataset_id=DATASET_ID,
+        purpose="eval/messages-answer",
+        source={"type": "rows", "rows": RAW_EVALUATION_DATA},
+    )
+    yield
+    client.beta.datasets.unregister(dataset_id=DATASET_ID)
+
+
+def _wait_for_job(client, benchmark_id, job_id, timeout=POLL_TIMEOUT):
+    """Poll the eval job until a terminal status; TimeoutError after `timeout` s."""
+    deadline = time.monotonic() + timeout  # monotonic: unaffected by clock changes
+    while time.monotonic() < deadline:
+        job = client.alpha.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)
+        pprint("Job details:", job)
+        if job.status in ("completed", "failed", "cancelled"):  # all terminal
+            return job
+        time.sleep(POLL_INTERVAL)
+    raise TimeoutError(
+        f"Job {job_id} for benchmark {benchmark_id} did not complete within {timeout}s"
+    )
+
+
+class TestClusterSmoke:
+    """Verify the cluster has the expected resources registered."""
+
+    def test_models_registered(self, client):
+        models = client.models.list()
+        assert len(models) > 0, "No models registered"
+
+    def test_datasets_registered(self, client):
+        datasets = client.beta.datasets.list()
+        dataset_ids = [d.identifier for d in datasets]
+        assert DATASET_ID in dataset_ids, (
+            f"Dataset '{DATASET_ID}' not found. Available: {dataset_ids}"
+        )
+
+    def test_benchmarks_registered(self, client):
+        benchmarks = client.alpha.benchmarks.list()
+        benchmark_ids = [b.identifier for b in benchmarks]
+        assert INLINE_BENCHMARK_ID in benchmark_ids, (
+            f"Benchmark '{INLINE_BENCHMARK_ID}' not found. Available: {benchmark_ids}"
+        )
+
+
+class TestInlineEval:
+    """Run evaluation using the inline ragas provider."""
+
+    def test_run_eval(self, client, inference_model):
+        job = client.alpha.eval.run_eval(
+            benchmark_id=INLINE_BENCHMARK_ID,
+            benchmark_config={
+                "eval_candidate": {
+                    "type": "model",
+                    "model": inference_model,
+                    "sampling_params": {
+                        "temperature": 0.1,
+                        "max_tokens": 100,
+                    },
+                },
+                "scoring_params": {},
+                "num_examples": 3,
+            },
+        )
+        assert job.job_id is not None
+        assert job.status == "in_progress"
+
+        completed = _wait_for_job(client, INLINE_BENCHMARK_ID, job.job_id)
+        assert completed.status == "completed", (
+            f"Job finished with status '{completed.status}'"
+        )
+
+        results = client.alpha.eval.jobs.retrieve(
+            benchmark_id=INLINE_BENCHMARK_ID, job_id=job.job_id
+        )
+        assert results.scores, "Expected non-empty scores"
+
+
+class TestRemoteEval:
+    """Run evaluation using the remote ragas provider (KFP + MinIO)."""
+
+    def test_run_eval(self, client, inference_model):
+        job = client.alpha.eval.run_eval(
+            benchmark_id=REMOTE_BENCHMARK_ID,
+            benchmark_config={
+                "eval_candidate": {
+                    "type": "model",
+                    "model": inference_model,
+                    "sampling_params": {
+                        "temperature": 0.1,
+                        "max_tokens": 100,
+                    },
+                },
+                "scoring_params": {},
+                "num_examples": 3,
+            },
+        )
+        assert job.job_id is not None
+        assert job.status == "in_progress"
+
+        completed = _wait_for_job(
+            client, REMOTE_BENCHMARK_ID, job.job_id, timeout=REMOTE_POLL_TIMEOUT
+        )
+        assert completed.status == "completed", (
+            f"Job finished with status '{completed.status}'"
+        )
+
+        results = client.alpha.eval.jobs.retrieve(
+            benchmark_id=REMOTE_BENCHMARK_ID, job_id=job.job_id
+        )
+        assert results.scores, "Expected non-empty scores"

From ff40fcc52bb48c4fb97018c118c27528194d523d Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Mon, 23 Feb 2026 12:41:20 -0500
Subject: [PATCH 2/7] e2e tests.

---
 tests/TESTING.md  |   9 +++
 tests/conftest.py |   2 +-
 tests/test_e2e.py | 189 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 tests/TESTING.md
 create mode 100644 tests/test_e2e.py

diff --git a/tests/TESTING.md b/tests/TESTING.md
new file mode 100644
index 00000000..40076b3f
--- /dev/null
+++ b/tests/TESTING.md
@@ -0,0 +1,9 @@
+# Testing
+
+## Unit
+
+## Integration (In-process Llama Stack server)
+These tests use the `LlamaStackAsLibraryClient` to run the evaluation in-process.
+
+## End-to-End (With a running Llama Stack server)
+These tests use the `LlamaStackClient` to run the evaluation against a running Llama Stack server.
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index ddbc3404..43aee653 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -200,7 +200,7 @@ def remote_eval_config(embedding_model, kubeflow_config):
     )
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def raw_evaluation_data():
     """Sample data for Ragas evaluation."""
     return [
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
new file mode 100644
index 00000000..9c43a170
--- /dev/null
+++ b/tests/test_e2e.py
@@ -0,0 +1,189 @@
+"""End-to-end tests for the llama-stack-provider-ragas distribution on OpenShift.
+
+Prerequisites:
+    - OpenShift cluster with the e2e environment deployed (see deploy-e2e.sh)
+    - Port-forward active:
+        oc port-forward -n ragas-test svc/lsd-ragas-test-service 8321:8321
+
+Environment variables:
+    LLAMA_STACK_BASE_URL  - Llama Stack server URL (default: http://localhost:8321)
+    INFERENCE_MODEL       - Model ID for eval candidate (default: Mistral-Small-24B-W8A8)
+"""
+
+import os
+import time
+
+import pytest
+from llama_stack_client import LlamaStackClient
+from rich import print as pprint
+
+INLINE_BENCHMARK_ID = "hf-doc-qa-ragas-inline-benchmark"
+REMOTE_BENCHMARK_ID = "hf-doc-qa-ragas-remote-benchmark"
+DATASET_ID = "hf_doc_qa_ragas_eval"
+
+POLL_INTERVAL = 5  # seconds
+POLL_TIMEOUT = 300  # seconds
+REMOTE_POLL_TIMEOUT = (
+    600  # seconds – pipeline pods need to pull images and install packages
+)
+
+
+@pytest.fixture(scope="module")
+def client():
+    base_url = os.getenv("LLAMA_STACK_BASE_URL", "http://localhost:8321")
+    return LlamaStackClient(base_url=base_url)
+
+
+@pytest.fixture(scope="module")
+def inference_model():
+    return os.getenv("INFERENCE_MODEL", "Mistral-Small-24B-W8A8")
+
+
+@pytest.fixture(scope="module")
+def register_dataset(client, raw_evaluation_data):
+    """Register the evaluation dataset with inline rows."""
+    client.beta.datasets.register(
+        dataset_id=DATASET_ID,
+        purpose="eval/messages-answer",
+        source={"type": "rows", "rows": raw_evaluation_data},
+    )
+    yield
+    try:
+        client.beta.datasets.unregister(dataset_id=DATASET_ID)
+    except Exception:
+        pass
+
+
+@pytest.fixture(scope="module")
+def register_benchmarks(client, register_dataset):
+    """Register evaluation benchmarks for inline and remote providers."""
+    client.alpha.benchmarks.register(
+        benchmark_id=INLINE_BENCHMARK_ID,
+        dataset_id=DATASET_ID,
+        scoring_functions=["semantic_similarity"],
+        provider_id="trustyai_ragas_inline",
+    )
+    client.alpha.benchmarks.register(
+        benchmark_id=REMOTE_BENCHMARK_ID,
+        dataset_id=DATASET_ID,
+        scoring_functions=["semantic_similarity"],
+        provider_id="trustyai_ragas_remote",
+    )
+    yield
+    for bid in (INLINE_BENCHMARK_ID, REMOTE_BENCHMARK_ID):
+        try:
+            client.alpha.benchmarks.unregister(benchmark_id=bid)
+        except Exception:
+            pass
+
+
+def _wait_for_job(client, benchmark_id, job_id, timeout=POLL_TIMEOUT):
+    """Poll until the eval job reaches a terminal state."""
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        job = client.alpha.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)
+        pprint("Job details:", job)
+        if job.status in ("completed", "failed"):
+            return job
+        time.sleep(POLL_INTERVAL)
+    raise TimeoutError(
+        f"Job {job_id} for benchmark {benchmark_id} did not complete within {timeout}s"
+    )
+
+
+@pytest.mark.usefixtures("register_benchmarks")
+class TestClusterSmoke:
+    """Verify the cluster has the expected resources registered."""
+
+    def test_models_registered(self, client):
+        models = client.models.list()
+        pprint("Models:", models)
+        assert len(models) > 0, "No models registered"
+
+    def test_datasets_registered(self, client):
+        datasets = client.beta.datasets.list()
+        pprint("Datasets:", datasets)
+        dataset_ids = [d.identifier for d in datasets]
+        assert DATASET_ID in dataset_ids, (
+            f"Dataset '{DATASET_ID}' not found. Available: {dataset_ids}"
+        )
+
+    def test_benchmarks_registered(self, client):
+        benchmarks = client.alpha.benchmarks.list()
+        pprint("Benchmarks:", benchmarks)
+        benchmark_ids = [b.identifier for b in benchmarks]
+        assert INLINE_BENCHMARK_ID in benchmark_ids, (
+            f"Benchmark '{INLINE_BENCHMARK_ID}' not found. Available: {benchmark_ids}"
+        )
+
+
+@pytest.mark.usefixtures("register_benchmarks")
+class TestInlineEval:
+    """Run evaluation using the inline ragas provider."""
+
+    def test_run_eval(self, client, inference_model):
+        job = client.alpha.eval.run_eval(
+            benchmark_id=INLINE_BENCHMARK_ID,
+            benchmark_config={
+                "eval_candidate": {
+                    "type": "model",
+                    "model": inference_model,
+                    "sampling_params": {
+                        "temperature": 0.1,
+                        "max_tokens": 100,
+                    },
+                },
+                "scoring_params": {},
+                "num_examples": 3,
+            },
+        )
+        assert job.job_id is not None
+        assert job.status == "in_progress"
+
+        completed = _wait_for_job(client, INLINE_BENCHMARK_ID, job.job_id)
+        assert completed.status == "completed", (
+            f"Job finished with status '{completed.status}'"
+        )
+
+        results = client.alpha.eval.jobs.retrieve(
+            benchmark_id=INLINE_BENCHMARK_ID, job_id=job.job_id
+        )
+        pprint("Results:", results)
+        assert results.scores, "Expected non-empty scores"
+
+
+@pytest.mark.usefixtures("register_benchmarks")
+class TestRemoteEval:
+    """Run evaluation using the remote ragas provider (KFP + MinIO)."""
+
+    def test_run_eval(self, client, inference_model):
+        job = client.alpha.eval.run_eval(
+            benchmark_id=REMOTE_BENCHMARK_ID,
+            benchmark_config={
+                "eval_candidate": {
+                    "type": "model",
+                    "model": inference_model,
+                    "sampling_params": {
+                        "temperature": 0.1,
+                        "max_tokens": 100,
+                    },
+                },
+                "scoring_params": {},
+                "num_examples": 3,
+            },
+        )
+        assert job.job_id is not None
+        assert job.status == "in_progress"
+
+        completed = _wait_for_job(
+            client, REMOTE_BENCHMARK_ID, job.job_id, timeout=REMOTE_POLL_TIMEOUT
+        )
+        assert completed.status == "completed", (
+            f"Job finished with status '{completed.status}'"
+        )
+
+        results = client.alpha.eval.jobs.retrieve(
+            benchmark_id=REMOTE_BENCHMARK_ID, job_id=job.job_id
+        )
+        pprint("Results:", results)
+        assert results.scores, "Expected non-empty scores"

From 3bf8ef1978b31837f7043e7d6dc0e97297c5102a Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Tue, 24 Feb 2026 12:22:11 -0500
Subject: [PATCH 3/7] Refactor testing setup and enhance evaluation tests

This commit introduces several improvements to the testing framework for the llama-stack-provider-ragas. Key changes include:

- Updated the pre-commit configuration to run only the tests selected by the pytest marker expression "unit or lls_integration".
- Added shared evaluation helper classes (`SmokeTester` and `EvalTester`) to `base_eval_tests.py` for better test organization and reusability.
- Enhanced inline evaluation tests to utilize the new helper classes, improving clarity and maintainability.
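- Removed the outdated Kubeflow integration tests (`tests/test_kubeflow_integration.py`), replaced the `kfp_integration` pytest marker with `e2e`, and streamlined the test structure for better focus on current testing strategies.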

These changes aim to improve the robustness and clarity of the testing process, ensuring better coverage and easier maintenance.
---
 .pre-commit-config.yaml            |   2 +-
 pyproject.toml                     |   3 +-
 tests/TESTING.md                   |  49 +++++-
 tests/base_eval_tests.py           | 141 +++++++++++++++++
 tests/conftest.py                  | 243 ++++++++---------------------
 tests/test_e2e.py                  | 185 +++++-----------------
 tests/test_inline_evaluation.py    | 216 +++++++++++++------------
 tests/test_kubeflow_integration.py | 242 ----------------------------
 tests/test_remote_wrappers.py      | 190 ++++++++++++++++++++--
 9 files changed, 586 insertions(+), 685 deletions(-)
 create mode 100644 tests/base_eval_tests.py
 delete mode 100644 tests/test_kubeflow_integration.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index df7c2b47..fc4c0aee 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,4 +32,4 @@ repos:
         language: system
         pass_filenames: false
         always_run: true
-        args: [-c, 'KUBEFLOW_BASE_IMAGE=dummy uv run pytest -v -m "not lls_integration and not kfp_integration" --tb=short --maxfail=3; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret']
+        args: [-c, 'KUBEFLOW_BASE_IMAGE=dummy uv run pytest -v -m "unit or lls_integration" --tb=short --maxfail=3; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret']
diff --git a/pyproject.toml b/pyproject.toml
index 8a6f5f5f..8b44ce73 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,8 +64,9 @@ addopts = "-v"
 log_cli = true
 log_cli_level = "INFO"
 markers = [
+    "unit: Unit tests for wrapper classes (mocked client by default)",
     "lls_integration: Llama Stack integration tests",
-    "kfp_integration: Kubeflow Pipelines integration tests",
+    "e2e: End-to-end tests against a deployed Llama Stack distribution on OpenShift",
 ]
 
 [tool.ruff]
diff --git a/tests/TESTING.md b/tests/TESTING.md
index 40076b3f..bda59e88 100644
--- a/tests/TESTING.md
+++ b/tests/TESTING.md
@@ -1,9 +1,48 @@
 # Testing
 
-## Unit
+All test files live under `tests/`. Shared evaluation logic (smoke checks, eval job polling) is factored into `base_eval_tests.py`, which is not collected by pytest directly.
 
-## Integration (In-process Llama Stack server)
-These tests use the `LlamaStackAsLibraryClient` to run the evaluation in-process.
+## Unit tests (`test_remote_wrappers.py`, pytest marker `unit`)
 
-## End-to-End (With a running Llama Stack server)
-These tests use the `LlamaStackClient` to run the evaluation against a running Llama Stack server.
\ No newline at end of file
+Tests the LangChain-compatible wrapper classes (`LlamaStackRemoteLLM` and `LlamaStackRemoteEmbeddings`) that the remote provider uses for inference. By default, the `LlamaStackClient` is mocked — no running server is required.
+
+```bash
+uv run pytest tests/test_remote_wrappers.py
+```
+
+Pass `--no-mock-client` to use a real `LlamaStackClient` against a running Llama Stack server (defaults to `http://localhost:8321`). Model IDs can be overridden with `INFERENCE_MODEL` and `EMBEDDING_MODEL`.
+
+```bash
+uv run pytest tests/test_remote_wrappers.py --no-mock-client
+```
+
+## Integration tests (`test_inline_evaluation.py`, pytest marker `lls_integration`)
+
+Tests the eval providers through an in-process Llama Stack server using `LlamaStackAsLibraryClient`. The stack configuration (providers, models, storage) is built entirely in fixtures. By default, Ollama connectivity and inference are mocked.
+
+```bash
+uv run pytest tests/test_inline_evaluation.py
+```
+
+Pass `--no-mock-inference` to use a real Ollama instance for inference:
+
+```bash
+INFERENCE_MODEL=ollama/granite3.3:2b \
+EMBEDDING_MODEL=ollama/all-minilm:latest \
+    uv run pytest tests/test_inline_evaluation.py --no-mock-inference
+```
+
+## End-to-end tests (`test_e2e.py`, pytest marker `e2e`)
+
+Tests against a fully deployed Llama Stack distribution on an OpenShift cluster. Requires the cluster environment from `cluster-deployment/` to be set up and a port-forward to the Llama Stack service:
+
+```bash
+oc port-forward -n ragas-test svc/lsd-ragas-test-service 8321:8321
+uv run pytest tests/test_e2e.py
+```
+
+These tests exercise both the inline and remote eval providers through the Llama Stack eval API, including dataset registration, benchmark creation, and eval job execution with result polling.
+
+## Cluster deployment (`cluster-deployment/`)
+
+Contains the Containerfile, deployment/teardown scripts, and Kubernetes manifests needed to stand up the e2e test environment on OpenShift. See `cluster-deployment/deploy-e2e.sh` to deploy.
diff --git a/tests/base_eval_tests.py b/tests/base_eval_tests.py
new file mode 100644
index 00000000..8b7f73b8
--- /dev/null
+++ b/tests/base_eval_tests.py
@@ -0,0 +1,141 @@
+"""Shared test helpers for Llama Stack eval provider tests.
+
+Provides ``SmokeTester`` and ``EvalTester``, plain helper classes that
+encapsulate common assertions (model/dataset/benchmark registration) and
+eval-job execution logic (run, poll, verify scores).  Test modules
+instantiate them via fixtures, supplying the appropriate client and
+configuration for each environment (in-process library client or remote
+``LlamaStackClient``).
+"""
+
+import time
+
+from rich import print as pprint
+
+
+class SmokeTester:
+    def __init__(self, client, dataset_id, inline_benchmark_id, remote_benchmark_id):
+        self.client = client
+        self.dataset_id = dataset_id
+        self.inline_benchmark_id = inline_benchmark_id
+        self.remote_benchmark_id = remote_benchmark_id
+
+    def test_providers_registered(self):
+        providers = self.client.providers.list()
+        assert len(providers) > 0
+        assert any(p.api == "eval" for p in providers)
+        pprint("Providers:", providers)
+
+    def test_models_registered(self):
+        models = self.client.models.list()
+        pprint("Models:", models)
+        assert len(models) > 0, "No models registered"
+
+    def test_datasets_registered(self):
+        datasets = self.client.beta.datasets.list()
+        pprint("Datasets:", datasets)
+        dataset_ids = [d.identifier for d in datasets]
+        assert self.dataset_id in dataset_ids, (
+            f"Dataset '{self.dataset_id}' not found. Available: {dataset_ids}"
+        )
+
+    def test_benchmarks_registered(self):
+        benchmarks = self.client.alpha.benchmarks.list()
+        pprint("Benchmarks:", benchmarks)
+        benchmark_ids = [b.identifier for b in benchmarks]
+        assert self.inline_benchmark_id in benchmark_ids, (
+            f"Benchmark '{self.inline_benchmark_id}' not found. Available: {benchmark_ids}"
+        )
+        assert self.remote_benchmark_id in benchmark_ids, (
+            f"Benchmark '{self.remote_benchmark_id}' not found. Available: {benchmark_ids}"
+        )
+
+
+class EvalTester:
+    """Base evaluation test class."""
+
+    def __init__(
+        self,
+        client,
+        inference_model,
+        dataset_id,
+        inline_benchmark_id,
+        remote_benchmark_id,
+        poll_interval: int = 5,
+        poll_timeout: int = 300,
+    ):
+        self.client = client
+        self.inference_model = inference_model
+        self.dataset_id = dataset_id
+        self.inline_benchmark_id = inline_benchmark_id
+        self.remote_benchmark_id = remote_benchmark_id
+        self.poll_interval = poll_interval
+        self.poll_timeout = poll_timeout
+
+    def run_eval(
+        self,
+        benchmark_id: str,
+        inference_model: str,
+        num_examples: int | None = None,
+    ):
+        """Run an evaluation job and verify it completes with scores."""
+        benchmark_config = self._build_benchmark_config(
+            inference_model, num_examples=num_examples
+        )
+        job = self.client.alpha.eval.run_eval(
+            benchmark_id=benchmark_id,
+            benchmark_config=benchmark_config,
+        )
+        assert job.job_id is not None
+        assert job.status == "in_progress"
+
+        completed = self._wait_for_job(self.client, benchmark_id, job.job_id)
+        assert completed.status == "completed", (
+            f"Job finished with status '{completed.status}'"
+        )
+
+        results = self.client.alpha.eval.jobs.retrieve(
+            benchmark_id=benchmark_id, job_id=job.job_id
+        )
+        pprint(f"[{self.__class__.__name__}] Results:", results)
+        assert results.scores, "Expected non-empty scores"
+
+    # -- helpers --------------------------------------------------------
+
+    def _build_benchmark_config(
+        self, inference_model: str, num_examples: int | None = None
+    ) -> dict:
+        """Build the ``benchmark_config`` dict for ``run_eval``."""
+        config: dict = {
+            "eval_candidate": {
+                "type": "model",
+                "model": inference_model,
+                "sampling_params": {
+                    "temperature": 0.1,
+                    "max_tokens": 100,
+                },
+            },
+            "scoring_params": {},
+        }
+        if num_examples is not None:
+            config["num_examples"] = num_examples
+        return config
+
+    def _wait_for_job(
+        self, client, benchmark_id: str, job_id: str, timeout: int | None = None
+    ):
+        """Poll until the eval job reaches a terminal state."""
+        timeout = timeout if timeout is not None else self.poll_timeout
+        deadline = time.time() + timeout
+        while time.time() < deadline:
+            job = client.alpha.eval.jobs.status(
+                benchmark_id=benchmark_id, job_id=job_id
+            )
+            pprint(f"[{self.__class__.__name__}] Job status:", job)
+            if job.status in ("completed", "failed"):
+                return job
+            time.sleep(self.poll_interval)
+        raise TimeoutError(
+            f"Job {job_id} for benchmark {benchmark_id} "
+            f"did not complete within {timeout}s"
+        )
diff --git a/tests/conftest.py b/tests/conftest.py
index 43aee653..dce9eb93 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,30 +1,8 @@
 import os
-import random
-from datetime import datetime
 
 import pytest
-from dotenv import load_dotenv
-from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient
-from llama_stack_client.types.completion_create_response import (
-    Choice,
-    CompletionCreateResponse,
-)
-from llama_stack_client.types.create_embeddings_response import (
-    CreateEmbeddingsResponse,
-    Data,
-    Usage,
-)
 from ragas import EvaluationDataset
 
-from llama_stack_provider_ragas.compat import SamplingParams, TopPSamplingStrategy
-from llama_stack_provider_ragas.config import (
-    KubeflowConfig,
-    RagasProviderInlineConfig,
-    RagasProviderRemoteConfig,
-)
-
-load_dotenv()
-
 
 def pytest_addoption(parser):
     parser.addoption(
@@ -32,11 +10,16 @@ def pytest_addoption(parser):
         action="store_true",
         help="Don't mock LLM inference (embeddings and completions)",
     )
+    parser.addoption(
+        "--no-mock-client",
+        action="store_true",
+        help="Don't mock the LlamaStackClient; use a real server for wrapper tests",
+    )
 
 
-@pytest.fixture
-def unique_timestamp():
-    return datetime.now().strftime("%Y%m%d_%H%M%S")
+@pytest.fixture(scope="session")
+def llama_stack_base_url():
+    return os.getenv("LLAMA_STACK_BASE_URL", "http://localhost:8321")
 
 
 @pytest.fixture
@@ -45,161 +28,6 @@ def embedding_dimension():
     return 384
 
 
-@pytest.fixture
-def lls_client(request):
-    if request.config.getoption("--no-mock-inference") is True:
-        return request.getfixturevalue("real_lls_client")
-    else:
-        return request.getfixturevalue("mocked_lls_client")
-
-
-@pytest.fixture
-def real_lls_client():
-    return LlamaStackClient(base_url=os.environ.get("KUBEFLOW_LLAMA_STACK_URL"))
-
-
-@pytest.fixture(autouse=True)
-def mocked_llm_response(request):
-    return getattr(request, "param", "Hello, world!")
-
-
-@pytest.fixture()
-def mocked_lls_client(mocked_lls_clients):
-    sync_client, _ = mocked_lls_clients
-    return sync_client
-
-
-@pytest.fixture()
-def mocked_lls_clients(monkeypatch, request, embedding_dimension):
-    """
-    Mock LLM inference (embeddings and completions) by default,
-    unless --mock-inference=False is passed in the command line.
-
-    You can indirectly parametrize this fixture to customize completion text:
-
-        @pytest.mark.parametrize(
-            "mocked_llm_response",
-            ["Hello from mock!"],
-            indirect=True,
-        )
-    """
-    # Create real clients, but patch only the `.create()` methods we need.
-    base_url = os.environ.get("KUBEFLOW_LLAMA_STACK_URL")
-    sync_client = LlamaStackClient(base_url=base_url)
-    async_client = AsyncLlamaStackClient(base_url=base_url)
-
-    completion_text = request.getfixturevalue("mocked_llm_response")
-
-    def _make_embeddings_response(n: int) -> CreateEmbeddingsResponse:
-        # return one embedding vector per input string
-        return CreateEmbeddingsResponse(
-            data=[
-                Data(
-                    embedding=[random.random() for _ in range(embedding_dimension)],
-                    index=i,
-                    object="embedding",
-                )
-                for i in range(n)
-            ],
-            model="mocked/model",
-            object="list",
-            usage=Usage(prompt_tokens=10, total_tokens=10),
-        )
-
-    def _make_completions_response(text: str) -> CompletionCreateResponse:
-        return CompletionCreateResponse(
-            id="cmpl-123",
-            created=1717000000,
-            choices=[Choice(index=0, text=text, finish_reason="stop")],
-            model="mocked/model",
-            object="text_completion",
-        )
-
-    def _embeddings_create(*args, **kwargs):
-        embedding_input = kwargs.get("input")
-        if isinstance(embedding_input, list):
-            return _make_embeddings_response(len(embedding_input))
-        return _make_embeddings_response(1)
-
-    async def _async_embeddings_create(*args, **kwargs):
-        embedding_input = kwargs.get("input")
-        if isinstance(embedding_input, list):
-            return _make_embeddings_response(len(embedding_input))
-        return _make_embeddings_response(1)
-
-    def _completions_create(*args, **kwargs):
-        return _make_completions_response(completion_text)
-
-    async def _async_completions_create(*args, **kwargs):
-        return _make_completions_response(completion_text)
-
-    # Patch nested methods (avoids dotted-attribute monkeypatch issues on classes).
-    monkeypatch.setattr(sync_client.embeddings, "create", _embeddings_create)
-    monkeypatch.setattr(sync_client.completions, "create", _completions_create)
-    monkeypatch.setattr(async_client.embeddings, "create", _async_embeddings_create)
-    monkeypatch.setattr(async_client.completions, "create", _async_completions_create)
-
-    return sync_client, async_client
-
-
-@pytest.fixture(autouse=True)
-def patch_remote_wrappers(monkeypatch, mocked_lls_clients, request):
-    sync_client, async_client = mocked_lls_clients
-    if request.config.getoption("--no-mock-inference") is not True:
-        from llama_stack_provider_ragas.remote import wrappers_remote
-
-        monkeypatch.setattr(
-            wrappers_remote, "LlamaStackClient", lambda *a, **k: sync_client
-        )
-        monkeypatch.setattr(
-            wrappers_remote, "AsyncLlamaStackClient", lambda *a, **k: async_client
-        )
-
-
-@pytest.fixture
-def model():
-    return "ollama/granite3.3:2b"  # TODO : read from env
-
-
-@pytest.fixture
-def embedding_model():
-    return "ollama/all-minilm:latest"
-
-
-@pytest.fixture
-def sampling_params():
-    return SamplingParams(
-        strategy=TopPSamplingStrategy(temperature=0.1, top_p=0.95),
-        max_tokens=100,
-        stop=None,
-    )
-
-
-@pytest.fixture
-def inline_eval_config(embedding_model):
-    return RagasProviderInlineConfig(embedding_model=embedding_model)
-
-
-@pytest.fixture
-def kubeflow_config():
-    return KubeflowConfig(
-        pipelines_endpoint=os.environ["KUBEFLOW_PIPELINES_ENDPOINT"],
-        namespace=os.environ["KUBEFLOW_NAMESPACE"],
-        llama_stack_url=os.environ["KUBEFLOW_LLAMA_STACK_URL"],
-        base_image=os.environ["KUBEFLOW_BASE_IMAGE"],
-        results_s3_prefix=os.environ["KUBEFLOW_RESULTS_S3_PREFIX"],
-        s3_credentials_secret_name=os.environ["KUBEFLOW_S3_CREDENTIALS_SECRET_NAME"],
-    )
-
-
-@pytest.fixture
-def remote_eval_config(embedding_model, kubeflow_config):
-    return RagasProviderRemoteConfig(
-        embedding_model=embedding_model,
-        kubeflow_config=kubeflow_config,
-    )
-
-
 @pytest.fixture(scope="session")
 def raw_evaluation_data():
     """Sample data for Ragas evaluation."""
@@ -235,3 +63,58 @@ def raw_evaluation_data():
 def evaluation_dataset(raw_evaluation_data):
     """Create EvaluationDataset from sample data."""
     return EvaluationDataset.from_list(raw_evaluation_data)
+
+
+@pytest.fixture(scope="session")
+def dataset_id():
+    return "ragas_test_dataset"
+
+
+@pytest.fixture(scope="session")
+def inline_benchmark_id():
+    return "hf-doc-qa-ragas-inline-benchmark"
+
+
+@pytest.fixture(scope="session")
+def remote_benchmark_id():
+    return "hf-doc-qa-ragas-remote-benchmark"
+
+
+@pytest.fixture
+def register_dataset(client, raw_evaluation_data, dataset_id):
+    """Register the evaluation dataset with inline rows."""
+    client.beta.datasets.register(
+        dataset_id=dataset_id,
+        purpose="eval/messages-answer",
+        source={"type": "rows", "rows": raw_evaluation_data},
+    )
+    yield
+    try:
+        client.beta.datasets.unregister(dataset_id=dataset_id)
+    except Exception:
+        pass
+
+
+@pytest.fixture
+def register_benchmarks(
+    client, register_dataset, dataset_id, inline_benchmark_id, remote_benchmark_id
+):
+    """Register evaluation benchmarks for inline and remote providers."""
+    client.alpha.benchmarks.register(
+        benchmark_id=inline_benchmark_id,
+        dataset_id=dataset_id,
+        scoring_functions=["semantic_similarity"],
+        provider_id="trustyai_ragas_inline",
+    )
+    client.alpha.benchmarks.register(
+        benchmark_id=remote_benchmark_id,
+        dataset_id=dataset_id,
+        scoring_functions=["semantic_similarity"],
+        provider_id="trustyai_ragas_remote",
+    )
+    yield
+    for bid in (inline_benchmark_id, remote_benchmark_id):
+        try:
+            client.alpha.benchmarks.unregister(benchmark_id=bid)
+        except Exception:
+            pass
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 9c43a170..219bd583 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -1,7 +1,7 @@
 """End-to-end tests for the llama-stack-provider-ragas distribution on OpenShift.
 
 Prerequisites:
-    - OpenShift cluster with the e2e environment deployed (see deploy-e2e.sh)
+    - OpenShift cluster with the e2e environment deployed (see cluster-deployment directory)
     - Port-forward active:
         oc port-forward -n ragas-test svc/lsd-ragas-test-service 8321:8321
 
@@ -11,27 +11,17 @@
 """
 
 import os
-import time
 
 import pytest
+from base_eval_tests import EvalTester, SmokeTester
 from llama_stack_client import LlamaStackClient
-from rich import print as pprint
 
-INLINE_BENCHMARK_ID = "hf-doc-qa-ragas-inline-benchmark"
-REMOTE_BENCHMARK_ID = "hf-doc-qa-ragas-remote-benchmark"
-DATASET_ID = "hf_doc_qa_ragas_eval"
-
-POLL_INTERVAL = 5  # seconds
-POLL_TIMEOUT = 300  # seconds
-REMOTE_POLL_TIMEOUT = (
-    600  # seconds – pipeline pods need to pull images and install packages
-)
+pytestmark = pytest.mark.e2e
 
 
 @pytest.fixture(scope="module")
-def client():
-    base_url = os.getenv("LLAMA_STACK_BASE_URL", "http://localhost:8321")
-    return LlamaStackClient(base_url=base_url)
+def client(llama_stack_base_url):
+    return LlamaStackClient(base_url=llama_stack_base_url)
 
 
 @pytest.fixture(scope="module")
@@ -40,150 +30,49 @@ def inference_model():
 
 
 @pytest.fixture(scope="module")
-def register_dataset(client, raw_evaluation_data):
-    """Register the evaluation dataset with inline rows."""
-    client.beta.datasets.register(
-        dataset_id=DATASET_ID,
-        purpose="eval/messages-answer",
-        source={"type": "rows", "rows": raw_evaluation_data},
-    )
-    yield
-    try:
-        client.beta.datasets.unregister(dataset_id=DATASET_ID)
-    except Exception:
-        pass
+def embedding_model():
+    return os.getenv("EMBEDDING_MODEL", "embedding/nomic-ai/nomic-embed-text-v1.5")
 
 
 @pytest.fixture(scope="module")
-def register_benchmarks(client, register_dataset):
-    """Register evaluation benchmarks for inline and remote providers."""
-    client.alpha.benchmarks.register(
-        benchmark_id=INLINE_BENCHMARK_ID,
-        dataset_id=DATASET_ID,
-        scoring_functions=["semantic_similarity"],
-        provider_id="trustyai_ragas_inline",
-    )
-    client.alpha.benchmarks.register(
-        benchmark_id=REMOTE_BENCHMARK_ID,
-        dataset_id=DATASET_ID,
-        scoring_functions=["semantic_similarity"],
-        provider_id="trustyai_ragas_remote",
+def smoke_tester(client, dataset_id, inline_benchmark_id, remote_benchmark_id):
+    return SmokeTester(
+        client,
+        dataset_id,
+        inline_benchmark_id,
+        remote_benchmark_id,
     )
-    yield
-    for bid in (INLINE_BENCHMARK_ID, REMOTE_BENCHMARK_ID):
-        try:
-            client.alpha.benchmarks.unregister(benchmark_id=bid)
-        except Exception:
-            pass
-
-
-def _wait_for_job(client, benchmark_id, job_id, timeout=POLL_TIMEOUT):
-    """Poll until the eval job reaches a terminal state."""
-    deadline = time.time() + timeout
-    while time.time() < deadline:
-        job = client.alpha.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)
-        pprint("Job details:", job)
-        if job.status in ("completed", "failed"):
-            return job
-        time.sleep(POLL_INTERVAL)
-    raise TimeoutError(
-        f"Job {job_id} for benchmark {benchmark_id} did not complete within {timeout}s"
+
+
+@pytest.fixture(scope="module")
+def eval_tester(
+    client, inference_model, dataset_id, inline_benchmark_id, remote_benchmark_id
+):
+    return EvalTester(
+        client,
+        inference_model,
+        dataset_id,
+        inline_benchmark_id,
+        remote_benchmark_id,
     )
 
 
 @pytest.mark.usefixtures("register_benchmarks")
-class TestClusterSmoke:
-    """Verify the cluster has the expected resources registered."""
-
-    def test_models_registered(self, client):
-        models = client.models.list()
-        pprint("Models:", models)
-        assert len(models) > 0, "No models registered"
-
-    def test_datasets_registered(self, client):
-        datasets = client.beta.datasets.list()
-        pprint("Datasets:", datasets)
-        dataset_ids = [d.identifier for d in datasets]
-        assert DATASET_ID in dataset_ids, (
-            f"Dataset '{DATASET_ID}' not found. Available: {dataset_ids}"
-        )
-
-    def test_benchmarks_registered(self, client):
-        benchmarks = client.alpha.benchmarks.list()
-        pprint("Benchmarks:", benchmarks)
-        benchmark_ids = [b.identifier for b in benchmarks]
-        assert INLINE_BENCHMARK_ID in benchmark_ids, (
-            f"Benchmark '{INLINE_BENCHMARK_ID}' not found. Available: {benchmark_ids}"
-        )
+def test_cluster_smoke(smoke_tester):
+    smoke_tester.test_models_registered()
+    smoke_tester.test_datasets_registered()
+    smoke_tester.test_benchmarks_registered()
 
 
 @pytest.mark.usefixtures("register_benchmarks")
-class TestInlineEval:
-    """Run evaluation using the inline ragas provider."""
-
-    def test_run_eval(self, client, inference_model):
-        job = client.alpha.eval.run_eval(
-            benchmark_id=INLINE_BENCHMARK_ID,
-            benchmark_config={
-                "eval_candidate": {
-                    "type": "model",
-                    "model": inference_model,
-                    "sampling_params": {
-                        "temperature": 0.1,
-                        "max_tokens": 100,
-                    },
-                },
-                "scoring_params": {},
-                "num_examples": 3,
-            },
-        )
-        assert job.job_id is not None
-        assert job.status == "in_progress"
-
-        completed = _wait_for_job(client, INLINE_BENCHMARK_ID, job.job_id)
-        assert completed.status == "completed", (
-            f"Job finished with status '{completed.status}'"
-        )
-
-        results = client.alpha.eval.jobs.retrieve(
-            benchmark_id=INLINE_BENCHMARK_ID, job_id=job.job_id
-        )
-        pprint("Results:", results)
-        assert results.scores, "Expected non-empty scores"
+def test_inline_eval(eval_tester, inline_benchmark_id, inference_model):
+    eval_tester.poll_interval = 3
+    eval_tester.poll_timeout = 30
+    eval_tester.run_eval(inline_benchmark_id, inference_model, num_examples=3)
 
 
 @pytest.mark.usefixtures("register_benchmarks")
-class TestRemoteEval:
-    """Run evaluation using the remote ragas provider (KFP + MinIO)."""
-
-    def test_run_eval(self, client, inference_model):
-        job = client.alpha.eval.run_eval(
-            benchmark_id=REMOTE_BENCHMARK_ID,
-            benchmark_config={
-                "eval_candidate": {
-                    "type": "model",
-                    "model": inference_model,
-                    "sampling_params": {
-                        "temperature": 0.1,
-                        "max_tokens": 100,
-                    },
-                },
-                "scoring_params": {},
-                "num_examples": 3,
-            },
-        )
-        assert job.job_id is not None
-        assert job.status == "in_progress"
-
-        completed = _wait_for_job(
-            client, REMOTE_BENCHMARK_ID, job.job_id, timeout=REMOTE_POLL_TIMEOUT
-        )
-        assert completed.status == "completed", (
-            f"Job finished with status '{completed.status}'"
-        )
-
-        results = client.alpha.eval.jobs.retrieve(
-            benchmark_id=REMOTE_BENCHMARK_ID, job_id=job.job_id
-        )
-        pprint("Results:", results)
-        assert results.scores, "Expected non-empty scores"
+def test_remote_eval(eval_tester, remote_benchmark_id, inference_model):
+    eval_tester.poll_interval = 10
+    eval_tester.poll_timeout = 300
+    eval_tester.run_eval(remote_benchmark_id, inference_model, num_examples=3)
diff --git a/tests/test_inline_evaluation.py b/tests/test_inline_evaluation.py
index c0d986ff..2e4ec34f 100644
--- a/tests/test_inline_evaluation.py
+++ b/tests/test_inline_evaluation.py
@@ -1,24 +1,41 @@
-"""Test inline evaluation."""
+"""Llama Stack integration tests using an in-process server.
 
-import json
+These tests use ``LlamaStackAsLibraryClient`` to spin up a Llama Stack
+server in-process with the configuration defined in the
+``library_stack_config`` fixture.  By default, LLM inference (embeddings
+and completions) is mocked so no external services are required.
+
+To run against a real inference provider (e.g. a local Ollama instance),
+pass ``--no-mock-inference``::
+
+    pytest tests/test_inline_evaluation.py --no-mock-inference
+
+You can also override the models via environment variables::
+
+    INFERENCE_MODEL=ollama/granite3.3:2b \\
+    EMBEDDING_MODEL=ollama/all-minilm:latest \\
+        pytest tests/test_inline_evaluation.py --no-mock-inference
+"""
+
+import os
 import random
 from types import SimpleNamespace
 
 import pytest
 import yaml
+from base_eval_tests import EvalTester, SmokeTester
 from llama_stack.core.library_client import LlamaStackAsLibraryClient
-from ragas.metrics import answer_relevancy
-from rich import print as rprint
-from rich.pretty import Pretty
 
 from llama_stack_provider_ragas.compat import Api
-from llama_stack_provider_ragas.constants import PROVIDER_ID_INLINE
+from llama_stack_provider_ragas.constants import PROVIDER_ID_INLINE, PROVIDER_ID_REMOTE
 
 pytestmark = pytest.mark.lls_integration
 
 
 @pytest.fixture
-def library_stack_config(tmp_path, embedding_dimension):
+def library_stack_config(
+    tmp_path, embedding_dimension, embedding_model, inference_model
+):
     """Stack configuration for library client testing."""
     storage_dir = tmp_path / "test_storage"
     storage_dir.mkdir()
@@ -41,10 +58,26 @@ def library_stack_config(tmp_path, embedding_dimension):
                     "provider_type": "inline::trustyai_ragas",
                     "module": "llama_stack_provider_ragas.inline",
                     "config": {
-                        "embedding_model": "ollama/all-minilm:latest",
+                        "embedding_model": embedding_model,
                         "kvstore": {"namespace": "ragas", "backend": "kv_default"},
                     },
-                }
+                },
+                {
+                    "provider_id": PROVIDER_ID_REMOTE,
+                    "provider_type": "remote::trustyai_ragas",
+                    "module": "llama_stack_provider_ragas.remote",
+                    "config": {
+                        "embedding_model": embedding_model,
+                        "kubeflow_config": {
+                            "pipelines_endpoint": "http://localhost:8888",
+                            "namespace": "default",
+                            "llama_stack_url": "http://localhost:8321",
+                            "base_image": "python:3.12-slim",
+                            "results_s3_prefix": "s3://ragas-results",
+                            "s3_credentials_secret_name": "aws-credentials",
+                        },
+                    },
+                },
             ],
             "datasetio": [
                 {
@@ -101,16 +134,16 @@ def library_stack_config(tmp_path, embedding_dimension):
             "models": [
                 {
                     "metadata": {"embedding_dimension": embedding_dimension},
-                    "model_id": "all-MiniLM-L6-v2",
+                    "model_id": embedding_model,
                     "provider_id": "ollama",
-                    "provider_model_id": "all-minilm:latest",
+                    "provider_model_id": embedding_model.removeprefix("ollama/"),
                     "model_type": "embedding",
                 },
                 {
                     "metadata": {},
-                    "model_id": "granite3.3:2b",
+                    "model_id": inference_model,
                     "provider_id": "ollama",
-                    "provider_model_id": "granite3.3:2b",
+                    "provider_model_id": inference_model.removeprefix("ollama/"),
                     "model_type": "llm",
                 },
             ],
@@ -124,6 +157,19 @@ def library_stack_config(tmp_path, embedding_dimension):
     }
 
 
+@pytest.fixture(autouse=True)
+def mocked_inference_response(request):
+    """Fake completion text returned by the mocked Ollama inference adapter.
+
+    The in-process library client's ``openai_completion`` and
+    ``openai_embeddings`` methods are monkey-patched in
+    ``mocked_library_client``; this fixture controls the text that the
+    mocked completion endpoint returns.  Use indirect parametrization to
+    override the default value per test.
+    """
+    return getattr(request, "param", "Hello, world!")
+
+
 @pytest.fixture
 def library_stack_config_file(library_stack_config, tmp_path):
     """Write the stack config dict to a temp YAML file and return its path."""
@@ -135,14 +181,16 @@ def library_stack_config_file(library_stack_config, tmp_path):
 
 @pytest.fixture
 def library_client(request):
-    """
-    Mock LLM inference (embeddings and completions) by default,
-    unless --mock-inference=False is passed in the command line.
+    """Return a library client, with or without mocked inference.
 
-    You can indirectly parametrize this fixture to customize the completion text:
+    By default, Ollama inference is mocked so no external services are
+    needed.  Pass ``--no-mock-inference`` to use a real Ollama instance.
+
+    The completion text used by the mock can be overridden via indirect
+    parametrization of ``mocked_inference_response``::
 
         @pytest.mark.parametrize(
-            "mocked_llm_response",
+            "mocked_inference_response",
             ["Hello from mock!"],
             indirect=True,
         )
@@ -161,18 +209,20 @@ def real_library_client(library_stack_config_file):
 @pytest.fixture()
 def mocked_library_client(
     monkeypatch,
-    mocked_llm_response,
+    mocked_inference_response,
     library_stack_config_file,
     embedding_dimension,
+    embedding_model,
+    inference_model,
 ):
-    completion_text = mocked_llm_response
+    completion_text = mocked_inference_response
 
     # Mock Ollama connectivity check & model listing
     async def _fake_check_model_availability(*args, **kwargs):
         return True
 
     async def _fake_list_provider_model_ids(*args, **kwargs):
-        return ["all-minilm:latest", "granite3.3:2b"]
+        return [embedding_model, inference_model]
 
     monkeypatch.setattr("ollama.Client", lambda *args, **kwargs: SimpleNamespace())
 
@@ -225,85 +275,53 @@ async def _fake_openai_completion(req):  # noqa: ANN001
     return real_library_client
 
 
-def test_library_client_health(library_client):
-    assert library_client is not None
-    assert hasattr(library_client, "alpha")
-    assert hasattr(library_client.alpha, "eval")
-
-    models = library_client.models.list()
-    assert len(models) > 0
-    print("Available models:")
-    rprint(Pretty(models, max_depth=6, expand_all=True))
-
-    providers = library_client.providers.list()
-    assert len(providers) > 0
-    assert any(p.api == "eval" for p in providers)
-    print("Available providers:")
-    rprint(Pretty(providers, max_depth=6, expand_all=True))
-
-
-@pytest.mark.parametrize(
-    "metric_to_test,mocked_llm_response",
-    [
-        # `answer_relevancy` expects the LLM to output a JSON payload with:
-        # - question: a question implied by the given answer
-        # - noncommittal: 0/1
-        pytest.param(
-            answer_relevancy,
-            json.dumps(
-                {"question": "What is the capital of France?", "noncommittal": 0}
-            ),
-            id="answer_relevancy",
-        ),
-    ],
-    indirect=["mocked_llm_response"],
-)
-def test_full_evaluation_with_library_client(
-    library_client,
-    model,
-    sampling_params,
-    unique_timestamp,
-    raw_evaluation_data,
-    metric_to_test,
-):
-    dataset_id = f"library_full_test_dataset_{unique_timestamp}"
-    library_client.beta.datasets.register(
-        dataset_id=dataset_id,
-        purpose="eval/question-answer",
-        source={"type": "rows", "rows": raw_evaluation_data[:1]},
-        metadata={"provider_id": "localfs"},
-    )
-    datasets = library_client.beta.datasets.list()
-    print(f"Available datasets: {[d.identifier for d in datasets]}")
-    assert any(d.identifier == dataset_id for d in datasets)
-
-    benchmark_id = f"library_full_test_benchmark_{unique_timestamp}"
-    library_client.alpha.benchmarks.register(
-        benchmark_id=benchmark_id,
-        dataset_id=dataset_id,
-        scoring_functions=[metric_to_test.name],
-        provider_id=PROVIDER_ID_INLINE,
-    )
-    benchmarks = library_client.alpha.benchmarks.list()
-    print(f"Available benchmarks: {[b.identifier for b in benchmarks]}")
-    assert any(b.identifier == benchmark_id for b in benchmarks)
-
-    job = library_client.alpha.eval.run_eval(
-        benchmark_id=benchmark_id,
-        benchmark_config={
-            "eval_candidate": {
-                "type": "model",
-                "model": model,
-                "sampling_params": sampling_params.model_dump(exclude_none=True),
-            },
-            "scoring_params": {},
-        },
+@pytest.fixture
+def client(library_client):
+    return library_client
+
+
+@pytest.fixture
+def inference_model():
+    return os.getenv("INFERENCE_MODEL", "ollama/granite3.3:2b")
+
+
+@pytest.fixture
+def embedding_model():
+    return os.getenv("EMBEDDING_MODEL", "ollama/all-minilm:latest")
+
+
+@pytest.fixture
+def smoke_tester(client, dataset_id, inline_benchmark_id, remote_benchmark_id):
+    return SmokeTester(
+        client,
+        dataset_id,
+        inline_benchmark_id,
+        remote_benchmark_id,
     )
 
-    assert job.job_id is not None
-    assert job.status == "in_progress"
 
-    job = library_client.alpha.eval.jobs.status(
-        benchmark_id=benchmark_id, job_id=job.job_id
+@pytest.fixture
+def eval_tester(
+    client, inference_model, dataset_id, inline_benchmark_id, remote_benchmark_id
+):
+    return EvalTester(
+        client,
+        inference_model,
+        dataset_id,
+        inline_benchmark_id,
+        remote_benchmark_id,
     )
-    assert job.status == "completed"
+
+
+@pytest.mark.usefixtures("register_benchmarks")
+def test_library_client_smoke(smoke_tester):
+    smoke_tester.test_models_registered()
+    smoke_tester.test_datasets_registered()
+    smoke_tester.test_benchmarks_registered()
+
+
+@pytest.mark.usefixtures("register_benchmarks")
+def test_inline_eval(eval_tester, inline_benchmark_id, inference_model):
+    eval_tester.poll_interval = 1
+    eval_tester.poll_timeout = 10
+    eval_tester.run_eval(inline_benchmark_id, inference_model, num_examples=3)
diff --git a/tests/test_kubeflow_integration.py b/tests/test_kubeflow_integration.py
deleted file mode 100644
index bbf9355e..00000000
--- a/tests/test_kubeflow_integration.py
+++ /dev/null
@@ -1,242 +0,0 @@
-"""Integration tests for Kubeflow pipeline components against live cluster."""
-
-import os
-from textwrap import dedent
-from typing import List  # noqa
-
-import kfp
-import pytest
-from kfp import dsl
-from ragas.metrics import answer_relevancy
-
-pytestmark = pytest.mark.kfp_integration
-
-
-# Skip the entire module if required environment variables are missing
-required_env_vars = ["KUBEFLOW_PIPELINES_ENDPOINT", "KUBEFLOW_BASE_IMAGE"]
-missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
-if missing_vars:
-    pytest.skip(
-        f"Kubeflow environment variables not set: {', '.join(missing_vars)}. "
-        "Set these environment variables to run Kubeflow integration tests.",
-        allow_module_level=True,
-    )
-
-
-@pytest.fixture
-def kf_client():
-    token = os.popen("oc whoami -t").read().strip()
-    return kfp.Client(
-        host=os.environ["KUBEFLOW_PIPELINES_ENDPOINT"], existing_token=token
-    )
-
-
-@dsl.component(base_image=os.environ["KUBEFLOW_BASE_IMAGE"])
-def retrieve_data_for_testing(output_dataset: dsl.Output[dsl.Dataset]):
-    import pandas as pd
-
-    dataset = pd.DataFrame(
-        [
-            {
-                "user_input": "What is the capital of France?",
-                "response": "The capital of France is Paris.",
-                "retrieved_contexts": [
-                    "Paris is the capital and most populous city of France."
-                ],
-                "reference": "Paris",
-            },
-            {
-                "user_input": "Who invented the telephone?",
-                "response": "Alexander Graham Bell invented the telephone in 1876.",
-                "retrieved_contexts": [
-                    "Alexander Graham Bell was a Scottish-American inventor who patented the first practical telephone."
-                ],
-                "reference": "Alexander Graham Bell",
-            },
-        ]
-    )
-    dataset.to_json(output_dataset.path, orient="records", lines=True)
-
-
-@dsl.component(base_image=os.environ["KUBEFLOW_BASE_IMAGE"])
-def run_fake_ragas_evaluation(
-    model: str,
-    sampling_params: dict,
-    embedding_model: str,
-    metrics: List[str],  # noqa
-    llama_stack_base_url: str,
-    input_dataset: dsl.Input[dsl.Dataset],
-    output_results: dsl.Output[dsl.Dataset],
-):
-    import logging
-    from unittest.mock import AsyncMock, patch
-
-    import pandas as pd
-    from ragas import EvaluationDataset, evaluate
-    from ragas.dataset_schema import EvaluationResult
-    from ragas.run_config import RunConfig
-
-    from llama_stack_provider_ragas.constants import METRIC_MAPPING
-    from llama_stack_provider_ragas.logging_utils import render_dataframe_as_table
-    from llama_stack_provider_ragas.remote.wrappers_remote import (
-        LlamaStackRemoteEmbeddings,
-        LlamaStackRemoteLLM,
-    )
-
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.INFO)
-
-    # Mock the agenerate_text method to return realistic structured responses
-    from langchain_core.language_models.llms import Generation, LLMResult
-
-    def mock_answer_relevancy_side_effect(
-        prompt, n=1, temperature=None, stop=None, callbacks=None, **kwargs
-    ):
-        # Return answer relevancy specific JSON format for all prompts
-        # Expected format: {"question": "...", "noncommittal": 0|1}
-        json_text = dedent("""```json
-{
-    "question": "When was the telephone invented?",
-    "noncommittal": 0
-}
-```""")
-
-        generations = [Generation(text=json_text) for _ in range(n)]
-        return LLMResult(generations=[generations])
-
-    mock_agenerate = AsyncMock()
-    mock_agenerate.side_effect = mock_answer_relevancy_side_effect
-
-    mock_embed_documents = AsyncMock()
-    mock_embed_documents.return_value = [[0.1, 0.2, 0.3, 0.4, 0.5] for _ in range(10)]
-
-    with (
-        patch.object(LlamaStackRemoteLLM, "agenerate_text", mock_agenerate),
-        patch.object(
-            LlamaStackRemoteEmbeddings, "embed_documents", mock_embed_documents
-        ),
-    ):
-        llm = LlamaStackRemoteLLM(
-            base_url=llama_stack_base_url,
-            model_id=model,
-            sampling_params=sampling_params,
-        )
-        embeddings = LlamaStackRemoteEmbeddings(
-            base_url=llama_stack_base_url,
-            embedding_model_id=embedding_model,
-        )
-
-        metrics = [METRIC_MAPPING[m] for m in metrics]
-
-        with open(input_dataset.path) as f:
-            df_input = pd.read_json(f, lines=True)
-            eval_dataset = EvaluationDataset.from_list(
-                df_input.to_dict(orient="records")
-            )
-
-        ragas_output: EvaluationResult = evaluate(
-            dataset=eval_dataset,
-            metrics=metrics,
-            llm=llm,
-            embeddings=embeddings,
-            run_config=RunConfig(max_workers=1),
-        )
-
-    df_output = ragas_output.to_pandas()
-    table_output = render_dataframe_as_table(df_output, "Ragas Evaluation Results")
-    logger.info(f"Ragas evaluation completed:\n{table_output}")
-    df_output.to_json(output_results.path, orient="records", lines=True)
-
-
-def test_pipeline_dummy_dataset_retrieval(kf_client, remote_eval_config):
-    @dsl.pipeline()
-    def pipline_dataset_retrieval():
-        retrieve_data_for_testing()
-
-    run_result = kf_client.create_run_from_pipeline_func(
-        pipeline_func=pipline_dataset_retrieval,
-        namespace=remote_eval_config.kubeflow_config.namespace,
-        run_name="test-pipeline-dummy-dataset-retrieval",
-        experiment_name="ragas-provider-kf-tests",
-    )
-
-    assert run_result.run_id is not None
-
-
-@pytest.mark.parametrize(
-    "metric_to_test",
-    [
-        pytest.param(m, id=m.name) for m in [answer_relevancy]
-    ],  # , context_precision, faithfulness, context_recall]
-)
-def test_pipeline_dummy_ragas_evaluation(
-    kf_client, remote_eval_config, model, sampling_params, metric_to_test
-):
-    @dsl.pipeline()
-    def pipeline_ragas_evaluation():
-        test_dataset = retrieve_data_for_testing()
-        run_fake_ragas_evaluation(
-            input_dataset=test_dataset.output,
-            model=model,
-            sampling_params=sampling_params.model_dump(exclude_none=True),
-            embedding_model=remote_eval_config.embedding_model,
-            metrics=[metric_to_test.name],
-            llama_stack_base_url=remote_eval_config.kubeflow_config.llama_stack_url,
-        )
-
-    run_result = kf_client.create_run_from_pipeline_func(
-        pipeline_func=pipeline_ragas_evaluation,
-        namespace=remote_eval_config.kubeflow_config.namespace,
-        run_name="test-pipeline-dummy-ragas-evaluation",
-        experiment_name="ragas-provider-kf-tests",
-    )
-
-    assert run_result.run_id is not None
-
-
-@pytest.mark.parametrize(
-    "metric_to_test",
-    [
-        pytest.param(m, id=m.name) for m in [answer_relevancy]
-    ],  # , context_precision, faithfulness, context_recall]
-)
-def test_full_pipeline(
-    lls_client,
-    kf_client,
-    raw_evaluation_data,
-    remote_eval_config,
-    metric_to_test,
-    model,
-    sampling_params,
-    unique_timestamp,
-):
-    from llama_stack_provider_ragas.remote.kubeflow.pipeline import (
-        ragas_evaluation_pipeline,
-    )
-
-    dataset_id = f"test_ragas_dataset_remote_{unique_timestamp}"
-    lls_client.beta.datasets.register(
-        dataset_id=dataset_id,
-        purpose="eval/question-answer",
-        source={"type": "rows", "rows": raw_evaluation_data},
-        metadata={"provider_id": "localfs"},
-    )
-
-    run_result = kf_client.create_run_from_pipeline_func(
-        pipeline_func=ragas_evaluation_pipeline,
-        namespace=remote_eval_config.kubeflow_config.namespace,
-        arguments={
-            "model": model,
-            "dataset_id": dataset_id,
-            "sampling_params": sampling_params.model_dump(exclude_none=True),
-            "embedding_model": remote_eval_config.embedding_model,
-            "metrics": [metric_to_test.name],
-            "llama_stack_base_url": remote_eval_config.kubeflow_config.llama_stack_url,
-            "s3_credentials_secret_name": remote_eval_config.kubeflow_config.s3_credentials_secret_name,
-            "result_s3_location": remote_eval_config.kubeflow_config.results_s3_prefix,
-        },
-        run_name="test-full-pipeline",
-        experiment_name="ragas-provider-kf-tests",
-    )
-
-    assert run_result.run_id is not None
diff --git a/tests/test_remote_wrappers.py b/tests/test_remote_wrappers.py
index 28af4dbe..7cf9a247 100644
--- a/tests/test_remote_wrappers.py
+++ b/tests/test_remote_wrappers.py
@@ -1,16 +1,51 @@
-"""Test the remote wrappers for the Llama Stack client."""
+"""Tests for the LangChain-compatible remote wrappers (LLM and Embeddings).
+
+These tests exercise ``LlamaStackRemoteLLM`` and
+``LlamaStackRemoteEmbeddings``, which wrap the OpenAI-compatible
+completions and embeddings endpoints exposed by a Llama Stack server.
+
+By default, the client is **mocked**: the ``LlamaStackClient`` and
+``AsyncLlamaStackClient`` constructors are monkey-patched so that
+completions return fixed fake text and embeddings return random vectors.
+No running server is required in this mode::
+
+    pytest tests/test_remote_wrappers.py
+
+To run against a real Llama Stack server, pass ``--no-mock-client``.
+The server URL is read from ``LLAMA_STACK_BASE_URL`` (default
+``http://localhost:8321``).  Model IDs can be overridden with
+``INFERENCE_MODEL`` and ``EMBEDDING_MODEL``::
+
+    pytest tests/test_remote_wrappers.py --no-mock-client
+
+    INFERENCE_MODEL=ollama/granite3.3:2b \\
+    EMBEDDING_MODEL=ollama/all-minilm:latest \\
+        pytest tests/test_remote_wrappers.py --no-mock-client
+"""
 
 import json
 import logging
 import os
+import random
 
 import pytest
 from langchain_core.prompt_values import StringPromptValue
+from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient
+from llama_stack_client.types.completion_create_response import (
+    Choice,
+    CompletionCreateResponse,
+)
+from llama_stack_client.types.create_embeddings_response import (
+    CreateEmbeddingsResponse,
+    Data,
+    Usage,
+)
 from ragas import evaluate
 from ragas.evaluation import EvaluationResult
 from ragas.metrics import answer_relevancy
 from ragas.run_config import RunConfig
 
+from llama_stack_provider_ragas.compat import SamplingParams, TopPSamplingStrategy
 from llama_stack_provider_ragas.logging_utils import render_dataframe_as_table
 from llama_stack_provider_ragas.remote.wrappers_remote import (
     LlamaStackRemoteEmbeddings,
@@ -18,23 +53,160 @@
 )
 
 logger = logging.getLogger(__name__)
-pytestmark = pytest.mark.lls_integration
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture
+def inference_model():
+    return os.getenv("INFERENCE_MODEL", "litellm/Mistral-Small-24B-W8A8")
+
+
+@pytest.fixture
+def embedding_model():
+    return os.getenv("EMBEDDING_MODEL", "embedding/nomic-ai/nomic-embed-text-v1.5")
+
+
+@pytest.fixture
+def sampling_params():
+    return SamplingParams(
+        strategy=TopPSamplingStrategy(temperature=0.1, top_p=0.95),
+        max_tokens=100,
+        stop=None,
+    )
+
+
+@pytest.fixture
+def lls_client(request):
+    if request.config.getoption("--no-mock-client") is True:
+        return request.getfixturevalue("real_lls_client")
+    else:
+        return request.getfixturevalue("mocked_lls_client")
+
+
+@pytest.fixture
+def real_lls_client(llama_stack_base_url):
+    return LlamaStackClient(base_url=llama_stack_base_url)
+
+
+@pytest.fixture(autouse=True)
+def mocked_client_response(request):
+    """Fake completion text returned by the mocked ``LlamaStackClient``.
+
+    The client's ``completions.create`` and ``embeddings.create`` methods
+    are monkey-patched in ``mocked_lls_clients``; this fixture controls the
+    text that the mocked completions endpoint returns.  Use indirect
+    parametrization to override the default value per test.
+    """
+    return getattr(request, "param", "Hello, world!")
+
+
+@pytest.fixture()
+def mocked_lls_client(mocked_lls_clients):
+    sync_client, _ = mocked_lls_clients
+    return sync_client
+
+
+@pytest.fixture()
+def mocked_lls_clients(monkeypatch, request, embedding_dimension, llama_stack_base_url):
+    """Build mocked sync and async ``LlamaStackClient`` instances.
+
+    Completions and embeddings ``.create()`` methods are replaced with
+    fakes (fixed text, random embedding vectors).  The completion text comes
+    from the ``mocked_client_response`` fixture, which can be overridden
+    via indirect parametrization::
+
+        @pytest.mark.parametrize(
+            "mocked_client_response",
+            ["Hello from mock!"],
+            indirect=True,
+        )
+    """
+    # Create real clients, but patch only the `.create()` methods we need.
+    sync_client = LlamaStackClient(base_url=llama_stack_base_url)
+    async_client = AsyncLlamaStackClient(base_url=llama_stack_base_url)
+
+    completion_text = request.getfixturevalue("mocked_client_response")
+
+    def _make_embeddings_response(n: int) -> CreateEmbeddingsResponse:
+        # return one embedding vector per input string
+        return CreateEmbeddingsResponse(
+            data=[
+                Data(
+                    embedding=[random.random() for _ in range(embedding_dimension)],
+                    index=i,
+                    object="embedding",
+                )
+                for i in range(n)
+            ],
+            model="mocked/model",
+            object="list",
+            usage=Usage(prompt_tokens=10, total_tokens=10),
+        )
+
+    def _make_completions_response(text: str) -> CompletionCreateResponse:
+        return CompletionCreateResponse(
+            id="cmpl-123",
+            created=1717000000,
+            choices=[Choice(index=0, text=text, finish_reason="stop")],
+            model="mocked/model",
+            object="text_completion",
+        )
+
+    def _embeddings_create(*args, **kwargs):
+        embedding_input = kwargs.get("input")
+        if isinstance(embedding_input, list):
+            return _make_embeddings_response(len(embedding_input))
+        return _make_embeddings_response(1)
+
+    async def _async_embeddings_create(*args, **kwargs):
+        embedding_input = kwargs.get("input")
+        if isinstance(embedding_input, list):
+            return _make_embeddings_response(len(embedding_input))
+        return _make_embeddings_response(1)
+
+    def _completions_create(*args, **kwargs):
+        return _make_completions_response(completion_text)
+
+    async def _async_completions_create(*args, **kwargs):
+        return _make_completions_response(completion_text)
+
+    # Patch nested methods (avoids dotted-attribute monkeypatch issues on classes).
+    monkeypatch.setattr(sync_client.embeddings, "create", _embeddings_create)
+    monkeypatch.setattr(sync_client.completions, "create", _completions_create)
+    monkeypatch.setattr(async_client.embeddings, "create", _async_embeddings_create)
+    monkeypatch.setattr(async_client.completions, "create", _async_completions_create)
+
+    return sync_client, async_client
+
+
+@pytest.fixture(autouse=True)
+def patch_remote_wrappers(monkeypatch, mocked_lls_clients, request):
+    sync_client, async_client = mocked_lls_clients
+    if request.config.getoption("--no-mock-client") is not True:
+        from llama_stack_provider_ragas.remote import wrappers_remote
+
+        monkeypatch.setattr(
+            wrappers_remote, "LlamaStackClient", lambda *a, **k: sync_client
+        )
+        monkeypatch.setattr(
+            wrappers_remote, "AsyncLlamaStackClient", lambda *a, **k: async_client
+        )
 
 
 @pytest.fixture
-def lls_remote_embeddings(embedding_model):
+def lls_remote_embeddings(embedding_model, llama_stack_base_url):
     return LlamaStackRemoteEmbeddings(
-        base_url=os.environ.get("KUBEFLOW_LLAMA_STACK_URL"),
+        base_url=llama_stack_base_url,
         embedding_model_id=embedding_model,
     )
 
 
 @pytest.fixture
-def lls_remote_llm(model, sampling_params):
+def lls_remote_llm(inference_model, sampling_params, llama_stack_base_url):
     """Remote LLM wrapper for evaluation."""
     return LlamaStackRemoteLLM(
-        base_url=os.environ.get("KUBEFLOW_LLAMA_STACK_URL"),
-        model_id=model,
+        base_url=llama_stack_base_url,
+        model_id=inference_model,
         sampling_params=sampling_params,
     )
 
@@ -97,7 +269,7 @@ async def test_remote_llm_async(lls_remote_llm):
 
 
 @pytest.mark.parametrize(
-    "metric_to_test,mocked_llm_response",
+    "metric_to_test,mocked_client_response",
     [
         # `answer_relevancy` expects the LLM to output a JSON payload with:
         # - question: a question implied by the given answer
@@ -110,7 +282,7 @@ async def test_remote_llm_async(lls_remote_llm):
             id="answer_relevancy",
         ),
     ],
-    indirect=["mocked_llm_response"],
+    indirect=["mocked_client_response"],
 )
 def test_direct_evaluation(
     evaluation_dataset,

From 2e49f092f01c914aed12cbaf56fed706877d9b3b Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Thu, 12 Mar 2026 11:46:06 -0400
Subject: [PATCH 4/7] Remove outdated tests/e2e/ directory

These files were superseded by tests/cluster-deployment/ in PR #59.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/e2e/Containerfile                       |  35 ---
 tests/e2e/deploy-e2e.sh                       | 243 ------------------
 .../e2e/manifests/configmap-and-secrets.yaml  |  37 ---
 .../kubeflow-pipeline-resources.yaml          |  10 -
 .../manifests/llama-stack-distribution.yaml   | 239 -----------------
 tests/e2e/manifests/minio.yaml                |  85 ------
 tests/e2e/teardown-e2e.sh                     |  13 -
 tests/e2e/test_e2e.py                         | 181 -------------
 8 files changed, 843 deletions(-)
 delete mode 100644 tests/e2e/Containerfile
 delete mode 100755 tests/e2e/deploy-e2e.sh
 delete mode 100644 tests/e2e/manifests/configmap-and-secrets.yaml
 delete mode 100644 tests/e2e/manifests/kubeflow-pipeline-resources.yaml
 delete mode 100644 tests/e2e/manifests/llama-stack-distribution.yaml
 delete mode 100644 tests/e2e/manifests/minio.yaml
 delete mode 100755 tests/e2e/teardown-e2e.sh
 delete mode 100644 tests/e2e/test_e2e.py

diff --git a/tests/e2e/Containerfile b/tests/e2e/Containerfile
deleted file mode 100644
index 69759f86..00000000
--- a/tests/e2e/Containerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-# This Containerfile is used to build the llama-stack-provider-ragas-distro-image for the e2e tests.
-
-FROM python:3.12-slim
-
-WORKDIR /app
-
-# Install uv by copying the static binaries from the official image
-COPY --from=ghcr.io/astral-sh/uv:0.9.21 /uv /uvx /bin/
-
-# Create a venv and make it the default Python for subsequent steps.
-RUN uv venv /app/.venv
-ENV VIRTUAL_ENV=/app/.venv
-ENV PATH="/app/.venv/bin:${PATH}"
-
-# Install sentence-transformers + torch (cached layer — these rarely change).
-RUN uv pip install --python /app/.venv/bin/python \
-    --extra-index-url https://download.pytorch.org/whl/cpu \
-    torch sentence-transformers einops tokenizers safetensors
-
-# Pre-download the embedding model so no HF fetch is needed at runtime.
-RUN python -c "from huggingface_hub import snapshot_download; snapshot_download('nomic-ai/nomic-embed-text-v1.5')"
-
-# Copy code (changes frequently — kept after heavy layers for caching).
-COPY src /app/src
-COPY distribution /app/distribution
-COPY pyproject.toml /app/pyproject.toml
-COPY uv.lock /app/uv.lock
-COPY README.md /app/README.md
-
-# Install the project into the venv.
-RUN uv pip install --python /app/.venv/bin/python -e ".[remote,distro]"
-
-EXPOSE 8321
-
-ENTRYPOINT ["uv", "run", "--no-sync", "llama", "stack", "run", "distribution/run.yaml"]
diff --git a/tests/e2e/deploy-e2e.sh b/tests/e2e/deploy-e2e.sh
deleted file mode 100755
index bb38dbfd..00000000
--- a/tests/e2e/deploy-e2e.sh
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env bash
-#
-# Deploy the llama-stack-provider-ragas e2e test environment on an OpenShift cluster.
-#
-# Usage:
-#   ./deploy-e2e.sh --build
-#   ./deploy-e2e.sh --image <image-ref>
-#
-# Reads credentials from ../../.env (repo root) and creates a single
-# 'ragas-env' k8s secret from it.
-#
-# Prerequisites:
-#   - oc CLI installed and logged into an OpenShift cluster
-#   - podman (only required for --build mode)
-#
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="${SCRIPT_DIR}/../.."
-IMAGE_NAME="llama-stack-provider-ragas-distro-image"
-NAMESPACE="ragas-test"
-
-# ---------------------------------------------------------------------------
-# Parse arguments
-# ---------------------------------------------------------------------------
-MODE=""
-IMAGE_REF=""
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --build)
-            MODE="build"
-            shift
-            ;;
-        --image)
-            MODE="image"
-            IMAGE_REF="$2"
-            if [[ -z "${IMAGE_REF}" ]]; then
-                echo "Error: --image requires an image reference argument."
-                exit 1
-            fi
-            shift 2
-            ;;
-        *)
-            echo "Unknown option: $1"
-            echo "Usage: $0 --build | --image <image-ref>"
-            exit 1
-            ;;
-    esac
-done
-
-if [[ -z "${MODE}" ]]; then
-    echo "Usage: $0 --build | --image <image-ref>"
-    exit 1
-fi
-
-# ---------------------------------------------------------------------------
-# Prerequisites
-# ---------------------------------------------------------------------------
-echo "Checking prerequisites..."
-
-if ! command -v oc &> /dev/null; then
-    echo "Error: oc is not installed."
-    exit 1
-fi
-
-if ! oc whoami &> /dev/null; then
-    echo "Error: Not logged into an OpenShift cluster. Run 'oc login' first."
-    exit 1
-fi
-
-echo "  Logged in as: $(oc whoami)"
-echo "  Cluster: $(oc whoami --show-server)"
-
-# ---------------------------------------------------------------------------
-# Resolve image
-# ---------------------------------------------------------------------------
-if [[ "${MODE}" == "build" ]]; then
-    if ! command -v podman &> /dev/null; then
-        echo "Error: podman is not installed (required for --build)."
-        exit 1
-    fi
-
-    echo ""
-    echo "=== Building image from Containerfile ==="
-
-    # Detect cluster node architecture (not local host arch)
-    NODE_ARCH=$(oc get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}' 2>/dev/null || echo "amd64")
-    case "${NODE_ARCH}" in
-        amd64)  PLATFORM="linux/amd64" ;;
-        arm64)  PLATFORM="linux/arm64" ;;
-        *)      echo "Warning: unknown cluster architecture ${NODE_ARCH}, defaulting to linux/amd64"; PLATFORM="linux/amd64" ;;
-    esac
-    echo "  Cluster node architecture: ${NODE_ARCH} -> ${PLATFORM}"
-
-    # Build the image
-    LOCAL_TAG="${IMAGE_NAME}:latest"
-    echo "  Building ${LOCAL_TAG}..."
-    podman build --no-cache --platform "${PLATFORM}" \
-        -t "${LOCAL_TAG}" \
-        -f "${SCRIPT_DIR}/Containerfile" "${REPO_ROOT}"
-
-    # Expose the OpenShift internal registry route (idempotent)
-    echo "  Exposing OpenShift internal registry..."
-    oc patch configs.imageregistry.operator.openshift.io/cluster \
-        --type=merge --patch '{"spec":{"defaultRoute":true}}' 2>/dev/null || true
-
-    # Wait briefly for the route to appear
-    for i in $(seq 1 12); do
-        REGISTRY_ROUTE=$(oc get route default-route -n openshift-image-registry \
-            --template='{{ .spec.host }}' 2>/dev/null) && break
-        sleep 5
-    done
-
-    if [[ -z "${REGISTRY_ROUTE}" ]]; then
-        echo "Error: Could not determine the OpenShift internal registry route."
-        exit 1
-    fi
-    echo "  Registry route: ${REGISTRY_ROUTE}"
-
-    # Login to the registry
-    echo "  Logging into registry..."
-    podman login --tls-verify=false -u "$(oc whoami)" -p "$(oc whoami -t)" "${REGISTRY_ROUTE}"
-
-    # Ensure the namespace exists before pushing (registry needs the namespace/project)
-    oc create namespace "${NAMESPACE}" 2>/dev/null || true
-
-    # Tag and push
-    REMOTE_TAG="${REGISTRY_ROUTE}/${NAMESPACE}/${IMAGE_NAME}:latest"
-    echo "  Tagging ${LOCAL_TAG} -> ${REMOTE_TAG}"
-    podman tag "${LOCAL_TAG}" "${REMOTE_TAG}"
-
-    echo "  Pushing to internal registry..."
-    podman push --tls-verify=false "${REMOTE_TAG}"
-
-    # The in-cluster image reference uses the internal service address
-    IMAGE_REF="image-registry.openshift-image-registry.svc:5000/${NAMESPACE}/${IMAGE_NAME}:latest"
-    echo "  In-cluster image ref: ${IMAGE_REF}"
-
-elif [[ "${MODE}" == "image" ]]; then
-    echo ""
-    echo "=== Using pre-built image ==="
-    echo "  Image: ${IMAGE_REF}"
-fi
-
-# ---------------------------------------------------------------------------
-# Install LlamaStack operator
-# ---------------------------------------------------------------------------
-echo ""
-echo "=== Installing LlamaStack operator ==="
-oc apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/release/operator.yaml
-
-echo "Waiting for LlamaStack operator to be ready..."
-oc wait --for=condition=available deployment/llama-stack-k8s-operator-controller-manager \
-    -n llama-stack-k8s-operator-system --timeout=120s
-
-# ---------------------------------------------------------------------------
-# Create namespace and apply manifests
-# ---------------------------------------------------------------------------
-echo ""
-echo "=== Setting up ${NAMESPACE} namespace ==="
-oc create namespace "${NAMESPACE}" 2>/dev/null || true
-
-echo "Applying configmaps and secrets..."
-oc apply -f "${SCRIPT_DIR}/manifests/configmap-and-secrets.yaml"
-
-echo "Creating ragas-env secret from .env..."
-ENV_FILE="${REPO_ROOT}/.env"
-if [[ ! -f "${ENV_FILE}" ]]; then
-    echo "Error: ${ENV_FILE} not found."
-    exit 1
-fi
-oc create secret generic ragas-env -n "${NAMESPACE}" \
-    --from-env-file="${ENV_FILE}" \
-    --dry-run=client -o yaml | oc apply -f -
-
-echo "Applying MinIO..."
-oc apply -f "${SCRIPT_DIR}/manifests/minio.yaml"
-
-echo "Applying LlamaStackDistribution CR (image: ${IMAGE_REF})..."
-sed "s|__LLAMA_STACK_IMAGE__|${IMAGE_REF}|g" \
-    "${SCRIPT_DIR}/manifests/llama-stack-distribution.yaml" | oc apply -f -
-
-# ---------------------------------------------------------------------------
-# Wait for MinIO
-# ---------------------------------------------------------------------------
-echo ""
-echo "=== Waiting for MinIO ==="
-echo "Waiting for MinIO deployment..."
-oc wait --for=condition=available deployment/minio -n "${NAMESPACE}" --timeout=120s
-
-echo "Waiting for MinIO bucket creation job..."
-oc wait --for=condition=complete job/minio-create-bucket -n "${NAMESPACE}" --timeout=120s
-
-# ---------------------------------------------------------------------------
-# Kubeflow pipeline resources (aws-credentials in ragas-test namespace)
-# ---------------------------------------------------------------------------
-echo ""
-echo "=== Applying Kubeflow pipeline resources ==="
-oc apply -f "${SCRIPT_DIR}/manifests/kubeflow-pipeline-resources.yaml"
-
-# ---------------------------------------------------------------------------
-# Wait for operator reconciliation and deployments
-# ---------------------------------------------------------------------------
-echo ""
-echo "=== Waiting for deployments ==="
-
-echo "Waiting for operator to reconcile LlamaStackDistribution..."
-for i in $(seq 1 30); do
-    if oc get deployment/lsd-ragas-test -n "${NAMESPACE}" &>/dev/null; then
-        echo "  Deployment created."
-        break
-    fi
-    if [ "$i" -eq 30 ]; then
-        echo "Error: Timed out waiting for deployment/lsd-ragas-test to be created by the operator."
-        exit 1
-    fi
-    sleep 5
-done
-
-echo "Waiting for llama-stack deployment..."
-oc wait --for=condition=available deployment/lsd-ragas-test -n "${NAMESPACE}" --timeout=300s
-
-# ---------------------------------------------------------------------------
-# Summary
-# ---------------------------------------------------------------------------
-echo ""
-echo "========================================="
-echo " E2E deployment complete!"
-echo "========================================="
-echo ""
-echo "  Namespace: ${NAMESPACE}"
-echo "  Image:     ${IMAGE_REF}"
-echo "  Env file:  ${ENV_FILE}"
-echo ""
-echo "Next steps:"
-echo "  1. Verify pods:    oc get pods -n ${NAMESPACE}"
-echo "  2. Port forward:   oc port-forward -n ${NAMESPACE} svc/lsd-ragas-test-service 8321:8321 &"
-echo "  3. Test API:       curl http://localhost:8321/v1/models"
-echo ""
-echo "To tear down:"
-echo "  ./teardown-e2e.sh"
diff --git a/tests/e2e/manifests/configmap-and-secrets.yaml b/tests/e2e/manifests/configmap-and-secrets.yaml
deleted file mode 100644
index b625f3f8..00000000
--- a/tests/e2e/manifests/configmap-and-secrets.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-# Default configuration for the e2e test environment.
-#
-# All values here can be overridden by the ragas-env secret (created from .env).
-# The ragas-env secret is loaded AFTER this ConfigMap, so .env values take precedence.
-#
-# The following keys MUST be provided in .env (they are left blank here):
-#   LITELLM_API_URL, LITELLM_API_KEY
-#
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: kubeflow-ragas-config
-  namespace: ragas-test
-data:
-  # Inference
-  INFERENCE_MODEL: "Mistral-Small-24B-W8A8"
-  LITELLM_API_URL: ""
-  LITELLM_API_KEY: ""
-
-  # Embedding (inline sentence-transformers, model downloaded at startup)
-  EMBEDDING_MODEL: "nomic-ai/nomic-embed-text-v1.5"
-
-  # Kubeflow pipelines
-  KUBEFLOW_LLAMA_STACK_URL: "http://lsd-ragas-test-service.ragas-test.svc.cluster.local:8321"
-  KUBEFLOW_PIPELINES_ENDPOINT: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
-  KUBEFLOW_PIPELINES_TOKEN: ""
-  KUBEFLOW_NAMESPACE: "ragas-test"
-  KUBEFLOW_BASE_IMAGE: "python:3.12-slim"
-
-  # S3 / MinIO results storage
-  KUBEFLOW_RESULTS_S3_PREFIX: "s3://ragas-results/evaluations"
-  KUBEFLOW_S3_CREDENTIALS_SECRET_NAME: "aws-credentials"
-  RESULTS_S3_ENDPOINT: "http://minio-service.ragas-test.svc.cluster.local:9000"
-  RESULTS_S3_PATH_STYLE: "true"
-  AWS_ACCESS_KEY_ID: "minioadmin"
-  AWS_SECRET_ACCESS_KEY: "minioadmin"
-  AWS_DEFAULT_REGION: "us-east-1"
diff --git a/tests/e2e/manifests/kubeflow-pipeline-resources.yaml b/tests/e2e/manifests/kubeflow-pipeline-resources.yaml
deleted file mode 100644
index d58bc96a..00000000
--- a/tests/e2e/manifests/kubeflow-pipeline-resources.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: aws-credentials
-  namespace: ragas-test
-type: Opaque
-stringData:
-  AWS_ACCESS_KEY_ID: "minioadmin"
-  AWS_SECRET_ACCESS_KEY: "minioadmin"
-  AWS_DEFAULT_REGION: "us-east-1"
diff --git a/tests/e2e/manifests/llama-stack-distribution.yaml b/tests/e2e/manifests/llama-stack-distribution.yaml
deleted file mode 100644
index d4f41d32..00000000
--- a/tests/e2e/manifests/llama-stack-distribution.yaml
+++ /dev/null
@@ -1,239 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: llama-stack-e2e-config
-  namespace: ragas-test
-data:
-  config.yaml: |
-    version: 2
-    image_name: trustyai_ragas_distro
-    apis:
-    - eval
-    - inference
-    - files
-    - datasetio
-    providers:
-      eval:
-        - provider_id: ${env.KUBEFLOW_LLAMA_STACK_URL:+trustyai_ragas_remote}
-          provider_type: remote::trustyai_ragas
-          module: llama_stack_provider_ragas.remote
-          config:
-            embedding_model: embedding/${env.EMBEDDING_MODEL}
-            kubeflow_config:
-              results_s3_prefix: ${env.KUBEFLOW_RESULTS_S3_PREFIX}
-              s3_credentials_secret_name: ${env.KUBEFLOW_S3_CREDENTIALS_SECRET_NAME}
-              pipelines_endpoint: ${env.KUBEFLOW_PIPELINES_ENDPOINT}
-              namespace: ${env.KUBEFLOW_NAMESPACE}
-              llama_stack_url: ${env.KUBEFLOW_LLAMA_STACK_URL}
-              base_image: ${env.KUBEFLOW_BASE_IMAGE}
-              pipelines_api_token: ${env.KUBEFLOW_PIPELINES_TOKEN:=}
-              results_s3_endpoint: ${env.RESULTS_S3_ENDPOINT}
-              results_s3_path_style: ${env.RESULTS_S3_PATH_STYLE}
-            kvstore:
-              namespace: ragas
-              backend: kv_default
-        - provider_id: ${env.EMBEDDING_MODEL:+trustyai_ragas_inline}
-          provider_type: inline::trustyai_ragas
-          module: llama_stack_provider_ragas.inline
-          config:
-            embedding_model: embedding/${env.EMBEDDING_MODEL}
-            kvstore:
-              namespace: ragas
-              backend: kv_default
-      datasetio:
-      - provider_id: localfs
-        provider_type: inline::localfs
-        config:
-          kvstore:
-            namespace: datasetio::localfs
-            backend: kv_default
-      - provider_id: huggingface
-        provider_type: remote::huggingface
-        config:
-          kvstore:
-            namespace: datasetio::huggingface
-            backend: kv_default
-      inference:
-        - provider_id: litellm
-          provider_type: "remote::openai"
-          config:
-            base_url: "${env.LITELLM_API_URL}"
-            api_key: "${env.LITELLM_API_KEY}"
-        - provider_id: embedding
-          provider_type: "inline::sentence-transformers"
-          config: {}
-      files:
-      - provider_id: meta-reference-files
-        provider_type: inline::localfs
-        config:
-          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/trustyai_ragas_distro/files}
-          metadata_store:
-            table_name: files_metadata
-            backend: sql_default
-    storage:
-      backends:
-        kv_default:
-          type: kv_sqlite
-          db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/trustyai_ragas_distro}/kvstore.db
-        sql_default:
-          type: sql_sqlite
-          db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/trustyai_ragas_distro}/sql_store.db
-      stores:
-        metadata:
-          namespace: registry
-          backend: kv_default
-        inference:
-          table_name: inference_store
-          backend: sql_default
-          max_write_queue_size: 10000
-          num_writers: 4
-        conversations:
-          table_name: openai_conversations
-          backend: sql_default
-        connectors:
-          namespace: connectors
-          backend: kv_default
-        prompts:
-          namespace: prompts
-          backend: kv_default
-    registered_resources:
-      models:
-      - metadata: {}
-        model_id: "${env.INFERENCE_MODEL:=Mistral-Small-24B-W8A8}"
-        provider_id: litellm
-        model_type: llm
-      - metadata:
-          embedding_dimension: 768
-        model_id: "${env.EMBEDDING_MODEL}"
-        provider_id: embedding
-        provider_model_id: "${env.EMBEDDING_MODEL}"
-        model_type: embedding
-      shields: []
-      vector_dbs: []
-      datasets: []
-      scoring_fns: []
-      benchmarks:
-      - benchmark_id: hf-doc-qa-ragas-inline-benchmark
-        dataset_id: hf_doc_qa_ragas_eval
-        scoring_functions:
-          - semantic_similarity
-        provider_id: trustyai_ragas_inline
-        metadata: {}
-      - benchmark_id: hf-doc-qa-ragas-remote-benchmark
-        dataset_id: hf_doc_qa_ragas_eval
-        scoring_functions:
-          - semantic_similarity
-        provider_id: trustyai_ragas_remote
-        metadata: {}
-      tool_groups: []
-    server:
-      port: 8321
----
-apiVersion: llamastack.io/v1alpha1
-kind: LlamaStackDistribution
-metadata:
-  name: lsd-ragas-test
-  namespace: ragas-test
-spec:
-  replicas: 1
-  server:
-    containerSpec:
-      resources:
-        requests:
-          cpu: 1
-          memory: "2Gi"
-        limits:
-          cpu: 2
-          memory: "4Gi"
-      env:
-        # Inference (defaults in configmap, credentials in ragas-env from .env)
-        - name: INFERENCE_MODEL
-          valueFrom:
-            configMapKeyRef:
-              key: INFERENCE_MODEL
-              name: kubeflow-ragas-config
-        - name: LITELLM_API_URL
-          valueFrom:
-            secretKeyRef:
-              key: LITELLM_API_URL
-              name: ragas-env
-        - name: LITELLM_API_KEY
-          valueFrom:
-            secretKeyRef:
-              key: LITELLM_API_KEY
-              name: ragas-env
-        # Embedding
-        - name: EMBEDDING_MODEL
-          valueFrom:
-            configMapKeyRef:
-              key: EMBEDDING_MODEL
-              name: kubeflow-ragas-config
-        # Kubeflow pipelines
-        - name: KUBEFLOW_PIPELINES_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              key: KUBEFLOW_PIPELINES_ENDPOINT
-              name: kubeflow-ragas-config
-        - name: KUBEFLOW_NAMESPACE
-          valueFrom:
-            configMapKeyRef:
-              key: KUBEFLOW_NAMESPACE
-              name: kubeflow-ragas-config
-        - name: KUBEFLOW_BASE_IMAGE
-          valueFrom:
-            configMapKeyRef:
-              key: KUBEFLOW_BASE_IMAGE
-              name: kubeflow-ragas-config
-        - name: KUBEFLOW_LLAMA_STACK_URL
-          valueFrom:
-            configMapKeyRef:
-              key: KUBEFLOW_LLAMA_STACK_URL
-              name: kubeflow-ragas-config
-        - name: KUBEFLOW_RESULTS_S3_PREFIX
-          valueFrom:
-            configMapKeyRef:
-              key: KUBEFLOW_RESULTS_S3_PREFIX
-              name: kubeflow-ragas-config
-        - name: KUBEFLOW_S3_CREDENTIALS_SECRET_NAME
-          valueFrom:
-            configMapKeyRef:
-              key: KUBEFLOW_S3_CREDENTIALS_SECRET_NAME
-              name: kubeflow-ragas-config
-        - name: KUBEFLOW_PIPELINES_TOKEN
-          valueFrom:
-            secretKeyRef:
-              key: KUBEFLOW_PIPELINES_TOKEN
-              name: ragas-env
-              optional: true
-        # S3 / MinIO
-        - name: RESULTS_S3_ENDPOINT
-          valueFrom:
-            configMapKeyRef:
-              key: RESULTS_S3_ENDPOINT
-              name: kubeflow-ragas-config
-        - name: RESULTS_S3_PATH_STYLE
-          valueFrom:
-            configMapKeyRef:
-              key: RESULTS_S3_PATH_STYLE
-              name: kubeflow-ragas-config
-        - name: AWS_ACCESS_KEY_ID
-          valueFrom:
-            configMapKeyRef:
-              key: AWS_ACCESS_KEY_ID
-              name: kubeflow-ragas-config
-        - name: AWS_SECRET_ACCESS_KEY
-          valueFrom:
-            configMapKeyRef:
-              key: AWS_SECRET_ACCESS_KEY
-              name: kubeflow-ragas-config
-        - name: AWS_DEFAULT_REGION
-          valueFrom:
-            configMapKeyRef:
-              key: AWS_DEFAULT_REGION
-              name: kubeflow-ragas-config
-      name: llama-stack
-      port: 8321
-    distribution:
-      image: __LLAMA_STACK_IMAGE__
-    userConfig:
-      configMapName: llama-stack-e2e-config
diff --git a/tests/e2e/manifests/minio.yaml b/tests/e2e/manifests/minio.yaml
deleted file mode 100644
index fd0df634..00000000
--- a/tests/e2e/manifests/minio.yaml
+++ /dev/null
@@ -1,85 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: minio
-  namespace: ragas-test
-  labels:
-    app: minio
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: minio
-  template:
-    metadata:
-      labels:
-        app: minio
-    spec:
-      containers:
-        - name: minio
-          image: quay.io/minio/minio:latest
-          args:
-            - server
-            - /data
-            - --console-address
-            - ":9001"
-          env:
-            - name: MINIO_ROOT_USER
-              value: "minioadmin"
-            - name: MINIO_ROOT_PASSWORD
-              value: "minioadmin"
-          ports:
-            - containerPort: 9000
-              name: s3
-            - containerPort: 9001
-              name: console
-          readinessProbe:
-            httpGet:
-              path: /minio/health/ready
-              port: 9000
-            initialDelaySeconds: 5
-            periodSeconds: 5
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: minio-service
-  namespace: ragas-test
-spec:
-  selector:
-    app: minio
-  ports:
-    - name: s3
-      port: 9000
-      targetPort: 9000
-    - name: console
-      port: 9001
-      targetPort: 9001
----
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: minio-create-bucket
-  namespace: ragas-test
-spec:
-  backoffLimit: 3
-  template:
-    spec:
-      restartPolicy: OnFailure
-      containers:
-        - name: mc
-          image: quay.io/minio/mc:latest
-          env:
-            - name: HOME
-              value: /tmp
-          command:
-            - sh
-            - -c
-            - |
-              echo "Waiting for MinIO to be ready..."
-              until mc alias set local http://minio-service:9000 minioadmin minioadmin 2>/dev/null; do
-                sleep 2
-              done
-              echo "MinIO is ready."
-              mc mb --ignore-existing local/ragas-results
-              echo "Bucket 'ragas-results' is ready."
diff --git a/tests/e2e/teardown-e2e.sh b/tests/e2e/teardown-e2e.sh
deleted file mode 100755
index c6fc3c74..00000000
--- a/tests/e2e/teardown-e2e.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-#
-# Tear down the llama-stack-provider-ragas e2e test environment.
-#
-
-set -e
-
-echo "Tearing down e2e test environment..."
-
-oc delete namespace ragas-test --ignore-not-found
-
-echo ""
-echo "Teardown complete."
diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py
deleted file mode 100644
index 41f00ad3..00000000
--- a/tests/e2e/test_e2e.py
+++ /dev/null
@@ -1,181 +0,0 @@
-"""End-to-end tests for the llama-stack-provider-ragas distribution on OpenShift.
-
-Prerequisites:
-    - OpenShift cluster with the e2e environment deployed (see deploy-e2e.sh)
-    - Port-forward active:
-        oc port-forward -n ragas-test svc/lsd-ragas-test-service 8321:8321
-
-Environment variables:
-    LLAMA_STACK_BASE_URL  - Llama Stack server URL (default: http://localhost:8321)
-    INFERENCE_MODEL       - Model ID for eval candidate (default: Mistral-Small-24B-W8A8)
-"""
-
-import os
-import time
-
-import pytest
-from llama_stack_client import LlamaStackClient
-from rich import print as pprint
-
-# Pre-registered resource IDs (must match llama-stack-distribution.yaml)
-INLINE_BENCHMARK_ID = "hf-doc-qa-ragas-inline-benchmark"
-REMOTE_BENCHMARK_ID = "hf-doc-qa-ragas-remote-benchmark"
-DATASET_ID = "hf_doc_qa_ragas_eval"
-
-POLL_INTERVAL = 5  # seconds
-POLL_TIMEOUT = 300  # seconds
-REMOTE_POLL_TIMEOUT = 600  # seconds – pipeline pods need to pull images and install packages
-
-RAW_EVALUATION_DATA = [
-    {
-        "user_input": "What is the capital of France?",
-        "response": "The capital of France is Paris.",
-        "retrieved_contexts": [
-            "Paris is the capital and most populous city of France."
-        ],
-        "reference": "Paris",
-    },
-    {
-        "user_input": "Who invented the telephone?",
-        "response": "Alexander Graham Bell invented the telephone in 1876.",
-        "retrieved_contexts": [
-            "Alexander Graham Bell was a Scottish-American inventor who patented the first practical telephone."
-        ],
-        "reference": "Alexander Graham Bell",
-    },
-    {
-        "user_input": "What is photosynthesis?",
-        "response": "Photosynthesis is the process by which plants convert sunlight into energy.",
-        "retrieved_contexts": [
-            "Photosynthesis is a process used by plants to convert light energy into chemical energy."
-        ],
-        "reference": "Photosynthesis is the process by which plants and other organisms convert light energy into chemical energy.",
-    },
-]
-
-
-@pytest.fixture(scope="module")
-def client():
-    base_url = os.getenv("LLAMA_STACK_BASE_URL", "http://localhost:8321")
-    return LlamaStackClient(base_url=base_url)
-
-
-@pytest.fixture(scope="module")
-def inference_model():
-    return os.getenv("INFERENCE_MODEL", "Mistral-Small-24B-W8A8")
-
-
-@pytest.fixture(scope="module", autouse=True)
-def register_dataset(client):
-    """Register the evaluation dataset with inline rows."""
-    client.beta.datasets.register(
-        dataset_id=DATASET_ID,
-        purpose="eval/messages-answer",
-        source={"type": "rows", "rows": RAW_EVALUATION_DATA},
-    )
-    yield
-    client.beta.datasets.unregister(dataset_id=DATASET_ID)
-
-
-def _wait_for_job(client, benchmark_id, job_id, timeout=POLL_TIMEOUT):
-    """Poll until the eval job reaches a terminal state."""
-    deadline = time.time() + timeout
-    while time.time() < deadline:
-        job = client.alpha.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)
-        pprint("Job details:", job)
-        if job.status in ("completed", "failed"):
-            return job
-        time.sleep(POLL_INTERVAL)
-    raise TimeoutError(
-        f"Job {job_id} for benchmark {benchmark_id} did not complete within {timeout}s"
-    )
-
-
-class TestClusterSmoke:
-    """Verify the cluster has the expected resources registered."""
-
-    def test_models_registered(self, client):
-        models = client.models.list()
-        assert len(models) > 0, "No models registered"
-
-    def test_datasets_registered(self, client):
-        datasets = client.beta.datasets.list()
-        dataset_ids = [d.identifier for d in datasets]
-        assert DATASET_ID in dataset_ids, (
-            f"Dataset '{DATASET_ID}' not found. Available: {dataset_ids}"
-        )
-
-    def test_benchmarks_registered(self, client):
-        benchmarks = client.alpha.benchmarks.list()
-        benchmark_ids = [b.identifier for b in benchmarks]
-        assert INLINE_BENCHMARK_ID in benchmark_ids, (
-            f"Benchmark '{INLINE_BENCHMARK_ID}' not found. Available: {benchmark_ids}"
-        )
-
-
-class TestInlineEval:
-    """Run evaluation using the inline ragas provider."""
-
-    def test_run_eval(self, client, inference_model):
-        job = client.alpha.eval.run_eval(
-            benchmark_id=INLINE_BENCHMARK_ID,
-            benchmark_config={
-                "eval_candidate": {
-                    "type": "model",
-                    "model": inference_model,
-                    "sampling_params": {
-                        "temperature": 0.1,
-                        "max_tokens": 100,
-                    },
-                },
-                "scoring_params": {},
-                "num_examples": 3,
-            },
-        )
-        assert job.job_id is not None
-        assert job.status == "in_progress"
-
-        completed = _wait_for_job(client, INLINE_BENCHMARK_ID, job.job_id)
-        assert completed.status == "completed", (
-            f"Job finished with status '{completed.status}'"
-        )
-
-        results = client.alpha.eval.jobs.retrieve(
-            benchmark_id=INLINE_BENCHMARK_ID, job_id=job.job_id
-        )
-        assert results.scores, "Expected non-empty scores"
-
-
-class TestRemoteEval:
-    """Run evaluation using the remote ragas provider (KFP + MinIO)."""
-
-    def test_run_eval(self, client, inference_model):
-        job = client.alpha.eval.run_eval(
-            benchmark_id=REMOTE_BENCHMARK_ID,
-            benchmark_config={
-                "eval_candidate": {
-                    "type": "model",
-                    "model": inference_model,
-                    "sampling_params": {
-                        "temperature": 0.1,
-                        "max_tokens": 100,
-                    },
-                },
-                "scoring_params": {},
-                "num_examples": 3,
-            },
-        )
-        assert job.job_id is not None
-        assert job.status == "in_progress"
-
-        completed = _wait_for_job(
-            client, REMOTE_BENCHMARK_ID, job.job_id, timeout=REMOTE_POLL_TIMEOUT
-        )
-        assert completed.status == "completed", (
-            f"Job finished with status '{completed.status}'"
-        )
-
-        results = client.alpha.eval.jobs.retrieve(
-            benchmark_id=REMOTE_BENCHMARK_ID, job_id=job.job_id
-        )
-        assert results.scores, "Expected non-empty scores"

From e4130ce7d77a9e58ea83a5e4efb845200f7d09ca Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Thu, 12 Mar 2026 12:01:41 -0400
Subject: [PATCH 5/7] Invoke SmokeTester.test_providers_registered in smoke
 tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_e2e.py               | 1 +
 tests/test_inline_evaluation.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 219bd583..d40dcffe 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -59,6 +59,7 @@ def eval_tester(
 
 @pytest.mark.usefixtures("register_benchmarks")
 def test_cluster_smoke(smoke_tester):
+    smoke_tester.test_providers_registered()
     smoke_tester.test_models_registered()
     smoke_tester.test_datasets_registered()
     smoke_tester.test_benchmarks_registered()
diff --git a/tests/test_inline_evaluation.py b/tests/test_inline_evaluation.py
index 2e4ec34f..14671ef4 100644
--- a/tests/test_inline_evaluation.py
+++ b/tests/test_inline_evaluation.py
@@ -315,6 +315,7 @@ def eval_tester(
 
 @pytest.mark.usefixtures("register_benchmarks")
 def test_library_client_smoke(smoke_tester):
+    smoke_tester.test_providers_registered()
     smoke_tester.test_models_registered()
     smoke_tester.test_datasets_registered()
     smoke_tester.test_benchmarks_registered()

From a6289675d1f5852f01e86396d5c7be67c3681ab9 Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Thu, 12 Mar 2026 12:13:45 -0400
Subject: [PATCH 6/7] Align EMBEDDING_MODEL defaults with OpenShift
 distribution config

Drop the `embedding/` prefix so test defaults match the configmap value
`nomic-ai/nomic-embed-text-v1.5`.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_e2e.py             | 2 +-
 tests/test_remote_wrappers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index d40dcffe..6ce21f1b 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -31,7 +31,7 @@ def inference_model():
 
 @pytest.fixture(scope="module")
 def embedding_model():
-    return os.getenv("EMBEDDING_MODEL", "embedding/nomic-ai/nomic-embed-text-v1.5")
+    return os.getenv("EMBEDDING_MODEL", "nomic-ai/nomic-embed-text-v1.5")
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/test_remote_wrappers.py b/tests/test_remote_wrappers.py
index 7cf9a247..10650844 100644
--- a/tests/test_remote_wrappers.py
+++ b/tests/test_remote_wrappers.py
@@ -63,7 +63,7 @@ def inference_model():
 
 @pytest.fixture
 def embedding_model():
-    return os.getenv("EMBEDDING_MODEL", "embedding/nomic-ai/nomic-embed-text-v1.5")
+    return os.getenv("EMBEDDING_MODEL", "nomic-ai/nomic-embed-text-v1.5")
 
 
 @pytest.fixture

From ba148d2ff7bcb86314b287a0194ca8905331fd13 Mon Sep 17 00:00:00 2001
From: Diego Maniloff <diego.maniloff@gmail.com>
Date: Thu, 12 Mar 2026 12:18:19 -0400
Subject: [PATCH 7/7] Document model fixture defaults and their
 backend-specific rationale

Add a model configuration section to TESTING.md and inline comments in
each test module explaining why the defaults differ across suites.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/TESTING.md                | 12 ++++++++++++
 tests/test_e2e.py               |  2 ++
 tests/test_inline_evaluation.py |  3 +++
 tests/test_remote_wrappers.py   |  2 ++
 4 files changed, 19 insertions(+)

diff --git a/tests/TESTING.md b/tests/TESTING.md
index bda59e88..71a482b7 100644
--- a/tests/TESTING.md
+++ b/tests/TESTING.md
@@ -43,6 +43,18 @@ uv run pytest tests/test_e2e.py
 
 These tests exercise both the inline and remote eval providers through the Llama Stack eval API, including dataset registration, benchmark creation, and eval job execution with result polling.
 
+## Model configuration
+
+Each test module defines its own `inference_model` and `embedding_model` fixtures with defaults appropriate to its backend:
+
+| Module | Inference default | Embedding default | Backend |
+|--------|-------------------|-------------------|---------|
+| `test_inline_evaluation.py` | `ollama/granite3.3:2b` | `ollama/all-minilm:latest` | In-process Ollama (library client) |
+| `test_remote_wrappers.py` | `litellm/Mistral-Small-24B-W8A8` | `nomic-ai/nomic-embed-text-v1.5` | Mocked `LlamaStackClient` |
+| `test_e2e.py` | `Mistral-Small-24B-W8A8` | `nomic-ai/nomic-embed-text-v1.5` | OpenShift cluster (see `cluster-deployment/manifests/configmap-and-secrets.yaml`) |
+
+The `INFERENCE_MODEL` and `EMBEDDING_MODEL` environment variables override these defaults in every suite. When overriding, ensure the values match the models registered in the target environment — e.g. values used for the e2e suite must match the OpenShift configmap, and values used for the inline suite must carry the `ollama/` prefix expected by the library client config. Because the suites expect different model identifiers, set the variables per test run rather than exporting them globally.
+
 ## Cluster deployment (`cluster-deployment/`)
 
 Contains the Containerfile, deployment/teardown scripts, and Kubernetes manifests needed to stand up the e2e test environment on OpenShift. See `cluster-deployment/deploy-e2e.sh` to deploy.
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 6ce21f1b..fcce3cdd 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -26,11 +26,13 @@ def client(llama_stack_base_url):
 
 @pytest.fixture(scope="module")
 def inference_model():
+    # Default must match cluster-deployment/manifests/configmap-and-secrets.yaml
     return os.getenv("INFERENCE_MODEL", "Mistral-Small-24B-W8A8")
 
 
 @pytest.fixture(scope="module")
 def embedding_model():
+    # Default must match cluster-deployment/manifests/configmap-and-secrets.yaml
     return os.getenv("EMBEDDING_MODEL", "nomic-ai/nomic-embed-text-v1.5")
 
 
diff --git a/tests/test_inline_evaluation.py b/tests/test_inline_evaluation.py
index 14671ef4..888b1f83 100644
--- a/tests/test_inline_evaluation.py
+++ b/tests/test_inline_evaluation.py
@@ -282,11 +282,14 @@ def client(library_client):
 
 @pytest.fixture
 def inference_model():
+    # Default must use the ollama/ prefix to match the library client config
+    # built in library_stack_config (provider_model_id strips this prefix).
     return os.getenv("INFERENCE_MODEL", "ollama/granite3.3:2b")
 
 
 @pytest.fixture
 def embedding_model():
+    # Default must use the ollama/ prefix — see inference_model comment above.
     return os.getenv("EMBEDDING_MODEL", "ollama/all-minilm:latest")
 
 
diff --git a/tests/test_remote_wrappers.py b/tests/test_remote_wrappers.py
index 10650844..31025dca 100644
--- a/tests/test_remote_wrappers.py
+++ b/tests/test_remote_wrappers.py
@@ -58,11 +58,13 @@
 
 @pytest.fixture
 def inference_model():
+    # The client is mocked by default; this value only matters with --no-mock-client.
     return os.getenv("INFERENCE_MODEL", "litellm/Mistral-Small-24B-W8A8")
 
 
 @pytest.fixture
 def embedding_model():
+    # The client is mocked by default; this value only matters with --no-mock-client.
     return os.getenv("EMBEDDING_MODEL", "nomic-ai/nomic-embed-text-v1.5")