diff --git a/Makefile b/Makefile index 30adbd3..90ce3a3 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,8 @@ .PHONY: deploy deploy-all undeploy undeploy-kserve status help check-kubeconfig sync clear-cache -.PHONY: deploy-cert-manager deploy-istio deploy-lws deploy-kserve +.PHONY: deploy-cert-manager deploy-istio deploy-lws deploy-kserve deploy-opendatahub-prerequisites deploy-cert-manager-pki .PHONY: test conformance HELMFILE_CACHE := $(HOME)/.cache/helmfile -KSERVE_REF ?= release-v0.15 KSERVE_NAMESPACE ?= opendatahub check-kubeconfig: @@ -13,8 +12,8 @@ help: @echo "rhaii-on-xks - Infrastructure for llm-d on xKS (AKS/CoreWeave)" @echo "" @echo "Deploy:" - @echo " make deploy - Deploy cert-manager + istio" - @echo " make deploy-all - Deploy all (cert-manager + istio + lws)" + @echo " make deploy - Deploy cert-manager + istio + lws" + @echo " make deploy-all - Deploy all (cert-manager + istio + lws + kserve)" @echo " make deploy-kserve - Deploy KServe" @echo "" @echo "Undeploy:" @@ -40,10 +39,10 @@ sync: clear-cache deploy: check-kubeconfig clear-cache helmfile apply --selector name=cert-manager-operator helmfile apply --selector name=sail-operator + helmfile apply --selector name=lws-operator @$(MAKE) status -deploy-all: check-kubeconfig clear-cache - helmfile apply +deploy-all: check-kubeconfig deploy-cert-manager deploy-istio deploy-lws deploy-kserve @$(MAKE) status deploy-cert-manager: check-kubeconfig clear-cache @@ -55,35 +54,44 @@ deploy-istio: check-kubeconfig clear-cache deploy-lws: check-kubeconfig clear-cache helmfile apply --selector name=lws-operator -deploy-kserve: check-kubeconfig - @echo "=== Deploying KServe (ref=$(KSERVE_REF)) ===" +deploy-opendatahub-prerequisites: check-kubeconfig + @echo "=== Deploying OpenDataHub prerequisites ===" kubectl create namespace $(KSERVE_NAMESPACE) --dry-run=client -o yaml | kubectl apply -f - -kubectl get secret redhat-pull-secret -n istio-system -o yaml 2>/dev/null | \ sed 's/namespace: istio-system/namespace: 
$(KSERVE_NAMESPACE)/' | \ kubectl apply -f - 2>/dev/null || true - kubectl apply -k "https://github.com/opendatahub-io/kserve/config/overlays/odh-test/cert-manager?ref=$(KSERVE_REF)" + +deploy-cert-manager-pki: check-kubeconfig deploy-opendatahub-prerequisites + @kubectl get crd clusterissuers.cert-manager.io >/dev/null 2>&1 || \ + (echo "ERROR: cert-manager CRDs not found. Run 'make deploy-cert-manager' first." && exit 1) + @echo "Waiting for cert-manager webhook..." + -kubectl delete secret cert-manager-webhook-ca -n cert-manager --ignore-not-found 2>/dev/null || true + kubectl rollout restart deployment/cert-manager-webhook -n cert-manager + kubectl rollout status deployment/cert-manager-webhook -n cert-manager --timeout=120s + @sleep 5 + kubectl apply -f ./charts/kserve/pki-prereq.yaml kubectl wait --for=condition=Ready clusterissuer/opendatahub-ca-issuer --timeout=120s - @echo "Applying CRDs and deployment (CR errors expected, will retry)..." - -kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=$(KSERVE_REF)" | kubectl apply --server-side --force-conflicts -f - 2>/dev/null || true - @echo "Removing webhooks to allow controller startup..." - -kubectl delete validatingwebhookconfiguration llminferenceservice.serving.kserve.io llminferenceserviceconfig.serving.kserve.io --ignore-not-found 2>/dev/null || true - kubectl wait --for=condition=Available deployment/kserve-controller-manager -n $(KSERVE_NAMESPACE) --timeout=300s - @echo "Controller ready, applying CRs..." - kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=$(KSERVE_REF)" | kubectl apply --server-side --force-conflicts -f - + +deploy-kserve: check-kubeconfig deploy-cert-manager-pki + @echo "Applying KServe via Helm..." 
+ helmfile sync --wait --selector name=kserve-rhaii-xks --skip-crds @echo "=== KServe deployed ===" # Undeploy -undeploy: check-kubeconfig +undeploy: check-kubeconfig undeploy-kserve @./scripts/cleanup.sh -y undeploy-kserve: check-kubeconfig -@kubectl delete llminferenceservice --all -A --ignore-not-found 2>/dev/null || true -@kubectl delete inferencepool --all -A --ignore-not-found 2>/dev/null || true - -@kubectl delete deployment kserve-controller-manager -n $(KSERVE_NAMESPACE) --ignore-not-found 2>/dev/null || true + -@helm uninstall kserve-rhaii-xks --namespace $(KSERVE_NAMESPACE) 2>/dev/null || true -@kubectl delete validatingwebhookconfiguration llminferenceservice.serving.kserve.io llminferenceserviceconfig.serving.kserve.io --ignore-not-found 2>/dev/null || true - -@# Removes KServe CRDs and Inference Extension CRDs (InferencePool, InferenceModel) + -@# Removes KServe CRDs and Inference Extension CRDs (Helm does not remove CRDs on uninstall) -@kubectl get crd -o name | grep -E "serving.kserve.io|inference.networking" | xargs -r kubectl delete --ignore-not-found 2>/dev/null || true - -@kubectl delete clusterissuer opendatahub-ca-issuer --ignore-not-found 2>/dev/null || true + -@# Removes cluster-scoped RBAC resources + -@kubectl get clusterrole,clusterrolebinding -o name | grep -i kserve | xargs -r kubectl delete --ignore-not-found 2>/dev/null || true + -@kubectl delete clusterissuer opendatahub-ca-issuer opendatahub-selfsigned-issuer --ignore-not-found 2>/dev/null || true + -@kubectl delete certificate opendatahub-ca -n cert-manager --ignore-not-found 2>/dev/null || true -@kubectl delete namespace $(KSERVE_NAMESPACE) --ignore-not-found --wait=false 2>/dev/null || true @echo "=== KServe removed ===" @@ -103,6 +111,12 @@ status: check-kubeconfig @echo "lws-operator:" @kubectl get pods -n openshift-lws-operator 2>/dev/null || echo " Not deployed" @echo "" + @echo "kserve:" + @kubectl get pods -n $(KSERVE_NAMESPACE) -l control-plane=kserve-controller-manager 
2>/dev/null || echo " Not deployed" + @echo "" + @echo "kserve config:" + @kubectl get llminferenceserviceconfig -n $(KSERVE_NAMESPACE) 2>/dev/null || echo " Not deployed" + @echo "" @echo "=== API Versions ===" @echo -n "InferencePool API: " @if kubectl get crd inferencepools.inference.networking.k8s.io >/dev/null 2>&1; then \ diff --git a/README.md b/README.md index ac68425..edff8e0 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ Infrastructure Helm charts for deploying Red Hat AI Inference Server (KServe LLM | Repository | Purpose | |------------|---------| | [llm-d-xks-aks](https://github.com/kwozyman/llm-d-xks-aks) | AKS cluster provisioning (creates cluster + GPU nodes + GPU Operator) | -| [rhaii-xks-kserve](https://github.com/pierDipi/rhaii-xks-kserve) | KServe Helm charts (WIP) | ## Overview @@ -18,6 +17,7 @@ Infrastructure Helm charts for deploying Red Hat AI Inference Server (KServe LLM | cert-manager-operator | 1.15.2 | TLS certificate management | | sail-operator (Istio) | 3.2.x | Gateway API for inference routing | | lws-operator | 1.0 | LeaderWorkerSet controller for multi-node workloads | +| kserve | 3.4.0-ea.1 | KServe controller for LLMInferenceService lifecycle | ### Version Compatibility @@ -26,14 +26,16 @@ Infrastructure Helm charts for deploying Red Hat AI Inference Server (KServe LLM | OSSM (Sail Operator) | 3.2.x | Gateway API for inference routing | | Istio | v1.27.x | Service mesh | | InferencePool API | v1 | `inference.networking.k8s.io/v1` | -| KServe | release-v0.15 | LLMInferenceService controller | +| KServe | rhoai-3.4+ | LLMInferenceService controller | ## Prerequisites - Kubernetes cluster (AKS or CoreWeave) - see [llm-d-xks-aks](https://github.com/kwozyman/llm-d-xks-aks) for AKS provisioning -- `kubectl`, `helm` (v3.17+), `helmfile`, `kustomize` (v5.7+) +- `kubectl`, `helm` (v3.17+), `helmfile` - Red Hat account (for Sail Operator and vLLM images from `registry.redhat.io`) +**Cluster readiness check (optional):** Run `cd 
validation && make container && make run` to verify cloud provider, GPU availability, and instance types before deploying. CRD checks will pass only after operators are deployed. See [Preflight Validation](./validation/README.md). + ### Red Hat Pull Secret Setup The Sail Operator and RHAIIS vLLM images are hosted on `registry.redhat.io` which requires authentication. @@ -93,210 +95,24 @@ useSystemPodmanAuth: true ## Quick Start -### Step 1: Deploy Infrastructure - ```bash git clone https://github.com/opendatahub-io/rhaii-on-xks.git cd rhaii-on-xks -# Deploy cert-manager + istio + lws +# 1. Deploy all components (cert-manager + Istio + LWS + KServe) make deploy-all -# Check status -make status -``` - -### Step 2: Deploy KServe - -```bash -make deploy-kserve - -# Verify -kubectl get pods -n opendatahub -kubectl get llminferenceserviceconfig -n opendatahub -``` - -
-Manual steps (click to expand) - -```bash -# Create opendatahub namespace -kubectl create namespace opendatahub --dry-run=client -o yaml | kubectl apply -f - - -# Copy pull secret from istio-system (created by infrastructure deployment) -kubectl get secret redhat-pull-secret -n istio-system -o yaml | \ - sed 's/namespace: istio-system/namespace: opendatahub/' | \ - kubectl apply -f - - -# Apply cert-manager PKI resources first (required for webhook certificates) -kubectl apply -k "https://github.com/opendatahub-io/kserve/config/overlays/odh-test/cert-manager?ref=release-v0.15" -kubectl wait --for=condition=Ready clusterissuer/opendatahub-ca-issuer --timeout=120s - -# First apply - creates CRDs and deployment (CR errors expected due to webhook) -kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=release-v0.15" | kubectl apply --server-side --force-conflicts -f - || true - -# Delete webhooks to allow controller startup -kubectl delete validatingwebhookconfiguration llminferenceservice.serving.kserve.io llminferenceserviceconfig.serving.kserve.io --ignore-not-found - -# Wait for controller to be ready -kubectl wait --for=condition=Available deployment/kserve-controller-manager -n opendatahub --timeout=300s - -# Second apply - now webhooks work, applies CRs -kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=release-v0.15" | kubectl apply --server-side --force-conflicts -f - - -# Verify LLMInferenceServiceConfig templates exist -kubectl get llminferenceserviceconfig -n opendatahub -``` - -
- -### Step 3: Set up Gateway - -```bash +# 2. Set up inference gateway ./scripts/setup-gateway.sh -# Verify -kubectl get gateway -n opendatahub -``` - -
-What the script does (click to expand) +# 3. Validate deployment +cd validation && make container && make run -The script: -1. Copies the CA bundle from cert-manager to opendatahub namespace -2. Creates a Gateway with the CA bundle mounted for mTLS to backend services -3. Patches the Gateway pod to use the pull secret - -
- -### Step 4: Deploy LLMInferenceService - -#### Hardware Requirements - -| Resource | Per Replica | Notes | -|----------|-------------|-------| -| GPU | 1x NVIDIA GPU | A10, A100, H100, or similar | -| CPU | 2-4 cores | | -| Memory | 16-32 Gi | Depends on model size | - -#### Node Requirements - -Ensure your cluster has GPU nodes with the NVIDIA device plugin installed: - -```bash -# Verify GPU nodes are available -kubectl get nodes -l nvidia.com/gpu.present=true - -# Check GPU resources -kubectl describe nodes | grep -A5 "nvidia.com/gpu" -``` - -For AKS, create a GPU node pool: -```bash -az aks nodepool add \ - --resource-group \ - --cluster-name \ - --name gpunp \ - --node-count 2 \ - --node-vm-size Standard_NC24ads_A100_v4 \ - --node-taints sku=gpu:NoSchedule \ - --labels nvidia.com/gpu.present=true -``` - -#### Deploy Sample Model - -```bash -# Create namespace first -export NAMESPACE=llm-d-test -kubectl create namespace $NAMESPACE --dry-run=client -o yaml | kubectl apply -f - - -# Copy pull secret from istio-system (created by infrastructure deployment) -kubectl get secret redhat-pull-secret -n istio-system -o yaml | \ - sed "s/namespace: istio-system/namespace: $NAMESPACE/" | \ - kubectl apply -f - - -# Patch default ServiceAccount to use pull secret (all pods will inherit it) -kubectl patch serviceaccount default -n $NAMESPACE \ - -p '{"imagePullSecrets": [{"name": "redhat-pull-secret"}]}' - -# Deploy Qwen2.5-7B model with scheduler -kubectl apply -n $NAMESPACE -f - <<'EOF' -apiVersion: serving.kserve.io/v1alpha1 -kind: LLMInferenceService -metadata: - name: qwen2-7b-instruct -spec: - model: - name: Qwen/Qwen2.5-7B-Instruct - uri: hf://Qwen/Qwen2.5-7B-Instruct - replicas: 1 - router: - gateway: {} - route: {} - scheduler: {} # Enable EPP scheduler for intelligent routing - template: - containers: - - name: main - resources: - limits: - cpu: "4" - memory: 32Gi - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: 16Gi - nvidia.com/gpu: "1" - livenessProbe: 
- httpGet: - path: /health - port: 8000 - scheme: HTTPS - initialDelaySeconds: 120 - periodSeconds: 30 - timeoutSeconds: 30 - failureThreshold: 5 -EOF - -# Watch deployment status -kubectl get llmisvc -n $NAMESPACE -w -``` - -#### Check Deployment Status - -```bash -# Check pods -kubectl get pods -n $NAMESPACE - -# Check events if pods are not starting -kubectl describe llmisvc qwen2-7b-instruct -n $NAMESPACE -``` - -#### Test Inference - -```bash -# Get the service URL -SERVICE_URL=$(kubectl get llmisvc qwen2-7b-instruct -n $NAMESPACE -o jsonpath='{.status.url}') - -# Test with curl (use external gateway IP) -curl -X POST "${SERVICE_URL}/v1/chat/completions" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Qwen/Qwen2.5-7B-Instruct", - "messages": [{"role": "user", "content": "What is Kubernetes?"}], - "max_tokens": 100 - }' +# 4. Check status +make status ``` -#### More Examples - -| Example | Description | Path | -|---------|-------------|------| -| CPU (OPT-125M) | Simple CPU deployment for testing | `docs/samples/llmisvc/opt-125m-cpu/` | -| GPU with Scheduler | Intelligent request routing | `docs/samples/llmisvc/single-node-gpu/` | -| Prefill-Decode | Disaggregated serving | `docs/samples/llmisvc/single-node-gpu/llm-inference-service-pd-qwen2-7b-gpu.yaml` | -| Multi-node MoE | DeepSeek with expert parallelism | `docs/samples/llmisvc/dp-ep/` | - -See the [KServe samples](https://github.com/opendatahub-io/kserve/tree/main/docs/samples/llmisvc) for more examples. +For deploying LLM inference services, GPU requirements, and testing inference, see the [full deployment guide](./docs/deploying-llm-d-on-managed-kubernetes.md). 
--- @@ -304,8 +120,8 @@ See the [KServe samples](https://github.com/opendatahub-io/kserve/tree/main/docs ```bash # Deploy -make deploy # cert-manager + istio -make deploy-all # cert-manager + istio + lws +make deploy # cert-manager + istio + lws +make deploy-all # cert-manager + istio + lws + kserve make deploy-kserve # Deploy KServe # Undeploy @@ -345,43 +161,6 @@ lwsOperator: --- -## KServe Controller Settings - -The odh-xks overlay disables several OpenShift-specific features for vanilla Kubernetes (AKS/CoreWeave) compatibility: - -```yaml -# Disabled by default in odh-xks overlay -- name: LLMISVC_MONITORING_DISABLED - value: "true" # No Prometheus Operator dependency -- name: LLMISVC_AUTH_DISABLED - value: "true" # No Kuadrant/RHCL dependency -- name: LLMISVC_SCC_DISABLED - value: "true" # No OpenShift SecurityContextConstraints -``` - -| Setting | Why Disabled on xKS | -|---------|---------------------| -| `LLMISVC_MONITORING_DISABLED` | Prometheus Operator not required for basic inference | -| `LLMISVC_AUTH_DISABLED` | Authorino/Kuadrant (Red Hat Connectivity Link) is OpenShift-only | -| `LLMISVC_SCC_DISABLED` | SecurityContextConstraints are OpenShift-specific | - -### Enabling Monitoring - -To enable Prometheus monitoring for KServe-managed workloads: - -1. Deploy Prometheus Operator on your cluster (see [monitoring-stack/](./monitoring-stack/)) - -2. Patch the KServe controller to enable monitoring: -```bash -kubectl set env deployment/kserve-controller-manager \ - -n opendatahub \ - LLMISVC_MONITORING_DISABLED=false -``` - -This enables KServe to automatically create `PodMonitor` resources for vLLM pods. 
- ---- - ## Collecting Debug Information If you encounter issues, collect diagnostic information for troubleshooting or to share with Red Hat support: @@ -390,92 +169,13 @@ If you encounter issues, collect diagnostic information for troubleshooting or t ./scripts/collect-debug-info.sh ``` -This collects logs, status, and events from all components (cert-manager, Istio, LWS, KServe) into a single directory. See the [Collecting Debug Information](./docs/collecting-debug-information.md) guide for details. +See the [Collecting Debug Information](./docs/collecting-debug-information.md) guide for details. --- ## Troubleshooting -### KServe Controller Issues - -If the controller pod is stuck in `ContainerCreating` (waiting for certificate): -```bash -# Apply cert-manager resources separately first -kubectl apply -k "https://github.com/opendatahub-io/kserve/config/overlays/odh-test/cert-manager?ref=release-v0.15" -kubectl wait --for=condition=Ready certificate/kserve-webhook-server -n opendatahub --timeout=120s - -# Then re-apply the overlay -kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=release-v0.15" | kubectl apply --server-side --force-conflicts -f - -``` - -If webhook validation blocks apply (manual deployment only - `make deploy-kserve` handles this automatically): -```bash -kubectl delete validatingwebhookconfiguration llminferenceservice.serving.kserve.io llminferenceserviceconfig.serving.kserve.io -kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=release-v0.15" | kubectl apply --server-side --force-conflicts -f - -``` - -If you get "no matches for kind LLMInferenceServiceConfig" errors: -```bash -# This is a CRD timing issue - run the apply command again after CRDs are registered -sleep 5 -kustomize build "https://github.com/opendatahub-io/kserve/config/overlays/odh-xks?ref=release-v0.15" | kubectl apply --server-side --force-conflicts -f - -``` - -### Gateway Issues - -If Gateway pod has 
`ErrImagePull`: -```bash -# Copy pull secret to opendatahub namespace -kubectl get secret redhat-pull-secret -n istio-system -o yaml | \ - sed 's/namespace: istio-system/namespace: opendatahub/' | kubectl apply -f - - -# Patch the gateway ServiceAccount -kubectl patch sa inference-gateway-istio -n opendatahub \ - -p '{"imagePullSecrets": [{"name": "redhat-pull-secret"}]}' - -# Delete the failing pod to trigger restart -kubectl delete pod -n opendatahub -l gateway.networking.k8s.io/gateway-name=inference-gateway -``` - ---- - -## Reinstalling Istio - -If you need to do a clean reinstall of Istio: - -```bash -# 1. Delete the Istio CR (triggers istiod cleanup) -kubectl delete istio default -n istio-system - -# 2. Wait for istiod to be removed -kubectl wait --for=delete pod -l app=istiod -n istio-system --timeout=120s - -# 3. Redeploy -make deploy-istio -``` - ---- - -## Architecture - -### TLS Certificate Architecture - -The odh-xks overlay creates an OpenDataHub-scoped CA: -1. Self-signed bootstrap issuer creates root CA in cert-manager namespace -2. ClusterIssuer (`opendatahub-ca-issuer`) uses this CA to sign certificates -3. KServe controller generates certificates for LLM workload mTLS automatically -4. 
Gateway needs CA bundle mounted at `/var/run/secrets/opendatahub/ca.crt` - -### Key Differences from OpenShift (ODH) Overlay - -| Component | OpenShift (ODH) | Vanilla K8s (odh-xks) | -|-----------|-----------------|----------------------| -| Certificates | OpenShift service-ca | cert-manager | -| Security constraints | SCC included | Removed | -| Traffic routing | Istio VirtualService | Gateway API | -| Webhook CA injection | Service annotations | cert-manager annotations | -| Auth | Authorino/Kuadrant | Disabled | -| Monitoring | Prometheus included | Disabled (optional) | +For detailed troubleshooting steps (KServe controller issues, gateway errors, webhook problems, monitoring setup), see the [full deployment guide - Troubleshooting](./docs/deploying-llm-d-on-managed-kubernetes.md#9-troubleshooting). --- @@ -490,18 +190,24 @@ rhaii-on-xks/ ├── charts/ │ ├── cert-manager-operator/ # cert-manager operator Helm chart │ ├── sail-operator/ # Sail/Istio operator Helm chart -│ └── lws-operator/ # LWS operator Helm chart +│ ├── lws-operator/ # LWS operator Helm chart +│ └── kserve/ # KServe controller Helm chart (auto-generated) +├── validation/ # Preflight validation checks +│ ├── llmd_xks_checks.py # Validation script +│ ├── Containerfile # Container build +│ └── Makefile # Build and run helpers └── scripts/ ├── cleanup.sh # Cleanup infrastructure (helmfile destroy + finalizers) └── setup-gateway.sh # Set up Gateway with CA bundle for mTLS ``` -## Operator Charts +## Charts -Operator Helm charts are included locally under `charts/`: +Helm charts are included locally under `charts/`: - `charts/cert-manager-operator/` — cert-manager operator - `charts/sail-operator/` — Sail/Istio operator - `charts/lws-operator/` — LeaderWorkerSet operator +- `charts/kserve/` — KServe controller (auto-generated from Kustomize overlays, all images from `registry.redhat.io`) -The helmfile imports these local charts including presync hooks for CRD installation. 
+The helmfile imports the infrastructure charts (cert-manager, sail-operator, lws-operator) including presync hooks for CRD installation. The KServe OCI chart is deployed via helmfile from `ghcr.io/opendatahub-io/kserve-rhaii-xks`. diff --git a/charts/kserve/pki-prereq.yaml b/charts/kserve/pki-prereq.yaml new file mode 100644 index 0000000..c888ee7 --- /dev/null +++ b/charts/kserve/pki-prereq.yaml @@ -0,0 +1,39 @@ +# cert-manager PKI prerequisites for KServe +# These resources must be applied before installing the KServe chart. +# The default uses a self-signed CA for internal TLS between inference +# components. For production, replace opendatahub-selfsigned-issuer with +# a ClusterIssuer backed by your organization's CA (e.g., Vault, ACM PCA). +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: opendatahub-selfsigned-issuer +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: opendatahub-ca-issuer +spec: + ca: + secretName: opendatahub-ca +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: opendatahub-ca + namespace: cert-manager +spec: + secretName: opendatahub-ca + isCA: true + commonName: opendatahub-ca + duration: 87600h # 10 years + renewBefore: 2160h # 90 days + privateKey: + algorithm: RSA + size: 4096 + issuerRef: + name: opendatahub-selfsigned-issuer + kind: ClusterIssuer + group: cert-manager.io diff --git a/docs/deploying-llm-d-on-managed-kubernetes.md b/docs/deploying-llm-d-on-managed-kubernetes.md index aa15309..be782e2 100644 --- a/docs/deploying-llm-d-on-managed-kubernetes.md +++ b/docs/deploying-llm-d-on-managed-kubernetes.md @@ -1,7 +1,7 @@ # Deploying Red Hat AI Inference Server on Managed Kubernetes **Product:** Red Hat AI Inference Server (RHAIIS) -**Version:** 0.15 +**Version:** 3.4 **Platforms:** Azure Kubernetes Service (AKS), CoreWeave Kubernetes Service (CKS) --- @@ -21,16 +21,16 @@ This guide provides step-by-step instructions for deploying 
Red Hat AI Inference ## Table of Contents 1. [Prerequisites](#1-prerequisites) + - [Preflight Validation](#15-preflight-validation-recommended) 2. [Architecture Overview](#2-architecture-overview) -3. [Deploying Infrastructure Components](#3-deploying-infrastructure-components) -4. [Deploying the Inference Controller](#4-deploying-the-inference-controller) -5. [Configuring the Inference Gateway](#5-configuring-the-inference-gateway) -6. [Deploying an LLM Inference Service](#6-deploying-an-llm-inference-service) -7. [Verifying the Deployment](#7-verifying-the-deployment) -8. [Optional: Enabling Monitoring](#8-optional-enabling-monitoring) -9. [Collecting Debug Information](#9-collecting-debug-information) -10. [Troubleshooting](#10-troubleshooting) -11. [Appendix: Component Versions](#appendix-component-versions) +3. [Deploying All Components](#3-deploying-all-components) +4. [Configuring the Inference Gateway](#4-configuring-the-inference-gateway) +5. [Deploying an LLM Inference Service](#5-deploying-an-llm-inference-service) +6. [Verifying the Deployment](#6-verifying-the-deployment) +7. [Optional: Enabling Monitoring](#7-optional-enabling-monitoring) +8. [Collecting Debug Information](#8-collecting-debug-information) +9. [Troubleshooting](#9-troubleshooting) +10. 
[Appendix: Component Versions](#appendix-component-versions) --- @@ -54,7 +54,6 @@ Install the following tools on your workstation: | `kubectl` | 1.28+ | Kubernetes CLI | | `helm` | 3.17+ | Helm package manager | | `helmfile` | 0.160+ | Declarative Helm deployments | -| `kustomize` | 5.7+ | Kubernetes manifest customization | ### 1.3 Red Hat Registry Authentication @@ -122,6 +121,36 @@ kubectl describe nodes | grep -A5 "nvidia.com/gpu" --- +## 1.5 Preflight Validation (Recommended) + +Run the preflight validation checks to verify your cluster is properly configured: + +```bash +# Build the validation container +cd validation && make container + +# Run preflight checks against your cluster +make run +``` + +The preflight tool automatically detects your cloud provider and validates: + +| Check | When it passes | +|-------|----------------| +| Cloud provider | Cluster is reachable and provider detected (pre-deployment) | +| Instance type | Supported GPU instance types are present (pre-deployment) | +| GPU availability | GPU drivers and node labels found (pre-deployment) | +| cert-manager CRDs | After `make deploy-all` | +| Sail Operator CRDs | After `make deploy-all` | +| LWS Operator CRDs | After `make deploy-all` | +| KServe CRDs | After `make deploy-all` | + +> **Tip:** Run before deploying to verify cluster readiness (cloud provider, GPU, instance types). Run again after deployment to confirm all CRDs are installed. See Section 6.4 for full post-deployment validation. + +See the [Preflight Validation README](../validation/README.md) for configuration options and standalone usage. + +--- + ## 2. Architecture Overview Red Hat AI Inference Server on managed Kubernetes consists of the following components: @@ -152,7 +181,7 @@ Red Hat AI Inference Server on managed Kubernetes consists of the following comp --- -## 3. Deploying Infrastructure Components +## 3. 
Deploying All Components ### 3.1 Clone the Deployment Repository @@ -161,14 +190,16 @@ git clone https://github.com/opendatahub-io/rhaii-on-xks.git cd rhaii-on-xks ``` -### 3.2 Deploy Infrastructure +### 3.2 Deploy -Deploy cert-manager, Istio (Sail Operator), and LeaderWorkerSet: +Deploy cert-manager, Istio (Sail Operator), LeaderWorkerSet, and KServe: ```bash make deploy-all ``` +> **Note:** To deploy components individually, use `make deploy-cert-manager`, `make deploy-istio`, `make deploy-lws`, and `make deploy-kserve`. + ### 3.3 Verify Infrastructure Deployment ```bash @@ -202,46 +233,13 @@ InferencePool API: v1 (inference.networking.k8s.io) Istio version: v1.27.5 ``` ---- - -## 4. Deploying the Inference Controller - -### 4.1 Deploy KServe Controller - -```bash -make deploy-kserve -``` - -This command performs the following actions: -- Creates the `opendatahub` namespace -- Applies cert-manager PKI resources for webhook certificates -- Deploys the KServe controller with LLMInferenceService support -- Configures validating webhooks - -### 4.2 Verify Controller Deployment - -```bash -kubectl get pods -n opendatahub -``` - -**Expected output:** - -```text -NAME READY STATUS RESTARTS AGE -kserve-controller-manager-xxxxxxxxx-xxxxx 1/1 Running 0 2m -``` - -Verify the LLMInferenceServiceConfig templates are installed: - -```bash -kubectl get llminferenceserviceconfig -n opendatahub -``` +> **TLS Certificates:** The default configuration uses a self-signed CA for internal mTLS between inference components (router, scheduler, vLLM). This is sufficient for most deployments as the certificates are only used for pod-to-pod communication within the cluster. If your organization requires certificates issued by a corporate PKI, replace the `opendatahub-selfsigned-issuer` with a cert-manager ClusterIssuer backed by your CA (e.g., Vault, AWS ACM PCA, or an external PKI). 
See the [KServe Chart README - cert-manager PKI Setup](https://github.com/opendatahub-io/rhaii-on-xks/blob/main/charts/kserve/README.md#cert-manager-pki-setup) for details. The KServe chart version is configured in `values.yaml` (`kserveChartVersion`). See the [KServe Chart README](https://github.com/opendatahub-io/rhaii-on-xks/blob/main/charts/kserve/README.md) for chart details and cert-manager PKI prerequisites. --- -## 5. Configuring the Inference Gateway +## 4. Configuring the Inference Gateway -### 5.1 Create the Gateway +### 4.1 Create the Gateway Run the gateway setup script: @@ -254,7 +252,7 @@ This script: 2. Creates an Istio Gateway with the CA bundle mounted for mTLS 3. Configures the Gateway pod with registry authentication -### 5.2 Verify Gateway Deployment +### 4.2 Verify Gateway Deployment ```bash kubectl get gateway -n opendatahub @@ -275,16 +273,16 @@ kubectl get pods -n opendatahub -l gateway.networking.k8s.io/gateway-name=infere --- -## 6. Deploying an LLM Inference Service +## 5. Deploying an LLM Inference Service -### 6.1 Create the Application Namespace +### 5.1 Create the Application Namespace ```bash export NAMESPACE=llm-inference kubectl create namespace $NAMESPACE ``` -### 6.2 Configure Registry Authentication +### 5.2 Configure Registry Authentication Copy the pull secret to your application namespace: @@ -301,7 +299,7 @@ kubectl patch serviceaccount default -n $NAMESPACE \ -p '{"imagePullSecrets": [{"name": "redhat-pull-secret"}]}' ``` -### 6.3 Deploy the LLMInferenceService +### 5.3 Deploy the LLMInferenceService Create the LLMInferenceService resource: @@ -349,7 +347,7 @@ spec: EOF ``` -### 6.4 Monitor Deployment Progress +### 5.4 Monitor Deployment Progress Watch the LLMInferenceService status: @@ -361,9 +359,9 @@ The service is ready when the `READY` column shows `True`. --- -## 7. Verifying the Deployment +## 6. 
Verifying the Deployment -### 7.1 Check Service Status +### 6.1 Check Service Status ```bash kubectl get llmisvc -n $NAMESPACE @@ -376,7 +374,7 @@ NAME READY URL AGE qwen2-7b-instruct True http://20.xx.xx.xx/llm-inference/... 5m ``` -### 7.2 Check Pod Status +### 6.2 Check Pod Status ```bash kubectl get pods -n $NAMESPACE @@ -384,7 +382,7 @@ kubectl get pods -n $NAMESPACE All pods should show `Running` status with `1/1` or `2/2` ready containers. -### 7.3 Test Inference +### 6.3 Test Inference Retrieve the service URL: @@ -405,19 +403,41 @@ curl -X POST "${SERVICE_URL}/v1/chat/completions" \ }' ``` +### 6.4 Run Preflight Validation + +Run the full validation suite to confirm all components are properly installed: + +```bash +cd validation && make container && make run +``` + +All checks should show `PASSED`: + +```text +cloud_provider PASSED +instance_type PASSED +gpu_availability PASSED +crd_certmanager PASSED +crd_sailoperator PASSED +crd_lwsoperator PASSED +crd_kserve PASSED +``` + +If any checks fail, review the suggested actions in the output. See the [Preflight Validation README](../validation/README.md) for configuration options. + --- -## 8. Optional: Enabling Monitoring +## 7. Optional: Enabling Monitoring Monitoring is disabled by default. Enable it if you need: - Grafana dashboards for inference metrics - Workload Variant Autoscaler (WVA) for auto-scaling -### 8.1 Prerequisites +### 7.1 Prerequisites Install Prometheus with ServiceMonitor/PodMonitor CRD support. See the [Monitoring Setup Guide](../monitoring-stack/) for platform-specific instructions. -### 8.2 Enable Monitoring in KServe +### 7.2 Enable Monitoring in KServe ```bash kubectl set env deployment/kserve-controller-manager \ @@ -427,7 +447,7 @@ kubectl set env deployment/kserve-controller-manager \ When enabled, KServe automatically creates `PodMonitor` resources for vLLM pods. 
-### 8.3 Verify +### 7.3 Verify ```bash # Check PodMonitors created by KServe @@ -436,7 +456,7 @@ kubectl get podmonitors -n --- -## 9. Collecting Debug Information +## 8. Collecting Debug Information If you encounter issues during or after deployment, collect diagnostic data for troubleshooting: @@ -465,9 +485,9 @@ See the full guide: [Collecting Debug Information](./collecting-debug-informatio --- -## 10. Troubleshooting +## 9. Troubleshooting -### 10.1 Controller Pod Stuck in ContainerCreating +### 9.1 Controller Pod Stuck in ContainerCreating **Symptom:** The `kserve-controller-manager` pod remains in `ContainerCreating` state. @@ -475,12 +495,17 @@ See the full guide: [Collecting Debug Information](./collecting-debug-informatio **Resolution:** +Verify the cert-manager PKI resources are applied (the KServe chart expects `opendatahub-ca-issuer` ClusterIssuer): + ```bash -kubectl apply -k "https://github.com/opendatahub-io/kserve/config/overlays/odh-test/cert-manager?ref=release-v0.15" -kubectl wait --for=condition=Ready certificate/kserve-webhook-server -n opendatahub --timeout=120s +kubectl get clusterissuer opendatahub-ca-issuer +kubectl get certificate -n cert-manager + +# If missing, re-run the deployment +make deploy-kserve ``` -### 10.2 Gateway Pod Shows ErrImagePull +### 9.2 Gateway Pod Shows ErrImagePull **Symptom:** The Gateway pod fails with `ErrImagePull` or `ImagePullBackOff`. @@ -499,7 +524,7 @@ kubectl patch sa inference-gateway-istio -n opendatahub \ kubectl delete pod -n opendatahub -l gateway.networking.k8s.io/gateway-name=inference-gateway ``` -### 10.3 LLMInferenceService Pod Shows FailedScheduling +### 9.3 LLMInferenceService Pod Shows FailedScheduling **Symptom:** The inference pod shows `FailedScheduling` with message "Insufficient nvidia.com/gpu". 
@@ -517,9 +542,9 @@ kubectl delete pod -n opendatahub -l gateway.networking.k8s.io/gateway-name=infe kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: {.spec.taints}{"\n"}{end}' ``` -3. Add matching tolerations to the LLMInferenceService spec (see Section 6.3). +3. Add matching tolerations to the LLMInferenceService spec (see Section 5.3). -### 10.4 Webhook Validation Errors During Deployment +### 9.4 Webhook Validation Errors During Deployment **Symptom:** Deployment fails with "no endpoints available for service" webhook errors. @@ -528,11 +553,13 @@ kubectl delete pod -n opendatahub -l gateway.networking.k8s.io/gateway-name=infe **Resolution:** ```bash +# Delete stale webhooks kubectl delete validatingwebhookconfiguration \ llminferenceservice.serving.kserve.io \ llminferenceserviceconfig.serving.kserve.io \ --ignore-not-found +# Re-deploy KServe make deploy-kserve ``` @@ -546,7 +573,7 @@ make deploy-kserve | Sail Operator (Istio) | 3.2.1 | `registry.redhat.io/openshift-service-mesh/istio-sail-operator-bundle:3.2` | | Istio | 1.27.x | Dynamic resolution via `v1.27-latest` | | LeaderWorkerSet | 1.0 | `registry.k8s.io/lws/lws-controller` | -| KServe Controller | 0.15 | `quay.io/opendatahub/kserve-controller` | +| KServe Controller | 0.15 (chart 3.4.0-ea.1) | `registry.redhat.io` (via `charts/kserve/`) | | vLLM | Latest | `registry.redhat.io/rhaiis-tech-preview/vllm-openai-rhel9` | ### API Versions @@ -565,7 +592,7 @@ make deploy-kserve For assistance with Red Hat AI Inference Server deployments, contact Red Hat Support or consult the product documentation. 
**Additional Resources:** +- [KServe Chart README](https://github.com/opendatahub-io/rhaii-on-xks/blob/main/charts/kserve/README.md) - KServe Helm chart details, PKI prerequisites, and OCI registry install +- [Preflight Validation](https://github.com/opendatahub-io/rhaii-on-xks/blob/main/validation/README.md) - Cluster readiness and post-deployment validation checks - [Monitoring Setup Guide](../monitoring-stack/) - Optional Prometheus/Grafana configuration for dashboards and autoscaling -- [KServe LLMInferenceService Samples](https://github.com/opendatahub-io/kserve/tree/main/docs/samples/llmisvc) -- [Gateway API Documentation](https://gateway-api.sigs.k8s.io/) -- [Istio Documentation](https://istio.io/latest/docs/) +- [KServe LLMInferenceService Samples](https://github.com/red-hat-data-services/kserve/tree/rhoai-3.4/docs/samples/llmisvc) diff --git a/helmfile.yaml.gotmpl b/helmfile.yaml.gotmpl index 7a68b41..7874b90 100644 --- a/helmfile.yaml.gotmpl +++ b/helmfile.yaml.gotmpl @@ -5,6 +5,7 @@ # helmfile apply --selector name=cert-manager-operator # Deploy only cert-manager # helmfile apply --selector name=sail-operator # Deploy only istio # helmfile apply --selector name=lws-operator # Deploy only lws +# helmfile sync --selector name=kserve-rhaii-xks # Deploy only KServe # helmfile destroy # Remove all environments: @@ -37,3 +38,30 @@ helmfiles: values: - useSystemPodmanAuth: {{ .Values.useSystemPodmanAuth | default true }} - pullSecretFile: {{ .Values.pullSecretFile | default "" | quote }} + +--- + +{{ $kserveChart := "oci://ghcr.io/opendatahub-io/kserve-rhaii-xks" }} +{{ $kserveVersion := .Values.kserveChartVersion | default "3.4.0-ea.1-dev-8a30e66" }} +
+releases: + - name: kserve-rhaii-xks + chart: {{ $kserveChart }} + # Use dev variant until official builds are released to registry.redhat.io + version: {{ $kserveVersion }} + namespace: opendatahub + disableValidation: true + # CRDs are applied separately via presync hook to avoid 1MB secret size limit + 
hooks: + # Apply CRDs before helm install via presync + - events: ["presync"] + showlogs: true + command: "sh" + args: + - "-c" + - | + set -e + CHART_DIR=$(mktemp -d) + trap 'rm -rf "$CHART_DIR"' EXIT + helm pull {{ $kserveChart }} --version {{ $kserveVersion }} --untar --untardir "$CHART_DIR" + kubectl apply -f "$CHART_DIR"/kserve-rhaii-xks/crds/ --server-side diff --git a/scripts/cleanup.sh b/scripts/cleanup.sh index 3e3fe51..7b3d3d0 100755 --- a/scripts/cleanup.sh +++ b/scripts/cleanup.sh @@ -91,13 +91,21 @@ done echo "$CRDS" | grep -E "\.cert-manager\.io" | while read -r crd; do kubectl delete "$crd" --ignore-not-found 2>/dev/null || true done -# LWS and cert-manager operator CRDs (exact names to avoid matching other OpenShift operators) -kubectl delete crd certmanagers.operator.openshift.io leaderworkersetoperators.operator.openshift.io --ignore-not-found 2>/dev/null || true +# LWS CRDs +echo "$CRDS" | grep -E "leaderworkerset\.x-k8s\.io" | while read -r crd; do + kubectl delete "$crd" --ignore-not-found 2>/dev/null || true +done +# Operator CRDs (exact names to avoid matching other OpenShift operators) +kubectl delete crd certmanagers.operator.openshift.io leaderworkersetoperators.operator.openshift.io istiocsrs.operator.openshift.io --ignore-not-found 2>/dev/null || true # Gateway API CRDs and Inference Extension CRDs (InferencePool, InferenceModel) # Matches both inference.networking.k8s.io (v1) and inference.networking.x-k8s.io (v1alpha2) echo "$CRDS" | grep -E "gateway\.networking\.k8s\.io|inference\.networking\.k8s\.io|inference\.networking\.x-k8s\.io" | while read -r crd; do kubectl delete "$crd" --ignore-not-found 2>/dev/null || true done +# KServe CRDs +echo "$CRDS" | grep -E "serving\.kserve\.io" | while read -r crd; do + kubectl delete "$crd" --ignore-not-found 2>/dev/null || true +done # Infrastructure stub CRD kubectl delete crd infrastructures.config.openshift.io --ignore-not-found 2>/dev/null || true @@ -107,5 +115,6 @@ kubectl delete namespace 
cert-manager --ignore-not-found --wait=false 2>/dev/nul kubectl delete namespace cert-manager-operator --ignore-not-found --wait=false 2>/dev/null || true kubectl delete namespace istio-system --ignore-not-found --wait=false 2>/dev/null || true kubectl delete namespace openshift-lws-operator --ignore-not-found --wait=false 2>/dev/null || true +kubectl delete namespace opendatahub --ignore-not-found --wait=false 2>/dev/null || true log "=== Cleanup Complete ===" diff --git a/test/conformance/verify-llm-d-deployment.sh b/test/conformance/verify-llm-d-deployment.sh index 72f3bd1..aa7b4dd 100755 --- a/test/conformance/verify-llm-d-deployment.sh +++ b/test/conformance/verify-llm-d-deployment.sh @@ -703,7 +703,7 @@ while [[ $# -gt 0 ]]; do done echo "" echo "Upstream guides: https://github.com/llm-d/llm-d/tree/main/guides" - echo "KServe docs: https://github.com/opendatahub-io/kserve/tree/release-v0.15/docs/samples/llmisvc" + echo "KServe docs: https://github.com/red-hat-data-services/kserve/tree/rhoai-3.4/docs/samples/llmisvc" exit 0 ;; --help|-h) @@ -745,7 +745,7 @@ Profiles: Documentation: Upstream: https://github.com/llm-d/llm-d/tree/main/guides - KServe: https://github.com/opendatahub-io/kserve/tree/release-v0.15/docs/samples/llmisvc + KServe: https://github.com/red-hat-data-services/kserve/tree/rhoai-3.4/docs/samples/llmisvc EOF exit 0 ;; diff --git a/values.yaml b/values.yaml index f233dec..c051b52 100644 --- a/values.yaml +++ b/values.yaml @@ -26,3 +26,10 @@ sailOperator: # lws-operator (optional - for multi-node workloads) lwsOperator: enabled: true + +# ============================================================================= +# KServe +# ============================================================================= + +# KServe OCI chart version (dev variant until official builds are on registry.redhat.io) +kserveChartVersion: "3.4.0-ea.1-dev-8a30e66"