forked from opendatahub-io/models-as-a-service
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinstall-odh.sh
More file actions
executable file
·242 lines (216 loc) · 9.07 KB
/
install-odh.sh
File metadata and controls
executable file
·242 lines (216 loc) · 9.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env bash
# Install OpenDataHub (ODH) operator and apply DataScienceCluster (KServe).
# MaaS is deployed separately via deploy.sh --deployment-mode kustomize.
#
# Prerequisites: cert-manager and LWS operators (run install-cert-manager-and-lws.sh first).
#
# Environment variables:
# OPERATOR_CATALOG - Custom catalog image (optional). When unset, uses community-operators.
# Set to e.g. quay.io/opendatahub/opendatahub-operator-catalog:latest for custom builds.
# OPERATOR_CHANNEL - Subscription channel (default: fast-3)
# OPERATOR_STARTING_CSV - Pin Subscription startingCSV (default: opendatahub-operator.v3.4.0-ea.1). Set to "-" to omit.
# OPERATOR_INSTALL_PLAN_APPROVAL - Manual (default) or Automatic; use "-" to omit.
# Manual: blocks auto-upgrades; this script auto-approves only the first InstallPlan so install does not stall.
# OPERATOR_IMAGE - Custom operator image to patch into CSV (optional)
# OPERATOR_OPERANDS_MAP - Path to operands-map.yaml for RELATED_IMAGE env var injection (optional)
# Used with OPERATOR_IMAGE to ensure component images match the operator.
#
# Usage: ./install-odh.sh
set -euo pipefail
# Resolve repo-relative paths from this script's own location so the script
# works regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Directory holding the DataScienceCluster manifest applied in step 7.
DATA_DIR="${REPO_ROOT}/scripts/data"
# Target namespace for the operator; doubles as the DSCI applicationsNamespace.
NAMESPACE="${OPERATOR_NAMESPACE:-opendatahub}"
# Normalize optional overrides to empty strings so `set -u` does not trip
# when they are referenced later.
OPERATOR_CATALOG="${OPERATOR_CATALOG:-}"
OPERATOR_CHANNEL="${OPERATOR_CHANNEL:-}"
OPERATOR_STARTING_CSV="${OPERATOR_STARTING_CSV:-}"
OPERATOR_INSTALL_PLAN_APPROVAL="${OPERATOR_INSTALL_PLAN_APPROVAL:-}"
OPERATOR_IMAGE="${OPERATOR_IMAGE:-}"
# Source deployment helpers — provides log_info/log_warn/log_error,
# create_custom_catalogsource, install_olm_operator and the wait_* helpers
# used below.
source "$REPO_ROOT/scripts/deployment-helpers.sh"
#######################################
# Patch the installed operator's ClusterServiceVersion (CSV) so its deployment
# runs OPERATOR_IMAGE instead of the catalog-shipped image, and optionally
# inject RELATED_IMAGE_* env vars from an operands map.
# Globals:
#   OPERATOR_IMAGE        (read) custom image; no-op when empty
#   OPERATOR_OPERANDS_MAP (read, optional) path to operands-map.yaml
# Arguments:
#   $1 - CSV name prefix to search for (e.g. "opendatahub-operator")
#   $2 - namespace the CSV lives in
# Returns:
#   0 always for the "CSV not found" case (patching is best-effort);
#   non-zero only if a kubectl patch itself fails.
#######################################
patch_operator_csv_if_needed() {
  [[ -z "$OPERATOR_IMAGE" ]] && return 0
  local operator_prefix="$1"
  local namespace="$2"
  log_info "Patching operator CSV with custom image: $OPERATOR_IMAGE"

  # Poll for the CSV: OLM may take a moment to create it after the
  # Subscription's InstallPlan is approved.
  local csv_name=""
  local timeout=60
  local elapsed=0
  local interval=5
  while (( elapsed < timeout )); do
    # "|| true" keeps a no-match grep (or a transient kubectl error) from
    # killing the script under `set -euo pipefail`; the loop simply retries.
    csv_name=$(kubectl get csv -n "$namespace" --no-headers 2>/dev/null \
      | grep "^${operator_prefix}" | head -n1 | awk '{print $1}' || true)
    [[ -n "$csv_name" ]] && break
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done
  if [[ -z "$csv_name" ]]; then
    log_warn "Could not find CSV for $operator_prefix after ${timeout}s, skipping image patch"
    return 0
  fi

  # Keep the ODH operator from reconciling away our manual CSV edits.
  kubectl annotate csv "$csv_name" -n "$namespace" opendatahub.io/managed=false --overwrite 2>/dev/null || true
  kubectl patch csv "$csv_name" -n "$namespace" --type='json' -p="[
  {\"op\": \"replace\", \"path\": \"/spec/install/spec/deployments/0/spec/template/spec/containers/0/image\", \"value\": \"$OPERATOR_IMAGE\"}
]"
  log_info "CSV $csv_name patched with image $OPERATOR_IMAGE"

  # When using a custom operator image, the community CSV may lack RELATED_IMAGE
  # env vars that the operator needs to deploy the correct component versions.
  # If OPERATOR_OPERANDS_MAP points to a local operands-map.yaml, inject its
  # env vars into the CSV.
  if [[ -n "${OPERATOR_OPERANDS_MAP:-}" && -f "$OPERATOR_OPERANDS_MAP" ]]; then
    log_info "Injecting RELATED_IMAGE env vars from $OPERATOR_OPERANDS_MAP into CSV"
    # Scrape "name: RELATED_IMAGE_*" / "value: ..." pairs out of the map.
    # NOTE(review): assumes each value line immediately follows its name line
    # and that values contain no double quotes — true for image references;
    # a yq-based parse would be more robust.
    local env_patches="["
    local first=true
    local line name value_line value
    while IFS= read -r line; do
      name=$(sed -n 's/.*name: \(RELATED_IMAGE_[^ ]*\)/\1/p' <<<"$line")
      if [[ -n "$name" ]]; then
        # "|| true" guards EOF: a trailing name with no value line must not
        # abort the script under `set -e`.
        IFS= read -r value_line || true
        value=$(sed -n 's/.*value: \(.*\)/\1/p' <<<"$value_line")
        if [[ -n "$value" ]]; then
          $first || env_patches+=","
          first=false
          env_patches+="{\"name\":\"$name\",\"value\":\"$value\"}"
        fi
      fi
    done < "$OPERATOR_OPERANDS_MAP"
    if [[ "$env_patches" != "[" ]]; then
      env_patches+="]"
      local container_path="/spec/install/spec/deployments/0/spec/template/spec/containers/0"
      # kubectl JSONPath uses dotted syntax ({.spec...}); the previous
      # slash-style expression was invalid, always fell back to "[]", and the
      # replace patch below then wiped any pre-existing env vars.
      local existing_env
      existing_env=$(kubectl get csv "$csv_name" -n "$namespace" \
        -o jsonpath='{.spec.install.spec.deployments[0].spec.template.spec.containers[0].env}' 2>/dev/null || echo "[]")
      [[ -z "$existing_env" ]] && existing_env="[]"
      # Merge new entries into the existing env list, never overriding an
      # existing name. EXISTING_ENV is passed via the environment — not
      # interpolated into the Python source — so quotes or backslashes in the
      # JSON cannot break or inject into the program.
      local merged_env
      merged_env=$(EXISTING_ENV="$existing_env" python3 -c '
import json, os, sys
existing = json.loads(os.environ.get("EXISTING_ENV") or "[]")
new_envs = json.loads(sys.stdin.read())
existing_names = {e["name"] for e in existing}
for e in new_envs:
    if e["name"] not in existing_names:
        existing.append(e)
print(json.dumps(existing))
' <<< "$env_patches")
      kubectl patch csv "$csv_name" -n "$namespace" --type='json' \
        -p="[{\"op\": \"replace\", \"path\": \"${container_path}/env\", \"value\": ${merged_env}}]"
      log_info "CSV env vars patched with RELATED_IMAGE entries"
    fi
  fi
}
echo "=== Installing OpenDataHub operator ==="
echo ""

# 1. Catalog selection: community-operators by default, or a dedicated
#    CatalogSource when OPERATOR_CATALOG provides a custom index image.
echo "1. Setting up ODH catalog..."
channel="${OPERATOR_CHANNEL:-fast-3}"   # same default either way
if [[ -n "$OPERATOR_CATALOG" ]]; then
  echo " Using custom catalog: $OPERATOR_CATALOG"
  catalog_source="odh-custom-catalog"
  create_custom_catalogsource "$catalog_source" "openshift-marketplace" "$OPERATOR_CATALOG"
else
  echo " Using community-operators"
  catalog_source="community-operators"
fi

# Pin to ODH 3.4 EA1 unless overridden; a literal "-" means "follow the
# channel head" (omit startingCSV entirely).
starting_csv="${OPERATOR_STARTING_CSV:-opendatahub-operator.v3.4.0-ea.1}"
if [[ "$starting_csv" == "-" ]]; then
  starting_csv=""
fi

# Manual approval blocks auto-upgrades; install_olm_operator still approves
# the first InstallPlan programmatically so the install does not stall.
# A literal "-" omits the field.
plan_approval="${OPERATOR_INSTALL_PLAN_APPROVAL:-Manual}"
if [[ "$plan_approval" == "-" ]]; then
  plan_approval=""
fi
# 2. Install the ODH operator through OLM.
echo "2. Installing ODH operator..."
install_olm_operator \
  "opendatahub-operator" \
  "$NAMESPACE" \
  "$catalog_source" \
  "$channel" \
  "$starting_csv" \
  "AllNamespaces" \
  "openshift-marketplace" \
  "$plan_approval" || {
  log_error "ODH operator installation failed"
  exit 1
}

# 3. Optionally swap the catalog image for a custom operator build.
if [[ -z "$OPERATOR_IMAGE" ]]; then
  echo "3. Skipping operator image patch (OPERATOR_IMAGE not set)"
else
  echo "3. Patching operator image..."
  patch_operator_csv_if_needed "opendatahub-operator" "$NAMESPACE"
fi
# 4. The DataScienceCluster CRD must exist before step 7 can apply one.
echo "4. Waiting for operator CRDs..."
if ! wait_for_crd "datascienceclusters.datasciencecluster.opendatahub.io" 180; then
  log_error "DataScienceCluster CRD not available - operator may not have installed correctly"
  exit 1
fi

# 5. Give the operator's admission webhook a chance to come up. Timeouts here
#    only warn: the DSCInitialization apply below retries on its own.
echo "5. Waiting for operator webhook..."
if ! wait_for_resource "deployment" "opendatahub-operator-controller-manager" "$NAMESPACE" 120; then
  log_warn "Webhook deployment not found after 120s, proceeding anyway..."
fi
if kubectl get deployment opendatahub-operator-controller-manager -n "$NAMESPACE" &>/dev/null; then
  if ! kubectl wait --for=condition=Available --timeout=120s \
    deployment/opendatahub-operator-controller-manager -n "$NAMESPACE" 2>/dev/null; then
    log_warn "Webhook deployment not fully ready, proceeding anyway..."
  fi
fi
# 6. Create the DSCInitialization, retrying while the operator webhook is
#    still warming up (apply fails until its endpoints are reachable).
echo "6. Applying DSCInitialization..."
if kubectl get dscinitializations default-dsci &>/dev/null; then
  echo " DSCInitialization already exists, skipping"
else
  dsci_applied=false
  attempt=1
  while (( attempt <= 5 )); do
    if kubectl apply -f - <<EOF
apiVersion: dscinitialization.opendatahub.io/v1
kind: DSCInitialization
metadata:
  name: default-dsci
spec:
  applicationsNamespace: ${NAMESPACE}
  monitoring:
    managementState: Managed
    namespace: ${NAMESPACE}-monitoring
    metrics: {}
  trustedCABundle:
    managementState: Managed
EOF
    then
      dsci_applied=true
      break
    fi
    echo " Attempt $attempt/5 failed (webhook may not be ready), retrying in 15s..."
    sleep 15
    attempt=$((attempt + 1))
  done
  if [[ "$dsci_applied" != "true" ]]; then
    log_error "Failed to apply DSCInitialization after 5 attempts"
    exit 1
  fi
fi
# 7. Create the DataScienceCluster (KServe + ModelsAsService Managed).
#    The manifest filename retains "unmanaged" for backward compat; its
#    contents include modelsAsService.managementState: Managed so the operator
#    deploys maas-controller.
echo "7. Applying DataScienceCluster..."
if kubectl get datasciencecluster -A --no-headers 2>/dev/null | grep -q .; then
  echo " DataScienceCluster already exists, skipping"
else
  kubectl apply --server-side=true -f "${DATA_DIR}/datasciencecluster-unmanaged.yaml"
fi

# 8. Block until the DataScienceCluster (KServe) reports ready.
echo "8. Waiting for DataScienceCluster (KServe)..."
if ! wait_datasciencecluster_ready "default-dsc" 600; then
  log_error "DataScienceCluster did not become ready"
  exit 1
fi
# 9. odh-model-controller registers a ConfigMap validating webhook; if we
#    proceed before its pods are ready, any ConfigMap create/update fails
#    with "no endpoints available". A timeout only warns.
echo "9. Waiting for odh-model-controller webhook..."
if ! wait_for_validating_webhooks "$NAMESPACE" 180; then
  log_warn "Validating webhooks in $NAMESPACE not ready after 180s, proceeding anyway..."
fi

echo ""
echo "=== ODH installation complete ==="
echo ""
echo "Verify:"
echo " kubectl get datasciencecluster -A"
echo " kubectl get pods -n opendatahub"
echo " kubectl get pods -n kserve"