Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 68 additions & 28 deletions scripts/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,20 @@
# LOG_LEVEL Logging verbosity (DEBUG, INFO, WARN, ERROR)
# KUSTOMIZE_FORCE_CONFLICTS When true, use --force-conflicts on kubectl apply in kustomize mode
#
# TIMEOUT CONFIGURATION (all in seconds, see deployment-helpers.sh for defaults):
# CUSTOM_RESOURCE_TIMEOUT DataScienceCluster wait (default: 600)
# NAMESPACE_TIMEOUT Namespace creation/ready (default: 300)
# RESOURCE_TIMEOUT Generic resource wait (default: 300)
# CRD_TIMEOUT CRD establishment (default: 180)
# CSV_TIMEOUT CSV installation (default: 180)
# SUBSCRIPTION_TIMEOUT Subscription install (default: 300)
# POD_TIMEOUT Pod ready wait (default: 120)
# WEBHOOK_TIMEOUT Webhook ready (default: 60)
# CUSTOM_CHECK_TIMEOUT Generic check (default: 120)
# AUTHORINO_TIMEOUT Authorino ready (default: 120)
# ROLLOUT_TIMEOUT kubectl rollout status (default: 120)
# CATALOGSOURCE_TIMEOUT CatalogSource ready (default: 120)
#
# EXAMPLES:
# # Deploy ODH (default, uses kuadrant policy engine)
# ./scripts/deploy.sh
Expand Down Expand Up @@ -182,6 +196,16 @@ ENVIRONMENT VARIABLES:
LOG_LEVEL Logging verbosity (DEBUG, INFO, WARN, ERROR)
KUSTOMIZE_FORCE_CONFLICTS When true, pass --force-conflicts to kubectl apply in kustomize mode (default: false)

TIMEOUT CONFIGURATION (all values in seconds):
Customize timeouts for slow clusters or CI/CD environments:
- CUSTOM_RESOURCE_TIMEOUT=600 DataScienceCluster wait
- NAMESPACE_TIMEOUT=300 Namespace creation
- CRD_TIMEOUT=180 CRD establishment
- CSV_TIMEOUT=180 Operator CSV installation
- ROLLOUT_TIMEOUT=120 Deployment rollout
- AUTHORINO_TIMEOUT=120 Authorino ready
See deployment-helpers.sh for complete list and defaults

EXAMPLES:
# Deploy ODH (default, uses kuadrant policy engine)
./scripts/deploy.sh
Expand Down Expand Up @@ -505,8 +529,8 @@ main() {
fi

log_info " Waiting for maas-controller to be ready..."
if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout=120s; then
log_error "maas-controller deployment not ready"
if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then
log_error "maas-controller deployment not ready (timeout: ${ROLLOUT_TIMEOUT}s)"
return 1
fi

Expand All @@ -523,8 +547,8 @@ main() {
log_info " Non-standard cluster audience detected: $cluster_aud"
log_info " Patching maas-controller with correct CLUSTER_AUDIENCE..."
kubectl set env deployment/maas-controller -n "$NAMESPACE" CLUSTER_AUDIENCE="$cluster_aud"
if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout=120s; then
log_warn "maas-controller rollout after audience patch did not complete in time"
if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then
log_warn "maas-controller rollout after audience patch did not complete in time (timeout: ${ROLLOUT_TIMEOUT}s)"
fi
fi
fi
Expand Down Expand Up @@ -611,7 +635,12 @@ deploy_via_kustomize() {

if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
log_info "Creating namespace: $NAMESPACE"
kubectl create namespace "$NAMESPACE"
if ! kubectl create namespace "$NAMESPACE"; then
log_error "Failed to create namespace $NAMESPACE"
return 1
fi
else
log_debug "Namespace $NAMESPACE already exists"
fi

# Note: The subscription namespace (default: models-as-a-service) is automatically
Expand Down Expand Up @@ -819,8 +848,8 @@ patch_kuadrant_csv_for_gateway() {
# Wait for the new pod to be ready
log_info "Waiting for operator pod to restart..."
sleep 5
kubectl rollout status deployment/"$operator_deployment" -n "$namespace" --timeout=120s 2>/dev/null || \
log_warn "Operator rollout status check timed out"
kubectl rollout status deployment/"$operator_deployment" -n "$namespace" --timeout="${ROLLOUT_TIMEOUT}s" 2>/dev/null || \
log_warn "Operator rollout status check timed out (timeout: ${ROLLOUT_TIMEOUT}s)"

# Verify the env var is in the RUNNING pod
local pod_env
Expand Down Expand Up @@ -1060,8 +1089,8 @@ apply_custom_resources() {
# The operator creates CRDs when its CSV becomes active, but there can be a delay.
# Both CRDs are installed together, so waiting for DataScienceCluster is sufficient.
log_info "Waiting for operator CRDs to be established..."
wait_for_crd "datascienceclusters.datasciencecluster.opendatahub.io" 180 || {
log_error "DataScienceCluster CRD not available - operator may not have installed correctly"
wait_for_crd "datascienceclusters.datasciencecluster.opendatahub.io" "$CRD_TIMEOUT" || {
log_error "DataScienceCluster CRD not available - operator may not have installed correctly (timeout: ${CRD_TIMEOUT}s)"
return 1
}

Expand All @@ -1084,15 +1113,15 @@ apply_custom_resources() {
fi

# Wait for webhook deployment to exist and be ready (ensures service + endpoints are ready)
wait_for_resource "deployment" "$webhook_deployment" "$webhook_namespace" 120 || {
log_warn "Webhook deployment not found after 120s, proceeding anyway..."
wait_for_resource "deployment" "$webhook_deployment" "$webhook_namespace" "$ROLLOUT_TIMEOUT" || {
log_warn "Webhook deployment not found after ${ROLLOUT_TIMEOUT}s, proceeding anyway..."
}

# Wait for deployment to be fully ready (replicas available)
if kubectl get deployment "$webhook_deployment" -n "$webhook_namespace" >/dev/null 2>&1; then
kubectl wait --for=condition=Available --timeout=120s \
kubectl wait --for=condition=Available --timeout="${ROLLOUT_TIMEOUT}s" \
deployment/"$webhook_deployment" -n "$webhook_namespace" 2>/dev/null || {
log_warn "Webhook deployment not fully ready, proceeding anyway..."
log_warn "Webhook deployment not fully ready after ${ROLLOUT_TIMEOUT}s, proceeding anyway..."
}
fi

Expand Down Expand Up @@ -1225,7 +1254,12 @@ setup_gateway_api() {

# Create GatewayClass for OpenShift Gateway API controller
# This enables the built-in Gateway API implementation (OpenShift 4.14+)
kubectl apply -f "${data_dir}/gatewayclass.yaml"
if kubectl get gatewayclass openshift-default &>/dev/null; then
log_debug "GatewayClass openshift-default already exists, skipping creation"
else
log_info "Creating GatewayClass openshift-default..."
kubectl apply -f "${data_dir}/gatewayclass.yaml"
fi
}

# setup_maas_gateway
Expand Down Expand Up @@ -1313,8 +1347,13 @@ setup_maas_gateway() {

# Create the Gateway resource using the kustomize manifest
# This includes both HTTP and HTTPS listeners, required annotations and labels
log_info "Creating maas-default-gateway resource (allowing routes from all namespaces)..."

if kubectl get gateway maas-default-gateway -n openshift-ingress &>/dev/null; then
log_info "Gateway maas-default-gateway already exists in openshift-ingress"
log_debug " Updating Gateway configuration if needed..."
else
log_info "Creating maas-default-gateway resource (allowing routes from all namespaces)..."
fi

local maas_networking_dir="${SCRIPT_DIR}/../deployment/base/networking/maas"
if [[ -d "$maas_networking_dir" ]]; then
# Use local kustomize manifest with envsubst for variable substitution
Expand Down Expand Up @@ -1346,38 +1385,39 @@ apply_kuadrant_cr() {
# Wait for Gateway to be Programmed (required before Kuadrant can become ready)
# This ensures Service Mesh is installed and Gateway API provider is operational
log_info "Waiting for Gateway to be Programmed (Service Mesh initialization)..."
if ! kubectl wait --for=condition=Programmed gateway/maas-default-gateway -n openshift-ingress --timeout=120s 2>/dev/null; then
log_warn "Gateway not yet Programmed after 120s - Kuadrant may take longer to become ready"
if ! kubectl wait --for=condition=Programmed gateway/maas-default-gateway -n openshift-ingress --timeout="${CUSTOM_CHECK_TIMEOUT}s" 2>/dev/null; then
log_warn "Gateway not yet Programmed after ${CUSTOM_CHECK_TIMEOUT}s - Kuadrant may take longer to become ready"
fi

log_info "Applying Kuadrant custom resource in $namespace..."

local data_dir="${SCRIPT_DIR}/data"
kubectl apply -f "${data_dir}/kuadrant.yaml" -n "$namespace"

# Wait for Kuadrant to be ready (initial attempt - 60s)
# Wait for Kuadrant to be ready (initial attempt - half of CUSTOM_CHECK_TIMEOUT)
# If it fails with MissingDependency, restart the operator and retry
log_info "Waiting for Kuadrant to become ready (initial check)..."
local kuadrant_initial_timeout=$((CUSTOM_CHECK_TIMEOUT / 2)) # Use half of standard timeout for initial check
if ! wait_for_custom_check "Kuadrant ready in $namespace" \
"kubectl get kuadrant kuadrant -n $namespace -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' 2>/dev/null | grep -q True" \
60 \
"$kuadrant_initial_timeout" \
5; then

# Check if it's a MissingDependency issue
local kuadrant_reason
kuadrant_reason=$(kubectl get kuadrant kuadrant -n "$namespace" -o jsonpath='{.status.conditions[?(@.type=="Ready")].reason}' 2>/dev/null || echo "")

if [[ "$kuadrant_reason" == "MissingDependency" ]]; then
log_info "Kuadrant shows MissingDependency - restarting operator to re-register Gateway controller..."
kubectl delete pod -n "$namespace" -l control-plane=controller-manager --force --grace-period=0 2>/dev/null || true
sleep 15

# Retry waiting for Kuadrant
log_info "Retrying Kuadrant readiness check after operator restart..."
wait_for_custom_check "Kuadrant ready in $namespace" \
"kubectl get kuadrant kuadrant -n $namespace -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' 2>/dev/null | grep -q True" \
120 \
5 || log_warn "Kuadrant not ready yet - AuthPolicy enforcement may fail on model HTTPRoutes"
"$CUSTOM_CHECK_TIMEOUT" \
5 || log_warn "Kuadrant not ready yet (timeout: ${CUSTOM_CHECK_TIMEOUT}s) - AuthPolicy enforcement may fail on model HTTPRoutes"
else
log_warn "Kuadrant not ready (reason: $kuadrant_reason) - AuthPolicy enforcement may fail"
fi
Expand Down Expand Up @@ -1682,8 +1722,8 @@ configure_tls_backend() {

# Wait for Authorino deployment to be created by Kuadrant operator
# This is necessary because Kuadrant may not be fully ready yet (timing issue)
wait_for_resource "deployment" "authorino" "$authorino_namespace" 180 || {
log_warn "Authorino deployment not found, TLS configuration may fail"
wait_for_resource "deployment" "authorino" "$authorino_namespace" "$RESOURCE_TIMEOUT" || {
log_warn "Authorino deployment not found after ${RESOURCE_TIMEOUT}s, TLS configuration may fail"
}

# Call TLS configuration script
Expand Down Expand Up @@ -1719,7 +1759,7 @@ configure_tls_backend() {

# Wait for Authorino to be ready after restart
log_info "Waiting for Authorino deployment to be ready..."
kubectl rollout status deployment/authorino -n "$authorino_namespace" --timeout=120s 2>/dev/null || log_warn "Authorino rollout status check timed out"
kubectl rollout status deployment/authorino -n "$authorino_namespace" --timeout="${ROLLOUT_TIMEOUT}s" 2>/dev/null || log_warn "Authorino rollout status check timed out (timeout: ${ROLLOUT_TIMEOUT}s)"

log_info "TLS backend configuration complete"
}
Expand Down
Loading
Loading