diff --git a/.tekton/odh-maas-api-pull-request.yaml b/.tekton/odh-maas-api-pull-request.yaml index 82fb43cf3..ae1ab35e3 100644 --- a/.tekton/odh-maas-api-pull-request.yaml +++ b/.tekton/odh-maas-api-pull-request.yaml @@ -9,7 +9,9 @@ metadata: pipelinesascode.tekton.dev/cancel-in-progress: "false" pipelinesascode.tekton.dev/max-keep-runs: "3" pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch - == "main" && !files.all.all(x, x.matches('^docs/') || x.matches('\\.md$')) + == "main" && !files.all.all(x, x.matches('^docs/') || x.matches('\\.md$') + || x.matches('^(.*/)?(\\.gitignore|OWNERS|PROJECT|LICENSE)$') + || x.matches('^\\.github/')) creationTimestamp: null labels: appstudio.openshift.io/application: opendatahub-builds diff --git a/.tekton/odh-maas-controller-pull-request.yaml b/.tekton/odh-maas-controller-pull-request.yaml index 53b7ab77a..0cff42ffa 100644 --- a/.tekton/odh-maas-controller-pull-request.yaml +++ b/.tekton/odh-maas-controller-pull-request.yaml @@ -9,7 +9,9 @@ metadata: pipelinesascode.tekton.dev/cancel-in-progress: "false" pipelinesascode.tekton.dev/max-keep-runs: "3" pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch - == "main" && !files.all.all(x, x.matches('^docs/') || x.matches('\\.md$')) + == "main" && !files.all.all(x, x.matches('^docs/') || x.matches('\\.md$') + || x.matches('^(.*/)?(\\.gitignore|OWNERS|PROJECT|LICENSE)$') + || x.matches('^\\.github/')) creationTimestamp: null labels: appstudio.openshift.io/application: opendatahub-builds diff --git a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml index d2839ff34..a0c3a98ef 100644 --- a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml +++ b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml @@ -76,10 +76,6 @@ spec: maxLength: 63 minLength: 1 type: string - tokenRateLimitRef: - description: TokenRateLimitRef references an existing TokenRateLimit - resource - type: string tokenRateLimits: description: TokenRateLimits defines token-based rate limits for this model @@ -89,6 +85,7 @@ spec: limit: description: Limit is the maximum number of tokens allowed format: int64 + minimum: 1 type: integer window: description: Window is the time window (e.g., "1m", "1h", @@ -99,10 +96,12 @@ spec: - limit - window type: object + minItems: 1 type: array required: - name - namespace + - tokenRateLimits type: object minItems: 1 type: array diff --git a/docs/content/configuration-and-management/maas-controller-overview.md b/docs/content/configuration-and-management/maas-controller-overview.md index 7e556c18a..c2f03a88c 100644 --- a/docs/content/configuration-and-management/maas-controller-overview.md +++ b/docs/content/configuration-and-management/maas-controller-overview.md @@ -192,7 +192,7 @@ erDiagram - **MaaSModelRef**: `spec.modelRef.kind` = LLMInferenceService or ExternalModel; `spec.modelRef.name` = name of the referenced model resource. - **MaaSAuthPolicy**: `spec.modelRefs` (list of ModelRef objects with name and namespace), `spec.subjects` (groups, users). -- **MaaSSubscription**: `spec.owner` (groups, users), `spec.modelRefs` (list of ModelSubscriptionRef objects with name, namespace, and either `tokenRateLimits` array or `tokenRateLimitRef` reference to define per-model rate limits). +- **MaaSSubscription**: `spec.owner` (groups, users), `spec.modelRefs` (list of ModelSubscriptionRef objects with name, namespace, and required `tokenRateLimits` array to define per-model rate limits). --- diff --git a/docs/content/configuration-and-management/model-listing-flow.md b/docs/content/configuration-and-management/model-listing-flow.md index 18485067a..f898ade3e 100644 --- a/docs/content/configuration-and-management/model-listing-flow.md +++ b/docs/content/configuration-and-management/model-listing-flow.md @@ -23,7 +23,7 @@ When the [MaaS controller](https://github.com/opendatahub-io/models-as-a-service 2. For each MaaSModelRef, it reads **id** (`metadata.name`), **url** (`status.endpoint`), **ready** (`status.phase == "Ready"`), and related metadata. The controller has populated `status.endpoint` and `status.phase` from the underlying LLMInferenceService (for llmisvc) or HTTPRoute/Gateway. -3. **Access validation**: The API probes each model’s `/v1/models` endpoint with the **exact Authorization header** the client sent (passed through as-is). Only models that return **2xx**, **3xx** or **405** are included in the response. This ensures the list only shows models the client is authorized to use. +3. **Access validation**: The API probes each model’s `/v1/models` endpoint with the **exact Authorization header** the client sent (passed through as-is). Only models that return **2xx** or **405** are included in the response. This ensures the list only shows models the client is authorized to use. 4. For each model, the API reads **annotations** from the MaaSModelRef to populate `modelDetails` in the response (display name, description, use case, context window). See [CRD annotations](crd-annotations.md) for the full list. diff --git a/docs/content/reference/crds/maas-subscription.md b/docs/content/reference/crds/maas-subscription.md index 775311b75..2dd919174 100644 --- a/docs/content/reference/crds/maas-subscription.md +++ b/docs/content/reference/crds/maas-subscription.md @@ -24,8 +24,7 @@ Defines a subscription plan with per-model token rate limits. Creates Kuadrant T |-------|------|----------|-------------| | name | string | Yes | Name of the MaaSModelRef | | namespace | string | Yes | Namespace where the MaaSModelRef lives | -| tokenRateLimits | []TokenRateLimit | No | Token-based rate limits for this model | -| tokenRateLimitRef | string | No | Reference to an existing TokenRateLimit resource | +| tokenRateLimits | []TokenRateLimit | Yes | Token-based rate limits for this model (at least one required) | | billingRate | BillingRate | No | Cost per token | ## TokenRateLimit diff --git a/docs/content/release-notes/index.md b/docs/content/release-notes/index.md index b608aeacb..940f5ae92 100644 --- a/docs/content/release-notes/index.md +++ b/docs/content/release-notes/index.md @@ -1,5 +1,15 @@ # Release Notes +## v3.4.0 + +### Major Changes + +Version 3.4.0 introduces new CRDs and API resources that are not compatible with previous versions. All MaaS custom resources (`MaaSModelRef`, `MaaSAuthPolicy`, `MaaSSubscription`) are new in this release. + +**Migration:** See the overall migration plan for detailed upgrade instructions from previous versions. + +--- + ## v0.1.0 *Initial release.* diff --git a/maas-controller/api/maas/v1alpha1/maassubscription_types.go b/maas-controller/api/maas/v1alpha1/maassubscription_types.go index a2d3bbf16..b6f81a678 100644 --- a/maas-controller/api/maas/v1alpha1/maassubscription_types.go +++ b/maas-controller/api/maas/v1alpha1/maassubscription_types.go @@ -64,12 +64,8 @@ type ModelSubscriptionRef struct { Namespace string `json:"namespace"` // TokenRateLimits defines token-based rate limits for this model - // +optional - TokenRateLimits []TokenRateLimit `json:"tokenRateLimits,omitempty"` - - // TokenRateLimitRef references an existing TokenRateLimit resource - // +optional - TokenRateLimitRef *string `json:"tokenRateLimitRef,omitempty"` + // +kubebuilder:validation:MinItems=1 + TokenRateLimits []TokenRateLimit `json:"tokenRateLimits"` // BillingRate defines the cost per token // +optional @@ -79,6 +75,7 @@ type ModelSubscriptionRef struct { // TokenRateLimit defines a token rate limit type TokenRateLimit struct { // Limit is the maximum number of tokens allowed + // +kubebuilder:validation:Minimum=1 Limit int64 `json:"limit"` // Window is the time window (e.g., "1m", "1h", "24h") diff --git a/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go b/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go index 683a6ae68..578e4a28c 100644 --- a/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go +++ b/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go @@ -549,11 +549,6 @@ func (in *ModelSubscriptionRef) DeepCopyInto(out *ModelSubscriptionRef) { *out = make([]TokenRateLimit, len(*in)) copy(*out, *in) } - if in.TokenRateLimitRef != nil { - in, out := &in.TokenRateLimitRef, &out.TokenRateLimitRef - *out = new(string) - **out = **in - } if in.BillingRate != nil { in, out := &in.BillingRate, &out.BillingRate *out = new(BillingRate) diff --git a/maas-controller/pkg/controller/maas/maassubscription_controller_test.go b/maas-controller/pkg/controller/maas/maassubscription_controller_test.go index 7992a4d8a..89ec070f7 100644 --- a/maas-controller/pkg/controller/maas/maassubscription_controller_test.go +++ b/maas-controller/pkg/controller/maas/maassubscription_controller_test.go @@ -397,6 +397,7 @@ func TestMaaSSubscriptionReconciler_RemoveModelRef(t *testing.T) { WithRESTMapper(testRESTMapper()). WithObjects(modelRefA, modelRefB, routeA, routeB, sub). WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). Build() r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} @@ -492,6 +493,7 @@ func TestMaaSSubscriptionReconciler_RemoveModelRef_Aggregation(t *testing.T) { WithRESTMapper(testRESTMapper()). WithObjects(modelRefA, modelRefB, routeA, routeB, sub1, sub2). WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). Build() r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} @@ -717,6 +719,7 @@ func TestMaaSSubscriptionReconciler_SimplifiedTRLP(t *testing.T) { WithRESTMapper(testRESTMapper()). WithObjects(model, route, maasSub). WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). Build() r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} @@ -810,6 +813,7 @@ func TestMaaSSubscriptionReconciler_MultipleSubscriptionsSimplified(t *testing.T WithRESTMapper(testRESTMapper()). WithObjects(model, route, subA, subB). WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). Build() r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} diff --git a/scripts/deploy.sh b/scripts/deploy.sh index f871f4f2f..1aee4e1bd 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -33,6 +33,20 @@ # LOG_LEVEL Logging verbosity (DEBUG, INFO, WARN, ERROR) # KUSTOMIZE_FORCE_CONFLICTS When true, use --force-conflicts on kubectl apply in kustomize mode # +# TIMEOUT CONFIGURATION (all in seconds, see deployment-helpers.sh for defaults): +# CUSTOM_RESOURCE_TIMEOUT DataScienceCluster wait (default: 600) +# NAMESPACE_TIMEOUT Namespace creation/ready (default: 300) +# RESOURCE_TIMEOUT Generic resource wait (default: 300) +# CRD_TIMEOUT CRD establishment (default: 180) +# CSV_TIMEOUT CSV installation (default: 180) +# SUBSCRIPTION_TIMEOUT Subscription install (default: 300) +# POD_TIMEOUT Pod ready wait (default: 120) +# WEBHOOK_TIMEOUT Webhook ready (default: 60) +# CUSTOM_CHECK_TIMEOUT Generic check (default: 120) +# AUTHORINO_TIMEOUT Authorino ready (default: 120) +# ROLLOUT_TIMEOUT kubectl rollout status (default: 120) +# CATALOGSOURCE_TIMEOUT CatalogSource ready (default: 120) +# # EXAMPLES: # # Deploy ODH (default, uses kuadrant policy engine) # ./scripts/deploy.sh @@ -182,6 +196,16 @@ ENVIRONMENT VARIABLES: LOG_LEVEL Logging verbosity (DEBUG, INFO, WARN, ERROR) KUSTOMIZE_FORCE_CONFLICTS When true, pass --force-conflicts to kubectl apply in kustomize mode (default: false) +TIMEOUT CONFIGURATION (all values in seconds): + Customize timeouts for slow clusters or CI/CD environments: + - CUSTOM_RESOURCE_TIMEOUT=600 DataScienceCluster wait + - NAMESPACE_TIMEOUT=300 Namespace creation + - CRD_TIMEOUT=180 CRD establishment + - CSV_TIMEOUT=180 Operator CSV installation + - ROLLOUT_TIMEOUT=120 Deployment rollout + - AUTHORINO_TIMEOUT=120 Authorino ready + See deployment-helpers.sh for complete list and defaults + EXAMPLES: # Deploy ODH (default, uses kuadrant policy engine) ./scripts/deploy.sh @@ -505,8 +529,8 @@ main() { fi log_info " Waiting for maas-controller to be ready..." - if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout=120s; then - log_error "maas-controller deployment not ready" + if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then + log_error "maas-controller deployment not ready (timeout: ${ROLLOUT_TIMEOUT}s)" return 1 fi @@ -523,8 +547,8 @@ main() { log_info " Non-standard cluster audience detected: $cluster_aud" log_info " Patching maas-controller with correct CLUSTER_AUDIENCE..." kubectl set env deployment/maas-controller -n "$NAMESPACE" CLUSTER_AUDIENCE="$cluster_aud" - if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout=120s; then - log_warn "maas-controller rollout after audience patch did not complete in time" + if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then + log_warn "maas-controller rollout after audience patch did not complete in time (timeout: ${ROLLOUT_TIMEOUT}s)" fi fi fi @@ -611,7 +635,12 @@ deploy_via_kustomize() { if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then log_info "Creating namespace: $NAMESPACE" - kubectl create namespace "$NAMESPACE" + if ! kubectl create namespace "$NAMESPACE"; then + log_error "Failed to create namespace $NAMESPACE" + return 1 + fi + else + log_debug "Namespace $NAMESPACE already exists" fi # Note: The subscription namespace (default: models-as-a-service) is automatically @@ -819,8 +848,8 @@ patch_kuadrant_csv_for_gateway() { # Wait for the new pod to be ready log_info "Waiting for operator pod to restart..." sleep 5 - kubectl rollout status deployment/"$operator_deployment" -n "$namespace" --timeout=120s 2>/dev/null || \ - log_warn "Operator rollout status check timed out" + kubectl rollout status deployment/"$operator_deployment" -n "$namespace" --timeout="${ROLLOUT_TIMEOUT}s" 2>/dev/null || \ + log_warn "Operator rollout status check timed out (timeout: ${ROLLOUT_TIMEOUT}s)" # Verify the env var is in the RUNNING pod local pod_env @@ -1060,8 +1089,8 @@ apply_custom_resources() { # The operator creates CRDs when its CSV becomes active, but there can be a delay. # Both CRDs are installed together, so waiting for DataScienceCluster is sufficient. log_info "Waiting for operator CRDs to be established..." - wait_for_crd "datascienceclusters.datasciencecluster.opendatahub.io" 180 || { - log_error "DataScienceCluster CRD not available - operator may not have installed correctly" + wait_for_crd "datascienceclusters.datasciencecluster.opendatahub.io" "$CRD_TIMEOUT" || { + log_error "DataScienceCluster CRD not available - operator may not have installed correctly (timeout: ${CRD_TIMEOUT}s)" return 1 } @@ -1084,15 +1113,15 @@ apply_custom_resources() { fi # Wait for webhook deployment to exist and be ready (ensures service + endpoints are ready) - wait_for_resource "deployment" "$webhook_deployment" "$webhook_namespace" 120 || { - log_warn "Webhook deployment not found after 120s, proceeding anyway..." + wait_for_resource "deployment" "$webhook_deployment" "$webhook_namespace" "$ROLLOUT_TIMEOUT" || { + log_warn "Webhook deployment not found after ${ROLLOUT_TIMEOUT}s, proceeding anyway..." } # Wait for deployment to be fully ready (replicas available) if kubectl get deployment "$webhook_deployment" -n "$webhook_namespace" >/dev/null 2>&1; then - kubectl wait --for=condition=Available --timeout=120s \ + kubectl wait --for=condition=Available --timeout="${ROLLOUT_TIMEOUT}s" \ deployment/"$webhook_deployment" -n "$webhook_namespace" 2>/dev/null || { - log_warn "Webhook deployment not fully ready, proceeding anyway..." + log_warn "Webhook deployment not fully ready after ${ROLLOUT_TIMEOUT}s, proceeding anyway..." } fi @@ -1225,7 +1254,12 @@ setup_gateway_api() { # Create GatewayClass for OpenShift Gateway API controller # This enables the built-in Gateway API implementation (OpenShift 4.14+) - kubectl apply -f "${data_dir}/gatewayclass.yaml" + if kubectl get gatewayclass openshift-default &>/dev/null; then + log_debug "GatewayClass openshift-default already exists, skipping creation" + else + log_info "Creating GatewayClass openshift-default..." + kubectl apply -f "${data_dir}/gatewayclass.yaml" + fi } # setup_maas_gateway @@ -1313,8 +1347,13 @@ setup_maas_gateway() { # Create the Gateway resource using the kustomize manifest # This includes both HTTP and HTTPS listeners, required annotations and labels - log_info "Creating maas-default-gateway resource (allowing routes from all namespaces)..." - + if kubectl get gateway maas-default-gateway -n openshift-ingress &>/dev/null; then + log_info "Gateway maas-default-gateway already exists in openshift-ingress" + log_debug " Updating Gateway configuration if needed..." + else + log_info "Creating maas-default-gateway resource (allowing routes from all namespaces)..." + fi + local maas_networking_dir="${SCRIPT_DIR}/../deployment/base/networking/maas" if [[ -d "$maas_networking_dir" ]]; then # Use local kustomize manifest with envsubst for variable substitution @@ -1346,8 +1385,8 @@ apply_kuadrant_cr() { # Wait for Gateway to be Programmed (required before Kuadrant can become ready) # This ensures Service Mesh is installed and Gateway API provider is operational log_info "Waiting for Gateway to be Programmed (Service Mesh initialization)..." - if ! kubectl wait --for=condition=Programmed gateway/maas-default-gateway -n openshift-ingress --timeout=120s 2>/dev/null; then - log_warn "Gateway not yet Programmed after 120s - Kuadrant may take longer to become ready" + if ! kubectl wait --for=condition=Programmed gateway/maas-default-gateway -n openshift-ingress --timeout="${CUSTOM_CHECK_TIMEOUT}s" 2>/dev/null; then + log_warn "Gateway not yet Programmed after ${CUSTOM_CHECK_TIMEOUT}s - Kuadrant may take longer to become ready" fi log_info "Applying Kuadrant custom resource in $namespace..." @@ -1355,29 +1394,30 @@ apply_kuadrant_cr() { local data_dir="${SCRIPT_DIR}/data" kubectl apply -f "${data_dir}/kuadrant.yaml" -n "$namespace" - # Wait for Kuadrant to be ready (initial attempt - 60s) + # Wait for Kuadrant to be ready (initial attempt - configurable timeout) # If it fails with MissingDependency, restart the operator and retry log_info "Waiting for Kuadrant to become ready (initial check)..." + local kuadrant_initial_timeout=$((CUSTOM_CHECK_TIMEOUT / 2)) # Use half of standard timeout for initial check if ! wait_for_custom_check "Kuadrant ready in $namespace" \ "kubectl get kuadrant kuadrant -n $namespace -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' 2>/dev/null | grep -q True" \ - 60 \ + "$kuadrant_initial_timeout" \ 5; then - + # Check if it's a MissingDependency issue local kuadrant_reason kuadrant_reason=$(kubectl get kuadrant kuadrant -n "$namespace" -o jsonpath='{.status.conditions[?(@.type=="Ready")].reason}' 2>/dev/null || echo "") - + if [[ "$kuadrant_reason" == "MissingDependency" ]]; then log_info "Kuadrant shows MissingDependency - restarting operator to re-register Gateway controller..." kubectl delete pod -n "$namespace" -l control-plane=controller-manager --force --grace-period=0 2>/dev/null || true sleep 15 - + # Retry waiting for Kuadrant log_info "Retrying Kuadrant readiness check after operator restart..." wait_for_custom_check "Kuadrant ready in $namespace" \ "kubectl get kuadrant kuadrant -n $namespace -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' 2>/dev/null | grep -q True" \ - 120 \ - 5 || log_warn "Kuadrant not ready yet - AuthPolicy enforcement may fail on model HTTPRoutes" + "$CUSTOM_CHECK_TIMEOUT" \ + 5 || log_warn "Kuadrant not ready yet (timeout: ${CUSTOM_CHECK_TIMEOUT}s) - AuthPolicy enforcement may fail on model HTTPRoutes" else log_warn "Kuadrant not ready (reason: $kuadrant_reason) - AuthPolicy enforcement may fail" fi @@ -1682,8 +1722,8 @@ configure_tls_backend() { # Wait for Authorino deployment to be created by Kuadrant operator # This is necessary because Kuadrant may not be fully ready yet (timing issue) - wait_for_resource "deployment" "authorino" "$authorino_namespace" 180 || { - log_warn "Authorino deployment not found, TLS configuration may fail" + wait_for_resource "deployment" "authorino" "$authorino_namespace" "$RESOURCE_TIMEOUT" || { + log_warn "Authorino deployment not found after ${RESOURCE_TIMEOUT}s, TLS configuration may fail" } # Call TLS configuration script @@ -1719,7 +1759,7 @@ configure_tls_backend() { # Wait for Authorino to be ready after restart log_info "Waiting for Authorino deployment to be ready..." - kubectl rollout status deployment/authorino -n "$authorino_namespace" --timeout=120s 2>/dev/null || log_warn "Authorino rollout status check timed out" + kubectl rollout status deployment/authorino -n "$authorino_namespace" --timeout="${ROLLOUT_TIMEOUT}s" 2>/dev/null || log_warn "Authorino rollout status check timed out (timeout: ${ROLLOUT_TIMEOUT}s)" log_info "TLS backend configuration complete" } diff --git a/scripts/deployment-helpers.sh b/scripts/deployment-helpers.sh index ac7181e33..c91e724df 100755 --- a/scripts/deployment-helpers.sh +++ b/scripts/deployment-helpers.sh @@ -107,8 +107,24 @@ get_cluster_audience() { # Constants and Configuration # ============================================================================ -# Timeout values (seconds) - used by deploy.sh and related scripts -readonly CUSTOM_RESOURCE_TIMEOUT=600 # Used for DataScienceCluster wait +# Timeout values (seconds) - can be overridden via environment variables +# These provide sensible defaults but allow customization for slow/fast clusters +readonly CUSTOM_RESOURCE_TIMEOUT="${CUSTOM_RESOURCE_TIMEOUT:-600}" # DataScienceCluster wait +readonly NAMESPACE_TIMEOUT="${NAMESPACE_TIMEOUT:-300}" # Namespace creation/ready +readonly RESOURCE_TIMEOUT="${RESOURCE_TIMEOUT:-300}" # Generic resource wait +readonly CRD_TIMEOUT="${CRD_TIMEOUT:-180}" # CRD establishment +readonly CSV_TIMEOUT="${CSV_TIMEOUT:-180}" # CSV installation +readonly SUBSCRIPTION_TIMEOUT="${SUBSCRIPTION_TIMEOUT:-300}" # Subscription install +readonly POD_TIMEOUT="${POD_TIMEOUT:-120}" # Pod ready wait +readonly WEBHOOK_TIMEOUT="${WEBHOOK_TIMEOUT:-60}" # Webhook ready +readonly CUSTOM_CHECK_TIMEOUT="${CUSTOM_CHECK_TIMEOUT:-120}" # Generic check +readonly AUTHORINO_TIMEOUT="${AUTHORINO_TIMEOUT:-120}" # Authorino ready +readonly ROLLOUT_TIMEOUT="${ROLLOUT_TIMEOUT:-120}" # kubectl rollout status +readonly KUBECONFIG_WAIT_TIMEOUT="${KUBECONFIG_WAIT_TIMEOUT:-60}" # Kubeconfig operations +readonly CATALOGSOURCE_TIMEOUT="${CATALOGSOURCE_TIMEOUT:-120}" # CatalogSource ready +readonly LLMIS_TIMEOUT="${LLMIS_TIMEOUT:-300}" # LLMInferenceService ready +readonly MAASMODELREF_TIMEOUT="${MAASMODELREF_TIMEOUT:-300}" # MaaSModelRef ready +readonly AUTHPOLICY_TIMEOUT="${AUTHPOLICY_TIMEOUT:-180}" # AuthPolicy enforced # Logging levels readonly LOG_LEVEL_DEBUG=0 @@ -181,7 +197,10 @@ waitsubscriptioninstalled() { echo " * Waiting for Subscription $ns/$name to start setup..." # Use fully qualified resource name to avoid conflicts with Knative subscriptions - kubectl wait subscription.operators.coreos.com --timeout=300s -n "$ns" "$name" --for=jsonpath='{.status.currentCSV}' + if ! kubectl wait subscription.operators.coreos.com --timeout="${SUBSCRIPTION_TIMEOUT}s" -n "$ns" "$name" --for=jsonpath='{.status.currentCSV}'; then + echo " * ERROR: Timeout waiting for Subscription $ns/$name to get currentCSV" + return 1 + fi local csv csv=$(kubectl get subscription.operators.coreos.com -n "$ns" "$name" -o jsonpath='{.status.currentCSV}') @@ -191,8 +210,8 @@ waitsubscriptioninstalled() { done echo " * Waiting for Subscription setup to finish setup. CSV = $csv ..." - if ! kubectl wait -n "$ns" --for=jsonpath="{.status.phase}"=Succeeded csv "$csv" --timeout=300s; then - echo " * ERROR: Timeout while waiting for Subscription to finish installation." + if ! kubectl wait -n "$ns" --for=jsonpath="{.status.phase}"=Succeeded csv "$csv" --timeout="${CSV_TIMEOUT}s"; then + echo " * ERROR: Timeout while waiting for Subscription to finish installation (CSV=$csv, timeout=${CSV_TIMEOUT}s)" return 1 fi } @@ -424,12 +443,12 @@ spec: # Arguments: # description - Description of what we're waiting for # check_command - Command to execute (should return 0 on success) -# timeout - Timeout in seconds -# interval - Check interval in seconds +# timeout - Timeout in seconds (default: CUSTOM_CHECK_TIMEOUT) +# interval - Check interval in seconds (default: 5) wait_for_custom_check() { local description=${1?description is required}; shift local check_command=${1?check command is required}; shift - local timeout=${1:-120}; shift || true + local timeout=${1:-$CUSTOM_CHECK_TIMEOUT}; shift || true local interval=${1:-5}; shift || true log_info "Waiting for: $description (timeout: ${timeout}s)" @@ -444,7 +463,7 @@ wait_for_custom_check() { elapsed=$((elapsed + interval)) done - log_warn "$description - Timeout after ${timeout}s" + log_error "$description - Timeout after ${timeout}s" return 1 } @@ -457,26 +476,32 @@ wait_for_custom_check() { # Returns 0 on success, 1 on timeout. wait_for_namespace() { local namespace=${1?namespace is required}; shift - local timeout=${1:-300} # default 5 minutes + local timeout=${1:-$NAMESPACE_TIMEOUT} if kubectl get namespace "$namespace" >/dev/null 2>&1; then - kubectl wait namespace/"$namespace" --for=jsonpath='{.status.phase}'=Active --timeout=60s - return $? + if ! kubectl wait namespace/"$namespace" --for=jsonpath='{.status.phase}'=Active --timeout=60s; then + echo " ERROR: Namespace $namespace exists but failed to become Active" + return 1 + fi + return 0 fi - echo "* Waiting for $namespace namespace to be created..." + echo "* Waiting for $namespace namespace to be created (timeout: ${timeout}s)..." local elapsed=0 local interval=5 while [ $elapsed -lt $timeout ]; do if kubectl get namespace "$namespace" >/dev/null 2>&1; then - kubectl wait namespace/"$namespace" --for=jsonpath='{.status.phase}'=Active --timeout=60s - return $? + if ! kubectl wait namespace/"$namespace" --for=jsonpath='{.status.phase}'=Active --timeout=60s; then + echo " ERROR: Namespace $namespace created but failed to become Active" + return 1 + fi + return 0 fi sleep $interval elapsed=$((elapsed + interval)) done - echo " WARNING: $namespace namespace was not created within timeout." + echo " ERROR: $namespace namespace was not created within ${timeout}s timeout" return 1 } @@ -487,9 +512,9 @@ wait_for_resource() { local kind=${1?kind is required}; shift local name=${1?name is required}; shift local namespace=${1?namespace is required}; shift - local timeout=${1:-300} # default 5 minutes + local timeout=${1:-$RESOURCE_TIMEOUT} - echo "* Waiting for $kind/$name in $namespace..." + echo "* Waiting for $kind/$name in $namespace (timeout: ${timeout}s)..." local elapsed=0 local interval=5 while [ $elapsed -lt $timeout ]; do @@ -501,7 +526,7 @@ wait_for_resource() { elapsed=$((elapsed + interval)) done - echo " WARNING: $kind/$name was not found within timeout." + echo " ERROR: $kind/$name was not found within ${timeout}s timeout" return 1 } @@ -849,7 +874,7 @@ inject_maas_api_image_operator_mode() { # Helper function to wait for CRD to be established wait_for_crd() { local crd="$1" - local timeout="${2:-60}" # timeout in seconds + local timeout="${2:-$CRD_TIMEOUT}" local interval=2 local end_time=$((SECONDS + timeout)) @@ -932,15 +957,16 @@ wait_for_csv_with_min_version() { local operator_prefix="$1" local min_version="$2" local namespace="${3:-kuadrant-system}" - local timeout="${4:-180}" - - echo "⏳ Looking for ${operator_prefix} (minimum version: ${min_version})..." - + local timeout="${4:-$CSV_TIMEOUT}" + + echo "⏳ Looking for ${operator_prefix} (minimum version: ${min_version}, timeout: ${timeout}s)..." + local end_time=$((SECONDS + timeout)) - + while [ $SECONDS -lt $end_time ]; do - local csv_name=$(find_csv_with_min_version "$operator_prefix" "$min_version" "$namespace") - + local csv_name + csv_name=$(find_csv_with_min_version "$operator_prefix" "$min_version" "$namespace") + if [ -n "$csv_name" ]; then # Found a CSV with suitable version local installed_version=$(extract_version_from_csv "$csv_name") @@ -951,7 +977,7 @@ wait_for_csv_with_min_version() { wait_for_csv "$csv_name" "$namespace" "$remaining_time" return $? fi - + # Check if any version exists (for progress feedback) local any_csv=$(kubectl get csv -n "$namespace" --no-headers 2>/dev/null | grep "^${operator_prefix}" | head -n1 | awk '{print $1}' || echo "") if [ -n "$any_csv" ]; then @@ -960,12 +986,12 @@ wait_for_csv_with_min_version() { else echo " No CSV found for ${operator_prefix} yet, waiting for installation..." fi - + sleep 10 done - + # Timeout reached - echo "❌ Timed out waiting for ${operator_prefix} with minimum version ${min_version}" + echo "❌ Timed out after ${timeout}s waiting for ${operator_prefix} with minimum version ${min_version}" return 1 } @@ -973,7 +999,7 @@ wait_for_csv_with_min_version() { wait_for_csv() { local csv_name="$1" local namespace="${2:-kuadrant-system}" - local timeout="${3:-180}" # timeout in seconds + local timeout="${3:-$CSV_TIMEOUT}" local interval=5 local end_time=$((SECONDS + timeout)) local last_status_print=$SECONDS @@ -1011,11 +1037,11 @@ wait_for_csv() { # Helper function to wait for pods in a namespace to be ready wait_for_pods() { local namespace="$1" - local timeout="${2:-120}" - + local timeout="${2:-$POD_TIMEOUT}" + kubectl get namespace "$namespace" &>/dev/null || return 0 - - echo "⏳ Waiting for pods in $namespace to be ready..." + + echo "⏳ Waiting for pods in $namespace to be ready (timeout: ${timeout}s)..." local end=$((SECONDS + timeout)) local not_ready while [ $SECONDS -lt $end ]; do @@ -1023,17 +1049,17 @@ wait_for_pods() { [ "$not_ready" -eq 0 ] && return 0 sleep 5 done - echo "⚠️ Timeout waiting for pods in $namespace" >&2 + echo "⚠️ Timeout after ${timeout}s waiting for pods in $namespace" >&2 return 1 } wait_for_validating_webhooks() { local namespace="$1" - local timeout="${2:-60}" + local timeout="${2:-$WEBHOOK_TIMEOUT}" local interval=2 local end=$((SECONDS+timeout)) - echo "⏳ Waiting for validating webhooks in namespace $namespace (timeout: $timeout sec)..." + echo "⏳ Waiting for validating webhooks in namespace $namespace (timeout: ${timeout}s)..." while [ $SECONDS -lt $end ]; do local not_ready=0 @@ -1070,7 +1096,7 @@ wait_for_validating_webhooks() { sleep $interval done - echo "❌ Timed out waiting for validating webhooks in $namespace" + echo "❌ Timed out after ${timeout}s waiting for validating webhooks in $namespace" return 1 } @@ -1124,14 +1150,14 @@ spec: EOF echo " * Waiting for CatalogSource to be ready..." - local timeout=120 if ! kubectl wait catalogsource "$name" -n "$namespace" \ --for=jsonpath='{.status.connectionState.lastObservedState}'=READY \ - --timeout="${timeout}s" 2>/dev/null; then - local state=$(kubectl get catalogsource "$name" -n "$namespace" \ + --timeout="${CATALOGSOURCE_TIMEOUT}s" 2>/dev/null; then + local state + state=$(kubectl get catalogsource "$name" -n "$namespace" \ -o jsonpath='{.status.connectionState.lastObservedState}' 2>/dev/null) - echo " ERROR: CatalogSource may not be fully ready yet (state: $state)" + echo " ERROR: CatalogSource not ready after ${CATALOGSOURCE_TIMEOUT}s (state: $state)" return 1 fi echo " * CatalogSource '$name' is ready" @@ -1214,7 +1240,7 @@ wait_datasciencecluster_ready() { # wait_authorino_ready [timeout] # Waits for Authorino to be ready and accepting requests. # Note: Request are required because authorino will report ready status but still give 500 errors. -# +# # This checks: # 1. Authorino CR status is Ready # 2. Auth service cluster is healthy in gateway's Envoy @@ -1224,13 +1250,13 @@ wait_datasciencecluster_ready() { # namespace - Authorino namespace (required) # "kuadrant-system" for Kuadrant (upstream/ODH) # "rh-connectivity-link" for RHCL (downstream/RHOAI) -# timeout - Timeout in seconds (default: 120) +# timeout - Timeout in seconds (default: AUTHORINO_TIMEOUT) # # Returns: # 0 on success, 1 on failure wait_authorino_ready() { local authorino_namespace="${1:?ERROR: namespace is required (kuadrant-system or rh-connectivity-link)}" - local timeout=${2:-120} + local timeout=${2:-$AUTHORINO_TIMEOUT} local interval=5 local elapsed=0 diff --git a/scripts/validate-deployment.sh b/scripts/validate-deployment.sh index 1d5846853..d921b7c56 100755 --- a/scripts/validate-deployment.sh +++ b/scripts/validate-deployment.sh @@ -1,5 +1,10 @@ #!/bin/bash +# Bash strict mode (without -e to continue validation even if some checks fail) +# -u: treat unset variables as an error +# -o pipefail: return value of a pipeline is the value of the last command to exit with a non-zero status +set -uo pipefail + # Source helper functions for JWT decoding SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/deployment-helpers.sh" @@ -10,7 +15,8 @@ source "$SCRIPT_DIR/deployment-helpers.sh" # Usage: ./validate-deployment.sh [MODEL_NAME] # MODEL_NAME: Optional. If provided, the script will validate using this specific model -# Note: We don't use 'set -e' because we want to continue validation even if some checks fail +# Note: We use 'set -uo pipefail' but NOT 'set -e' because we want to continue +# validation even if some checks fail, while still catching undefined variables and pipe failures # Parse command line arguments REQUESTED_MODEL="" @@ -50,9 +56,13 @@ if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then echo " -n, --namespace NS Namespace where MaaS API is deployed" echo " Default: opendatahub (or MAAS_API_NAMESPACE env var)" echo "" - echo "Environment (for non-admin users):" + echo "Environment Variables:" echo " MAAS_GATEWAY_HOST Override gateway URL when cluster domain is not readable" echo " e.g. export MAAS_GATEWAY_HOST=https://maas.apps.your-cluster.example.com" + echo " MAAS_API_NAMESPACE Namespace where MaaS API is deployed (default: opendatahub)" + echo "" + echo "Note: This script uses connection timeouts from curl (10s connect, 30s max)" + echo " For cluster-level timeouts, see deployment-helpers.sh timeout constants" echo "" echo "Examples:" echo " # Basic validation" diff --git a/test/e2e/scripts/prow_run_smoke_test.sh b/test/e2e/scripts/prow_run_smoke_test.sh index f7abbfb57..1bc972ba4 100755 --- a/test/e2e/scripts/prow_run_smoke_test.sh +++ b/test/e2e/scripts/prow_run_smoke_test.sh @@ -48,6 +48,16 @@ # DEPLOYMENT_NAMESPACE - Namespace of MaaS API and controller (default: opendatahub) # MAAS_SUBSCRIPTION_NAMESPACE - Namespace of MaaS CRs (default: models-as-a-service) # MODEL_NAMESPACE - Namespace of models and MaaSModelRefs (default: llm) +# +# TIMEOUT CONFIGURATION (all in seconds, sourced from deployment-helpers.sh): +# Customize for CI/CD environments or slow clusters: +# CUSTOM_RESOURCE_TIMEOUT=600 DataScienceCluster wait +# LLMIS_TIMEOUT=300 LLMInferenceService ready +# MAASMODELREF_TIMEOUT=300 MaaSModelRef ready +# AUTHPOLICY_TIMEOUT=180 AuthPolicy enforced +# AUTHORINO_TIMEOUT=120 Authorino ready +# ROLLOUT_TIMEOUT=120 Deployment rollout +# See deployment-helpers.sh for complete list # ============================================================================= set -euo pipefail @@ -204,8 +214,8 @@ deploy_maas_platform() { fi # Wait for DataScienceCluster (install-odh already waited; deploy may have updated) - if ! wait_datasciencecluster_ready "default-dsc" 300; then - echo "⚠️ WARNING: DataScienceCluster readiness check had issues, continuing anyway" + if ! wait_datasciencecluster_ready "default-dsc" "$CUSTOM_RESOURCE_TIMEOUT"; then + echo "⚠️ WARNING: DataScienceCluster readiness check had issues (timeout: ${CUSTOM_RESOURCE_TIMEOUT}s), continuing anyway" fi # Wait for Authorino to be ready and auth service cluster to be healthy @@ -217,10 +227,10 @@ deploy_maas_platform() { echo "⚠️ WARNING: Skipping Authorino readiness check (SKIP_AUTH_CHECK=true)" echo " This is a temporary workaround for the gateway→Authorino TLS chicken-egg problem" else - # Using 300s timeout to fit within Prow's 15m job limit + # Using configurable timeout (default suitable for Prow's 15m job limit) echo "Waiting for Authorino and auth service to be ready (namespace: ${AUTHORINO_NAMESPACE})..." - if ! wait_authorino_ready "$AUTHORINO_NAMESPACE" 300; then - echo "⚠️ WARNING: Authorino readiness check had issues, continuing anyway" + if ! wait_authorino_ready "$AUTHORINO_NAMESPACE" "$AUTHORINO_TIMEOUT"; then + echo "⚠️ WARNING: Authorino readiness check had issues (timeout: ${AUTHORINO_TIMEOUT}s), continuing anyway" fi fi @@ -262,15 +272,15 @@ deploy_models() { fi echo "✅ MaaS system deployed (free + premium + e2e test fixtures)" - echo "Waiting for models to be ready..." - if ! oc wait llminferenceservice/facebook-opt-125m-simulated -n llm --for=condition=Ready --timeout=300s; then - echo "❌ ERROR: Timed out waiting for free simulator to be ready" + echo "Waiting for models to be ready (timeout: ${LLMIS_TIMEOUT}s)..." + if ! oc wait llminferenceservice/facebook-opt-125m-simulated -n llm --for=condition=Ready --timeout="${LLMIS_TIMEOUT}s"; then + echo "❌ ERROR: Timed out after ${LLMIS_TIMEOUT}s waiting for free simulator to be ready" oc get llminferenceservice/facebook-opt-125m-simulated -n llm -o yaml || true oc get events -n llm --sort-by='.lastTimestamp' || true exit 1 fi - if ! oc wait llminferenceservice/premium-simulated-simulated-premium -n llm --for=condition=Ready --timeout=300s; then - echo "❌ ERROR: Timed out waiting for premium simulator to be ready" + if ! oc wait llminferenceservice/premium-simulated-simulated-premium -n llm --for=condition=Ready --timeout="${LLMIS_TIMEOUT}s"; then + echo "❌ ERROR: Timed out after ${LLMIS_TIMEOUT}s waiting for premium simulator to be ready" oc get llminferenceservice/premium-simulated-simulated-premium -n llm -o yaml || true oc get events -n llm --sort-by='.lastTimestamp' || true exit 1 @@ -280,9 +290,8 @@ deploy_models() { # Wait for MaaSModelRefs to transition to Ready phase. # The controller now properly handles the race condition where MaaSModelRef is created # before KServe creates the HTTPRoute (sets Pending, then Ready when HTTPRoute watch triggers). - echo "Waiting for MaaSModelRefs to be Ready..." - local timeout=300 # 5 minutes - sufficient for KServe to create HTTPRoutes - local deadline=$((SECONDS + timeout)) + echo "Waiting for MaaSModelRefs to be Ready (timeout: ${MAASMODELREF_TIMEOUT}s)..." + local deadline=$((SECONDS + MAASMODELREF_TIMEOUT)) local all_ready=false local found_any=false @@ -306,7 +315,7 @@ deploy_models() { done if ! $found_any || ! $all_ready; then - echo "❌ ERROR: MaaSModelRefs did not reach Ready state within ${timeout}s" + echo "❌ ERROR: MaaSModelRefs did not reach Ready state within ${MAASMODELREF_TIMEOUT}s" echo "Dumping MaaSModelRef status:" oc get maasmodelrefs -n "$MODEL_NAMESPACE" -o yaml || true echo "Dumping controller logs:" @@ -318,7 +327,7 @@ deploy_models() { } wait_for_auth_policies_enforced() { - local timeout=180 + local timeout="$AUTHPOLICY_TIMEOUT" echo "Waiting for Kuadrant AuthPolicies to be enforced (timeout: ${timeout}s)..." local namespaces