From 9c7c06eafc5f483e4290fb4726b46d1b02630978 Mon Sep 17 00:00:00 2001 From: Palash Goel Date: Thu, 26 Mar 2026 05:26:23 +0000 Subject: [PATCH 1/3] initial commit --- .../onprem/post-orch-upgrade.sh | 1675 +++++++++++++++++ on-prem-installers/onprem/pre-orch-install.sh | 2 +- on-prem-installers/onprem/pre-orch-upgrade.sh | 1232 ++++++++++++ .../onprem/pre-upgrade-backup.sh | 628 ++++++ 4 files changed, 3536 insertions(+), 1 deletion(-) create mode 100755 on-prem-installers/onprem/post-orch-upgrade.sh create mode 100755 on-prem-installers/onprem/pre-orch-upgrade.sh create mode 100755 on-prem-installers/onprem/pre-upgrade-backup.sh diff --git a/on-prem-installers/onprem/post-orch-upgrade.sh b/on-prem-installers/onprem/post-orch-upgrade.sh new file mode 100755 index 0000000000..0b0a670882 --- /dev/null +++ b/on-prem-installers/onprem/post-orch-upgrade.sh @@ -0,0 +1,1675 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: 2026 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Script Name: post-orch-upgrade.sh +# Description: Deb-free post-upgrade script for Edge Orchestrator. +# Replaces the onprem-gitea-installer, onprem-argocd-installer, +# and onprem-orch-installer debian packages with pure bash. 
#
# This script handles everything AFTER the Kubernetes cluster
# has been upgraded (by pre-orch-upgrade.sh):
#   - Retrieving and updating cluster configuration
#   - Upgrading Gitea (TLS certs, Helm chart, accounts)
#   - Upgrading ArgoCD (proxy config, Helm chart)
#   - Deploying the orchestrator (root-app via Helm)
#   - PostgreSQL migration to CloudNativePG
#   - Service recovery (MPS/RPS, Vault, restarts)
#   - Cleanup (external-secrets CRDs, Kyverno, nginx)
#
# Prerequisites:
#   - pre-orch-upgrade.sh has completed (K8s cluster upgraded, OS configured)
#   - onprem.env is configured with correct values
#   - kubectl, helm, yq, openssl are available
#   - sudo access (for cert installation)
#   - Repo tarball in repo_archives/ (or running from a git checkout)
#
# Usage:
#   ./post-orch-upgrade.sh [options]
#
# Options:
#   -l  Use local packages (skip artifact download)
#   -s  Skip interactive prompts (non-interactive mode)
#   -h  Show help

set -euo pipefail

export PATH="/usr/local/bin:${PATH}"
# Derive the default kubeconfig from $HOME rather than /home/$USER so the
# script also works for root and for users whose home is not under /home.
export KUBECONFIG="${KUBECONFIG:-${HOME}/.kube/config}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/onprem.env"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/upgrade_postgres.sh"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/vault_unseal.sh"

################################
# Logging
################################

LOG_FILE="post_orch_upgrade_$(date +'%Y%m%d_%H%M%S').log"
LOG_DIR="/var/log/orch-upgrade"

sudo mkdir -p "$LOG_DIR"
# Hand the log dir to the invoking user. Use the user's actual primary group:
# the group name is not necessarily identical to the user name, so
# "whoami:whoami" would fail on systems with e.g. a shared "users" group.
sudo chown "$(id -un):$(id -gn)" "$LOG_DIR"

FULL_LOG_PATH="$LOG_DIR/$LOG_FILE"

# All stdout/stderr is mirrored into the log file by the exec redirections
# below, so the log helpers only echo. Piping through `tee -a` here as well
# would write every log line to the file twice (once via the helper's tee,
# once via the exec tee).
log_message() {
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $*"
}

log_info() {
    log_message "INFO: $*"
}

log_warn() {
    log_message "WARN: $*"
}

log_error() {
    log_message "ERROR: $*"
}

# Mirror all output to both the console and the log file.
exec > >(tee -a "$FULL_LOG_PATH")
exec 2> >(tee -a "$FULL_LOG_PATH" >&2)

+log_info "Starting post-orch-upgrade script" +log_info "Log file: $FULL_LOG_PATH" + +################################ +# Defaults / Configuration +################################ + +apps_ns="${APPS_NS:-onprem}" +argo_cd_ns="${ARGO_CD_NS:-argocd}" +gitea_ns="${GITEA_NS:-gitea}" +si_config_repo="edge-manageability-framework" + +cwd="$(pwd)" +git_arch_name="repo_archives" + +GIT_REPOS="${GIT_REPOS:-$cwd/$git_arch_name}" +export GIT_REPOS + +ORCH_INSTALLER_PROFILE="${ORCH_INSTALLER_PROFILE:-onprem}" +INSTALL_GITEA="${INSTALL_GITEA:-true}" +GITEA_IMAGE_REGISTRY="${GITEA_IMAGE_REGISTRY:-docker.io}" +USE_LOCAL_PACKAGES="${USE_LOCAL_PACKAGES:-false}" +DEPLOY_VERSION="${DEPLOY_VERSION:-v3.1.0}" +SKIP_INTERACTIVE="${SKIP_INTERACTIVE:-false}" + +GITEA_CHART_VERSION="${GITEA_CHART_VERSION:-10.4.0}" +ARGOCD_CHART_VERSION="${ARGOCD_CHART_VERSION:-8.2.7}" + +# Paths set during setup_working_dir +WORK_DIR="" +REPO_DIR="" +ONPREM_INSTALLERS_DIR="" + +################################ +# Cleanup trap +################################ + +cleanup_work_dir() { + if [[ -n "${WORK_DIR:-}" && -d "${WORK_DIR:-}" ]]; then + log_info "Cleaning up working directory: $WORK_DIR" + rm -rf "$WORK_DIR" + fi +} + +trap cleanup_work_dir EXIT + +################################ +# Prerequisites +################################ + +check_prerequisites() { + log_info "Checking prerequisites..." + + local missing=() + for cmd in kubectl helm yq openssl; do + if ! command -v "$cmd" &>/dev/null; then + missing+=("$cmd") + fi + done + + if [[ ${#missing[@]} -gt 0 ]]; then + log_error "Missing required tools: ${missing[*]}" + log_error "Run pre-orch-upgrade.sh first (installs helm and yq), or install manually." + exit 1 + fi + + if ! kubectl cluster-info &>/dev/null; then + log_error "Cannot reach Kubernetes cluster. Check KUBECONFIG." + exit 1 + fi + + log_info "All prerequisites met." 
+} + +################################ +# Helper Functions +################################ + +update_config_variable() { + local config_file="$1" var_name="$2" var_value="$3" + if [[ -n "${var_value:-}" ]]; then + if grep -q "^export ${var_name}=" "$config_file"; then + sed -i "s|^export ${var_name}=.*|export ${var_name}='${var_value}'|" "$config_file" + else + echo "export ${var_name}='${var_value}'" >> "$config_file" + fi + fi +} + +wait_for_pods_running() { + local ns="$1" + log_info "Waiting for all pods to be Ready in namespace $ns..." + kubectl wait pod --selector='!job-name' --all --for=condition=Ready \ + --namespace="$ns" --timeout=600s +} + +resync_all_apps() { + if [[ ! -f /tmp/argo-cd/sync-patch.yaml ]]; then + sudo mkdir -p /tmp/argo-cd + cat </dev/null +operation: + sync: + syncStrategy: + hook: {} + +SYNCEOF + fi + kubectl patch application root-app -n "$apps_ns" --type merge \ + -p '{"operation":null}' || true + kubectl patch application root-app -n "$apps_ns" --type json \ + -p '[{"op": "remove", "path": "/status/operationState"}]' || true + sleep 10 + kubectl patch application root-app -n "$apps_ns" \ + --patch-file /tmp/argo-cd/sync-patch.yaml --type merge +} + +terminate_existing_sync() { + local app_name="$1" namespace="$2" + local current_phase + current_phase=$(kubectl get application "$app_name" -n "$namespace" \ + -o jsonpath='{.status.operationState.phase}' 2>/dev/null || true) + + if [[ "$current_phase" == "Running" ]]; then + log_info "Terminating existing sync operation for $app_name..." 
+ kubectl patch application "$app_name" -n "$namespace" \ + --type='merge' -p='{"operation": null}' + timeout 30 bash -c " + while [[ \"\$(kubectl get application '$app_name' -n '$namespace' \ + -o jsonpath='{.status.operationState.phase}' 2>/dev/null)\" == 'Running' ]]; do + sleep 2 + done + " || true + fi +} + +check_and_patch_sync_app() { + local app_name="$1" namespace="$2" + local max_retries=2 + + for ((i=1; i<=max_retries; i++)); do + local app_status + app_status=$(kubectl get application "$app_name" -n "$namespace" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' \ + 2>/dev/null || echo "NotFound NotFound") + + if [[ "$app_status" == "Synced Healthy" ]]; then + log_info "$app_name is Synced and Healthy" + return 0 + fi + + log_warn "$app_name not healthy (status: $app_status). Syncing (attempt $i/$max_retries)" + + set +e + terminate_existing_sync "$app_name" "$namespace" + kubectl patch -n "$namespace" application "$app_name" \ + --patch-file /tmp/argo-cd/sync-patch.yaml --type merge + set -e + + local check_timeout=90 check_interval=3 elapsed=0 + while (( elapsed < check_timeout )); do + app_status=$(kubectl get application "$app_name" -n "$namespace" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' \ + 2>/dev/null || echo "NotFound NotFound") + + if [[ "$app_status" == "Synced Healthy" ]]; then + log_info "$app_name became Synced and Healthy" + return 0 + fi + sleep "$check_interval" + elapsed=$((elapsed + check_interval)) + done + done + log_warn "$app_name may still require attention after $max_retries attempts" +} + +wait_for_app_synced_healthy() { + resync_all_apps + local app_name="$1" namespace="$2" timeout_s="${3:-120}" + local start_time + start_time=$(date +%s) + + set +e + while true; do + local app_status + app_status=$(kubectl get application "$app_name" -n "$namespace" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' \ + 2>/dev/null || echo "NotFound NotFound") + + if [[ "$app_status" == "Synced 
Healthy" ]]; then + log_info "$app_name is Synced and Healthy." + set -e + return 0 + fi + + local current_time elapsed + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if (( elapsed > timeout_s )); then + log_warn "Timeout waiting for $app_name after ${timeout_s}s (status: $app_status)" + set -e + return 0 + fi + + log_info "Waiting for $app_name (${elapsed}s/${timeout_s}s, status: $app_status)" + sleep 3 + done +} + +restart_statefulset() { + local name="$1" namespace="$2" + log_info "Restarting StatefulSet $name in $namespace..." + local replicas + replicas=$(kubectl get statefulset "$name" -n "$namespace" \ + -o jsonpath='{.spec.replicas}') + kubectl scale statefulset "$name" -n "$namespace" --replicas=0 + kubectl wait --for=delete pod -l "app=$name" -n "$namespace" \ + --timeout=300s || true + kubectl scale statefulset "$name" -n "$namespace" --replicas="$replicas" + log_info "$name restarted" +} + +cleanup_gitea_secrets() { + log_info "Cleaning up old Gitea secrets..." + local secrets=("gitea-apporch-token" "gitea-argocd-token" "gitea-clusterorch-token") + for secret in "${secrets[@]}"; do + if kubectl get secret "$secret" -n gitea >/dev/null 2>&1; then + kubectl delete secret "$secret" -n gitea + log_info "Deleted secret: $secret" + fi + done +} + +delete_nginx_if_any() { + log_info "Checking and deleting nginx ingress (if any)..." 
+ kubectl delete application ingress-nginx -n "$apps_ns" \ + --ignore-not-found=true || true + kubectl delete application nginx-ingress-pxe-boots -n "$apps_ns" \ + --ignore-not-found=true || true + + local harbor_pods + harbor_pods=$(kubectl get pods -n orch-harbor --no-headers 2>/dev/null \ + | awk '/harbor-oci-nginx/ {print $1}' || true) + if [[ -n "${harbor_pods:-}" ]]; then + log_info "Deleting harbor nginx pods" + # shellcheck disable=SC2086 + kubectl delete pod -n orch-harbor $harbor_pods || true + fi + log_info "Nginx cleanup done" +} + +################################################################################ +# PHASE 1: CONFIGURATION +################################################################################ + +retrieve_and_update_config() { + log_info "=== Phase 1a: Retrieving cluster configuration ===" + local config_file="$cwd/onprem.env" + + # Get LoadBalancer IPs — fall back to existing onprem.env values when + # services are absent (e.g. freshly recreated KIND cluster). 
+ local argo_ip traefik_ip haproxy_ip + argo_ip=$(kubectl get svc argocd-server -n argocd \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + traefik_ip=$(kubectl get svc traefik -n orch-gateway \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + + if kubectl get svc ingress-haproxy-kubernetes-ingress -n orch-boots >/dev/null 2>&1; then + haproxy_ip=$(kubectl get svc ingress-haproxy-kubernetes-ingress -n orch-boots \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + elif kubectl get svc ingress-nginx-controller -n orch-boots >/dev/null 2>&1; then + haproxy_ip=$(kubectl get svc ingress-nginx-controller -n orch-boots \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + else + log_warn "No ingress service found in orch-boots namespace — using existing onprem.env values" + haproxy_ip="${HAPROXY_IP:-}" + fi + + # Only update config when a non-empty value was retrieved; otherwise keep existing + [[ -n "$argo_ip" ]] && update_config_variable "$config_file" "ARGO_IP" "$argo_ip" + [[ -n "$traefik_ip" ]] && update_config_variable "$config_file" "TRAEFIK_IP" "$traefik_ip" + [[ -n "$haproxy_ip" ]] && update_config_variable "$config_file" "HAPROXY_IP" "$haproxy_ip" + + # SRE TLS Configuration + local sre_tls_enabled + sre_tls_enabled=$(kubectl get applications -n "$apps_ns" sre-exporter \ + -o jsonpath='{.spec.sources[*].helm.valuesObject.otelCollector.tls.enabled}' \ + 2>/dev/null || echo "false") + + if [[ "$sre_tls_enabled" == "true" ]]; then + update_config_variable "$config_file" "SRE_TLS_ENABLED" "true" + local sre_dest_ca_cert + sre_dest_ca_cert=$(kubectl get applications -n "$apps_ns" sre-exporter \ + -o jsonpath='{.spec.sources[*].helm.valuesObject.otelCollector.tls.caSecret.enabled}' \ + 2>/dev/null || echo "false") + [[ "$sre_dest_ca_cert" == "true" ]] && \ + update_config_variable "$config_file" "SRE_DEST_CA_CERT" "true" + else + update_config_variable "$config_file" "SRE_TLS_ENABLED" "false" + fi + + # 
Detect profiles from ArgoCD root-app + local value_files + value_files=$(kubectl get application root-app -n "$apps_ns" \ + -o jsonpath='{.spec.sources[0].helm.valueFiles[*]}' 2>/dev/null || true) + + if [[ -z "$value_files" ]]; then + log_warn "No value files found in root-app" + else + local disable_co="false" disable_ao="false" disable_o11y="false" single_tenancy="false" + echo "$value_files" | grep -q "enable-cluster-orch.yaml" || disable_co="true" + echo "$value_files" | grep -q "enable-app-orch.yaml" || disable_ao="true" + echo "$value_files" | grep -qE "(enable-o11y\.yaml|o11y-onprem-1k\.yaml)" || disable_o11y="true" + echo "$value_files" | grep -q "enable-singleTenancy.yaml" && single_tenancy="true" + + INSTALL_GITEA="true" + if [[ "$disable_co" == "true" || "$disable_ao" == "true" ]]; then + INSTALL_GITEA="false" + fi + + update_config_variable "$config_file" "DISABLE_CO_PROFILE" "$disable_co" + update_config_variable "$config_file" "DISABLE_AO_PROFILE" "$disable_ao" + update_config_variable "$config_file" "DISABLE_O11Y_PROFILE" "$disable_o11y" + update_config_variable "$config_file" "SINGLE_TENANCY_PROFILE" "$single_tenancy" + update_config_variable "$config_file" "INSTALL_GITEA" "$INSTALL_GITEA" + fi + + # SMTP configuration + local smtp_skip_verify + smtp_skip_verify=$(kubectl get application alerting-monitor -n "$apps_ns" \ + -o jsonpath='{.spec.sources[*].helm.valuesObject.alertingMonitor.smtp.insecureSkipVerify}' \ + 2>/dev/null || echo "false") + update_config_variable "$config_file" "SMTP_SKIP_VERIFY" "$smtp_skip_verify" + + log_info "Configuration retrieval completed." + + # Re-source the updated config + # shellcheck disable=SC1090 + source "$config_file" +} + +setup_working_dir() { + log_info "=== Phase 1b: Setting up working directory ===" + + # Try to discover repo root from script location (git checkout mode) + # Check common locations: two dirs up from script, HOME, or cwd + local candidates=( + "$(cd "$SCRIPT_DIR/../.." 
2>/dev/null && pwd)" + "$HOME/edge-manageability-framework" + "$(cd "$cwd/../edge-manageability-framework" 2>/dev/null && pwd)" + ) + + for candidate in "${candidates[@]}"; do + if [[ -d "$candidate/orch-configs" && -d "$candidate/argocd" && -d "$candidate/on-prem-installers" ]]; then + log_info "Running from git checkout: $candidate" + REPO_DIR="$candidate" + ONPREM_INSTALLERS_DIR="$REPO_DIR/on-prem-installers" + return 0 + fi + done + + # Tarball mode: extract the repo tarball + if [[ ! -d "$GIT_REPOS" ]]; then + log_error "Repo archives directory not found: $GIT_REPOS" + exit 1 + fi + + local repo_file + repo_file=$(find "$GIT_REPOS" -name "*${si_config_repo}*.tgz" -type f | head -1) + if [[ -z "$repo_file" ]]; then + log_error "No $si_config_repo tarball found in $GIT_REPOS" + exit 1 + fi + + WORK_DIR="$(mktemp -d)" + log_info "Extracting repo tarball to $WORK_DIR" + tar -xf "$repo_file" -C "$WORK_DIR" + + REPO_DIR="$WORK_DIR/$si_config_repo" + ONPREM_INSTALLERS_DIR="$REPO_DIR/on-prem-installers" + + if [[ ! -d "$REPO_DIR/orch-configs" || ! -d "$REPO_DIR/argocd" ]]; then + log_error "Extracted tarball does not look like a valid $si_config_repo repo" + exit 1 + fi + + log_info "Repo extracted to: $REPO_DIR" +} + +apply_cluster_config() { + log_info "=== Phase 1c: Generating and applying cluster config ===" + + local gen_script="$REPO_DIR/installer/generate_cluster_yaml.sh" + if [[ ! -x "$gen_script" ]]; then + log_warn "generate_cluster_yaml.sh not found at $gen_script, trying current directory..." + gen_script="./generate_cluster_yaml.sh" + fi + + if [[ -x "$gen_script" ]]; then + rm -f "${ORCH_INSTALLER_PROFILE}.yaml" + + # generate_cluster_yaml.sh sources onprem.env from its own directory and + # reads cluster_onprem.tpl from $PWD. Symlink both into place. + local gen_dir + gen_dir="$(dirname "$gen_script")" + if [[ ! -f "$gen_dir/onprem.env" ]]; then + ln -sf "$cwd/onprem.env" "$gen_dir/onprem.env" + fi + local tpl_file="$cwd/cluster_onprem.tpl" + if [[ ! 
-f "$gen_dir/cluster_onprem.tpl" && -f "$tpl_file" ]]; then + ln -sf "$tpl_file" "$gen_dir/cluster_onprem.tpl" + fi + + (cd "$gen_dir" && bash "$gen_script" onprem) + + # Move generated output to cwd if it landed in gen_dir + if [[ -f "$gen_dir/${ORCH_INSTALLER_PROFILE}.yaml" && "$gen_dir" != "$cwd" ]]; then + mv "$gen_dir/${ORCH_INSTALLER_PROFILE}.yaml" "$cwd/" + fi + else + log_warn "generate_cluster_yaml.sh not found. Expecting ${ORCH_INSTALLER_PROFILE}.yaml to exist." + fi + + local cluster_yaml="$cwd/${ORCH_INSTALLER_PROFILE}.yaml" + if [[ ! -f "$cluster_yaml" ]]; then + log_error "Cluster config not found: $cluster_yaml" + exit 1 + fi + + # Copy cluster config into repo for root-app and Gitea push + local target_dir="$REPO_DIR/orch-configs/clusters" + if [[ -d "$target_dir" ]]; then + cp "$cluster_yaml" "$target_dir/${ORCH_INSTALLER_PROFILE}.yaml" + log_info "Cluster config copied to $target_dir/" + fi + + if [[ "$SKIP_INTERACTIVE" != "true" ]]; then + while true; do + read -rp "Edit values.yaml if required. Ready to proceed? (yes/no): " yn + case $yn in + [Yy]* ) break;; + [Nn]* ) exit 1;; + * ) echo "Please answer yes or no.";; + esac + done + fi + + log_info "Cluster config ready: $cluster_yaml" +} + +################################################################################ +# PHASE 2: GITEA UPGRADE +################################################################################ + +upgrade_gitea() { + if [[ "$INSTALL_GITEA" != "true" ]]; then + log_info "Skipping Gitea upgrade (INSTALL_GITEA=$INSTALL_GITEA)" + return 0 + fi + + log_info "=== Phase 2: Upgrading Gitea ===" + + local image_registry="${GITEA_IMAGE_REGISTRY:-docker.io}" + local values_file="$ONPREM_INSTALLERS_DIR/assets/gitea/values.yaml" + if [[ ! 
-r "$values_file" ]]; then + log_error "Gitea values file not found: $values_file" + exit 1 + fi + + # Fetch Gitea chart from helm repo + local chart_dir + chart_dir="$(mktemp -d)" + trap 'rm -rf "${chart_dir:-}"' RETURN + + log_info "Fetching Gitea chart v${GITEA_CHART_VERSION}..." + helm repo add gitea-charts https://dl.gitea.com/charts/ --force-update >/dev/null 2>&1 + helm fetch gitea-charts/gitea --version "$GITEA_CHART_VERSION" \ + --untar --untardir "$chart_dir" + + # Ensure namespaces exist + kubectl create ns gitea >/dev/null 2>&1 || true + kubectl create ns orch-platform >/dev/null 2>&1 || true + + # Generate TLS cert if not present + if ! kubectl -n gitea get secret gitea-tls-certs >/dev/null 2>&1; then + log_info "Generating self-signed TLS cert for Gitea..." + local tmp_cert + tmp_cert="$(mktemp -d)" + + openssl genrsa -out "$tmp_cert/infra-tls.key" 4096 2>/dev/null + openssl req -key "$tmp_cert/infra-tls.key" -new -x509 -days 365 \ + -out "$tmp_cert/infra-tls.crt" \ + -subj "/C=US/O=Orch Deploy/OU=Open Edge Platform" \ + -addext "subjectAltName=DNS:localhost,DNS:gitea-http.gitea.svc.cluster.local" \ + 2>/dev/null + + sudo install -D -m 0644 "$tmp_cert/infra-tls.crt" \ + /usr/local/share/ca-certificates/gitea_cert.crt + sudo update-ca-certificates -f + + kubectl create secret tls gitea-tls-certs -n gitea \ + --cert="$tmp_cert/infra-tls.crt" \ + --key="$tmp_cert/infra-tls.key" + + rm -rf "$tmp_cert" + fi + + # Generate random passwords (use openssl to avoid SIGPIPE under pipefail) + local admin_pw argocd_pw app_pw cluster_pw + admin_pw="$(openssl rand -base64 24 | tr -dc A-Za-z0-9 | cut -c1-16)" + argocd_pw="$(openssl rand -base64 24 | tr -dc A-Za-z0-9 | cut -c1-16)" + app_pw="$(openssl rand -base64 24 | tr -dc A-Za-z0-9 | cut -c1-16)" + cluster_pw="$(openssl rand -base64 24 | tr -dc A-Za-z0-9 | cut -c1-16)" + + # Create secrets + _create_gitea_secret "gitea-cred" "gitea_admin" "$admin_pw" "gitea" + _create_gitea_secret "argocd-gitea-credential" 
"argocd" "$argocd_pw" "gitea" + _create_gitea_secret "app-gitea-credential" "apporch" "$app_pw" "orch-platform" + _create_gitea_secret "cluster-gitea-credential" "clusterorch" "$cluster_pw" "orch-platform" + + # Scale down Gitea before upgrade + kubectl scale deployment gitea -n gitea --replicas=0 2>/dev/null || true + + # Helm upgrade Gitea + log_info "Running helm upgrade for Gitea..." + helm upgrade --install gitea "$chart_dir/gitea" \ + --values "$values_file" \ + --set gitea.admin.existingSecret=gitea-cred \ + --set "image.registry=${image_registry}" \ + -n gitea --timeout 15m0s --wait + + wait_for_pods_running "$gitea_ns" + + # Create/update Gitea accounts + _create_gitea_account "argocd-gitea-credential" "argocd" "$argocd_pw" \ + "argocd@orch-installer.com" + _create_gitea_account "app-gitea-credential" "apporch" "$app_pw" \ + "test@test.com" + _create_gitea_account "cluster-gitea-credential" "clusterorch" "$cluster_pw" \ + "test@test2.com" + + log_info "Gitea upgrade completed." +} + +_create_gitea_secret() { + local secret_name="$1" account_name="$2" password="$3" namespace="$4" + kubectl create secret generic "$secret_name" -n "$namespace" \ + --from-literal=username="$account_name" \ + --from-literal=password="$password" \ + --dry-run=client -o yaml | kubectl apply -f - +} + +_create_gitea_account() { + local secret_name="$1" account_name="$2" password="$3" email="$4" + + local gitea_pod + gitea_pod=$(kubectl get pods -n gitea -l app=gitea \ + -o jsonpath="{.items[0].metadata.name}" 2>/dev/null || true) + + if [[ -z "$gitea_pod" ]]; then + # Try newer label selector + gitea_pod=$(kubectl get pods -n gitea \ + -l 'app.kubernetes.io/instance=gitea,app.kubernetes.io/name=gitea' \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + fi + + if [[ -z "$gitea_pod" ]]; then + log_error "No Gitea pods found" + return 1 + fi + + if ! 
kubectl exec -n gitea "$gitea_pod" -c gitea -- \ + gitea admin user list 2>/dev/null | grep -q "$account_name"; then + log_info "Creating Gitea account: $account_name" + kubectl exec -n gitea "$gitea_pod" -c gitea -- \ + gitea admin user create --username "$account_name" --password "$password" \ + --email "$email" --must-change-password=false + else + log_info "Updating Gitea account password: $account_name" + kubectl exec -n gitea "$gitea_pod" -c gitea -- \ + gitea admin user change-password --username "$account_name" \ + --password "$password" --must-change-password=false + fi + + # Generate access token + local user_token token + user_token=$(kubectl exec -n gitea "$gitea_pod" -c gitea -- \ + gitea admin user generate-access-token \ + --scopes write:repository,write:user \ + --username "$account_name" \ + --token-name "${account_name}-$(date +%s)" 2>/dev/null || true) + token=$(echo "$user_token" | awk '{print $NF}') + + if [[ -n "$token" ]]; then + kubectl create secret generic "gitea-${account_name}-token" -n gitea \ + --from-literal=token="$token" \ + --dry-run=client -o yaml | kubectl apply -f - + fi +} + +################################################################################ +# PHASE 3: ARGOCD UPGRADE +################################################################################ + +upgrade_argocd() { + log_info "=== Phase 3: Upgrading ArgoCD ===" + + local values_tmpl="$ONPREM_INSTALLERS_DIR/assets/argo-cd/values.tmpl" + if [[ ! -r "$values_tmpl" ]]; then + log_error "ArgoCD values template not found: $values_tmpl" + exit 1 + fi + + # Fetch ArgoCD chart from helm repo + local chart_dir + chart_dir="$(mktemp -d)" + trap 'rm -rf "${chart_dir:-}"' RETURN + + log_info "Fetching ArgoCD chart v${ARGOCD_CHART_VERSION}..." 
+ helm repo add argo-helm https://argoproj.github.io/argo-helm \ + --force-update >/dev/null 2>&1 + helm fetch argo-helm/argo-cd --version "$ARGOCD_CHART_VERSION" \ + --untar --untardir "$chart_dir" + + # Process proxy configuration via helm template + cp "$values_tmpl" "$chart_dir/argo-cd/templates/values.tmpl" + + cat <"$chart_dir/proxy-values.yaml" +http_proxy: ${http_proxy:-} +https_proxy: ${https_proxy:-} +no_proxy: ${no_proxy:-} +EOF + + helm template -s templates/values.tmpl "$chart_dir/argo-cd" \ + --values "$chart_dir/proxy-values.yaml" > "$chart_dir/values.yaml" + rm -f "$chart_dir/argo-cd/templates/values.tmpl" + + # Generate volume mounts for node CA bundle and Gitea TLS + cat <"$chart_dir/mounts.yaml" +notifications: + extraVolumeMounts: + - mountPath: /etc/ssl/certs/ca-certificates.crt + name: tls-from-node + - mountPath: /etc/ssl/certs/gitea_cert.crt + name: gitea-tls + extraVolumes: + - name: tls-from-node + hostPath: + path: /etc/ssl/certs/ca-certificates.crt + - name: gitea-tls + hostPath: + path: /usr/local/share/ca-certificates/gitea_cert.crt +server: + volumeMounts: + - mountPath: /etc/ssl/certs/ca-certificates.crt + name: tls-from-node + - mountPath: /etc/ssl/certs/gitea_cert.crt + name: gitea-tls + volumes: + - name: tls-from-node + hostPath: + path: /etc/ssl/certs/ca-certificates.crt + - name: gitea-tls + hostPath: + path: /usr/local/share/ca-certificates/gitea_cert.crt +repoServer: + volumeMounts: + - mountPath: /etc/ssl/certs/ca-certificates.crt + name: tls-from-node + - mountPath: /etc/ssl/certs/gitea_cert.crt + name: gitea-tls + volumes: + - name: tls-from-node + hostPath: + path: /etc/ssl/certs/ca-certificates.crt + - name: gitea-tls + hostPath: + path: /usr/local/share/ca-certificates/gitea_cert.crt +applicationSet: + extraVolumeMounts: + - mountPath: /etc/ssl/certs/ca-certificates.crt + name: tls-from-node + - mountPath: /etc/ssl/certs/gitea_cert.crt + name: gitea-tls + extraVolumes: + - name: tls-from-node + hostPath: + path: 
/etc/ssl/certs/ca-certificates.crt + - name: gitea-tls + hostPath: + path: /usr/local/share/ca-certificates/gitea_cert.crt +EOF + + log_info "Running helm upgrade for ArgoCD..." + kubectl create ns "$argo_cd_ns" >/dev/null 2>&1 || true + helm upgrade --install argocd "$chart_dir/argo-cd" \ + --values "$chart_dir/values.yaml" \ + -f "$chart_dir/mounts.yaml" \ + -n "$argo_cd_ns" --create-namespace --wait --timeout 15m0s + + wait_for_pods_running "$argo_cd_ns" + + log_info "ArgoCD upgrade completed." +} + +################################################################################ +# PHASE 4: ORCHESTRATOR DEPLOYMENT +################################################################################ + +get_gitea_service_url() { + if [[ "$INSTALL_GITEA" != "true" ]]; then + echo "" + return + fi + + local port + port=$(kubectl get svc gitea-http -n gitea \ + -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || true) + + if [[ "$port" == "443" ]]; then + echo "gitea-http.gitea.svc.cluster.local" + elif [[ -n "$port" ]]; then + echo "gitea-http.gitea.svc.cluster.local:${port}" + else + log_warn "Could not determine Gitea service URL" + echo "gitea-http.gitea.svc.cluster.local" + fi +} + +push_repo_to_gitea() { + local gitea_url="$1" + log_info "Pushing repository to Gitea..." + + # Clean up any previous push job + kubectl delete job gitea-init-${si_config_repo} -n gitea \ + --ignore-not-found=true 2>/dev/null || true + + # Create K8s Job to push repo content via git + kubectl apply -f - < /root/.git-credentials + cd /repo + git init + git remote add gitea "https://${gitea_url}/\$GITEA_USERNAME/${si_config_repo}.git" 2>/dev/null || \ + git remote set-url gitea "https://${gitea_url}/\$GITEA_USERNAME/${si_config_repo}.git" + git checkout -B main + git add . 
+ git commit --allow-empty -m 'Recreate repo from artifact' + git push --force gitea main + volumeMounts: + - name: repo + mountPath: /repo + - name: tls + mountPath: /usr/local/share/ca-certificates/ + restartPolicy: Never + backoffLimit: 5 +JOBEOF + + log_info "Waiting for Gitea push job to complete..." + kubectl wait --for=condition=complete --timeout=300s \ + -n gitea "job/gitea-init-${si_config_repo}" + + log_info "Repo pushed to Gitea successfully." +} + +create_gitea_creds_secret() { + local gitea_url="$1" + log_info "Creating ArgoCD repository secret for Gitea..." + + # Fetch Gitea credentials + local username_b64 password_b64 username password + username_b64=$(kubectl get secret argocd-gitea-credential -n gitea \ + -o jsonpath='{.data.username}') + password_b64=$(kubectl get secret argocd-gitea-credential -n gitea \ + -o jsonpath='{.data.password}') + username=$(echo "$username_b64" | base64 -d) + password=$(echo "$password_b64" | base64 -d) + + kubectl delete secret "$si_config_repo" -n argocd --ignore-not-found + + kubectl apply -f - </dev/null || true + kubectl delete sts -n orch-database postgresql --ignore-not-found=true 2>/dev/null || true + kubectl delete job -n orch-infra credentials --ignore-not-found=true 2>/dev/null || true + kubectl delete job -n orch-infra loca-credentials --ignore-not-found=true 2>/dev/null || true + kubectl delete secret -l managed-by=edge-manageability-framework -A \ + --ignore-not-found=true 2>/dev/null || true + + # Set up repo credentials and push + local gitea_url + gitea_url=$(get_gitea_service_url) + + if [[ "$INSTALL_GITEA" == "true" ]]; then + push_repo_to_gitea "$gitea_url" + create_gitea_creds_secret "$gitea_url" + else + if [[ -z "${WORK_DIR:-}" ]]; then + # Running from git checkout — extract tarball for local root-app + log_info "GitHub mode: using local repo checkout for root-app" + fi + create_github_creds_secret + fi + + # Install root-app via Helm + log_info "Installing root-app Helm chart..." 
+ helm upgrade --install root-app "$root_app_chart" \ + -f "$cluster_yaml" \ + -n "$apps_ns" --create-namespace + + log_info "Orchestrator deployment initiated." +} + +################################################################################ +# PHASE 5: POSTGRESQL MIGRATION +################################################################################ + +save_postgres_passwords() { + log_info "=== Phase 5a: Saving PostgreSQL passwords ===" + + if [[ -s postgres-secrets-password.txt ]]; then + log_info "postgres-secrets-password.txt already exists, skipping save." + return 0 + fi + + local alerting catalog inventory iam_tenancy platform_keycloak vault_pw postgresql mps rps + + alerting=$(kubectl get secret alerting-local-postgresql -n orch-infra \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + catalog=$(kubectl get secret app-orch-catalog-local-postgresql -n orch-app \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + inventory=$(kubectl get secret inventory-local-postgresql -n orch-infra \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + iam_tenancy=$(kubectl get secret iam-tenancy-local-postgresql -n orch-iam \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + platform_keycloak=$(kubectl get secret platform-keycloak-local-postgresql -n orch-platform \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + vault_pw=$(kubectl get secret vault-local-postgresql -n orch-platform \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + postgresql=$(kubectl get secret orch-database-postgresql -n orch-database \ + -o jsonpath='{.data.password}' 2>/dev/null || true) + mps=$(kubectl get secret mps-local-postgresql -n orch-infra \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + rps=$(kubectl get secret rps-local-postgresql -n orch-infra \ + -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true) + + { + echo "Alerting: $alerting" + echo "CatalogService: $catalog" + echo "Inventory: $inventory" + echo "IAMTenancy: 
$iam_tenancy" + echo "PlatformKeycloak: $platform_keycloak" + echo "Vault: $vault_pw" + echo "PostgreSQL: $postgresql" + echo "Mps: $mps" + echo "Rps: $rps" + } > postgres-secrets-password.txt + + log_info "PostgreSQL passwords saved to postgres-secrets-password.txt" +} + +delete_mps_rps_secrets() { + log_info "=== Phase 5b: Deleting MPS/RPS secrets for recreation ===" + + if kubectl get secret mps -n orch-infra >/dev/null 2>&1; then + kubectl get secret mps -n orch-infra -o yaml > mps_secret.yaml + kubectl delete secret mps -n orch-infra + log_info "MPS secret backed up and deleted" + fi + + if kubectl get secret rps -n orch-infra >/dev/null 2>&1; then + kubectl get secret rps -n orch-infra -o yaml > rps_secret.yaml + kubectl delete secret rps -n orch-infra + log_info "RPS secret backed up and deleted" + fi +} + +patch_secrets() { + log_info "Patching secrets with saved passwords..." + + # Read passwords from file + local alerting="" catalog="" inventory="" iam_tenancy="" + local platform_keycloak="" vault_pw="" postgresql="" mps="" rps="" + + if [[ -s postgres-secrets-password.txt ]]; then + while IFS=': ' read -r key value; do + case "$key" in + Alerting) alerting="$value" ;; + CatalogService) catalog="$value" ;; + Inventory) inventory="$value" ;; + IAMTenancy) iam_tenancy="$value" ;; + PlatformKeycloak) platform_keycloak="$value" ;; + Vault) vault_pw="$value" ;; + PostgreSQL) postgresql="$value" ;; + Mps) mps="$value" ;; + Rps) rps="$value" ;; + esac + done < postgres-secrets-password.txt + fi + + wait_for_app_synced_healthy postgresql-secrets "$apps_ns" + check_and_patch_sync_app postgresql-secrets "$apps_ns" + wait_for_app_synced_healthy postgresql-secrets "$apps_ns" + + # If postgresql-secrets still not healthy, try root-app sync + local app_status + app_status=$(kubectl get application postgresql-secrets -n "$apps_ns" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' \ + 2>/dev/null || echo "NotFound NotFound") + if [[ "$app_status" != "Synced 
Healthy" ]]; then + check_and_patch_sync_app root-app "$apps_ns" + fi + + # Wait for secrets to appear + local secrets_to_check=( + "orch-app:app-orch-catalog-local-postgresql" + "orch-app:app-orch-catalog-reader-local-postgresql" + "orch-iam:iam-tenancy-local-postgresql" + "orch-iam:iam-tenancy-reader-local-postgresql" + "orch-infra:alerting-local-postgresql" + "orch-infra:alerting-reader-local-postgresql" + "orch-infra:inventory-local-postgresql" + "orch-infra:inventory-reader-local-postgresql" + "orch-platform:platform-keycloak-local-postgresql" + "orch-platform:platform-keycloak-reader-local-postgresql" + "orch-platform:vault-local-postgresql" + "orch-platform:vault-reader-local-postgresql" + "orch-infra:mps-local-postgresql" + "orch-infra:mps-reader-local-postgresql" + "orch-infra:rps-local-postgresql" + "orch-infra:rps-reader-local-postgresql" + ) + + local max_wait=600 check_interval=5 + + log_info "Waiting for all required secrets to exist..." + for entry in "${secrets_to_check[@]}"; do + local ns="${entry%%:*}" secret_name="${entry##*:}" + local elapsed=0 + while ! kubectl get secret "$secret_name" -n "$ns" >/dev/null 2>&1; do + if (( elapsed >= max_wait )); then + log_error "Timeout waiting for secret $secret_name in $ns" + exit 1 + fi + sleep "$check_interval" + elapsed=$((elapsed + check_interval)) + done + done + log_info "All required secrets exist." 
+ + # Patch all database secrets + kubectl patch secret -n orch-app app-orch-catalog-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$catalog\"}}" --type=merge + kubectl patch secret -n orch-app app-orch-catalog-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$catalog\"}}" --type=merge + kubectl patch secret -n orch-iam iam-tenancy-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$iam_tenancy\"}}" --type=merge + kubectl patch secret -n orch-iam iam-tenancy-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$iam_tenancy\"}}" --type=merge + kubectl patch secret -n orch-infra alerting-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$alerting\"}}" --type=merge + kubectl patch secret -n orch-infra alerting-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$alerting\"}}" --type=merge + kubectl patch secret -n orch-infra inventory-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$inventory\"}}" --type=merge + kubectl patch secret -n orch-infra inventory-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$inventory\"}}" --type=merge + kubectl patch secret -n orch-platform platform-keycloak-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$platform_keycloak\"}}" --type=merge + kubectl patch secret -n orch-platform platform-keycloak-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$platform_keycloak\"}}" --type=merge + kubectl patch secret -n orch-platform vault-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$vault_pw\"}}" --type=merge + kubectl patch secret -n orch-platform vault-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$vault_pw\"}}" --type=merge + kubectl patch secret -n orch-infra mps-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$mps\"}}" --type=merge + kubectl patch secret -n orch-infra mps-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$mps\"}}" --type=merge + kubectl patch secret -n orch-infra rps-local-postgresql \ + -p 
"{\"data\": {\"PGPASSWORD\": \"$rps\"}}" --type=merge + kubectl patch secret -n orch-infra rps-reader-local-postgresql \ + -p "{\"data\": {\"PGPASSWORD\": \"$rps\"}}" --type=merge + + # CloudNativePG secrets (if applicable) + if kubectl get secret orch-app-app-orch-catalog -n orch-database >/dev/null 2>&1; then + kubectl patch secret -n orch-database orch-app-app-orch-catalog \ + -p "{\"data\": {\"password\": \"$catalog\"}}" --type=merge + kubectl patch secret -n orch-database orch-iam-iam-tenancy \ + -p "{\"data\": {\"password\": \"$iam_tenancy\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-alerting \ + -p "{\"data\": {\"password\": \"$alerting\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-inventory \ + -p "{\"data\": {\"password\": \"$inventory\"}}" --type=merge + kubectl patch secret -n orch-database orch-platform-platform-keycloak \ + -p "{\"data\": {\"password\": \"$platform_keycloak\"}}" --type=merge + kubectl patch secret -n orch-database orch-platform-vault \ + -p "{\"data\": {\"password\": \"$vault_pw\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-mps \ + -p "{\"data\": {\"password\": \"$mps\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-rps \ + -p "{\"data\": {\"password\": \"$rps\"}}" --type=merge + fi + + # Patch Keycloak secret with username & password fields + if kubectl get secret platform-keycloak -n orch-platform >/dev/null 2>&1; then + local admin_password + admin_password=$(kubectl get secret platform-keycloak -n orch-platform \ + -o jsonpath='{.data.admin-password}' 2>/dev/null | base64 -d 2>/dev/null || true) + if [[ -n "$admin_password" ]]; then + kubectl patch secret platform-keycloak -n orch-platform --type='merge' \ + -p "{\"stringData\": {\"username\": \"admin\", \"password\": \"$admin_password\"}}" || true + fi + fi + + # Patch PostgreSQL main secret + kubectl patch secret -n orch-database orch-database-postgresql \ + -p "{\"data\": {\"password\": 
\"$postgresql\"}}" --type=merge + + log_info "All secrets patched." +} + +migrate_postgres_to_cnpg() { + log_info "=== Phase 5: PostgreSQL migration to CloudNativePG ===" + + # Delete rke2-metrics-server if present + helm delete -n kube-system rke2-metrics-server 2>/dev/null || true + + resync_all_apps + + # Wait for postgresql-secrets to sync + log_info "Waiting for postgresql-secrets application..." + local start_time timeout_s=3600 + start_time=$(date +%s) + + set +e + while true; do + local app_status + app_status=$(kubectl get application postgresql-secrets -n "$apps_ns" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || true) + if [[ "$app_status" == "Synced Healthy" ]]; then + log_info "postgresql-secrets is Synced and Healthy." + break + fi + local current_time elapsed + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if (( elapsed > timeout_s )); then + log_error "Timeout waiting for postgresql-secrets (${timeout_s}s)" + exit 1 + fi + log_info "Waiting for postgresql-secrets (status: ${app_status:-pending}, ${elapsed}s)" + sleep 5 + done + set -e + + # Delete old PostgreSQL (from upgrade_postgres.sh) + delete_postgres + + # Stop root-app sync + kubectl patch application root-app -n "$apps_ns" --type merge \ + -p '{"operation":null}' || true + kubectl patch application root-app -n "$apps_ns" --type json \ + -p '[{"op": "remove", "path": "/status/operationState"}]' || true + + # Force postgresql sync with hook strategy + cat </dev/null +operation: + sync: + syncStrategy: + hook: {} +EOF + kubectl patch -n "$apps_ns" application root-app \ + --patch-file /tmp/sync-postgresql-patch.yaml --type merge + + # Wait for postgresql-secrets again after root-app sync + start_time=$(date +%s) + set +e + while true; do + local app_status + app_status=$(kubectl get application postgresql-secrets -n "$apps_ns" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || true) + if [[ "$app_status" == "Synced 
Healthy" ]]; then + log_info "postgresql-secrets is Synced and Healthy." + break + fi + local current_time elapsed + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if (( elapsed > timeout_s )); then + log_error "Timeout waiting for postgresql-secrets after resync (${timeout_s}s)" + exit 1 + fi + sleep 5 + done + set -e + + # Vault unseal + vault_unseal + + # Resync and patch secrets + resync_all_apps + sleep 120 + patch_secrets + sleep 10 + + # Apply saved PostgreSQL superuser secret (stripped of metadata) + if [[ -f postgres_secret.yaml ]]; then + yq e ' + del(.metadata.labels) | + del(.metadata.annotations) | + del(.metadata.ownerReferences) | + del(.metadata.finalizers) | + del(.metadata.managedFields) | + del(.metadata.resourceVersion) | + del(.metadata.uid) | + del(.metadata.creationTimestamp) + ' postgres_secret.yaml | kubectl apply -f - + fi + + sleep 30 + + # Wait for CloudNativePG primary pod + log_info "Waiting for CloudNativePG primary pod..." + start_time=$(date +%s) + local pg_timeout=300 + + set +e + while true; do + local pod_status + pod_status=$(kubectl get pods -n orch-database \ + -l cnpg.io/cluster=postgresql-cluster,cnpg.io/instanceRole=primary \ + -o jsonpath='{.items[0].status.phase}' 2>/dev/null || true) + if [[ "$pod_status" == "Running" ]]; then + log_info "PostgreSQL CNPG pod is Running." + sleep 30 + break + fi + local current_time elapsed + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + if (( elapsed > pg_timeout )); then + log_error "Timeout waiting for PostgreSQL CNPG pod (${pg_timeout}s)" + exit 1 + fi + log_info "Waiting for PostgreSQL (status: ${pod_status:-pending}, ${elapsed}s)" + sleep 5 + done + set -e + + # Restore PostgreSQL from backup (from upgrade_postgres.sh) + restore_postgres + + log_info "Database user passwords updated." + + # Unseal vault again + vault_unseal + + log_info "PostgreSQL migration completed." 
+}
+
+################################################################################
+# PHASE 6: SERVICE RECOVERY
+################################################################################
+
+# Re-apply the MPS/RPS Kubernetes secrets that delete_mps_rps_secrets() backed
+# up to mps_secret.yaml / rps_secret.yaml earlier in the upgrade.
+# Globals:   none (reads backup files from the current working directory)
+# Outputs:   progress via log_info
+# NOTE(review): [[ -s ]] skips silently if the backup file is missing or empty;
+# presumably that means the secret did not exist pre-upgrade — confirm.
+restore_mps_rps_secrets() {
+  log_info "=== Phase 6a: Restoring MPS/RPS secrets ==="
+
+  if [[ -s mps_secret.yaml ]]; then
+    kubectl apply -f mps_secret.yaml
+    log_info "MPS secret restored"
+  fi
+
+  if [[ -s rps_secret.yaml ]]; then
+    kubectl apply -f rps_secret.yaml
+    log_info "RPS secret restored"
+  fi
+}
+
+# Rewrite the MPS/RPS "connectionString" secret fields so both services point
+# at the new CloudNativePG read-write service (postgresql-cluster-rw in the
+# orch-database namespace) instead of the legacy PostgreSQL statefulset.
+# Reads the per-service DB passwords from the *-local-postgresql secrets.
+fix_mps_rps_connections() {
+  log_info "=== Phase 6b: Updating MPS/RPS connection strings for CloudNativePG ==="
+
+  local mps_password rps_password
+  mps_password=$(kubectl get secret mps-local-postgresql -n orch-infra \
+    -o jsonpath='{.data.PGPASSWORD}' | base64 -d)
+  rps_password=$(kubectl get secret rps-local-postgresql -n orch-infra \
+    -o jsonpath='{.data.PGPASSWORD}' | base64 -d)
+
+  # MPS connection string
+  local mps_conn
+  mps_conn="postgresql://orch-infra-mps_user:${mps_password}@postgresql-cluster-rw.orch-database/orch-infra-mps?search_path=public&sslmode=disable"
+  local mps_b64
+  # base64 -w 0: single-line encoding, required for a JSON patch payload
+  mps_b64=$(echo -n "$mps_conn" | base64 -w 0)
+  kubectl patch secret mps -n orch-infra \
+    -p "{\"data\":{\"connectionString\":\"$mps_b64\"}}" --type=merge
+
+  # RPS connection string
+  local rps_conn
+  rps_conn="postgresql://orch-infra-rps_user:${rps_password}@postgresql-cluster-rw.orch-database/orch-infra-rps?search_path=public&sslmode=disable"
+  local rps_b64
+  rps_b64=$(echo -n "$rps_conn" | base64 -w 0)
+  kubectl patch secret rps -n orch-infra \
+    -p "{\"data\":{\"connectionString\":\"$rps_b64\"}}" --type=merge
+
+  log_info "MPS/RPS connection strings updated to use postgresql-cluster-rw.orch-database"
+}
+
+# Rolling-restart the workloads that cached connections to the old database
+# so they pick up the patched secrets (MPS, RPS, inventory, onboarding, dkam,
+# keycloak tenant controller, and — unless profile is onprem-vpro — Harbor).
+restart_services() {
+  log_info "=== Phase 6c: Restarting services ==="
+
+  kubectl rollout restart deployment rps -n orch-infra
+  kubectl rollout restart deployment mps -n orch-infra
+  log_info "MPS/RPS restarted"
+
+  kubectl rollout restart deployment inventory 
-n orch-infra + log_info "inventory restarted" + + kubectl rollout restart deployment onboarding-manager -n orch-infra + log_info "onboarding-manager restarted" + + kubectl rollout restart deployment dkam -n orch-infra + log_info "dkam restarted" + + restart_statefulset keycloak-tenant-controller-set orch-platform + + resync_all_apps + sleep 10 + + # Harbor restarts (skip for onprem-vpro profile) + if [[ "${ORCH_INSTALLER_PROFILE:-}" != "onprem-vpro" ]]; then + restart_statefulset harbor-oci-database orch-harbor || true + kubectl rollout restart deployment harbor-oci-core -n orch-harbor || true + log_info "harbor restarted" + else + log_info "Skipping Harbor restarts for onprem-vpro profile" + fi +} + +restore_gitea_vault_creds() { + log_info "=== Phase 6d: Restoring Gitea credentials to Vault ===" + + # Sync root-app + if [[ -f /tmp/argo-cd/sync-patch.yaml ]]; then + kubectl patch application root-app -n "$apps_ns" \ + --patch-file /tmp/argo-cd/sync-patch.yaml --type merge || true + fi + + if [[ "$INSTALL_GITEA" == "true" ]]; then + local password username + password=$(kubectl get secret app-gitea-credential -n orch-platform \ + -o jsonpath="{.data.password}" 2>/dev/null | base64 -d || true) + username=$(kubectl get secret app-gitea-credential -n orch-platform \ + -o jsonpath="{.data.username}" 2>/dev/null | base64 -d || true) + + if [[ -n "$password" && -n "$username" ]]; then + kubectl exec -it vault-0 -n orch-platform -c vault -- \ + vault kv put secret/ma_git_service \ + username="$username" password="$password" 2>/dev/null || true + log_info "Gitea credentials stored in Vault" + fi + fi + + # Delete fleet-gitrepo-cred secrets + kubectl get secret --all-namespaces --no-headers 2>/dev/null \ + | awk '/fleet-gitrepo-cred/ {print $1, $2}' \ + | while IFS=' ' read -r ns secret; do + log_info "Deleting secret $secret in namespace $ns" + kubectl delete secret "$secret" -n "$ns" || true + done +} + 
+
+################################################################################
+# PHASE 7: CLEANUP
+################################################################################
+
+# Replace the stale external-secrets CRDs with the v0.20.4 bundle, unseal
+# Vault, halt the in-flight root-app sync, and force-delete the
+# external-secrets ArgoCD Application.
+# Globals:   apps_ns (ArgoCD namespace)
+cleanup_external_secrets() {
+  log_info "=== Phase 7a: Cleaning up external-secrets ==="
+
+  # Delete each CRD in the background (the delete blocks on finalizers), then
+  # immediately strip the finalizers so the pending delete can complete.
+  # NOTE(review): there is no `wait` before the re-apply below — the new CRD
+  # bundle could race the still-terminating old CRDs; confirm this is benign.
+  for crd in clustersecretstores.external-secrets.io \
+    secretstores.external-secrets.io \
+    externalsecrets.external-secrets.io; do
+    if kubectl get crd "$crd" >/dev/null 2>&1; then
+      kubectl delete crd "$crd" &
+      kubectl patch "crd/$crd" -p '{"metadata":{"finalizers":[]}}' --type=merge
+    fi
+  done
+
+  # Apply External Secrets CRDs with server-side apply
+  log_info "Applying external-secrets CRDs v0.20.4..."
+  kubectl apply --server-side=true --force-conflicts \
+    -f https://raw.githubusercontent.com/external-secrets/external-secrets/refs/tags/v0.20.4/deploy/crds/bundle.yaml || true
+
+  # Final vault unseal
+  vault_unseal
+  log_info "Vault unsealed successfully."
+
+  # Stop root-app sync: clear the requested operation and drop the recorded
+  # operationState so ArgoCD does not resume the previous sync.
+  kubectl patch application root-app -n "$apps_ns" --type merge \
+    -p '{"operation":null}' || true
+  kubectl patch application root-app -n "$apps_ns" --type json \
+    -p '[{"op": "remove", "path": "/status/operationState"}]' || true
+  sleep 5
+
+  # Delete external-secrets application (strip finalizers first so the
+  # force-delete cannot hang on cascade deletion)
+  kubectl patch application external-secrets -n "$apps_ns" \
+    --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true
+  kubectl delete application external-secrets -n "$apps_ns" \
+    --force --grace-period=0 --ignore-not-found=true 2>/dev/null || true
+  sleep 5
+
+  log_info "external-secrets cleanup done." 
+}
+
+# Delete the Kyverno ClusterPolicies that auto-restart MPS/RPS on secret
+# changes; they are obsolete after the CloudNativePG migration and would
+# fight the manual restarts performed in Phase 6.
+cleanup_kyverno() {
+  log_info "=== Phase 7b: Cleaning up Kyverno policies ==="
+
+  for policy in restart-mps-deployment-on-secret-change \
+    restart-rps-deployment-on-secret-change; do
+    if kubectl get clusterpolicy "$policy" -o name >/dev/null 2>&1; then
+      kubectl delete clusterpolicy "$policy"
+      log_info "Deleted ClusterPolicy: $policy"
+    fi
+  done
+}
+
+# Kick off one final root-app sync with prune enabled so ArgoCD removes
+# resources that no longer exist in the desired state, then drop the
+# temporary nginx (if any) via delete_nginx_if_any.
+# Globals:   apps_ns (ArgoCD namespace)
+final_prune_sync() {
+  log_info "=== Phase 7c: Final prune sync ==="
+
+  kubectl patch -n "$apps_ns" application root-app --type merge --patch '{
+    "operation": {
+      "initiatedBy": { "username": "admin" },
+      "sync": {
+        "prune": true,
+        "syncStrategy": { "hook": {} }
+      }
+    }
+  }'
+
+  sleep 30
+
+  delete_nginx_if_any
+}
+
+# Uninstall the Gitea Helm release when the deployment is configured for
+# GitHub mode (INSTALL_GITEA=false); no-op otherwise.
+remove_gitea_if_disabled() {
+  if [[ "${INSTALL_GITEA}" == "false" ]]; then
+    log_info "=== Phase 7d: Removing Gitea ==="
+    # awk/grep: match the release name exactly, not as a substring
+    if helm list -n gitea | awk '{print $1}' | grep -q "^gitea$"; then
+      helm uninstall gitea -n gitea
+      log_info "Gitea uninstalled"
+    else
+      log_info "Gitea release not found, skipping"
+    fi
+  fi
+}
+
+################################################################################
+# KYVERNO JOB CLEANUP
+################################################################################
+
+cleanup_kyverno_jobs() {
+  if kubectl get job kyverno-clean-reports -n kyverno >/dev/null 2>&1; then
+    log_info "Cleaning up kyverno-clean-reports job..." 
+ kubectl delete job kyverno-clean-reports -n kyverno & + kubectl delete pods -l job-name="kyverno-clean-reports" -n kyverno & + kubectl patch job kyverno-clean-reports -n kyverno --type=merge \ + -p='{"metadata":{"finalizers":[]}}' || true + fi +} + +################################################################################ +# CLI PARSING +################################################################################ + +usage() { + cat >&2 < >(tee -a "$FULL_LOG_PATH") +exec 2> >(tee -a "$FULL_LOG_PATH" >&2) + +log_info "Starting pre-orch-upgrade script" +log_info "Log file: $FULL_LOG_PATH" + +################################ +# Defaults / Configuration +################################ + +WAIT_TIMEOUT_SECONDS="${WAIT_TIMEOUT_SECONDS:-600}" +WAIT_INTERVAL_SECONDS="${WAIT_INTERVAL_SECONDS:-5}" +LOCALPV_VERSION="${LOCALPV_VERSION:-4.3.0}" +SKIP_OS_CONFIG="${SKIP_OS_CONFIG:-false}" +HELM_VERSION="${HELM_VERSION:-}" + +# KIND +KIND_CLUSTER_NAME_DEFAULT="kind-cluster" +KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:-$KIND_CLUSTER_NAME_DEFAULT}" +KIND_API_PORT="${KIND_API_PORT:-6443}" +KIND_VERSION="${KIND_VERSION:-}" +KIND_FORCE_RECREATE="${KIND_FORCE_RECREATE:-false}" + +# K3s +K3S_VERSION_DEFAULT="v1.34.5+k3s1" +K3S_VERSION="${K3S_VERSION:-$K3S_VERSION_DEFAULT}" + +# RKE2 +RKE2_TARGET_VERSION_DEFAULT="v1.34.5+rke2r1" +RKE2_TARGET_VERSION="${RKE2_TARGET_VERSION:-$RKE2_TARGET_VERSION_DEFAULT}" +DOCKER_USERNAME="${DOCKER_USERNAME:-}" +DOCKER_PASSWORD="${DOCKER_PASSWORD:-}" + +# System-upgrade-controller version for RKE2 upgrades +SYSTEM_UPGRADE_CONTROLLER_VERSION="v0.13.2" + +################################ +# Helpers +################################ + +require_cmd() { + local cmd="$1" + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "❌ Required command not found in PATH: $cmd" + exit 1 + fi +} + +cmd_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Auto-detect the installed Kubernetes provider by checking +# systemd services, binaries, and kubeconfig context. 
+# Detect which Kubernetes distribution is installed on this host.
+# Probe order: RKE2, K3s, KIND — via systemd unit state, config directories /
+# binaries, then (for KIND) a live cluster or kubeconfig context.
+# Outputs:   one of "rke2" | "k3s" | "kind" on stdout
+# Returns:   0 on detection, 1 if no provider could be identified
+detect_k8s_provider() {
+  # Check RKE2 first (most common for on-prem)
+  if systemctl list-unit-files rke2-server.service >/dev/null 2>&1 && \
+     systemctl is-enabled rke2-server.service >/dev/null 2>&1; then
+    echo "rke2"
+    return 0
+  fi
+  # Fallback: config dir or binary present even if the unit is not enabled
+  if [[ -d /etc/rancher/rke2 ]] || cmd_exists rke2; then
+    echo "rke2"
+    return 0
+  fi
+
+  # Check K3s
+  if systemctl list-unit-files k3s.service >/dev/null 2>&1 && \
+     systemctl is-enabled k3s.service >/dev/null 2>&1; then
+    echo "k3s"
+    return 0
+  fi
+  if [[ -d /etc/rancher/k3s ]] || cmd_exists k3s; then
+    echo "k3s"
+    return 0
+  fi
+
+  # Check KIND (look for kind binary + running cluster, or kubeconfig context)
+  if cmd_exists kind && kind get clusters 2>/dev/null | grep -q .; then
+    echo "kind"
+    return 0
+  fi
+  if kubectl config current-context 2>/dev/null | grep -q "^kind-"; then
+    echo "kind"
+    return 0
+  fi
+
+  # Could not detect
+  return 1
+}
+
+# Install Helm v3 via the official get-helm-3 script if it is not already on
+# PATH. Honors HELM_VERSION (empty = latest); picks an install dir by
+# privilege: writable /usr/local/bin, else sudo, else ~/.local/bin.
+# Requires curl or wget; exits non-zero if installation cannot proceed.
+install_helm() {
+  if cmd_exists helm; then
+    echo "✅ helm is already installed: $(helm version --short 2>/dev/null || echo 'unknown')"
+    return 0
+  fi
+
+  if ! cmd_exists curl && ! cmd_exists wget; then
+    echo "❌ helm is required but is not installed. Need curl or wget to install it automatically."
+    echo "   Install curl (or wget) and retry, or install helm manually (https://helm.sh/docs/intro/install/)."
+    exit 1
+  fi
+
+  echo "👉 helm not found; installing helm v3..." 
+ + local tmp + tmp="$(mktemp -d)" + trap 'rm -rf "${tmp:-}"' RETURN + + local installer_url="https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3" + local installer_path="$tmp/get-helm-3.sh" + + if cmd_exists curl; then + curl -fsSL "$installer_url" -o "$installer_path" + else + wget -qO "$installer_path" "$installer_url" + fi + chmod +x "$installer_path" + + if [[ -w /usr/local/bin ]]; then + if [[ -n "${HELM_VERSION}" ]]; then + HELM_INSTALL_DIR="/usr/local/bin" DESIRED_VERSION="${HELM_VERSION}" "$installer_path" + else + HELM_INSTALL_DIR="/usr/local/bin" "$installer_path" + fi + elif cmd_exists sudo; then + if [[ -n "${HELM_VERSION}" ]]; then + sudo -E env DESIRED_VERSION="${HELM_VERSION}" "$installer_path" + else + sudo -E "$installer_path" + fi + else + mkdir -p "${HOME}/.local/bin" + if [[ -n "${HELM_VERSION}" ]]; then + HELM_INSTALL_DIR="${HOME}/.local/bin" DESIRED_VERSION="${HELM_VERSION}" "$installer_path" + else + HELM_INSTALL_DIR="${HOME}/.local/bin" "$installer_path" + fi + fi + + if ! cmd_exists helm; then + echo "❌ helm installation did not succeed; please install helm manually and retry." + exit 1 + fi + echo "✅ helm installed: $(helm version --short 2>/dev/null || echo 'unknown')" +} + +install_yq() { + if cmd_exists yq; then + echo "✅ yq is already installed: $(yq --version 2>/dev/null || echo 'unknown')" + return 0 + fi + + require_cmd curl + + echo "👉 yq not found; installing yq..." 
+ + local arch + arch="$(uname -m)" + local yq_binary="yq_linux_amd64" + case "${arch}" in + x86_64|amd64) yq_binary="yq_linux_amd64" ;; + aarch64|arm64) yq_binary="yq_linux_arm64" ;; + *) + echo "❌ Unsupported architecture for yq: ${arch}" + exit 1 + ;; + esac + + local tmp + tmp="$(mktemp -d)" + trap 'rm -rf "${tmp:-}"' RETURN + + # Get latest version + local version + version="$(curl -s https://api.github.com/repos/mikefarah/yq/releases/latest | grep '"tag_name"' | cut -d '"' -f 4)" + if [[ -z "${version}" ]]; then + version="v4.44.1" # fallback + fi + + local yq_url="https://github.com/mikefarah/yq/releases/download/${version}/${yq_binary}.tar.gz" + echo "👉 Downloading yq ${version} from ${yq_url}..." + curl -fsSL -o "$tmp/${yq_binary}.tar.gz" "$yq_url" + tar xf "$tmp/${yq_binary}.tar.gz" -C "$tmp" + + if [[ -f "$tmp/${yq_binary}" ]]; then + sudo mv "$tmp/${yq_binary}" /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + elif [[ -f "$tmp/yq" ]]; then + sudo mv "$tmp/yq" /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + else + echo "❌ yq binary not found after extraction" + exit 1 + fi + + if ! cmd_exists yq; then + echo "❌ yq installation did not succeed; please install yq manually and retry." 
+ exit 1 + fi + echo "✅ yq installed: $(yq --version 2>/dev/null || echo 'unknown')" +} + +usage() { + cat >&2 < Default: ${WAIT_TIMEOUT_SECONDS} + --wait-interval Default: ${WAIT_INTERVAL_SECONDS} + --localpv-version Default: ${LOCALPV_VERSION} + --helm-version Default: latest + --skip-os-config Skip OS-level configuration step + +KIND options: + --cluster-name Default: ${KIND_CLUSTER_NAME_DEFAULT} + --api-port Default: ${KIND_API_PORT} + --kind-version Default: latest + --force-recreate Delete and recreate cluster even if version matches + +K3s options: + --k3s-version Default: ${K3S_VERSION_DEFAULT} + --docker-username Optional (for Docker Hub auth) + --docker-password Optional (for Docker Hub auth) + +RKE2 options: + --rke2-target-version Default: ${RKE2_TARGET_VERSION_DEFAULT} + --docker-username Optional (for Docker Hub auth) + --docker-password Optional (for Docker Hub auth) + +Examples: + $(basename "$0") # auto-detect provider + $(basename "$0") upgrade # auto-detect, explicit action + $(basename "$0") rke2 upgrade # explicit provider and action + $(basename "$0") rke2 upgrade --rke2-target-version v1.34.4+rke2r1 + $(basename "$0") k3s upgrade --k3s-version v1.34.3+k3s1 + $(basename "$0") kind upgrade + $(basename "$0") --skip-os-config # auto-detect, skip OS config +EOF +} + +################################ +# Wait / Readiness helpers +################################ + +wait_for_k8s_ready() { + local kube_context="${1:-}" + + local kubectl_ctx_args=() + if [[ -n "${kube_context}" ]]; then + kubectl_ctx_args+=(--context "${kube_context}") + fi + + echo "👉 Waiting for Kubernetes API to be reachable..." 
+  # Poll /readyz until the API server responds or the deadline passes
+  local deadline=$((SECONDS + WAIT_TIMEOUT_SECONDS))
+  until kubectl "${kubectl_ctx_args[@]}" get --raw='/readyz' >/dev/null 2>&1; do
+    if (( SECONDS >= deadline )); then
+      echo "❌ Timed out waiting for API server to be ready after ${WAIT_TIMEOUT_SECONDS}s"
+      kubectl "${kubectl_ctx_args[@]}" cluster-info || true
+      exit 1
+    fi
+    sleep "${WAIT_INTERVAL_SECONDS}"
+  done
+  echo "✅ API server is ready"
+
+  echo "👉 Waiting for all nodes to be Ready (timeout: ${WAIT_TIMEOUT_SECONDS}s)..."
+  if ! kubectl "${kubectl_ctx_args[@]}" wait --for=condition=Ready node --all --timeout="${WAIT_TIMEOUT_SECONDS}s"; then
+    echo "❌ Timed out waiting for nodes to become Ready"
+    # Dump cluster state to the log to aid post-mortem before exiting
+    kubectl "${kubectl_ctx_args[@]}" get nodes -o wide || true
+    kubectl "${kubectl_ctx_args[@]}" get pods -A || true
+    exit 1
+  fi
+  echo "✅ All nodes are Ready"
+}
+
+################################
+# OS Configuration
+# (replaces onprem-config-installer deb)
+################################
+
+# Host-level preparation formerly done by the onprem-config-installer deb:
+# inotify sysctl limits, yq/helm installation, OpenEBS hostpath directories,
+# and LVM-snapshot kernel modules. Safe to re-run (idempotent).
+upgrade_os_config() {
+  log_info "=== OS Configuration Upgrade ==="
+
+  # -------------------------------------------------------
+  # 1. sysctl tuning: inotify limits
+  # -------------------------------------------------------
+  echo "👉 Configuring sysctl inotify parameters..."
+
+  local sysctl_file="/etc/sysctl.conf"
+  local sysctl_params=(
+    "fs.inotify.max_queued_events = 1048576"
+    "fs.inotify.max_user_instances = 1048576"
+    "fs.inotify.max_user_watches = 1048576"
+  )
+
+  for param in "${sysctl_params[@]}"; do
+    local key="${param%% =*}"
+    # Append the parameter only when no uncommented entry for the key exists
+    # yet; existing values (even wrong ones) are left untouched.
+    if ! grep -q "^${key}" "${sysctl_file}" 2>/dev/null; then
+      echo "${param}" | sudo tee -a "${sysctl_file}" >/dev/null
+      echo "  Added: ${param}"
+    else
+      echo "  Already set: ${key}"
+    fi
+  done
+
+  # Reload sysctl; non-fatal if some unrelated key in sysctl.conf is invalid
+  sudo sysctl -p >/dev/null 2>&1 || true
+  echo "✅ sysctl parameters configured"
+
+  # -------------------------------------------------------
+  # 2. 
Install / update yq + # ------------------------------------------------------- + echo "👉 Ensuring yq is installed..." + install_yq + + # ------------------------------------------------------- + # 3. Install / update helm + # ------------------------------------------------------- + echo "👉 Ensuring helm is installed..." + install_helm + + # ------------------------------------------------------- + # 4. Ensure hostpath directories exist + # ------------------------------------------------------- + echo "👉 Ensuring hostpath directories exist..." + local hostpath_dirs=("/var/openebs/local") + for dir in "${hostpath_dirs[@]}"; do + if [[ ! -d "${dir}" ]]; then + sudo mkdir -p "${dir}" + echo " Created: ${dir}" + else + echo " Exists: ${dir}" + fi + done + echo "✅ Hostpath directories ready" + + # ------------------------------------------------------- + # 5. Kernel modules for LVM snapshots + # ------------------------------------------------------- + echo "👉 Configuring kernel modules for LVM snapshots..." + + local modules_file="/etc/modules-load.d/lv-snapshots.conf" + printf "dm-snapshot\ndm-mirror\n" | sudo tee "${modules_file}" >/dev/null + + sudo modprobe dm-snapshot 2>/dev/null || log_warn "modprobe dm-snapshot failed (non-fatal)" + sudo modprobe dm-mirror 2>/dev/null || log_warn "modprobe dm-mirror failed (non-fatal)" + echo "✅ Kernel modules configured" + + log_info "=== OS Configuration Upgrade Complete ===" +} + +################################ +# OpenEBS LocalPV +################################ + +upgrade_openebs_localpv() { + local kube_context="${1:-}" + + install_helm + require_cmd kubectl + + local helm_ctx_args=() + local kubectl_ctx_args=() + if [[ -n "${kube_context}" ]]; then + helm_ctx_args+=(--kube-context "${kube_context}") + kubectl_ctx_args+=(--context "${kube_context}") + fi + + echo "👉 Using OpenEBS LocalPV version: ${LOCALPV_VERSION}" + + echo "👉 Adding OpenEBS LocalPV Helm repo..." 
+ helm repo add openebs-localpv https://openebs.github.io/dynamic-localpv-provisioner >/dev/null 2>&1 || true + + echo "🔄 Updating Helm repos..." + helm repo update >/dev/null + + echo "🚀 Upgrading OpenEBS LocalPV..." + helm upgrade --install openebs-localpv openebs-localpv/localpv-provisioner \ + "${helm_ctx_args[@]}" \ + --version "${LOCALPV_VERSION}" \ + --namespace openebs-system --create-namespace \ + --set hostpathClass.enabled=true \ + --set hostpathClass.name=openebs-hostpath \ + --set hostpathClass.isDefaultClass=true \ + --set deviceClass.enabled=false \ + --wait --timeout 10m0s + + echo "📦 OpenEBS Pods in openebs-system namespace:" + kubectl "${kubectl_ctx_args[@]}" get pods -n openebs-system + echo "✅ OpenEBS LocalPV upgrade complete" +} + +################################ +# RKE2 Upgrade +# (replaces onprem-ke-installer deb) +################################ + +# RKE2 version ladder — must be traversed one minor version at a time. +# Matches the Go implementation in mage/upgrade.go:determineUpgradePath() +RKE2_VERSION_LADDER=( + "v1.30.14+rke2r2" # Patch update within 1.30 + "v1.31.13+rke2r1" # Upgrade to 1.31 + "v1.32.9+rke2r1" # Upgrade to 1.32 + "v1.33.5+rke2r1" # Upgrade to 1.33 + "v1.34.1+rke2r1" # Upgrade to 1.34.1 + "v1.34.5+rke2r1" # Final target version +) + +# Extract minor version: "v1.30.14+rke2r2" -> "v1.30" +_rke2_minor() { + local ver="$1" + echo "${ver}" | cut -d. -f1,2 +} + +# Determine the upgrade path from current version to target version. +# Outputs space-separated list of versions to upgrade through. 
+# Compute the ordered list of RKE2 versions to step through, one ladder entry
+# per line on stdout, from $1 (current version) to $2 (target version), using
+# the global RKE2_VERSION_LADDER. Minor-version matches are used when the
+# exact version string is not a ladder entry.
+# Outputs:   newline-separated versions (may be empty if already past target)
+determine_rke2_upgrade_path() {
+  local current="$1"
+  local target="$2"
+
+  local current_minor target_minor
+  current_minor="$(_rke2_minor "${current}")"
+  target_minor="$(_rke2_minor "${target}")"
+
+  local start_idx=-1
+  local end_idx=-1
+  local i
+
+  # Find starting index (version after current). Exact match wins and stops
+  # the scan; otherwise the first entry sharing the current minor is kept.
+  for i in "${!RKE2_VERSION_LADDER[@]}"; do
+    local v="${RKE2_VERSION_LADDER[$i]}"
+    if [[ "${v}" == "${current}" ]]; then
+      start_idx=$i
+      break
+    fi
+    local v_minor
+    v_minor="$(_rke2_minor "${v}")"
+    if [[ "${v_minor}" == "${current_minor}" && ${start_idx} -eq -1 ]]; then
+      start_idx=$i
+    fi
+  done
+
+  # Unknown current version: start from the bottom of the ladder; otherwise
+  # begin at the entry after the matched one.
+  if [[ ${start_idx} -eq -1 ]]; then
+    start_idx=0
+  else
+    start_idx=$((start_idx + 1))
+  fi
+
+  # Find ending index: exact target match, else the last entry sharing the
+  # target minor, else default to the top of the ladder.
+  for i in "${!RKE2_VERSION_LADDER[@]}"; do
+    local v="${RKE2_VERSION_LADDER[$i]}"
+    if [[ "${v}" == "${target}" ]]; then
+      end_idx=$i
+      break
+    fi
+    local v_minor
+    v_minor="$(_rke2_minor "${v}")"
+    if [[ "${v_minor}" == "${target_minor}" ]]; then
+      end_idx=$i
+    fi
+  done
+
+  if [[ ${end_idx} -eq -1 ]]; then
+    end_idx=$(( ${#RKE2_VERSION_LADDER[@]} - 1 ))
+  fi
+
+  # Build path
+  if (( start_idx <= end_idx )); then
+    for (( i=start_idx; i<=end_idx; i++ )); do
+      echo "${RKE2_VERSION_LADDER[$i]}"
+    done
+  fi
+}
+
+# Wait for node to report a specific kubelet version.
+# Arguments: $1 - node name as "node/<name>" (from kubectl get nodes -o name)
+#            $2 - expected kubelet version string (e.g. v1.34.5+rke2r1)
+# Exits non-zero after WAIT_TIMEOUT_SECONDS without a match.
+rke2_wait_for_version() {
+  local node_name="$1"
+  local expected_version="$2"
+  local deadline=$((SECONDS + WAIT_TIMEOUT_SECONDS))
+
+  echo "👉 Waiting for node to report kubelet version ${expected_version}..." 
+ + while true; do + local found_version + found_version="$(kubectl get "${node_name}" -o jsonpath='{.status.nodeInfo.kubeletVersion}' 2>/dev/null || echo "")" + + if [[ "${found_version}" == "${expected_version}" ]]; then + echo "✅ Node reports version ${expected_version}" + return 0 + fi + + if (( SECONDS >= deadline )); then + echo "❌ Timed out waiting for node version ${expected_version} (current: ${found_version})" + exit 1 + fi + + echo " Current version: ${found_version}, waiting... ($(( deadline - SECONDS ))s remaining)" + sleep "${WAIT_INTERVAL_SECONDS}" + done +} + +# Wait for node to be Ready and schedulable +rke2_wait_for_node_ready() { + local node_name="$1" + local deadline=$((SECONDS + WAIT_TIMEOUT_SECONDS)) + + echo "👉 Waiting for node to be Ready and schedulable..." + + while true; do + local ready + ready="$(kubectl get "${node_name}" -o jsonpath='{range .status.conditions[?(@.type=="Ready")]}{.status}{end}' 2>/dev/null || echo "Unknown")" + + local schedulable="True" + local taints + taints="$(kubectl get "${node_name}" -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +taints = data.get('spec', {}).get('taints', []) +noschedule = [t for t in taints if t.get('effect') == 'NoSchedule'] +print('True' if len(noschedule) == 0 else 'False') +" 2>/dev/null || echo "True")" + schedulable="${taints}" + + if [[ "${ready}" == "True" && "${schedulable}" == "True" ]]; then + echo "✅ Node is Ready and schedulable" + return 0 + fi + + if (( SECONDS >= deadline )); then + echo "❌ Timed out waiting for node Ready (ready=${ready}, schedulable=${schedulable})" + exit 1 + fi + + echo " Node status: ready=${ready}, schedulable=${schedulable}, waiting... 
($(( deadline - SECONDS ))s remaining)" + sleep "${WAIT_INTERVAL_SECONDS}" + done +} + +rke2_upgrade() { + require_cmd sudo + require_cmd kubectl + require_cmd curl + + if [[ "$(uname -s)" != "Linux" ]]; then + echo "❌ RKE2 upgrade currently supports Linux only" + exit 1 + fi + + log_info "=== RKE2 Cluster Upgrade ===" + + # Get the node name + local node_name + node_name="$(kubectl get nodes -o name | head -1)" + if [[ -z "${node_name}" ]]; then + echo "❌ No nodes found in the cluster" + exit 1 + fi + echo "👉 Orchestrator node: ${node_name}" + + # Get current version + local current_version + current_version="$(kubectl get "${node_name}" -o jsonpath='{.status.nodeInfo.kubeletVersion}')" + echo "👉 Current RKE2 version: ${current_version}" + echo "👉 Target RKE2 version: ${RKE2_TARGET_VERSION}" + + # Check if already at target + if [[ "${current_version}" == "${RKE2_TARGET_VERSION}" ]]; then + echo "✅ RKE2 is already at the target version ${RKE2_TARGET_VERSION}. No upgrade needed." + return 0 + fi + + # Determine upgrade path + local -a upgrade_path + mapfile -t upgrade_path < <(determine_rke2_upgrade_path "${current_version}" "${RKE2_TARGET_VERSION}") + + if [[ ${#upgrade_path[@]} -eq 0 ]]; then + echo "❌ Unable to determine upgrade path from ${current_version} to ${RKE2_TARGET_VERSION}" + exit 1 + fi + + echo "👉 Upgrade path: ${upgrade_path[*]}" + + # Install system-upgrade-controller + echo "👉 Installing system-upgrade-controller ${SYSTEM_UPGRADE_CONTROLLER_VERSION}..." + kubectl apply -f \ + "https://github.com/rancher/system-upgrade-controller/releases/download/${SYSTEM_UPGRADE_CONTROLLER_VERSION}/system-upgrade-controller.yaml" + + echo "👉 Waiting for system-upgrade-controller deployment to be ready..." + kubectl rollout status deployment/system-upgrade-controller \ + -n system-upgrade --timeout=10m + + # Wait for CRDs to propagate + sleep 15 + + # Delete any existing upgrade Plans + echo "👉 Cleaning up existing upgrade Plans..." 
+ kubectl delete -n system-upgrade plans.upgrade.cattle.io --all 2>/dev/null || true + + # Label node for upgrade + echo "👉 Labeling node for upgrade..." + kubectl label "${node_name}" rke2-upgrade=true --overwrite + + # Perform sequential upgrades through the version ladder + local total=${#upgrade_path[@]} + local step=0 + for rke2_version in "${upgrade_path[@]}"; do + step=$((step + 1)) + echo "" + log_info "--- RKE2 Upgrade Step ${step}/${total}: upgrading to ${rke2_version} ---" + + # Generate and apply upgrade Plan + kubectl apply -f - </dev/null || true + + kubectl delete -f \ + "https://github.com/rancher/system-upgrade-controller/releases/download/${SYSTEM_UPGRADE_CONTROLLER_VERSION}/system-upgrade-controller.yaml" \ + 2>/dev/null || true + + # Refresh kubeconfig and kubectl binary after upgrade + echo "👉 Refreshing kubeconfig and tools..." + rke2_refresh_kubeconfig_and_tools + + log_info "=== RKE2 Cluster Upgrade Complete ===" +} + +rke2_refresh_kubeconfig_and_tools() { + mkdir -p "${HOME}/.kube" + sudo cp /etc/rancher/rke2/rke2.yaml "${HOME}/.kube/config" 2>/dev/null || true + sudo chown -R "${USER}:${USER}" "${HOME}/.kube" 2>/dev/null || true + chmod 600 "${HOME}/.kube/config" 2>/dev/null || true + export KUBECONFIG="${HOME}/.kube/config" + + # Copy updated binaries + if [[ -f /var/lib/rancher/rke2/bin/ctr ]]; then + sudo cp /var/lib/rancher/rke2/bin/ctr /usr/local/bin/ || true + fi + if [[ -f /var/lib/rancher/rke2/bin/kubectl ]]; then + sudo cp /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/ || true + fi +} + +################################ +# K3s Upgrade +################################ + +k3s_configure_registries() { + if [[ -z "${DOCKER_USERNAME}" || -z "${DOCKER_PASSWORD}" ]]; then + return 1 + fi + + sudo mkdir -p /etc/rancher/k3s + sudo tee /etc/rancher/k3s/registries.yaml >/dev/null </dev/null | grep -oP 'v[\d.]+' | head -1 || true) + fi + + if [[ -z "${KIND_VERSION}" ]]; then + version="$(get_latest_kind)" + else + 
version="${KIND_VERSION}" + fi + + # If we couldn't resolve target version (e.g. API rate limit) and kind + # is already installed, keep the existing binary. + if [[ -z "${version}" ]]; then + if [[ -n "${current_version}" ]]; then + log_info "Could not determine latest KIND version (GitHub API rate limit?). Keeping existing KIND ${current_version}." + echo "✅ KIND ${current_version} already installed — skipping download" + return 0 + else + log_error "Cannot determine KIND version to install and no existing binary found." + return 1 + fi + fi + + # Skip download if the requested version is already installed + if [[ "${version}" == "${current_version}" ]]; then + echo "✅ KIND ${version} already installed — skipping download" + return 0 + fi + + echo "👉 Installing KIND ${version}..." + curl -Lo kind "https://kind.sigs.k8s.io/dl/${version}/kind-${os}-${arch}" + chmod +x kind + sudo mv kind /usr/local/bin/kind + echo "✅ KIND ${version} installed" +} + +create_kind_config() { + local cfg_file="$1" + + cat < "${cfg_file}" +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +networking: + apiServerAddress: "127.0.0.1" + apiServerPort: ${KIND_API_PORT} +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: KubeletConfiguration + maxPods: 250 + serializeImagePulls: false +EOF +} + +get_kind_node_k8s_version() { + # Returns the K8s server version (e.g. 
v1.35.0) from the running cluster + kubectl version --short 2>/dev/null | awk '/Server Version:/ {print $3}' || true +} + +get_kind_target_k8s_version() { + # Ask the kind binary what K8s version it would create + # kind images list the default node image tag + local img + img=$(kind build node-image --help 2>&1 | grep -oP 'kindest/node:v[\d.]+' | head -1 || true) + if [[ -n "$img" ]]; then + echo "${img#kindest/node:}" + else + echo "" + fi +} + +kind_upgrade() { + require_cmd kubectl + + log_info "=== KIND Cluster Upgrade ===" + + local kind_config + kind_config="/tmp/kind-${KIND_CLUSTER_NAME}-${KIND_API_PORT}.yaml" + local context="kind-${KIND_CLUSTER_NAME}" + + # ── Step 1: Install/update kind binary ── + install_kind_bin + + # ── Step 2: Decide whether to recreate ── + local need_recreate="false" + local current_version="" cluster_exists="false" + + if cmd_exists kind && kind get clusters 2>/dev/null | grep -qx "${KIND_CLUSTER_NAME}"; then + cluster_exists="true" + # Export KIND kubeconfig so kubectl talks to the right cluster + kind export kubeconfig --name "${KIND_CLUSTER_NAME}" 2>/dev/null || true + current_version=$(kubectl --context "${context}" version -o json 2>/dev/null \ + | grep -oP '"gitVersion"\s*:\s*"\Kv[^"]+' | tail -1 || true) + log_info "Existing KIND cluster found: ${KIND_CLUSTER_NAME} (K8s ${current_version:-unknown})" + fi + + if [[ "${KIND_FORCE_RECREATE}" == "true" ]]; then + log_info "--force-recreate specified — cluster will be deleted and recreated." + need_recreate="true" + elif [[ "${cluster_exists}" == "false" ]]; then + log_info "No existing cluster found — will create a new one." + need_recreate="true" + else + # Compare current K8s version with the one KIND would create. + # KIND embeds a default K8s version per release. If user ran + # --kind-version to get a newer release, the default K8s version + # may differ. When we can't determine the target, fall back to + # keeping the cluster (safe path). 
+ local target_version="" + target_version=$(kind version -q 2>/dev/null || true) + # kind doesn't expose the target K8s version directly; use the + # image tag from "kind create cluster --help" or default images. + log_info "KIND binary updated. Checking if cluster recreation is needed..." + + if [[ -n "${current_version}" ]]; then + echo " Current cluster K8s version: ${current_version}" + echo " KIND binary version: $(kind version 2>/dev/null || echo unknown)" + echo "" + echo "✅ Existing cluster is intact — skipping recreation." + echo " The cluster and all workloads are preserved." + echo " Use --force-recreate if you need a fresh cluster." + log_info "Cluster preserved (K8s ${current_version}). Skipping delete/create." + + # Ensure kubectl context is set correctly + kind export kubeconfig --name "${KIND_CLUSTER_NAME}" 2>/dev/null || true + kubectl config use-context "${context}" 2>/dev/null || true + wait_for_k8s_ready "${context}" + log_info "=== KIND Cluster Upgrade Complete (cluster preserved) ===" + return 0 + else + log_warn "Could not determine current K8s version — recreating cluster to be safe." + need_recreate="true" + fi + fi + + # ── Step 3: Recreate cluster (only if needed) ── + if [[ "${need_recreate}" == "true" ]]; then + echo "⚠️ KIND clusters do not support in-place K8s upgrades." + echo " The cluster will be deleted and recreated." + + # Kill any stale processes holding the API port + local stale_pid + stale_pid=$(sudo ss -tlnp "sport = :${KIND_API_PORT}" 2>/dev/null \ + | awk '/LISTEN/ {match($0, /pid=([0-9]+)/, m); print m[1]}' | head -1 || true) + if [[ -n "${stale_pid}" ]]; then + log_info "Killing stale process (PID ${stale_pid}) on port ${KIND_API_PORT}..." + sudo kill "${stale_pid}" 2>/dev/null || true + sleep 2 + fi + + # Delete existing cluster if present + if [[ "${cluster_exists}" == "true" ]]; then + echo "👉 Deleting existing KIND cluster: ${KIND_CLUSTER_NAME}..." 
+ kind delete cluster --name "${KIND_CLUSTER_NAME}" + fi + + # Create fresh cluster + create_kind_config "${kind_config}" + + echo "👉 Creating KIND cluster: ${KIND_CLUSTER_NAME} (API @ 127.0.0.1:${KIND_API_PORT})" + kind create cluster --name "${KIND_CLUSTER_NAME}" --config "${kind_config}" + + rm -f "${kind_config}" + + echo "✅ KIND cluster recreated" + kubectl cluster-info --context "${context}" + + wait_for_k8s_ready "${context}" + fi + + log_info "=== KIND Cluster Upgrade Complete ===" +} + +################################ +# Main Dispatch +################################ + +# Flexible argument parsing: +# ./pre-orch-upgrade.sh -> auto-detect provider, action=upgrade +# ./pre-orch-upgrade.sh upgrade -> auto-detect provider, action=upgrade +# ./pre-orch-upgrade.sh rke2 -> provider=rke2, action=upgrade +# ./pre-orch-upgrade.sh rke2 upgrade -> provider=rke2, action=upgrade +# ./pre-orch-upgrade.sh --skip-os-config -> auto-detect, options start immediately + +PROVIDER="" +ACTION="upgrade" + +# Check if first arg is a known provider, 'upgrade', a flag, or missing +if [[ $# -ge 1 ]]; then + case "$1" in + kind|k3s|rke2) + PROVIDER="$1" + shift + # Check if next arg is 'upgrade' (optional, skip if flag or missing) + if [[ $# -ge 1 && "$1" == "upgrade" ]]; then + shift + fi + ;; + upgrade) + # No provider given, just the action + shift + ;; + -h|--help) + usage + exit 0 + ;; + -*) # Flags — no provider, no action word, go straight to option parsing + ;; + *) + echo "❌ Unknown argument: $1 (expected: kind, k3s, rke2, upgrade, or options)" + usage + exit 1 + ;; + esac +fi + +# Auto-detect provider if not explicitly specified +if [[ -z "${PROVIDER}" ]]; then + echo "👉 No Kubernetes provider specified, auto-detecting..." + if PROVIDER="$(detect_k8s_provider)"; then + echo "✅ Detected Kubernetes provider: ${PROVIDER}" + else + echo "❌ Could not auto-detect Kubernetes provider." 
+ echo " Please specify the provider explicitly: $(basename "$0") upgrade" + exit 1 + fi +fi + +log_info "Provider: ${PROVIDER} | Action: ${ACTION}" + +# Parse long options +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + + # Global + --wait-timeout) + WAIT_TIMEOUT_SECONDS="$2" + shift 2 + ;; + --wait-interval) + WAIT_INTERVAL_SECONDS="$2" + shift 2 + ;; + --localpv-version) + LOCALPV_VERSION="$2" + shift 2 + ;; + --helm-version) + HELM_VERSION="$2" + shift 2 + ;; + --skip-os-config) + SKIP_OS_CONFIG="true" + shift + ;; + + # KIND + --cluster-name) + KIND_CLUSTER_NAME="$2" + shift 2 + ;; + --api-port) + KIND_API_PORT="$2" + shift 2 + ;; + --kind-version) + KIND_VERSION="$2" + shift 2 + ;; + --force-recreate) + KIND_FORCE_RECREATE="true" + shift + ;; + + # K3s + --k3s-version) + K3S_VERSION="$2" + shift 2 + ;; + + # RKE2 + --rke2-target-version) + RKE2_TARGET_VERSION="$2" + shift 2 + ;; + + # Shared + --docker-username) + DOCKER_USERNAME="$2" + shift 2 + ;; + --docker-password) + DOCKER_PASSWORD="$2" + shift 2 + ;; + + *) + echo "❌ Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Step 1: OS Configuration (common for all providers) +if [[ "${SKIP_OS_CONFIG}" != "true" ]]; then + upgrade_os_config +else + log_info "Skipping OS configuration (--skip-os-config)" +fi + +# Step 2: Provider-specific Kubernetes upgrade +case "${PROVIDER}" in + kind) + kind_upgrade + # Step 3: OpenEBS LocalPV + upgrade_openebs_localpv "kind-${KIND_CLUSTER_NAME}" + ;; + k3s) + k3s_upgrade + # Step 3: OpenEBS LocalPV + upgrade_openebs_localpv + ;; + rke2) + rke2_upgrade + # Step 3: OpenEBS LocalPV + upgrade_openebs_localpv + ;; + *) + echo "❌ Unknown provider: ${PROVIDER} (expected: kind, k3s, or rke2)" + usage + exit 1 + ;; +esac + +echo "" +log_info "=========================================" +log_info "Pre-upgrade complete for provider: ${PROVIDER}" +log_info "=========================================" diff --git 
a/on-prem-installers/onprem/pre-upgrade-backup.sh b/on-prem-installers/onprem/pre-upgrade-backup.sh
new file mode 100755
index 0000000000..d62e929955
--- /dev/null
+++ b/on-prem-installers/onprem/pre-upgrade-backup.sh
@@ -0,0 +1,628 @@
#!/usr/bin/env bash

# SPDX-FileCopyrightText: 2026 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

# Script Name: pre-upgrade-backup.sh
# Description: Standalone backup script for Edge Orchestrator pre-upgrade.
#              Creates backups of all critical data before running the upgrade:
#              - PostgreSQL database dump (pg_dumpall)
#              - PostgreSQL superuser secret
#              - PostgreSQL service passwords (9 services)
#              - MPS/RPS connection secrets
#              - Gitea secrets cleanup (pre-backup)
#              - PersistentVolume LVM snapshots (optional, -b flag)
#              - RKE2 etcd snapshot (optional, -b flag, RKE2 only)
#              - K3s etcd snapshot (optional, -b flag, K3s only)
#
# This script should be run BEFORE pre-orch-upgrade.sh and post-orch-upgrade.sh.
#
# Usage:
#   ./pre-upgrade-backup.sh [options]
#
# Options:
#   -b    Enable PV and etcd backup (requires LVM + sufficient VG space)
#   -s    Skip interactive prompts (non-interactive mode)
#   -h    Show help

set -euo pipefail

export PATH="/usr/local/bin:${PATH}"
export KUBECONFIG="${KUBECONFIG:-/home/$USER/.kube/config}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/onprem.env"

# shellcheck disable=SC1091
# Provides: check_postgres, backup_postgres, restore_postgres, local_backup_path, etc.
source "${SCRIPT_DIR}/upgrade_postgres.sh"

################################
# Logging
################################

LOG_FILE="pre_upgrade_backup_$(date +'%Y%m%d_%H%M%S').log"
LOG_DIR="/var/log/orch-upgrade"

sudo mkdir -p "$LOG_DIR"
sudo chown "$(whoami):$(whoami)" "$LOG_DIR"

FULL_LOG_PATH="$LOG_DIR/$LOG_FILE"

# Print a timestamped line to console and the log file.
log_message() {
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $*" | tee -a "$FULL_LOG_PATH"
}

log_info() {
    log_message "INFO: $*"
}

log_warn() {
    log_message "WARN: $*"
}

log_error() {
    log_message "ERROR: $*"
}

# Redirect all output to both console and log file
exec > >(tee -a "$FULL_LOG_PATH")
exec 2> >(tee -a "$FULL_LOG_PATH" >&2)

log_info "Starting pre-upgrade backup script"
log_info "Log file: $FULL_LOG_PATH"

################################
# Configuration
################################

BACKUP_PVS="${BACKUP_PVS:-false}"
SKIP_INTERACTIVE="${SKIP_INTERACTIVE:-false}"
BACKUP_DIR="$(pwd)"

# From upgrade_postgres.sh (sourced above):
#   postgres_namespace=orch-database
#   local_backup_path=./orch-database_backup.sql
#   podname=postgresql-cluster-1
#   POSTGRES_USERNAME=postgres
#   application_namespace=onprem

################################
# Kubernetes Provider Detection
################################

# Detect which Kubernetes distribution is running on this host.
# Outputs one of: rke2, k3s, kind, unknown.
detect_k8s_provider() {
    if systemctl is-active --quiet rke2-server.service 2>/dev/null; then
        echo "rke2"
    elif systemctl is-active --quiet k3s.service 2>/dev/null; then
        echo "k3s"
    elif command -v kind &>/dev/null && kind get clusters 2>/dev/null | grep -q .; then
        echo "kind"
    elif [[ -d /etc/rancher/rke2 ]]; then
        echo "rke2"
    elif [[ -d /etc/rancher/k3s ]]; then
        echo "k3s"
    else
        echo "unknown"
    fi
}

################################
# Prerequisites
################################

# Verify kubectl is installed and the cluster is reachable.
check_prerequisites() {
    log_info "Checking prerequisites..."

    if ! command -v kubectl &>/dev/null; then
        log_error "kubectl not found"
        exit 1
    fi

    if ! kubectl cluster-info &>/dev/null; then
        log_error "Cannot reach Kubernetes cluster. Check KUBECONFIG."
        exit 1
    fi

    log_info "Prerequisites met."
}

################################
# Phase 1: PostgreSQL Health Check
################################

pre_backup_postgres_check() {
    log_info "=== Phase 1: PostgreSQL health check ==="

    # Do NOT call check_postgres() from upgrade_postgres.sh here — it has an
    # interactive read -rp prompt that blocks in non-interactive / piped runs.
    # Instead, perform the health check inline.

    if [[ -f "$local_backup_path" ]]; then
        log_warn "Existing backup file found: $local_backup_path"
        if [[ "$SKIP_INTERACTIVE" == "true" ]]; then
            log_info "Non-interactive mode (-s): continuing with existing backup file."
        else
            read -rp "A backup file already exists. Type 'Continue' to proceed or Ctrl-C to abort: " confirm
            if [[ ! "$confirm" =~ ^[Cc][Oo][Nn][Tt][Ii][Nn][Uu][Ee]$ ]]; then
                log_error "User aborted."
                exit 1
            fi
        fi
        log_info "PostgreSQL health check skipped (recovery from previous run)."
        return 0
    fi

    # shellcheck disable=SC2154 # podname, postgres_namespace from upgrade_postgres.sh
    local pod_status
    pod_status=$(kubectl get pods -n "$postgres_namespace" "$podname" \
        -o jsonpath='{.status.phase}' 2>/dev/null || true)

    if [[ "$pod_status" != "Running" ]]; then
        log_error "PostgreSQL pod $podname is not running (status: ${pod_status:-not found})."
        exit 1
    fi

    log_info "PostgreSQL pod $podname is healthy (Running)."
}

################################
# Phase 2: PostgreSQL Superuser Secret Backup
################################

backup_postgres_secret() {
    log_info "=== Phase 2: Backing up PostgreSQL superuser secret ==="

    # Idempotent: keep an existing backup from a previous run.
    if [[ -f "${BACKUP_DIR}/postgres_secret.yaml" ]]; then
        log_info "postgres_secret.yaml already exists, skipping."
        return 0
    fi

    if kubectl get secret -n orch-database postgresql-cluster-superuser >/dev/null 2>&1; then
        kubectl get secret -n orch-database postgresql-cluster-superuser \
            -o yaml > "${BACKUP_DIR}/postgres_secret.yaml"
        log_info "PostgreSQL superuser secret saved to postgres_secret.yaml"
    else
        log_warn "postgresql-cluster-superuser secret not found, skipping."
    fi
}

################################
# Phase 3: PostgreSQL Database Dump
################################

backup_postgres_database() {
    log_info "=== Phase 3: Backing up PostgreSQL databases ==="

    # backup_postgres() from upgrade_postgres.sh handles idempotency
    backup_postgres

    # shellcheck disable=SC2154 # local_backup_path defined in upgrade_postgres.sh
    if [[ -f "$local_backup_path" ]]; then
        log_info "PostgreSQL database backup saved to $local_backup_path"
    else
        log_error "PostgreSQL database backup failed!"
        exit 1
    fi
}

################################
# Phase 4: Gitea Secrets Cleanup
################################

# Delete stale Gitea token secrets before the backup so they are
# regenerated cleanly after the upgrade.
cleanup_gitea_secrets() {
    log_info "=== Phase 4: Cleaning up Gitea secrets before backup ==="

    local install_gitea="${INSTALL_GITEA:-true}"

    if [[ "$install_gitea" != "true" ]]; then
        log_info "Gitea not installed, skipping secrets cleanup."
        return 0
    fi

    local secrets=("gitea-apporch-token" "gitea-argocd-token" "gitea-clusterorch-token")
    for secret in "${secrets[@]}"; do
        if kubectl get secret "$secret" -n gitea >/dev/null 2>&1; then
            kubectl delete secret "$secret" -n gitea
            log_info "Deleted Gitea secret: $secret"
        fi
    done

    log_info "Gitea secrets cleanup completed."
}

################################
# Phase 5: PostgreSQL Service Passwords
################################

backup_postgres_passwords() {
    log_info "=== Phase 5: Backing up PostgreSQL service passwords ==="

    if [[ -s "${BACKUP_DIR}/postgres-secrets-password.txt" ]]; then
        log_info "postgres-secrets-password.txt already exists, skipping."
        return 0
    fi

    # Values are stored base64-encoded, exactly as they appear in the
    # secrets (missing secrets yield empty entries rather than aborting).
    local alerting catalog inventory iam_tenancy platform_keycloak vault_pw postgresql mps rps

    alerting=$(kubectl get secret alerting-local-postgresql -n orch-infra \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    catalog=$(kubectl get secret app-orch-catalog-local-postgresql -n orch-app \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    inventory=$(kubectl get secret inventory-local-postgresql -n orch-infra \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    iam_tenancy=$(kubectl get secret iam-tenancy-local-postgresql -n orch-iam \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    platform_keycloak=$(kubectl get secret platform-keycloak-local-postgresql -n orch-platform \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    vault_pw=$(kubectl get secret vault-local-postgresql -n orch-platform \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    postgresql=$(kubectl get secret orch-database-postgresql -n orch-database \
        -o jsonpath='{.data.password}' 2>/dev/null || true)
    mps=$(kubectl get secret mps-local-postgresql -n orch-infra \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)
    rps=$(kubectl get secret rps-local-postgresql -n orch-infra \
        -o jsonpath='{.data.PGPASSWORD}' 2>/dev/null || true)

    {
        echo "Alerting: $alerting"
        echo "CatalogService: $catalog"
        echo "Inventory: $inventory"
        echo "IAMTenancy: $iam_tenancy"
        echo "PlatformKeycloak: $platform_keycloak"
        echo "Vault: $vault_pw"
        echo "PostgreSQL: $postgresql"
        echo "Mps: $mps"
        echo "Rps: $rps"
    } > "${BACKUP_DIR}/postgres-secrets-password.txt"

    log_info "PostgreSQL service passwords saved to postgres-secrets-password.txt"
}

################################
# Phase 6: MPS/RPS Secret Backup
################################

backup_mps_rps_secrets() {
    log_info "=== Phase 6: Backing up MPS/RPS secrets ==="

    if kubectl get secret mps -n orch-infra >/dev/null 2>&1; then
        kubectl get secret mps -n orch-infra -o yaml > "${BACKUP_DIR}/mps_secret.yaml"
        log_info "MPS secret backed up to mps_secret.yaml"
    else
        log_info "MPS secret not found, skipping."
    fi

    if kubectl get secret rps -n orch-infra >/dev/null 2>&1; then
        kubectl get secret rps -n orch-infra -o yaml > "${BACKUP_DIR}/rps_secret.yaml"
        log_info "RPS secret backed up to rps_secret.yaml"
    else
        log_info "RPS secret not found, skipping."
    fi
}

################################
# Phase 7: PV LVM Backup (optional)
################################

# Get LV size and format it for lvcreate command
get_lv_size() {
    local lv_path="$1"

    local size_output size unit formatted_size
    size_output=$(sudo lvdisplay "$lv_path" | grep "LV Size" | awk '{print $3, $4}')
    size=$(echo "$size_output" | awk '{print $1}')
    unit=$(echo "$size_output" | awk '{print $2}')

    case $unit in
        GiB) formatted_size="${size}G" ;;
        MiB) formatted_size="${size}M" ;;
        TiB) formatted_size="${size}T" ;;
        *)
            log_error "Unsupported LV size unit: $unit"
            exit 1
            ;;
    esac

    echo "$formatted_size"
}

# Print 1 when the VG has enough free space to duplicate the used space
# (plus a 5% margin), 0 otherwise.
check_space_for_backup() {
    local vg_info vsize vfree vused margin enough_space

    vg_info=$(sudo vgs --noheadings --units g --nosuffix -o vg_size,vg_free 2>/dev/null)
    vsize=$(echo "$vg_info" | awk '{print $1}')
    vfree=$(echo "$vg_info" | awk '{print $2}')
    vused=$(echo "$vsize - $vfree" | bc)

    margin=$(echo "$vused * 0.05" | bc)
    enough_space=$(echo "$vfree > ($vused + $margin)" | bc)

    echo "$enough_space"
}

backup_pvs() {
    log_info "=== Phase 7a: Backing up PersistentVolumes (LVM snapshots) ==="

    local space_check_result
    space_check_result=$(check_space_for_backup)
    if [[ "$space_check_result" -eq 0 ]]; then
        log_error "Not enough space for PVs backup in VG"
        return 1
    fi

    local vg_name="lvmvg"
    local vol_snap_class_name="openebs-lvm-vsc"
    local backup_date
    backup_date=$(date +'%Y-%m-%d-%H_%M')

    local pvs_to_backup
    pvs_to_backup=$(kubectl get pvc --all-namespaces \
        -o jsonpath='{range .items[?(@.status.phase=="Bound")]}{.metadata.name}{" "}{.metadata.namespace}{" "}{.spec.volumeName}{"\n"}{end}')

    # Fixed: the original used `echo "$pvs_to_backup" | while ...`, which
    # runs the loop in a pipeline subshell so `exit 1` on snapshot
    # timeout only killed the subshell. A here-string keeps the loop in
    # the current shell.
    while IFS= read -r line; do
        [[ -z "$line" ]] && continue
        local pvc_name pvc_namespace lv_name
        read -r pvc_name pvc_namespace lv_name <<<"$line"

        # dkam-pvc data gets re-populated, no need to backup
        if [[ "$pvc_name" == "dkam-pvc" ]]; then
            continue
        fi

        log_info "Backing up PVC: $pvc_name (namespace: $pvc_namespace)"

        # NOTE(review): the VolumeSnapshot manifest and readiness loop
        # were lost to patch corruption; reconstructed from the
        # snapshot.storage.k8s.io/v1 API — verify class name/timeouts.
        kubectl apply -f - <<EOF
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: ${pvc_name}-snap
  namespace: ${pvc_namespace}
spec:
  volumeSnapshotClassName: ${vol_snap_class_name}
  source:
    persistentVolumeClaimName: ${pvc_name}
EOF

        # Wait for the snapshot to become readyToUse
        local attempts=0 max_attempts=60
        while [[ "$(kubectl get volumesnapshot -n "$pvc_namespace" "${pvc_name}-snap" \
            -o jsonpath='{.status.readyToUse}' 2>/dev/null)" != "true" ]]; do
            sleep 5
            attempts=$((attempts + 1))
            if (( attempts >= max_attempts )); then
                log_error "Timeout waiting for VolumeSnapshot ${pvc_name}-snap"
                exit 1
            fi
        done

        # Create backup LV on VG
        local formatted_size bak_lv_name
        formatted_size=$(get_lv_size "/dev/$vg_name/$lv_name")
        bak_lv_name="${pvc_name}-${pvc_namespace}-bak-${backup_date}"

        sudo lvcreate -n "$bak_lv_name" -L "$formatted_size" "$vg_name" -y
        sudo mkfs.ext4 "/dev/$vg_name/$bak_lv_name"

        # Copy data from snapshot to backup LV
        sudo mkdir -p /mnt/original-lv /mnt/backup-lv

        local snap_name
        snap_name=$(sudo lvs --options lv_name,origin --noheadings \
            | grep "$lv_name" \
            | awk -v lv_name="$lv_name" '$1 != lv_name {print $1}')

        sudo mount "/dev/$vg_name/$snap_name" /mnt/original-lv
        sudo mount "/dev/$vg_name/$bak_lv_name" /mnt/backup-lv
        sudo cp -a /mnt/original-lv/. /mnt/backup-lv/
        sudo umount "/dev/$vg_name/$snap_name"
        sudo umount "/dev/$vg_name/$bak_lv_name"

        sudo rm -rf /mnt/original-lv /mnt/backup-lv

        # Cleanup snapshot
        kubectl delete volumesnapshot -n "$pvc_namespace" "${pvc_name}-snap"

        log_info "PVC $pvc_name backed up to LV $bak_lv_name"
    done <<<"$pvs_to_backup"

    log_info "PV backup completed."
}

################################
# Phase 8: etcd Snapshot (optional)
################################

backup_etcd() {
    log_info "=== Phase 7b: Taking etcd snapshot ==="

    local provider
    provider=$(detect_k8s_provider)

    case "$provider" in
        rke2)
            local snapshot_name
            # Use DEPLOY_VERSION or current RKE2 version instead of dpkg-query
            local rke2_version
            rke2_version=$(rke2 --version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown")
            snapshot_name="pre-upgrade-snapshot-${rke2_version}-$(date +'%Y%m%d_%H%M%S')"

            log_info "Taking RKE2 etcd snapshot: $snapshot_name"
            sudo rke2 etcd-snapshot save --name "$snapshot_name"

            sudo mkdir -p /var/orch-backups/
            sudo find /var/lib/rancher/rke2/server/db/snapshots/ \
                -name "pre-upgrade-snapshot-*" \
                -exec mv {} /var/orch-backups/ \;

            log_info "RKE2 etcd snapshot saved to /var/orch-backups/"
            ;;

        k3s)
            local snapshot_name
            local k3s_version
            k3s_version=$(k3s --version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown")
            snapshot_name="pre-upgrade-snapshot-${k3s_version}-$(date +'%Y%m%d_%H%M%S')"

            log_info "Taking K3s etcd snapshot: $snapshot_name"
            sudo k3s etcd-snapshot save --name "$snapshot_name"

            sudo mkdir -p /var/orch-backups/
            sudo find /var/lib/rancher/k3s/server/db/snapshots/ \
                -name "pre-upgrade-snapshot-*" \
                -exec mv {} /var/orch-backups/ \;

            log_info "K3s etcd snapshot saved to /var/orch-backups/"
            ;;

        kind)
            log_info "KIND clusters are ephemeral — etcd snapshots not applicable."
            ;;

        *)
            log_warn "Unknown K8s provider '$provider' — skipping etcd snapshot."
            ;;
    esac
}

################################
# Summary
################################

print_summary() {
    log_info "================================================"
    log_info " Pre-Upgrade Backup Summary"
    log_info "================================================"

    local files=()
    [[ -f "${BACKUP_DIR}/postgres_secret.yaml" ]] && \
        files+=(" postgres_secret.yaml (PostgreSQL superuser secret)")
    [[ -f "$local_backup_path" ]] && \
        files+=(" ${local_backup_path} (PostgreSQL database dump)")
    [[ -f "${BACKUP_DIR}/postgres-secrets-password.txt" ]] && \
        files+=(" postgres-secrets-password.txt (9 service passwords)")
    [[ -f "${BACKUP_DIR}/mps_secret.yaml" ]] && \
        files+=(" mps_secret.yaml (MPS connection secret)")
    [[ -f "${BACKUP_DIR}/rps_secret.yaml" ]] && \
        files+=(" rps_secret.yaml (RPS connection secret)")

    if [[ ${#files[@]} -gt 0 ]]; then
        log_info "Backup files created:"
        for f in "${files[@]}"; do
            log_info "$f"
        done
    fi

    if [[ "$BACKUP_PVS" == "true" ]]; then
        log_info "PV LVM snapshots: created in VG lvmvg"
        log_info "etcd snapshots: saved to /var/orch-backups/"
    fi

    log_info "================================================"
    log_info "Backups complete. You can now run:"
    log_info " 1. ./pre-orch-upgrade.sh (K8s + OS upgrade)"
    log_info " 2. ./post-orch-upgrade.sh (Gitea, ArgoCD, orchestrator)"
    log_info "================================================"
}

################################
# CLI
################################

# NOTE(review): the original usage() text and the entry-point sequence
# below were destroyed by patch corruption (everything between the
# heredoc's "<<" and the next ">" was stripped). Reconstructed from the
# option list documented in the file header and the phase functions
# above — verify against the original script.
usage() {
    cat >&2 <<EOF
Usage: $(basename "$0") [options]

Standalone pre-upgrade backup for Edge Orchestrator.

Options:
  -b    Enable PV and etcd backup (requires LVM + sufficient VG space)
  -s    Skip interactive prompts (non-interactive mode)
  -h    Show this help
EOF
}

while getopts ':bsh' opt; do
    case "$opt" in
        b) BACKUP_PVS="true" ;;
        s) SKIP_INTERACTIVE="true" ;;
        h) usage; exit 0 ;;
        *) usage; exit 2 ;;
    esac
done
shift $((OPTIND - 1))

check_prerequisites
pre_backup_postgres_check
backup_postgres_secret
backup_postgres_database
cleanup_gitea_secrets
backup_postgres_passwords
backup_mps_rps_secrets

if [[ "$BACKUP_PVS" == "true" ]]; then
    backup_pvs
    backup_etcd
fi

print_summary
 Date: Mon, 6 Apr 2026 05:25:23 +0000
Subject: [PATCH 2/3] updated the scripts with some minor updates

---
 .../onprem/post-orch-upgrade.sh               | 135 ++++++---
 .../onprem/pre-upgrade-backup.sh              | 258 +-----------------
 2 files changed, 99 insertions(+), 294 deletions(-)

diff --git a/on-prem-installers/onprem/post-orch-upgrade.sh b/on-prem-installers/onprem/post-orch-upgrade.sh
index 0b0a670882..d0d2272423 100755
--- a/on-prem-installers/onprem/post-orch-upgrade.sh
+++ b/on-prem-installers/onprem/post-orch-upgrade.sh
@@ -103,7 +103,6 @@ export GIT_REPOS
 ORCH_INSTALLER_PROFILE="${ORCH_INSTALLER_PROFILE:-onprem}"
 INSTALL_GITEA="${INSTALL_GITEA:-true}"
 GITEA_IMAGE_REGISTRY="${GITEA_IMAGE_REGISTRY:-docker.io}"
-USE_LOCAL_PACKAGES="${USE_LOCAL_PACKAGES:-false}"
 DEPLOY_VERSION="${DEPLOY_VERSION:-v3.1.0}"
 SKIP_INTERACTIVE="${SKIP_INTERACTIVE:-false}"
 
@@ -173,9 +172,19 @@ update_config_variable() {
 
 wait_for_pods_running() {
     local ns="$1"
+    local retries=3
     log_info "Waiting for all pods to be Ready in namespace $ns..."
-    kubectl wait pod --selector='!job-name' --all --for=condition=Ready \
-        --namespace="$ns" --timeout=600s
+    for ((i = 1; i <= retries; i++)); do
+        # Allow old pods to be cleaned up before listing
+        sleep 5
+        if kubectl wait pod --selector='!job-name' --all --for=condition=Ready \
+            --namespace="$ns" --timeout=600s 2>&1; then
+            return 0
+        fi
+        log_warn "Attempt $i/$retries: some pods were not found (likely deleted during rollout), retrying..."
+ done + log_error "Pods in namespace $ns did not become Ready after $retries attempts" + return 1 } resync_all_apps() { @@ -517,17 +526,6 @@ apply_cluster_config() { log_info "Cluster config copied to $target_dir/" fi - if [[ "$SKIP_INTERACTIVE" != "true" ]]; then - while true; do - read -rp "Edit values.yaml if required. Ready to proceed? (yes/no): " yn - case $yn in - [Yy]* ) break;; - [Nn]* ) exit 1;; - * ) echo "Please answer yes or no.";; - esac - done - fi - log_info "Cluster config ready: $cluster_yaml" } @@ -1231,6 +1229,8 @@ migrate_postgres_to_cnpg() { log_info "Waiting for postgresql-secrets application..." local start_time timeout_s=3600 start_time=$(date +%s) + local resync_interval_1=120 + local last_resync_1=$start_time set +e while true; do @@ -1248,6 +1248,15 @@ migrate_postgres_to_cnpg() { log_error "Timeout waiting for postgresql-secrets (${timeout_s}s)" exit 1 fi + + # Re-trigger root-app sync periodically + local since_resync_1=$((current_time - last_resync_1)) + if (( since_resync_1 >= resync_interval_1 )); then + log_warn "postgresql-secrets still not Synced/Healthy after ${elapsed}s. Re-syncing root-app..." + resync_all_apps + last_resync_1=$current_time + fi + log_info "Waiting for postgresql-secrets (status: ${app_status:-pending}, ${elapsed}s)" sleep 5 done @@ -1274,23 +1283,67 @@ EOF # Wait for postgresql-secrets again after root-app sync start_time=$(date +%s) + local resync_interval=120 + local last_resync=$start_time set +e while true; do - local app_status - app_status=$(kubectl get application postgresql-secrets -n "$apps_ns" \ - -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || true) - if [[ "$app_status" == "Synced Healthy" ]]; then - log_info "postgresql-secrets is Synced and Healthy." 
- break + # Check if postgresql-secrets application exists at all + local app_exists + app_exists=$(kubectl get application postgresql-secrets -n "$apps_ns" \ + --no-headers 2>/dev/null | wc -l) + + local app_status="" + if (( app_exists > 0 )); then + app_status=$(kubectl get application postgresql-secrets -n "$apps_ns" \ + -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || true) + if [[ "$app_status" == "Synced Healthy" ]]; then + log_info "postgresql-secrets is Synced and Healthy." + break + fi fi + local current_time elapsed current_time=$(date +%s) elapsed=$((current_time - start_time)) if (( elapsed > timeout_s )); then log_error "Timeout waiting for postgresql-secrets after resync (${timeout_s}s)" + log_error "root-app status: $(kubectl get application root-app -n "$apps_ns" \ + -o jsonpath='{.status.sync.status} {.status.health.status} phase={.status.operationState.phase}' 2>/dev/null || true)" exit 1 fi - sleep 5 + + # Re-trigger root-app sync periodically if postgresql-secrets hasn't appeared + local since_resync=$((current_time - last_resync)) + if (( since_resync >= resync_interval )); then + if (( app_exists == 0 )); then + log_warn "postgresql-secrets app not found after ${elapsed}s. Re-triggering root-app sync..." + else + log_warn "postgresql-secrets status: ${app_status:-unknown} after ${elapsed}s. Re-triggering root-app sync..." 
+ fi + local root_phase + root_phase=$(kubectl get application root-app -n "$apps_ns" \ + -o jsonpath='{.status.operationState.phase}' 2>/dev/null || true) + log_info "root-app operation phase: ${root_phase:-none}" + + if [[ "$root_phase" != "Running" ]]; then + kubectl patch application root-app -n "$apps_ns" --type merge \ + -p '{"operation":null}' 2>/dev/null || true + kubectl patch application root-app -n "$apps_ns" --type json \ + -p '[{"op": "remove", "path": "/status/operationState"}]' 2>/dev/null || true + sleep 5 + kubectl patch -n "$apps_ns" application root-app \ + --patch-file /tmp/sync-postgresql-patch.yaml --type merge + log_info "root-app sync re-triggered" + fi + last_resync=$current_time + fi + + if (( app_exists == 0 )); then + log_info "Waiting for postgresql-secrets app to be created by root-app... (${elapsed}s)" + else + log_info "Waiting for postgresql-secrets (status: ${app_status:-pending}, ${elapsed}s)" + fi + sleep 10 done set -e @@ -1365,15 +1418,21 @@ EOF restore_mps_rps_secrets() { log_info "=== Phase 6a: Restoring MPS/RPS secrets ===" - if [[ -s mps_secret.yaml ]]; then - kubectl apply -f mps_secret.yaml - log_info "MPS secret restored" - fi - - if [[ -s rps_secret.yaml ]]; then - kubectl apply -f rps_secret.yaml - log_info "RPS secret restored" - fi + # Strip cluster-specific metadata (resourceVersion, uid, etc.) to avoid + # "the object has been modified" conflicts when the secret was recreated + # by ArgoCD between backup and restore. 
+ for secret_file in mps_secret.yaml rps_secret.yaml; do + if [[ -s "$secret_file" ]]; then + yq e ' + del(.metadata.resourceVersion) | + del(.metadata.uid) | + del(.metadata.creationTimestamp) | + del(.metadata.managedFields) | + del(.metadata.annotations["kubectl.kubernetes.io/last-applied-configuration"]) + ' "$secret_file" | kubectl apply -f - + log_info "${secret_file%.yaml} secret restored" + fi + done } fix_mps_rps_connections() { @@ -1587,8 +1646,6 @@ Usage: $(basename "$0") [options] Options: - -l Use local packages (skip artifact download) - -s Skip interactive prompts (non-interactive mode) -h Show this help message EOF } @@ -1598,13 +1655,11 @@ EOF ################################################################################ main() { - local help_flag="" local_flag="" skip_flag="" + local help_flag="" - while getopts 'hls' flag; do + while getopts 'h' flag; do case "${flag}" in h) help_flag="true" ;; - l) local_flag="true" ;; - s) skip_flag="true" ;; *) help_flag="true" ;; esac done @@ -1614,14 +1669,6 @@ main() { exit 0 fi - if [[ "${local_flag:-}" == "true" ]]; then - USE_LOCAL_PACKAGES="true" - fi - - if [[ "${skip_flag:-}" == "true" ]]; then - SKIP_INTERACTIVE="true" - fi - check_prerequisites # Phase 1: Configuration diff --git a/on-prem-installers/onprem/pre-upgrade-backup.sh b/on-prem-installers/onprem/pre-upgrade-backup.sh index d62e929955..73d594e7ed 100755 --- a/on-prem-installers/onprem/pre-upgrade-backup.sh +++ b/on-prem-installers/onprem/pre-upgrade-backup.sh @@ -12,9 +12,6 @@ # - PostgreSQL service passwords (9 services) # - MPS/RPS connection secrets # - Gitea secrets cleanup (pre-backup) -# - PersistentVolume LVM snapshots (optional, -b flag) -# - RKE2 etcd snapshot (optional, -b flag, RKE2 only) -# - K3s etcd snapshot (optional, -b flag, K3s only) # # This script should be run BEFORE pre-orch-upgrade.sh and post-orch-upgrade.sh. 
# @@ -22,8 +19,6 @@ # ./pre-upgrade-backup.sh [options] # # Options: -# -b Enable PV and etcd backup (requires LVM + sufficient VG space) -# -s Skip interactive prompts (non-interactive mode) # -h Show help set -euo pipefail @@ -79,8 +74,6 @@ log_info "Log file: $FULL_LOG_PATH" # Configuration ################################ -BACKUP_PVS="${BACKUP_PVS:-false}" -SKIP_INTERACTIVE="${SKIP_INTERACTIVE:-false}" BACKUP_DIR="$(pwd)" # From upgrade_postgres.sh (sourced above): @@ -90,26 +83,6 @@ BACKUP_DIR="$(pwd)" # POSTGRES_USERNAME=postgres # application_namespace=onprem -################################ -# Kubernetes Provider Detection -################################ - -detect_k8s_provider() { - if systemctl is-active --quiet rke2-server.service 2>/dev/null; then - echo "rke2" - elif systemctl is-active --quiet k3s.service 2>/dev/null; then - echo "k3s" - elif command -v kind &>/dev/null && kind get clusters 2>/dev/null | grep -q .; then - echo "kind" - elif [[ -d /etc/rancher/rke2 ]]; then - echo "rke2" - elif [[ -d /etc/rancher/k3s ]]; then - echo "k3s" - else - echo "unknown" - fi -} - ################################ # Prerequisites ################################ @@ -141,23 +114,20 @@ pre_backup_postgres_check() { # interactive read -rp prompt that blocks in non-interactive / piped runs. # Instead, perform the health check inline. + # shellcheck disable=SC2154 # local_backup_path defined in upgrade_postgres.sh if [[ -f "$local_backup_path" ]]; then log_warn "Existing backup file found: $local_backup_path" - if [[ "$SKIP_INTERACTIVE" == "true" ]]; then - log_info "Non-interactive mode (-s): continuing with existing backup file." - else - read -rp "A backup file already exists. Type 'Continue' to proceed or Ctrl-C to abort: " confirm - if [[ ! "$confirm" =~ ^[Cc][Oo][Nn][Tt][Ii][Nn][Uu][Ee]$ ]]; then - log_error "User aborted." - exit 1 - fi + read -rp "A backup file already exists. Type 'Continue' to proceed or Ctrl-C to abort: " confirm + if [[ ! 
"$confirm" =~ ^[Cc][Oo][Nn][Tt][Ii][Nn][Uu][Ee]$ ]]; then + log_error "User aborted." + exit 1 fi log_info "PostgreSQL health check skipped (recovery from previous run)." return 0 fi - # shellcheck disable=SC2154 # podname, postgres_namespace from upgrade_postgres.sh local pod_status + # shellcheck disable=SC2154 # podname, postgres_namespace from upgrade_postgres.sh pod_status=$(kubectl get pods -n "$postgres_namespace" "$podname" \ -o jsonpath='{.status.phase}' 2>/dev/null || true) @@ -304,191 +274,6 @@ backup_mps_rps_secrets() { fi } -################################ -# Phase 7: PV LVM Backup (optional) -################################ - -# Get LV size and format it for lvcreate command -get_lv_size() { - local lv_path="$1" - - local size_output size unit formatted_size - size_output=$(sudo lvdisplay "$lv_path" | grep "LV Size" | awk '{print $3, $4}') - size=$(echo "$size_output" | awk '{print $1}') - unit=$(echo "$size_output" | awk '{print $2}') - - case $unit in - GiB) formatted_size="${size}G" ;; - MiB) formatted_size="${size}M" ;; - TiB) formatted_size="${size}T" ;; - *) - log_error "Unsupported LV size unit: $unit" - exit 1 - ;; - esac - - echo "$formatted_size" -} - -check_space_for_backup() { - local vg_info vsize vfree vused margin enough_space - - vg_info=$(sudo vgs --noheadings --units g --nosuffix -o vg_size,vg_free 2>/dev/null) - vsize=$(echo "$vg_info" | awk '{print $1}') - vfree=$(echo "$vg_info" | awk '{print $2}') - vused=$(echo "$vsize - $vfree" | bc) - - margin=$(echo "$vused * 0.05" | bc) - enough_space=$(echo "$vfree > ($vused + $margin)" | bc) - - echo "$enough_space" -} - -backup_pvs() { - log_info "=== Phase 7a: Backing up PersistentVolumes (LVM snapshots) ===" - - local space_check_result - space_check_result=$(check_space_for_backup) - if [[ "$space_check_result" -eq 0 ]]; then - log_error "Not enough space for PVs backup in VG" - return 1 - fi - - local vg_name="lvmvg" - local vol_snap_class_name="openebs-lvm-vsc" - local backup_date 
- backup_date=$(date +'%Y-%m-%d-%H_%M') - - local pvs_to_backup - pvs_to_backup=$(kubectl get pvc --all-namespaces \ - -o jsonpath='{range .items[?(@.status.phase=="Bound")]}{.metadata.name}{" "}{.metadata.namespace}{" "}{.spec.volumeName}{"\n"}{end}') - - echo "$pvs_to_backup" | while IFS= read -r line; do - local pvc_name pvc_namespace lv_name - read -r pvc_name pvc_namespace lv_name <<<"$line" - - # dkam-pvc data gets re-populated, no need to backup - if [[ "$pvc_name" == "dkam-pvc" ]]; then - continue - fi - - log_info "Backing up PVC: $pvc_name (namespace: $pvc_namespace)" - - # Create VolumeSnapshot - kubectl apply -f - <= max_attempts )); then - log_error "Timeout waiting for VolumeSnapshot ${pvc_name}-snap" - exit 1 - fi - done - - # Create backup LV on VG - local formatted_size bak_lv_name - formatted_size=$(get_lv_size "/dev/$vg_name/$lv_name") - bak_lv_name="${pvc_name}-${pvc_namespace}-bak-${backup_date}" - - sudo lvcreate -n "$bak_lv_name" -L "$formatted_size" "$vg_name" -y - sudo mkfs.ext4 "/dev/$vg_name/$bak_lv_name" - - # Copy data from snapshot to backup LV - sudo mkdir -p /mnt/original-lv /mnt/backup-lv - - local snap_name - snap_name=$(sudo lvs --options lv_name,origin --noheadings \ - | grep "$lv_name" \ - | awk -v lv_name="$lv_name" '$1 != lv_name {print $1}') - - sudo mount "/dev/$vg_name/$snap_name" /mnt/original-lv - sudo mount "/dev/$vg_name/$bak_lv_name" /mnt/backup-lv - sudo cp -a /mnt/original-lv/. /mnt/backup-lv/ - sudo umount "/dev/$vg_name/$snap_name" - sudo umount "/dev/$vg_name/$bak_lv_name" - - sudo rm -rf /mnt/original-lv /mnt/backup-lv - - # Cleanup snapshot - kubectl delete volumesnapshot -n "$pvc_namespace" "${pvc_name}-snap" - - log_info "PVC $pvc_name backed up to LV $bak_lv_name" - done - - log_info "PV backup completed." 
-} - -################################ -# Phase 8: etcd Snapshot (optional) -################################ - -backup_etcd() { - log_info "=== Phase 7b: Taking etcd snapshot ===" - - local provider - provider=$(detect_k8s_provider) - - case "$provider" in - rke2) - local snapshot_name - # Use DEPLOY_VERSION or current RKE2 version instead of dpkg-query - local rke2_version - rke2_version=$(rke2 --version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown") - snapshot_name="pre-upgrade-snapshot-${rke2_version}-$(date +'%Y%m%d_%H%M%S')" - - log_info "Taking RKE2 etcd snapshot: $snapshot_name" - sudo rke2 etcd-snapshot save --name "$snapshot_name" - - sudo mkdir -p /var/orch-backups/ - sudo find /var/lib/rancher/rke2/server/db/snapshots/ \ - -name "pre-upgrade-snapshot-*" \ - -exec mv {} /var/orch-backups/ \; - - log_info "RKE2 etcd snapshot saved to /var/orch-backups/" - ;; - - k3s) - local snapshot_name - local k3s_version - k3s_version=$(k3s --version 2>/dev/null | head -1 | awk '{print $3}' || echo "unknown") - snapshot_name="pre-upgrade-snapshot-${k3s_version}-$(date +'%Y%m%d_%H%M%S')" - - log_info "Taking K3s etcd snapshot: $snapshot_name" - sudo k3s etcd-snapshot save --name "$snapshot_name" - - sudo mkdir -p /var/orch-backups/ - sudo find /var/lib/rancher/k3s/server/db/snapshots/ \ - -name "pre-upgrade-snapshot-*" \ - -exec mv {} /var/orch-backups/ \; - - log_info "K3s etcd snapshot saved to /var/orch-backups/" - ;; - - kind) - log_info "KIND clusters are ephemeral — etcd snapshots not applicable." - ;; - - *) - log_warn "Unknown K8s provider '$provider' — skipping etcd snapshot." 
- ;; - esac -} - ################################ # Summary ################################ @@ -517,11 +302,6 @@ print_summary() { done fi - if [[ "$BACKUP_PVS" == "true" ]]; then - log_info "PV LVM snapshots: created in VG lvmvg" - log_info "etcd snapshots: saved to /var/orch-backups/" - fi - log_info "================================================" log_info "Backups complete. You can now run:" log_info " 1. ./pre-orch-upgrade.sh (K8s + OS upgrade)" @@ -543,8 +323,6 @@ Usage: $(basename "$0") [options] Options: - -b Enable PV (LVM) and etcd backup (requires LVM VG space) - -s Skip interactive prompts (non-interactive mode) -h Show this help message Backup artifacts created (in current directory): @@ -554,10 +332,6 @@ Backup artifacts created (in current directory): mps_secret.yaml MPS connection secret rps_secret.yaml RPS connection secret -With -b flag (optional): - LVM backup LVs One per bound PVC (in lvmvg) - /var/orch-backups/ etcd snapshots (RKE2/K3s) - Execution order: 1. ./pre-upgrade-backup.sh <-- this script 2. 
./pre-orch-upgrade.sh K8s cluster + OS upgrade @@ -571,13 +345,11 @@ EOF ################################ main() { - local help_flag="" backup_flag="" skip_flag="" + local help_flag="" - while getopts 'hbs' flag; do + while getopts 'h' flag; do case "${flag}" in h) help_flag="true" ;; - b) backup_flag="true" ;; - s) skip_flag="true" ;; *) help_flag="true" ;; esac done @@ -587,14 +359,6 @@ main() { exit 0 fi - if [[ "${backup_flag:-}" == "true" ]]; then - BACKUP_PVS="true" - fi - - if [[ "${skip_flag:-}" == "true" ]]; then - SKIP_INTERACTIVE="true" - fi - check_prerequisites # Phase 1: PostgreSQL health check @@ -615,12 +379,6 @@ main() { # Phase 6: MPS/RPS secrets backup_mps_rps_secrets - # Phase 7-8: PV + etcd backups (optional) - if [[ "$BACKUP_PVS" == "true" ]]; then - backup_pvs - backup_etcd - fi - # Summary print_summary } From 7201364b2a826c7a73505a66bc7621b633026b16 Mon Sep 17 00:00:00 2001 From: Palash Goel Date: Tue, 7 Apr 2026 03:24:42 +0000 Subject: [PATCH 3/3] updated changes for lint issue --- on-prem-installers/onprem/pre-orch-upgrade.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/on-prem-installers/onprem/pre-orch-upgrade.sh b/on-prem-installers/onprem/pre-orch-upgrade.sh index fd22b02a4c..6d25a12402 100755 --- a/on-prem-installers/onprem/pre-orch-upgrade.sh +++ b/on-prem-installers/onprem/pre-orch-upgrade.sh @@ -995,8 +995,6 @@ kind_upgrade() { # --kind-version to get a newer release, the default K8s version # may differ. When we can't determine the target, fall back to # keeping the cluster (safe path). - local target_version="" - target_version=$(kind version -q 2>/dev/null || true) # kind doesn't expose the target K8s version directly; use the # image tag from "kind create cluster --help" or default images. log_info "KIND binary updated. Checking if cluster recreation is needed..."