From b7e53c2710ab31f9d43cba74be90f326ad13815f Mon Sep 17 00:00:00 2001 From: Andrei Palade Date: Wed, 3 Dec 2025 16:34:58 +0000 Subject: [PATCH 01/12] Update passwords patching --- on-prem-installers/onprem/onprem_upgrade.sh | 66 ++++----------------- 1 file changed, 11 insertions(+), 55 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index 967c5eff5..d53fd2f42 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -872,7 +872,7 @@ while true; do done set -e -patch_secret() { +patch_secrets() { # Patch secrets with passwords from postgres-secrets-password.txt # If the file is not empty, read the passwords and patch the secrets accordingly @@ -980,40 +980,18 @@ patch_secret() { kubectl patch secret -n orch-infra mps-reader-local-postgresql -p "{\"data\": {\"PGPASSWORD\": \"$MPS\"}}" --type=merge kubectl patch secret -n orch-infra rps-local-postgresql -p "{\"data\": {\"PGPASSWORD\": \"$RPS\"}}" --type=merge kubectl patch secret -n orch-infra rps-reader-local-postgresql -p "{\"data\": {\"PGPASSWORD\": \"$RPS\"}}" --type=merge - # Use a temporary file for the patch payload - patch_file=$(mktemp) - cat > "$patch_file" </dev/null 2>&1; then - kubectl patch secret -n orch-app orch-app-app-orch-catalog-local-postgresql -p "{\"data\": {\"password\": \"$CATALOG_SERVICE\"}}" --type=merge - kubectl patch secret -n orch-iam orch-iam-iam-tenancy -p "{\"data\": {\"password\": \"$IAM_TENANCY\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-alerting -p "{\"data\": {\"password\": \"$ALERTING\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-inventory -p "{\"data\": {\"password\": \"$INVENTORY\"}}" --type=merge - kubectl patch secret -n orch-platform orch-platform-platform-keycloak -p "{\"data\": {\"password\": \"$PLATFORM_KEYCLOAK\"}}" --type=merge - kubectl patch secret -n orch-platform orch-platform-vault -p "{\"data\": {\"password\": \"$VAULT\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-mps -p "{\"data\": {\"password\": \"$MPS\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-rps -p "{\"data\": {\"password\": \"$RPS\"}}" --type=merge + if kubectl get secret orch-app-app-orch-catalog -n orch-database >/dev/null 2>&1; then + kubectl patch secret -n orch-database orch-app-app-orch-catalog -p "{\"data\": {\"password\": \"$CATALOG_SERVICE\"}}" --type=merge + kubectl patch secret -n orch-database orch-iam-iam-tenancy -p "{\"data\": {\"password\": \"$IAM_TENANCY\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-alerting -p "{\"data\": {\"password\": \"$ALERTING\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-inventory -p "{\"data\": {\"password\": \"$INVENTORY\"}}" --type=merge + kubectl patch secret -n orch-database orch-platform-platform-keycloak -p "{\"data\": {\"password\": \"$PLATFORM_KEYCLOAK\"}}" --type=merge + kubectl patch secret -n orch-database orch-platform-vault -p "{\"data\": {\"password\": \"$VAULT\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-mps -p "{\"data\": {\"password\": \"$MPS\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-rps -p "{\"data\": {\"password\": \"$RPS\"}}" --type=merge fi - - kubectl patch secret -n orch-database passwords --type=merge --patch-file "$patch_file" - rm -f "$patch_file" - - # Patch postgresql secret - #kubectl patch secret -n orch-database postgresql -p "{\"data\": {\"postgres-password\": 
\"$POSTGRESQL\"}}" --type=merge } # Stop sync operation for root-app, so it won't be synced with the old version of the application. @@ -1063,7 +1041,7 @@ kubectl patch application root-app -n "$apps_ns" --type json -p '[{"op": "remove sleep 30 kubectl patch -n "$apps_ns" application root-app --patch-file /tmp/sync-postgresql-patch.yaml --type merge sleep 30 -patch_secret +patch_secrets sleep 10 # Restore secret after app delete but before postgress restored @@ -1100,28 +1078,6 @@ restore_postgres # Update ALL database user passwords in PostgreSQL after restore echo "Updating all database user passwords in PostgreSQL..." -# Get all passwords from postgres-secrets-password.txt file (they are base64 encoded) -ALERTING_PASSWORD=$(grep "^Alerting:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -CATALOG_PASSWORD=$(grep "^CatalogService:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -INVENTORY_PASSWORD=$(grep "^Inventory:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -IAM_TENANCY_PASSWORD=$(grep "^IAMTenancy:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -KEYCLOAK_PASSWORD=$(grep "^PlatformKeycloak:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -MPS_PASSWORD=$(grep "^Mps:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -RPS_PASSWORD=$(grep "^Rps:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -VAULT_PASSWORD=$(grep "^Vault:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -POSTGRESQL_PASSWORD=$(grep "^PostgreSQL:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) - -# Update passwords for all database users -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-platform-vault_user\" WITH PASSWORD '$VAULT_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-alerting_user\" WITH PASSWORD '$ALERTING_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-app-app-orch-catalog_user\" WITH PASSWORD '$CATALOG_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-inventory_user\" WITH PASSWORD '$INVENTORY_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-iam-iam-tenancy_user\" WITH PASSWORD '$IAM_TENANCY_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-platform-platform-keycloak_user\" WITH PASSWORD '$KEYCLOAK_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-mps_user\" WITH PASSWORD '$MPS_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-rps_user\" WITH PASSWORD '$RPS_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-database-postgresql_user\" WITH PASSWORD '$POSTGRESQL_PASSWORD';" - echo "✅ All database user passwords updated successfully" vault_unseal From b11668f2f39ad008af406f424c6b7e8ef4abac20 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 02:31:28 +0530 Subject: [PATCH 02/12] onprem upgrade rc1 to rc2 fix (#1205) --- on-prem-installers/onprem/onprem_upgrade.sh | 69 ++++++++++++------- on-prem-installers/onprem/upgrade_postgres.sh | 23 +++++-- 2 files changed, 61 
insertions(+), 31 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index d53fd2f42..61d60a56a 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -306,7 +306,7 @@ check_and_force_sync_app() { for ((i=1; i<=max_retries; i++)); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name is Synced and Healthy" return 0 @@ -315,26 +315,26 @@ check_and_force_sync_app() { echo "⚠️ $app_name is not Synced and Healthy (status: $app_status). Force-syncing... (attempt $i/$max_retries)" force_sync_outofsync_app "$app_name" "$namespace" "$server_side_apply" echo "✅ $app_name sync triggered" - + # Check status every 5s for 90s local check_timeout=90 local check_interval=3 local elapsed=0 - + while (( elapsed < check_timeout )); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name became Synced and Healthy" return 0 else echo "Current status: $app_status (elapsed: ${elapsed}s)" fi - + sleep $check_interval elapsed=$((elapsed + check_interval)) done - + echo "⏳ $app_name did not become healthy within ${check_timeout}s" done @@ -348,7 +348,7 @@ check_and_patch_sync_app() { for ((i=1; i<=max_retries; i++)); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name is Synced and Healthy" return 0 @@ -365,21 +365,21 @@ check_and_patch_sync_app() { local check_timeout=90 local check_interval=3 local elapsed=0 - + while (( elapsed < check_timeout )); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name became Synced and Healthy" return 0 else echo "Current status: $app_status (elapsed: ${elapsed}s)" fi - + sleep $check_interval elapsed=$((elapsed + check_interval)) done - + echo "⏳ $app_name did not become healthy within ${check_timeout}s" done @@ -391,7 +391,7 @@ wait_for_app_synced_healthy() { local app_name=$1 local namespace=$2 local timeout=${3:-120} # Default 120 seconds if not specified - + local start_time start_time=$(date +%s) set +e @@ -446,7 +446,7 @@ check_and_cleanup_job() { local app_name=$1 local namespace=$2 local job_label=${3:-job-name} - + app_status=$(kubectl get application "$app_name" -n "$apps_ns" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") if [[ "$app_status" != "Synced Healthy" ]]; then if kubectl get job -n "$namespace" -l "$job_label" 2>/dev/null | grep "$app_name"; then @@ -668,11 +668,16 @@ if ! check_postgres; then exit 1 fi -# Perform postgreSQL secret backup if not done already +# Perform PostgreSQL secret backup if not done already if [[ ! 
-f postgres_secret.yaml ]]; then - kubectl get secret -n orch-database postgresql -o yaml > postgres_secret.yaml + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + kubectl get secret -n orch-database postgresql -o yaml > postgres_secret.yaml + else + kubectl get secret -n orch-database passwords -o yaml > postgres_secret.yaml + fi fi + # Delete gitea secrets before backup cleanup_gitea_secrets @@ -793,7 +798,11 @@ if [[ ! -s postgres-secrets-password.txt ]]; then IAM_TENANCY=$(kubectl get secret iam-tenancy-local-postgresql -n orch-iam -o jsonpath='{.data.PGPASSWORD}') PLATFORM_KEYCLOAK=$(kubectl get secret platform-keycloak-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') VAULT=$(kubectl get secret vault-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') - POSTGRESQL=$(kubectl get secret postgresql -n orch-database -o jsonpath='{.data.postgres-password}') + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + POSTGRESQL=$(kubectl get secret postgresql -n orch-database -o jsonpath='{.data.postgres-password}') + else + POSTGRESQL=$(kubectl get secret orch-database-postgresql -n orch-database -o jsonpath='{.data.password}') + fi MPS=$(kubectl get secret mps-local-postgresql -n orch-infra -o jsonpath='{.data.PGPASSWORD}') RPS=$(kubectl get secret rps-local-postgresql -n orch-infra -o jsonpath='{.data.PGPASSWORD}') { @@ -1045,8 +1054,20 @@ patch_secrets sleep 10 # Restore secret after app delete but before postgress restored -yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - - +if [[ "$UPGRADE_3_1_X" == "true" ]]; then + yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - +else + yq e ' + del(.metadata.labels) | + del(.metadata.annotations) | + del(.metadata.ownerReferences) | + del(.metadata.finalizers) | + del(.metadata.managedFields) | + del(.metadata.resourceVersion) | + del(.metadata.uid) | + del(.metadata.creationTimestamp) + ' postgres_secret.yaml | kubectl apply -f - +fi sleep 30 # Wait until PostgreSQL pod is running (Re-sync) start_time=$(date +%s) @@ -1256,7 +1277,7 @@ check_and_cleanup_job "wait-istio-job" "ns-label" # Collect and display syncwave information for OutOfSync applications echo "OutOfSync applications by syncwave:" outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | + jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ sort -n) @@ -1276,7 +1297,7 @@ done # Collect and display syncwave information for OutOfSync applications echo "OutOfSync applications by syncwave:" outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | + jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ sort -n) @@ -1295,7 +1316,7 @@ done # Collect and display syncwave information for OutOfSync applications echo "OutOfSync applications by syncwave:" outofsync_apps=$(kubectl get applications -n "$apps_ns" 
-o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | + jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ sort -n) @@ -1316,10 +1337,10 @@ kubectl patch application root-app -n "$apps_ns" --type json -p '[{"op": "remo # OS profiles Fix kubectl patch application tenancy-api-mapping -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -kubectl patch application tenancy-datamodel -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge +kubectl patch application tenancy-datamodel -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge kubectl delete application tenancy-api-mapping -n onprem kubectl delete application tenancy-datamodel -n onprem -kubectl delete deployment -n orch-infra os-resource-manager +kubectl delete deployment -n orch-infra os-resource-manager # Apply root-app Patch kubectl patch application root-app -n "$apps_ns" --patch-file /tmp/argo-cd/sync-patch.yaml --type merge @@ -1330,4 +1351,4 @@ kubectl delete secret boots-ca-cert -n orch-gateway kubectl delete secret boots-ca-cert -n orch-infra kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null -echo "Upgrade completed! Wait for ArgoCD applications to be in 'Synced' and 'Healthy' state" \ No newline at end of file +echo "Upgrade completed! Wait for ArgoCD applications to be in 'Synced' and 'Healthy' state" diff --git a/on-prem-installers/onprem/upgrade_postgres.sh b/on-prem-installers/onprem/upgrade_postgres.sh index 9150a73a3..d138619ac 100755 --- a/on-prem-installers/onprem/upgrade_postgres.sh +++ b/on-prem-installers/onprem/upgrade_postgres.sh @@ -4,17 +4,22 @@ # # SPDX-License-Identifier: Apache-2.0 -podname="postgresql-0" postgres_namespace=orch-database -POSTGRES_LOCAL_BACKUP_PATH="./" +POSTGRES_LOCAL_BACKUP_PATH="./" local_backup_file="${postgres_namespace}_backup.sql" local_backup_path="${POSTGRES_LOCAL_BACKUP_PATH}${local_backup_file}" -POSTGRES_USERNAME="postgres" +POSTGRES_USERNAME="postgres" application_namespace=onprem +if [[ "$UPGRADE_3_1_X" == "true" ]]; then + podname="postgresql-0" +else + podname="postgresql-cluster-1" +fi + check_postgres() { if [[ -f "$local_backup_path" ]]; then - read -rp "A backfile file already exists. + read -rp "A backfile file already exists. If you would like to continue using this backup file type Continue : " confirm && [[ $confirm == [cC][oO][nN][tT][iI][nN][uU][eE] ]] || exit 1 # avoid the rest of the check function as this could be a recovery from a failed update @@ -49,7 +54,7 @@ backup_postgres() { fi echo "Backing up databases from pod $podname in namespace $postgres_namespace..." - remote_backup_path="/tmp/${postgres_namespace}_backup.sql" + remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "$(typeset -f disable_security); disable_security" if kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "pg_dumpall -U $POSTGRES_USERNAME -f '$remote_backup_path'"; then @@ -99,11 +104,15 @@ restore_postgres() { echo "Restoring backup databases from pod $podname in namespace $postgres_namespace..." 
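Taken together, the backup and restore changes in this file amount to a pg_dumpall/psql round trip run inside the database pod. A minimal standalone sketch of that flow, assuming the CloudNativePG pod and secret names used elsewhere in this series (postgresql-cluster-1, orch-database-postgresql), rather than the exact code of the patch:

    ns=orch-database
    pod=postgresql-cluster-1                           # CloudNativePG instance pod (see the pod check added later in the series)
    dump="/var/lib/postgresql/data/${ns}_backup.sql"   # path writable by the postgres user inside the pod
    # Dump every database and role to a file inside the pod
    kubectl exec -n "$ns" "$pod" -c postgres -- pg_dumpall -U postgres -f "$dump"
    # Read the superuser password from the CloudNativePG-managed secret
    PGPASSWORD=$(kubectl get secret -n "$ns" orch-database-postgresql \
        -o jsonpath='{.data.password}' | base64 -d)
    # Replay the dump; psql reads the password from the environment
    kubectl exec -n "$ns" "$pod" -c postgres -- \
        env PGPASSWORD="$PGPASSWORD" psql -U postgres -f "$dump"

The full functions also stage a local copy of the dump (local_backup_path above) and run the restore only after the new deployment is in place; the sketch shows just the in-pod commands.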
# Get postgres password from secret - PGPASSWORD=$(kubectl get secret -n $postgres_namespace postgresql -o jsonpath='{.data.postgres-password}' | base64 -d) + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + PGPASSWORD=$(kubectl get secret -n $postgres_namespace postgresql -o jsonpath='{.data.postgres-password}' | base64 -d) +else + PGPASSWORD=$(kubectl get secret -n $postgres_namespace orch-database-postgresql -o jsonpath='{.data.password}' | base64 -d) +fi # CloudNativePG doesn't need security disable/enable, just use credentials # Use the remote backup file that was copied to the pod kubectl exec -n $postgres_namespace "$podname" -c postgres -- env PGPASSWORD="$PGPASSWORD" psql -U $POSTGRES_USERNAME -f "$remote_backup_path" echo "Restore completed successfully." -} \ No newline at end of file +} From 45d88faa9b5d76fb05728e2b51c916d74c0c4bf8 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 09:18:58 +0530 Subject: [PATCH 03/12] upgrade check added (#1206) --- on-prem-installers/onprem/upgrade_postgres.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/on-prem-installers/onprem/upgrade_postgres.sh b/on-prem-installers/onprem/upgrade_postgres.sh index d138619ac..45cd7abe8 100755 --- a/on-prem-installers/onprem/upgrade_postgres.sh +++ b/on-prem-installers/onprem/upgrade_postgres.sh @@ -54,7 +54,12 @@ backup_postgres() { fi echo "Backing up databases from pod $podname in namespace $postgres_namespace..." - remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + remote_backup_path="/tmp/${postgres_namespace}_backup.sql" + else + remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" + fi + kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "$(typeset -f disable_security); disable_security" if kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "pg_dumpall -U $POSTGRES_USERNAME -f '$remote_backup_path'"; then From a928d34611cf542f5a1b184415a8c8bf28069ea8 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 10:00:23 +0530 Subject: [PATCH 04/12] postgresql pod check for upgrade flow (#1207) --- on-prem-installers/onprem/onprem_upgrade.sh | 24 +++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index 61d60a56a..f8a79266c 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -92,6 +92,30 @@ gitea_ns=gitea # shellcheck disable=SC2034 root_app=root-app +postgres_namespace=orch-database +echo "Checking PostgreSQL pod in namespace: $postgres_namespace" +# Get all pods once (optimized) +pods=$(kubectl get pod -n "$postgres_namespace" --no-headers 2>/dev/null) +# Check for new version pod +echo "Checking for: postgresql-cluster-1" +if echo "$pods" | grep -q "^postgresql-cluster-1"; then + export UPGRADE_3_1_X=false + podname=postgresql-cluster-1 + echo "Onprem Upgrade from Rel3.1.x" +elif echo "$pods" | grep -q "^postgresql-0"; then + export UPGRADE_3_1_X=true + echo "Onprem Upgrade from latest release where postgresql-cluster-1" + podname=postgresql-0 +# No valid pod found +else + echo "❌ ERROR: No valid PostgreSQL pod found!" 
+ echo "Expected:" + echo " - postgresql-cluster-1 (new version, 2025.02+)" + echo " - postgresql-0 (old version, 3.1.3 and below)" + exit 1 +fi +echo "Selected PostgreSQL pod → $podname" + # Variables that depend on the above and might require updating later, are placed in here set_artifacts_version() { installer_list=( From 88e48e842b521507ce6256587cf1e63c52a9c480 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 10:11:24 +0530 Subject: [PATCH 05/12] fix check --- on-prem-installers/onprem/onprem_upgrade.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index f8a79266c..b68320e03 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -99,12 +99,12 @@ pods=$(kubectl get pod -n "$postgres_namespace" --no-headers 2>/dev/null) # Check for new version pod echo "Checking for: postgresql-cluster-1" if echo "$pods" | grep -q "^postgresql-cluster-1"; then - export UPGRADE_3_1_X=false + export UPGRADE_3_1_X=false + cho "Onprem Upgrade from latest release where postgresql-cluster-1" podname=postgresql-cluster-1 - echo "Onprem Upgrade from Rel3.1.x" elif echo "$pods" | grep -q "^postgresql-0"; then export UPGRADE_3_1_X=true - echo "Onprem Upgrade from latest release where postgresql-cluster-1" + echo "Onprem Upgrade from Rel3.1.x" podname=postgresql-0 # No valid pod found else From 0b2ac9af0e2e96a439ac3ab4b61ab21511678f84 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 16:03:01 +0530 Subject: [PATCH 06/12] Update onprem_upgrade.sh --- on-prem-installers/onprem/onprem_upgrade.sh | 25 +-------------------- 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index b68320e03..ff9b7c0d8 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -92,30 +92,7 @@ gitea_ns=gitea # shellcheck disable=SC2034 root_app=root-app -postgres_namespace=orch-database -echo "Checking PostgreSQL pod in namespace: $postgres_namespace" -# Get all pods once (optimized) -pods=$(kubectl get pod -n "$postgres_namespace" --no-headers 2>/dev/null) -# Check for new version pod -echo "Checking for: postgresql-cluster-1" -if echo "$pods" | grep -q "^postgresql-cluster-1"; then - export UPGRADE_3_1_X=false - cho "Onprem Upgrade from latest release where postgresql-cluster-1" - podname=postgresql-cluster-1 -elif echo "$pods" | grep -q "^postgresql-0"; then - export UPGRADE_3_1_X=true - echo "Onprem Upgrade from Rel3.1.x" - podname=postgresql-0 -# No valid pod found -else - echo "❌ ERROR: No valid PostgreSQL pod found!" 
- echo "Expected:" - echo " - postgresql-cluster-1 (new version, 2025.02+)" - echo " - postgresql-0 (old version, 3.1.3 and below)" - exit 1 -fi -echo "Selected PostgreSQL pod → $podname" - +export UPGRADE_3_1_X=true # Variables that depend on the above and might require updating later, are placed in here set_artifacts_version() { installer_list=( From 0c51d145e467a31c09ac3155e5c9d9a80d818630 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 17:43:52 +0530 Subject: [PATCH 07/12] Onprem stability issue (#1208) Co-authored-by: Andrei Palade --- .../onprem/after_upgrade_restart.sh | 1319 ++++++++++++++++- on-prem-installers/onprem/onprem_upgrade.sh | 174 +-- 2 files changed, 1286 insertions(+), 207 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index a6dbac55f..5551ffc6d 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -5,69 +5,1270 @@ # SPDX-License-Identifier: Apache-2.0 # Description: -# This script is used after an upgrade to perform the following tasks: -# - Restart the following key components: -# • nexus-api-gw -# • cluster-manager-template-controller -# • app-orch-tenant-controller -# - Delete old ClusterTemplates that do NOT contain "k3s" in their name +# ArgoCD Application Sync Script with Advanced Retry and Recovery Logic +# +# This script manages the synchronization of ArgoCD applications in wave order, +# with comprehensive error handling, failed sync detection, and automatic recovery. +# It handles stuck jobs, degraded applications, and failed CRDs, ensuring all +# applications reach a Healthy+Synced state. +# +# Features: +# - Wave-ordered application synchronization +# - Automatic detection and cleanup of failed syncs +# - Real-time job/CRD failure detection during sync +# - Automatic restart of failed applications +# - Global retry mechanism (4 attempts) +# - Per-application retry logic (3 attempts) +# - Timestamp tracking for all operations +# - Unhealthy job and CRD cleanup +# - OutOfSync application handling +# - Root-app special handling +# - Post-upgrade cleanup: Removes obsolete applications (tenancy-api-mapping, +# tenancy-datamodel), legacy deployments (os-resource-manager), and stale +# secrets (tls-boots, boots-ca-cert) to ensure clean upgrade state # # Usage: -# ./after_upgrade_restart.sh +# ./after_upgrade_restart.sh [NAMESPACE] +# +# Arguments: +# NAMESPACE - Target namespace for applications (optional, default: onprem) +# +# The script will: +# 1. Install ArgoCD CLI if not present +# 2. Login to ArgoCD server +# 3. Sync all applications excluding root-app +# 4. Perform post-upgrade cleanup +# 5. Re-sync all applications +# 6. 
Validate final state +# + +# Examples: +# ./after_upgrade_restart.sh # Uses default namespace 'onprem' +# +# Environment Variables: +# ARGO_NS - ArgoCD namespace (default: argocd) +# +# Exit Codes: +# 0 - All applications synced successfully +# 1 - Sync failed after all retries + +set -o pipefail + +# ============================================================ +# ============= GLOBAL CONFIGURATION VARIABLES =============== +# ============================================================ + +# Parse command-line arguments +NS="${1:-onprem}" # Use first argument or default to "onprem" +ARGO_NS="argocd" + +echo "[INFO] Using namespace: $NS" +echo "[INFO] Using ArgoCD namespace: $ARGO_NS" + +# Sync behaviour +GLOBAL_POLL_INTERVAL=10 # seconds +APP_MAX_WAIT=60 # 5 minutes to wait for any app (Healthy+Synced) +APP_MAX_RETRIES=3 # retry X times for each app +GLOBAL_SYNC_RETRIES=2 # Global retry for entire sync process + +# Apps requiring server-side apply (space-separated list) +SERVER_SIDE_APPS="external-secrets copy-app-gitea-cred-to-fleet copy-ca-cert-boots-to-gateway copy-ca-cert-boots-to-infra copy-ca-cert-gateway-to-cattle copy-ca-cert-gateway-to-infra copy-ca-cert-gitea-to-app copy-ca-cert-gitea-to-cluster copy-cluster-gitea-cred-to-fleet copy-keycloak-admin-to-infra infra-external platform-keycloak namespace-label wait-istio-job" + +# shellcheck disable=SC1091 +# ============================================================ +# REQUIRE COMMANDS +# ============================================================ +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "[ERROR] Required command '$1' not found. Install it and retry." + exit 1 + fi +} +require_cmd kubectl +require_cmd jq + +# ============================================================ +# ArgoCD CLI Install +# ============================================================ +install_argocd_cli() { + if ! command -v argocd >/dev/null 2>&1; then + echo "[INFO] argocd CLI not found. Installing..." + VERSION=$(curl -L -s https://raw.githubusercontent.com/argoproj/argo-cd/stable/VERSION) + echo "[INFO] Latest version: $VERSION" + curl -sSL -o argocd-linux-amd64 \ + https://github.com/argoproj/argo-cd/releases/download/v${VERSION}/argocd-linux-amd64 + sudo install -m 555 argocd-linux-amd64 /usr/local/bin/argocd + rm -f argocd-linux-amd64 + echo "[INFO] argocd CLI installed successfully." +else + echo "[INFO] argocd CLI already installed: $(argocd version --client | head -1)" +fi +} +install_argocd_cli + +# ============================================================ +# Fetch admin password +# ============================================================ +echo "[INFO] Fetching ArgoCD admin password..." +if command -v yq >/dev/null 2>&1; then + ADMIN_PASSWD=$(kubectl get secret -n "$ARGO_NS" argocd-initial-admin-secret -o yaml \ + | yq -r '.data.password' | base64 -d) +else + ADMIN_PASSWD=$(kubectl get secret -n "$ARGO_NS" argocd-initial-admin-secret \ + -o jsonpath='{.data.password}' | base64 -d) +fi + +# ============================================================ +# Discover Argo endpoint +# ============================================================ +echo "[INFO] Detecting ArgoCD Server endpoint..." 
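The detection below prefers the argocd-server LoadBalancer IP and falls back to a NodePort. For completeness, a third option for clusters that expose neither, sketched here only as an illustration (the script itself implements just the two paths below), is a local port-forward:

    # Forward the argocd-server service to localhost and log in through it
    kubectl port-forward svc/argocd-server -n "$ARGO_NS" 8080:443 >/dev/null 2>&1 &
    PF_PID=$!
    ARGO_ENDPOINT="localhost:8080"
    # ...and once the sync run has finished:
    kill "$PF_PID" 2>/dev/null || true

Because the login call below uses --insecure and --grpc-web, a forwarded HTTPS port behaves the same as the other two endpoint types.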
+LB_IP=$(kubectl get svc argocd-server -n "$ARGO_NS" \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + +if [[ -n "$LB_IP" ]]; then + ARGO_ENDPOINT="$LB_IP" + echo "[INFO] Using LoadBalancer IP: $ARGO_ENDPOINT" +else + NODEPORT=$(kubectl get svc argocd-server -n "$ARGO_NS" -o jsonpath='{.spec.ports[0].nodePort}') + NODEIP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | awk '{print $1}') + ARGO_ENDPOINT="${NODEIP}:${NODEPORT}" + echo "[INFO] Using NodePort: $ARGO_ENDPOINT" +fi + +# ============================================================ +# Argo Login +# ============================================================ +echo "[INFO] Logging into ArgoCD..." +argocd login "$ARGO_ENDPOINT" --username admin --password "$ADMIN_PASSWD" --insecure --grpc-web +echo "[INFO] Login OK." + +# ============================================================ +# Fetch all apps by wave +# ============================================================ +get_all_apps_by_wave() { + kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r '.items[] | + { + name: .metadata.name, + wave: (.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"), + health: .status.health.status, + sync: .status.sync.status + } + | "\(.wave) \(.name) \(.health) \(.sync)" + ' | sort -n -k1 +} + +# ============================================================ +# Fetch NOT-GREEN apps by wave +# ============================================================ +get_not_green_apps() { + kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r '.items[] | + { + name: .metadata.name, + wave: (.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"), + health: .status.health.status, + sync: .status.sync.status + } + | select(.health != "Healthy" or .sync != "Synced") + | "\(.wave) \(.name) \(.health) \(.sync)" + ' | sort -n -k1 +} + +# Optional color helpers +bold() { tput bold 2>/dev/null; } +normal() { tput sgr0 2>/dev/null; } +green() { tput setaf 2>/dev/null 2 && tput setaf 2; } +red() { tput setaf 1 2>/dev/null; } +yellow() { tput setaf 3 2>/dev/null; } +blue() { tput setaf 4 2>/dev/null; } +reset() { tput sgr0 2>/dev/null; } + +# Get timestamp +get_timestamp() { + date '+%Y-%m-%d %H:%M:%S' +} + +# ============================================================ +# Check and fix CRD version mismatches +# ============================================================ +check_and_fix_crd_version_mismatch() { + local app_name="$1" + + # Get application status + local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + return 1 + fi + + # Check for CRD version mismatch errors in sync messages + local version_mismatch=$(echo "$status" | jq -r ' + .status.conditions[]? 
| + select(.type == "ComparisonError" or .type == "SyncError") | + select(.message | contains("could not find version") or contains("Version") and contains("is installed")) | + .message + ' 2>/dev/null) + + if [[ -n "$version_mismatch" ]]; then + echo "$(red)[CRD-VERSION-MISMATCH] Detected CRD version mismatch in $app_name:$(reset)" + echo "$version_mismatch" + + # Extract CRD details from error message + local crd_group=$(echo "$version_mismatch" | grep -oP '[a-z0-9.-]+\.[a-z]+(?=/[A-Z])' | head -1) + local crd_kind=$(echo "$version_mismatch" | grep -oP '/[A-Z][a-zA-Z]+' | sed 's|/||' | head -1) + + if [[ -n "$crd_group" && -n "$crd_kind" ]]; then + # Try to find and list the CRD + local crd_name="${crd_kind,,}s.${crd_group}" + echo "$(yellow)[INFO] Looking for CRD: $crd_name$(reset)" + + # Check if CRD exists + if kubectl get crd "$crd_name" &>/dev/null; then + echo "$(yellow)[INFO] CRD $crd_name exists, checking versions...$(reset)" + kubectl get crd "$crd_name" -o jsonpath='{.spec.versions[*].name}' 2>/dev/null + echo + + # For external-secrets.io, we need to update to v1beta1 + if [[ "$crd_group" == "external-secrets.io" ]]; then + echo "$(yellow)[FIX] Attempting to refresh application to use correct CRD version...$(reset)" + argocd app get "${NS}/${app_name}" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + return 0 + fi + else + echo "$(red)[ERROR] CRD $crd_name not found on cluster$(reset)" + fi + fi + + return 0 + fi + + return 1 +} + +# ============================================================ +# Check if application has failed sync and needs cleanup +# ============================================================ +check_and_handle_failed_sync() { + local app_name="$1" + local full_app="${NS}/${app_name}" + + # Get application status + local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + return 1 + fi + + local sync_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + local sync_status=$(echo "$status" | jq -r '.status.sync.status // "Unknown"') + + # Check if sync failed + if [[ "$sync_phase" == "Failed" || "$sync_phase" == "Error" ]]; then + echo "$(red)[FAILED-SYNC] Application $app_name has failed sync (phase=$sync_phase)$(reset)" + + # Check for failed jobs/CRDs + local failed_resources=$(echo "$status" | jq -r ' + .status.resources[]? 
| + select(.kind == "Job" or .kind == "CustomResourceDefinition") | + select(.health.status == "Degraded" or .health.status == "Missing" or .health.status == null) | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$failed_resources" ]]; then + echo "$(red)[CLEANUP] Found failed jobs/CRDs in $app_name:$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true + fi + done <<< "$failed_resources" + fi -# Function: delete pod and wait until it's Running and Ready -restart_and_wait_pod() { - local namespace="$1" - local pattern="$2" + # Terminate stuck operations and refresh + echo "$(yellow)[RESTART] Restarting sync for $app_name...$(reset)" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 - echo "🔍 Looking for pod matching '$pattern' in namespace '$namespace'..." + # Trigger a new sync + echo "$(yellow)[RESYNC] Triggering fresh sync for $app_name...$(reset)" + argocd app sync "$full_app" --grpc-web 2>&1 || true + sleep 5 - # Find the pod name - local pod_name - pod_name=$(kubectl get pods -n "$namespace" | grep "$pattern" | awk '{print $1}') + return 0 + fi - if [ -z "$pod_name" ]; then - echo "❌ No pod found matching pattern '$pattern' in namespace '$namespace'" return 1 - fi - - echo "📌 Found pod: $pod_name. Deleting..." - kubectl delete pod "$pod_name" -n "$namespace" - kubectl wait deployment/"$pattern" -n "$namespace" --for=condition=Available --timeout=120s - -} - -# Function: Dlete Old Cluster Templates that do NOT contain 'k3s' -delete_old_template() { -echo "🔍 Fetching all ClusterTemplates..." -all_templates=$(kubectl get clustertemplate -A --no-headers) - -echo "🚨 Deleting ClusterTemplates that do NOT contain 'k3s' in their name..." - -# Loop through each line of the result -while IFS= read -r line; do - namespace=$(echo "$line" | awk '{print $1}') - template_name=$(echo "$line" | awk '{print $2}') - - # Check if the template name contains "k3s" - if [[ "$template_name" != *k3s* ]]; then - echo "❌ Deleting template '$template_name' in namespace '$namespace'" - kubectl delete clustertemplate "$template_name" -n "$namespace" - else - echo "✅ Keeping template '$template_name' in namespace '$namespace' (contains 'k3s')" - fi -done <<< "$all_templates" - -echo "✅ Cleanup complete." 
-kubectl get clustertemplate -A | grep k3s -} -#restart pod after upgrade call: -restart_and_wait_pod "orch-iam" "nexus-api-gw" -restart_and_wait_pod "orch-cluster" "cluster-manager" -restart_and_wait_pod "orch-cluster" "cluster-manager-template-controller" -restart_and_wait_pod "orch-app" "app-orch-tenant-controller" -#delete old cluster template -delete_old_template -sleep 30s -#delete old secrets -kubectl delete secret tls-boots -n orch-boots +} + +# ============================================================ +# Clean unhealthy jobs for a specific application +# ============================================================ +clean_unhealthy_jobs_for_app() { + local app_name="$1" + + # Check for unhealthy jobs in this app and clean them up + app_resources=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? | + select(.kind == "Job" and (.health.status != "Healthy" or .health.status == null)) | + "\(.namespace) \(.name)" + ') + + if [[ -n "$app_resources" ]]; then + echo "$(yellow)[CLEANUP] Found unhealthy/failed jobs in $app_name:$(reset)" + while IFS= read -r job_line; do + [[ -z "$job_line" ]] && continue + read -r job_ns job_name <<< "$job_line" + echo "$(yellow) - Deleting job $job_name in $job_ns (background)$(reset)" + kubectl delete pods -n "$job_ns" -l job-name="$job_name" --ignore-not-found=true 2>/dev/null & + kubectl delete job "$job_name" -n "$job_ns" --ignore-not-found=true 2>/dev/null & + done <<< "$app_resources" + echo "[INFO] Job cleanup initiated in background, proceeding..." + return 0 + fi + return 1 +} + +print_header() { + echo + echo "$(bold)$(blue)============================================================$(reset)" + echo "$(bold)$(blue)== $1$(reset)" + echo "$(bold)$(blue)============================================================$(reset)" +} + +print_table_header() { + printf "%-18s %-25s %-10s %-10s\n" "Wave" "App Name" "Health" "Sync" + echo "------------------------------------------------------------" +} + +print_table_row() { + local wave="$1" name="$2" health="$3" sync="$4" + local color="" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + color=$(green) + elif [[ "$health" == "Healthy" || "$sync" == "Synced" ]]; then + color=$(yellow) + else + color=$(red) + fi + printf "%s%-18s %-25s %-10s %-10s%s\n" "$color" "$wave" "$name" "$health" "$sync" "$(reset)" +} + +# ============================================================ +# Sync apps one-by-one in wave order (with nice reporting) +# ============================================================ +sync_not_green_apps_once() { + mapfile -t all_apps < <(get_all_apps_by_wave) + [[ ${#all_apps[@]} -eq 0 ]] && { echo "[WARN] No applications found in namespace '$NS'."; return 0; } + + print_header "Applications (Wave-Ordered Status)" + print_table_header + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + print_table_row "$wave" "$name" "$health" "$sync" + done + echo + + # Print summary of NOT-GREEN apps before syncing + echo "$(bold)[INFO] Apps NOT Healthy or NOT Synced:$(reset)" + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + if [[ "$health" != "Healthy" || "$sync" != "Synced" ]]; then + echo "$(red) - $name (wave=$wave) Health=$health Sync=$sync$(reset)" + fi + done + echo + + # Sync NOT-GREEN apps in wave order, skipping root-app until last + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + full_app="${NS}/${name}" + + # Skip root-app for now, 
handle it after all other apps + if [[ "$name" == "root-app" ]]; then + continue + fi + + # First check and handle any failed syncs + echo "[$(get_timestamp)] Checking for failed syncs in $name..." + check_and_handle_failed_sync "$name" + + # Special pre-sync handling for nginx-ingress-pxe-boots + if [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Pre-sync: nginx-ingress-pxe-boots detected - deleting tls-boots secret first...$(reset)" + kubectl delete secret tls-boots -n orch-boots 2>/dev/null || true + sleep 3 + fi + + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] $full_app not found$(reset)" + break + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] $full_app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if (( attempt == 1 )); then + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + synced=true + break + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for $full_app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + # Refresh app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] App is $health, checking for unhealthy jobs...$(reset)" + + # Clean up any unhealthy jobs first + clean_unhealthy_jobs_for_app "$name" + + if (( attempt > 1 )); then + # Hard refresh on retry attempts + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) + rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? + else + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? 
+ fi + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + operation_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + + # Check for failed jobs/CRDs during sync + failed_jobs=$(echo "$status" | jq -r ' + .status.resources[]? | + select(.kind == "Job" and .health.status == "Degraded") | + .name + ' | wc -l) + + if [[ $failed_jobs -gt 0 ]]; then + echo "$(red)[ERROR] $full_app has $failed_jobs failed job(s), triggering cleanup and restart...$(reset)" + # Clean up failed jobs and restart sync + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + argocd app sync "$full_app" --grpc-web 2>&1 || true + start_ts=$(date +%s) # Reset timer + sleep "$GLOBAL_POLL_INTERVAL" + continue + fi + + # Check if sync operation failed + if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then + echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase at [$(get_timestamp)]$(reset)" + timed_out=true + break + fi + + print_table_row "$wave" "$name" "$health" "$sync" + echo " [$(get_timestamp)] Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s at [$(get_timestamp)] (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + # On retry, clean up unhealthy jobs and clear stuck operations + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + else + echo "$(red)[FAIL] Max retries reached for $full_app. 
Proceeding to next app.$(reset)" + fi + done + echo "$(blue)[INFO] Proceeding to next app...$(reset)" + done + + # Now handle root-app sync after all other apps + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] root-app not found in namespace '$NS'.$(reset)" + return 1 + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + wave=$(echo "$status" | jq -r '.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"') + full_app="${NS}/root-app" + + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] root-app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + synced=true + break + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for root-app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "root-app" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + # Refresh root-app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] root-app is $health, refreshing before sync...$(reset)" + if (( attempt > 1 )); then + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Stop any ongoing operations and refresh before sync + echo "[INFO] Stopping ongoing operations and refreshing before sync..." + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. 
Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + print_table_row "$wave" "root-app" "$health" "$sync" + echo " Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + else + echo "$(red)[FAIL] Max retries reached for $full_app.$(reset)" + fi + done + echo "$(blue)[INFO] Finished root-app sync attempt(s).$(reset)" +} + +# ============================================================ +# Sync all apps except root-app (wave order, nice reporting) +# ============================================================ +sync_all_apps_exclude_root() { + mapfile -t all_apps < <(get_all_apps_by_wave) + [[ ${#all_apps[@]} -eq 0 ]] && { echo "[WARN] No applications found in namespace '$NS'."; return 0; } + + print_header "Applications (Wave-Ordered Status, excluding root-app)" + print_table_header + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + if [[ "$name" != "root-app" ]]; then + print_table_row "$wave" "$name" "$health" "$sync" + fi + done + echo + + # Print summary of NOT-GREEN apps before syncing + echo "$(bold)[INFO] Apps NOT Healthy or NOT Synced (excluding root-app):$(reset)" + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + if [[ "$name" != "root-app" && ( "$health" != "Healthy" || "$sync" != "Synced" ) ]]; then + echo "$(red) - $name (wave=$wave) Health=$health Sync=$sync$(reset)" + fi + done + echo + + # Sync NOT-GREEN apps in wave order, skipping root-app + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + full_app="${NS}/${name}" + + if [[ "$name" == "root-app" ]]; then + continue + fi + + # First check and handle any failed syncs + echo "[$(get_timestamp)] Checking for failed syncs in $name..." + check_and_handle_failed_sync "$name" + + # Check for CRD version mismatches + echo "[$(get_timestamp)] Checking for CRD version mismatches in $name..." 
+ check_and_fix_crd_version_mismatch "$name" + + # Special pre-sync handling for nginx-ingress-pxe-boots + if [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Pre-sync: nginx-ingress-pxe-boots detected - deleting tls-boots secret first...$(reset)" + kubectl delete secret tls-boots -n orch-boots 2>/dev/null || true + sleep 3 + fi + + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + if [[ -n "$status" ]]; then + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] $full_app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + synced=true + break + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for $full_app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + # Refresh app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] App is $health, checking for unhealthy jobs...$(reset)" + + # Clean up any unhealthy jobs first + clean_unhealthy_jobs_for_app "$name" + + if (( attempt > 1 )); then + # Hard refresh on retry attempts + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) + rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? + else + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? + fi + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. 
Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + operation_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + + # Check for failed jobs/CRDs during sync + failed_jobs=$(echo "$status" | jq -r ' + .status.resources[]? | + select(.kind == "Job" and .health.status == "Degraded") | + .name + ' | wc -l) + + if [[ $failed_jobs -gt 0 ]]; then + echo "$(red)[ERROR] $full_app has $failed_jobs failed job(s), triggering cleanup and restart...$(reset)" + # Clean up failed jobs and restart sync + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + argocd app sync "$full_app" --grpc-web 2>&1 || true + start_ts=$(date +%s) # Reset timer + sleep "$GLOBAL_POLL_INTERVAL" + continue + fi + + # Check if sync operation failed + if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then + echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" + timed_out=true + break + fi + + print_table_row "$wave" "$name" "$health" "$sync" + echo " Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + # On retry, clean up unhealthy jobs and clear stuck operations + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + else + echo "$(red)[FAIL] Max retries reached for $full_app. Proceeding to next app.$(reset)" + fi + done + echo "$(blue)[INFO] Proceeding to next app...$(reset)" + done +} + +# ============================================================ +# Sync root-app only (with nice reporting) +# ============================================================ +sync_root_app_only() { + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] root-app not found in namespace '$NS'.$(reset)" + return 1 + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + wave=$(echo "$status" | jq -r '.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"') + full_app="${NS}/root-app" + + print_header "root-app Status" + print_table_header + print_table_row "$wave" "root-app" "$health" "$sync" + echo + + # First check and handle any failed syncs + echo "[$(get_timestamp)] Checking for failed syncs in root-app..." 
+ check_and_handle_failed_sync "root-app" + + # Check for CRD version mismatches + echo "[$(get_timestamp)] Checking for CRD version mismatches in root-app..." + check_and_fix_crd_version_mismatch "root-app" + + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] root-app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + return 0 + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for root-app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "root-app" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + # Refresh root-app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] root-app is $health, refreshing before sync...$(reset)" + if (( attempt > 1 )); then + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Stop any ongoing operations and refresh before sync + echo "[INFO] Stopping ongoing operations and refreshing before sync..." + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. 
Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + operation_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + + # Check if sync operation failed + if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then + echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" + timed_out=true + break + fi + + print_table_row "$wave" "root-app" "$health" "$sync" + echo " Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + # On retry, clean up unhealthy jobs and clear stuck operations + clean_unhealthy_jobs_for_app "root-app" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + else + echo "$(red)[FAIL] Max retries reached for $full_app.$(reset)" + fi + + # Re-fetch status for next iteration + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -n "$status" ]]; then + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + fi + done + echo "$(blue)[INFO] Finished root-app sync attempt(s).$(reset)" +} + +# ============================================================ +# Wait until NS is all green (excluding root-app) +# ============================================================ +namespace_all_green_exclude_root() { + kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r ' + .items[] | + select(.metadata.name != "root-app") | + { + health: .status.health.status, + sync: .status.sync.status + } + | select(.health != "Healthy" or .sync != "Synced") + ' | grep -q . + return $? +} + +sync_until_green_ns_exclude_root() { + while true; do + if ! namespace_all_green_exclude_root; then + print_header "All non-root-app applications are Healthy+Synced in namespace '$NS'." 
+ break + fi + + print_header "NOT-GREEN apps (Wave-Ordered, excluding root-app)" + print_table_header + mapfile -t not_green < <(kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r '.items[] | select(.metadata.name != "root-app") | { + name: .metadata.name, + wave: (.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"), + health: .status.health.status, + sync: .status.sync.status + } | "\(.wave) \(.name) \(.health) \(.sync)"' | sort -n -k1) + for line in "${not_green[@]}"; do + read -r wave name health sync <<< "$line" + print_table_row "$wave" "$name" "$health" "$sync" + done + echo + + sync_all_apps_exclude_root + + sleep "10" + done +} + + +# ============================================================ +# Check and delete stuck/out-of-sync dependent CRD jobs +# ============================================================ +check_and_delete_stuck_crd_jobs() { + print_header "Checking for stuck/out-of-sync dependent CRD jobs" + + # Check for stuck jobs in all namespaces + echo "[INFO] Looking for stuck or failed jobs..." + + # Get jobs that are not completed or have failed + stuck_jobs=$(kubectl get jobs --all-namespaces -o json | jq -r ' + .items[] | + select(.status.succeeded != 1 and (.status.failed > 0 or .status.active > 0)) | + "\(.metadata.namespace) \(.metadata.name)" + ') + + if [[ -n "$stuck_jobs" ]]; then + echo "$(yellow)[WARN] Found stuck/failed jobs:$(reset)" + echo "$stuck_jobs" + + # Delete stuck jobs and their pods + while IFS= read -r line; do + [[ -z "$line" ]] && continue + read -r job_ns job_name <<< "$line" + echo "$(yellow)[CLEANUP] Deleting stuck job $job_name in namespace $job_ns (background)$(reset)" + + # Delete associated pods first + kubectl delete pods -n "$job_ns" -l job-name="$job_name" --ignore-not-found=true 2>/dev/null & + + # Delete the job + kubectl delete job "$job_name" -n "$job_ns" --ignore-not-found=true & + done <<< "$stuck_jobs" + + echo "[INFO] Job cleanup initiated in background, proceeding..." + else + echo "$(green)[OK] No stuck jobs found$(reset)" + fi + + # Check for applications that are OutOfSync + echo "[INFO] Looking for OutOfSync applications..." + out_of_sync_apps=$(kubectl get applications.argoproj.io -n "$NS" -o json | jq -r ' + .items[] | + select(.status.sync.status == "OutOfSync") | + .metadata.name + ') + + if [[ -n "$out_of_sync_apps" ]]; then + echo "$(yellow)[WARN] Found OutOfSync applications:$(reset)" + echo "$out_of_sync_apps" + + # Stop and restart sync for OutOfSync apps + while IFS= read -r app_name; do + [[ -z "$app_name" ]] && continue + echo "$(yellow)[CLEANUP] Stopping sync for $app_name$(reset)" + argocd app terminate-op "${NS}/${app_name}" --grpc-web 2>/dev/null || true + sleep 2 + done <<< "$out_of_sync_apps" + else + echo "$(green)[OK] No OutOfSync applications found$(reset)" + fi + + # Check for applications with sync failures + echo "[INFO] Looking for applications with sync failures..." 
+ sync_failed_apps=$(kubectl get applications.argoproj.io -n "$NS" -o json | jq -r ' + .items[] | + select(.status.operationState.phase == "Failed" or .status.operationState.phase == "Error") | + "\(.metadata.name) \(.status.operationState.phase)" + ') + + if [[ -n "$sync_failed_apps" ]]; then + echo "$(red)[WARN] Found applications with sync failures:$(reset)" + echo "$sync_failed_apps" + + # Clean up failed apps + while IFS= read -r line; do + [[ -z "$line" ]] && continue + read -r app_name phase <<< "$line" + echo "$(red)[CLEANUP] App $app_name has phase=$phase, cleaning up...$(reset)" + + # Clean up unhealthy jobs for this app + clean_unhealthy_jobs_for_app "$app_name" + + # Terminate any stuck operations + argocd app terminate-op "${NS}/${app_name}" --grpc-web 2>/dev/null || true + + # Hard refresh to clear the error state + argocd app get "${NS}/${app_name}" --hard-refresh --grpc-web >/dev/null 2>&1 || true + + sleep 2 + done <<< "$sync_failed_apps" + else + echo "$(green)[OK] No sync failed applications found$(reset)" + fi + + echo "[INFO] Stuck CRD jobs check and cleanup completed." +} + +# ============================================================ +# Post-upgrade cleanup function +# ============================================================ +post_upgrade_cleanup() { + print_header "Post-upgrade Cleanup (Manual Fixes)" + + echo "[INFO] Deleting applications tenancy-api-mapping and tenancy-datamodel in namespace onprem..." + kubectl delete application tenancy-api-mapping -n onprem || true + kubectl delete application tenancy-datamodel -n onprem || true + + echo "[INFO] Deleting deployment os-resource-manager in namespace orch-infra..." + kubectl delete deployment -n orch-infra os-resource-manager || true + + echo "[INFO] Deleting onboarding secrets..." + kubectl delete secret tls-boots -n orch-boots || true + kubectl delete secret boots-ca-cert -n orch-gateway || true + kubectl delete secret boots-ca-cert -n orch-infra || true + + echo "[INFO] Deleting dkam pods in namespace orch-infra..." + kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null || true + + echo "[INFO] Post-upgrade cleanup completed." 
+} + +# ============================================================ +# Main sync function with retry logic +# ============================================================ +execute_full_sync() { + sync_until_green_ns_exclude_root + print_header "Syncing root-app after all other apps are green" + sync_root_app_only + + post_upgrade_cleanup + + sleep 60 + print_header "Post-upgrade: Syncing all apps (excluding root-app) again" + sync_all_apps_exclude_root + print_header "Post-upgrade: Syncing root-app again" + sync_root_app_only +} + +# ============================================================ +# Check if sync was successful +# ============================================================ +check_sync_success() { + # Check root-app status + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] root-app not found in namespace '$NS'.$(reset)" + return 1 + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + + if [[ "$health" != "Healthy" || "$sync" != "Synced" ]]; then + echo "$(red)[FAIL] root-app is NOT Healthy+Synced (Health: $health, Sync: $sync)$(reset)" + return 1 + fi + + # Check for any non-healthy apps + not_healthy=$(kubectl get applications.argoproj.io -n "$NS" -o json | jq -r ' + .items[] | + select(.status.health.status != "Healthy" or .status.sync.status != "Synced") | + .metadata.name + ' | wc -l) + + if [[ $not_healthy -gt 0 ]]; then + echo "$(red)[FAIL] $not_healthy applications are not Healthy+Synced$(reset)" + return 1 + fi + kubectl get applications -A + echo "$(green)[OK] All applications are Healthy+Synced$(reset)" + + # Display all applications status + echo + echo "$(bold)$(green)Final Application Status:$(reset)" + + + return 0 +} + +# ============================================================ +# GLOBAL TIMEOUT WATCHDOG +# ============================================================ +SCRIPT_START_TS=$(date +%s) + +# Global retry loop +global_retry=1 +sync_success=false + +while (( global_retry <= GLOBAL_SYNC_RETRIES )); do + print_header "GLOBAL SYNC ATTEMPT ${global_retry}/${GLOBAL_SYNC_RETRIES}" + + execute_full_sync + + if check_sync_success; then + sync_success=true + print_header "Sync Script Completed Successfully" + exit 0 + fi + + if (( global_retry < GLOBAL_SYNC_RETRIES )); then + echo "$(yellow)[RETRY] Sync attempt ${global_retry} failed. Checking for stuck resources...$(reset)" + + # Check and cleanup stuck resources before next retry + check_and_delete_stuck_crd_jobs + + # Stop all ongoing sync operations + echo "[INFO] Stopping all ongoing sync operations..." + mapfile -t all_apps < <(kubectl get applications.argoproj.io -n "$NS" -o jsonpath='{.items[*].metadata.name}') + for app in "${all_apps[@]}"; do + [[ -z "$app" ]] && continue + argocd app terminate-op "${NS}/${app}" --grpc-web 2>/dev/null || true + done + + echo "$(yellow)[INFO] Waiting 30 seconds before retry ${global_retry}...$(reset)" + sleep 30 + + ((global_retry++)) + else + echo "$(red)[FAIL] Maximum global retries (${GLOBAL_SYNC_RETRIES}) reached. 
Sync failed.$(reset)" + exit 1 + fi +done + +# This should not be reached, but just in case +echo "$(red)[FAIL] Sync did not complete successfully after ${GLOBAL_SYNC_RETRIES} attempts.$(reset)" +exit 1 diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index ff9b7c0d8..9f2bd55a7 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -77,6 +77,7 @@ ORCH_INSTALLER_PROFILE="${ORCH_INSTALLER_PROFILE:-onprem}" DEPLOY_VERSION="${DEPLOY_VERSION:-v3.1.0}" # Updated to v3.1.0 GITEA_IMAGE_REGISTRY="${GITEA_IMAGE_REGISTRY:-docker.io}" USE_LOCAL_PACKAGES="${USE_LOCAL_PACKAGES:-false}" # New flag for local packages +UPGRADE_3_1_X="${UPGRADE_3_1_X:-true}" ### Variables cwd=$(pwd) @@ -92,7 +93,7 @@ gitea_ns=gitea # shellcheck disable=SC2034 root_app=root-app -export UPGRADE_3_1_X=true + # Variables that depend on the above and might require updating later, are placed in here set_artifacts_version() { installer_list=( @@ -764,6 +765,16 @@ fi # Modify orch-configs settings for upgrade procedure retrieve_and_apply_config +# Check if kyverno-clean-reports job exists before attempting cleanup +if kubectl get job kyverno-clean-reports -n kyverno >/dev/null 2>&1; then + echo "Cleaning up kyverno-clean-reports job..." + kubectl delete job kyverno-clean-reports -n kyverno & + kubectl delete pods -l job-name="kyverno-clean-reports" -n kyverno & + kubectl patch job kyverno-clean-reports -n kyverno --type=merge -p='{"metadata":{"finalizers":[]}}' +else + echo "kyverno-clean-reports job not found in kyverno namespace, skipping cleanup" +fi + ### Upgrade # Run OS Configuration upgrade @@ -1056,18 +1067,18 @@ sleep 10 # Restore secret after app delete but before postgress restored if [[ "$UPGRADE_3_1_X" == "true" ]]; then - yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - + yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - else - yq e ' - del(.metadata.labels) | - del(.metadata.annotations) | - del(.metadata.ownerReferences) | - del(.metadata.finalizers) | - del(.metadata.managedFields) | - del(.metadata.resourceVersion) | - del(.metadata.uid) | - del(.metadata.creationTimestamp) - ' postgres_secret.yaml | kubectl apply -f - + yq e ' + del(.metadata.labels) | + del(.metadata.annotations) | + del(.metadata.ownerReferences) | + del(.metadata.finalizers) | + del(.metadata.managedFields) | + del(.metadata.resourceVersion) | + del(.metadata.uid) | + del(.metadata.creationTimestamp) + ' postgres_secret.yaml | kubectl apply -f - fi sleep 30 # Wait until PostgreSQL pod is running (Re-sync) @@ -1205,151 +1216,18 @@ fi echo "Applying external-secrets CRDs with server-side apply..." 
kubectl apply --server-side=true --force-conflicts -f https://raw.githubusercontent.com/external-secrets/external-secrets/refs/tags/v0.20.4/deploy/crds/bundle.yaml || true -check_and_force_sync_app external-secrets "$apps_ns" "true" -wait_for_app_synced_healthy external-secrets "$apps_ns" - -# Force sync apps that copy secrets to their destinations -check_and_force_sync_app copy-app-gitea-cred-to-fleet "$apps_ns" -check_and_force_sync_app copy-ca-cert-boots-to-gateway "$apps_ns" -check_and_force_sync_app copy-ca-cert-boots-to-infra "$apps_ns" -check_and_force_sync_app copy-ca-cert-gateway-to-cattle "$apps_ns" -check_and_force_sync_app copy-ca-cert-gateway-to-infra "$apps_ns" -check_and_force_sync_app copy-ca-cert-gitea-to-app "$apps_ns" -check_and_force_sync_app copy-ca-cert-gitea-to-cluster "$apps_ns" -check_and_force_sync_app copy-cluster-gitea-cred-to-fleet "$apps_ns" -check_and_force_sync_app copy-keycloak-admin-to-infra "$apps_ns" - # Unseal vault after external-secrets is ready echo "Unsealing vault..." vault_unseal echo "✅ Vault unsealed successfully" - -kubectl patch -n "$apps_ns" application platform-keycloak --patch-file /tmp/argo-cd/sync-patch.yaml --type merge - -wait_for_app_synced_healthy platform-keycloak "$apps_ns" - -kubectl patch -n "$apps_ns" application cluster-manager --patch-file /tmp/argo-cd/sync-patch.yaml --type merge - - -kubectl delete secret tls-boots -n orch-boots - -# Observability Minio PVC ignoreDifferences patching and job cleanup -kubectl patch job orchestrator-observability-mimir-make-minio-buckets-5.4.0 -n orch-platform --type=merge -p='{"metadata":{"finalizers":[]}}' -kubectl delete job orchestrator-observability-mimir-make-minio-buckets-5.4.0 -n orch-platform --force --grace-period=0 2>/dev/null || true -kubectl delete pods -l job-name="orchestrator-observability-mimir-make-minio-buckets-5.4.0" -n orch-platform --force --grace-period=0 2>/dev/null || true - -kubectl patch application orchestrator-observability -n "$apps_ns" --type='json' -p='[{ - "op": "add", - "path": "/spec/ignoreDifferences", - "value": [{ - "group": "", - "kind": "PersistentVolumeClaim", - "name": "orchestrator-observability-minio", - "jsonPointers": ["/spec/storageClassName", "/spec/volumeName"] - }] -}]' - -kubectl patch job edgenode-observability-mimir-make-minio-buckets-5.4.0 -n orch-infra --type=merge -p='{"metadata":{"finalizers":[]}}' -kubectl delete job edgenode-observability-mimir-make-minio-buckets-5.4.0 -n orch-infra --force --grace-period=0 2>/dev/null || true -kubectl delete pods -l job-name="edgenode-observability-mimir-make-minio-buckets-5.4.0" -n orch-infra --force --grace-period=0 2>/dev/null || true - -kubectl patch application edgenode-observability -n "$apps_ns" --type='json' -p='[{ - "op": "add", - "path": "/spec/ignoreDifferences", - "value": [{ - "group": "", - "kind": "PersistentVolumeClaim", - "name": "edgenode-observability-minio", - "jsonPointers": ["/spec/storageClassName", "/spec/volumeName"] - }] -}]' - -check_and_patch_sync_app edgenode-observability "$apps_ns" -check_and_patch_sync_app orchestrator-observability "$apps_ns" - -# Cleanup infra-external jobs -kubectl delete jobs setup-databases-mps setup-databases-rps amt-dbpassword-secret-job init-amt-vault-job -n orch-infra --force --grace-period=0 --ignore-not-found - - -check_and_cleanup_job "namespace-label" "ns-label" -check_and_cleanup_job "wait-istio-job" "ns-label" - -# Unsynced leftovers using patch sync -# Collect and display syncwave information for OutOfSync applications -echo "OutOfSync 
applications by syncwave:" -outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | - "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ - sort -n) - -echo "$outofsync_apps" | awk '{print " Wave " $1 ": " $2}' - -# Sync applications in wave order -echo "Syncing OutOfSync applications in wave order..." -echo "$outofsync_apps" | while read -r wave app_name; do - if [[ -n "$app_name" ]]; then - echo "Processing wave $wave: $app_name" - check_and_patch_sync_app "$app_name" "$apps_ns" - fi -done - - -# Unsynced leftovers using force sync -# Collect and display syncwave information for OutOfSync applications -echo "OutOfSync applications by syncwave:" -outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | - "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ - sort -n) - -echo "$outofsync_apps" | awk '{print " Wave " $1 ": " $2}' - -# Sync applications in wave order -echo "Syncing OutOfSync applications in wave order..." -echo "$outofsync_apps" | while read -r wave app_name; do - if [[ -n "$app_name" ]]; then - echo "Processing wave $wave: $app_name" - check_and_force_sync_app "$app_name" "$apps_ns" "true" - fi -done - -# Unsynced leftovers using force sync -# Collect and display syncwave information for OutOfSync applications -echo "OutOfSync applications by syncwave:" -outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | - "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ - sort -n) - -echo "$outofsync_apps" | awk '{print " Wave " $1 ": " $2}' - -# Sync applications in wave order -echo "Syncing OutOfSync applications in wave order..." -echo "$outofsync_apps" | while read -r wave app_name; do - if [[ -n "$app_name" ]]; then - echo "Processing wave $wave: $app_name" - check_and_force_sync_app "$app_name" "$apps_ns" - fi -done - # Stop root-app old sync as it will be stuck. kubectl patch application root-app -n "$apps_ns" --type merge -p '{"operation":null}' kubectl patch application root-app -n "$apps_ns" --type json -p '[{"op": "remove", "path": "/status/operationState"}]' - -# OS profiles Fix -kubectl patch application tenancy-api-mapping -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -kubectl patch application tenancy-datamodel -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -kubectl delete application tenancy-api-mapping -n onprem -kubectl delete application tenancy-datamodel -n onprem -kubectl delete deployment -n orch-infra os-resource-manager - # Apply root-app Patch kubectl patch application root-app -n "$apps_ns" --patch-file /tmp/argo-cd/sync-patch.yaml --type merge - -# Onboarding Fix +sleep 10 +#restart tls-boot secrets kubectl delete secret tls-boots -n orch-boots -kubectl delete secret boots-ca-cert -n orch-gateway -kubectl delete secret boots-ca-cert -n orch-infra -kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null +./after_upgrade_restart.sh echo "Upgrade completed! 
Wait for ArgoCD applications to be in 'Synced' and 'Healthy' state" From 53fc24a3c1a7fa034750412daa01790e1911e8a2 Mon Sep 17 00:00:00 2001 From: Andrei Palade Date: Thu, 4 Dec 2025 12:51:07 +0000 Subject: [PATCH 08/12] Fix linter issues in the after_upgrade_restart.sh script (#1212) --- .../onprem/after_upgrade_restart.sh | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index 5551ffc6d..c8875efeb 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -96,7 +96,7 @@ install_argocd_cli() { VERSION=$(curl -L -s https://raw.githubusercontent.com/argoproj/argo-cd/stable/VERSION) echo "[INFO] Latest version: $VERSION" curl -sSL -o argocd-linux-amd64 \ - https://github.com/argoproj/argo-cd/releases/download/v${VERSION}/argocd-linux-amd64 + https://github.com/argoproj/argo-cd/releases/download/v"${VERSION}"/argocd-linux-amd64 sudo install -m 555 argocd-linux-amd64 /usr/local/bin/argocd rm -f argocd-linux-amd64 echo "[INFO] argocd CLI installed successfully." @@ -196,13 +196,15 @@ check_and_fix_crd_version_mismatch() { local app_name="$1" # Get application status - local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + local status + status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) if [[ -z "$status" ]]; then return 1 fi # Check for CRD version mismatch errors in sync messages - local version_mismatch=$(echo "$status" | jq -r ' + local version_mismatch + version_mismatch=$(echo "$status" | jq -r ' .status.conditions[]? | select(.type == "ComparisonError" or .type == "SyncError") | select(.message | contains("could not find version") or contains("Version") and contains("is installed")) | @@ -214,8 +216,10 @@ check_and_fix_crd_version_mismatch() { echo "$version_mismatch" # Extract CRD details from error message - local crd_group=$(echo "$version_mismatch" | grep -oP '[a-z0-9.-]+\.[a-z]+(?=/[A-Z])' | head -1) - local crd_kind=$(echo "$version_mismatch" | grep -oP '/[A-Z][a-zA-Z]+' | sed 's|/||' | head -1) + local crd_group + crd_group=$(echo "$version_mismatch" | grep -oP '[a-z0-9.-]+\.[a-z]+(?=/[A-Z])' | head -1) + local crd_kind + crd_kind=$(echo "$version_mismatch" | grep -oP '/[A-Z][a-zA-Z]+' | sed 's|/||' | head -1) if [[ -n "$crd_group" && -n "$crd_kind" ]]; then # Try to find and list the CRD @@ -254,20 +258,24 @@ check_and_handle_failed_sync() { local full_app="${NS}/${app_name}" # Get application status - local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + local status + status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) if [[ -z "$status" ]]; then return 1 fi - local sync_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') - local sync_status=$(echo "$status" | jq -r '.status.sync.status // "Unknown"') + local sync_phase + sync_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + #local sync_status + #sync_status=$(echo "$status" | jq -r '.status.sync.status // "Unknown"') # Check if sync failed if [[ "$sync_phase" == "Failed" || "$sync_phase" == "Error" ]]; then echo "$(red)[FAILED-SYNC] Application $app_name has failed sync (phase=$sync_phase)$(reset)" # Check for failed jobs/CRDs - local failed_resources=$(echo "$status" | jq -r ' + local failed_resources + 
failed_resources=$(echo "$status" | jq -r ' .status.resources[]? | select(.kind == "Job" or .kind == "CustomResourceDefinition") | select(.health.status == "Degraded" or .health.status == "Missing" or .health.status == null) | @@ -459,7 +467,7 @@ sync_not_green_apps_once() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -489,13 +497,13 @@ sync_not_green_apps_once() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) @@ -527,7 +535,7 @@ sync_not_green_apps_once() { # Check if sync operation failed if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase at [$(get_timestamp)]$(reset)" - timed_out=true + #timed_out=true break fi @@ -627,13 +635,13 @@ sync_not_green_apps_once() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) @@ -761,7 +769,7 @@ sync_all_apps_exclude_root() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -791,13 +799,13 @@ sync_all_apps_exclude_root() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) @@ -829,7 +837,7 @@ sync_all_apps_exclude_root() { # Check if sync operation failed if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" - timed_out=true + #timed_out=true break fi @@ -945,13 +953,13 @@ sync_root_app_only() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) @@ -963,7 +971,7 @@ sync_root_app_only() { # Check if sync operation failed if [[ "$operation_phase" == 
"Failed" || "$operation_phase" == "Error" ]]; then echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" - timed_out=true + #timed_out=true break fi @@ -1228,11 +1236,11 @@ check_sync_success() { # ============================================================ # GLOBAL TIMEOUT WATCHDOG # ============================================================ -SCRIPT_START_TS=$(date +%s) +#SCRIPT_START_TS=$(date +%s) # Global retry loop global_retry=1 -sync_success=false +#sync_success=false while (( global_retry <= GLOBAL_SYNC_RETRIES )); do print_header "GLOBAL SYNC ATTEMPT ${global_retry}/${GLOBAL_SYNC_RETRIES}" @@ -1240,7 +1248,7 @@ while (( global_retry <= GLOBAL_SYNC_RETRIES )); do execute_full_sync if check_sync_success; then - sync_success=true + #sync_success=true print_header "Sync Script Completed Successfully" exit 0 fi From 9c463c6700e073623599f00bd2a50fba47eaad34 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 21:42:47 +0530 Subject: [PATCH 09/12] add finalizer check (#1220) --- .../onprem/after_upgrade_restart.sh | 146 +++++++++++++++--- 1 file changed, 128 insertions(+), 18 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index c8875efeb..83ee967aa 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -287,13 +287,13 @@ check_and_handle_failed_sync() { while IFS= read -r res_line; do [[ -z "$res_line" ]] && continue read -r kind res_ns res_name <<< "$res_line" - echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" + echo "$(red) - Deleting $kind $res_name in $res_ns (background)$(reset)" if [[ "$kind" == "Job" ]]; then kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null & fi done <<< "$failed_resources" fi @@ -466,22 +466,49 @@ sync_not_green_apps_once() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? - # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? @@ -768,22 +795,105 @@ sync_all_apps_exclude_root() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + + # Verify resources are deleted, if still present, force finalizer removal + echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + + if [[ "$kind" == "Job" ]]; then + if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch job "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + if kubectl get crd "$res_name" &>/dev/null; then + echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then + if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + fi + done <<< "$problem_resources" + sleep 2 + fi + # Verify resources are deleted, if still present, force finalizer removal + echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + + if [[ "$kind" == "Job" ]]; then + if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch job 
"$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + if kubectl get crd "$res_name" &>/dev/null; then + echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then + if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + fi + done <<< "$problem_resources" + sleep 2 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? - # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? 
From 77511f49f62c9850bca71abc9021f1c2bb250159 Mon Sep 17 00:00:00 2001 From: Andrei Palade Date: Thu, 4 Dec 2025 16:21:01 +0000 Subject: [PATCH 10/12] Revert "add finalizer check (#1220)" (#1223) --- .../onprem/after_upgrade_restart.sh | 146 +++--------------- 1 file changed, 18 insertions(+), 128 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index 83ee967aa..c8875efeb 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -287,13 +287,13 @@ check_and_handle_failed_sync() { while IFS= read -r res_line; do [[ -z "$res_line" ]] && continue read -r kind res_ns res_name <<< "$res_line" - echo "$(red) - Deleting $kind $res_name in $res_ns (background)$(reset)" + echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" if [[ "$kind" == "Job" ]]; then kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null & + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true fi done <<< "$failed_resources" fi @@ -466,49 +466,22 @@ sync_not_green_apps_once() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 - - # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) - echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" - problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' - .status.resources[]? 
| - select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | - select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | - "\(.kind) \(.namespace) \(.name)" - ') - - if [[ -n "$problem_resources" ]]; then - echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - echo "$(yellow) - Deleting $kind $res_name in $res_ns$(reset)" - - if [[ "$kind" == "Job" ]]; then - kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true - kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true - else - kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true - fi - done <<< "$problem_resources" - echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" - sleep 3 - fi - echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? else - # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? @@ -795,105 +768,22 @@ sync_all_apps_exclude_root() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 - - # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) - echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" - problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' - .status.resources[]? 
| - select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | - select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | - "\(.kind) \(.namespace) \(.name)" - ') - - if [[ -n "$problem_resources" ]]; then - echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" - - if [[ "$kind" == "Job" ]]; then - kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & - kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & - else - kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & - fi - done <<< "$problem_resources" - echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" - sleep 3 - - # Verify resources are deleted, if still present, force finalizer removal - echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - - if [[ "$kind" == "Job" ]]; then - if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch job "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - if kubectl get crd "$res_name" &>/dev/null; then - echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then - if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - fi - done <<< "$problem_resources" - sleep 2 - fi - # Verify resources are deleted, if still present, force finalizer removal - echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - - if [[ "$kind" == "Job" ]]; then - if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch job 
"$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - if kubectl get crd "$res_name" &>/dev/null; then - echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then - if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - fi - done <<< "$problem_resources" - sleep 2 - fi - echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? else - # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? 
From 42aca9670a68bcad39cd79de9c6f58631042e73e Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 21:58:13 +0530 Subject: [PATCH 11/12] added patch for finalizer (#1224) --- .../onprem/after_upgrade_restart.sh | 118 +++++++++++++++--- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index c8875efeb..7d4cc206e 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -287,13 +287,13 @@ check_and_handle_failed_sync() { while IFS= read -r res_line; do [[ -z "$res_line" ]] && continue read -r kind res_ns res_name <<< "$res_line" - echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" + echo "$(red) - Deleting $kind $res_name in $res_ns (background)$(reset)" if [[ "$kind" == "Job" ]]; then kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null & fi done <<< "$failed_resources" fi @@ -466,22 +466,49 @@ sync_not_green_apps_once() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? - # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? @@ -768,22 +795,77 @@ sync_all_apps_exclude_root() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + + # Verify resources are deleted, if still present, force finalizer removal + echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + + if [[ "$kind" == "Job" ]]; then + if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch job "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + if kubectl get crd "$res_name" &>/dev/null; then + echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then + if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + fi + done <<< "$problem_resources" + sleep 2 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? 
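# The cleanup and verification passes above repeat the same clear-finalizers-
# then-delete steps for each resource kind (Job, CustomResourceDefinition,
# ExternalSecret, SecretStore, ClusterSecretStore). A minimal sketch of that
# recipe as one helper; the name force_delete_resource is illustrative only,
# is not defined in after_upgrade_restart.sh, and it folds the script's
# background delete plus later verification into a single synchronous call.
force_delete_resource() {
    local kind="$1" name="$2" ns="$3"   # pass "" for cluster-scoped kinds such as CRDs or ClusterSecretStores
    if [[ -n "$ns" ]]; then
        kubectl patch "$kind" "$name" -n "$ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true
        kubectl delete "$kind" "$name" -n "$ns" --ignore-not-found=true --timeout=10s 2>/dev/null || \
            kubectl delete "$kind" "$name" -n "$ns" --force --grace-period=0 2>/dev/null || true
    else
        kubectl patch "$kind" "$name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true
        kubectl delete "$kind" "$name" --ignore-not-found=true --timeout=10s 2>/dev/null || \
            kubectl delete "$kind" "$name" --force --grace-period=0 2>/dev/null || true
    fi
}
# Illustrative call, mirroring the Job branch above: force_delete_resource job "$res_name" "$res_ns"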
- # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? From fb9ce53c4707b95d26dbb355e6fc8d5ae1001613 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Fri, 5 Dec 2025 06:57:43 +0530 Subject: [PATCH 12/12] updated flag check UPGRADE_FROM_3_1 and lint error fix (#1225) --- .../onprem/after_upgrade_restart.sh | 13 ++--- on-prem-installers/onprem/onprem_upgrade.sh | 53 +++++++++++++++---- on-prem-installers/onprem/upgrade_postgres.sh | 10 ++-- 3 files changed, 54 insertions(+), 22 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index 7d4cc206e..cc4d63e2e 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -467,7 +467,7 @@ sync_not_green_apps_once() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ \ $name\ ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -796,7 +796,7 @@ sync_all_apps_exclude_root() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ \ $name\ ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -1251,7 +1251,8 @@ post_upgrade_cleanup() { kubectl delete secret tls-boots -n orch-boots || true kubectl delete secret boots-ca-cert -n orch-gateway || true kubectl delete secret boots-ca-cert -n orch-infra || true - + echo "[INFO] Waiting 30 seconds for secrets cleanup to complete before deleting dkam pods..." + sleep 30 echo "[INFO] Deleting dkam pods in namespace orch-infra..." 
kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null || true @@ -1265,13 +1266,7 @@ execute_full_sync() { sync_until_green_ns_exclude_root print_header "Syncing root-app after all other apps are green" sync_root_app_only - post_upgrade_cleanup - - sleep 60 - print_header "Post-upgrade: Syncing all apps (excluding root-app) again" - sync_all_apps_exclude_root - print_header "Post-upgrade: Syncing root-app again" sync_root_app_only } diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index 9f2bd55a7..8a68de454 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -8,7 +8,10 @@ # Description: This script: # If requested - does a backup of PVs and cluster's ETCD # Downloads debian packages and repo artifacts, -# Upgrades packages to v3.1.0: +# Upgrades packages from either: +# - v3.1.3 to latest (set -u true or omit, default) +# - v2025.02 to latest (set -u false) +# Upgrades: # - OS config, # - RKE2 and basic cluster components, # - ArgoCD, @@ -18,6 +21,7 @@ # Usage: ./onprem_upgrade # -o: Override production values with dev values # -b: enable backup of Orchestrator PVs before upgrade (optional) +# -u [true|false]: specify source version: true=from 3.1.3 (default), false=from 2025.02 # -h: help (optional) set -e @@ -77,7 +81,8 @@ ORCH_INSTALLER_PROFILE="${ORCH_INSTALLER_PROFILE:-onprem}" DEPLOY_VERSION="${DEPLOY_VERSION:-v3.1.0}" # Updated to v3.1.0 GITEA_IMAGE_REGISTRY="${GITEA_IMAGE_REGISTRY:-docker.io}" USE_LOCAL_PACKAGES="${USE_LOCAL_PACKAGES:-false}" # New flag for local packages -UPGRADE_3_1_X="${UPGRADE_3_1_X:-true}" +# UPGRADE_FROM_3_1_X indicates SOURCE version: true=upgrading FROM 3.1.3, false=upgrading FROM 2025.02 +UPGRADE_FROM_3_1_X="${UPGRADE_FROM_3_1_X:-true}" # Default: upgrading from 3.1.3 ### Variables cwd=$(pwd) @@ -624,21 +629,32 @@ cleanup_gitea_secrets() { usage() { cat >&2 < postgres_secret.yaml else kubectl get secret -n orch-database passwords -o yaml > postgres_secret.yaml @@ -810,7 +843,7 @@ if [[ ! 
-s postgres-secrets-password.txt ]]; then IAM_TENANCY=$(kubectl get secret iam-tenancy-local-postgresql -n orch-iam -o jsonpath='{.data.PGPASSWORD}') PLATFORM_KEYCLOAK=$(kubectl get secret platform-keycloak-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') VAULT=$(kubectl get secret vault-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') - if [[ "$UPGRADE_3_1_X" == "true" ]]; then + if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then POSTGRESQL=$(kubectl get secret postgresql -n orch-database -o jsonpath='{.data.postgres-password}') else POSTGRESQL=$(kubectl get secret orch-database-postgresql -n orch-database -o jsonpath='{.data.password}') @@ -1066,7 +1099,7 @@ patch_secrets sleep 10 # Restore secret after app delete but before postgress restored -if [[ "$UPGRADE_3_1_X" == "true" ]]; then +if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - else yq e ' diff --git a/on-prem-installers/onprem/upgrade_postgres.sh b/on-prem-installers/onprem/upgrade_postgres.sh index 45cd7abe8..e24f49e1e 100755 --- a/on-prem-installers/onprem/upgrade_postgres.sh +++ b/on-prem-installers/onprem/upgrade_postgres.sh @@ -11,7 +11,11 @@ local_backup_path="${POSTGRES_LOCAL_BACKUP_PATH}${local_backup_file}" POSTGRES_USERNAME="postgres" application_namespace=onprem -if [[ "$UPGRADE_3_1_X" == "true" ]]; then +# UPGRADE_FROM_3_1_X is set and exported by onprem_upgrade.sh +# Default to true if not set (upgrading FROM 3.1.3) +UPGRADE_FROM_3_1_X="${UPGRADE_FROM_3_1_X:-true}" + +if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then podname="postgresql-0" else podname="postgresql-cluster-1" @@ -54,7 +58,7 @@ backup_postgres() { fi echo "Backing up databases from pod $podname in namespace $postgres_namespace..." - if [[ "$UPGRADE_3_1_X" == "true" ]]; then + if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then remote_backup_path="/tmp/${postgres_namespace}_backup.sql" else remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" @@ -109,7 +113,7 @@ restore_postgres() { echo "Restoring backup databases from pod $podname in namespace $postgres_namespace..." # Get postgres password from secret - if [[ "$UPGRADE_3_1_X" == "true" ]]; then + if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then PGPASSWORD=$(kubectl get secret -n $postgres_namespace postgresql -o jsonpath='{.data.postgres-password}' | base64 -d) else PGPASSWORD=$(kubectl get secret -n $postgres_namespace orch-database-postgresql -o jsonpath='{.data.password}' | base64 -d)
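The -u option documented in onprem_upgrade.sh is what drives the UPGRADE_FROM_3_1_X selection consumed here by upgrade_postgres.sh (pod name, backup path and password secret). Assuming the option parsing matches the usage text in patch 12, typical invocations would look like the sketch below; only the flag value differs, everything else follows from the diffs above.

# Source deployment is 3.1.3 (default; -u true may be omitted), with PV backup enabled:
./onprem_upgrade.sh -b

# Source deployment is 2025.02: upgrade_postgres.sh then targets pod postgresql-cluster-1,
# reads the orch-database-postgresql secret and stages the dump under /var/lib/postgresql/data:
./onprem_upgrade.sh -u false

# upgrade_postgres.sh re-derives the same default when it is run on its own:
UPGRADE_FROM_3_1_X="${UPGRADE_FROM_3_1_X:-true}"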
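On the lint fix in patch 12: the SERVER_SIDE_APPS membership test keeps its surrounding spaces literal by escaping them rather than quoting the whole right-hand side of =~. A self-contained illustration with made-up app names, plus the equivalent glob form that avoids regex interpretation of $name altogether:

SERVER_SIDE_APPS="app-one app-two app-three"   # made-up list, for illustration only
name="app-two"

# Escaped-space regex form, as used after the lint fix:
if [[ " $SERVER_SIDE_APPS " =~ \ $name\  ]]; then
    echo "$name is synced with --force --replace --server-side"
fi

# Equivalent glob form; $name is matched literally, with no regex interpretation:
if [[ " $SERVER_SIDE_APPS " == *" $name "* ]]; then
    echo "$name is synced with --force --replace --server-side"
fi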