From b7e53c2710ab31f9d43cba74be90f326ad13815f Mon Sep 17 00:00:00 2001 From: Andrei Palade Date: Wed, 3 Dec 2025 16:34:58 +0000 Subject: [PATCH 01/12] Update passwords patching --- on-prem-installers/onprem/onprem_upgrade.sh | 66 ++++----------------- 1 file changed, 11 insertions(+), 55 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index 967c5eff5..d53fd2f42 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -872,7 +872,7 @@ while true; do done set -e -patch_secret() { +patch_secrets() { # Patch secrets with passwords from postgres-secrets-password.txt # If the file is not empty, read the passwords and patch the secrets accordingly @@ -980,40 +980,18 @@ patch_secret() { kubectl patch secret -n orch-infra mps-reader-local-postgresql -p "{\"data\": {\"PGPASSWORD\": \"$MPS\"}}" --type=merge kubectl patch secret -n orch-infra rps-local-postgresql -p "{\"data\": {\"PGPASSWORD\": \"$RPS\"}}" --type=merge kubectl patch secret -n orch-infra rps-reader-local-postgresql -p "{\"data\": {\"PGPASSWORD\": \"$RPS\"}}" --type=merge - # Use a temporary file for the patch payload - patch_file=$(mktemp) - cat > "$patch_file" </dev/null 2>&1; then - kubectl patch secret -n orch-app orch-app-app-orch-catalog-local-postgresql -p "{\"data\": {\"password\": \"$CATALOG_SERVICE\"}}" --type=merge - kubectl patch secret -n orch-iam orch-iam-iam-tenancy -p "{\"data\": {\"password\": \"$IAM_TENANCY\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-alerting -p "{\"data\": {\"password\": \"$ALERTING\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-inventory -p "{\"data\": {\"password\": \"$INVENTORY\"}}" --type=merge - kubectl patch secret -n orch-platform orch-platform-platform-keycloak -p "{\"data\": {\"password\": \"$PLATFORM_KEYCLOAK\"}}" --type=merge - kubectl patch secret -n orch-platform orch-platform-vault -p "{\"data\": {\"password\": \"$VAULT\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-mps -p "{\"data\": {\"password\": \"$MPS\"}}" --type=merge - kubectl patch secret -n orch-infra orch-infra-rps -p "{\"data\": {\"password\": \"$RPS\"}}" --type=merge + if kubectl get secret orch-app-app-orch-catalog -n orch-database >/dev/null 2>&1; then + kubectl patch secret -n orch-database orch-app-app-orch-catalog -p "{\"data\": {\"password\": \"$CATALOG_SERVICE\"}}" --type=merge + kubectl patch secret -n orch-database orch-iam-iam-tenancy -p "{\"data\": {\"password\": \"$IAM_TENANCY\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-alerting -p "{\"data\": {\"password\": \"$ALERTING\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-inventory -p "{\"data\": {\"password\": \"$INVENTORY\"}}" --type=merge + kubectl patch secret -n orch-database orch-platform-platform-keycloak -p "{\"data\": {\"password\": \"$PLATFORM_KEYCLOAK\"}}" --type=merge + kubectl patch secret -n orch-database orch-platform-vault -p "{\"data\": {\"password\": \"$VAULT\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-mps -p "{\"data\": {\"password\": \"$MPS\"}}" --type=merge + kubectl patch secret -n orch-database orch-infra-rps -p "{\"data\": {\"password\": \"$RPS\"}}" --type=merge fi - - kubectl patch secret -n orch-database passwords --type=merge --patch-file "$patch_file" - rm -f "$patch_file" - - # Patch postgresql secret - #kubectl patch secret -n orch-database postgresql -p "{\"data\": {\"postgres-password\": 
\"$POSTGRESQL\"}}" --type=merge } # Stop sync operation for root-app, so it won't be synced with the old version of the application. @@ -1063,7 +1041,7 @@ kubectl patch application root-app -n "$apps_ns" --type json -p '[{"op": "remove sleep 30 kubectl patch -n "$apps_ns" application root-app --patch-file /tmp/sync-postgresql-patch.yaml --type merge sleep 30 -patch_secret +patch_secrets sleep 10 # Restore secret after app delete but before postgress restored @@ -1100,28 +1078,6 @@ restore_postgres # Update ALL database user passwords in PostgreSQL after restore echo "Updating all database user passwords in PostgreSQL..." -# Get all passwords from postgres-secrets-password.txt file (they are base64 encoded) -ALERTING_PASSWORD=$(grep "^Alerting:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -CATALOG_PASSWORD=$(grep "^CatalogService:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -INVENTORY_PASSWORD=$(grep "^Inventory:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -IAM_TENANCY_PASSWORD=$(grep "^IAMTenancy:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -KEYCLOAK_PASSWORD=$(grep "^PlatformKeycloak:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -MPS_PASSWORD=$(grep "^Mps:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -RPS_PASSWORD=$(grep "^Rps:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -VAULT_PASSWORD=$(grep "^Vault:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) -POSTGRESQL_PASSWORD=$(grep "^PostgreSQL:" postgres-secrets-password.txt | cut -d' ' -f2 | base64 -d) - -# Update passwords for all database users -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-platform-vault_user\" WITH PASSWORD '$VAULT_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-alerting_user\" WITH PASSWORD '$ALERTING_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-app-app-orch-catalog_user\" WITH PASSWORD '$CATALOG_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-inventory_user\" WITH PASSWORD '$INVENTORY_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-iam-iam-tenancy_user\" WITH PASSWORD '$IAM_TENANCY_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-platform-platform-keycloak_user\" WITH PASSWORD '$KEYCLOAK_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-mps_user\" WITH PASSWORD '$MPS_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-infra-rps_user\" WITH PASSWORD '$RPS_PASSWORD';" -kubectl exec postgresql-cluster-1 -n orch-database -c postgres -- psql -U postgres -c "ALTER USER \"orch-database-postgresql_user\" WITH PASSWORD '$POSTGRESQL_PASSWORD';" - echo "✅ All database user passwords updated successfully" vault_unseal From b11668f2f39ad008af406f424c6b7e8ef4abac20 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 02:31:28 +0530 Subject: [PATCH 02/12] onprem upgrade rc1 to rc2 fix (#1205) --- on-prem-installers/onprem/onprem_upgrade.sh | 69 ++++++++++++------- on-prem-installers/onprem/upgrade_postgres.sh | 23 +++++-- 2 files changed, 61 
insertions(+), 31 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index d53fd2f42..61d60a56a 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -306,7 +306,7 @@ check_and_force_sync_app() { for ((i=1; i<=max_retries; i++)); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name is Synced and Healthy" return 0 @@ -315,26 +315,26 @@ check_and_force_sync_app() { echo "⚠️ $app_name is not Synced and Healthy (status: $app_status). Force-syncing... (attempt $i/$max_retries)" force_sync_outofsync_app "$app_name" "$namespace" "$server_side_apply" echo "✅ $app_name sync triggered" - + # Check status every 5s for 90s local check_timeout=90 local check_interval=3 local elapsed=0 - + while (( elapsed < check_timeout )); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name became Synced and Healthy" return 0 else echo "Current status: $app_status (elapsed: ${elapsed}s)" fi - + sleep $check_interval elapsed=$((elapsed + check_interval)) done - + echo "⏳ $app_name did not become healthy within ${check_timeout}s" done @@ -348,7 +348,7 @@ check_and_patch_sync_app() { for ((i=1; i<=max_retries; i++)); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name is Synced and Healthy" return 0 @@ -365,21 +365,21 @@ check_and_patch_sync_app() { local check_timeout=90 local check_interval=3 local elapsed=0 - + while (( elapsed < check_timeout )); do app_status=$(kubectl get application "$app_name" -n "$namespace" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") - + if [[ "$app_status" == "Synced Healthy" ]]; then echo "✅ $app_name became Synced and Healthy" return 0 else echo "Current status: $app_status (elapsed: ${elapsed}s)" fi - + sleep $check_interval elapsed=$((elapsed + check_interval)) done - + echo "⏳ $app_name did not become healthy within ${check_timeout}s" done @@ -391,7 +391,7 @@ wait_for_app_synced_healthy() { local app_name=$1 local namespace=$2 local timeout=${3:-120} # Default 120 seconds if not specified - + local start_time start_time=$(date +%s) set +e @@ -446,7 +446,7 @@ check_and_cleanup_job() { local app_name=$1 local namespace=$2 local job_label=${3:-job-name} - + app_status=$(kubectl get application "$app_name" -n "$apps_ns" -o jsonpath='{.status.sync.status} {.status.health.status}' 2>/dev/null || echo "NotFound NotFound") if [[ "$app_status" != "Synced Healthy" ]]; then if kubectl get job -n "$namespace" -l "$job_label" 2>/dev/null | grep "$app_name"; then @@ -668,11 +668,16 @@ if ! check_postgres; then exit 1 fi -# Perform postgreSQL secret backup if not done already +# Perform PostgreSQL secret backup if not done already if [[ ! 
-f postgres_secret.yaml ]]; then - kubectl get secret -n orch-database postgresql -o yaml > postgres_secret.yaml + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + kubectl get secret -n orch-database postgresql -o yaml > postgres_secret.yaml + else + kubectl get secret -n orch-database passwords -o yaml > postgres_secret.yaml + fi fi + # Delete gitea secrets before backup cleanup_gitea_secrets @@ -793,7 +798,11 @@ if [[ ! -s postgres-secrets-password.txt ]]; then IAM_TENANCY=$(kubectl get secret iam-tenancy-local-postgresql -n orch-iam -o jsonpath='{.data.PGPASSWORD}') PLATFORM_KEYCLOAK=$(kubectl get secret platform-keycloak-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') VAULT=$(kubectl get secret vault-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') - POSTGRESQL=$(kubectl get secret postgresql -n orch-database -o jsonpath='{.data.postgres-password}') + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + POSTGRESQL=$(kubectl get secret postgresql -n orch-database -o jsonpath='{.data.postgres-password}') + else + POSTGRESQL=$(kubectl get secret orch-database-postgresql -n orch-database -o jsonpath='{.data.password}') + fi MPS=$(kubectl get secret mps-local-postgresql -n orch-infra -o jsonpath='{.data.PGPASSWORD}') RPS=$(kubectl get secret rps-local-postgresql -n orch-infra -o jsonpath='{.data.PGPASSWORD}') { @@ -1045,8 +1054,20 @@ patch_secrets sleep 10 # Restore secret after app delete but before postgress restored -yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - - +if [[ "$UPGRADE_3_1_X" == "true" ]]; then + yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - +else + yq e ' + del(.metadata.labels) | + del(.metadata.annotations) | + del(.metadata.ownerReferences) | + del(.metadata.finalizers) | + del(.metadata.managedFields) | + del(.metadata.resourceVersion) | + del(.metadata.uid) | + del(.metadata.creationTimestamp) + ' postgres_secret.yaml | kubectl apply -f - +fi sleep 30 # Wait until PostgreSQL pod is running (Re-sync) start_time=$(date +%s) @@ -1256,7 +1277,7 @@ check_and_cleanup_job "wait-istio-job" "ns-label" # Collect and display syncwave information for OutOfSync applications echo "OutOfSync applications by syncwave:" outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | + jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ sort -n) @@ -1276,7 +1297,7 @@ done # Collect and display syncwave information for OutOfSync applications echo "OutOfSync applications by syncwave:" outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | + jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ sort -n) @@ -1295,7 +1316,7 @@ done # Collect and display syncwave information for OutOfSync applications echo "OutOfSync applications by syncwave:" outofsync_apps=$(kubectl get applications -n "$apps_ns" 
-o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | + jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ sort -n) @@ -1316,10 +1337,10 @@ kubectl patch application root-app -n "$apps_ns" --type json -p '[{"op": "remo # OS profiles Fix kubectl patch application tenancy-api-mapping -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -kubectl patch application tenancy-datamodel -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge +kubectl patch application tenancy-datamodel -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge kubectl delete application tenancy-api-mapping -n onprem kubectl delete application tenancy-datamodel -n onprem -kubectl delete deployment -n orch-infra os-resource-manager +kubectl delete deployment -n orch-infra os-resource-manager # Apply root-app Patch kubectl patch application root-app -n "$apps_ns" --patch-file /tmp/argo-cd/sync-patch.yaml --type merge @@ -1330,4 +1351,4 @@ kubectl delete secret boots-ca-cert -n orch-gateway kubectl delete secret boots-ca-cert -n orch-infra kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null -echo "Upgrade completed! Wait for ArgoCD applications to be in 'Synced' and 'Healthy' state" \ No newline at end of file +echo "Upgrade completed! Wait for ArgoCD applications to be in 'Synced' and 'Healthy' state" diff --git a/on-prem-installers/onprem/upgrade_postgres.sh b/on-prem-installers/onprem/upgrade_postgres.sh index 9150a73a3..d138619ac 100755 --- a/on-prem-installers/onprem/upgrade_postgres.sh +++ b/on-prem-installers/onprem/upgrade_postgres.sh @@ -4,17 +4,22 @@ # # SPDX-License-Identifier: Apache-2.0 -podname="postgresql-0" postgres_namespace=orch-database -POSTGRES_LOCAL_BACKUP_PATH="./" +POSTGRES_LOCAL_BACKUP_PATH="./" local_backup_file="${postgres_namespace}_backup.sql" local_backup_path="${POSTGRES_LOCAL_BACKUP_PATH}${local_backup_file}" -POSTGRES_USERNAME="postgres" +POSTGRES_USERNAME="postgres" application_namespace=onprem +if [[ "$UPGRADE_3_1_X" == "true" ]]; then + podname="postgresql-0" +else + podname="postgresql-cluster-1" +fi + check_postgres() { if [[ -f "$local_backup_path" ]]; then - read -rp "A backfile file already exists. + read -rp "A backfile file already exists. If you would like to continue using this backup file type Continue : " confirm && [[ $confirm == [cC][oO][nN][tT][iI][nN][uU][eE] ]] || exit 1 # avoid the rest of the check function as this could be a recovery from a failed update @@ -49,7 +54,7 @@ backup_postgres() { fi echo "Backing up databases from pod $podname in namespace $postgres_namespace..." - remote_backup_path="/tmp/${postgres_namespace}_backup.sql" + remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "$(typeset -f disable_security); disable_security" if kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "pg_dumpall -U $POSTGRES_USERNAME -f '$remote_backup_path'"; then @@ -99,11 +104,15 @@ restore_postgres() { echo "Restoring backup databases from pod $podname in namespace $postgres_namespace..." 
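Taken together, the backup and restore changes in this file amount to a pg_dumpall/psql round trip run inside the database pod. A minimal standalone sketch of that flow, assuming the CloudNativePG pod and secret names used elsewhere in this series (postgresql-cluster-1, orch-database-postgresql), rather than the exact code of the patch:

    ns=orch-database
    pod=postgresql-cluster-1                           # CloudNativePG instance pod (see the pod check added later in the series)
    dump="/var/lib/postgresql/data/${ns}_backup.sql"   # path writable by the postgres user inside the pod
    # Dump every database and role to a file inside the pod
    kubectl exec -n "$ns" "$pod" -c postgres -- pg_dumpall -U postgres -f "$dump"
    # Read the superuser password from the CloudNativePG-managed secret
    PGPASSWORD=$(kubectl get secret -n "$ns" orch-database-postgresql \
        -o jsonpath='{.data.password}' | base64 -d)
    # Replay the dump; psql reads the password from the environment
    kubectl exec -n "$ns" "$pod" -c postgres -- \
        env PGPASSWORD="$PGPASSWORD" psql -U postgres -f "$dump"

The full functions also stage a local copy of the dump (local_backup_path above) and run the restore only after the new deployment is in place; the sketch shows just the in-pod commands.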
# Get postgres password from secret - PGPASSWORD=$(kubectl get secret -n $postgres_namespace postgresql -o jsonpath='{.data.postgres-password}' | base64 -d) + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + PGPASSWORD=$(kubectl get secret -n $postgres_namespace postgresql -o jsonpath='{.data.postgres-password}' | base64 -d) +else + PGPASSWORD=$(kubectl get secret -n $postgres_namespace orch-database-postgresql -o jsonpath='{.data.password}' | base64 -d) +fi # CloudNativePG doesn't need security disable/enable, just use credentials # Use the remote backup file that was copied to the pod kubectl exec -n $postgres_namespace "$podname" -c postgres -- env PGPASSWORD="$PGPASSWORD" psql -U $POSTGRES_USERNAME -f "$remote_backup_path" echo "Restore completed successfully." -} \ No newline at end of file +} From 45d88faa9b5d76fb05728e2b51c916d74c0c4bf8 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 09:18:58 +0530 Subject: [PATCH 03/12] upgrade check added (#1206) --- on-prem-installers/onprem/upgrade_postgres.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/on-prem-installers/onprem/upgrade_postgres.sh b/on-prem-installers/onprem/upgrade_postgres.sh index d138619ac..45cd7abe8 100755 --- a/on-prem-installers/onprem/upgrade_postgres.sh +++ b/on-prem-installers/onprem/upgrade_postgres.sh @@ -54,7 +54,12 @@ backup_postgres() { fi echo "Backing up databases from pod $podname in namespace $postgres_namespace..." - remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" + if [[ "$UPGRADE_3_1_X" == "true" ]]; then + remote_backup_path="/tmp/${postgres_namespace}_backup.sql" + else + remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" + fi + kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "$(typeset -f disable_security); disable_security" if kubectl exec -n $postgres_namespace $podname -- /bin/bash -c "pg_dumpall -U $POSTGRES_USERNAME -f '$remote_backup_path'"; then From a928d34611cf542f5a1b184415a8c8bf28069ea8 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 10:00:23 +0530 Subject: [PATCH 04/12] postgresql pod check for upgrade flow (#1207) --- on-prem-installers/onprem/onprem_upgrade.sh | 24 +++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index 61d60a56a..f8a79266c 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -92,6 +92,30 @@ gitea_ns=gitea # shellcheck disable=SC2034 root_app=root-app +postgres_namespace=orch-database +echo "Checking PostgreSQL pod in namespace: $postgres_namespace" +# Get all pods once (optimized) +pods=$(kubectl get pod -n "$postgres_namespace" --no-headers 2>/dev/null) +# Check for new version pod +echo "Checking for: postgresql-cluster-1" +if echo "$pods" | grep -q "^postgresql-cluster-1"; then + export UPGRADE_3_1_X=false + podname=postgresql-cluster-1 + echo "Onprem Upgrade from Rel3.1.x" +elif echo "$pods" | grep -q "^postgresql-0"; then + export UPGRADE_3_1_X=true + echo "Onprem Upgrade from latest release where postgresql-cluster-1" + podname=postgresql-0 +# No valid pod found +else + echo "❌ ERROR: No valid PostgreSQL pod found!" 
+ echo "Expected:" + echo " - postgresql-cluster-1 (new version, 2025.02+)" + echo " - postgresql-0 (old version, 3.1.3 and below)" + exit 1 +fi +echo "Selected PostgreSQL pod → $podname" + # Variables that depend on the above and might require updating later, are placed in here set_artifacts_version() { installer_list=( From 88e48e842b521507ce6256587cf1e63c52a9c480 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 10:11:24 +0530 Subject: [PATCH 05/12] fix check --- on-prem-installers/onprem/onprem_upgrade.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index f8a79266c..b68320e03 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -99,12 +99,12 @@ pods=$(kubectl get pod -n "$postgres_namespace" --no-headers 2>/dev/null) # Check for new version pod echo "Checking for: postgresql-cluster-1" if echo "$pods" | grep -q "^postgresql-cluster-1"; then - export UPGRADE_3_1_X=false + export UPGRADE_3_1_X=false + cho "Onprem Upgrade from latest release where postgresql-cluster-1" podname=postgresql-cluster-1 - echo "Onprem Upgrade from Rel3.1.x" elif echo "$pods" | grep -q "^postgresql-0"; then export UPGRADE_3_1_X=true - echo "Onprem Upgrade from latest release where postgresql-cluster-1" + echo "Onprem Upgrade from Rel3.1.x" podname=postgresql-0 # No valid pod found else From 0b2ac9af0e2e96a439ac3ab4b61ab21511678f84 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 16:03:01 +0530 Subject: [PATCH 06/12] Update onprem_upgrade.sh --- on-prem-installers/onprem/onprem_upgrade.sh | 25 +-------------------- 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index b68320e03..ff9b7c0d8 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -92,30 +92,7 @@ gitea_ns=gitea # shellcheck disable=SC2034 root_app=root-app -postgres_namespace=orch-database -echo "Checking PostgreSQL pod in namespace: $postgres_namespace" -# Get all pods once (optimized) -pods=$(kubectl get pod -n "$postgres_namespace" --no-headers 2>/dev/null) -# Check for new version pod -echo "Checking for: postgresql-cluster-1" -if echo "$pods" | grep -q "^postgresql-cluster-1"; then - export UPGRADE_3_1_X=false - cho "Onprem Upgrade from latest release where postgresql-cluster-1" - podname=postgresql-cluster-1 -elif echo "$pods" | grep -q "^postgresql-0"; then - export UPGRADE_3_1_X=true - echo "Onprem Upgrade from Rel3.1.x" - podname=postgresql-0 -# No valid pod found -else - echo "❌ ERROR: No valid PostgreSQL pod found!" 
- echo "Expected:" - echo " - postgresql-cluster-1 (new version, 2025.02+)" - echo " - postgresql-0 (old version, 3.1.3 and below)" - exit 1 -fi -echo "Selected PostgreSQL pod → $podname" - +export UPGRADE_3_1_X=true # Variables that depend on the above and might require updating later, are placed in here set_artifacts_version() { installer_list=( From 0c51d145e467a31c09ac3155e5c9d9a80d818630 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 17:43:52 +0530 Subject: [PATCH 07/12] Onprem stability issue (#1208) Co-authored-by: Andrei Palade --- .../onprem/after_upgrade_restart.sh | 1319 ++++++++++++++++- on-prem-installers/onprem/onprem_upgrade.sh | 174 +-- 2 files changed, 1286 insertions(+), 207 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index a6dbac55f..5551ffc6d 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -5,69 +5,1270 @@ # SPDX-License-Identifier: Apache-2.0 # Description: -# This script is used after an upgrade to perform the following tasks: -# - Restart the following key components: -# • nexus-api-gw -# • cluster-manager-template-controller -# • app-orch-tenant-controller -# - Delete old ClusterTemplates that do NOT contain "k3s" in their name +# ArgoCD Application Sync Script with Advanced Retry and Recovery Logic +# +# This script manages the synchronization of ArgoCD applications in wave order, +# with comprehensive error handling, failed sync detection, and automatic recovery. +# It handles stuck jobs, degraded applications, and failed CRDs, ensuring all +# applications reach a Healthy+Synced state. +# +# Features: +# - Wave-ordered application synchronization +# - Automatic detection and cleanup of failed syncs +# - Real-time job/CRD failure detection during sync +# - Automatic restart of failed applications +# - Global retry mechanism (4 attempts) +# - Per-application retry logic (3 attempts) +# - Timestamp tracking for all operations +# - Unhealthy job and CRD cleanup +# - OutOfSync application handling +# - Root-app special handling +# - Post-upgrade cleanup: Removes obsolete applications (tenancy-api-mapping, +# tenancy-datamodel), legacy deployments (os-resource-manager), and stale +# secrets (tls-boots, boots-ca-cert) to ensure clean upgrade state # # Usage: -# ./after_upgrade_restart.sh +# ./after_upgrade_restart.sh [NAMESPACE] +# +# Arguments: +# NAMESPACE - Target namespace for applications (optional, default: onprem) +# +# The script will: +# 1. Install ArgoCD CLI if not present +# 2. Login to ArgoCD server +# 3. Sync all applications excluding root-app +# 4. Perform post-upgrade cleanup +# 5. Re-sync all applications +# 6. 
Validate final state +# + +# Examples: +# ./after_upgrade_restart.sh # Uses default namespace 'onprem' +# +# Environment Variables: +# ARGO_NS - ArgoCD namespace (default: argocd) +# +# Exit Codes: +# 0 - All applications synced successfully +# 1 - Sync failed after all retries + +set -o pipefail + +# ============================================================ +# ============= GLOBAL CONFIGURATION VARIABLES =============== +# ============================================================ + +# Parse command-line arguments +NS="${1:-onprem}" # Use first argument or default to "onprem" +ARGO_NS="argocd" + +echo "[INFO] Using namespace: $NS" +echo "[INFO] Using ArgoCD namespace: $ARGO_NS" + +# Sync behaviour +GLOBAL_POLL_INTERVAL=10 # seconds +APP_MAX_WAIT=60 # 5 minutes to wait for any app (Healthy+Synced) +APP_MAX_RETRIES=3 # retry X times for each app +GLOBAL_SYNC_RETRIES=2 # Global retry for entire sync process + +# Apps requiring server-side apply (space-separated list) +SERVER_SIDE_APPS="external-secrets copy-app-gitea-cred-to-fleet copy-ca-cert-boots-to-gateway copy-ca-cert-boots-to-infra copy-ca-cert-gateway-to-cattle copy-ca-cert-gateway-to-infra copy-ca-cert-gitea-to-app copy-ca-cert-gitea-to-cluster copy-cluster-gitea-cred-to-fleet copy-keycloak-admin-to-infra infra-external platform-keycloak namespace-label wait-istio-job" + +# shellcheck disable=SC1091 +# ============================================================ +# REQUIRE COMMANDS +# ============================================================ +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "[ERROR] Required command '$1' not found. Install it and retry." + exit 1 + fi +} +require_cmd kubectl +require_cmd jq + +# ============================================================ +# ArgoCD CLI Install +# ============================================================ +install_argocd_cli() { + if ! command -v argocd >/dev/null 2>&1; then + echo "[INFO] argocd CLI not found. Installing..." + VERSION=$(curl -L -s https://raw.githubusercontent.com/argoproj/argo-cd/stable/VERSION) + echo "[INFO] Latest version: $VERSION" + curl -sSL -o argocd-linux-amd64 \ + https://github.com/argoproj/argo-cd/releases/download/v${VERSION}/argocd-linux-amd64 + sudo install -m 555 argocd-linux-amd64 /usr/local/bin/argocd + rm -f argocd-linux-amd64 + echo "[INFO] argocd CLI installed successfully." +else + echo "[INFO] argocd CLI already installed: $(argocd version --client | head -1)" +fi +} +install_argocd_cli + +# ============================================================ +# Fetch admin password +# ============================================================ +echo "[INFO] Fetching ArgoCD admin password..." +if command -v yq >/dev/null 2>&1; then + ADMIN_PASSWD=$(kubectl get secret -n "$ARGO_NS" argocd-initial-admin-secret -o yaml \ + | yq -r '.data.password' | base64 -d) +else + ADMIN_PASSWD=$(kubectl get secret -n "$ARGO_NS" argocd-initial-admin-secret \ + -o jsonpath='{.data.password}' | base64 -d) +fi + +# ============================================================ +# Discover Argo endpoint +# ============================================================ +echo "[INFO] Detecting ArgoCD Server endpoint..." 
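The detection below prefers the argocd-server LoadBalancer IP and falls back to a NodePort. For completeness, a third option for clusters that expose neither, sketched here only as an illustration (the script itself implements just the two paths below), is a local port-forward:

    # Forward the argocd-server service to localhost and log in through it
    kubectl port-forward svc/argocd-server -n "$ARGO_NS" 8080:443 >/dev/null 2>&1 &
    PF_PID=$!
    ARGO_ENDPOINT="localhost:8080"
    # ...and once the sync run has finished:
    kill "$PF_PID" 2>/dev/null || true

Because the login call below uses --insecure and --grpc-web, a forwarded HTTPS port behaves the same as the other two endpoint types.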
+LB_IP=$(kubectl get svc argocd-server -n "$ARGO_NS" \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + +if [[ -n "$LB_IP" ]]; then + ARGO_ENDPOINT="$LB_IP" + echo "[INFO] Using LoadBalancer IP: $ARGO_ENDPOINT" +else + NODEPORT=$(kubectl get svc argocd-server -n "$ARGO_NS" -o jsonpath='{.spec.ports[0].nodePort}') + NODEIP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | awk '{print $1}') + ARGO_ENDPOINT="${NODEIP}:${NODEPORT}" + echo "[INFO] Using NodePort: $ARGO_ENDPOINT" +fi + +# ============================================================ +# Argo Login +# ============================================================ +echo "[INFO] Logging into ArgoCD..." +argocd login "$ARGO_ENDPOINT" --username admin --password "$ADMIN_PASSWD" --insecure --grpc-web +echo "[INFO] Login OK." + +# ============================================================ +# Fetch all apps by wave +# ============================================================ +get_all_apps_by_wave() { + kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r '.items[] | + { + name: .metadata.name, + wave: (.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"), + health: .status.health.status, + sync: .status.sync.status + } + | "\(.wave) \(.name) \(.health) \(.sync)" + ' | sort -n -k1 +} + +# ============================================================ +# Fetch NOT-GREEN apps by wave +# ============================================================ +get_not_green_apps() { + kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r '.items[] | + { + name: .metadata.name, + wave: (.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"), + health: .status.health.status, + sync: .status.sync.status + } + | select(.health != "Healthy" or .sync != "Synced") + | "\(.wave) \(.name) \(.health) \(.sync)" + ' | sort -n -k1 +} + +# Optional color helpers +bold() { tput bold 2>/dev/null; } +normal() { tput sgr0 2>/dev/null; } +green() { tput setaf 2>/dev/null 2 && tput setaf 2; } +red() { tput setaf 1 2>/dev/null; } +yellow() { tput setaf 3 2>/dev/null; } +blue() { tput setaf 4 2>/dev/null; } +reset() { tput sgr0 2>/dev/null; } + +# Get timestamp +get_timestamp() { + date '+%Y-%m-%d %H:%M:%S' +} + +# ============================================================ +# Check and fix CRD version mismatches +# ============================================================ +check_and_fix_crd_version_mismatch() { + local app_name="$1" + + # Get application status + local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + return 1 + fi + + # Check for CRD version mismatch errors in sync messages + local version_mismatch=$(echo "$status" | jq -r ' + .status.conditions[]? 
| + select(.type == "ComparisonError" or .type == "SyncError") | + select(.message | contains("could not find version") or contains("Version") and contains("is installed")) | + .message + ' 2>/dev/null) + + if [[ -n "$version_mismatch" ]]; then + echo "$(red)[CRD-VERSION-MISMATCH] Detected CRD version mismatch in $app_name:$(reset)" + echo "$version_mismatch" + + # Extract CRD details from error message + local crd_group=$(echo "$version_mismatch" | grep -oP '[a-z0-9.-]+\.[a-z]+(?=/[A-Z])' | head -1) + local crd_kind=$(echo "$version_mismatch" | grep -oP '/[A-Z][a-zA-Z]+' | sed 's|/||' | head -1) + + if [[ -n "$crd_group" && -n "$crd_kind" ]]; then + # Try to find and list the CRD + local crd_name="${crd_kind,,}s.${crd_group}" + echo "$(yellow)[INFO] Looking for CRD: $crd_name$(reset)" + + # Check if CRD exists + if kubectl get crd "$crd_name" &>/dev/null; then + echo "$(yellow)[INFO] CRD $crd_name exists, checking versions...$(reset)" + kubectl get crd "$crd_name" -o jsonpath='{.spec.versions[*].name}' 2>/dev/null + echo + + # For external-secrets.io, we need to update to v1beta1 + if [[ "$crd_group" == "external-secrets.io" ]]; then + echo "$(yellow)[FIX] Attempting to refresh application to use correct CRD version...$(reset)" + argocd app get "${NS}/${app_name}" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + return 0 + fi + else + echo "$(red)[ERROR] CRD $crd_name not found on cluster$(reset)" + fi + fi + + return 0 + fi + + return 1 +} + +# ============================================================ +# Check if application has failed sync and needs cleanup +# ============================================================ +check_and_handle_failed_sync() { + local app_name="$1" + local full_app="${NS}/${app_name}" + + # Get application status + local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + return 1 + fi + + local sync_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + local sync_status=$(echo "$status" | jq -r '.status.sync.status // "Unknown"') + + # Check if sync failed + if [[ "$sync_phase" == "Failed" || "$sync_phase" == "Error" ]]; then + echo "$(red)[FAILED-SYNC] Application $app_name has failed sync (phase=$sync_phase)$(reset)" + + # Check for failed jobs/CRDs + local failed_resources=$(echo "$status" | jq -r ' + .status.resources[]? 
| + select(.kind == "Job" or .kind == "CustomResourceDefinition") | + select(.health.status == "Degraded" or .health.status == "Missing" or .health.status == null) | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$failed_resources" ]]; then + echo "$(red)[CLEANUP] Found failed jobs/CRDs in $app_name:$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true + fi + done <<< "$failed_resources" + fi -# Function: delete pod and wait until it's Running and Ready -restart_and_wait_pod() { - local namespace="$1" - local pattern="$2" + # Terminate stuck operations and refresh + echo "$(yellow)[RESTART] Restarting sync for $app_name...$(reset)" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 - echo "🔍 Looking for pod matching '$pattern' in namespace '$namespace'..." + # Trigger a new sync + echo "$(yellow)[RESYNC] Triggering fresh sync for $app_name...$(reset)" + argocd app sync "$full_app" --grpc-web 2>&1 || true + sleep 5 - # Find the pod name - local pod_name - pod_name=$(kubectl get pods -n "$namespace" | grep "$pattern" | awk '{print $1}') + return 0 + fi - if [ -z "$pod_name" ]; then - echo "❌ No pod found matching pattern '$pattern' in namespace '$namespace'" return 1 - fi - - echo "📌 Found pod: $pod_name. Deleting..." - kubectl delete pod "$pod_name" -n "$namespace" - kubectl wait deployment/"$pattern" -n "$namespace" --for=condition=Available --timeout=120s - -} - -# Function: Dlete Old Cluster Templates that do NOT contain 'k3s' -delete_old_template() { -echo "🔍 Fetching all ClusterTemplates..." -all_templates=$(kubectl get clustertemplate -A --no-headers) - -echo "🚨 Deleting ClusterTemplates that do NOT contain 'k3s' in their name..." - -# Loop through each line of the result -while IFS= read -r line; do - namespace=$(echo "$line" | awk '{print $1}') - template_name=$(echo "$line" | awk '{print $2}') - - # Check if the template name contains "k3s" - if [[ "$template_name" != *k3s* ]]; then - echo "❌ Deleting template '$template_name' in namespace '$namespace'" - kubectl delete clustertemplate "$template_name" -n "$namespace" - else - echo "✅ Keeping template '$template_name' in namespace '$namespace' (contains 'k3s')" - fi -done <<< "$all_templates" - -echo "✅ Cleanup complete." 
-kubectl get clustertemplate -A | grep k3s -} -#restart pod after upgrade call: -restart_and_wait_pod "orch-iam" "nexus-api-gw" -restart_and_wait_pod "orch-cluster" "cluster-manager" -restart_and_wait_pod "orch-cluster" "cluster-manager-template-controller" -restart_and_wait_pod "orch-app" "app-orch-tenant-controller" -#delete old cluster template -delete_old_template -sleep 30s -#delete old secrets -kubectl delete secret tls-boots -n orch-boots +} + +# ============================================================ +# Clean unhealthy jobs for a specific application +# ============================================================ +clean_unhealthy_jobs_for_app() { + local app_name="$1" + + # Check for unhealthy jobs in this app and clean them up + app_resources=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? | + select(.kind == "Job" and (.health.status != "Healthy" or .health.status == null)) | + "\(.namespace) \(.name)" + ') + + if [[ -n "$app_resources" ]]; then + echo "$(yellow)[CLEANUP] Found unhealthy/failed jobs in $app_name:$(reset)" + while IFS= read -r job_line; do + [[ -z "$job_line" ]] && continue + read -r job_ns job_name <<< "$job_line" + echo "$(yellow) - Deleting job $job_name in $job_ns (background)$(reset)" + kubectl delete pods -n "$job_ns" -l job-name="$job_name" --ignore-not-found=true 2>/dev/null & + kubectl delete job "$job_name" -n "$job_ns" --ignore-not-found=true 2>/dev/null & + done <<< "$app_resources" + echo "[INFO] Job cleanup initiated in background, proceeding..." + return 0 + fi + return 1 +} + +print_header() { + echo + echo "$(bold)$(blue)============================================================$(reset)" + echo "$(bold)$(blue)== $1$(reset)" + echo "$(bold)$(blue)============================================================$(reset)" +} + +print_table_header() { + printf "%-18s %-25s %-10s %-10s\n" "Wave" "App Name" "Health" "Sync" + echo "------------------------------------------------------------" +} + +print_table_row() { + local wave="$1" name="$2" health="$3" sync="$4" + local color="" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + color=$(green) + elif [[ "$health" == "Healthy" || "$sync" == "Synced" ]]; then + color=$(yellow) + else + color=$(red) + fi + printf "%s%-18s %-25s %-10s %-10s%s\n" "$color" "$wave" "$name" "$health" "$sync" "$(reset)" +} + +# ============================================================ +# Sync apps one-by-one in wave order (with nice reporting) +# ============================================================ +sync_not_green_apps_once() { + mapfile -t all_apps < <(get_all_apps_by_wave) + [[ ${#all_apps[@]} -eq 0 ]] && { echo "[WARN] No applications found in namespace '$NS'."; return 0; } + + print_header "Applications (Wave-Ordered Status)" + print_table_header + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + print_table_row "$wave" "$name" "$health" "$sync" + done + echo + + # Print summary of NOT-GREEN apps before syncing + echo "$(bold)[INFO] Apps NOT Healthy or NOT Synced:$(reset)" + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + if [[ "$health" != "Healthy" || "$sync" != "Synced" ]]; then + echo "$(red) - $name (wave=$wave) Health=$health Sync=$sync$(reset)" + fi + done + echo + + # Sync NOT-GREEN apps in wave order, skipping root-app until last + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + full_app="${NS}/${name}" + + # Skip root-app for now, 
handle it after all other apps + if [[ "$name" == "root-app" ]]; then + continue + fi + + # First check and handle any failed syncs + echo "[$(get_timestamp)] Checking for failed syncs in $name..." + check_and_handle_failed_sync "$name" + + # Special pre-sync handling for nginx-ingress-pxe-boots + if [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Pre-sync: nginx-ingress-pxe-boots detected - deleting tls-boots secret first...$(reset)" + kubectl delete secret tls-boots -n orch-boots 2>/dev/null || true + sleep 3 + fi + + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] $full_app not found$(reset)" + break + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] $full_app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if (( attempt == 1 )); then + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + synced=true + break + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for $full_app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + # Refresh app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] App is $health, checking for unhealthy jobs...$(reset)" + + # Clean up any unhealthy jobs first + clean_unhealthy_jobs_for_app "$name" + + if (( attempt > 1 )); then + # Hard refresh on retry attempts + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) + rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? + else + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? 
+ fi + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + operation_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + + # Check for failed jobs/CRDs during sync + failed_jobs=$(echo "$status" | jq -r ' + .status.resources[]? | + select(.kind == "Job" and .health.status == "Degraded") | + .name + ' | wc -l) + + if [[ $failed_jobs -gt 0 ]]; then + echo "$(red)[ERROR] $full_app has $failed_jobs failed job(s), triggering cleanup and restart...$(reset)" + # Clean up failed jobs and restart sync + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + argocd app sync "$full_app" --grpc-web 2>&1 || true + start_ts=$(date +%s) # Reset timer + sleep "$GLOBAL_POLL_INTERVAL" + continue + fi + + # Check if sync operation failed + if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then + echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase at [$(get_timestamp)]$(reset)" + timed_out=true + break + fi + + print_table_row "$wave" "$name" "$health" "$sync" + echo " [$(get_timestamp)] Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s at [$(get_timestamp)] (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + # On retry, clean up unhealthy jobs and clear stuck operations + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + else + echo "$(red)[FAIL] Max retries reached for $full_app. 
Proceeding to next app.$(reset)" + fi + done + echo "$(blue)[INFO] Proceeding to next app...$(reset)" + done + + # Now handle root-app sync after all other apps + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] root-app not found in namespace '$NS'.$(reset)" + return 1 + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + wave=$(echo "$status" | jq -r '.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"') + full_app="${NS}/root-app" + + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] root-app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + synced=true + break + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for root-app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "root-app" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + # Refresh root-app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] root-app is $health, refreshing before sync...$(reset)" + if (( attempt > 1 )); then + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Stop any ongoing operations and refresh before sync + echo "[INFO] Stopping ongoing operations and refreshing before sync..." + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. 
Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + print_table_row "$wave" "root-app" "$health" "$sync" + echo " Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + else + echo "$(red)[FAIL] Max retries reached for $full_app.$(reset)" + fi + done + echo "$(blue)[INFO] Finished root-app sync attempt(s).$(reset)" +} + +# ============================================================ +# Sync all apps except root-app (wave order, nice reporting) +# ============================================================ +sync_all_apps_exclude_root() { + mapfile -t all_apps < <(get_all_apps_by_wave) + [[ ${#all_apps[@]} -eq 0 ]] && { echo "[WARN] No applications found in namespace '$NS'."; return 0; } + + print_header "Applications (Wave-Ordered Status, excluding root-app)" + print_table_header + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + if [[ "$name" != "root-app" ]]; then + print_table_row "$wave" "$name" "$health" "$sync" + fi + done + echo + + # Print summary of NOT-GREEN apps before syncing + echo "$(bold)[INFO] Apps NOT Healthy or NOT Synced (excluding root-app):$(reset)" + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + if [[ "$name" != "root-app" && ( "$health" != "Healthy" || "$sync" != "Synced" ) ]]; then + echo "$(red) - $name (wave=$wave) Health=$health Sync=$sync$(reset)" + fi + done + echo + + # Sync NOT-GREEN apps in wave order, skipping root-app + for line in "${all_apps[@]}"; do + read -r wave name health sync <<< "$line" + full_app="${NS}/${name}" + + if [[ "$name" == "root-app" ]]; then + continue + fi + + # First check and handle any failed syncs + echo "[$(get_timestamp)] Checking for failed syncs in $name..." + check_and_handle_failed_sync "$name" + + # Check for CRD version mismatches + echo "[$(get_timestamp)] Checking for CRD version mismatches in $name..." 
+ check_and_fix_crd_version_mismatch "$name" + + # Special pre-sync handling for nginx-ingress-pxe-boots + if [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Pre-sync: nginx-ingress-pxe-boots detected - deleting tls-boots secret first...$(reset)" + kubectl delete secret tls-boots -n orch-boots 2>/dev/null || true + sleep 3 + fi + + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + if [[ -n "$status" ]]; then + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] $full_app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + synced=true + break + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for $full_app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + # Refresh app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] App is $health, checking for unhealthy jobs...$(reset)" + + # Clean up any unhealthy jobs first + clean_unhealthy_jobs_for_app "$name" + + if (( attempt > 1 )); then + # Hard refresh on retry attempts + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) + rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? + else + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? + fi + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. 
Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + operation_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + + # Check for failed jobs/CRDs during sync + failed_jobs=$(echo "$status" | jq -r ' + .status.resources[]? | + select(.kind == "Job" and .health.status == "Degraded") | + .name + ' | wc -l) + + if [[ $failed_jobs -gt 0 ]]; then + echo "$(red)[ERROR] $full_app has $failed_jobs failed job(s), triggering cleanup and restart...$(reset)" + # Clean up failed jobs and restart sync + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + argocd app sync "$full_app" --grpc-web 2>&1 || true + start_ts=$(date +%s) # Reset timer + sleep "$GLOBAL_POLL_INTERVAL" + continue + fi + + # Check if sync operation failed + if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then + echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" + timed_out=true + break + fi + + print_table_row "$wave" "$name" "$health" "$sync" + echo " Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + # On retry, clean up unhealthy jobs and clear stuck operations + clean_unhealthy_jobs_for_app "$name" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + else + echo "$(red)[FAIL] Max retries reached for $full_app. Proceeding to next app.$(reset)" + fi + done + echo "$(blue)[INFO] Proceeding to next app...$(reset)" + done +} + +# ============================================================ +# Sync root-app only (with nice reporting) +# ============================================================ +sync_root_app_only() { + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] root-app not found in namespace '$NS'.$(reset)" + return 1 + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + wave=$(echo "$status" | jq -r '.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"') + full_app="${NS}/root-app" + + print_header "root-app Status" + print_table_header + print_table_row "$wave" "root-app" "$health" "$sync" + echo + + # First check and handle any failed syncs + echo "[$(get_timestamp)] Checking for failed syncs in root-app..." 
+ check_and_handle_failed_sync "root-app" + + # Check for CRD version mismatches + echo "[$(get_timestamp)] Checking for CRD version mismatches in root-app..." + check_and_fix_crd_version_mismatch "root-app" + + last_sync_status=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + last_sync_time=$(echo "$status" | jq -r '.status.operationState.finishedAt // "N/A"') + + echo "[$(get_timestamp)] root-app Status: Health=$health Sync=$sync LastSync=$last_sync_status Time=$last_sync_time" + + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[OK] $full_app (wave=$wave) already Healthy+Synced$(reset)" + return 0 + fi + + # Check if last sync failed and clean up + if [[ "$last_sync_status" == "Failed" || "$last_sync_status" == "Error" ]]; then + echo "$(red)[CLEANUP] Last sync failed for root-app, cleaning up stuck resources...$(reset)" + clean_unhealthy_jobs_for_app "root-app" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + fi + + echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" + attempt=1 + synced=false + while (( attempt <= APP_MAX_RETRIES )); do + # Refresh root-app if it's degraded or not healthy + if [[ "$health" == "Degraded" || "$health" == "Progressing" || "$health" != "Healthy" ]]; then + echo "$(yellow)[REFRESH] root-app is $health, refreshing before sync...$(reset)" + if (( attempt > 1 )); then + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + else + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + fi + sleep 5 + fi + + echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" + + # Stop any ongoing operations and refresh before sync + echo "[INFO] Stopping ongoing operations and refreshing before sync..." + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + sleep 2 + argocd app get "$full_app" --refresh --grpc-web >/dev/null 2>&1 || true + sleep 3 + + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) + rc=$? + + if [[ $rc -ne 0 ]]; then + if [[ "$LOG" =~ "deleting" ]]; then + echo "$(red)[SKIP] $full_app is deleting. 
Skipping further attempts.$(reset)" + break + fi + echo "$(red)[ERROR] Sync command failed, will retry if attempts remain.$(reset)" + ((attempt++)) + continue + fi + + timed_out=false + while true; do + now_ts=$(date +%s) + elapsed=$(( now_ts - start_ts )) + if (( elapsed >= APP_MAX_WAIT )); then + echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" + timed_out=true + break + fi + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + [[ -z "$status" ]] && { sleep "$GLOBAL_POLL_INTERVAL"; continue; } + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + operation_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + + # Check if sync operation failed + if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then + echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" + timed_out=true + break + fi + + print_table_row "$wave" "root-app" "$health" "$sync" + echo " Elapsed: ${elapsed}s" + if [[ "$health" == "Healthy" && "$sync" == "Synced" ]]; then + echo "$(green)[DONE] $full_app Healthy+Synced in ${elapsed}s (attempt ${attempt})$(reset)" + synced=true + break + fi + sleep "$GLOBAL_POLL_INTERVAL" + done + if [[ "$synced" == "true" ]]; then + break + fi + ((attempt++)) + if (( attempt <= APP_MAX_RETRIES )); then + echo "$(yellow)[RETRY] Retrying $full_app (${attempt}/${APP_MAX_RETRIES})...$(reset)" + # On retry, clean up unhealthy jobs and clear stuck operations + clean_unhealthy_jobs_for_app "root-app" + argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true + argocd app get "$full_app" --hard-refresh --grpc-web >/dev/null 2>&1 || true + sleep 5 + else + echo "$(red)[FAIL] Max retries reached for $full_app.$(reset)" + fi + + # Re-fetch status for next iteration + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -n "$status" ]]; then + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + fi + done + echo "$(blue)[INFO] Finished root-app sync attempt(s).$(reset)" +} + +# ============================================================ +# Wait until NS is all green (excluding root-app) +# ============================================================ +namespace_all_green_exclude_root() { + kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r ' + .items[] | + select(.metadata.name != "root-app") | + { + health: .status.health.status, + sync: .status.sync.status + } + | select(.health != "Healthy" or .sync != "Synced") + ' | grep -q . + return $? +} + +sync_until_green_ns_exclude_root() { + while true; do + if ! namespace_all_green_exclude_root; then + print_header "All non-root-app applications are Healthy+Synced in namespace '$NS'." 
+ break + fi + + print_header "NOT-GREEN apps (Wave-Ordered, excluding root-app)" + print_table_header + mapfile -t not_green < <(kubectl get applications.argoproj.io -n "$NS" -o json \ + | jq -r '.items[] | select(.metadata.name != "root-app") | { + name: .metadata.name, + wave: (.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0"), + health: .status.health.status, + sync: .status.sync.status + } | "\(.wave) \(.name) \(.health) \(.sync)"' | sort -n -k1) + for line in "${not_green[@]}"; do + read -r wave name health sync <<< "$line" + print_table_row "$wave" "$name" "$health" "$sync" + done + echo + + sync_all_apps_exclude_root + + sleep "10" + done +} + + +# ============================================================ +# Check and delete stuck/out-of-sync dependent CRD jobs +# ============================================================ +check_and_delete_stuck_crd_jobs() { + print_header "Checking for stuck/out-of-sync dependent CRD jobs" + + # Check for stuck jobs in all namespaces + echo "[INFO] Looking for stuck or failed jobs..." + + # Get jobs that are not completed or have failed + stuck_jobs=$(kubectl get jobs --all-namespaces -o json | jq -r ' + .items[] | + select(.status.succeeded != 1 and (.status.failed > 0 or .status.active > 0)) | + "\(.metadata.namespace) \(.metadata.name)" + ') + + if [[ -n "$stuck_jobs" ]]; then + echo "$(yellow)[WARN] Found stuck/failed jobs:$(reset)" + echo "$stuck_jobs" + + # Delete stuck jobs and their pods + while IFS= read -r line; do + [[ -z "$line" ]] && continue + read -r job_ns job_name <<< "$line" + echo "$(yellow)[CLEANUP] Deleting stuck job $job_name in namespace $job_ns (background)$(reset)" + + # Delete associated pods first + kubectl delete pods -n "$job_ns" -l job-name="$job_name" --ignore-not-found=true 2>/dev/null & + + # Delete the job + kubectl delete job "$job_name" -n "$job_ns" --ignore-not-found=true & + done <<< "$stuck_jobs" + + echo "[INFO] Job cleanup initiated in background, proceeding..." + else + echo "$(green)[OK] No stuck jobs found$(reset)" + fi + + # Check for applications that are OutOfSync + echo "[INFO] Looking for OutOfSync applications..." + out_of_sync_apps=$(kubectl get applications.argoproj.io -n "$NS" -o json | jq -r ' + .items[] | + select(.status.sync.status == "OutOfSync") | + .metadata.name + ') + + if [[ -n "$out_of_sync_apps" ]]; then + echo "$(yellow)[WARN] Found OutOfSync applications:$(reset)" + echo "$out_of_sync_apps" + + # Stop and restart sync for OutOfSync apps + while IFS= read -r app_name; do + [[ -z "$app_name" ]] && continue + echo "$(yellow)[CLEANUP] Stopping sync for $app_name$(reset)" + argocd app terminate-op "${NS}/${app_name}" --grpc-web 2>/dev/null || true + sleep 2 + done <<< "$out_of_sync_apps" + else + echo "$(green)[OK] No OutOfSync applications found$(reset)" + fi + + # Check for applications with sync failures + echo "[INFO] Looking for applications with sync failures..." 
+ sync_failed_apps=$(kubectl get applications.argoproj.io -n "$NS" -o json | jq -r ' + .items[] | + select(.status.operationState.phase == "Failed" or .status.operationState.phase == "Error") | + "\(.metadata.name) \(.status.operationState.phase)" + ') + + if [[ -n "$sync_failed_apps" ]]; then + echo "$(red)[WARN] Found applications with sync failures:$(reset)" + echo "$sync_failed_apps" + + # Clean up failed apps + while IFS= read -r line; do + [[ -z "$line" ]] && continue + read -r app_name phase <<< "$line" + echo "$(red)[CLEANUP] App $app_name has phase=$phase, cleaning up...$(reset)" + + # Clean up unhealthy jobs for this app + clean_unhealthy_jobs_for_app "$app_name" + + # Terminate any stuck operations + argocd app terminate-op "${NS}/${app_name}" --grpc-web 2>/dev/null || true + + # Hard refresh to clear the error state + argocd app get "${NS}/${app_name}" --hard-refresh --grpc-web >/dev/null 2>&1 || true + + sleep 2 + done <<< "$sync_failed_apps" + else + echo "$(green)[OK] No sync failed applications found$(reset)" + fi + + echo "[INFO] Stuck CRD jobs check and cleanup completed." +} + +# ============================================================ +# Post-upgrade cleanup function +# ============================================================ +post_upgrade_cleanup() { + print_header "Post-upgrade Cleanup (Manual Fixes)" + + echo "[INFO] Deleting applications tenancy-api-mapping and tenancy-datamodel in namespace onprem..." + kubectl delete application tenancy-api-mapping -n onprem || true + kubectl delete application tenancy-datamodel -n onprem || true + + echo "[INFO] Deleting deployment os-resource-manager in namespace orch-infra..." + kubectl delete deployment -n orch-infra os-resource-manager || true + + echo "[INFO] Deleting onboarding secrets..." + kubectl delete secret tls-boots -n orch-boots || true + kubectl delete secret boots-ca-cert -n orch-gateway || true + kubectl delete secret boots-ca-cert -n orch-infra || true + + echo "[INFO] Deleting dkam pods in namespace orch-infra..." + kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null || true + + echo "[INFO] Post-upgrade cleanup completed." 
+} + +# ============================================================ +# Main sync function with retry logic +# ============================================================ +execute_full_sync() { + sync_until_green_ns_exclude_root + print_header "Syncing root-app after all other apps are green" + sync_root_app_only + + post_upgrade_cleanup + + sleep 60 + print_header "Post-upgrade: Syncing all apps (excluding root-app) again" + sync_all_apps_exclude_root + print_header "Post-upgrade: Syncing root-app again" + sync_root_app_only +} + +# ============================================================ +# Check if sync was successful +# ============================================================ +check_sync_success() { + # Check root-app status + status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) + if [[ -z "$status" ]]; then + echo "$(red)[FAIL] root-app not found in namespace '$NS'.$(reset)" + return 1 + fi + health=$(echo "$status" | jq -r '.status.health.status') + sync=$(echo "$status" | jq -r '.status.sync.status') + + if [[ "$health" != "Healthy" || "$sync" != "Synced" ]]; then + echo "$(red)[FAIL] root-app is NOT Healthy+Synced (Health: $health, Sync: $sync)$(reset)" + return 1 + fi + + # Check for any non-healthy apps + not_healthy=$(kubectl get applications.argoproj.io -n "$NS" -o json | jq -r ' + .items[] | + select(.status.health.status != "Healthy" or .status.sync.status != "Synced") | + .metadata.name + ' | wc -l) + + if [[ $not_healthy -gt 0 ]]; then + echo "$(red)[FAIL] $not_healthy applications are not Healthy+Synced$(reset)" + return 1 + fi + kubectl get applications -A + echo "$(green)[OK] All applications are Healthy+Synced$(reset)" + + # Display all applications status + echo + echo "$(bold)$(green)Final Application Status:$(reset)" + + + return 0 +} + +# ============================================================ +# GLOBAL TIMEOUT WATCHDOG +# ============================================================ +SCRIPT_START_TS=$(date +%s) + +# Global retry loop +global_retry=1 +sync_success=false + +while (( global_retry <= GLOBAL_SYNC_RETRIES )); do + print_header "GLOBAL SYNC ATTEMPT ${global_retry}/${GLOBAL_SYNC_RETRIES}" + + execute_full_sync + + if check_sync_success; then + sync_success=true + print_header "Sync Script Completed Successfully" + exit 0 + fi + + if (( global_retry < GLOBAL_SYNC_RETRIES )); then + echo "$(yellow)[RETRY] Sync attempt ${global_retry} failed. Checking for stuck resources...$(reset)" + + # Check and cleanup stuck resources before next retry + check_and_delete_stuck_crd_jobs + + # Stop all ongoing sync operations + echo "[INFO] Stopping all ongoing sync operations..." + mapfile -t all_apps < <(kubectl get applications.argoproj.io -n "$NS" -o jsonpath='{.items[*].metadata.name}') + for app in "${all_apps[@]}"; do + [[ -z "$app" ]] && continue + argocd app terminate-op "${NS}/${app}" --grpc-web 2>/dev/null || true + done + + echo "$(yellow)[INFO] Waiting 30 seconds before retry ${global_retry}...$(reset)" + sleep 30 + + ((global_retry++)) + else + echo "$(red)[FAIL] Maximum global retries (${GLOBAL_SYNC_RETRIES}) reached. 
Sync failed.$(reset)" + exit 1 + fi +done + +# This should not be reached, but just in case +echo "$(red)[FAIL] Sync did not complete successfully after ${GLOBAL_SYNC_RETRIES} attempts.$(reset)" +exit 1 diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index ff9b7c0d8..9f2bd55a7 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -77,6 +77,7 @@ ORCH_INSTALLER_PROFILE="${ORCH_INSTALLER_PROFILE:-onprem}" DEPLOY_VERSION="${DEPLOY_VERSION:-v3.1.0}" # Updated to v3.1.0 GITEA_IMAGE_REGISTRY="${GITEA_IMAGE_REGISTRY:-docker.io}" USE_LOCAL_PACKAGES="${USE_LOCAL_PACKAGES:-false}" # New flag for local packages +UPGRADE_3_1_X="${UPGRADE_3_1_X:-true}" ### Variables cwd=$(pwd) @@ -92,7 +93,7 @@ gitea_ns=gitea # shellcheck disable=SC2034 root_app=root-app -export UPGRADE_3_1_X=true + # Variables that depend on the above and might require updating later, are placed in here set_artifacts_version() { installer_list=( @@ -764,6 +765,16 @@ fi # Modify orch-configs settings for upgrade procedure retrieve_and_apply_config +# Check if kyverno-clean-reports job exists before attempting cleanup +if kubectl get job kyverno-clean-reports -n kyverno >/dev/null 2>&1; then + echo "Cleaning up kyverno-clean-reports job..." + kubectl delete job kyverno-clean-reports -n kyverno & + kubectl delete pods -l job-name="kyverno-clean-reports" -n kyverno & + kubectl patch job kyverno-clean-reports -n kyverno --type=merge -p='{"metadata":{"finalizers":[]}}' +else + echo "kyverno-clean-reports job not found in kyverno namespace, skipping cleanup" +fi + ### Upgrade # Run OS Configuration upgrade @@ -1056,18 +1067,18 @@ sleep 10 # Restore secret after app delete but before postgress restored if [[ "$UPGRADE_3_1_X" == "true" ]]; then - yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - + yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - else - yq e ' - del(.metadata.labels) | - del(.metadata.annotations) | - del(.metadata.ownerReferences) | - del(.metadata.finalizers) | - del(.metadata.managedFields) | - del(.metadata.resourceVersion) | - del(.metadata.uid) | - del(.metadata.creationTimestamp) - ' postgres_secret.yaml | kubectl apply -f - + yq e ' + del(.metadata.labels) | + del(.metadata.annotations) | + del(.metadata.ownerReferences) | + del(.metadata.finalizers) | + del(.metadata.managedFields) | + del(.metadata.resourceVersion) | + del(.metadata.uid) | + del(.metadata.creationTimestamp) + ' postgres_secret.yaml | kubectl apply -f - fi sleep 30 # Wait until PostgreSQL pod is running (Re-sync) @@ -1205,151 +1216,18 @@ fi echo "Applying external-secrets CRDs with server-side apply..." 
kubectl apply --server-side=true --force-conflicts -f https://raw.githubusercontent.com/external-secrets/external-secrets/refs/tags/v0.20.4/deploy/crds/bundle.yaml || true -check_and_force_sync_app external-secrets "$apps_ns" "true" -wait_for_app_synced_healthy external-secrets "$apps_ns" - -# Force sync apps that copy secrets to their destinations -check_and_force_sync_app copy-app-gitea-cred-to-fleet "$apps_ns" -check_and_force_sync_app copy-ca-cert-boots-to-gateway "$apps_ns" -check_and_force_sync_app copy-ca-cert-boots-to-infra "$apps_ns" -check_and_force_sync_app copy-ca-cert-gateway-to-cattle "$apps_ns" -check_and_force_sync_app copy-ca-cert-gateway-to-infra "$apps_ns" -check_and_force_sync_app copy-ca-cert-gitea-to-app "$apps_ns" -check_and_force_sync_app copy-ca-cert-gitea-to-cluster "$apps_ns" -check_and_force_sync_app copy-cluster-gitea-cred-to-fleet "$apps_ns" -check_and_force_sync_app copy-keycloak-admin-to-infra "$apps_ns" - # Unseal vault after external-secrets is ready echo "Unsealing vault..." vault_unseal echo "✅ Vault unsealed successfully" - -kubectl patch -n "$apps_ns" application platform-keycloak --patch-file /tmp/argo-cd/sync-patch.yaml --type merge - -wait_for_app_synced_healthy platform-keycloak "$apps_ns" - -kubectl patch -n "$apps_ns" application cluster-manager --patch-file /tmp/argo-cd/sync-patch.yaml --type merge - - -kubectl delete secret tls-boots -n orch-boots - -# Observability Minio PVC ignoreDifferences patching and job cleanup -kubectl patch job orchestrator-observability-mimir-make-minio-buckets-5.4.0 -n orch-platform --type=merge -p='{"metadata":{"finalizers":[]}}' -kubectl delete job orchestrator-observability-mimir-make-minio-buckets-5.4.0 -n orch-platform --force --grace-period=0 2>/dev/null || true -kubectl delete pods -l job-name="orchestrator-observability-mimir-make-minio-buckets-5.4.0" -n orch-platform --force --grace-period=0 2>/dev/null || true - -kubectl patch application orchestrator-observability -n "$apps_ns" --type='json' -p='[{ - "op": "add", - "path": "/spec/ignoreDifferences", - "value": [{ - "group": "", - "kind": "PersistentVolumeClaim", - "name": "orchestrator-observability-minio", - "jsonPointers": ["/spec/storageClassName", "/spec/volumeName"] - }] -}]' - -kubectl patch job edgenode-observability-mimir-make-minio-buckets-5.4.0 -n orch-infra --type=merge -p='{"metadata":{"finalizers":[]}}' -kubectl delete job edgenode-observability-mimir-make-minio-buckets-5.4.0 -n orch-infra --force --grace-period=0 2>/dev/null || true -kubectl delete pods -l job-name="edgenode-observability-mimir-make-minio-buckets-5.4.0" -n orch-infra --force --grace-period=0 2>/dev/null || true - -kubectl patch application edgenode-observability -n "$apps_ns" --type='json' -p='[{ - "op": "add", - "path": "/spec/ignoreDifferences", - "value": [{ - "group": "", - "kind": "PersistentVolumeClaim", - "name": "edgenode-observability-minio", - "jsonPointers": ["/spec/storageClassName", "/spec/volumeName"] - }] -}]' - -check_and_patch_sync_app edgenode-observability "$apps_ns" -check_and_patch_sync_app orchestrator-observability "$apps_ns" - -# Cleanup infra-external jobs -kubectl delete jobs setup-databases-mps setup-databases-rps amt-dbpassword-secret-job init-amt-vault-job -n orch-infra --force --grace-period=0 --ignore-not-found - - -check_and_cleanup_job "namespace-label" "ns-label" -check_and_cleanup_job "wait-istio-job" "ns-label" - -# Unsynced leftovers using patch sync -# Collect and display syncwave information for OutOfSync applications -echo "OutOfSync 
applications by syncwave:" -outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | - "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ - sort -n) - -echo "$outofsync_apps" | awk '{print " Wave " $1 ": " $2}' - -# Sync applications in wave order -echo "Syncing OutOfSync applications in wave order..." -echo "$outofsync_apps" | while read -r wave app_name; do - if [[ -n "$app_name" ]]; then - echo "Processing wave $wave: $app_name" - check_and_patch_sync_app "$app_name" "$apps_ns" - fi -done - - -# Unsynced leftovers using force sync -# Collect and display syncwave information for OutOfSync applications -echo "OutOfSync applications by syncwave:" -outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | - "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ - sort -n) - -echo "$outofsync_apps" | awk '{print " Wave " $1 ": " $2}' - -# Sync applications in wave order -echo "Syncing OutOfSync applications in wave order..." -echo "$outofsync_apps" | while read -r wave app_name; do - if [[ -n "$app_name" ]]; then - echo "Processing wave $wave: $app_name" - check_and_force_sync_app "$app_name" "$apps_ns" "true" - fi -done - -# Unsynced leftovers using force sync -# Collect and display syncwave information for OutOfSync applications -echo "OutOfSync applications by syncwave:" -outofsync_apps=$(kubectl get applications -n "$apps_ns" -o json | \ - jq -r '.items[] | select((.status.sync.status!="Synced" or .status.health.status!="Healthy") and .metadata.name!="root-app") | - "\(.metadata.annotations["argocd.argoproj.io/sync-wave"] // "0") \(.metadata.name)"' | \ - sort -n) - -echo "$outofsync_apps" | awk '{print " Wave " $1 ": " $2}' - -# Sync applications in wave order -echo "Syncing OutOfSync applications in wave order..." -echo "$outofsync_apps" | while read -r wave app_name; do - if [[ -n "$app_name" ]]; then - echo "Processing wave $wave: $app_name" - check_and_force_sync_app "$app_name" "$apps_ns" - fi -done - # Stop root-app old sync as it will be stuck. kubectl patch application root-app -n "$apps_ns" --type merge -p '{"operation":null}' kubectl patch application root-app -n "$apps_ns" --type json -p '[{"op": "remove", "path": "/status/operationState"}]' - -# OS profiles Fix -kubectl patch application tenancy-api-mapping -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -kubectl patch application tenancy-datamodel -n onprem --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -kubectl delete application tenancy-api-mapping -n onprem -kubectl delete application tenancy-datamodel -n onprem -kubectl delete deployment -n orch-infra os-resource-manager - # Apply root-app Patch kubectl patch application root-app -n "$apps_ns" --patch-file /tmp/argo-cd/sync-patch.yaml --type merge - -# Onboarding Fix +sleep 10 +#restart tls-boot secrets kubectl delete secret tls-boots -n orch-boots -kubectl delete secret boots-ca-cert -n orch-gateway -kubectl delete secret boots-ca-cert -n orch-infra -kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null +./after_upgrade_restart.sh echo "Upgrade completed! 
Wait for ArgoCD applications to be in 'Synced' and 'Healthy' state" From 53fc24a3c1a7fa034750412daa01790e1911e8a2 Mon Sep 17 00:00:00 2001 From: Andrei Palade Date: Thu, 4 Dec 2025 12:51:07 +0000 Subject: [PATCH 08/12] Fix linter issues in the after_upgrade_restart.sh script (#1212) --- .../onprem/after_upgrade_restart.sh | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index 5551ffc6d..c8875efeb 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -96,7 +96,7 @@ install_argocd_cli() { VERSION=$(curl -L -s https://raw.githubusercontent.com/argoproj/argo-cd/stable/VERSION) echo "[INFO] Latest version: $VERSION" curl -sSL -o argocd-linux-amd64 \ - https://github.com/argoproj/argo-cd/releases/download/v${VERSION}/argocd-linux-amd64 + https://github.com/argoproj/argo-cd/releases/download/v"${VERSION}"/argocd-linux-amd64 sudo install -m 555 argocd-linux-amd64 /usr/local/bin/argocd rm -f argocd-linux-amd64 echo "[INFO] argocd CLI installed successfully." @@ -196,13 +196,15 @@ check_and_fix_crd_version_mismatch() { local app_name="$1" # Get application status - local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + local status + status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) if [[ -z "$status" ]]; then return 1 fi # Check for CRD version mismatch errors in sync messages - local version_mismatch=$(echo "$status" | jq -r ' + local version_mismatch + version_mismatch=$(echo "$status" | jq -r ' .status.conditions[]? | select(.type == "ComparisonError" or .type == "SyncError") | select(.message | contains("could not find version") or contains("Version") and contains("is installed")) | @@ -214,8 +216,10 @@ check_and_fix_crd_version_mismatch() { echo "$version_mismatch" # Extract CRD details from error message - local crd_group=$(echo "$version_mismatch" | grep -oP '[a-z0-9.-]+\.[a-z]+(?=/[A-Z])' | head -1) - local crd_kind=$(echo "$version_mismatch" | grep -oP '/[A-Z][a-zA-Z]+' | sed 's|/||' | head -1) + local crd_group + crd_group=$(echo "$version_mismatch" | grep -oP '[a-z0-9.-]+\.[a-z]+(?=/[A-Z])' | head -1) + local crd_kind + crd_kind=$(echo "$version_mismatch" | grep -oP '/[A-Z][a-zA-Z]+' | sed 's|/||' | head -1) if [[ -n "$crd_group" && -n "$crd_kind" ]]; then # Try to find and list the CRD @@ -254,20 +258,24 @@ check_and_handle_failed_sync() { local full_app="${NS}/${app_name}" # Get application status - local status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) + local status + status=$(kubectl get applications.argoproj.io "$app_name" -n "$NS" -o json 2>/dev/null) if [[ -z "$status" ]]; then return 1 fi - local sync_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') - local sync_status=$(echo "$status" | jq -r '.status.sync.status // "Unknown"') + local sync_phase + sync_phase=$(echo "$status" | jq -r '.status.operationState.phase // "Unknown"') + #local sync_status + #sync_status=$(echo "$status" | jq -r '.status.sync.status // "Unknown"') # Check if sync failed if [[ "$sync_phase" == "Failed" || "$sync_phase" == "Error" ]]; then echo "$(red)[FAILED-SYNC] Application $app_name has failed sync (phase=$sync_phase)$(reset)" # Check for failed jobs/CRDs - local failed_resources=$(echo "$status" | jq -r ' + local failed_resources + 
failed_resources=$(echo "$status" | jq -r ' .status.resources[]? | select(.kind == "Job" or .kind == "CustomResourceDefinition") | select(.health.status == "Degraded" or .health.status == "Missing" or .health.status == null) | @@ -459,7 +467,7 @@ sync_not_green_apps_once() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -489,13 +497,13 @@ sync_not_green_apps_once() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) @@ -527,7 +535,7 @@ sync_not_green_apps_once() { # Check if sync operation failed if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase at [$(get_timestamp)]$(reset)" - timed_out=true + #timed_out=true break fi @@ -627,13 +635,13 @@ sync_not_green_apps_once() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) @@ -761,7 +769,7 @@ sync_all_apps_exclude_root() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -791,13 +799,13 @@ sync_all_apps_exclude_root() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null) @@ -829,7 +837,7 @@ sync_all_apps_exclude_root() { # Check if sync operation failed if [[ "$operation_phase" == "Failed" || "$operation_phase" == "Error" ]]; then echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" - timed_out=true + #timed_out=true break fi @@ -945,13 +953,13 @@ sync_root_app_only() { continue fi - timed_out=false + #timed_out=false while true; do now_ts=$(date +%s) elapsed=$(( now_ts - start_ts )) if (( elapsed >= APP_MAX_WAIT )); then echo "$(red)[TIMEOUT] $full_app did not become Healthy+Synced within ${APP_MAX_WAIT}s.$(reset)" - timed_out=true + #timed_out=true break fi status=$(kubectl get applications.argoproj.io "root-app" -n "$NS" -o json 2>/dev/null) @@ -963,7 +971,7 @@ sync_root_app_only() { # Check if sync operation failed if [[ "$operation_phase" == 
"Failed" || "$operation_phase" == "Error" ]]; then echo "$(red)[ERROR] $full_app sync operation failed with phase=$operation_phase$(reset)" - timed_out=true + #timed_out=true break fi @@ -1228,11 +1236,11 @@ check_sync_success() { # ============================================================ # GLOBAL TIMEOUT WATCHDOG # ============================================================ -SCRIPT_START_TS=$(date +%s) +#SCRIPT_START_TS=$(date +%s) # Global retry loop global_retry=1 -sync_success=false +#sync_success=false while (( global_retry <= GLOBAL_SYNC_RETRIES )); do print_header "GLOBAL SYNC ATTEMPT ${global_retry}/${GLOBAL_SYNC_RETRIES}" @@ -1240,7 +1248,7 @@ while (( global_retry <= GLOBAL_SYNC_RETRIES )); do execute_full_sync if check_sync_success; then - sync_success=true + #sync_success=true print_header "Sync Script Completed Successfully" exit 0 fi From 9c463c6700e073623599f00bd2a50fba47eaad34 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 21:42:47 +0530 Subject: [PATCH 09/12] add finalizer check (#1220) --- .../onprem/after_upgrade_restart.sh | 146 +++++++++++++++--- 1 file changed, 128 insertions(+), 18 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index c8875efeb..83ee967aa 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -287,13 +287,13 @@ check_and_handle_failed_sync() { while IFS= read -r res_line; do [[ -z "$res_line" ]] && continue read -r kind res_ns res_name <<< "$res_line" - echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" + echo "$(red) - Deleting $kind $res_name in $res_ns (background)$(reset)" if [[ "$kind" == "Job" ]]; then kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null & fi done <<< "$failed_resources" fi @@ -466,22 +466,49 @@ sync_not_green_apps_once() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? - # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? @@ -768,22 +795,105 @@ sync_all_apps_exclude_root() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + + # Verify resources are deleted, if still present, force finalizer removal + echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + + if [[ "$kind" == "Job" ]]; then + if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch job "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + if kubectl get crd "$res_name" &>/dev/null; then + echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then + if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + fi + done <<< "$problem_resources" + sleep 2 + fi + # Verify resources are deleted, if still present, force finalizer removal + echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + + if [[ "$kind" == "Job" ]]; then + if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch job 
"$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + if kubectl get crd "$res_name" &>/dev/null; then + echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then + if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + fi + done <<< "$problem_resources" + sleep 2 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? - # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? 
From 77511f49f62c9850bca71abc9021f1c2bb250159 Mon Sep 17 00:00:00 2001 From: Andrei Palade Date: Thu, 4 Dec 2025 16:21:01 +0000 Subject: [PATCH 10/12] Revert "add finalizer check (#1220)" (#1223) --- .../onprem/after_upgrade_restart.sh | 146 +++--------------- 1 file changed, 18 insertions(+), 128 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index 83ee967aa..c8875efeb 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -287,13 +287,13 @@ check_and_handle_failed_sync() { while IFS= read -r res_line; do [[ -z "$res_line" ]] && continue read -r kind res_ns res_name <<< "$res_line" - echo "$(red) - Deleting $kind $res_name in $res_ns (background)$(reset)" + echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" if [[ "$kind" == "Job" ]]; then kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null & + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true fi done <<< "$failed_resources" fi @@ -466,49 +466,22 @@ sync_not_green_apps_once() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 - - # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) - echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" - problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' - .status.resources[]? 
| - select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | - select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | - "\(.kind) \(.namespace) \(.name)" - ') - - if [[ -n "$problem_resources" ]]; then - echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - echo "$(yellow) - Deleting $kind $res_name in $res_ns$(reset)" - - if [[ "$kind" == "Job" ]]; then - kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true - kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null || true - else - kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null || true - fi - done <<< "$problem_resources" - echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" - sleep 3 - fi - echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? else - # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? @@ -795,105 +768,22 @@ sync_all_apps_exclude_root() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + # Check if app requires server-side apply + if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 - - # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) - echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" - problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' - .status.resources[]? 
| - select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | - select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | - "\(.kind) \(.namespace) \(.name)" - ') - - if [[ -n "$problem_resources" ]]; then - echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" - - if [[ "$kind" == "Job" ]]; then - kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & - kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true - kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & - else - kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & - fi - done <<< "$problem_resources" - echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" - sleep 3 - - # Verify resources are deleted, if still present, force finalizer removal - echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - - if [[ "$kind" == "Job" ]]; then - if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch job "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - if kubectl get crd "$res_name" &>/dev/null; then - echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then - if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - fi - done <<< "$problem_resources" - sleep 2 - fi - # Verify resources are deleted, if still present, force finalizer removal - echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" - while IFS= read -r res_line; do - [[ -z "$res_line" ]] && continue - read -r kind res_ns res_name <<< "$res_line" - - if [[ "$kind" == "Job" ]]; then - if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch job 
"$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "CustomResourceDefinition" ]]; then - if kubectl get crd "$res_name" &>/dev/null; then - echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & - fi - elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then - if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then - echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" - kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true - kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & - fi - fi - done <<< "$problem_resources" - sleep 2 - fi - echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? + # Special handling for nginx-ingress-pxe-boots + elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then + echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" + start_ts=$(date +%s) + LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) + rc=$? else - # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? 
From 42aca9670a68bcad39cd79de9c6f58631042e73e Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Thu, 4 Dec 2025 21:58:13 +0530 Subject: [PATCH 11/12] added patch for finalizer (#1224) --- .../onprem/after_upgrade_restart.sh | 118 +++++++++++++++--- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index c8875efeb..7d4cc206e 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -287,13 +287,13 @@ check_and_handle_failed_sync() { while IFS= read -r res_line; do [[ -z "$res_line" ]] && continue read -r kind res_ns res_name <<< "$res_line" - echo "$(red) - Deleting $kind $res_name in $res_ns$(reset)" + echo "$(red) - Deleting $kind $res_name in $res_ns (background)$(reset)" if [[ "$kind" == "Job" ]]; then kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true 2>/dev/null & kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true 2>/dev/null & elif [[ "$kind" == "CustomResourceDefinition" ]]; then - kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true 2>/dev/null & fi done <<< "$failed_resources" fi @@ -466,22 +466,49 @@ sync_not_green_apps_once() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? - # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? @@ -768,22 +795,77 @@ sync_all_apps_exclude_root() { echo "$(bold)[SYNC] $full_app (wave=$wave) at [$(get_timestamp)]$(reset)" echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" - # Check if app requires server-side apply - if [[ " $SERVER_SIDE_APPS " =~ $name ]]; then + # Check if app requires server-side apply and special cleanup + if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 + + # Check for OutOfSync or error state resources (Jobs, CRDs, ExternalSecrets, etc.) + echo "$(yellow)[CLEANUP] Checking for OutOfSync/error resources in $name...$(reset)" + problem_resources=$(kubectl get applications.argoproj.io "$name" -n "$NS" -o json 2>/dev/null | jq -r ' + .status.resources[]? 
| + select(.status == "OutOfSync" or .health.status == "Degraded" or .health.status == "Missing") | + select(.kind == "Job" or .kind == "CustomResourceDefinition" or .kind == "ExternalSecret" or .kind == "SecretStore" or .kind == "ClusterSecretStore") | + "\(.kind) \(.namespace) \(.name)" + ') + + if [[ -n "$problem_resources" ]]; then + echo "$(yellow)[DELETE] Removing problem resources before sync...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + echo "$(yellow) - Deleting $kind $res_name in $res_ns (background)$(reset)" + + if [[ "$kind" == "Job" ]]; then + kubectl patch job "$res_name" -n "$res_ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete pods -n "$res_ns" -l job-name="$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + kubectl delete job "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + kubectl patch crd "$res_name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true + kubectl delete crd "$res_name" --ignore-not-found=true --timeout=10s 2>/dev/null & + else + kubectl delete "$kind" "$res_name" -n "$res_ns" --ignore-not-found=true --timeout=10s 2>/dev/null & + fi + done <<< "$problem_resources" + echo "$(yellow)[INFO] Waiting for cleanup to complete...$(reset)" + sleep 3 + + # Verify resources are deleted, if still present, force finalizer removal + echo "$(yellow)[VERIFY] Checking if resources were successfully deleted...$(reset)" + while IFS= read -r res_line; do + [[ -z "$res_line" ]] && continue + read -r kind res_ns res_name <<< "$res_line" + + if [[ "$kind" == "Job" ]]; then + if kubectl get job "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] Job $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch job "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete job "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "CustomResourceDefinition" ]]; then + if kubectl get crd "$res_name" &>/dev/null; then + echo "$(red)[STUCK] CRD $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch crd "$res_name" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete crd "$res_name" --force --grace-period=0 2>/dev/null & + fi + elif [[ "$kind" == "ExternalSecret" || "$kind" == "SecretStore" || "$kind" == "ClusterSecretStore" ]]; then + if kubectl get "$kind" "$res_name" -n "$res_ns" &>/dev/null; then + echo "$(red)[STUCK] $kind $res_name still exists, forcing finalizer removal...$(reset)" + kubectl patch "$kind" "$res_name" -n "$res_ns" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' 2>/dev/null || true + kubectl delete "$kind" "$res_name" -n "$res_ns" --force --grace-period=0 2>/dev/null & + fi + fi + done <<< "$problem_resources" + sleep 2 + fi + echo "$(yellow)[INFO] Syncing $name with --force --replace --server-side (safer for CRD upgrades)...$(reset)" start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --force --replace --server-side --grpc-web 2>&1) rc=$? 
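# The cleanup and verification passes above repeat the same clear-finalizers-
# then-delete steps for each resource kind (Job, CustomResourceDefinition,
# ExternalSecret, SecretStore, ClusterSecretStore). A minimal sketch of that
# recipe as one helper; the name force_delete_resource is illustrative only,
# is not defined in after_upgrade_restart.sh, and it folds the script's
# background delete plus later verification into a single synchronous call.
force_delete_resource() {
    local kind="$1" name="$2" ns="$3"   # pass "" for cluster-scoped kinds such as CRDs or ClusterSecretStores
    if [[ -n "$ns" ]]; then
        kubectl patch "$kind" "$name" -n "$ns" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true
        kubectl delete "$kind" "$name" -n "$ns" --ignore-not-found=true --timeout=10s 2>/dev/null || \
            kubectl delete "$kind" "$name" -n "$ns" --force --grace-period=0 2>/dev/null || true
    else
        kubectl patch "$kind" "$name" --type=merge -p='{"metadata":{"finalizers":[]}}' 2>/dev/null || true
        kubectl delete "$kind" "$name" --ignore-not-found=true --timeout=10s 2>/dev/null || \
            kubectl delete "$kind" "$name" --force --grace-period=0 2>/dev/null || true
    fi
}
# Illustrative call, mirroring the Job branch above: force_delete_resource job "$res_name" "$res_ns"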
- # Special handling for nginx-ingress-pxe-boots - elif [[ "$name" == "nginx-ingress-pxe-boots" ]]; then - echo "$(yellow)[INFO] Syncing nginx-ingress-pxe-boots with --force (safer for upgrades)...$(reset)" - start_ts=$(date +%s) - LOG=$(argocd app sync "$full_app" --force --grpc-web 2>&1) - rc=$? else + # Standard sync for apps not in SERVER_SIDE_APPS start_ts=$(date +%s) LOG=$(argocd app sync "$full_app" --grpc-web 2>&1) rc=$? From fb9ce53c4707b95d26dbb355e6fc8d5ae1001613 Mon Sep 17 00:00:00 2001 From: Sunil Parida Date: Fri, 5 Dec 2025 06:57:43 +0530 Subject: [PATCH 12/12] updated flag check UPGRADE_FROM_3_1 and lint error fix (#1225) --- .../onprem/after_upgrade_restart.sh | 13 ++--- on-prem-installers/onprem/onprem_upgrade.sh | 53 +++++++++++++++---- on-prem-installers/onprem/upgrade_postgres.sh | 10 ++-- 3 files changed, 54 insertions(+), 22 deletions(-) diff --git a/on-prem-installers/onprem/after_upgrade_restart.sh b/on-prem-installers/onprem/after_upgrade_restart.sh index 7d4cc206e..cc4d63e2e 100755 --- a/on-prem-installers/onprem/after_upgrade_restart.sh +++ b/on-prem-installers/onprem/after_upgrade_restart.sh @@ -467,7 +467,7 @@ sync_not_green_apps_once() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ \ $name\ ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -796,7 +796,7 @@ sync_all_apps_exclude_root() { echo "$(yellow)[INFO] Attempt ${attempt}/${APP_MAX_RETRIES}, elapsed: 0s$(reset)" # Check if app requires server-side apply and special cleanup - if [[ " $SERVER_SIDE_APPS " =~ " $name " ]]; then + if [[ " $SERVER_SIDE_APPS " =~ \ $name\ ]]; then echo "$(yellow)[INFO] Stopping any ongoing operations for $name before force sync...$(reset)" argocd app terminate-op "$full_app" --grpc-web 2>/dev/null || true sleep 2 @@ -1251,7 +1251,8 @@ post_upgrade_cleanup() { kubectl delete secret tls-boots -n orch-boots || true kubectl delete secret boots-ca-cert -n orch-gateway || true kubectl delete secret boots-ca-cert -n orch-infra || true - + echo "[INFO] Waiting 30 seconds for secrets cleanup to complete before deleting dkam pods..." + sleep 30 echo "[INFO] Deleting dkam pods in namespace orch-infra..." 
kubectl delete pod -n orch-infra -l app.kubernetes.io/name=dkam 2>/dev/null || true @@ -1265,13 +1266,7 @@ execute_full_sync() { sync_until_green_ns_exclude_root print_header "Syncing root-app after all other apps are green" sync_root_app_only - post_upgrade_cleanup - - sleep 60 - print_header "Post-upgrade: Syncing all apps (excluding root-app) again" - sync_all_apps_exclude_root - print_header "Post-upgrade: Syncing root-app again" sync_root_app_only } diff --git a/on-prem-installers/onprem/onprem_upgrade.sh b/on-prem-installers/onprem/onprem_upgrade.sh index 9f2bd55a7..8a68de454 100755 --- a/on-prem-installers/onprem/onprem_upgrade.sh +++ b/on-prem-installers/onprem/onprem_upgrade.sh @@ -8,7 +8,10 @@ # Description: This script: # If requested - does a backup of PVs and cluster's ETCD # Downloads debian packages and repo artifacts, -# Upgrades packages to v3.1.0: +# Upgrades packages from either: +# - v3.1.3 to latest (set -u true or omit, default) +# - v2025.02 to latest (set -u false) +# Upgrades: # - OS config, # - RKE2 and basic cluster components, # - ArgoCD, @@ -18,6 +21,7 @@ # Usage: ./onprem_upgrade # -o: Override production values with dev values # -b: enable backup of Orchestrator PVs before upgrade (optional) +# -u [true|false]: specify source version: true=from 3.1.3 (default), false=from 2025.02 # -h: help (optional) set -e @@ -77,7 +81,8 @@ ORCH_INSTALLER_PROFILE="${ORCH_INSTALLER_PROFILE:-onprem}" DEPLOY_VERSION="${DEPLOY_VERSION:-v3.1.0}" # Updated to v3.1.0 GITEA_IMAGE_REGISTRY="${GITEA_IMAGE_REGISTRY:-docker.io}" USE_LOCAL_PACKAGES="${USE_LOCAL_PACKAGES:-false}" # New flag for local packages -UPGRADE_3_1_X="${UPGRADE_3_1_X:-true}" +# UPGRADE_FROM_3_1_X indicates SOURCE version: true=upgrading FROM 3.1.3, false=upgrading FROM 2025.02 +UPGRADE_FROM_3_1_X="${UPGRADE_FROM_3_1_X:-true}" # Default: upgrading from 3.1.3 ### Variables cwd=$(pwd) @@ -624,21 +629,32 @@ cleanup_gitea_secrets() { usage() { cat >&2 < postgres_secret.yaml else kubectl get secret -n orch-database passwords -o yaml > postgres_secret.yaml @@ -810,7 +843,7 @@ if [[ ! 
-s postgres-secrets-password.txt ]]; then IAM_TENANCY=$(kubectl get secret iam-tenancy-local-postgresql -n orch-iam -o jsonpath='{.data.PGPASSWORD}') PLATFORM_KEYCLOAK=$(kubectl get secret platform-keycloak-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') VAULT=$(kubectl get secret vault-local-postgresql -n orch-platform -o jsonpath='{.data.PGPASSWORD}') - if [[ "$UPGRADE_3_1_X" == "true" ]]; then + if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then POSTGRESQL=$(kubectl get secret postgresql -n orch-database -o jsonpath='{.data.postgres-password}') else POSTGRESQL=$(kubectl get secret orch-database-postgresql -n orch-database -o jsonpath='{.data.password}') @@ -1066,7 +1099,7 @@ patch_secrets sleep 10 # Restore secret after app delete but before postgress restored -if [[ "$UPGRADE_3_1_X" == "true" ]]; then +if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then yq e 'del(.metadata.labels, .metadata.annotations, .metadata.uid, .metadata.creationTimestamp)' postgres_secret.yaml | kubectl apply -f - else yq e ' diff --git a/on-prem-installers/onprem/upgrade_postgres.sh b/on-prem-installers/onprem/upgrade_postgres.sh index 45cd7abe8..e24f49e1e 100755 --- a/on-prem-installers/onprem/upgrade_postgres.sh +++ b/on-prem-installers/onprem/upgrade_postgres.sh @@ -11,7 +11,11 @@ local_backup_path="${POSTGRES_LOCAL_BACKUP_PATH}${local_backup_file}" POSTGRES_USERNAME="postgres" application_namespace=onprem -if [[ "$UPGRADE_3_1_X" == "true" ]]; then +# UPGRADE_FROM_3_1_X is set and exported by onprem_upgrade.sh +# Default to true if not set (upgrading FROM 3.1.3) +UPGRADE_FROM_3_1_X="${UPGRADE_FROM_3_1_X:-true}" + +if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then podname="postgresql-0" else podname="postgresql-cluster-1" @@ -54,7 +58,7 @@ backup_postgres() { fi echo "Backing up databases from pod $podname in namespace $postgres_namespace..." - if [[ "$UPGRADE_3_1_X" == "true" ]]; then + if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then remote_backup_path="/tmp/${postgres_namespace}_backup.sql" else remote_backup_path="/var/lib/postgresql/data/${postgres_namespace}_backup.sql" @@ -109,7 +113,7 @@ restore_postgres() { echo "Restoring backup databases from pod $podname in namespace $postgres_namespace..." # Get postgres password from secret - if [[ "$UPGRADE_3_1_X" == "true" ]]; then + if [[ "$UPGRADE_FROM_3_1_X" == "true" ]]; then PGPASSWORD=$(kubectl get secret -n $postgres_namespace postgresql -o jsonpath='{.data.postgres-password}' | base64 -d) else PGPASSWORD=$(kubectl get secret -n $postgres_namespace orch-database-postgresql -o jsonpath='{.data.password}' | base64 -d)
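The -u option documented in onprem_upgrade.sh is what drives the UPGRADE_FROM_3_1_X selection consumed here by upgrade_postgres.sh (pod name, backup path and password secret). Assuming the option parsing matches the usage text in patch 12, typical invocations would look like the sketch below; only the flag value differs, everything else follows from the diffs above.

# Source deployment is 3.1.3 (default; -u true may be omitted), with PV backup enabled:
./onprem_upgrade.sh -b

# Source deployment is 2025.02: upgrade_postgres.sh then targets pod postgresql-cluster-1,
# reads the orch-database-postgresql secret and stages the dump under /var/lib/postgresql/data:
./onprem_upgrade.sh -u false

# upgrade_postgres.sh re-derives the same default when it is run on its own:
UPGRADE_FROM_3_1_X="${UPGRADE_FROM_3_1_X:-true}"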
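On the lint fix in patch 12: the SERVER_SIDE_APPS membership test keeps its surrounding spaces literal by escaping them rather than quoting the whole right-hand side of =~. A self-contained illustration with made-up app names, plus the equivalent glob form that avoids regex interpretation of $name altogether:

SERVER_SIDE_APPS="app-one app-two app-three"   # made-up list, for illustration only
name="app-two"

# Escaped-space regex form, as used after the lint fix:
if [[ " $SERVER_SIDE_APPS " =~ \ $name\  ]]; then
    echo "$name is synced with --force --replace --server-side"
fi

# Equivalent glob form; $name is matched literally, with no regex interpretation:
if [[ " $SERVER_SIDE_APPS " == *" $name "* ]]; then
    echo "$name is synced with --force --replace --server-side"
fi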