
Commit 72fa8cb

Rei1010 authored and archlitchi committed
Optimize E2E with pod status check (#847)
Signed-off-by: wen.rui <wen.rui@daocloud.io>
1 parent 5409435 commit 72fa8cb

File tree: 3 files changed, +36 -10 lines changed

.github/workflows/call-e2e.yaml (+1 -2)
hack/deploy-helm.sh (+3 -2)
hack/util.sh (+32 -6)

.github/workflows/call-e2e.yaml

Lines changed: 1 addition & 2 deletions
@@ -28,6 +28,7 @@ jobs:
     environment: ${{ matrix.device }}
     env:
       E2E_TYPE: ${{ inputs.type }}
+      HAMI_VERSION: ${{ inputs.ref }}
     steps:
       - name: checkout code
         uses: actions/checkout@v4
@@ -68,8 +69,6 @@ jobs:
           ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami"
 
       - name: deploy hami helm
-        env:
-          HAMI_VERSION: ${{ inputs.ref }}
         run: |
           make helm-deploy
 
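With HAMI_VERSION promoted from the per-step env block of the "deploy hami helm" step to the job-level env block, every step in the job can now read it, not only the Helm deployment step. A minimal sketch of what a step's shell sees under this change (the echo line is illustrative and not part of the workflow):

    # HAMI_VERSION is inherited from the job-level env block (${{ inputs.ref }}).
    echo "Deploying HAMi at ref: ${HAMI_VERSION:?expected from the job-level env}"
    make helm-deploy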

hack/deploy-helm.sh

Lines changed: 3 additions & 2 deletions
@@ -42,8 +42,9 @@ else
 fi
 
 # Set Helm Chart source based on E2E_TYPE.
+echo "E2E Type is: ${E2E_TYPE}"
+
 if [ "${E2E_TYPE}" == "pullrequest" ]; then
-  echo "E2E Type is: ${E2E_TYPE}"
   # Ensure the charts directory exists and contains a .tgz file
   if [ -d "charts" ] && [ -n "$(ls charts/*.tgz 2>/dev/null)" ]; then
     HELM_SOURCE=$(ls charts/*.tgz | head -n 1) # Use the first .tgz file found
@@ -96,7 +97,7 @@ fi
 echo "Checking Pod status..."
 kubectl --kubeconfig "${KUBE_CONF}" get po -n "${TARGET_NS}"
 
-if ! util::check_pods_status "${KUBE_CONF}" "${TARGET_NS}"; then
+if ! util::check_pods_status "${KUBE_CONF}" ; then
   echo "Error: Pods are not running correctly."
   exit 1
 fi
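Dropping the "${TARGET_NS}" argument means util::check_pods_status falls back to its new empty-namespace default and checks Pods cluster-wide rather than only the target namespace. A minimal usage sketch, assuming the updated hack/util.sh is sourced (the namespace-scoped call is illustrative only):

    # Cluster-wide check, as the deploy script now does:
    util::check_pods_status "${KUBE_CONF}"

    # A namespace-scoped check is still possible by passing it explicitly:
    util::check_pods_status "${KUBE_CONF}" "hami-system"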

hack/util.sh

Lines changed: 32 additions & 6 deletions
@@ -116,9 +116,38 @@ function util::wait_ip_reachable {
 # Check Pod status in a namespace.
 function util::check_pods_status {
   local kubeconfig=${1:-""}
-  local namespace=${2:-"hami-system"}
+  local namespace=${2:-""}
+  local retries=${3:-10}
+  local interval=${4:-30}
+
+  local attempt=0
   local unhealthy_pods
-  unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers | awk '!/Running|Succeeded/ {print $1}')
+
+  while (( attempt < retries )); do
+    echo "Checking Pod status (Attempt $(( attempt + 1 ))/$retries)..."
+
+    # Check for unhealthy Pods in the namespace(s), ignoring the Running, Succeeded, and Completed states.
+    if [[ -z "$namespace" ]]; then
+      unhealthy_pods=$(kubectl get po -A --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $2}')
+    else
+      unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $1}')
+    fi
+
+    if [[ -z "$unhealthy_pods" ]]; then
+      echo "PASS: All Pods are in Running or Succeeded state."
+      return 0
+    fi
+
+    echo "Found unhealthy pods:"
+    echo "$unhealthy_pods"
+
+    if (( attempt < retries - 1 )); then
+      echo "Retrying pod check in ${interval}s..."
+      sleep "$interval"
+    fi
+
+    (( attempt++ ))
+  done
 
   if [[ -n "$unhealthy_pods" ]]; then
     echo "Found unhealthy pods in namespace $namespace:"
@@ -134,8 +163,5 @@ function util::check_pods_status {
     done
 
     return 1
-  else
-    echo "PASS: All Pods are in Running state."
-    return 0
   fi
-}
+}
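For reference, the awk filter keeps only rows that do not contain Running, Succeeded, or Completed; with kubectl get po -A the Pod name is field $2 (field $1 is the namespace), which is why the two branches print different columns. A standalone sketch of the filter on made-up output (Pod names and namespaces are invented for illustration):

    printf '%s\n' \
      'kube-system  coredns-abc         1/1  Running           0  5m' \
      'hami-system  hami-device-plugin  0/1  CrashLoopBackOff  3  5m' \
      'default      cleanup-job-xyz     0/1  Completed         0  2m' \
      | awk '!/Running|Succeeded|Completed/ {print $2}'
    # Prints only: hami-device-plugin

With the defaults (retries=10, interval=30), the loop sleeps only between attempts, so it waits up to about 9 x 30s = 4.5 minutes before giving up.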
