
Commit 72fa8cb

Rei1010 authored and archlitchi committed
Optimize E2E with pod status check (#847)
Signed-off-by: wen.rui <wen.rui@daocloud.io>
1 parent 5409435 commit 72fa8cb

File tree: 3 files changed, +36 -10 lines changed

.github/workflows/call-e2e.yaml (+1 -2)
hack/deploy-helm.sh (+3 -2)
hack/util.sh (+32 -6)

.github/workflows/call-e2e.yaml

Lines changed: 1 addition & 2 deletions
@@ -28,6 +28,7 @@ jobs:
     environment: ${{ matrix.device }}
     env:
       E2E_TYPE: ${{ inputs.type }}
+      HAMI_VERSION: ${{ inputs.ref }}
     steps:
       - name: checkout code
         uses: actions/checkout@v4
@@ -68,8 +69,6 @@ jobs:
           ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami"
 
       - name: deploy hami helm
-        env:
-          HAMI_VERSION: ${{ inputs.ref }}
         run: |
           make helm-deploy
 
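With HAMI_VERSION promoted from the per-step env block of the "deploy hami helm" step to the job-level env block, every step in the job can now read it, not only the Helm deployment step. A minimal sketch of what a step's shell sees under this change (the echo line is illustrative and not part of the workflow):

    # HAMI_VERSION is inherited from the job-level env block (${{ inputs.ref }}).
    echo "Deploying HAMi at ref: ${HAMI_VERSION:?expected from the job-level env}"
    make helm-deploy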

hack/deploy-helm.sh

Lines changed: 3 additions & 2 deletions
@@ -42,8 +42,9 @@ else
 fi
 
 # Set Helm Chart source based on E2E_TYPE.
+echo "E2E Type is: ${E2E_TYPE}"
+
 if [ "${E2E_TYPE}" == "pullrequest" ]; then
-  echo "E2E Type is: ${E2E_TYPE}"
   # Ensure the charts directory exists and contains a .tgz file
   if [ -d "charts" ] && [ -n "$(ls charts/*.tgz 2>/dev/null)" ]; then
     HELM_SOURCE=$(ls charts/*.tgz | head -n 1) # Use the first .tgz file found
@@ -96,7 +97,7 @@ fi
 echo "Checking Pod status..."
 kubectl --kubeconfig "${KUBE_CONF}" get po -n "${TARGET_NS}"
 
-if ! util::check_pods_status "${KUBE_CONF}" "${TARGET_NS}"; then
+if ! util::check_pods_status "${KUBE_CONF}" ; then
   echo "Error: Pods are not running correctly."
   exit 1
 fi
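Dropping the "${TARGET_NS}" argument means util::check_pods_status falls back to its new empty-namespace default and checks Pods cluster-wide rather than only the target namespace. A minimal usage sketch, assuming the updated hack/util.sh is sourced (the namespace-scoped call is illustrative only):

    # Cluster-wide check, as the deploy script now does:
    util::check_pods_status "${KUBE_CONF}"

    # A namespace-scoped check is still possible by passing it explicitly:
    util::check_pods_status "${KUBE_CONF}" "hami-system"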

hack/util.sh

Lines changed: 32 additions & 6 deletions
@@ -116,9 +116,38 @@ function util::wait_ip_reachable {
 # Check Pod status in a namespace.
 function util::check_pods_status {
   local kubeconfig=${1:-""}
-  local namespace=${2:-"hami-system"}
+  local namespace=${2:-""}
+  local retries=${3:-10}
+  local interval=${4:-30}
+
+  local attempt=0
   local unhealthy_pods
-  unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers | awk '!/Running|Succeeded/ {print $1}')
+
+  while (( attempt < retries )); do
+    echo "Checking Pod status (Attempt $(( attempt + 1 ))/$retries)..."
+
+    # Check for unhealthy Pods in the namespace(s), ignoring the Running, Succeeded, and Completed states.
+    if [[ -z "$namespace" ]]; then
+      unhealthy_pods=$(kubectl get po -A --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $2}')
+    else
+      unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $1}')
+    fi
+
+    if [[ -z "$unhealthy_pods" ]]; then
+      echo "PASS: All Pods are in Running or Succeeded state."
+      return 0
+    fi
+
+    echo "Found unhealthy pods:"
+    echo "$unhealthy_pods"
+
+    if (( attempt < retries - 1 )); then
+      echo "Retrying pod check in ${interval}s..."
+      sleep "$interval"
+    fi
+
+    (( attempt++ ))
+  done
 
   if [[ -n "$unhealthy_pods" ]]; then
     echo "Found unhealthy pods in namespace $namespace:"
@@ -134,8 +163,5 @@ function util::check_pods_status {
     done
 
     return 1
-  else
-    echo "PASS: All Pods are in Running state."
-    return 0
   fi
-}
+}
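For reference, the awk filter keeps only rows that do not contain Running, Succeeded, or Completed; with kubectl get po -A the Pod name is field $2 (field $1 is the namespace), which is why the two branches print different columns. A standalone sketch of the filter on made-up output (Pod names and namespaces are invented for illustration):

    printf '%s\n' \
      'kube-system  coredns-abc         1/1  Running           0  5m' \
      'hami-system  hami-device-plugin  0/1  CrashLoopBackOff  3  5m' \
      'default      cleanup-job-xyz     0/1  Completed         0  2m' \
      | awk '!/Running|Succeeded|Completed/ {print $2}'
    # Prints only: hami-device-plugin

With the defaults (retries=10, interval=30), the loop sleeps only between attempts, so it waits up to about 9 x 30s = 4.5 minutes before giving up.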
