Skip to content

Commit c106e4e

Browse files
authored
fix: replace wait with a test for Rancher (#89)
Signed-off-by: matttrach <matt.trachier@suse.com>
1 parent d25e002 commit c106e4e

10 files changed

Lines changed: 232 additions & 34 deletions

File tree

.github/workflows/release.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -231,7 +231,7 @@ jobs:
231231
- test_TestProdBasic
232232
- test_TestDownstreamBasic
233233
- test_TestDownstreamProd
234-
if: needs.release.outputs.release_pr
234+
if: always() && needs.release.outputs.release_pr
235235
runs-on: ubuntu-latest
236236
steps:
237237
- uses: actions/checkout@v4

modules/rancher_bootstrap/main.tf

Lines changed: 24 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -86,30 +86,46 @@ resource "terraform_data" "create" {
8686
MAX=2
8787
EXITCODE=1
8888
ATTEMPTS=0
89+
E=1
90+
E1=0
8991
while [ $EXITCODE -gt 0 ] && [ $ATTEMPTS -lt $MAX ]; do
90-
E=$EXITCODE
9192
A=0
9293
while [ $E -gt 0 ] && [ $A -lt $MAX ]; do
93-
timeout 3600 terraform apply -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate"
94+
timeout 1h terraform apply -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate"
9495
E=$?
96+
if [ $E -eq 124 ]; then echo "Apply timed out after 1 hour"; fi
9597
A=$((A+1))
9698
done
9799
# don't destroy if the last attempt fails
98100
if [ $E -gt 0 ] && [ $ATTEMPTS != $((MAX-1)) ]; then
99101
A1=0
100-
E1=$EXITCODE
101102
while [ $E1 -gt 0 ] && [ $A1 -lt $MAX ]; do
102-
timeout 3600 terraform destroy -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate"
103+
timeout 1h terraform destroy -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate"
103104
E1=$?
105+
if [ $E1 -eq 124 ]; then echo "Apply timed out after 1 hour"; fi
104106
A1=$((A1+1))
105107
done
106108
fi
107-
EXITCODE=$((E+E1))
109+
if [ $E -gt 0 ]; then
110+
echo "apply failed..."
111+
fi
112+
if [ $E1 -gt 0 ]; then
113+
echo "destroy failed..."
114+
fi
115+
if [ $E -gt 0 ] || [ $E1 -gt 0 ]; then
116+
EXITCODE=1
117+
else
118+
EXITCODE=0
119+
fi
108120
ATTEMPTS=$((ATTEMPTS+1))
109-
echo "wait 30 seconds between attempts..."
110-
sleep 30
121+
if [ $EXITCODE -gt 0 ] && [ $ATTEMPTS -lt $MAX ]; then
122+
echo "wait 30 seconds between attempts..."
123+
sleep 30
124+
fi
111125
done
112-
126+
if [ $ATTEMPTS -eq $MAX ]; then echo "max attempts reached..."; fi
127+
if [ $EXITCODE -ne 0 ]; then echo "failure, exit code $EXITCODE..."; fi
128+
if [ $EXITCODE -eq 0 ]; then echo "success..."; fi
113129
exit $EXITCODE
114130
EOT
115131
}

modules/rancher_bootstrap/rancher/main.tf

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -150,10 +150,10 @@ resource "helm_release" "rancher" {
150150
chart = "${path.root}/rancher-${local.rancher_version}.tgz" # "${local.rancher_helm_repository}/${local.rancher_channel}/rancher-${local.rancher_version}.tgz"
151151
namespace = "cattle-system"
152152
create_namespace = false
153-
wait = true
154-
wait_for_jobs = true
153+
wait = false
154+
wait_for_jobs = false
155155
force_update = true
156-
timeout = 3600 # 60m
156+
timeout = 1800 # 30m
157157

158158
set {
159159
name = "hostname"
@@ -205,22 +205,32 @@ resource "helm_release" "rancher" {
205205
}
206206
}
207207

208-
resource "time_sleep" "settle_after_rancher" {
208+
resource "terraform_data" "wait_for_rancher" {
209209
depends_on = [
210210
time_sleep.settle_before_rancher,
211211
kubernetes_manifest.issuer,
212+
terraform_data.wait_for_nginx,
213+
terraform_data.build_chart,
212214
helm_release.rancher,
213215
]
214-
create_duration = "120s"
216+
provisioner "local-exec" {
217+
command = <<-EOT
218+
cd ${abspath(path.root)} || true
219+
chmod +x ${abspath(path.module)}/runningPods.sh
220+
echo "using kubeconfig located at $KUBECONFIG"
221+
${abspath(path.module)}/runningPods.sh
222+
EOT
223+
}
215224
}
216225

217-
218226
resource "terraform_data" "get_public_cert_info" {
219227
depends_on = [
220228
time_sleep.settle_before_rancher,
221229
kubernetes_manifest.issuer,
230+
terraform_data.wait_for_nginx,
231+
terraform_data.build_chart,
222232
helm_release.rancher,
223-
time_sleep.settle_after_rancher,
233+
terraform_data.wait_for_rancher,
224234
]
225235
provisioner "local-exec" {
226236
command = <<-EOT
@@ -268,8 +278,10 @@ resource "rancher2_bootstrap" "admin" {
268278
depends_on = [
269279
time_sleep.settle_before_rancher,
270280
kubernetes_manifest.issuer,
281+
terraform_data.wait_for_nginx,
282+
terraform_data.build_chart,
271283
helm_release.rancher,
272-
time_sleep.settle_after_rancher,
284+
terraform_data.wait_for_rancher,
273285
terraform_data.get_public_cert_info,
274286
]
275287
password = random_password.password.result
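modules/rancher_bootstrap/rancher/runningPods.sh (file heading missing from the page; path inferred from the `${abspath(path.module)}/runningPods.sh` call in the preceding main.tf diff)

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change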
@@ -0,0 +1,67 @@
#!/bin/bash
# Waits until every pod in every namespace reports the Running or Succeeded
# phase, and requires that condition to hold on several consecutive checks
# before declaring the cluster settled.
#
# Exit status: 0 when the required number of successful checks is reached,
#              1 otherwise.
# Cluster diagnostics (nodes, all resources, pods) are printed either way.
set -x

# One tab-separated line per pod: name, namespace, phase.
JSONPATH='{range .items[*]}{.metadata.name}{"\t"}{.metadata.namespace}{"\t"}{.status.phase}{"\n"}{end}'

# notReady returns 0 (true) while at least one pod is in a phase other than
# Running or Succeeded, and 1 (false) once every listed pod is in one of those
# phases.
notReady() {
  local pods pending

  # A failed kubectl call (API unreachable, bad kubeconfig, ...) produces no
  # output; treat it as "not ready" rather than mistaking it for "no pending pods".
  if ! pods=$(kubectl get pods -A -o jsonpath="$JSONPATH"); then
    return 0
  fi

  # Compare the phase column exactly so a pod whose name or namespace happens
  # to contain "Running" or "Succeeded" is not silently ignored.
  pending=$(printf '%s\n' "$pods" | awk -F'\t' 'NF >= 3 && $3 != "Running" && $3 != "Succeeded"')

  [ -n "$pending" ]
}

# readyWait polls notReady every 30 seconds for up to 10 minutes.
# Returns 0 as soon as all pods are ready, 1 if the time budget is exhausted.
readyWait() {
  local timeout_minutes=10
  local interval_seconds=30
  local max_attempts=$((timeout_minutes * 60 / interval_seconds))
  local attempts=0

  while notReady; do
    if [ "$attempts" -ge "$max_attempts" ]; then
      return 1
    fi
    attempts=$((attempts + 1))
    sleep "$interval_seconds"
  done
  return 0
}

SUCCESSES=0
SUCCESSES_NEEDED=3 # require three successes to make sure everything is settled

# Check the counter first so that no extra readyWait (up to 10 minutes, result
# unused) runs after the required number of successes has been reached.
while [ "$SUCCESSES" -lt "$SUCCESSES_NEEDED" ] && readyWait; do
  SUCCESSES=$((SUCCESSES + 1))
  echo "succeeeded $SUCCESSES times..."
  sleep 30
done

if [ "$SUCCESSES" -eq "$SUCCESSES_NEEDED" ]; then
  echo "$SUCCESSES_NEEDED successes reached, passed..."
  EXITCODE=0
else
  echo "$SUCCESSES_NEEDED successes not reached, failed..."
  EXITCODE=1
fi

# Diagnostics are best-effort: never let them change the exit status.
echo "nodes..."
kubectl get nodes || true

echo "all..."
kubectl get all -A || true

echo "pods..."
kubectl get pods -A || true

exit $EXITCODE

modules/rancher_bootstrap/rancher_externalTLS/main.tf

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -67,8 +67,8 @@ resource "helm_release" "rancher" {
6767
chart = "${path.root}/rancher-${local.rancher_version}.tgz" #"${local.rancher_helm_repository}/${local.rancher_channel}/rancher-${local.rancher_version}.tgz"
6868
namespace = "cattle-system"
6969
create_namespace = false
70-
wait = true
71-
wait_for_jobs = true
70+
wait = false
71+
wait_for_jobs = false
7272
force_update = true
7373
timeout = 1800 # 30m
7474

@@ -106,12 +106,19 @@ resource "helm_release" "rancher" {
106106
}
107107
}
108108

109-
resource "time_sleep" "settle_after_rancher" {
109+
resource "terraform_data" "wait_for_rancher" {
110110
depends_on = [
111111
time_sleep.settle_before_rancher,
112112
helm_release.rancher,
113113
]
114-
create_duration = "120s"
114+
provisioner "local-exec" {
115+
command = <<-EOT
116+
cd ${abspath(path.root)} || true
117+
chmod +x ${abspath(path.module)}/runningPods.sh
118+
echo "using kubeconfig located at $KUBECONFIG"
119+
${abspath(path.module)}/runningPods.sh
120+
EOT
121+
}
115122
}
116123

117124
resource "random_password" "password" {
@@ -125,7 +132,7 @@ resource "terraform_data" "get_public_cert_info" {
125132
random_password.password,
126133
time_sleep.settle_before_rancher,
127134
helm_release.rancher,
128-
time_sleep.settle_after_rancher,
135+
terraform_data.wait_for_rancher,
129136
]
130137
provisioner "local-exec" {
131138
command = <<-EOT
@@ -148,7 +155,7 @@ resource "terraform_data" "get_ping" {
148155
random_password.password,
149156
time_sleep.settle_before_rancher,
150157
helm_release.rancher,
151-
time_sleep.settle_after_rancher,
158+
terraform_data.wait_for_rancher,
152159
terraform_data.get_public_cert_info,
153160
]
154161
provisioner "local-exec" {
@@ -181,7 +188,7 @@ resource "rancher2_bootstrap" "admin" {
181188
random_password.password,
182189
time_sleep.settle_before_rancher,
183190
helm_release.rancher,
184-
time_sleep.settle_after_rancher,
191+
terraform_data.wait_for_rancher,
185192
terraform_data.get_public_cert_info,
186193
terraform_data.get_ping,
187194
]
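modules/rancher_bootstrap/rancher_externalTLS/runningPods.sh (file heading missing from the page; path inferred from the `${abspath(path.module)}/runningPods.sh` call in the preceding main.tf diff)

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change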
@@ -0,0 +1,67 @@
#!/bin/bash
# Waits until every pod in every namespace reports the Running or Succeeded
# phase, and requires that condition to hold on several consecutive checks
# before declaring the cluster settled.
#
# Exit status: 0 when the required number of successful checks is reached,
#              1 otherwise.
# Cluster diagnostics (nodes, all resources, pods) are printed either way.
set -x

# One tab-separated line per pod: name, namespace, phase.
JSONPATH='{range .items[*]}{.metadata.name}{"\t"}{.metadata.namespace}{"\t"}{.status.phase}{"\n"}{end}'

# notReady returns 0 (true) while at least one pod is in a phase other than
# Running or Succeeded, and 1 (false) once every listed pod is in one of those
# phases.
notReady() {
  local pods pending

  # A failed kubectl call (API unreachable, bad kubeconfig, ...) produces no
  # output; treat it as "not ready" rather than mistaking it for "no pending pods".
  if ! pods=$(kubectl get pods -A -o jsonpath="$JSONPATH"); then
    return 0
  fi

  # Compare the phase column exactly so a pod whose name or namespace happens
  # to contain "Running" or "Succeeded" is not silently ignored.
  pending=$(printf '%s\n' "$pods" | awk -F'\t' 'NF >= 3 && $3 != "Running" && $3 != "Succeeded"')

  [ -n "$pending" ]
}

# readyWait polls notReady every 30 seconds for up to 10 minutes.
# Returns 0 as soon as all pods are ready, 1 if the time budget is exhausted.
readyWait() {
  local timeout_minutes=10
  local interval_seconds=30
  local max_attempts=$((timeout_minutes * 60 / interval_seconds))
  local attempts=0

  while notReady; do
    if [ "$attempts" -ge "$max_attempts" ]; then
      return 1
    fi
    attempts=$((attempts + 1))
    sleep "$interval_seconds"
  done
  return 0
}

SUCCESSES=0
SUCCESSES_NEEDED=3 # require three successes to make sure everything is settled

# Check the counter first so that no extra readyWait (up to 10 minutes, result
# unused) runs after the required number of successes has been reached.
while [ "$SUCCESSES" -lt "$SUCCESSES_NEEDED" ] && readyWait; do
  SUCCESSES=$((SUCCESSES + 1))
  echo "succeeeded $SUCCESSES times..."
  sleep 30
done

if [ "$SUCCESSES" -eq "$SUCCESSES_NEEDED" ]; then
  echo "$SUCCESSES_NEEDED successes reached, passed..."
  EXITCODE=0
else
  echo "$SUCCESSES_NEEDED successes not reached, failed..."
  EXITCODE=1
fi

# Diagnostics are best-effort: never let them change the exit status.
echo "nodes..."
kubectl get nodes || true

echo "all..."
kubectl get all -A || true

echo "pods..."
kubectl get pods -A || true

exit $EXITCODE

test/scripts/runningPods.sh

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,13 @@ while readyWait && [ "$SUCCESSES" -lt "$SUCCESSES_NEEDED" ]; do
4747
sleep 30
4848
done
4949

50-
echo "Pods are ready..."
50+
if [ "$SUCCESSES" -eq "$SUCCESSES_NEEDED" ]; then
51+
echo "$SUCCESSES_NEEDED reached, passed.."
52+
EXITCODE=0
53+
else
54+
echo "$SUCCESSES_NEEDED not reached, failed.."
55+
EXITCODE=1
56+
fi
5157

5258
echo "nodes..."
5359
kubectl get nodes || true
@@ -58,4 +64,4 @@ kubectl get all -A || true
5864
echo "pods..."
5965
kubectl get pods -A || true
6066

61-
exit 0
67+
exit $EXITCODE

test/tests/downstream/downstream_test.go

Lines changed: 20 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -108,20 +108,27 @@ func TestDownstreamBasic(t *testing.T) {
108108

109109
_, err = terraform.InitAndApplyE(t, terraformOptions)
110110
if err != nil {
111+
t.Log("Test failed, tearing down...")
112+
util.GetErrorLogs(t, testDir + "/kubeconfig")
111113
util.Teardown(t, testDir, terraformOptions, keyPair)
112114
os.Remove(exampleDir + ".terraform.lock.hcl")
113115
sshAgent.Stop()
114116
t.Fatalf("Error creating cluster: %s", err)
115117
}
116-
t.Log("Test passed, tearing down...")
118+
util.CheckReady(t, testDir + "/kubeconfig")
119+
util.CheckRunning(t, testDir + "/kubeconfig")
120+
if t.Failed() {
121+
t.Log("Test failed...")
122+
} else {
123+
t.Log("Test passed...")
124+
}
117125
util.Teardown(t, testDir, terraformOptions, keyPair)
118-
os.Remove(exampleDir + ".terraform.lock.hcl")
126+
os.Remove(exampleDir + "/.terraform.lock.hcl")
119127
sshAgent.Stop()
120128
}
121129

122130

123131

124-
125132
func TestDownstreamProd(t *testing.T) {
126133
t.Parallel()
127134
id := util.GetId()
@@ -214,13 +221,21 @@ func TestDownstreamProd(t *testing.T) {
214221

215222
_, err = terraform.InitAndApplyE(t, terraformOptions)
216223
if err != nil {
224+
t.Log("Test failed, tearing down...")
225+
util.GetErrorLogs(t, testDir + "/kubeconfig")
217226
util.Teardown(t, testDir, terraformOptions, keyPair)
218227
os.Remove(exampleDir + ".terraform.lock.hcl")
219228
sshAgent.Stop()
220229
t.Fatalf("Error creating cluster: %s", err)
221230
}
222-
t.Log("Test passed, tearing down...")
231+
util.CheckReady(t, testDir + "/kubeconfig")
232+
util.CheckRunning(t, testDir + "/kubeconfig")
233+
if t.Failed() {
234+
t.Log("Test failed...")
235+
} else {
236+
t.Log("Test passed...")
237+
}
223238
util.Teardown(t, testDir, terraformOptions, keyPair)
224-
os.Remove(exampleDir + ".terraform.lock.hcl")
239+
os.Remove(exampleDir + "/.terraform.lock.hcl")
225240
sshAgent.Stop()
226241
}

0 commit comments

Comments
 (0)