diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 2ff8db2..e828d07 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -231,7 +231,7 @@ jobs: - test_TestProdBasic - test_TestDownstreamBasic - test_TestDownstreamProd - if: needs.release.outputs.release_pr + if: always() && needs.release.outputs.release_pr runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/modules/rancher_bootstrap/main.tf b/modules/rancher_bootstrap/main.tf index 275eedf..7033474 100644 --- a/modules/rancher_bootstrap/main.tf +++ b/modules/rancher_bootstrap/main.tf @@ -86,30 +86,48 @@ resource "terraform_data" "create" { MAX=2 EXITCODE=1 ATTEMPTS=0 + E=1 + E1=0 while [ $EXITCODE -gt 0 ] && [ $ATTEMPTS -lt $MAX ]; do - E=$EXITCODE A=0 while [ $E -gt 0 ] && [ $A -lt $MAX ]; do - timeout 3600 terraform apply -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate" + timeout 1h terraform apply -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate" E=$? + if [ $E -eq 124 ]; then echo "Apply timed out after 1 hour"; fi A=$((A+1)) done # don't destroy if the last attempt fails if [ $E -gt 0 ] && [ $ATTEMPTS != $((MAX-1)) ]; then A1=0 - E1=$EXITCODE + # prime E1 so the destroy loop below runs at least once + E1=1 while [ $E1 -gt 0 ] && [ $A1 -lt $MAX ]; do - timeout 3600 terraform destroy -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate" + timeout 1h terraform destroy -var-file="inputs.tfvars" -auto-approve -state="${local.deploy_path}/tfstate" E1=$? + if [ $E1 -eq 124 ]; then echo "Destroy timed out after 1 hour"; fi A1=$((A1+1)) done fi - EXITCODE=$((E+E1)) + if [ $E -gt 0 ]; then + echo "apply failed..." + fi + if [ $E1 -gt 0 ]; then + echo "destroy failed..." + fi + if [ $E -gt 0 ] || [ $E1 -gt 0 ]; then + EXITCODE=1 + else + EXITCODE=0 + fi ATTEMPTS=$((ATTEMPTS+1)) - echo "wait 30 seconds between attempts..."
- sleep 30 + if [ $EXITCODE -gt 0 ] && [ $ATTEMPTS -lt $MAX ]; then + echo "wait 30 seconds between attempts..." + sleep 30 + fi done - + if [ $ATTEMPTS -eq $MAX ]; then echo "max attempts reached..."; fi + if [ $EXITCODE -ne 0 ]; then echo "failure, exit code $EXITCODE..."; fi + if [ $EXITCODE -eq 0 ]; then echo "success..."; fi exit $EXITCODE EOT } diff --git a/modules/rancher_bootstrap/rancher/main.tf b/modules/rancher_bootstrap/rancher/main.tf index 60d25b0..c9a3196 100644 --- a/modules/rancher_bootstrap/rancher/main.tf +++ b/modules/rancher_bootstrap/rancher/main.tf @@ -150,10 +150,10 @@ resource "helm_release" "rancher" { chart = "${path.root}/rancher-${local.rancher_version}.tgz" # "${local.rancher_helm_repository}/${local.rancher_channel}/rancher-${local.rancher_version}.tgz" namespace = "cattle-system" create_namespace = false - wait = true - wait_for_jobs = true + wait = false + wait_for_jobs = false force_update = true - timeout = 3600 # 60m + timeout = 1800 # 30m set { name = "hostname" @@ -205,22 +205,32 @@ resource "helm_release" "rancher" { } } -resource "time_sleep" "settle_after_rancher" { +resource "terraform_data" "wait_for_rancher" { depends_on = [ time_sleep.settle_before_rancher, kubernetes_manifest.issuer, + terraform_data.wait_for_nginx, + terraform_data.build_chart, helm_release.rancher, ] - create_duration = "120s" + provisioner "local-exec" { + command = <<-EOT + cd ${abspath(path.root)} || true + chmod +x ${abspath(path.module)}/runningPods.sh + echo "using kubeconfig located at $KUBECONFIG" + ${abspath(path.module)}/runningPods.sh + EOT + } } - resource "terraform_data" "get_public_cert_info" { depends_on = [ time_sleep.settle_before_rancher, kubernetes_manifest.issuer, + terraform_data.wait_for_nginx, + terraform_data.build_chart, helm_release.rancher, - time_sleep.settle_after_rancher, + terraform_data.wait_for_rancher, ] provisioner "local-exec" { command = <<-EOT @@ -268,8 +278,10 @@ resource "rancher2_bootstrap" "admin" { 
depends_on = [ time_sleep.settle_before_rancher, kubernetes_manifest.issuer, + terraform_data.wait_for_nginx, + terraform_data.build_chart, helm_release.rancher, - time_sleep.settle_after_rancher, + terraform_data.wait_for_rancher, terraform_data.get_public_cert_info, ] password = random_password.password.result diff --git a/modules/rancher_bootstrap/rancher/runningPods.sh b/modules/rancher_bootstrap/rancher/runningPods.sh new file mode 100755 index 0000000..5d0d515 --- /dev/null +++ b/modules/rancher_bootstrap/rancher/runningPods.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +JSONPATH="'{range .items[*]} + {.metadata.name}{\"\\t\"} \ + {.metadata.namespace}{\"\\t\"} \ + {.status.phase}{\"\\n\"} \ +{end}'" + +notReady() { + PODS=$(kubectl get pods -A -o jsonpath="$JSONPATH") || return 0 # kubectl itself failed: report not ready + # shellcheck disable=SC2060,SC2140 + NOT_READY=$(echo "$PODS" | grep -v "Running" | grep -v "Succeeded" | tr -d ["\t","\n"," ","'"] || true) + if [ -n "$NOT_READY" ]; then + # Some pods aren't running + return 0 + else + # All pods are running + return 1 + fi +} + +readyWait() { + TIMEOUT=10 # 10 minutes + TIMEOUT_MINUTES=$((TIMEOUT * 60)) + INTERVAL=30 # 30 seconds + MAX=$((TIMEOUT_MINUTES / INTERVAL)) + ATTEMPTS=0 + + while notReady; do + if [ "$ATTEMPTS" -lt "$MAX" ]; then + ATTEMPTS=$((ATTEMPTS + 1)) + sleep "$INTERVAL"; + else + return 1 + fi + done + return 0 +} + +SUCCESSES=0 +SUCCESSES_NEEDED=3 # require three successes to make sure everything is settled + +while [ "$SUCCESSES" -lt "$SUCCESSES_NEEDED" ] && readyWait; do + SUCCESSES=$((SUCCESSES + 1)) + echo "succeeeded $SUCCESSES times..." + sleep 30 +done + +if [ "$SUCCESSES" -eq "$SUCCESSES_NEEDED" ]; then + echo "$SUCCESSES_NEEDED successes reached, passed..." + EXITCODE=0 +else + echo "$SUCCESSES_NEEDED successes not reached, failed..." + EXITCODE=1 +fi + +echo "nodes..." +kubectl get nodes || true + +echo "all..." +kubectl get all -A || true + +echo "pods..."
+kubectl get pods -A || true + +exit $EXITCODE diff --git a/modules/rancher_bootstrap/rancher_externalTLS/main.tf b/modules/rancher_bootstrap/rancher_externalTLS/main.tf index 6154a1f..9b1b7bb 100644 --- a/modules/rancher_bootstrap/rancher_externalTLS/main.tf +++ b/modules/rancher_bootstrap/rancher_externalTLS/main.tf @@ -67,8 +67,8 @@ resource "helm_release" "rancher" { chart = "${path.root}/rancher-${local.rancher_version}.tgz" #"${local.rancher_helm_repository}/${local.rancher_channel}/rancher-${local.rancher_version}.tgz" namespace = "cattle-system" create_namespace = false - wait = true - wait_for_jobs = true + wait = false + wait_for_jobs = false force_update = true timeout = 1800 # 30m @@ -106,12 +106,19 @@ resource "helm_release" "rancher" { } } -resource "time_sleep" "settle_after_rancher" { +resource "terraform_data" "wait_for_rancher" { depends_on = [ time_sleep.settle_before_rancher, helm_release.rancher, ] - create_duration = "120s" + provisioner "local-exec" { + command = <<-EOT + cd ${abspath(path.root)} || true + chmod +x ${abspath(path.module)}/runningPods.sh + echo "using kubeconfig located at $KUBECONFIG" + ${abspath(path.module)}/runningPods.sh + EOT + } } resource "random_password" "password" { @@ -125,7 +132,7 @@ resource "terraform_data" "get_public_cert_info" { random_password.password, time_sleep.settle_before_rancher, helm_release.rancher, - time_sleep.settle_after_rancher, + terraform_data.wait_for_rancher, ] provisioner "local-exec" { command = <<-EOT @@ -148,7 +155,7 @@ resource "terraform_data" "get_ping" { random_password.password, time_sleep.settle_before_rancher, helm_release.rancher, - time_sleep.settle_after_rancher, + terraform_data.wait_for_rancher, terraform_data.get_public_cert_info, ] provisioner "local-exec" { @@ -181,7 +188,7 @@ resource "rancher2_bootstrap" "admin" { random_password.password, time_sleep.settle_before_rancher, helm_release.rancher, - time_sleep.settle_after_rancher, + terraform_data.wait_for_rancher, 
terraform_data.get_public_cert_info, terraform_data.get_ping, ] diff --git a/modules/rancher_bootstrap/rancher_externalTLS/runningPods.sh b/modules/rancher_bootstrap/rancher_externalTLS/runningPods.sh new file mode 100755 index 0000000..5d0d515 --- /dev/null +++ b/modules/rancher_bootstrap/rancher_externalTLS/runningPods.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +JSONPATH="'{range .items[*]} + {.metadata.name}{\"\\t\"} \ + {.metadata.namespace}{\"\\t\"} \ + {.status.phase}{\"\\n\"} \ +{end}'" + +notReady() { + PODS=$(kubectl get pods -A -o jsonpath="$JSONPATH") || return 0 # kubectl itself failed: report not ready + # shellcheck disable=SC2060,SC2140 + NOT_READY=$(echo "$PODS" | grep -v "Running" | grep -v "Succeeded" | tr -d ["\t","\n"," ","'"] || true) + if [ -n "$NOT_READY" ]; then + # Some pods aren't running + return 0 + else + # All pods are running + return 1 + fi +} + +readyWait() { + TIMEOUT=10 # 10 minutes + TIMEOUT_MINUTES=$((TIMEOUT * 60)) + INTERVAL=30 # 30 seconds + MAX=$((TIMEOUT_MINUTES / INTERVAL)) + ATTEMPTS=0 + + while notReady; do + if [ "$ATTEMPTS" -lt "$MAX" ]; then + ATTEMPTS=$((ATTEMPTS + 1)) + sleep "$INTERVAL"; + else + return 1 + fi + done + return 0 +} + +SUCCESSES=0 +SUCCESSES_NEEDED=3 # require three successes to make sure everything is settled + +while [ "$SUCCESSES" -lt "$SUCCESSES_NEEDED" ] && readyWait; do + SUCCESSES=$((SUCCESSES + 1)) + echo "succeeeded $SUCCESSES times..." + sleep 30 +done + +if [ "$SUCCESSES" -eq "$SUCCESSES_NEEDED" ]; then + echo "$SUCCESSES_NEEDED successes reached, passed..." + EXITCODE=0 +else + echo "$SUCCESSES_NEEDED successes not reached, failed..." + EXITCODE=1 +fi + +echo "nodes..." +kubectl get nodes || true + +echo "all..." +kubectl get all -A || true + +echo "pods..."
+kubectl get pods -A || true + +exit $EXITCODE diff --git a/test/scripts/runningPods.sh b/test/scripts/runningPods.sh index 3d745c5..ba6a262 100755 --- a/test/scripts/runningPods.sh +++ b/test/scripts/runningPods.sh @@ -47,7 +47,13 @@ while readyWait && [ "$SUCCESSES" -lt "$SUCCESSES_NEEDED" ]; do sleep 30 done -echo "Pods are ready..." +if [ "$SUCCESSES" -eq "$SUCCESSES_NEEDED" ]; then + echo "$SUCCESSES_NEEDED reached, passed.." + EXITCODE=0 +else + echo "$SUCCESSES_NEEDED not reached, failed.." + EXITCODE=1 +fi echo "nodes..." kubectl get nodes || true @@ -58,4 +64,4 @@ kubectl get all -A || true echo "pods..." kubectl get pods -A || true -exit 0 +exit $EXITCODE diff --git a/test/tests/downstream/downstream_test.go b/test/tests/downstream/downstream_test.go index 80c4e0b..6da769c 100644 --- a/test/tests/downstream/downstream_test.go +++ b/test/tests/downstream/downstream_test.go @@ -108,20 +108,27 @@ func TestDownstreamBasic(t *testing.T) { _, err = terraform.InitAndApplyE(t, terraformOptions) if err != nil { + t.Log("Test failed, tearing down...") + util.GetErrorLogs(t, testDir + "/kubeconfig") util.Teardown(t, testDir, terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() t.Fatalf("Error creating cluster: %s", err) } - t.Log("Test passed, tearing down...") + util.CheckReady(t, testDir + "/kubeconfig") + util.CheckRunning(t, testDir + "/kubeconfig") + if t.Failed() { + t.Log("Test failed...") + } else { + t.Log("Test passed...") + } util.Teardown(t, testDir, terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() } - func TestDownstreamProd(t *testing.T) { t.Parallel() id := util.GetId() @@ -214,13 +221,21 @@ func TestDownstreamProd(t *testing.T) { _, err = terraform.InitAndApplyE(t, terraformOptions) if err != nil { + t.Log("Test failed, tearing down...") + util.GetErrorLogs(t, testDir + "/kubeconfig") util.Teardown(t, testDir,
terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() t.Fatalf("Error creating cluster: %s", err) } - t.Log("Test passed, tearing down...") + util.CheckReady(t, testDir + "/kubeconfig") + util.CheckRunning(t, testDir + "/kubeconfig") + if t.Failed() { + t.Log("Test failed...") + } else { + t.Log("Test passed...") + } util.Teardown(t, testDir, terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() } diff --git a/test/tests/one/one_test.go b/test/tests/one/one_test.go index c97b78f..87df9f2 100644 --- a/test/tests/one/one_test.go +++ b/test/tests/one/one_test.go @@ -110,6 +110,6 @@ func TestOneBasic(t *testing.T) { t.Log("Test passed...") } util.Teardown(t, testDir, terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() } diff --git a/test/tests/prod/prod_test.go b/test/tests/prod/prod_test.go index 4079284..b319e17 100644 --- a/test/tests/prod/prod_test.go +++ b/test/tests/prod/prod_test.go @@ -94,13 +94,21 @@ func TestProdBasic(t *testing.T) { }) _, err = terraform.InitAndApplyE(t, terraformOptions) if err != nil { + t.Log("Test failed, tearing down...") + util.GetErrorLogs(t, testDir + "/kubeconfig") util.Teardown(t, testDir, terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() t.Fatalf("Error creating cluster: %s", err) } - t.Log("Test passed, tearing down...") + util.CheckReady(t, testDir + "/kubeconfig") + util.CheckRunning(t, testDir + "/kubeconfig") + if t.Failed() { + t.Log("Test failed...") + } else { + t.Log("Test passed...") + } util.Teardown(t, testDir, terraformOptions, keyPair) - os.Remove(exampleDir + ".terraform.lock.hcl") + os.Remove(exampleDir + "/.terraform.lock.hcl") sshAgent.Stop() }