# Cloud Monitor
#
# Workflow file for GitHub Actions run "Cloud Monitor #32122".

name: Cloud Monitor

# Triggers: manual dispatch plus a fixed 30-minute schedule.
on:
  # Manual runs from the Actions UI / API. `workflow_dispatch` accepts only
  # `inputs`; it has no `branches` filter, so none is declared.
  workflow_dispatch:
  schedule:
    # Run every 30 minutes
    - cron: '*/30 * * * *'

# The jobs only need to read the repository (checkout), so the automatic
# GITHUB_TOKEN is restricted to read-only contents access.
permissions:
  contents: read

jobs:
# AWS Cloud Monitor Job
  # Checks the `zephyr-alpha` EKS cluster: every nodegroup must be ACTIVE and
  # every schedulable Kubernetes node must report Ready. Each failed check is
  # reported through `.github/log.sh` (presumably to the Discord webhook
  # exported below - confirm against the script) and fails the job.
  aws-monitor:
    name: AWS Cloud Monitor
    runs-on: ubuntu-24.04
    defaults:
      run:
        # Naming bash explicitly makes GitHub run each script with
        # `-eo pipefail`, so a failing `aws`/`kubectl` on the left-hand side of
        # a pipe fails the step instead of being masked by `jq` succeeding on
        # empty input (which would make a broken check look healthy).
        shell: bash
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Install Kubernetes command line tools
        run: |
          # -f: abort on an HTTP error instead of saving the error page and
          # installing it as the kubectl binary.
          # NOTE(review): the other monitor jobs in this workflow pin v1.31.4;
          # confirm v1.23.0 is still within the supported version skew for
          # zephyr-alpha.
          curl -fsSLO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          aws --version
          kubectl version --client=true
      - name: Check EKS cluster nodegroups
        # Run even when an earlier step failed so every check gets a chance to
        # report, but stop promptly when the run is cancelled.
        if: ${{ !cancelled() }}
        run: |
          nodeGroupNames=$(aws eks list-nodegroups --cluster-name zephyr-alpha | jq -r '.nodegroups[]')
          isFailed="no"
          # Word-splitting of the name list is intentional: one name per word.
          for nodeGroupName in $nodeGroupNames; do
            nodeGroup=$(aws eks describe-nodegroup --cluster-name zephyr-alpha --nodegroup-name "$nodeGroupName")
            nodeGroupStatus=$(echo "$nodeGroup" | jq -r '.nodegroup.status')
            if [ "$nodeGroupStatus" != 'ACTIVE' ]; then
              .github/log.sh ERROR "aws: zephyr-alpha: Found nodegroup '${nodeGroupName}' with inactive status."
              # Keep looping so every inactive nodegroup is reported.
              isFailed="yes"
            fi
          done
          if [ "$isFailed" = "yes" ]; then
            # Exit statuses are 8-bit; values above 255 wrap modulo 256.
            exit 1
          fi
      - name: Get EKS cluster configuration
        if: ${{ !cancelled() }}
        run: |
          aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config
          # Make the generated kubeconfig visible to kubectl in later steps.
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"
      - name: Check Kubernetes nodes
        if: ${{ !cancelled() }}
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          # Count nodes whose Ready condition is not "True", ignoring nodes
          # that have `.spec.unschedulable` set (e.g. cordoned nodes).
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "aws: zephyr-alpha: Found ${notReadyCount} node(s) with not-ready status."
            exit 1
          fi
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "aws: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"
# Centrinix Cloud Monitor Job
  # Connects to the Centrinix network over WireGuard and checks the `kube1`
  # cluster: node readiness, KeyDB cache pods, Actions Runner Controller pods,
  # and failed ephemeral runners (which are deleted). Findings are reported
  # through `.github/log.sh` (presumably to the Discord webhook exported
  # below - confirm against the script).
  cnx-monitor:
    name: Centrinix Cloud Monitor
    runs-on: ubuntu-24.04
    defaults:
      run:
        # Naming bash explicitly makes GitHub run each script with
        # `-eo pipefail`, so a failure anywhere in a pipeline fails the step.
        shell: bash
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Install WireGuard client
        run: |
          # Refresh the package index first so the install does not hit stale
          # mirror entries; -y keeps the install non-interactive.
          sudo apt-get update
          sudo apt-get install -y wireguard
      - name: Install Kubernetes command line tools
        run: |
          # -f: abort on an HTTP error instead of saving the error page and
          # installing it as the kubectl binary.
          curl -fsSLO https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true
      - name: Connect to Centrinix VPN
        env:
          # Hand the secret over as an environment variable rather than
          # interpolating it into the script text, where quotes, `$` or
          # backticks in the value would be interpreted by the shell.
          CNX_WG_CONF: "${{ secrets.CNX_WG_CONF }}"
        run: |
          # Create the file with mode 0600 before writing the private key.
          sudo install -m 0600 /dev/null /etc/wireguard/wg0.conf
          printf '%s\n' "$CNX_WG_CONF" | sudo tee /etc/wireguard/wg0.conf > /dev/null
          sudo wg-quick up wg0
      - name: Set up Kubernetes cluster configuration
        env:
          # Deliberately not named KUBECONFIG: that variable must hold a path.
          CNX_KUBECONFIG: "${{ secrets.CNX_KUBECONFIG }}"
        run: |
          # Write the credentials readable by the current user only.
          (umask 077 && printf '%s\n' "$CNX_KUBECONFIG" > config)
          # Make the kubeconfig visible to kubectl in later steps.
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"
      - name: Check Kubernetes nodes
        # Run even when an earlier step failed so every check gets a chance to
        # report, but stop promptly when the run is cancelled.
        if: ${{ !cancelled() }}
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          # Count nodes whose Ready condition is not "True", ignoring nodes
          # that have `.spec.unschedulable` set (e.g. cordoned nodes).
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "cnx: kube1: Found ${notReadyCount} node(s) with not-ready status."
            # Exit statuses are 8-bit; values above 255 wrap modulo 256.
            exit 1
          fi
      - name: Check KeyDB cache pods
        if: ${{ !cancelled() }}
        run: |
          kubectl -n keydb-cache get pods
          podList=$(kubectl -n keydb-cache get pods -o json)
          # Count pods in the namespace whose Ready condition is "True".
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          if [ "$readyCount" -lt "3" ]; then
            .github/log.sh ERROR "cnx: kube1: Found ${readyCount} KeyDB cache pod with ready status (expected 3)."
            exit 1
          fi
      - name: Check Actions Runner Controller pods
        if: ${{ !cancelled() }}
        run: |
          kubectl -n arc-systems get pods
          podList=$(kubectl -n arc-systems get pods -o json)
          # Count pods in the namespace whose Ready condition is not "True".
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "cnx: kube1: Found ${notReadyCount} ARC pods with not-ready status."
            exit 1
          fi
      - name: Clean up failed ephemeral runners
        if: ${{ !cancelled() }}
        run: |
          kubectl -n arc-runners get ephemeralrunners
          runnerList=$(kubectl -n arc-runners get ephemeralrunners -o json)
          failedCount=$(echo "$runnerList" | jq -r '
            [
              .items[] |
              select(.status.phase == "Failed")
            ] | length')
          if [ "$failedCount" -gt "0" ]; then
            .github/log.sh WARN "cnx: kube1: Found ${failedCount} ephemeral runners with failed status."
            # Delete by `.metadata.name`, the name kubectl addresses resources
            # by. `.status.runnerName` can be absent on a failed runner, which
            # jq would print as the literal string "null".
            echo "$runnerList" |
              jq -r '.items[] | select(.status.phase == "Failed") | .metadata.name' |
              xargs -r kubectl -n arc-runners delete ephemeralrunner
            .github/log.sh INFO "cnx: kube1: Cleaned up ${failedCount} ephemeral runners with failed status."
          fi
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "cnx: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"
# Hetzner Cloud Monitor Job
  # Connects to the Hetzner network over WireGuard and checks the `ci-main`
  # cluster: node readiness, KeyDB cache pods, Actions Runner Controller pods,
  # and failed ephemeral runners (which are deleted). Findings are reported
  # through `.github/log.sh` (presumably to the Discord webhook exported
  # below - confirm against the script).
  hzr-monitor:
    name: Hetzner Cloud Monitor
    runs-on: ubuntu-24.04
    defaults:
      run:
        # Naming bash explicitly makes GitHub run each script with
        # `-eo pipefail`, so a failure anywhere in a pipeline fails the step.
        shell: bash
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Install WireGuard client
        run: |
          # Refresh the package index first so the install does not hit stale
          # mirror entries; -y keeps the install non-interactive.
          sudo apt-get update
          sudo apt-get install -y wireguard
      - name: Install Kubernetes command line tools
        run: |
          # -f: abort on an HTTP error instead of saving the error page and
          # installing it as the kubectl binary.
          curl -fsSLO https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true
      - name: Connect to Hetzner VPN
        env:
          # Hand the secret over as an environment variable rather than
          # interpolating it into the script text, where quotes, `$` or
          # backticks in the value would be interpreted by the shell.
          HZR_WG_CONF: "${{ secrets.HZR_WG_CONF }}"
        run: |
          # Create the file with mode 0600 before writing the private key.
          sudo install -m 0600 /dev/null /etc/wireguard/wg0.conf
          printf '%s\n' "$HZR_WG_CONF" | sudo tee /etc/wireguard/wg0.conf > /dev/null
          sudo wg-quick up wg0
      - name: Set up Kubernetes cluster configuration
        env:
          # Deliberately not named KUBECONFIG: that variable must hold a path.
          HZR_KUBECONFIG: "${{ secrets.HZR_KUBECONFIG }}"
        run: |
          # Write the credentials readable by the current user only.
          (umask 077 && printf '%s\n' "$HZR_KUBECONFIG" > config)
          # Make the kubeconfig visible to kubectl in later steps.
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"
      - name: Check Kubernetes nodes
        # Run even when an earlier step failed so every check gets a chance to
        # report, but stop promptly when the run is cancelled.
        if: ${{ !cancelled() }}
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          # Count nodes whose Ready condition is not "True", ignoring nodes
          # that have `.spec.unschedulable` set (e.g. cordoned nodes).
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} node(s) with not-ready status."
            # Exit statuses are 8-bit; values above 255 wrap modulo 256.
            exit 1
          fi
      - name: Check KeyDB cache pods
        if: ${{ !cancelled() }}
        run: |
          kubectl -n keydb-cache get pods
          podList=$(kubectl -n keydb-cache get pods -o json)
          # Count pods in the namespace whose Ready condition is "True".
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          if [ "$readyCount" -lt "3" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${readyCount} KeyDB cache pod with ready status (expected 3)."
            exit 1
          fi
      - name: Check Actions Runner Controller pods
        if: ${{ !cancelled() }}
        run: |
          kubectl -n arc-systems get pods
          podList=$(kubectl -n arc-systems get pods -o json)
          # Count pods in the namespace whose Ready condition is not "True".
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} ARC pods with not-ready status."
            exit 1
          fi
      - name: Clean up failed ephemeral runners
        if: ${{ !cancelled() }}
        run: |
          kubectl -n arc-runners get ephemeralrunners
          runnerList=$(kubectl -n arc-runners get ephemeralrunners -o json)
          failedCount=$(echo "$runnerList" | jq -r '
            [
              .items[] |
              select(.status.phase == "Failed")
            ] | length')
          if [ "$failedCount" -gt "0" ]; then
            .github/log.sh WARN "hzr: ci-main: Found ${failedCount} ephemeral runners with failed status."
            # Delete by `.metadata.name`, the name kubectl addresses resources
            # by. `.status.runnerName` can be absent on a failed runner, which
            # jq would print as the literal string "null".
            echo "$runnerList" |
              jq -r '.items[] | select(.status.phase == "Failed") | .metadata.name' |
              xargs -r kubectl -n arc-runners delete ephemeralrunner
            .github/log.sh INFO "hzr: ci-main: Cleaned up ${failedCount} ephemeral runners with failed status."
          fi
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "hzr: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"