# Status Publisher #90073 (page header text captured with the workflow file; not part of the workflow)
# Status Publisher workflow: each job below queries one cloud's services and
# uploads status.*.json documents to the statuspage-data S3 bucket.
name: Status Publisher

on:
  # Manual trigger. `workflow_dispatch` only accepts `inputs`; it has no
  # `branches` filter (the ref is chosen when the run is started).
  workflow_dispatch:
  schedule:
    # Run every 5 minutes
    - cron: '*/5 * * * *'

# Least privilege for GITHUB_TOKEN: the jobs only read the repository
# (checkout); everything else uses the credentials passed as secrets.
permissions:
  contents: read

jobs:
# AWS Cloud Status Publisher Job
#
# Queries the zephyr-alpha EKS cluster and writes one status.aws_*.json
# document per monitored service (cluster nodes, Elasticsearch, Kibana), then
# uploads them to the statuspage-data S3 bucket.
  aws-status-publisher:
    name: AWS Cloud Status Publisher
    runs-on: ubuntu-24.04
    env:
      # Used by the aws CLI calls below (EKS kubeconfig and S3 upload).
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
      # kubectl release to install; the same release the other publisher jobs
      # in this workflow download (this job used to pin the much older v1.23.0).
      KUBECTL_VERSION: "v1.31.4"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Install Kubernetes command line tools
        run: |
          baseUrl="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64"
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as "kubectl"; the checksum guards against a bad download.
          curl -fsSLO "${baseUrl}/kubectl"
          curl -fsSLO "${baseUrl}/kubectl.sha256"
          echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          aws --version
          kubectl version --client=true
      - name: Get EKS cluster configuration
        run: |
          aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config
          # Make the generated kubeconfig the default for all later steps.
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"
      - name: Check Kubernetes Cluster status
        # Run even if an earlier step failed.
        if: always()
        run: |
          # NOTE(review): if kubectl cannot reach the cluster, this step
          # presumably aborts before writing the file, so the previously
          # uploaded status stays in the bucket -- confirm that is intended.
          nodeList=$(kubectl get nodes -o json)
          # Count nodes that are not marked unschedulable and whose "Ready"
          # condition is anything other than "True".
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          # 4 or more not ready: majorOutage; 3: partialOutage;
          # 2: degradedPerformance; 0 or 1: operational.
          if [ "$notReadyCount" -gt "3" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "1" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          # rawData is the human-readable kubectl table, JSON-encoded by jq.
          cat <<EOF > status.aws_kubernetes_cluster.json
          {
            "status": "${status}",
            "rawData": $(kubectl get nodes | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Elasticsearch status
        if: always()
        run: |
          esData=$(kubectl -n elastic-stack get elasticsearches main -o json)
          esHealth=$(echo "$esData" | jq -r '.status.health')
          # Only "red" is reported as an outage; every other health value
          # (including "yellow", see the note below) maps to operational.
          if [ "$esHealth" == "red" ]; then
            status="majorOutage"
          # NOTE: We are intentionally running Elasticsearch with reduced
          #       replicas (i.e. yellow) for cost saving reasons.
          # elif [ "$esHealth" == "yellow" ]; then
          #   status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_elasticsearch.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n elastic-stack get elasticsearches main | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Kibana status
        if: always()
        run: |
          kbData=$(kubectl -n elastic-stack get kibanas main -o json)
          kbHealth=$(echo "$kbData" | jq -r '.status.health')
          # red: majorOutage; yellow: partialOutage; anything else: operational.
          if [ "$kbHealth" == "red" ]; then
            status="majorOutage"
          elif [ "$kbHealth" == "yellow" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_kibana.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n elastic-stack get kibanas main | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Upload status data to S3 bucket
        if: always()
        run: |
          # The exclude/include pair restricts the upload to status.*.json, so
          # the kubeconfig and the downloaded kubectl files are never copied.
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'
# Centrinix Cloud Status Publisher Job
#
# Connects to the Centrinix cluster over WireGuard and writes one
# status.cnx_*.json document per monitored service (cluster nodes, KeyDB cache,
# Actions Runner Controller and two runner scale sets), then uploads them to
# the statuspage-data S3 bucket.
  cnx-status-publisher:
    name: Centrinix Cloud Status Publisher
    runs-on: ubuntu-24.04
    env:
      # Used by the aws CLI for the S3 upload at the end of the job.
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
      # kubectl release to install.
      KUBECTL_VERSION: "v1.31.4"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Install WireGuard client
        run: |
          # Refresh the package index first so the install does not hit stale
          # mirrors; apt-get with -y is the non-interactive, script-stable form.
          sudo apt-get update
          sudo apt-get install -y wireguard
      - name: Install Kubernetes command line tools
        run: |
          baseUrl="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64"
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as "kubectl"; the checksum guards against a bad download.
          curl -fsSLO "${baseUrl}/kubectl"
          curl -fsSLO "${baseUrl}/kubectl.sha256"
          echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true
      - name: Connect to Centrinix VPN
        env:
          # Handed over through the environment instead of being expanded into
          # the script text, so quotes or `$` inside the secret cannot change
          # the command that runs.
          WG_CONF: "${{ secrets.CNX_WG_CONF }}"
        run: |
          # Create the file owner-only before any key material is written.
          sudo install -m 0600 /dev/null /etc/wireguard/wg0.conf
          printf '%s\n' "$WG_CONF" | sudo tee /etc/wireguard/wg0.conf > /dev/null
          sudo wg-quick up wg0
      - name: Set up Kubernetes cluster configuration
        env:
          KUBECONFIG_DATA: "${{ secrets.CNX_KUBECONFIG }}"
        run: |
          # Subshell umask keeps the credentials file private to this user.
          (umask 077 && printf '%s\n' "$KUBECONFIG_DATA" > config)
          # Make this kubeconfig the default for all later steps.
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"
      - name: Check Kubernetes Cluster status
        # Run even if an earlier step failed.
        if: always()
        run: |
          # NOTE(review): if kubectl cannot reach the cluster, this step
          # presumably aborts before writing the file, so the previously
          # uploaded status stays in the bucket -- confirm that is intended.
          nodeList=$(kubectl get nodes -o json)
          # Count nodes that are not marked unschedulable and whose "Ready"
          # condition is anything other than "True".
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          # More than 10 not ready: majorOutage; 6-10: partialOutage;
          # 3-5: degradedPerformance; 0-2: operational.
          if [ "$notReadyCount" -gt "10" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "5" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          # rawData is the human-readable kubectl table, JSON-encoded by jq.
          cat <<EOF > status.cnx_kubernetes_cluster.json
          {
            "status": "${status}",
            "rawData": $(kubectl get nodes | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check KeyDB Cache status
        if: always()
        run: |
          podList=$(kubectl -n keydb-cache get pods -o json)
          # Count pods whose "Ready" condition is "True".
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          # 0 ready pods: inactive; 1: majorOutage; 2: partialOutage;
          # 3 or more: operational.
          if [ "$readyCount" -lt "1" ]; then
            status="inactive"
          elif [ "$readyCount" -lt "2" ]; then
            status="majorOutage"
          elif [ "$readyCount" -lt "3" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.cnx_keydb_cache.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n keydb-cache get pods | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Actions Runner Controller status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          # Any pod in the namespace whose "Ready" condition is not "True"
          # is reported as a major outage.
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            status="majorOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.cnx_actions_runner_controller.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n arc-systems get pods | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Runner Scaling Set linux-arm64-4xlarge status
        if: always()
        run: |
          scaleSet="zrv2-linux-arm64-4xlarge-cnx"
          podList=$(kubectl -n arc-systems get pods -o json)
          # Take only the first match so podName is always a single pod name,
          # even if more than one pod carries the scale set prefix.
          podName=$(echo "$podList" | jq -r --arg prefix "$scaleSet" '
            first(
              .items[] |
              select(.metadata.name | startswith($prefix)) |
              .metadata.name
            )')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods "${podName}" -o json)
            # `[]?` tolerates a pod without conditions; podStatus is then
            # empty and the pod is reported as not ready.
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[]? |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet" -o json)"
            # `// 0` keeps the counts numeric when a field is absent.
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners // 0')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners // 0')
            rawData="$(kubectl -n arc-systems get pods "${podName}")"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet")"
          else
            # No pod with the scale set prefix exists in arc-systems.
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.cnx_runner_scale_set-linux_arm64_4xlarge.json
          {
            "status": "${status}",
            "runningRunnerCount": ${runningRunnerCount},
            "pendingRunnerCount": ${pendingRunnerCount},
            "rawData": $(echo "${rawData}" | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Runner Scaling Set linux-x64-4xlarge status
        if: always()
        run: |
          scaleSet="zrv2-linux-x64-4xlarge-cnx"
          podList=$(kubectl -n arc-systems get pods -o json)
          # Take only the first match so podName is always a single pod name,
          # even if more than one pod carries the scale set prefix.
          podName=$(echo "$podList" | jq -r --arg prefix "$scaleSet" '
            first(
              .items[] |
              select(.metadata.name | startswith($prefix)) |
              .metadata.name
            )')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods "${podName}" -o json)
            # `[]?` tolerates a pod without conditions; podStatus is then
            # empty and the pod is reported as not ready.
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[]? |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet" -o json)"
            # `// 0` keeps the counts numeric when a field is absent.
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners // 0')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners // 0')
            rawData="$(kubectl -n arc-systems get pods "${podName}")"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet")"
          else
            # No pod with the scale set prefix exists in arc-systems.
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.cnx_runner_scale_set-linux_x64_4xlarge.json
          {
            "status": "${status}",
            "runningRunnerCount": ${runningRunnerCount},
            "pendingRunnerCount": ${pendingRunnerCount},
            "rawData": $(echo "${rawData}" | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Upload status data to S3 bucket
        if: always()
        run: |
          # The exclude/include pair restricts the upload to status.*.json, so
          # the kubeconfig and the downloaded kubectl files are never copied.
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'
# Hetzner Cloud Status Publisher Job
#
# Connects to the Hetzner cluster over WireGuard and writes one
# status.hzr_*.json document per monitored service (cluster nodes, KeyDB cache,
# Actions Runner Controller and two runner scale sets), then uploads them to
# the statuspage-data S3 bucket.
  hzr-status-publisher:
    name: Hetzner Cloud Status Publisher
    runs-on: ubuntu-24.04
    env:
      # Used by the aws CLI for the S3 upload at the end of the job.
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
      # kubectl release to install.
      KUBECTL_VERSION: "v1.31.4"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Install WireGuard client
        run: |
          # Refresh the package index first so the install does not hit stale
          # mirrors; apt-get with -y is the non-interactive, script-stable form.
          sudo apt-get update
          sudo apt-get install -y wireguard
      - name: Install Kubernetes command line tools
        run: |
          baseUrl="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64"
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as "kubectl"; the checksum guards against a bad download.
          curl -fsSLO "${baseUrl}/kubectl"
          curl -fsSLO "${baseUrl}/kubectl.sha256"
          echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true
      - name: Connect to Hetzner VPN
        env:
          # Handed over through the environment instead of being expanded into
          # the script text, so quotes or `$` inside the secret cannot change
          # the command that runs.
          WG_CONF: "${{ secrets.HZR_WG_CONF }}"
        run: |
          # Create the file owner-only before any key material is written.
          sudo install -m 0600 /dev/null /etc/wireguard/wg0.conf
          printf '%s\n' "$WG_CONF" | sudo tee /etc/wireguard/wg0.conf > /dev/null
          sudo wg-quick up wg0
      - name: Set up Kubernetes cluster configuration
        env:
          KUBECONFIG_DATA: "${{ secrets.HZR_KUBECONFIG }}"
        run: |
          # Subshell umask keeps the credentials file private to this user.
          (umask 077 && printf '%s\n' "$KUBECONFIG_DATA" > config)
          # Make this kubeconfig the default for all later steps.
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"
      - name: Check Kubernetes Cluster status
        # Run even if an earlier step failed.
        if: always()
        run: |
          # NOTE(review): if kubectl cannot reach the cluster, this step
          # presumably aborts before writing the file, so the previously
          # uploaded status stays in the bucket -- confirm that is intended.
          nodeList=$(kubectl get nodes -o json)
          # Count nodes that are not marked unschedulable and whose "Ready"
          # condition is anything other than "True".
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          # More than 10 not ready: majorOutage; 6-10: partialOutage;
          # 3-5: degradedPerformance; 0-2: operational.
          if [ "$notReadyCount" -gt "10" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "5" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          # rawData is the human-readable kubectl table, JSON-encoded by jq.
          cat <<EOF > status.hzr_kubernetes_cluster.json
          {
            "status": "${status}",
            "rawData": $(kubectl get nodes | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check KeyDB Cache status
        if: always()
        run: |
          podList=$(kubectl -n keydb-cache get pods -o json)
          # Count pods whose "Ready" condition is "True".
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          # 0 ready pods: inactive; 1: majorOutage; 2: partialOutage;
          # 3 or more: operational.
          if [ "$readyCount" -lt "1" ]; then
            status="inactive"
          elif [ "$readyCount" -lt "2" ]; then
            status="majorOutage"
          elif [ "$readyCount" -lt "3" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_keydb_cache.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n keydb-cache get pods | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Actions Runner Controller status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          # Any pod in the namespace whose "Ready" condition is not "True"
          # is reported as a major outage.
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            status="majorOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_actions_runner_controller.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n arc-systems get pods | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Runner Scaling Set linux-arm64-4xlarge status
        if: always()
        run: |
          scaleSet="zrv2-linux-arm64-4xlarge-hzr"
          podList=$(kubectl -n arc-systems get pods -o json)
          # Take only the first match so podName is always a single pod name,
          # even if more than one pod carries the scale set prefix.
          podName=$(echo "$podList" | jq -r --arg prefix "$scaleSet" '
            first(
              .items[] |
              select(.metadata.name | startswith($prefix)) |
              .metadata.name
            )')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods "${podName}" -o json)
            # `[]?` tolerates a pod without conditions; podStatus is then
            # empty and the pod is reported as not ready.
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[]? |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet" -o json)"
            # `// 0` keeps the counts numeric when a field is absent.
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners // 0')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners // 0')
            rawData="$(kubectl -n arc-systems get pods "${podName}")"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet")"
          else
            # No pod with the scale set prefix exists in arc-systems.
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.hzr_runner_scale_set-linux_arm64_4xlarge.json
          {
            "status": "${status}",
            "runningRunnerCount": ${runningRunnerCount},
            "pendingRunnerCount": ${pendingRunnerCount},
            "rawData": $(echo "${rawData}" | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Check Runner Scaling Set linux-x64-4xlarge status
        if: always()
        run: |
          scaleSet="zrv2-linux-x64-4xlarge-hzr"
          podList=$(kubectl -n arc-systems get pods -o json)
          # Take only the first match so podName is always a single pod name,
          # even if more than one pod carries the scale set prefix.
          podName=$(echo "$podList" | jq -r --arg prefix "$scaleSet" '
            first(
              .items[] |
              select(.metadata.name | startswith($prefix)) |
              .metadata.name
            )')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods "${podName}" -o json)
            # `[]?` tolerates a pod without conditions; podStatus is then
            # empty and the pod is reported as not ready.
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[]? |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet" -o json)"
            # `// 0` keeps the counts numeric when a field is absent.
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners // 0')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners // 0')
            rawData="$(kubectl -n arc-systems get pods "${podName}")"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets "$scaleSet")"
          else
            # No pod with the scale set prefix exists in arc-systems.
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.hzr_runner_scale_set-linux_x64_4xlarge.json
          {
            "status": "${status}",
            "runningRunnerCount": ${runningRunnerCount},
            "pendingRunnerCount": ${pendingRunnerCount},
            "rawData": $(echo "${rawData}" | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF
      - name: Upload status data to S3 bucket
        if: always()
        run: |
          # The exclude/include pair restricts the upload to status.*.json, so
          # the kubeconfig and the downloaded kubectl files are never copied.
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'