Skip to content

access: widen rule column and rebalance weights #116

access: widen rule column and rebalance weights

access: widen rule column and rebalance weights #116

Workflow file for this run

name: CI
# Runs unit tests on all platforms and integration tests against a real EOS
# cluster spun up via the official EOS Helm chart on a local kind
# (Kubernetes-in-Docker) cluster.
#
# Triggers:
# - Every push to main and every pull request.
# - Manual dispatch (workflow_dispatch), with an optional image-tag override.
#
# Prerequisites (no secrets required for the default image):
# The EOS image is pulled from the public CERN registry:
# gitlab-registry.cern.ch/dss/eos/eos-ci
# (the "eos-all" variant lives on a private registry and is not reachable
# from GitHub Actions runners).
# The EOS Helm chart is pulled from:
# oci://registry.cern.ch/eos/charts/server
on:
  # Pushes are only built for the main branch.
  push:
    branches:
      - main
  # Pull requests trigger the workflow without any branch or path filter.
  pull_request:
  # Manual runs may override the EOS image tag used by the integration job.
  workflow_dispatch:
    inputs:
      eos_image_tag:
        description: 'EOS image tag including OS suffix (e.g. 5.4.1.el9)'
        required: false
        default: '5.4.1.el9'
# The workflow token is limited to read-only access to repository contents.
permissions:
  contents: read
# Runs are grouped per workflow and per pull request number (falling back to
# the git ref when there is no pull request). Starting a new run in a group
# cancels the run of that group that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
# ---------------------------------------------------------------------------
# Unit tests + build (cross-platform)
# ---------------------------------------------------------------------------
test:
  # One job instance per operating system listed in the matrix.
  runs-on: ${{ matrix.os }}
  strategy:
    matrix:
      os:
        - ubuntu-latest
        - macos-latest
        - windows-latest
  steps:
    - name: Check out repository
      uses: actions/checkout@v6

    - name: Set up Go
      uses: actions/setup-go@v6
      with:
        # The Go version is read from go.mod instead of being hard-coded here.
        go-version-file: go.mod
        cache: true

    # The formatting check runs on the Ubuntu leg of the matrix only.
    - name: Verify formatting
      if: matrix.os == 'ubuntu-latest'
      run: |
        # gofmt -l prints the path of every file whose formatting differs.
        needs_fmt="$(gofmt -l .)"
        if [ -z "$needs_fmt" ]; then
          exit 0
        fi
        echo "These files need gofmt:"
        echo "$needs_fmt"
        exit 1

    - name: Run tests
      env:
        # NOTE(review): presumably makes the suite skip tests that need a
        # live EOS cluster; confirm against the test code.
        EOS_TEST_SKIP: '1'
      run: go test ./...

    - name: Build binary
      run: go build -v -o bin/ ./...
# ---------------------------------------------------------------------------
# Integration tests against a real EOS cluster
# ---------------------------------------------------------------------------
integration:
name: EOS Integration Tests
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- name: Check out repository
  uses: actions/checkout@v6

- name: Set up Go
  uses: actions/setup-go@v6
  with:
    # The Go version is read from go.mod instead of being hard-coded here.
    go-version-file: go.mod
    cache: true
# -----------------------------------------------------------------------
# Disk space – the EOS image is ~4 GB and kind copies it into the
# node's containerd store, so we need headroom on the runner.
# -----------------------------------------------------------------------
- name: Free disk space
  run: |
    echo "=== Disk usage before cleanup ==="
    df -h /

    # Large pre-installed toolchains that this job does not need.
    unused_toolchains=(
      /usr/local/lib/android
      /usr/share/dotnet
      /opt/ghc
      /usr/local/share/boost
      /usr/share/swift
      /opt/hostedtoolcache/CodeQL
    )
    # Best effort: a failed removal must not fail the job.
    sudo rm -rf "${unused_toolchains[@]}" || true

    # Prune Docker build cache & dangling images (also best effort).
    docker system prune -af --volumes || true

    echo "=== Disk usage after cleanup ==="
    df -h /
# -----------------------------------------------------------------------
# Docker image cache
#
# The EOS "eos-ci" image is large. We save it as a tar after the first
# pull and restore it on subsequent runs to avoid hitting the CERN
# registry every time.
#
# The public image is gitlab-registry.cern.ch/dss/eos/eos-ci.
# Tags follow the CERN convention: <version>.el9 (e.g. 5.4.1.el9).
# The "eos-all" variant is hosted on a private CERN registry and is not
# accessible from GitHub Actions runners.
# -----------------------------------------------------------------------
# Restore a previously saved image tarball, keyed by the EOS image tag.
- name: Cache EOS Docker image
  id: cache-eos-image
  uses: actions/cache@v5
  with:
    path: /tmp/eos-image.tar
    key: eos-docker-image-${{ inputs.eos_image_tag || '5.4.1.el9' }}

# Cache miss: pull the image and write the tarball to the cached path.
# The tarball is left in place so that actions/cache can store it.
- name: Pull and save EOS Docker image
  if: steps.cache-eos-image.outputs.cache-hit != 'true'
  env:
    EOS_IMAGE_REPO: gitlab-registry.cern.ch/dss/eos/eos-ci
    EOS_IMAGE_TAG: ${{ inputs.eos_image_tag || '5.4.1.el9' }}
  run: |
    # The tag can come from a workflow_dispatch input, so the reference is
    # quoted to keep whitespace or glob characters from being split into
    # additional docker arguments.
    image="${EOS_IMAGE_REPO}:${EOS_IMAGE_TAG}"
    docker pull "${image}"
    docker save "${image}" -o /tmp/eos-image.tar

# Cache hit: import the tarball into the local Docker daemon.
- name: Load EOS Docker image from cache
  if: steps.cache-eos-image.outputs.cache-hit == 'true'
  run: |
    docker load -i /tmp/eos-image.tar
    # Remove the tar to free space for kind.
    rm -f /tmp/eos-image.tar
# -----------------------------------------------------------------------
# Kubernetes cluster
# -----------------------------------------------------------------------
- name: Create kind cluster
  uses: helm/kind-action@v1
  with:
    # Pinned explicitly (this is also the action's default) because the
    # "kind load" call in the next step has to name the same cluster.
    cluster_name: chart-testing

- name: Load EOS image into kind
  env:
    EOS_IMAGE_REPO: gitlab-registry.cern.ch/dss/eos/eos-ci
    EOS_IMAGE_TAG: ${{ inputs.eos_image_tag || '5.4.1.el9' }}
  run: |
    # Quoted because the tag can come from a workflow_dispatch input.
    image="${EOS_IMAGE_REPO}:${EOS_IMAGE_TAG}"

    # The cluster is named "chart-testing", not kind's default "kind", so
    # --name is required to avoid the "no nodes found for cluster 'kind'"
    # error.
    kind load docker-image \
      --name chart-testing \
      "${image}"

    # Remove the image from Docker now that it's loaded into
    # kind's containerd; avoids storing two copies.
    docker rmi "${image}" || true
# -----------------------------------------------------------------------
# Helm chart cache
# -----------------------------------------------------------------------
# Persist Helm's cache directory between runs. The key is derived from the
# EOS image tag only.
- name: Cache Helm chart
  uses: actions/cache@v5
  with:
    key: helm-eos-chart-${{ inputs.eos_image_tag || '5.4.1.el9' }}
    path: ~/.cache/helm
# -----------------------------------------------------------------------
# EOS cluster
# -----------------------------------------------------------------------
- name: Deploy EOS via Helm chart
  env:
    EOS_IMAGE_REPO: gitlab-registry.cern.ch/dss/eos/eos-ci
    EOS_IMAGE_TAG: ${{ inputs.eos_image_tag || '5.4.1.el9' }}
  run: |
    # Minimal EOS cluster: 1 QDB (single-node raft), 1 MGM (+MQ sidecar)
    # and 2 FSTs, running the publicly accessible eos-ci image that was
    # pre-loaded into the kind cluster.
    helm_args=(
      --set "global.repository=${EOS_IMAGE_REPO}"
      --set "global.tag=${EOS_IMAGE_TAG}"
      # IfNotPresent makes k8s use the locally pre-loaded image instead of
      # attempting to re-pull from the CERN registry.
      --set "global.pullPolicy=IfNotPresent"
      --set "fst.replicaCount=2"
      --set "qdb.replicaCount=1"
      --set "global.securityContext.privileged=true"
      --set "global.securityContext.allowPrivilegeEscalation=true"
      # Block until the release is ready, for at most 20 minutes.
      --timeout 20m
      --wait
    )
    helm install eos oci://registry.cern.ch/eos/charts/server "${helm_args[@]}"
# Polls "eos ns stat" inside the MGM pod until its output contains "booted".
# Gives up (and fails the job) after 60 attempts spaced 10 seconds apart.
- name: Wait for EOS namespace to boot
  run: |
    echo "Waiting for EOS MGM namespace to reach 'booted' state..."
    # Address the first container declared in the pod spec by name.
    MGM_CONTAINER=$(kubectl get pod eos-mgm-0 \
      -o jsonpath='{.spec.containers[0].name}')
    echo "Using MGM container: ${MGM_CONTAINER}"

    # Reset bash's built-in timer so the success message reports the time
    # that really elapsed instead of an estimate derived from the attempt
    # number (which claimed 10s even for an immediate success).
    SECONDS=0
    for i in $(seq 1 60); do
      if kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- \
          eos ns stat 2>/dev/null | grep -q "booted"; then
        echo "EOS MGM booted after ${SECONDS}s"
        break
      fi
      if [ "$i" -eq 60 ]; then
        echo "ERROR: EOS MGM did not boot within 600s"
        kubectl logs eos-mgm-0 --all-containers=true --tail=50
        exit 1
      fi
      echo " attempt $i/60 — not yet booted, retrying in 10s..."
      sleep 10
    done

    echo "=== eos fs ls ==="
    kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- eos fs ls
# -----------------------------------------------------------------------
# SSH access into the MGM pod
#
# The eos-tui client issues every EOS command via SSH (runCommand uses
# `ssh -o BatchMode=yes <target> <cmd>`). We:
# 1. Generate an ephemeral ed25519 key pair on the runner.
# 2. Install openssh-server and socat inside the MGM container.
# 3. Drop the public key into root's authorized_keys and start sshd
# on port 2222 (avoids any conflict with the EOS MGM service).
# 4. Start a socat tunnel inside the container:
# localhost:7777 → eos-qdb-0.eos-qdb.default.svc.cluster.local:7777
# This makes `redis-cli -p 7777 raft-info` work from the MGM pod,
# which is needed by TestIntegrationMGMs and
# TestIntegrationDiscoverMGMMaster.
# 5. kubectl port-forward the container's port 2222 to the runner.
# 6. Write an SSH client config block mapping the alias "eos-mgm" to
# localhost:2222 with the ephemeral key — no code changes needed.
# -----------------------------------------------------------------------
# Makes `ssh eos-mgm <cmd>` work from the runner:
#   - generates an ephemeral key pair,
#   - installs and starts sshd (port 2222) plus a socat QDB tunnel inside
#     the MGM container,
#   - port-forwards 2222 to the runner and writes an SSH client config,
#   - verifies that eos and redis-cli are reachable through SSH.
- name: Set up SSH access to EOS MGM
  timeout-minutes: 5
  run: |
    # Generate an ephemeral SSH key pair (no passphrase).
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    ssh-keygen -t ed25519 -N '' -f ~/.ssh/eos_test_key -C eos-tui-ci
    PUBLIC_KEY=$(cat ~/.ssh/eos_test_key.pub)

    MGM_CONTAINER=$(kubectl get pod eos-mgm-0 \
      -o jsonpath='{.spec.containers[0].name}')
    echo "Using MGM container: ${MGM_CONTAINER}"

    # Install openssh-server + socat, then configure sshd and the QDB
    # proxy tunnel inside the MGM container.
    #
    # dnf can hang if the container's repos (e.g. CERN mirrors) are
    # unreachable from inside the kind cluster, so we:
    # - wrap with `timeout 120` to cap the total time,
    # - set per-connection timeout to 30s via --setopt,
    # - disable weak-dep pulls and GPG checks for speed,
    # - disable any non-base repos that might be unreachable.
    #
    # The script is passed as one double-quoted string: ${PUBLIC_KEY} is
    # expanded on the runner, while every \$ and \" is escaped so that it
    # is evaluated inside the container.
    kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- bash -c "
      set -eo pipefail
      echo '--- Installing openssh-server and socat ---'
      # Disable any non-standard repos that may be unreachable from kind.
      for f in /etc/yum.repos.d/*.repo; do
        case \"\$(basename \"\$f\")\" in
          alma*|rocky*|centos*|baseos*|appstream*|extras*|crb*) ;;
          *) echo \" disabling repo file: \$f\"
             sed -i 's/^enabled=1/enabled=0/' \"\$f\" 2>/dev/null || true ;;
        esac
      done
      # Retry dnf up to 3 times — mirrors inside kind can be flaky.
      for attempt in 1 2 3; do
        echo \" dnf install attempt \$attempt/3\"
        if timeout 120 dnf install -y \
            --nogpgcheck \
            --setopt=timeout=30 \
            --setopt=install_weak_deps=False \
            openssh-server socat; then
          break
        fi
        if [ \"\$attempt\" -eq 3 ]; then
          echo 'ERROR: dnf install failed after 3 attempts'
          exit 1
        fi
        echo ' retrying in 5s...'
        sleep 5
      done
      # Sanity-check that sshd was actually installed.
      if [ ! -x /usr/sbin/sshd ]; then
        echo 'ERROR: /usr/sbin/sshd not found after dnf install'
        exit 1
      fi
      echo '--- Configuring authorized_keys ---'
      mkdir -p /root/.ssh
      chmod 700 /root/.ssh
      echo '${PUBLIC_KEY}' > /root/.ssh/authorized_keys
      chmod 600 /root/.ssh/authorized_keys
      echo '--- Generating SSH host keys ---'
      ssh-keygen -A
      echo '--- Removing nologin gate (pam_nologin) ---'
      rm -f /run/nologin /etc/nologin /var/run/nologin
      echo '--- Starting sshd on port 2222 ---'
      /usr/sbin/sshd -p 2222 \
        -o UsePAM=no \
        -o PermitRootLogin=yes \
        -o PubkeyAuthentication=yes \
        -o PasswordAuthentication=no \
        -o AuthorizedKeysFile=/root/.ssh/authorized_keys \
        -o PrintLastLog=no \
        -o PrintMotd=no
      echo 'sshd running'
      echo '--- Starting socat QDB tunnel (localhost:7777 -> eos-qdb) ---'
      nohup socat TCP-LISTEN:7777,fork,reuseaddr \
        TCP:eos-qdb-0.eos-qdb.default.svc.cluster.local:7777 \
        </dev/null >/dev/null 2>&1 &
      echo 'socat tunnel started'
    "

    # Port-forward the MGM's sshd to the runner (kept running in the
    # background for the remaining steps of the job).
    kubectl port-forward pod/eos-mgm-0 2222:2222 &

    # Write an SSH client config block so that `ssh eos-mgm <cmd>` works.
    cat >> ~/.ssh/config << 'EOF'
    Host eos-mgm
      HostName 127.0.0.1
      Port 2222
      User root
      IdentityFile ~/.ssh/eos_test_key
      StrictHostKeyChecking no
      UserKnownHostsFile /dev/null
      ServerAliveInterval 30
      ConnectTimeout 10
    EOF

    echo "--- Verifying SSH connectivity ---"
    # The port-forward needs a moment before it accepts connections. Retry
    # the first SSH command a bounded number of times rather than relying
    # on a fixed sleep that may be too short on a slow runner.
    for attempt in $(seq 1 10); do
      if ssh eos-mgm 'eos version'; then
        break
      fi
      if [ "$attempt" -eq 10 ]; then
        echo "ERROR: SSH to eos-mgm still failing after 10 attempts"
        exit 1
      fi
      echo " attempt $attempt/10 failed, retrying in 3s..."
      sleep 3
    done

    # The default `bash -e` shell does not enable pipefail, so piping ssh
    # straight into head would hide a failing command. Capture the output
    # first: a non-zero exit from ssh then aborts the step.
    echo "--- Verifying EOS node list reachable via SSH ---"
    node_ls="$(ssh eos-mgm 'eos -j node ls')"
    printf '%s\n' "${node_ls}" | head -5

    echo "--- Verifying redis-cli via socat tunnel ---"
    raft_info="$(ssh eos-mgm 'redis-cli -p 7777 raft-info')"
    printf '%s\n' "${raft_info}" | head -5
# -----------------------------------------------------------------------
# Integration tests
# -----------------------------------------------------------------------
# Runs the Go tests whose names match "TestIntegration" in the eos packages.
- name: Run integration tests
  env:
    # SSH alias the tests connect to.
    # NOTE(review): presumably consumed by the test code; confirm there.
    EOS_TEST_SSH_TARGET: eos-mgm
    EOS_TEST_INTEGRATION: '1'
  run: go test -v -timeout 10m -run TestIntegration ./eos/...
# -----------------------------------------------------------------------
# Debug dump on failure
# -----------------------------------------------------------------------
# Diagnostics only: runs when an earlier step failed. Every command is best
# effort (`|| true`) so that the dump itself can never fail.
- name: Dump EOS state on failure
  if: failure()
  run: |
    echo "=== kubectl get pods ==="
    kubectl get pods || true

    # Fall back to the literal name "eos-mgm" if the pod cannot be queried.
    MGM_CONTAINER=$(kubectl get pod eos-mgm-0 \
      -o jsonpath='{.spec.containers[0].name}' 2>/dev/null || echo "eos-mgm")

    # Runs the given command inside the MGM container.
    mgm_exec() {
      kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- "$@"
    }

    echo "=== EOS version ==="
    mgm_exec eos version || true
    echo "=== EOS ns stat ==="
    mgm_exec eos ns stat || true
    echo "=== EOS fs ls ==="
    mgm_exec eos fs ls || true
    echo "=== EOS node ls ==="
    mgm_exec eos -j node ls 2>/dev/null \
      | head -50 || true

    echo "=== eos-mgm-0 pod logs (last 100 lines) ==="
    kubectl logs eos-mgm-0 --all-containers=true --tail=100 || true
    echo "=== eos-qdb-0 pod logs (last 50 lines) ==="
    kubectl logs eos-qdb-0 --tail=50 || true