-
Notifications
You must be signed in to change notification settings - Fork 0
397 lines (346 loc) · 15.4 KB
/
ci.yml
File metadata and controls
397 lines (346 loc) · 15.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
name: CI
# Runs unit tests on all platforms and integration tests against a real EOS
# cluster spun up via the official EOS Helm chart on a local kind
# (Kubernetes-in-Docker) cluster.
#
# Triggers:
#   - Every push to main and every pull request.
#   - Manual dispatch (workflow_dispatch), with an optional image-tag override.
#
# Prerequisites (no secrets required for the default image):
#   The EOS image is pulled from the public CERN registry:
#     gitlab-registry.cern.ch/dss/eos/eos-ci
#   (the "eos-all" variant lives on a private registry — see the integration
#   job's image-cache comments).
#   The EOS Helm chart is pulled from:
#     oci://registry.cern.ch/eos/charts/server
on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:
    inputs:
      eos_image_tag:
        # The same default is repeated as a literal `|| '5.4.1.el9'` fallback
        # below, because `inputs` is empty on push/pull_request events.
        description: 'EOS image tag including OS suffix (e.g. 5.4.1.el9)'
        required: false
        default: '5.4.1.el9'
# Least-privilege token: nothing in this workflow writes to the repository.
permissions:
  contents: read
# Cancel superseded in-flight runs for the same PR (or branch, for pushes)
# to save runner minutes.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  # ---------------------------------------------------------------------------
  # Unit tests + build (cross-platform)
  # ---------------------------------------------------------------------------
  test:
    runs-on: ${{ matrix.os }}
    strategy:
      # Let every OS finish even when one fails; with the default
      # fail-fast: true a single platform-specific failure cancels the other
      # two runners and hides whether the problem is cross-platform.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - name: Check out repository
        uses: actions/checkout@v6
      - name: Set up Go
        uses: actions/setup-go@v6
        with:
          go-version-file: go.mod
          cache: true
      # gofmt output is OS-independent, so running it once (on Linux) suffices.
      - name: Verify formatting
        if: matrix.os == 'ubuntu-latest'
        run: |
          unformatted="$(gofmt -l .)"
          if [ -n "$unformatted" ]; then
            echo "These files need gofmt:"
            echo "$unformatted"
            exit 1
          fi
      - name: Run tests
        env:
          # Skip tests that require a live EOS cluster; those run in the
          # integration job instead.
          EOS_TEST_SKIP: '1'
        run: go test ./...
      - name: Build binary
        run: go build -v -o bin/ ./...
  # ---------------------------------------------------------------------------
  # Integration tests against a real EOS cluster
  # ---------------------------------------------------------------------------
  integration:
    name: EOS Integration Tests
    runs-on: ubuntu-latest
    # Hard cap for the whole job; helm install alone may take up to 20m.
    timeout-minutes: 60
    steps:
      - name: Check out repository
        uses: actions/checkout@v6
      - name: Set up Go
        uses: actions/setup-go@v6
        with:
          go-version-file: go.mod
          cache: true
      # -----------------------------------------------------------------------
      # Disk space – the EOS image is ~4 GB and kind copies it into the
      # node's containerd store, so we need headroom on the runner.
      # -----------------------------------------------------------------------
      - name: Free disk space
        run: |
          echo "=== Disk usage before cleanup ==="
          df -h /
          # Remove large pre-installed toolchains that we don't need.
          # `|| true` keeps the step green if a path is absent on newer images.
          sudo rm -rf /usr/local/lib/android /usr/share/dotnet \
            /opt/ghc /usr/local/share/boost /usr/share/swift \
            /opt/hostedtoolcache/CodeQL || true
          # Prune Docker build cache & dangling images.
          docker system prune -af --volumes || true
          echo "=== Disk usage after cleanup ==="
          df -h /
      # -----------------------------------------------------------------------
      # Docker image cache
      #
      # The EOS "eos-ci" image is large. We save it as a tar after the first
      # pull and restore it on subsequent runs to avoid hitting the CERN
      # registry every time.
      #
      # The public image is gitlab-registry.cern.ch/dss/eos/eos-ci.
      # Tags follow the CERN convention: <version>.el9 (e.g. 5.4.1.el9).
      # The "eos-all" variant is hosted on a private CERN registry and is not
      # accessible from GitHub Actions runners.
      # -----------------------------------------------------------------------
      - name: Cache EOS Docker image
        id: cache-eos-image
        uses: actions/cache@v5
        with:
          path: /tmp/eos-image.tar
          # Keyed on the tag so a tag override via workflow_dispatch gets its
          # own cache entry.
          key: eos-docker-image-${{ inputs.eos_image_tag || '5.4.1.el9' }}
      # Cache miss: pull from the CERN registry and export to a tar; the
      # post-job half of actions/cache persists the tar for future runs.
      - name: Pull and save EOS Docker image
        if: steps.cache-eos-image.outputs.cache-hit != 'true'
        env:
          EOS_IMAGE_REPO: gitlab-registry.cern.ch/dss/eos/eos-ci
          EOS_IMAGE_TAG: ${{ inputs.eos_image_tag || '5.4.1.el9' }}
        run: |
          docker pull ${EOS_IMAGE_REPO}:${EOS_IMAGE_TAG}
          docker save ${EOS_IMAGE_REPO}:${EOS_IMAGE_TAG} \
            -o /tmp/eos-image.tar
      # Cache hit: import the tar into the local Docker daemon. Deleting the
      # tar afterwards is safe — on an exact cache hit actions/cache skips the
      # post-job save.
      - name: Load EOS Docker image from cache
        if: steps.cache-eos-image.outputs.cache-hit == 'true'
        run: |
          docker load -i /tmp/eos-image.tar
          # Remove the tar to free space for kind.
          rm -f /tmp/eos-image.tar
      # -----------------------------------------------------------------------
      # Kubernetes cluster
      # -----------------------------------------------------------------------
      - name: Create kind cluster
        uses: helm/kind-action@v1
      - name: Load EOS image into kind
        env:
          EOS_IMAGE_REPO: gitlab-registry.cern.ch/dss/eos/eos-ci
          EOS_IMAGE_TAG: ${{ inputs.eos_image_tag || '5.4.1.el9' }}
        run: |
          # helm/kind-action creates a cluster named "chart-testing" by default,
          # so we must pass --name to avoid the "no nodes found for cluster 'kind'" error.
          kind load docker-image \
            --name chart-testing \
            ${EOS_IMAGE_REPO}:${EOS_IMAGE_TAG}
          # Remove the image from Docker now that it's loaded into
          # kind's containerd; avoids storing two copies.
          docker rmi ${EOS_IMAGE_REPO}:${EOS_IMAGE_TAG} || true
      # -----------------------------------------------------------------------
      # Helm chart cache
      #
      # NOTE(review): the key is derived from the EOS *image* tag, but the
      # chart version pulled from the OCI registry is independent of it —
      # consider keying on the chart version instead. Harmless as-is (worst
      # case is a redundant chart download).
      # -----------------------------------------------------------------------
      - name: Cache Helm chart
        uses: actions/cache@v5
        with:
          path: ~/.cache/helm
          key: helm-eos-chart-${{ inputs.eos_image_tag || '5.4.1.el9' }}
      # -----------------------------------------------------------------------
      # EOS cluster
      # -----------------------------------------------------------------------
      - name: Deploy EOS via Helm chart
        env:
          EOS_IMAGE_REPO: gitlab-registry.cern.ch/dss/eos/eos-ci
          EOS_IMAGE_TAG: ${{ inputs.eos_image_tag || '5.4.1.el9' }}
        run: |
          # Deploy a minimal EOS cluster: 1 QDB (single-node raft), 1 MGM (+MQ sidecar), 2 FSTs.
          # Uses the publicly accessible eos-ci image pre-loaded into the kind cluster.
          # global.pullPolicy=IfNotPresent ensures k8s uses the locally pre-loaded image
          # instead of attempting to re-pull from the CERN registry.
          helm install eos oci://registry.cern.ch/eos/charts/server \
            --set "global.repository=${EOS_IMAGE_REPO}" \
            --set "global.tag=${EOS_IMAGE_TAG}" \
            --set "global.pullPolicy=IfNotPresent" \
            --set "fst.replicaCount=2" \
            --set "qdb.replicaCount=1" \
            --set "global.securityContext.privileged=true" \
            --set "global.securityContext.allowPrivilegeEscalation=true" \
            --timeout 20m \
            --wait
      - name: Wait for EOS namespace to boot
        run: |
          echo "Waiting for EOS MGM namespace to reach 'booted' state..."
          # Resolve the first container's name so `kubectl exec` targets the
          # MGM container even though the pod carries a sidecar.
          MGM_CONTAINER=$(kubectl get pod eos-mgm-0 \
            -o jsonpath='{.spec.containers[0].name}')
          echo "Using MGM container: ${MGM_CONTAINER}"
          # Poll `eos ns stat` every 10s, up to 60 attempts (600s total).
          for i in $(seq 1 60); do
            if kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- \
                eos ns stat 2>/dev/null | grep -q "booted"; then
              echo "EOS MGM booted after $((i * 10))s"
              break
            fi
            # Booting is checked before this guard, so succeeding exactly on
            # attempt 60 still passes.
            if [ "$i" -eq 60 ]; then
              echo "ERROR: EOS MGM did not boot within 600s"
              kubectl logs eos-mgm-0 --all-containers=true --tail=50
              exit 1
            fi
            echo " attempt $i/60 — not yet booted, retrying in 10s..."
            sleep 10
          done
          echo "=== eos fs ls ==="
          kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- eos fs ls
# -----------------------------------------------------------------------
# SSH access into the MGM pod
#
# The eos-tui client issues every EOS command via SSH (runCommand uses
# `ssh -o BatchMode=yes <target> <cmd>`). We:
# 1. Generate an ephemeral ed25519 key pair on the runner.
# 2. Install openssh-server and socat inside the MGM container.
# 3. Drop the public key into root's authorized_keys and start sshd
# on port 2222 (avoids any conflict with the EOS MGM service).
# 4. Start a socat tunnel inside the container:
# localhost:7777 → eos-qdb-0.eos-qdb.default.svc.cluster.local:7777
# This makes `redis-cli -p 7777 raft-info` work from the MGM pod,
# which is needed by TestIntegrationMGMs and
# TestIntegrationDiscoverMGMMaster.
# 5. kubectl port-forward the container's port 2222 to the runner.
# 6. Write an SSH client config block mapping the alias "eos-mgm" to
# localhost:2222 with the ephemeral key — no code changes needed.
# -----------------------------------------------------------------------
- name: Set up SSH access to EOS MGM
timeout-minutes: 5
run: |
# Generate an ephemeral SSH key pair (no passphrase).
mkdir -p ~/.ssh
chmod 700 ~/.ssh
ssh-keygen -t ed25519 -N '' -f ~/.ssh/eos_test_key -C eos-tui-ci
PUBLIC_KEY=$(cat ~/.ssh/eos_test_key.pub)
MGM_CONTAINER=$(kubectl get pod eos-mgm-0 \
-o jsonpath='{.spec.containers[0].name}')
echo "Using MGM container: ${MGM_CONTAINER}"
# Install openssh-server + socat, then configure sshd and the QDB
# proxy tunnel inside the MGM container.
#
# dnf can hang if the container's repos (e.g. CERN mirrors) are
# unreachable from inside the kind cluster, so we:
# - wrap with `timeout 120` to cap the total time,
# - set per-connection timeout to 30s via --setopt,
# - disable weak-dep pulls and GPG checks for speed,
# - disable any non-base repos that might be unreachable.
kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- bash -c "
set -eo pipefail
echo '--- Installing openssh-server and socat ---'
# Disable any non-standard repos that may be unreachable from kind.
for f in /etc/yum.repos.d/*.repo; do
case \"\$(basename \"\$f\")\" in
alma*|rocky*|centos*|baseos*|appstream*|extras*|crb*) ;;
*) echo \" disabling repo file: \$f\"
sed -i 's/^enabled=1/enabled=0/' \"\$f\" 2>/dev/null || true ;;
esac
done
# Retry dnf up to 3 times — mirrors inside kind can be flaky.
for attempt in 1 2 3; do
echo \" dnf install attempt \$attempt/3\"
if timeout 120 dnf install -y \
--nogpgcheck \
--setopt=timeout=30 \
--setopt=install_weak_deps=False \
openssh-server socat; then
break
fi
if [ \"\$attempt\" -eq 3 ]; then
echo 'ERROR: dnf install failed after 3 attempts'
exit 1
fi
echo ' retrying in 5s...'
sleep 5
done
# Sanity-check that sshd was actually installed.
if [ ! -x /usr/sbin/sshd ]; then
echo 'ERROR: /usr/sbin/sshd not found after dnf install'
exit 1
fi
echo '--- Configuring authorized_keys ---'
mkdir -p /root/.ssh
chmod 700 /root/.ssh
echo '${PUBLIC_KEY}' > /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys
echo '--- Generating SSH host keys ---'
ssh-keygen -A
echo '--- Removing nologin gate (pam_nologin) ---'
rm -f /run/nologin /etc/nologin /var/run/nologin
echo '--- Starting sshd on port 2222 ---'
/usr/sbin/sshd -p 2222 \
-o UsePAM=no \
-o PermitRootLogin=yes \
-o PubkeyAuthentication=yes \
-o PasswordAuthentication=no \
-o AuthorizedKeysFile=/root/.ssh/authorized_keys \
-o PrintLastLog=no \
-o PrintMotd=no
echo 'sshd running'
echo '--- Starting socat QDB tunnel (localhost:7777 -> eos-qdb) ---'
nohup socat TCP-LISTEN:7777,fork,reuseaddr \
TCP:eos-qdb-0.eos-qdb.default.svc.cluster.local:7777 \
</dev/null >/dev/null 2>&1 &
echo 'socat tunnel started'
"
# Port-forward the MGM's sshd to the runner.
kubectl port-forward pod/eos-mgm-0 2222:2222 &
# Give port-forward time to establish.
sleep 5
# Write an SSH client config block so that `ssh eos-mgm <cmd>` works.
cat >> ~/.ssh/config << 'EOF'
Host eos-mgm
HostName 127.0.0.1
Port 2222
User root
IdentityFile ~/.ssh/eos_test_key
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
ServerAliveInterval 30
ConnectTimeout 10
EOF
echo "--- Verifying SSH connectivity ---"
ssh eos-mgm 'eos version'
echo "--- Verifying EOS node list reachable via SSH ---"
ssh eos-mgm 'eos -j node ls' | head -5
echo "--- Verifying redis-cli via socat tunnel ---"
ssh eos-mgm 'redis-cli -p 7777 raft-info' | head -5
      # -----------------------------------------------------------------------
      # Integration tests
      # -----------------------------------------------------------------------
      - name: Run integration tests
        env:
          # Opt-in gate read by the test suite.
          EOS_TEST_INTEGRATION: '1'
          # SSH alias pointing at the MGM pod (see the SSH setup step).
          EOS_TEST_SSH_TARGET: eos-mgm
        run: |
          go test -v -timeout 10m -run TestIntegration ./eos/...
      # -----------------------------------------------------------------------
      # Debug dump on failure
      # -----------------------------------------------------------------------
      - name: Dump EOS state on failure
        if: failure()
        run: |
          echo "=== kubectl get pods ==="
          kubectl get pods || true
          # Fall back to a literal container name in case the pod lookup
          # itself fails (e.g. the cluster never came up).
          MGM_CONTAINER=$(kubectl get pod eos-mgm-0 \
            -o jsonpath='{.spec.containers[0].name}' 2>/dev/null || echo "eos-mgm")
          echo "=== EOS version ==="
          kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- eos version || true
          echo "=== EOS ns stat ==="
          kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- eos ns stat || true
          echo "=== EOS fs ls ==="
          kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- eos fs ls || true
          echo "=== EOS node ls ==="
          kubectl exec eos-mgm-0 -c "${MGM_CONTAINER}" -- eos -j node ls 2>/dev/null \
            | head -50 || true
          echo "=== eos-mgm-0 pod logs (last 100 lines) ==="
          kubectl logs eos-mgm-0 --all-containers=true --tail=100 || true
          echo "=== eos-qdb-0 pod logs (last 50 lines) ==="
          kubectl logs eos-qdb-0 --tail=50 || true