-
Notifications
You must be signed in to change notification settings - Fork 54
238 lines (209 loc) · 10 KB
/
e2e-test.yml
File metadata and controls
238 lines (209 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: E2E Tests
# This workflow runs the end-to-end test suite in parallel on AMD64 and ARM64 runners,
# once with MongoDB and once with PostgreSQL as the datastore, to ensure compatibility
# across hardware platforms and database backends (four job instances in total).
#
# Configuration:
# - Set the RUNNER_ARCH_LARGE_AMD64 variable to override the default AMD64 runner (linux-amd64-cpu32)
# - Set the RUNNER_ARCH_LARGE_ARM64 variable to override the default ARM64 runner (linux-arm64-cpu32)
# - Each architecture + datastore combination gets its own isolated cluster and test artifacts
# Triggers: manual dispatch, plus pushes to main or to branches named
# pull-request/<digits>.
on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
      - 'pull-request/[0-9]+'
# Runs are grouped per workflow + ref. A newer run cancels an in-progress run
# of the same group on every ref except main.
concurrency:
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  group: ${{ github.workflow }}-${{ github.ref }}
# Token scopes granted to this workflow (read-only).
permissions:
  # Required for artifact operations
  actions: read
  # Required for checking out code
  contents: read
jobs:
  e2e-test:
    # One job instance per matrix entry: {AMD64, ARM64} x {MongoDB, PostgreSQL}.
    name: 'E2E Tests (${{ matrix.arch_name }} + ${{ matrix.datastore_name }})'
    # Only runs when the workflow executes in the nvidia/nvsentinel repository.
    if: github.repository == 'nvidia/nvsentinel'
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 120
    strategy:
      # Allow all combinations to complete even if some fail
      fail-fast: false
      matrix:
        # Run E2E tests on both AMD64 and ARM64 architectures with both MongoDB and PostgreSQL.
        # `runner` uses the RUNNER_ARCH_LARGE_* variable when set, otherwise the literal label.
        include:
          - arch: amd64
            arch_name: AMD64
            datastore: mongodb
            datastore_name: MongoDB
            runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
          - arch: arm64
            arch_name: ARM64
            datastore: mongodb
            datastore_name: MongoDB
            runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
          - arch: amd64
            arch_name: AMD64
            datastore: postgresql
            datastore_name: PostgreSQL
            runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
          - arch: arm64
            arch_name: ARM64
            datastore: postgresql
            datastore_name: PostgreSQL
            runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
    steps:
# Check out the repository; the action is pinned to a full commit SHA (tagged v6.0.2).
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Reclaim disk space on the runner early in the job.
- name: Workaround for freeing up more disk space
  run: |
    # Remove large preinstalled directories (presumably unused by this job - verify before adding more)
    for dir in \
      /usr/local/lib/android \
      /usr/share/dotnet \
      /opt/ghc \
      /opt/hostedtoolcache/CodeQL
    do
      sudo rm -rf "$dir"
    done
    sudo docker image prune --all --force
    # Additional Docker cleanup as recommended by Kind
    docker system prune -f
# Composite action from this repository (.github/actions/setup-ci-env).
- name: Setup build environment
  uses: ./.github/actions/setup-ci-env
# Third-party action pinned to a full commit SHA; the trailing comment records the tag.
- name: Set up Docker Buildx
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0
# Adjust AppArmor, kernel parameters, iptables and the Docker daemon on the
# runner before the multi-node Kind cluster is created.
- name: Prep system for multi-node Kind cluster
  run: |
    # System configuration for Kind multi-node setup.
    # Best effort: a failure here is only reported, never fatal.
    sudo systemctl stop apparmor || echo "unable to stop apparmor"
    sudo systemctl disable apparmor || echo "unable to disable apparmor"
    sudo modprobe br_netfilter || echo "unable to run modprobe"

    # Kernel parameters, applied in this order:
    #   - network configuration (IP forwarding, bridge netfilter hooks)
    #   - file system limits for Kind (inotify watches / instances)
    for setting in \
      net.ipv6.conf.all.forwarding=1 \
      net.ipv4.ip_forward=1 \
      net.bridge.bridge-nf-call-ip6tables=1 \
      net.bridge.bridge-nf-call-iptables=1 \
      fs.inotify.max_user_watches=524288 \
      fs.inotify.max_user_instances=1024
    do
      sudo sysctl -w "$setting"
    done

    # IPTables cleanup and configuration.
    # Deliberately kept as ONE && list (only wrapped across lines) so that its
    # failure semantics are exactly those of the original single-line command.
    sudo iptables -F &&
      sudo iptables -X &&
      sudo iptables -t nat -F &&
      sudo iptables -t nat -X &&
      sudo iptables -t mangle -F &&
      sudo iptables -t mangle -X &&
      sudo iptables -P INPUT ACCEPT &&
      sudo iptables -P FORWARD ACCEPT -w 5 &&
      sudo iptables -P OUTPUT ACCEPT -w 5

    sudo systemctl restart docker
# Composite action from this repository (.github/actions/install-e2e-tools).
- name: Install E2E testing tools
  uses: ./.github/actions/install-e2e-tools
# Register the three Helm chart repositories and refresh the local index.
- name: Configure Helm repositories
  run: |
    # Each entry is "<repo name>=<repo URL>"; repositories are added in this order.
    for entry in \
      prometheus-community=https://prometheus-community.github.io/helm-charts \
      bitnami=https://charts.bitnami.com/bitnami \
      jetstack=https://charts.jetstack.io
    do
      helm repo add "${entry%%=*}" "${entry#*=}"
    done
    helm repo update
# Runs the repository's helper script; what it configures is defined in that
# script, not in this workflow.
- name: Configure ctlptl registry authentication
  run: ./scripts/configure-ctlptl-registry.sh
# Builds a Docker-tag-safe identifier "<ref name>-<first 7 chars of the SHA>"
# (slashes replaced by hyphens) and publishes it as the step output `value`.
- name: Compute ref name with short SHA
  id: ref-name
  env:
    # Context values are handed to the script through the environment rather
    # than being expanded into the script text: a branch name is
    # user-controlled input and must never be parsed as shell code.
    REF_NAME: ${{ github.ref_name }}
    COMMIT_SHA: ${{ github.sha }}
  run: |
    SHORT_SHA=$(printf '%s' "$COMMIT_SHA" | cut -c1-7)
    SAFE_REF="${REF_NAME}-${SHORT_SHA}"
    # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
    SAFE_REF=$(printf '%s' "$SAFE_REF" | sed 's/\//-/g')
    # Quote the output file path so it is never word-split or globbed.
    echo "value=$SAFE_REF" >> "$GITHUB_OUTPUT"
# Creates the test cluster through the repository Makefile.
- name: Create cluster for E2E tests
  env:
    # Make cluster names unique per architecture and datastore to avoid conflicts in parallel runs
    CLUSTER_NAME_SUFFIX: '-${{ matrix.arch }}-${{ matrix.datastore }}'
    CTLPTL_YAML: .ctlptl.yaml
    # Sanitized "<ref>-<short sha>" value produced by the ref-name step
    CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
  run: make cluster-create
# ARM64 + MongoDB only: rewrite the Tilt values file in place so that it
# points at a different MongoDB image repository and tag.
- name: Override MongoDB image for ARM64
  if: matrix.arch == 'arm64' && matrix.datastore == 'mongodb'
  run: |
    values_file=distros/kubernetes/nvsentinel/values-tilt.yaml
    # Both substitutions run in a single sed pass: repository first, then tag.
    sed -i \
      -e 's/repository: "bitnamilegacy\/mongodb"/repository: "dlavrenuek\/bitnami-mongodb-arm"/' \
      -e 's/tag: "8.0.3-debian-12-r1"/tag: "8.0.4"/' \
      "$values_file"
# Runs the E2E suite through the repository Makefile; the matrix entry is
# passed to the target via environment variables.
- name: Run E2E tests
  env:
    # Set architecture-specific test tags ("<arch>_group", plus ",mongodb" for MongoDB runs)
    TEST_TAGS: ${{ matrix.arch }}_group${{ matrix.datastore == 'mongodb' && ',mongodb' || '' }}
    # Set USE_POSTGRESQL for PostgreSQL tests (our integrated Tiltfile approach)
    USE_POSTGRESQL: ${{ matrix.datastore == 'postgresql' && '1' || '0' }}
    # Use same cluster name suffix for consistency
    CLUSTER_NAME_SUFFIX: '-${{ matrix.arch }}-${{ matrix.datastore }}'
    CTLPTL_YAML: .ctlptl.yaml
    CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
  run: make e2e-test-ci
# Publishes the test result directory and test logs via the repository's
# composite action (.github/actions/upload-test-artifacts).
- name: Upload test results
  # Without an explicit condition a step only runs while every earlier step
  # has succeeded, so the results were skipped exactly when the E2E tests
  # failed. Run unconditionally, like the other diagnostics steps in this job.
  if: always()
  uses: ./.github/actions/upload-test-artifacts
  with:
    component-name: e2e-test-${{ matrix.arch }}-${{ matrix.datastore }}
    file-paths: |
      tests/results/
      tests/*.log
    retention-days: 14
# Exports Kind's own log bundle to /tmp/kind-logs, regardless of job outcome.
- name: Export Kind logs
  if: always()
  run: |
    mkdir -p /tmp/kind-logs
    # Uses the first cluster that Kind reports.
    first_cluster="$(kind get clusters | head -n1)"
    # Nothing to export when no cluster is reported; an export failure is ignored.
    [ -z "$first_cluster" ] || kind export logs /tmp/kind-logs --name "$first_cluster" || true
# Runs the repository's log-collection script with /tmp/kind-container-logs as
# its argument; a failure of the script itself does not fail the step.
- name: Pull Kind container logs via crictl
  if: always()
  run: |
    log_script=./scripts/pull_kind_logs.sh
    chmod +x "$log_script"
    "$log_script" /tmp/kind-container-logs || true
# Publishes /tmp/kind-container-logs/ as an artifact named after the matrix entry and run id.
- name: Upload Kind container logs
  if: always()
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
  with:
    retention-days: 7
    path: /tmp/kind-container-logs/
    name: e2e-kind-container-logs-${{ matrix.arch }}-${{ matrix.datastore }}-${{ github.run_id }}
# On failure, snapshot cluster and host state into /tmp/debug-artifacts.
# Every command is best effort (`|| true`) so that one failing query never
# prevents the remaining ones from running.
- name: Collect debug artifacts
  if: failure()
  run: |
    mkdir -p /tmp/debug-artifacts
    kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
    kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
    kubectl get pods --all-namespaces -o yaml > /tmp/debug-artifacts/all-pods.yaml || true
    kubectl get jobs --all-namespaces -o yaml > /tmp/debug-artifacts/all-jobs.yaml || true
    # `kubectl logs` needs a pod (or a selector) and has no --all-namespaces
    # flag, so a single cluster-wide call always errors and leaves the file
    # empty. Walk every pod instead and append the last 500 lines per container.
    kubectl get pods --all-namespaces --no-headers \
      -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name |
      while read -r ns pod; do
        echo "===== ${ns}/${pod} ====="
        # stdin is redirected so kubectl can never consume the pod list.
        kubectl logs --namespace "$ns" "$pod" --all-containers=true --prefix --tail=500 < /dev/null 2>&1 || true
      done > /tmp/debug-artifacts/all-logs.txt || true
    docker images > /tmp/debug-artifacts/docker-images.txt || true
    df -h > /tmp/debug-artifacts/disk-usage.txt || true
    # Comprehensive cluster state dump for debugging
    kubectl cluster-info dump --all-namespaces --output-directory=/tmp/debug-artifacts/cluster-info-dump || true
    # Get node details including cordon state
    kubectl get nodes -o wide > /tmp/debug-artifacts/nodes-wide.txt || true
    kubectl get nodes -o yaml > /tmp/debug-artifacts/nodes-full.yaml || true
    # Get node descriptions for annotations and conditions
    kubectl describe nodes > /tmp/debug-artifacts/nodes-describe.txt || true
# Publishes /tmp/kind-logs/ as an artifact named after the matrix entry and run id.
- name: Upload Kind logs
  if: always()
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
  with:
    retention-days: 7
    path: /tmp/kind-logs/
    name: e2e-kind-logs-${{ matrix.arch }}-${{ matrix.datastore }}-${{ github.run_id }}
# Failure only: publishes /tmp/debug-artifacts/ as an artifact named after the
# matrix entry and run id.
- name: Upload debug artifacts
  if: failure()
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
  with:
    retention-days: 7
    path: /tmp/debug-artifacts/
    name: e2e-debug-artifacts-${{ matrix.arch }}-${{ matrix.datastore }}-${{ github.run_id }}
# Always runs last: removes all containers and images, then prunes volumes,
# networks and build cache. Each command is best effort (`|| true`).
- name: Cleanup Docker resources
  if: always()
  run: |
    # Containers first, then images.
    docker rm --force $(docker ps --all --quiet) || true
    docker rmi --force $(docker images --quiet --all) || true
    # Remaining unused resources.
    docker volume prune --force || true
    docker network prune --force || true
    docker builder prune --force || true