Skip to content

Commit 6244c8a

Browse files
committed
Merge branch 'main' into jtschelling/124-azure-maintenance-events
2 parents dc50464 + 485cf27 commit 6244c8a

File tree

429 files changed

+80583
-1991
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

429 files changed

+80583
-1991
lines changed

.github/copy-pr-bot.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ additional_trustees:
2020
- nitz2407
2121
- XRFXLP
2222
- mchmarny
23-
- tanishagoyal2
23+
- tanishagoyal2
24+
- ksaur

.github/headers/NOTICE

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ This product includes software developed by NVIDIA Corporation (https://www.nvid
66
The Initial Developer of the distros/kubernetes/nvsentinel/charts/mongodb-store/charts/mongodb, is Broadcom, Inc. (https://www.broadcom.com/).
77
Copyright 2024 Broadcom, Inc. All Rights Reserved.
88
Source: https://github.com/bitnami/charts/commit/f001424f6aad534d5fe5a9ad09265b24b1fd6fe7
9+
10+
The Initial Developer of the distros/kubernetes/nvsentinel/charts/mongodb-store/charts/psmdb-operator and distros/kubernetes/nvsentinel/charts/mongodb-store/charts/psmdb-db, is Percona LLC (https://www.percona.com/).
11+
Copyright 2019 Paul Czarkowski. All Rights Reserved.
12+
Source: https://github.com/percona/percona-helm-charts

.github/workflows/code-scanning.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ jobs:
4040
pull-requests: write # Required for posting comments
4141
steps:
4242
- name: Checkout repository
43-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
43+
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
4444

4545
- name: Setup build environment
4646
uses: ./.github/actions/setup-ci-env
@@ -90,7 +90,7 @@ jobs:
9090
packages: read
9191
steps:
9292
- name: Checkout repository
93-
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
93+
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
9494

9595
- name: Setup build environment
9696
uses: ./.github/actions/setup-ci-env

.github/workflows/container-build-test.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ jobs:
6767
- component: file-server-cleanup
6868
make_command: 'make -C log-collector docker-build-file-server-cleanup'
6969
steps:
70-
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
70+
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
7171

7272
- name: Setup build environment
7373
uses: ./.github/actions/setup-ci-env
@@ -122,8 +122,10 @@ jobs:
122122
path: ./cmd/csp-health-monitor
123123
- module: health-monitors/csp-health-monitor
124124
path: ./cmd/maintenance-notifier
125+
- module: event-exporter
126+
path: .
125127
steps:
126-
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
128+
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
127129

128130
- name: Setup build environment
129131
uses: ./.github/actions/setup-ci-env

.github/workflows/e2e-test.yml

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
name: E2E Tests
1616

1717
# This workflow runs end-to-end tests on both AMD64 and ARM64 architectures in parallel
18-
# to ensure compatibility across different hardware platforms.
18+
# with both MongoDB and PostgreSQL datastores to ensure compatibility across different
19+
# hardware platforms and database backends.
1920
#
2021
# Configuration:
2122
# - Set RUNNER_ARCH_LARGE_AMD64 variable to override default AMD64 runner
2223
# - Set RUNNER_ARCH_LARGE_ARM64 variable to override default ARM64 runner
23-
# - Each architecture gets its own isolated cluster and test artifacts
24+
# - Each architecture + datastore combination gets its own isolated cluster and test artifacts
2425

2526
on:
2627
push:
@@ -39,22 +40,37 @@ permissions:
3940

4041
jobs:
4142
e2e-test:
43+
# Run E2E tests on both AMD64 and ARM64 architectures with both MongoDB and PostgreSQL
4244
strategy:
43-
fail-fast: false # Allow both architectures to complete even if one fails
45+
fail-fast: false # Allow all combinations to complete even if some fail
4446
matrix:
4547
include:
4648
- arch: amd64
4749
runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
4850
arch_name: "AMD64"
51+
datastore: "mongodb"
52+
datastore_name: "MongoDB"
4953
- arch: arm64
5054
runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
5155
arch_name: "ARM64"
56+
datastore: "mongodb"
57+
datastore_name: "MongoDB"
58+
- arch: amd64
59+
runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
60+
arch_name: "AMD64"
61+
datastore: "postgresql"
62+
datastore_name: "PostgreSQL"
63+
- arch: arm64
64+
runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
65+
arch_name: "ARM64"
66+
datastore: "postgresql"
67+
datastore_name: "PostgreSQL"
5268

53-
name: "E2E Tests (${{ matrix.arch_name }})"
69+
name: "E2E Tests (${{ matrix.arch_name }} + ${{ matrix.datastore_name }})"
5470
runs-on: ${{ matrix.runner }}
5571
timeout-minutes: 90
5672
steps:
57-
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
73+
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
5874

5975
- name: Workaround for freeing up more disk space
6076
run: |
@@ -120,24 +136,32 @@ jobs:
120136
env:
121137
CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
122138
CTLPTL_YAML: .ctlptl.yaml
123-
# Make cluster names unique per architecture to avoid conflicts in parallel runs
124-
CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
139+
# Make cluster names unique per architecture and datastore to avoid conflicts in parallel runs
140+
CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}-${{ matrix.datastore }}"
125141
run: |
126142
make cluster-create
127143
144+
- name: Override MongoDB image for ARM64
145+
if: matrix.arch == 'arm64' && matrix.datastore == 'mongodb'
146+
run: |
147+
sed -i 's/repository: "bitnamilegacy\/mongodb"/repository: "dlavrenuek\/bitnami-mongodb-arm"/' distros/kubernetes/nvsentinel/values-tilt.yaml
148+
sed -i 's/tag: "8.0.3-debian-12-r1"/tag: "8.0.4"/' distros/kubernetes/nvsentinel/values-tilt.yaml
149+
128150
- name: Run E2E tests
129151
env:
130152
CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
131153
CTLPTL_YAML: .ctlptl.yaml
132154
# Use same cluster name suffix for consistency
133-
CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
155+
CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}-${{ matrix.datastore }}"
156+
# USE_POSTGRESQL is '1' for PostgreSQL matrix entries and '0' otherwise (our integrated Tiltfile approach)
157+
USE_POSTGRESQL: ${{ matrix.datastore == 'postgresql' && '1' || '0' }}
134158
run: |
135159
make e2e-test-ci
136160
137161
- name: Upload test results
138162
uses: ./.github/actions/upload-test-artifacts
139163
with:
140-
component-name: e2e-test-${{ matrix.arch }}
164+
component-name: e2e-test-${{ matrix.arch }}-${{ matrix.datastore }}
141165
file-paths: |
142166
tests/results/
143167
tests/*.log
@@ -152,6 +176,20 @@ jobs:
152176
kind export logs /tmp/kind-logs --name "$CLUSTER_NAME" || true
153177
fi
154178
179+
- name: Pull Kind container logs via crictl
180+
if: always()
181+
run: |
182+
chmod +x ./scripts/pull_kind_logs.sh
183+
./scripts/pull_kind_logs.sh /tmp/kind-container-logs || true
184+
185+
- name: Upload Kind container logs
186+
if: always()
187+
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
188+
with:
189+
name: e2e-kind-container-logs-${{ matrix.arch }}-${{ matrix.datastore }}-${{ github.run_id }}
190+
path: /tmp/kind-container-logs/
191+
retention-days: 7
192+
155193
- name: Collect debug artifacts
156194
if: failure()
157195
run: |
@@ -162,20 +200,27 @@ jobs:
162200
kubectl logs --all-namespaces --all-containers=true --tail=500 > /tmp/debug-artifacts/all-logs.txt || true
163201
docker images > /tmp/debug-artifacts/docker-images.txt || true
164202
df -h > /tmp/debug-artifacts/disk-usage.txt || true
203+
# Comprehensive cluster state dump for debugging
204+
kubectl cluster-info dump --all-namespaces --output-directory=/tmp/debug-artifacts/cluster-info-dump || true
205+
# Get node details including cordon state
206+
kubectl get nodes -o wide > /tmp/debug-artifacts/nodes-wide.txt || true
207+
kubectl get nodes -o yaml > /tmp/debug-artifacts/nodes-full.yaml || true
208+
# Get node descriptions for annotations and conditions
209+
kubectl describe nodes > /tmp/debug-artifacts/nodes-describe.txt || true
165210
166211
- name: Upload Kind logs
167212
if: always()
168213
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
169214
with:
170-
name: e2e-kind-logs-${{ matrix.arch }}-${{ github.run_id }}
215+
name: e2e-kind-logs-${{ matrix.arch }}-${{ matrix.datastore }}-${{ github.run_id }}
171216
path: /tmp/kind-logs/
172217
retention-days: 7
173218

174219
- name: Upload debug artifacts
175220
if: failure()
176221
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
177222
with:
178-
name: e2e-debug-artifacts-${{ matrix.arch }}-${{ github.run_id }}
223+
name: e2e-debug-artifacts-${{ matrix.arch }}-${{ matrix.datastore }}-${{ github.run_id }}
179224
path: /tmp/debug-artifacts/
180225
retention-days: 7
181226

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Integration Tests - AWS
16+
17+
on:
18+
workflow_dispatch: {} # allow manual runs for testing
19+
schedule:
20+
- cron: '0 8 * * *' # daily at 08:00 UTC (midnight PST, 1 a.m. PDT); scheduled runs use the default branch only
21+
push:
22+
branches:
23+
- main
24+
25+
permissions:
26+
contents: read
27+
actions: read
28+
id-token: write
29+
30+
jobs:
31+
integration-test-aws:
32+
runs-on: ubuntu-latest
33+
timeout-minutes: 60
34+
env:
35+
CSP: "aws"
36+
PREFIX: "nvs"
37+
AWS_ACCOUNT_ID: "615299774277"
38+
AWS_REGION: "us-east-1"
39+
GITHUB_ACTIONS_ROLE_NAME: "github-actions-role"
40+
CLUSTER_NAME: "nvs-d${{ github.run_id }}"
41+
K8S_VERSION: "1.34"
42+
EKSCTL_VERSION: "0.216.0"
43+
GPU_AVAILABILITY_ZONE: "e"
44+
GPU_NODE_COUNT: "1"
45+
CAPACITY_RESERVATION_ID: "cr-0cbe491320188dfa6"
46+
47+
# Debug
48+
SKIP_DELETE: "false" # set to "true" to keep the cluster after the run (skips the Destroy Cluster step)
49+
TEST_TAG: "main-33c1d03"
50+
51+
steps:
52+
# Checkout
53+
- name: Checkout
54+
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
55+
56+
# Auth
57+
- name: Configure AWS credentials
58+
id: auth
59+
uses: aws-actions/configure-aws-credentials@00943011d9042930efac3dcd3a170e4273319bc8 # v5.1.0
60+
with:
61+
role-to-assume: "arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/${{ env.GITHUB_ACTIONS_ROLE_NAME }}"
62+
aws-region: ${{ env.AWS_REGION }}
63+
role-session-name: GitHubActions-NVSentinel-Integration
64+
65+
# Install eksctl
66+
- name: Install eksctl
67+
run: |
68+
set -euox pipefail
69+
# Check if eksctl is already installed
70+
if command -v eksctl >/dev/null 2>&1; then
71+
echo "eksctl is already installed:"
72+
eksctl version
73+
exit 0
74+
fi
75+
76+
echo "Installing eksctl..."
77+
curl -LO "https://github.com/eksctl-io/eksctl/releases/download/v${EKSCTL_VERSION}/eksctl_linux_amd64.tar.gz"
78+
tar -xzf eksctl_linux_amd64.tar.gz
79+
chmod +x eksctl
80+
sudo mv eksctl /usr/local/bin/
81+
rm eksctl_linux_amd64.tar.gz
82+
echo "eksctl installed successfully:"
83+
eksctl version
84+
85+
# Cluster
86+
- name: Create Cluster
87+
id: cluster
88+
shell: bash
89+
run: |
90+
set -euox pipefail
91+
tests/uat/aws/create-eks-cluster.sh
92+
93+
# Connect
94+
- name: Connect to Cluster
95+
id: client
96+
if: steps.cluster.outcome == 'success'
97+
shell: bash
98+
run: |
99+
set -euox pipefail
100+
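# Check if kubectl is already installed. NOTE(review): the early "exit 0" below also skips the kubeconfig update and the "kubectl get nodes" check - confirm this is intended.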
101+
if command -v kubectl >/dev/null 2>&1; then
102+
echo "kubectl is already installed:"
103+
kubectl version
104+
exit 0
105+
fi
106+
107+
echo "Installing kubectl..."
108+
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
109+
chmod +x kubectl
110+
sudo mv kubectl /usr/local/bin/
111+
echo "Updating kubeconfig..."
112+
aws eks update-kubeconfig --region ${{ env.AWS_REGION }} --name ${{ env.CLUSTER_NAME }}
113+
echo "Verifying cluster connection..."
114+
kubectl get nodes
115+
116+
# Image Tag
117+
- name: Compute ref name with short SHA
118+
id: ref-name
119+
if: steps.cluster.outcome == 'success'
120+
run: |
121+
if [[ "${{ github.ref_type }}" == "tag" ]]; then
122+
SAFE_REF="${{ github.ref_name }}"
123+
elif [[ "${{ github.ref_name }}" == "main" ]]; then
124+
SHORT_SHA=$(echo "${{ github.sha }}" | cut -c1-7)
125+
SAFE_REF="${{ github.ref_name }}-${SHORT_SHA}"
126+
else
127+
SAFE_REF="${{ env.TEST_TAG }}"
128+
fi
129+
# Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
130+
SAFE_REF=$(echo "$SAFE_REF" | sed 's/\//-/g')
131+
echo "value=$SAFE_REF" >> $GITHUB_OUTPUT
132+
133+
# Apps
134+
- name: Install NVS
135+
id: apps
136+
if: steps.client.outcome == 'success'
137+
shell: bash
138+
env:
139+
NVSENTINEL_VERSION: "${{ steps.ref-name.outputs.value }}"
140+
run: |
141+
set -euxo pipefail
142+
tests/uat/install-apps.sh
143+
144+
# Test
145+
- name: Run UAT Tests
146+
id: tests
147+
if: steps.apps.outcome == 'success'
148+
shell: bash
149+
run: |
150+
set -euxo pipefail
151+
tests/uat/tests.sh
152+
153+
# Teardown
154+
- name: Destroy Cluster
155+
if: always() && steps.cluster.outcome != 'skipped' && env.SKIP_DELETE != 'true'
156+
shell: bash
157+
run: |
158+
set -euxo pipefail
159+
tests/uat/aws/delete-eks-cluster.sh
160+
161+
# Summary
162+
- name: Test Summary
163+
if: always()
164+
run: |
165+
echo "## Test Results" >> $GITHUB_STEP_SUMMARY
166+
echo "- Cluster: ${{ steps.cluster.outcome }}" >> $GITHUB_STEP_SUMMARY
167+
echo "- Connection: ${{ steps.client.outcome }}" >> $GITHUB_STEP_SUMMARY
168+
echo "- Apps: ${{ steps.apps.outcome }}" >> $GITHUB_STEP_SUMMARY
169+
echo "- Tests: ${{ steps.tests.outcome }}" >> $GITHUB_STEP_SUMMARY

0 commit comments

Comments
 (0)