Skip to content

Commit 6953bef

Browse files
committed
wip: dirty test unconditional alert
Signed-off-by: Alex Gronskiy <[email protected]>
1 parent 6acc3df commit 6953bef

File tree

3 files changed

+109
-40
lines changed

3 files changed

+109
-40
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# Composite action: posts an alert to Slack through the alerting service.
#
# The step verifies the alerting endpoint is reachable, downloads the
# alert.sh helper and runs it with the supplied inputs. The step fails if
# the token is empty, the endpoint is unreachable, the download fails, or
# alert.sh exits non-zero (its exit code is propagated).
name: "Slack Alert"
description: "Send alerts to Slack using the alerting service"

inputs:
  token:
    description: "Slack alerting token"
    required: true
  channel:
    description: "Slack channel ID"
    # A default is provided, so callers may omit this input.
    required: false
    default: "D08N6TYDS79"
  severity:
    description: "Alert severity (info, error, silent)"
    required: true
  title:
    description: "Alert title"
    required: true
  text:
    description: "Alert message text"
    required: true

runs:
  using: "composite"
  steps:
    - name: Send Slack Alert
      shell: bash
      # All inputs reach the script through environment variables rather
      # than being expanded into the script text. Titles and messages are
      # built from branch names and similar caller-controlled values, so
      # inline expansion would allow shell command injection.
      env:
        SLACK_FRONTIER_EVAL_ALERTING_TOKEN: ${{ inputs.token }}
        ALERT_CHANNEL: ${{ inputs.channel }}
        ALERT_SEVERITY: ${{ inputs.severity }}
        ALERT_TITLE: ${{ inputs.title }}
        ALERT_TEXT: ${{ inputs.text }}
      run: |
        set -euo pipefail  # Fail fast on any error

        ALERT_ENDPOINT="https://alerting-frontier-evals.nvidia.com/alert"
        ALERT_SCRIPT_URL="https://gitlab-master.nvidia.com/dl/JoC/competitive_evaluation/nvpark-k8s-infra/-/raw/main/alerting-service-helm/alert.sh"

        # Fail fast if token is missing
        if [ -z "$SLACK_FRONTIER_EVAL_ALERTING_TOKEN" ]; then
          echo "Error: SLACK_FRONTIER_EVAL_ALERTING_TOKEN is not set"
          exit 1
        fi

        # Check if alerting endpoint is reachable: try a HEAD request first
        # and fall back to a plain GET in case HEAD is not answered.
        echo "Checking connectivity to alerting endpoint..."
        if ! curl -sL --max-time 10 --head "$ALERT_ENDPOINT" > /dev/null 2>&1; then
          echo "Warning: Alerting endpoint $ALERT_ENDPOINT is not reachable"
          echo "Attempting to ping endpoint..."
          if ! curl -sL --max-time 10 "$ALERT_ENDPOINT" > /dev/null 2>&1; then
            echo "Error: Cannot reach alerting endpoint"
            exit 1
          fi
        fi
        echo "Alerting endpoint is reachable"

        # Download the alert script first to check for errors.
        # The EXIT trap removes the temp file on every exit path.
        ALERT_SCRIPT=$(mktemp)
        trap 'rm -f "$ALERT_SCRIPT"' EXIT

        # -f makes curl fail on HTTP errors (4xx/5xx) instead of saving the
        # error page, which would otherwise be executed as a shell script.
        if ! curl -fsSL --connect-timeout 10 --max-time 60 \
            -o "$ALERT_SCRIPT" "$ALERT_SCRIPT_URL"; then
          echo "Error: Failed to download alert script"
          exit 1
        fi

        # Never run an empty download.
        if [ ! -s "$ALERT_SCRIPT" ]; then
          echo "Error: Failed to download alert script"
          exit 1
        fi

        # Run the alert script and capture its exit code; the `||` keeps
        # `set -e` from aborting before the code can be reported.
        ALERT_EXIT_CODE=0
        bash "$ALERT_SCRIPT" \
          --token "$SLACK_FRONTIER_EVAL_ALERTING_TOKEN" \
          --channel "$ALERT_CHANNEL" \
          --severity "$ALERT_SEVERITY" \
          --title "$ALERT_TITLE" \
          --text "$ALERT_TEXT" || ALERT_EXIT_CODE=$?

        # Check if the alert script failed
        if [ "$ALERT_EXIT_CODE" -ne 0 ]; then
          echo "Error: Alert script failed with exit code $ALERT_EXIT_CODE"
          exit "$ALERT_EXIT_CODE"
        fi

.github/workflows/cicd-main.yml

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,22 @@ jobs:
5454
steps:
5555
- name: Checkout
5656
uses: actions/checkout@v4
57+
58+
- name: Send test alert
59+
uses: ./.github/actions/slack-alert
60+
with:
61+
token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
62+
channel: "D08N6TYDS79"
63+
severity: "error"
64+
title: "CI Pipeline Test Alert - ${{ github.ref_name }}"
65+
text: |
66+
🧪 TEST ALERT: This is a test alert sent at the beginning of CI pipeline
67+
68+
Workflow: ${{ github.workflow }}
69+
Run: ${{ github.run_id }}
70+
Branch: ${{ github.ref_name }}
71+
Commit: ${{ github.sha }}
72+
Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
5773
- name: Setup Python
5874
uses: actions/setup-python@v5
5975
with:
@@ -200,9 +216,6 @@ jobs:
200216
runs-on: ubuntu-latest
201217
permissions: write-all
202218
steps:
203-
- name: Checkout
204-
uses: actions/checkout@v4
205-
206219
- name: Get workflow result
207220
id: result
208221
shell: bash -x -e -u -o pipefail {0}
@@ -326,26 +339,16 @@ jobs:
326339
- name: Determine workflow status
327340
id: status
328341
shell: bash
329-
env:
330-
TEST_MODE: ${{ vars.TEST_ALERT_MODE == 'true' }}
331342
run: |
332343
set -euo pipefail # Fail fast
333-
334-
# TEST MODE: Force error alert for testing
335-
if [ "$TEST_MODE" == "true" ]; then
336-
echo "severity=error" >> $GITHUB_OUTPUT
337-
echo "status=failure" >> $GITHUB_OUTPUT
338-
echo "message=🧪 TEST MODE: Forced error alert for testing purposes" >> $GITHUB_OUTPUT
339-
exit 0
340-
fi
341-
344+
342345
# Check if main test job succeeded
343346
MAIN_TEST_RESULT="${{ needs.Nemo_CICD_Test.result }}"
344-
347+
345348
# Coverage jobs might be skipped, so we only care if they failed (not if skipped)
346349
COVERAGE_RESULT="${{ needs.Coverage.result }}"
347350
COVERAGE_FAKE_RESULT="${{ needs.Coverage_Fake.result }}"
348-
351+
349352
if [ "$MAIN_TEST_RESULT" == "success" ] && [ "$COVERAGE_RESULT" != "failure" ] && [ "$COVERAGE_FAKE_RESULT" != "failure" ]; then
350353
echo "severity=info" >> $GITHUB_OUTPUT
351354
echo "status=success" >> $GITHUB_OUTPUT
@@ -365,10 +368,9 @@ jobs:
365368
title: "CI Pipeline - ${{ github.ref_name }}"
366369
text: |
367370
${{ steps.status.outputs.message }}
368-
371+
369372
Workflow: ${{ github.workflow }}
370373
Run: ${{ github.run_id }}
371374
Commit: ${{ github.sha }}
372375
Author: ${{ github.actor }}
373376
Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
374-
test-mode: ${{ vars.TEST_ALERT_MODE == 'true' }}

.github/workflows/release.yaml

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -110,19 +110,9 @@ jobs:
110110
env:
111111
IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
112112
DRY_RUN_INPUT: ${{ inputs.dry-run }}
113-
TEST_MODE: ${{ vars.TEST_ALERT_MODE == 'true' }}
114113
run: |
115114
set -euo pipefail # Fail fast
116115
117-
# TEST MODE: Force error alert for testing
118-
if [ "$TEST_MODE" == "true" ]; then
119-
echo "severity=error" >> $GITHUB_OUTPUT
120-
echo "status=failure" >> $GITHUB_OUTPUT
121-
echo "message=🧪 TEST MODE: Forced error alert for testing purposes" >> $GITHUB_OUTPUT
122-
echo "release_type=Test" >> $GITHUB_OUTPUT
123-
exit 0
124-
fi
125-
126116
RELEASE_RESULT="${{ needs.release.result }}"
127117
128118
# Determine if this is a dry run: scheduled releases are production, manual can be dry-run
@@ -160,7 +150,6 @@ jobs:
160150
Commit: ${{ github.event_name == 'schedule' && github.sha || inputs.release-ref || github.sha }}
161151
Trigger: ${{ github.event_name == 'schedule' && 'Scheduled' || 'Manual' }}
162152
Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
163-
test-mode: ${{ vars.TEST_ALERT_MODE == 'true' }}
164153
165154
release-launcher:
166155
if: ${{ github.event_name == 'schedule' || inputs.component == 'nemo-evaluator-launcher' }}
@@ -204,19 +193,9 @@ jobs:
204193
env:
205194
IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
206195
DRY_RUN_INPUT: ${{ inputs.dry-run }}
207-
TEST_MODE: ${{ vars.TEST_ALERT_MODE == 'true' }}
208196
run: |
209197
set -euo pipefail # Fail fast
210198
211-
# TEST MODE: Force error alert for testing
212-
if [ "$TEST_MODE" == "true" ]; then
213-
echo "severity=error" >> $GITHUB_OUTPUT
214-
echo "status=failure" >> $GITHUB_OUTPUT
215-
echo "message=🧪 TEST MODE: Forced error alert for testing purposes" >> $GITHUB_OUTPUT
216-
echo "release_type=Test" >> $GITHUB_OUTPUT
217-
exit 0
218-
fi
219-
220199
RELEASE_RESULT="${{ needs.release-launcher.result }}"
221200
222201
# Determine if this is a dry run: scheduled releases are production, manual can be dry-run
@@ -254,4 +233,3 @@ jobs:
254233
Commit: ${{ github.event_name == 'schedule' && github.sha || inputs.release-ref || github.sha }}
255234
Trigger: ${{ github.event_name == 'schedule' && 'Scheduled' || 'Manual' }}
256235
Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
257-
test-mode: ${{ vars.TEST_ALERT_MODE == 'true' }}

0 commit comments

Comments
 (0)