Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .github/actions/slack-alert/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "Slack Alert"
description: "Send alerts to Slack using the alerting service"

# Composite action: checks that the alerting endpoint answers, downloads the
# alert helper script and runs it with the supplied token, channel, severity,
# title and text. The step fails if any of those stages fails.
inputs:
  token:
    description: "Slack alerting token"
    required: true
  channel:
    description: "Slack channel ID"
    # Optional for callers because a default is provided.
    required: false
    default: "D08N6TYDS79"
  severity:
    description: "Alert severity (info, error, silent)"
    required: true
  title:
    description: "Alert title"
    required: true
  text:
    description: "Alert message text"
    required: true

runs:
  using: "composite"
  steps:
    - name: Send Slack Alert
      shell: bash
      env:
        # Inputs are handed to the script through the environment instead of
        # being expanded into the script text, so quotes, `$(...)` or
        # backticks inside a title/text are never interpreted by the shell.
        SLACK_FRONTIER_EVAL_ALERTING_TOKEN: ${{ inputs.token }}
        ALERT_CHANNEL: ${{ inputs.channel }}
        ALERT_SEVERITY: ${{ inputs.severity }}
        ALERT_TITLE: ${{ inputs.title }}
        ALERT_TEXT: ${{ inputs.text }}
      run: |
        set -euo pipefail  # Fail fast on any error

        ALERT_ENDPOINT="https://alerting-frontier-evals.nvidia.com/alert"
        # NOTE(review): the helper is fetched from the mutable `main` branch at
        # run time and then executed with the alerting token as an argument.
        # Consider pinning it to a commit or verifying a checksum.
        ALERT_SCRIPT_URL="https://gitlab-master.nvidia.com/dl/JoC/competitive_evaluation/nvpark-k8s-infra/-/raw/main/alerting-service-helm/alert.sh"

        # Fail fast if token is missing
        if [ -z "${SLACK_FRONTIER_EVAL_ALERTING_TOKEN:-}" ]; then
          echo "Error: SLACK_FRONTIER_EVAL_ALERTING_TOKEN is not set"
          exit 1
        fi

        # Check if alerting endpoint is reachable: try a HEAD request first and
        # fall back to a plain GET before giving up.
        echo "Checking connectivity to alerting endpoint..."
        if ! curl -sL --max-time 10 --head "$ALERT_ENDPOINT" > /dev/null 2>&1; then
          echo "Warning: Alerting endpoint $ALERT_ENDPOINT is not reachable"
          echo "Attempting to ping endpoint..."
          if ! curl -sL --max-time 10 "$ALERT_ENDPOINT" > /dev/null 2>&1; then
            echo "Error: Cannot reach alerting endpoint"
            exit 1
          fi
        fi
        echo "Alerting endpoint is reachable"

        # Download the alert script first to check for errors. The trap removes
        # the temporary file on every exit path.
        ALERT_SCRIPT=$(mktemp)
        trap 'rm -f "$ALERT_SCRIPT"' EXIT

        # -f turns HTTP errors (404, 5xx, ...) into a curl failure so an error
        # page is never saved and executed as if it were the script.
        if ! curl -fsSL -o "$ALERT_SCRIPT" "$ALERT_SCRIPT_URL"; then
          echo "Error: Failed to download alert script"
          exit 1
        fi
        if [ ! -s "$ALERT_SCRIPT" ]; then
          echo "Error: Downloaded alert script is empty"
          exit 1
        fi

        # Run the alert script and capture its exit code without tripping
        # `set -e`, so the failure can be reported before exiting.
        ALERT_EXIT_CODE=0
        bash "$ALERT_SCRIPT" \
          --token "$SLACK_FRONTIER_EVAL_ALERTING_TOKEN" \
          --channel "$ALERT_CHANNEL" \
          --severity "$ALERT_SEVERITY" \
          --title "$ALERT_TITLE" \
          --text "$ALERT_TEXT" || ALERT_EXIT_CODE=$?

        # Check if the alert script failed
        if [ "$ALERT_EXIT_CODE" -ne 0 ]; then
          echo "Error: Alert script failed with exit code $ALERT_EXIT_CODE"
          exit "$ALERT_EXIT_CODE"
        fi
75 changes: 72 additions & 3 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4

# Diagnostic step: sends a test alert at the start of the job.
# `continue-on-error` keeps an alerting problem (missing token, unreachable
# endpoint, failed script download) from failing the job before the actual
# CI steps run.
# NOTE(review): this posts an "error"-severity alert on every run of this
# job; confirm whether it should be removed before merging.
- name: Send test alert
  uses: ./.github/actions/slack-alert
  continue-on-error: true
  with:
    token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
    channel: "D08N6TYDS79"
    severity: "error"
    title: "CI Pipeline Test Alert - ${{ github.ref_name }}"
    text: |
      🧪 TEST ALERT: This is a test alert sent at the beginning of CI pipeline

      Workflow: ${{ github.workflow }}
      Run: ${{ github.run_id }}
      Branch: ${{ github.ref_name }}
      Commit: ${{ github.sha }}
      Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
- name: Setup Python
uses: actions/setup-python@v5
with:
Expand Down Expand Up @@ -200,9 +216,6 @@ jobs:
runs-on: ubuntu-latest
permissions: write-all
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Get workflow result
id: result
shell: bash -x -e -u -o pipefail {0}
Expand Down Expand Up @@ -305,3 +318,59 @@ jobs:
path: |
.coverage
include-hidden-files: true

# Reports the overall CI outcome to Slack once the test and coverage jobs
# have finished (unless the run was cancelled).
send-ci-alert:
  runs-on: ubuntu-latest
  needs:
    - Nemo_CICD_Test
    - Coverage
    - Coverage_Fake
  # NOTE(review): the second ref below is a personal feature branch, presumably
  # kept only to exercise this job while the change is in review - confirm and
  # drop it before merging.
  if: |
    (
      github.ref == 'refs/heads/main'
      || github.ref == 'refs/heads/agronskiy/feat/ci-failed-builds-on-main-and-releases'
    )
    && always()
    && !cancelled()
  steps:
    # The local composite action used below lives in the repository.
    - name: Checkout
      uses: actions/checkout@v4

    - name: Determine workflow status
      id: status
      shell: bash
      env:
        # Expression values reach the script through the environment rather
        # than being expanded into the script text.
        MAIN_TEST_RESULT: ${{ needs.Nemo_CICD_Test.result }}
        COVERAGE_RESULT: ${{ needs.Coverage.result }}
        COVERAGE_FAKE_RESULT: ${{ needs.Coverage_Fake.result }}
        REF_NAME: ${{ github.ref_name }}
      run: |
        set -euo pipefail  # Fail fast

        # The main test job must have succeeded. Coverage jobs might be
        # skipped, so they only count against the run when they failed.
        if [ "$MAIN_TEST_RESULT" = "success" ] \
          && [ "$COVERAGE_RESULT" != "failure" ] \
          && [ "$COVERAGE_FAKE_RESULT" != "failure" ]; then
          severity="info"
          status="success"
          message="✅ CI pipeline completed successfully on $REF_NAME"
        else
          severity="error"
          status="failure"
          message="❌ CI pipeline failed on $REF_NAME"
        fi

        # Publish the step outputs consumed by the alert step below.
        {
          echo "severity=$severity"
          echo "status=$status"
          echo "message=$message"
        } >> "$GITHUB_OUTPUT"

    - name: Send Slack Alert
      uses: ./.github/actions/slack-alert
      with:
        token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
        channel: "D08N6TYDS79"
        severity: ${{ steps.status.outputs.severity }}
        title: "CI Pipeline - ${{ github.ref_name }}"
        text: |
          ${{ steps.status.outputs.message }}

          Workflow: ${{ github.workflow }}
          Run: ${{ github.run_id }}
          Commit: ${{ github.sha }}
          Author: ${{ github.actor }}
          Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
110 changes: 110 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,61 @@ jobs:
SSH_PWD: ${{ secrets.SSH_PWD }}
BOT_KEY: ${{ secrets.BOT_KEY }}

# Reports the outcome of the `release` job to Slack.
send-release-alert:
  runs-on: ubuntu-latest
  needs: [release]
  # Run after `release` whether it succeeded or failed, but not when the run
  # was cancelled or when `release` did not run at all: a "skipped" result
  # would otherwise fall into the failure branch below and raise a false
  # "release failed" alert.
  # NOTE(review): presumably `release` is gated on the selected component the
  # same way `release-launcher` is - verify against its `if:` condition.
  if: always() && !cancelled() && needs.release.result != 'skipped'
  steps:
    # The local composite action used below lives in the repository.
    - name: Checkout
      uses: actions/checkout@v4

    - name: Determine release status
      id: status
      shell: bash
      env:
        # Expression values reach the script through the environment rather
        # than being expanded into the script text.
        IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
        DRY_RUN_INPUT: ${{ inputs.dry-run }}
        RELEASE_RESULT: ${{ needs.release.result }}
      run: |
        set -euo pipefail  # Fail fast

        # Scheduled releases are always production; only a manual run can be
        # a dry run.
        if [ "$IS_SCHEDULED" != "true" ] && [ "$DRY_RUN_INPUT" = "true" ]; then
          RELEASE_TYPE="Dry Run"
        else
          RELEASE_TYPE="Production"
        fi

        if [ "$RELEASE_RESULT" = "success" ]; then
          severity="info"
          status="success"
          message="✅ $RELEASE_TYPE release completed successfully for nemo-evaluator"
        else
          severity="error"
          status="failure"
          message="❌ $RELEASE_TYPE release failed for nemo-evaluator"
        fi

        # Publish the step outputs consumed by the alert step below.
        {
          echo "severity=$severity"
          echo "status=$status"
          echo "message=$message"
          echo "release_type=$RELEASE_TYPE"
        } >> "$GITHUB_OUTPUT"

    - name: Send Slack Alert
      uses: ./.github/actions/slack-alert
      with:
        token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
        channel: "D08N6TYDS79"
        severity: ${{ steps.status.outputs.severity }}
        title: "Release - nemo-evaluator (${{ steps.status.outputs.release_type }})"
        text: |
          ${{ steps.status.outputs.message }}

          Workflow: ${{ github.workflow }}
          Run: ${{ github.run_id }}
          Commit: ${{ github.event_name == 'schedule' && github.sha || inputs.release-ref || github.sha }}
          Trigger: ${{ github.event_name == 'schedule' && 'Scheduled' || 'Manual' }}
          Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

release-launcher:
if: ${{ github.event_name == 'schedule' || inputs.component == 'nemo-evaluator-launcher' }}
needs: validate-inputs
Expand Down Expand Up @@ -123,3 +178,58 @@ jobs:
SSH_KEY: ${{ secrets.SSH_KEY }}
SSH_PWD: ${{ secrets.SSH_PWD }}
BOT_KEY: ${{ secrets.BOT_KEY }}

# Reports the outcome of the `release-launcher` job to Slack.
send-release-launcher-alert:
  runs-on: ubuntu-latest
  needs: [release-launcher]
  # Run after `release-launcher` whether it succeeded or failed, but not when
  # the run was cancelled or when `release-launcher` did not run at all. That
  # job is skipped on manual runs for a different component, and a "skipped"
  # result would otherwise fall into the failure branch below and raise a
  # false "release failed" alert.
  if: always() && !cancelled() && needs.release-launcher.result != 'skipped'
  steps:
    # The local composite action used below lives in the repository.
    - name: Checkout
      uses: actions/checkout@v4

    - name: Determine release status
      id: status
      shell: bash
      env:
        # Expression values reach the script through the environment rather
        # than being expanded into the script text.
        IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
        DRY_RUN_INPUT: ${{ inputs.dry-run }}
        RELEASE_RESULT: ${{ needs.release-launcher.result }}
      run: |
        set -euo pipefail  # Fail fast

        # Scheduled releases are always production; only a manual run can be
        # a dry run.
        if [ "$IS_SCHEDULED" != "true" ] && [ "$DRY_RUN_INPUT" = "true" ]; then
          RELEASE_TYPE="Dry Run"
        else
          RELEASE_TYPE="Production"
        fi

        if [ "$RELEASE_RESULT" = "success" ]; then
          severity="info"
          status="success"
          message="✅ $RELEASE_TYPE release completed successfully for nemo-evaluator-launcher"
        else
          severity="error"
          status="failure"
          message="❌ $RELEASE_TYPE release failed for nemo-evaluator-launcher"
        fi

        # Publish the step outputs consumed by the alert step below.
        {
          echo "severity=$severity"
          echo "status=$status"
          echo "message=$message"
          echo "release_type=$RELEASE_TYPE"
        } >> "$GITHUB_OUTPUT"

    - name: Send Slack Alert
      uses: ./.github/actions/slack-alert
      with:
        token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
        channel: "D08N6TYDS79"
        severity: ${{ steps.status.outputs.severity }}
        title: "Release - nemo-evaluator-launcher (${{ steps.status.outputs.release_type }})"
        text: |
          ${{ steps.status.outputs.message }}

          Workflow: ${{ github.workflow }}
          Run: ${{ github.run_id }}
          Commit: ${{ github.event_name == 'schedule' && github.sha || inputs.release-ref || github.sha }}
          Trigger: ${{ github.event_name == 'schedule' && 'Scheduled' || 'Manual' }}
          Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
Loading