NVIDIA-NeMo · agronskiy · Dec 9, 2025 · Dec 9, 2025
@@ -0,0 +1,117 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Composite action that posts one alert to Slack through the alerting service.
+# It downloads the service's alert.sh helper at run time and invokes it with
+# the inputs below; the step fails if the token is empty, the endpoint is
+# unreachable, the download fails, or the helper exits non-zero.
+name: "Slack Alert"
+description: "Send alerts to Slack using the alerting service"
+
+inputs:
+  token:
+    description: "Slack alerting token"
+    required: true
+  channel:
+    description: "Slack channel ID"
+    # Optional: callers that omit it get the default below.
+    required: false
+    default: "D08N6TYDS79"
+  severity:
+    description: "Alert severity (info, error, silent)"
+    required: true
+  title:
+    description: "Alert title"
+    required: true
+  text:
+    description: "Alert message text"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Send Slack Alert
+      shell: bash
+      # Inputs are passed through the environment rather than expanded into
+      # the script text with expression syntax. Expressions are substituted
+      # before bash parses the script, so a title or message containing
+      # quotes, $(...) or backticks (e.g. derived from a branch name) would
+      # otherwise be executed as shell code.
+      env:
+        SLACK_FRONTIER_EVAL_ALERTING_TOKEN: ${{ inputs.token }}
+        SLACK_ALERT_CHANNEL: ${{ inputs.channel }}
+        SLACK_ALERT_SEVERITY: ${{ inputs.severity }}
+        SLACK_ALERT_TITLE: ${{ inputs.title }}
+        SLACK_ALERT_TEXT: ${{ inputs.text }}
+      run: |
+        set -euo pipefail  # Fail fast on any error
+
+        ALERT_ENDPOINT="https://alerting-frontier-evals.nvidia.com/alert"
+        ALERT_SCRIPT_URL="https://gitlab-master.nvidia.com/dl/JoC/competitive_evaluation/nvpark-k8s-infra/-/raw/main/alerting-service-helm/alert.sh"
+
+        # Fail fast if token is missing
+        if [ -z "$SLACK_FRONTIER_EVAL_ALERTING_TOKEN" ]; then
+          echo "Error: SLACK_FRONTIER_EVAL_ALERTING_TOKEN is not set"
+          exit 1
+        fi
+
+        # Check if alerting endpoint is reachable: try a HEAD request first and
+        # fall back to a plain GET. Any HTTP response counts as reachable; only
+        # a transport failure or a timeout aborts the step.
+        echo "Checking connectivity to alerting endpoint..."
+        if ! curl -sL --max-time 10 --head "$ALERT_ENDPOINT" > /dev/null 2>&1; then
+          echo "Warning: Alerting endpoint $ALERT_ENDPOINT is not reachable"
+          echo "Attempting to ping endpoint..."
+          if ! curl -sL --max-time 10 "$ALERT_ENDPOINT" > /dev/null 2>&1; then
+            echo "Error: Cannot reach alerting endpoint"
+            exit 1
+          fi
+        fi
+        echo "Alerting endpoint is reachable"
+
+        # Download the alert script to a temp file that is removed on every
+        # exit path, including the failure exits below.
+        ALERT_SCRIPT=$(mktemp)
+        trap 'rm -f "$ALERT_SCRIPT"' EXIT
+
+        # NOTE(review): this executes whatever is on the remote `main` branch
+        # with the alerting token in hand; consider pinning to a commit SHA.
+        # --fail turns HTTP errors (404/5xx) into a non-zero exit, so an error
+        # page is never saved and run as if it were the script.
+        if ! curl -fsSL -o "$ALERT_SCRIPT" "$ALERT_SCRIPT_URL"; then
+          echo "Error: Failed to download alert script"
+          exit 1
+        fi
+
+        # An empty or unparsable download (for example an HTML page served
+        # after a redirect) must not be executed.
+        if [ ! -s "$ALERT_SCRIPT" ] || ! bash -n "$ALERT_SCRIPT"; then
+          echo "Error: Downloaded alert script is empty or not valid bash"
+          exit 1
+        fi
+
+        # Run the alert script; `|| ...` records its exit code without tripping set -e.
+        ALERT_EXIT_CODE=0
+        bash "$ALERT_SCRIPT" \
+          --token "$SLACK_FRONTIER_EVAL_ALERTING_TOKEN" \
+          --channel "$SLACK_ALERT_CHANNEL" \
+          --severity "$SLACK_ALERT_SEVERITY" \
+          --title "$SLACK_ALERT_TITLE" \
+          --text "$SLACK_ALERT_TEXT" || ALERT_EXIT_CODE=$?
+
+        # Check if the alert script failed
+        if [ "$ALERT_EXIT_CODE" -ne 0 ]; then
+          echo "Error: Alert script failed with exit code $ALERT_EXIT_CODE"
+          exit "$ALERT_EXIT_CODE"
+        fi
@@ -54,6 +54,28 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+
+      # Test alert fired at the start of every run to exercise the Slack
+      # alerting action. NOTE(review): looks like rollout scaffolding - remove
+      # once the alerting path is verified.
+      - name: Send test alert
+        # A notification problem (alerting service outage, or an empty token on
+        # runs that do not receive secrets) must not fail the CI job.
+        continue-on-error: true
+        uses: ./.github/actions/slack-alert
+        with:
+          token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
+          channel: "D08N6TYDS79"
+          severity: "info"
+          title: "CI Pipeline Test Alert - ${{ github.ref_name }}"
+          text: |
+            🧪 TEST ALERT: This is a test alert sent at the beginning of CI pipeline
+
+            Workflow: ${{ github.workflow }}
+            Run: ${{ github.run_id }}
+            Branch: ${{ github.ref_name }}
+            Commit: ${{ github.sha }}
+            Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -200,9 +216,6 @@ jobs:
     runs-on: ubuntu-latest
     permissions: write-all
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
       - name: Get workflow result
         id: result
         shell: bash -x -e -u -o pipefail {0}
@@ -305,3 +318,66 @@ jobs:
           path: |
             .coverage
           include-hidden-files: true
+
+  # Posts the overall CI result to Slack for runs on main.
+  send-ci-alert:
+    runs-on: ubuntu-latest
+    needs:
+      - Nemo_CICD_Test
+      - Coverage
+      - Coverage_Fake
+    # Runs even when a needed job failed or was skipped, but not when the
+    # workflow was cancelled.
+    if: |
+      github.ref == 'refs/heads/main'
+      && always()
+      && !cancelled()
+    steps:
+      # Needed so the repository-local ./.github/actions/slack-alert resolves.
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Determine workflow status
+        id: status
+        shell: bash
+        # Context values are passed as environment variables so they are never
+        # expanded into the script text.
+        env:
+          MAIN_TEST_RESULT: ${{ needs.Nemo_CICD_Test.result }}
+          COVERAGE_RESULT: ${{ needs.Coverage.result }}
+          COVERAGE_FAKE_RESULT: ${{ needs.Coverage_Fake.result }}
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          set -euo pipefail  # Fail fast
+
+          # The main test job must succeed. Coverage jobs might be skipped, so
+          # we only care if they failed (not if skipped).
+          if [ "$MAIN_TEST_RESULT" == "success" ] && [ "$COVERAGE_RESULT" != "failure" ] && [ "$COVERAGE_FAKE_RESULT" != "failure" ]; then
+            {
+              echo "severity=info"
+              echo "status=success"
+              echo "message=✅ CI pipeline completed successfully on $REF_NAME"
+            } >> "$GITHUB_OUTPUT"
+          else
+            {
+              echo "severity=error"
+              echo "status=failure"
+              echo "message=❌ CI pipeline failed on $REF_NAME"
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Send Slack Alert
+        uses: ./.github/actions/slack-alert
+        with:
+          token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
+          channel: "D08N6TYDS79"
+          severity: ${{ steps.status.outputs.severity }}
+          title: "CI Pipeline - ${{ github.ref_name }}"
+          text: |
+            ${{ steps.status.outputs.message }}
+
+            Workflow: ${{ github.workflow }}
+            Run: ${{ github.run_id }}
+            Commit: ${{ github.sha }}
+            Author: ${{ github.actor }}
+            Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -96,6 +96,66 @@ jobs:
       SSH_PWD: ${{ secrets.SSH_PWD }}
       BOT_KEY: ${{ secrets.BOT_KEY }}
 
+  # Reports the outcome of the `release` job to Slack.
+  send-release-alert:
+    runs-on: ubuntu-latest
+    needs: [release]
+    # Run after `release` whether it passed or failed, but not when the
+    # workflow was cancelled or the job was skipped: a skipped job did not
+    # attempt a release and must not be reported as a failed one.
+    if: always() && !cancelled() && needs.release.result != 'skipped'
+    steps:
+      # Needed so the repository-local ./.github/actions/slack-alert resolves.
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Determine release status
+        id: status
+        shell: bash
+        env:
+          IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
+          DRY_RUN_INPUT: ${{ inputs.dry-run }}
+          RELEASE_RESULT: ${{ needs.release.result }}
+        run: |
+          set -euo pipefail  # Fail fast
+
+          # Determine if this is a dry run: scheduled releases are production, manual can be dry-run
+          RELEASE_TYPE="Production"
+          if [ "$IS_SCHEDULED" != "true" ] && [ "$DRY_RUN_INPUT" == "true" ]; then
+            RELEASE_TYPE="Dry Run"
+          fi
+
+          if [ "$RELEASE_RESULT" == "success" ]; then
+            {
+              echo "severity=info"
+              echo "status=success"
+              echo "message=✅ $RELEASE_TYPE release completed successfully for nemo-evaluator"
+            } >> "$GITHUB_OUTPUT"
+          else
+            {
+              echo "severity=error"
+              echo "status=failure"
+              echo "message=❌ $RELEASE_TYPE release failed for nemo-evaluator"
+            } >> "$GITHUB_OUTPUT"
+          fi
+          echo "release_type=$RELEASE_TYPE" >> "$GITHUB_OUTPUT"
+
+      - name: Send Slack Alert
+        uses: ./.github/actions/slack-alert
+        with:
+          token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
+          channel: "D08N6TYDS79"
+          severity: ${{ steps.status.outputs.severity }}
+          title: "Release - nemo-evaluator (${{ steps.status.outputs.release_type }})"
+          text: |
+            ${{ steps.status.outputs.message }}
+
+            Workflow: ${{ github.workflow }}
+            Run: ${{ github.run_id }}
+            Commit: ${{ github.event_name == 'schedule' && github.sha || inputs.release-ref || github.sha }}
+            Trigger: ${{ github.event_name == 'schedule' && 'Scheduled' || 'Manual' }}
+            Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
   release-launcher:
     if: ${{ github.event_name == 'schedule' || inputs.component == 'nemo-evaluator-launcher' }}
     needs: validate-inputs
@@ -123,3 +178,63 @@ jobs:
       SSH_KEY: ${{ secrets.SSH_KEY }}
       SSH_PWD: ${{ secrets.SSH_PWD }}
       BOT_KEY: ${{ secrets.BOT_KEY }}
+
+  # Reports the outcome of the `release-launcher` job to Slack.
+  send-release-launcher-alert:
+    runs-on: ubuntu-latest
+    needs: [release-launcher]
+    # `release-launcher` is skipped when a manual run targets another
+    # component. A skipped job did not attempt a release, so it must not be
+    # reported as a failed one; cancelled runs are not reported either.
+    if: always() && !cancelled() && needs.release-launcher.result != 'skipped'
+    steps:
+      # Needed so the repository-local ./.github/actions/slack-alert resolves.
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Determine release status
+        id: status
+        shell: bash
+        env:
+          IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
+          DRY_RUN_INPUT: ${{ inputs.dry-run }}
+          RELEASE_RESULT: ${{ needs.release-launcher.result }}
+        run: |
+          set -euo pipefail  # Fail fast
+
+          # Determine if this is a dry run: scheduled releases are production, manual can be dry-run
+          RELEASE_TYPE="Production"
+          if [ "$IS_SCHEDULED" != "true" ] && [ "$DRY_RUN_INPUT" == "true" ]; then
+            RELEASE_TYPE="Dry Run"
+          fi
+
+          if [ "$RELEASE_RESULT" == "success" ]; then
+            {
+              echo "severity=info"
+              echo "status=success"
+              echo "message=✅ $RELEASE_TYPE release completed successfully for nemo-evaluator-launcher"
+            } >> "$GITHUB_OUTPUT"
+          else
+            {
+              echo "severity=error"
+              echo "status=failure"
+              echo "message=❌ $RELEASE_TYPE release failed for nemo-evaluator-launcher"
+            } >> "$GITHUB_OUTPUT"
+          fi
+          echo "release_type=$RELEASE_TYPE" >> "$GITHUB_OUTPUT"
+
+      - name: Send Slack Alert
+        uses: ./.github/actions/slack-alert
+        with:
+          token: ${{ secrets.SLACK_FRONTIER_EVAL_ALERTING_TOKEN }}
+          channel: "D08N6TYDS79"
+          severity: ${{ steps.status.outputs.severity }}
+          title: "Release - nemo-evaluator-launcher (${{ steps.status.outputs.release_type }})"
+          text: |
+            ${{ steps.status.outputs.message }}
+
+            Workflow: ${{ github.workflow }}
+            Run: ${{ github.run_id }}
+            Commit: ${{ github.event_name == 'schedule' && github.sha || inputs.release-ref || github.sha }}
+            Trigger: ${{ github.event_name == 'schedule' && 'Scheduled' || 'Manual' }}
+            Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}