# CI Failure Monitor

# Runs scripts/ci_monitor/ci_failures_analysis.py on a schedule (or on manual
# dispatch), uploads the resulting JSON report as a build artifact, and posts
# the newest report to Slack via scripts/ci_monitor/post_ci_failures_to_slack.py.
name: CI Failure Monitor

on:
  schedule:
    - cron: '0 */12 * * *'  # Every 12 hours
  workflow_dispatch:

# One active run per ref: a newly started run cancels the one in progress.
concurrency:
  group: ci-failure-monitor-${{ github.ref }}
  cancel-in-progress: true

# Read-only scopes for the default workflow token.
permissions:
  contents: read
  actions: read

jobs:
  failure-analysis:
    # Skip the job outside sgl-project/sglang (e.g. in forks).
    # NOTE(review): the pull_request clause cannot match with the triggers
    # declared above (schedule, workflow_dispatch) - confirm whether it is
    # still wanted.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string instead of a float.
          python-version: '3.14'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests slack_sdk

      - name: Run Failure Analysis
        env:
          GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
          # Not referenced on the command line below; presumably read from the
          # environment by the script - TODO confirm.
          GH_PAT_FOR_RUNNER_ADMIN: ${{ secrets.GH_PAT_FOR_RUNNER_ADMIN }}
          PYTHONUNBUFFERED: '1'
          PYTHONIOENCODING: utf-8
        # The report file name carries a timestamp; the later steps locate it
        # with the ci_failure_analysis_*.json glob.
        run: |
          cd scripts/ci_monitor
          python ci_failures_analysis.py \
            --token "$GITHUB_TOKEN" \
            --limit 100 \
            --output "ci_failure_analysis_$(date +%Y%m%d_%H%M%S).json"

      # No `if:` here, so this step is skipped when an earlier step fails.
      - name: Upload Analysis Results
        uses: actions/upload-artifact@v4
        with:
          name: ci-failure-analysis-${{ github.run_number }}
          path: |
            scripts/ci_monitor/ci_failure_analysis_*.json
          retention-days: 7

      # Runs even when earlier steps failed; the script exits cleanly when no
      # report file exists or when the Slack token secret is not set.
      - name: Send Slack Notification
        if: always()
        env:
          SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
        run: |
          cd scripts/ci_monitor
          LATEST_REPORT=$(ls -t ci_failure_analysis_*.json | head -1)
          if [ ! -f "$LATEST_REPORT" ]; then
            echo "No report found, so skipping Slack notification"
            exit 0
          fi
          if [ -n "$SGLANG_DIFFUSION_SLACK_TOKEN" ]; then
            python3 post_ci_failures_to_slack.py --report-file "$LATEST_REPORT"
          else
            echo "SGLANG_DIFFUSION_SLACK_TOKEN not configured, skipping notification"
          fi