[AMD] add amd ci monitor #3
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: AMD CI Job Monitor

# Triggers:
#   - schedule:          once a day via cron.
#   - pull_request:      only when this workflow file or the query script changes.
#   - workflow_dispatch: manual run with an optional time window and job filter.
on:
  schedule:
    # Daily at midnight UTC
    - cron: '0 0 * * *'
  pull_request:
    paths:
      - '.github/workflows/amd-ci-job-monitor.yml'
      - 'scripts/ci/query_job_status.py'
  workflow_dispatch:
    inputs:
      # Passed to the report script as its --hours argument.
      hours:
        description: 'Time window in hours'
        required: false
        default: '24'
        type: string
      # When non-empty, only the custom-report job runs; when empty, the
      # runner-status and all-reports jobs run instead.
      job_filter:
        description: 'Job name filter (leave empty for all AMD jobs)'
        required: false
        type: string
jobs:
  # AMD runner status report.
  #
  # Writes a Markdown report to the step summary. It first tries the
  # repository runners API (requires admin access); if that call fails it
  # falls back to listing the runners seen on currently in-progress AMD
  # workflow runs. Skipped when a job filter is supplied.
  runner-status:
    name: Report - AMD CI Runners
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Query AMD Runner Status
        run: |
          {
            echo "# AMD CI Runner Status Report"
            echo ""
            echo "**Generated:** $(date -u '+%Y-%m-%d %H:%M:%S') UTC"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

          # Try to get runners (requires admin access). All pages are fetched
          # exactly once as a stream of runner objects; the table and the
          # counts below are derived from this single snapshot.
          if RUNNERS=$(gh api "repos/$GITHUB_REPOSITORY/actions/runners" --paginate --jq '.runners[]' 2>/dev/null); then
            # Keep runners that have at least one AMD-looking label. any()
            # yields each runner once even when several labels match, so rows
            # and counts are not duplicated. jq -s merges the per-page output
            # into one array so the counts are single numbers.
            AMD_RUNNERS=$(jq -s '[.[] | select(any(.labels[]; .name | test("amd|mi3"; "i")))]' <<< "$RUNNERS")

            ONLINE=$(jq '[.[] | select(.status == "online")] | length' <<< "$AMD_RUNNERS")
            BUSY=$(jq '[.[] | select(.busy == true)] | length' <<< "$AMD_RUNNERS")
            IDLE=$((ONLINE - BUSY))

            {
              echo "## Runner Status"
              echo ""
              echo "| Runner Name | Status | Busy | Labels |"
              echo "|-------------|--------|------|--------|"
              jq -r '.[] | "| \(.name) | \(.status) | \(.busy) | \([.labels[].name] | join(", ")) |"' <<< "$AMD_RUNNERS"
              echo ""
              # Summary counts
              echo "## Summary"
              echo ""
              echo "| Metric | Count |"
              echo "|--------|-------|"
              echo "| Online | $ONLINE |"
              echo "| Busy | $BUSY |"
              echo "| Idle | $IDLE |"
            } >> "$GITHUB_STEP_SUMMARY"
          else
            # Fallback: get runner info from in-progress AMD workflow runs.
            {
              echo "> **Note:** Cannot access runners API (requires admin permission). Showing runner info from recent jobs instead."
              echo ""
              # The query below filters on status=in_progress, not on a time
              # window, so the heading says exactly that.
              echo "## Runners Observed in In-Progress AMD Runs"
              echo ""
              echo "| Runner Name | Last Job | Status | Started |"
              echo "|-------------|----------|--------|---------|"
              gh api "repos/$GITHUB_REPOSITORY/actions/runs?status=in_progress&per_page=50" \
                --jq '.workflow_runs[] | select(.name | test("AMD"; "i")) | .id' |
                while read -r run_id; do
                  # per_page=100 so large matrix runs are not cut off at the
                  # default page size; "|| true" keeps one failed lookup from
                  # ending the loop early under bash -e.
                  gh api "repos/$GITHUB_REPOSITORY/actions/runs/$run_id/jobs?per_page=100" \
                    --jq '.jobs[] | select(.runner_name != null) | select(.runner_name | test("mi3"; "i")) | "| \(.runner_name) | \(.name | .[0:50]) | \(.status) | \(.started_at | .[0:16]) |"' \
                    2>/dev/null || true
                done | sort -u
            } >> "$GITHUB_STEP_SUMMARY"
          fi
# ---- custom-report: single job filter mode ----
  # Runs only when the workflow_dispatch input job_filter is non-empty and
  # produces one report for the job name given in that input.
  custom-report:
    name: Custom Job Report
    if: ${{ inputs.job_filter }}
    runs-on: ubuntu-latest
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Generate Custom Job Report
        timeout-minutes: 30
        env:
          # User-supplied inputs are passed through the environment rather
          # than expanded by ${{ }} inside the script, so shell metacharacters
          # in them cannot be executed as commands.
          REPORT_JOB_FILTER: ${{ inputs.job_filter }}
          REPORT_HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/query_job_status.py \
            --repo "$GITHUB_REPOSITORY" \
            --job "$REPORT_JOB_FILTER" \
            --workflow "pr-test-amd.yml" \
            --hours "$REPORT_HOURS" \
            --summary
# ---- all-reports: one report per AMD job, run in parallel via a matrix ----
  # Runs when no job filter is supplied. Each matrix entry is passed to the
  # report script as its --job argument.
  all-reports:
    name: Report - ${{ matrix.job_name }}
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining reports finish even if one of them fails.
      fail-fast: false
      max-parallel: 14
      matrix:
        job_name:
          - sgl-kernel-unit-test-amd
          - stage-a-test-1-amd
          - stage-b-test-small-1-gpu-amd
          - stage-b-test-small-1-gpu-amd-mi35x
          - stage-b-test-large-2-gpu-amd
          - multimodal-gen-test-1-gpu-amd
          - multimodal-gen-test-2-gpu-amd
          - stage-c-test-large-8-gpu-amd
          - stage-c-test-large-8-gpu-amd-mi35x
          - stage-b-test-small-1-gpu-performance-amd
          - stage-b-test-large-1-gpu-performance-amd
          - stage-b-test-large-2-gpu-performance-amd
          - stage-b-test-small-1-gpu-accuracy-amd
          - stage-b-test-large-2-gpu-accuracy-amd
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Generate Report
        timeout-minutes: 15
        env:
          # The hours input is user-supplied on manual runs; pass it (and the
          # matrix job name) through the environment instead of expanding
          # ${{ }} inside the script, so the values are never parsed as shell.
          REPORT_JOB_NAME: ${{ matrix.job_name }}
          REPORT_HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/query_job_status.py \
            --repo "$GITHUB_REPOSITORY" \
            --job "$REPORT_JOB_NAME" \
            --workflow "pr-test-amd.yml" \
            --hours "$REPORT_HOURS" \
            --summary