# AMD CI Job Monitor
#
# Source page heading: "AMD CI Job Monitor #3"

name: AMD CI Job Monitor

# Three ways to start a run:
#   - schedule:          once a day (see the cron expression below)
#   - pull_request:      only when one of the listed paths changes
#   - workflow_dispatch: manually, with an optional time window and job filter
on:
  schedule:
    # Daily at midnight UTC
    - cron: '0 0 * * *'
  pull_request:
    paths:
      - '.github/workflows/amd-ci-job-monitor.yml'
      - 'scripts/ci/utils/query_job_status.py'
  workflow_dispatch:
    inputs:
      # Falls back to 24 when the run is not started manually; every consumer
      # below reads it as `inputs.hours || '24'`.
      hours:
        description: 'Time window in hours'
        type: string
        required: false
        default: '24'
      # When non-empty, only the custom-report job runs after the snapshot is
      # fetched; the per-workflow matrix reports are skipped.
      job_filter:
        description: 'Job name filter (leave empty for all AMD jobs)'
        type: string
        required: false
jobs:
  # Queries the Actions data once and publishes it as the `actions-job-snapshot`
  # artifact; every report job downloads that artifact instead of querying
  # again.
  fetch-actions-data:
    name: Fetch Actions Snapshot
    runs-on: ubuntu-latest
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Select workflows for snapshot
        id: select-workflows
        # User-supplied inputs are handed to the script through the environment
        # rather than spliced into the script text: `${{ }}` is expanded before
        # the shell parses the script, so a value containing quotes or shell
        # syntax would otherwise be interpreted as code.
        env:
          JOB_FILTER: ${{ inputs.job_filter }}
        run: |
          # A job filter only ever targets pr-test-amd.yml (see custom-report),
          # so the snapshot can be limited to that workflow.
          if [[ -n "$JOB_FILTER" ]]; then
            echo "workflows=pr-test-amd.yml" >> "$GITHUB_OUTPUT"
          else
            echo "workflows=pr-test-amd.yml,nightly-test-amd.yml,pr-test-amd-rocm720.yml,nightly-test-amd-rocm720.yml" >> "$GITHUB_OUTPUT"
          fi
      - name: Fetch Actions data snapshot
        timeout-minutes: 30
        env:
          REPO: ${{ github.repository }}
          WORKFLOWS: ${{ steps.select-workflows.outputs.workflows }}
          # `inputs.hours` is empty on schedule / pull_request runs.
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --workflow "$WORKFLOWS" \
            --hours "$HOURS" \
            --dump-data-file actions-job-snapshot.json
      - name: Upload Actions data snapshot
        uses: actions/upload-artifact@v4
        with:
          name: actions-job-snapshot
          path: actions-job-snapshot.json
          # Downstream jobs cannot run without the snapshot, so fail here.
          if-no-files-found: error
# Single job filter mode
  # Runs only when the `job_filter` input is non-empty and reports on that one
  # job of pr-test-amd.yml, using the snapshot from fetch-actions-data.
  custom-report:
    name: Custom Job Report
    if: ${{ inputs.job_filter }}
    needs: fetch-actions-data
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Download Actions data snapshot
        uses: actions/download-artifact@v4
        with:
          name: actions-job-snapshot
          path: ci-data
      - name: Generate Custom Job Report
        timeout-minutes: 30
        # Free-form dispatch inputs go through the environment so that quotes
        # or shell metacharacters in them are treated as data, not as script.
        env:
          REPO: ${{ github.repository }}
          JOB_FILTER: ${{ inputs.job_filter }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --job "$JOB_FILTER" \
            --workflow "pr-test-amd.yml" \
            --hours "$HOURS" \
            --input-data-file ci-data/actions-job-snapshot.json \
            --summary
# Parse workflow files to get job names dynamically
  # Emits one JSON array of job ids per monitored workflow; the report jobs
  # feed these arrays into their `strategy.matrix` via fromJson().
  parse-workflows:
    name: Parse Workflow Jobs
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    outputs:
      pr_jobs: ${{ steps.parse.outputs.pr_jobs }}
      nightly_jobs: ${{ steps.parse.outputs.nightly_jobs }}
      pr_rocm720_jobs: ${{ steps.parse.outputs.pr_rocm720_jobs }}
      nightly_rocm720_jobs: ${{ steps.parse.outputs.nightly_rocm720_jobs }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Parse workflow files
        id: parse
        run: |
          # Without pipefail a failing `yq` (e.g. missing or unparsable
          # workflow file) is masked by the trailing `jq`, which then emits
          # `[]`. Fail this step instead so the cause is visible here.
          set -euo pipefail

          # Utility jobs that should not get a report of their own.
          pr_exclude='^(call-gate|check-changes|pr-test-amd-finish|cancel|check-all-jobs)$'
          nightly_exclude='^(check-all-jobs)$'

          # emit_jobs <output-name> <log-label> <workflow-file> <exclude-regex>
          # Writes the job ids of <workflow-file>, minus those matching
          # <exclude-regex>, to the step output <output-name> as a compact
          # JSON array and echoes the same array to the log.
          emit_jobs() {
            local output_name=$1 label=$2 workflow_file=$3 exclude_regex=$4
            local jobs_json
            # grep exits 1 when nothing survives the filter; that is a valid
            # (empty) result, not an error, hence `|| true`.
            jobs_json=$(
              yq -r '.jobs | keys | .[]' "$workflow_file" |
                { grep -v -E "$exclude_regex" || true; } |
                jq -R -s -c 'split("\n") | map(select(length > 0))'
            )
            echo "${output_name}=${jobs_json}" >> "$GITHUB_OUTPUT"
            echo "${label}: ${jobs_json}"
          }

          emit_jobs pr_jobs 'PR jobs' \
            .github/workflows/pr-test-amd.yml "$pr_exclude"
          emit_jobs nightly_jobs 'Nightly jobs' \
            .github/workflows/nightly-test-amd.yml "$nightly_exclude"
          emit_jobs pr_rocm720_jobs 'PR ROCm 7.2 jobs' \
            .github/workflows/pr-test-amd-rocm720.yml "$pr_exclude"
          emit_jobs nightly_rocm720_jobs 'Nightly ROCm 7.2 jobs' \
            .github/workflows/nightly-test-amd-rocm720.yml "$nightly_exclude"
# PR CI reports using dynamic matrix
  # One matrix leg per job id found in pr-test-amd.yml by parse-workflows.
  pr-ci-reports:
    name: PR - ${{ matrix.job_name }}
    needs: [parse-workflows, fetch-actions-data]
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      # A failing report for one job must not cancel the others.
      fail-fast: false
      matrix:
        job_name: ${{ fromJson(needs.parse-workflows.outputs.pr_jobs) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Download Actions data snapshot
        uses: actions/download-artifact@v4
        with:
          name: actions-job-snapshot
          path: ci-data
      - name: Generate Report
        timeout-minutes: 15
        # Expression values are passed via the environment instead of being
        # expanded into the script text, so they are never parsed as shell.
        env:
          REPO: ${{ github.repository }}
          JOB_NAME: ${{ matrix.job_name }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --job "$JOB_NAME" \
            --workflow "pr-test-amd.yml" \
            --hours "$HOURS" \
            --input-data-file ci-data/actions-job-snapshot.json \
            --summary
# Nightly AMD test reports using dynamic matrix
  # One matrix leg per job id found in nightly-test-amd.yml by parse-workflows.
  nightly-reports:
    name: Nightly - ${{ matrix.job_name }}
    needs: [parse-workflows, fetch-actions-data]
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      # A failing report for one job must not cancel the others.
      fail-fast: false
      matrix:
        job_name: ${{ fromJson(needs.parse-workflows.outputs.nightly_jobs) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Download Actions data snapshot
        uses: actions/download-artifact@v4
        with:
          name: actions-job-snapshot
          path: ci-data
      - name: Generate Nightly Report
        timeout-minutes: 15
        # Expression values are passed via the environment instead of being
        # expanded into the script text, so they are never parsed as shell.
        env:
          REPO: ${{ github.repository }}
          JOB_NAME: ${{ matrix.job_name }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --job "$JOB_NAME" \
            --workflow "nightly-test-amd.yml" \
            --hours "$HOURS" \
            --input-data-file ci-data/actions-job-snapshot.json \
            --summary
# PR ROCm 7.2 CI reports using dynamic matrix
  # One matrix leg per job id found in pr-test-amd-rocm720.yml by
  # parse-workflows.
  pr-rocm720-ci-reports:
    name: PR ROCm720 - ${{ matrix.job_name }}
    needs: [parse-workflows, fetch-actions-data]
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      # A failing report for one job must not cancel the others.
      fail-fast: false
      matrix:
        job_name: ${{ fromJson(needs.parse-workflows.outputs.pr_rocm720_jobs) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Download Actions data snapshot
        uses: actions/download-artifact@v4
        with:
          name: actions-job-snapshot
          path: ci-data
      - name: Generate PR ROCm 7.2 Report
        timeout-minutes: 15
        # Expression values are passed via the environment instead of being
        # expanded into the script text, so they are never parsed as shell.
        env:
          REPO: ${{ github.repository }}
          JOB_NAME: ${{ matrix.job_name }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --job "$JOB_NAME" \
            --workflow "pr-test-amd-rocm720.yml" \
            --hours "$HOURS" \
            --input-data-file ci-data/actions-job-snapshot.json \
            --summary
# Nightly ROCm 7.2 reports using dynamic matrix
  # One matrix leg per job id found in nightly-test-amd-rocm720.yml by
  # parse-workflows.
  nightly-rocm720-reports:
    name: Nightly ROCm720 - ${{ matrix.job_name }}
    needs: [parse-workflows, fetch-actions-data]
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      # A failing report for one job must not cancel the others.
      fail-fast: false
      matrix:
        job_name: ${{ fromJson(needs.parse-workflows.outputs.nightly_rocm720_jobs) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Download Actions data snapshot
        uses: actions/download-artifact@v4
        with:
          name: actions-job-snapshot
          path: ci-data
      - name: Generate Nightly ROCm 7.2 Report
        timeout-minutes: 15
        # Expression values are passed via the environment instead of being
        # expanded into the script text, so they are never parsed as shell.
        env:
          REPO: ${{ github.repository }}
          JOB_NAME: ${{ matrix.job_name }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --job "$JOB_NAME" \
            --workflow "nightly-test-amd-rocm720.yml" \
            --hours "$HOURS" \
            --input-data-file ci-data/actions-job-snapshot.json \
            --summary
# Runner fleet report - cross-workflow runner analytics in a single pass
  # Skipped in job-filter mode; reads the shared snapshot and passes all four
  # monitored workflows to the script in one invocation.
  runner-fleet-report:
    name: Runner Fleet Report
    if: ${{ !inputs.job_filter }}
    needs: fetch-actions-data
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: pip install tabulate
      - name: Download Actions data snapshot
        uses: actions/download-artifact@v4
        with:
          name: actions-job-snapshot
          path: ci-data
      - name: Generate Runner Fleet Report
        timeout-minutes: 30
        # The dispatch input is passed via the environment instead of being
        # expanded into the script text, so it is never parsed as shell.
        env:
          REPO: ${{ github.repository }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo "$REPO" \
            --runner-report \
            --workflow "pr-test-amd.yml,nightly-test-amd.yml,pr-test-amd-rocm720.yml,nightly-test-amd-rocm720.yml" \
            --hours "$HOURS" \
            --input-data-file ci-data/actions-job-snapshot.json \
            --summary