Skip to content

Commit e215044

Browse files
committed
add runner usage
1 parent bcf88a7 commit e215044

File tree

2 files changed

+191
-18
lines changed

2 files changed

+191
-18
lines changed

.github/workflows/amd-ci-job-monitor.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,34 @@ jobs:
4949
--hours ${{ inputs.hours || '24' }} \
5050
--summary
5151
52+
# AMD Runner Usage - catches ALL jobs using AMD runners (including unmerged new jobs)
53+
amd-runner-usage:
54+
name: AMD Runner Usage
55+
if: ${{ !inputs.job_filter }}
56+
runs-on: ubuntu-latest
57+
env:
58+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
59+
steps:
60+
- name: Checkout code
61+
uses: actions/checkout@v4
62+
63+
- name: Set up Python
64+
uses: actions/setup-python@v5
65+
with:
66+
python-version: '3.10'
67+
68+
- name: Install dependencies
69+
run: pip install tabulate
70+
71+
- name: Generate AMD Runner Usage Report
72+
timeout-minutes: 30
73+
run: |
74+
python scripts/ci/query_job_status.py \
75+
--repo ${{ github.repository }} \
76+
--runner-filter "mi325,mi35x" \
77+
--hours ${{ inputs.hours || '24' }} \
78+
--summary
79+
5280
# Parse workflow files to get job names dynamically
5381
parse-workflows:
5482
name: Parse Workflow Jobs

scripts/ci/query_job_status.py

Lines changed: 163 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,7 @@ def run_gh_command(args: list[str]) -> dict:
6969
text=True,
7070
)
7171
except FileNotFoundError:
72-
raise Exception(
73-
"gh CLI not found. Please install from https://cli.github.com/"
74-
)
72+
raise Exception("gh CLI not found. Please install from https://cli.github.com/")
7573

7674
if result.returncode != 0:
7775
raise Exception(f"gh api failed: {result.stderr}")
@@ -152,15 +150,57 @@ def get_pr_number_from_run(run: dict) -> Optional[int]:
152150
return None
153151

154152

153+
def matches_runner_filter(job: dict, runner_filters: list[str]) -> bool:
    """Check if a job matches any of the runner filters (fuzzy match on labels or runner_name)."""
    # An empty filter list means "match everything".
    if not runner_filters:
        return True

    # Lowercase the filter terms once; matching is case-insensitive substring.
    needles = [rf.lower() for rf in runner_filters]

    # Candidates: the requested runner labels first, then the actual
    # assigned runner name (may be absent/None for unscheduled jobs).
    candidates = list(job.get("labels", []))
    candidates.append(job.get("runner_name") or "")

    return any(
        needle in candidate.lower()
        for candidate in candidates
        for needle in needles
    )
172+
173+
174+
def get_runner_label_for_job(job: dict, runner_filters: list[str]) -> str:
    """Get the matching runner label for a job (for grouping)."""
    # Case-insensitive substring match against each filter term.
    needles = [rf.lower() for rf in runner_filters]

    # Prefer the first requested label that matches one of the filters.
    for label in job.get("labels", []):
        lowered = label.lower()
        if any(needle in lowered for needle in needles):
            return label

    # No label matched: fall back to the assigned runner's name,
    # or "-" when no runner was ever assigned.
    return job.get("runner_name") or "-"
186+
187+
155188
def query_jobs(
156189
repo: str,
157190
job_filter: str,
158191
workflow: str = None,
159192
hours: int = 24,
160193
status_filter: str = None,
194+
runner_filter: str = None,
161195
) -> list[dict]:
162196
"""Query jobs matching the filter."""
163197

198+
# Parse runner filter into list
199+
runner_filters = []
200+
if runner_filter:
201+
runner_filters = [rf.strip() for rf in runner_filter.split(",") if rf.strip()]
202+
print(f"Runner filter: {runner_filters}", file=sys.stderr)
203+
164204
print(f"Fetching workflow runs from last {hours} hours...", file=sys.stderr)
165205
runs = get_workflow_runs(repo, workflow, hours)
166206
print(f"Found {len(runs)} workflow runs", file=sys.stderr)
@@ -188,8 +228,12 @@ def query_jobs(
188228
for job in jobs:
189229
job_name = job.get("name", "")
190230

191-
# Filter by job name
192-
if job_filter.lower() not in job_name.lower():
231+
# Filter by job name (if provided)
232+
if job_filter and job_filter.lower() not in job_name.lower():
233+
continue
234+
235+
# Filter by runner (if provided)
236+
if runner_filters and not matches_runner_filter(job, runner_filters):
193237
continue
194238

195239
# Filter by status if specified
@@ -199,6 +243,11 @@ def query_jobs(
199243
job_status = job.get("status", "unknown")
200244
runner_name = job.get("runner_name") or "-"
201245

246+
# Get runner label for grouping
247+
runner_label = (
248+
get_runner_label_for_job(job, runner_filters) if runner_filters else "-"
249+
)
250+
202251
# Detect stuck/ghost jobs:
203252
# - Job is in_progress but no runner assigned
204253
# - Job is in_progress but workflow run is cancelled/completed
@@ -221,6 +270,7 @@ def query_jobs(
221270
"started_at": job.get("started_at", ""),
222271
"completed_at": job.get("completed_at", ""),
223272
"runner_name": runner_name,
273+
"runner_label": runner_label,
224274
"run_id": run["id"],
225275
"run_status": run_status,
226276
"run_conclusion": run_conclusion,
@@ -313,17 +363,21 @@ def calculate_queue_time(
313363

314364

315365
def process_results(
316-
results: list[dict], repo: str, report_time: datetime = None
366+
results: list[dict],
367+
repo: str,
368+
report_time: datetime = None,
369+
group_by_runner: bool = False,
317370
) -> dict:
318371
"""
319372
Process raw results into structured data for presentation.
320373
Returns a dictionary containing:
321-
- status_summary: dict of job_name -> status counts
374+
- status_summary: dict of job_name (or runner_label) -> status counts
322375
- sorted_results: list of results sorted by created_at descending
323376
- active_jobs: list of in_progress/queued/waiting jobs (excluding stuck)
324377
- stuck_jobs: list of stuck/ghost jobs
325378
- failed_jobs: list of failed jobs
326379
- processed_jobs: list of jobs with calculated fields (queue_time, duration, etc.)
380+
- runner_summary: dict of runner_label -> status counts (only when group_by_runner=True)
327381
"""
328382
if report_time is None:
329383
report_time = datetime.now(timezone.utc)
@@ -336,15 +390,21 @@ def process_results(
336390
"stuck_jobs": [],
337391
"failed_jobs": [],
338392
"processed_jobs": [],
393+
"runner_summary": {},
339394
}
340395

341396
# Group by job name for summary
342397
status_summary = {}
398+
runner_summary = {}
399+
343400
for r in results:
344401
job_name = r["job_name"]
345402
status = r["status"]
346403
conclusion = r.get("conclusion", "-")
347404
is_stuck = r.get("is_stuck", False)
405+
runner_label = r.get("runner_label", "-")
406+
407+
# Job name summary
348408
if job_name not in status_summary:
349409
status_summary[job_name] = {
350410
"in_progress": 0,
@@ -355,18 +415,42 @@ def process_results(
355415
"failure": 0,
356416
"cancelled": 0,
357417
}
418+
419+
# Runner label summary (when grouping by runner)
420+
if group_by_runner and runner_label != "-":
421+
if runner_label not in runner_summary:
422+
runner_summary[runner_label] = {
423+
"in_progress": 0,
424+
"queued": 0,
425+
"waiting": 0,
426+
"stuck": 0,
427+
"success": 0,
428+
"failure": 0,
429+
"cancelled": 0,
430+
}
431+
358432
if is_stuck:
359433
status_summary[job_name]["stuck"] += 1
434+
if group_by_runner and runner_label in runner_summary:
435+
runner_summary[runner_label]["stuck"] += 1
360436
elif status == "completed":
361437
# For completed jobs, count by conclusion
362438
if conclusion == "success":
363439
status_summary[job_name]["success"] += 1
440+
if group_by_runner and runner_label in runner_summary:
441+
runner_summary[runner_label]["success"] += 1
364442
elif conclusion == "failure":
365443
status_summary[job_name]["failure"] += 1
444+
if group_by_runner and runner_label in runner_summary:
445+
runner_summary[runner_label]["failure"] += 1
366446
elif conclusion in ("cancelled", "timed_out", "action_required"):
367447
status_summary[job_name]["cancelled"] += 1
448+
if group_by_runner and runner_label in runner_summary:
449+
runner_summary[runner_label]["cancelled"] += 1
368450
elif status in status_summary[job_name]:
369451
status_summary[job_name][status] += 1
452+
if group_by_runner and runner_label in runner_summary:
453+
runner_summary[runner_label][status] += 1
370454

371455
# Sort by created_at descending
372456
sorted_results = sorted(results, key=lambda x: x["created_at"], reverse=True)
@@ -422,6 +506,7 @@ def process_results(
422506
"stuck_jobs": stuck_jobs,
423507
"failed_jobs": failed_jobs,
424508
"processed_jobs": processed_jobs,
509+
"runner_summary": runner_summary,
425510
}
426511

427512

@@ -613,12 +698,18 @@ def format_markdown(
613698
hours: int,
614699
generated_time: str,
615700
report_time: datetime = None,
701+
runner_filter: str = None,
616702
) -> str:
617703
"""Format results as markdown for GitHub Actions summary."""
618704
lines = []
619705

620-
# Header
621-
lines.append(f"# Job Status Report: `{job_filter}`")
706+
# Header - different title for runner filter mode
707+
if runner_filter:
708+
lines.append(f"# AMD Runner Usage Report")
709+
lines.append("")
710+
lines.append(f"**Runner filter:** `{runner_filter}`")
711+
else:
712+
lines.append(f"# Job Status Report: `{job_filter}`")
622713
lines.append("")
623714
lines.append(f"**Time window:** Last {hours} hours")
624715
lines.append(f"**Generated:** {generated_time} UTC")
@@ -632,14 +723,46 @@ def format_markdown(
632723
return "\n".join(lines)
633724

634725
# Process data using shared function
635-
data = process_results(results, repo, report_time)
726+
group_by_runner = bool(runner_filter)
727+
data = process_results(results, repo, report_time, group_by_runner)
636728
status_summary = data["status_summary"]
729+
runner_summary = data.get("runner_summary", {})
637730
processed_jobs = data["processed_jobs"]
638731
active_jobs = data["active_jobs"]
639732
stuck_jobs = data["stuck_jobs"]
640733
failed_jobs = data["failed_jobs"]
641734

642-
# Summary table
735+
# Runner summary table (when using runner filter)
736+
if runner_filter and runner_summary:
737+
lines.append("## Summary by Runner Label")
738+
lines.append("")
739+
lines.append(
740+
"> **Status meanings:** Running = executing, Queued = waiting for runner, Waiting = waiting for dependent jobs, Stuck = ghost job"
741+
)
742+
lines.append("")
743+
lines.append(
744+
"| Runner Label | Running | Queued | Waiting | Stuck | Success | Failure | Cancelled |"
745+
)
746+
lines.append(
747+
"|--------------|---------|--------|---------|-------|---------|---------|-----------|"
748+
)
749+
750+
for runner_label, counts in sorted(runner_summary.items()):
751+
running = (
752+
f"**{counts['in_progress']}**" if counts["in_progress"] > 0 else "0"
753+
)
754+
queued = f"**{counts['queued']}**" if counts["queued"] > 0 else "0"
755+
waiting = f"**{counts['waiting']}**" if counts["waiting"] > 0 else "0"
756+
stuck = f"**{counts['stuck']}**" if counts["stuck"] > 0 else "0"
757+
success = str(counts["success"])
758+
failure = f"**{counts['failure']}**" if counts["failure"] > 0 else "0"
759+
cancelled = str(counts["cancelled"])
760+
lines.append(
761+
f"| `{runner_label}` | {running} | {queued} | {waiting} | {stuck} | {success} | {failure} | {cancelled} |"
762+
)
763+
lines.append("")
764+
765+
# Summary table by job name
643766
lines.append("## Summary by Job Name")
644767
lines.append("")
645768
lines.append(
@@ -804,13 +927,13 @@ def main():
804927
)
805928
parser.add_argument(
806929
"--job",
807-
required=True,
808-
help="Job name filter (e.g., 'stage-c-test-large-8-gpu-amd-mi35x')",
930+
default="",
931+
help="Job name filter (e.g., 'stage-c-test-large-8-gpu-amd-mi35x'). Optional if --runner-filter is provided.",
809932
)
810933
parser.add_argument(
811934
"--workflow",
812-
default="pr-test-amd.yml",
813-
help="Workflow file name (default: pr-test-amd.yml)",
935+
default="",
936+
help="Workflow file name (e.g., 'pr-test-amd.yml'). Empty = all workflows.",
814937
)
815938
parser.add_argument(
816939
"--hours",
@@ -823,6 +946,11 @@ def main():
823946
choices=["in_progress", "queued", "completed", "waiting"],
824947
help="Filter by job status",
825948
)
949+
parser.add_argument(
950+
"--runner-filter",
951+
type=str,
952+
help="Filter by runner label (comma-separated, fuzzy match). E.g., 'mi325,mi35x'",
953+
)
826954
parser.add_argument(
827955
"--output",
828956
choices=["table", "csv", "json", "markdown"],
@@ -841,12 +969,17 @@ def main():
841969
)
842970
args = parser.parse_args()
843971

972+
# Validate: at least one of --job or --runner-filter must be provided
973+
if not args.job and not args.runner_filter:
974+
parser.error("At least one of --job or --runner-filter must be provided")
975+
844976
results = query_jobs(
845977
args.repo,
846978
args.job,
847-
args.workflow,
979+
args.workflow if args.workflow else None,
848980
args.hours,
849981
args.status,
982+
args.runner_filter,
850983
)
851984

852985
output_content = None
@@ -884,7 +1017,13 @@ def main():
8841017
print(output_content)
8851018
elif args.output == "markdown":
8861019
output_content = format_markdown(
887-
results, args.repo, args.job, args.hours, report_generated_time, report_time
1020+
results,
1021+
args.repo,
1022+
args.job,
1023+
args.hours,
1024+
report_generated_time,
1025+
report_time,
1026+
runner_filter=args.runner_filter,
8881027
)
8891028
print(output_content)
8901029

@@ -897,7 +1036,13 @@ def main():
8971036
# Write to GITHUB_STEP_SUMMARY if requested
8981037
if args.summary:
8991038
md_content = format_markdown(
900-
results, args.repo, args.job, args.hours, report_generated_time, report_time
1039+
results,
1040+
args.repo,
1041+
args.job,
1042+
args.hours,
1043+
report_generated_time,
1044+
report_time,
1045+
runner_filter=args.runner_filter,
9011046
)
9021047
summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
9031048
if summary_file:

0 commit comments

Comments
 (0)