Skip to content

Commit 44f4e43

Browse files
authored
[CI] Extend metrics container to log BuildKite metrics (#130996)
The current container focuses on GitHub metrics. Before deprecating BuildKite, we want to make sure the new infra quality is better, or at least the same. Being able to compare BuildKite metrics with GitHub metrics on Grafana will allow us to easily present the comparison. The BuildKite API allows filtering, but doesn't allow changing the result ordering, meaning we are left with builds ordered by IDs. This means a completed job can appear before a running job in the list. 2 solutions from there: - keep the cursor on the oldest running workflow - keep a list of running workflows to compare. Because there is no guarantee in workflow ordering, waiting for the oldest build to complete before reporting any newer build could mean delaying the more recent build completion reporting by a few hours. And because Grafana cannot ingest metrics older than 2 hours, this is not an option. Thus we are left with the second solution: remember which jobs were running during the last iteration, and record them as soon as they are completed. BuildKite has at most ~100 pending jobs, so keeping all those IDs should be OK.
1 parent 0a5847f commit 44f4e43

File tree

1 file changed

+211
-2
lines changed

1 file changed

+211
-2
lines changed

.ci/metrics/metrics.py

+211-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
import collections
import datetime
import json
import logging
import os

import dateutil
# `import dateutil` alone does not guarantee the `parser` submodule is
# loaded; import it explicitly since this file calls dateutil.parser.isoparse.
import dateutil.parser
import github
import requests
@@ -53,6 +55,18 @@
5355
# by trial and error).
5456
GRAFANA_METRIC_MAX_AGE_MN = 120
5557

58+
# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
59+
# the metric name in Grafana. This is important not to lose metrics history
60+
# if the workflow name changes.
61+
BUILDKITE_WORKFLOW_TO_TRACK = {
62+
":linux: Linux x64": "buildkite_linux",
63+
":windows: Windows x64": "buildkite_windows",
64+
}
65+
66+
# Number of builds to fetch per page. Since we scrape regularly, this can
67+
# remain small.
68+
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
69+
5670
@dataclass
5771
class JobMetrics:
5872
job_name: str
@@ -70,6 +84,191 @@ class GaugeMetric:
7084
time_ns: int
7185

7286

87+
def buildkite_fetch_page_build_list(
88+
buildkite_token: str, after_cursor: str = None
89+
) -> list[dict[str, str]]:
90+
"""Fetches a page of the build list using the GraphQL BuildKite API.
91+
92+
Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
93+
or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
94+
older than the one pointer by |after_cursor| if provided.
95+
The |after_cursor| value is taken from the previous page returned by the
96+
API.
97+
98+
Args:
99+
buildkite_token: the secret token to authenticate GraphQL requests.
100+
after_cursor: cursor after which to start the page fetch.
101+
102+
Returns:
103+
The most recent builds after cursor (if set) with the following format:
104+
[
105+
{
106+
"cursor": <value>,
107+
"number": <build-number>,
108+
}
109+
]
110+
"""
111+
112+
BUILDKITE_GRAPHQL_QUERY = """
113+
query OrganizationShowQuery {{
114+
organization(slug: "llvm-project") {{
115+
pipelines(search: "Github pull requests", first: 1) {{
116+
edges {{
117+
node {{
118+
builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
119+
edges {{
120+
cursor
121+
node {{
122+
number
123+
}}
124+
}}
125+
}}
126+
}}
127+
}}
128+
}}
129+
}}
130+
}}
131+
"""
132+
query = BUILDKITE_GRAPHQL_QUERY.format(
133+
PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
134+
AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
135+
)
136+
query = json.dumps({"query": query})
137+
url = "https://graphql.buildkite.com/v1"
138+
headers = {
139+
"Authorization": "Bearer " + buildkite_token,
140+
"Content-Type": "application/json",
141+
}
142+
data = requests.post(url, data=query, headers=headers).json()
143+
# De-nest the build list.
144+
if "errors" in data:
145+
logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
146+
return []
147+
builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
148+
"edges"
149+
]
150+
# Fold cursor info into the node dictionnary.
151+
return [{**x["node"], "cursor": x["cursor"]} for x in builds]
152+
153+
154+
def buildkite_get_build_info(build_number: str) -> dict:
155+
"""Returns all the info associated with the provided build number.
156+
157+
Note: for unknown reasons, graphql returns no jobs for a given build,
158+
while this endpoint does, hence why this uses this API instead of graphql.
159+
160+
Args:
161+
build_number: which build number to fetch info for.
162+
163+
Returns:
164+
The info for the target build, a JSON dictionnary.
165+
"""
166+
167+
URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
168+
return requests.get(URL.format(build_number)).json()
169+
170+
171+
def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
172+
"""Returns all the running/pending BuildKite builds.
173+
174+
Args:
175+
buildkite_token: the secret token to authenticate GraphQL requests.
176+
last_cursor: the cursor to stop at if set. If None, a full page is fetched.
177+
"""
178+
output = []
179+
cursor = None
180+
while True:
181+
page = buildkite_fetch_page_build_list(buildkite_token, cursor)
182+
if len(page) == 0:
183+
break
184+
cursor = page[-1]["cursor"]
185+
output += page
186+
return output
187+
188+
189+
def buildkite_get_metrics(
190+
buildkite_token: str, previously_incomplete: set[int]
191+
) -> (list[JobMetrics], set[int]):
192+
"""Returns a tuple with:
193+
194+
- the metrics recorded for newly completed workflow jobs.
195+
- the set of workflow still running now.
196+
197+
Args:
198+
buildkite_token: the secret token to authenticate GraphQL requests.
199+
previously_incomplete: the set of running workflows the last time this
200+
function was called.
201+
"""
202+
203+
running_builds = buildkite_get_incomplete_tasks(buildkite_token)
204+
incomplete_now = set([x["number"] for x in running_builds])
205+
output = []
206+
207+
for build_id in previously_incomplete:
208+
if build_id in incomplete_now:
209+
continue
210+
211+
info = buildkite_get_build_info(build_id)
212+
metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
213+
for job in info["jobs"]:
214+
# This workflow is not interesting to us.
215+
if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
216+
continue
217+
218+
created_at = dateutil.parser.isoparse(job["created_at"])
219+
scheduled_at = (
220+
created_at
221+
if job["scheduled_at"] is None
222+
else dateutil.parser.isoparse(job["scheduled_at"])
223+
)
224+
started_at = (
225+
scheduled_at
226+
if job["started_at"] is None
227+
else dateutil.parser.isoparse(job["started_at"])
228+
)
229+
if job["canceled_at"] is None:
230+
finished_at = (
231+
started_at
232+
if job["finished_at"] is None
233+
else dateutil.parser.isoparse(job["finished_at"])
234+
)
235+
else:
236+
finished_at = dateutil.parser.isoparse(job["canceled_at"])
237+
238+
job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
239+
queue_time = (started_at - scheduled_at).seconds
240+
run_time = (finished_at - started_at).seconds
241+
status = bool(job["passed"])
242+
243+
# Grafana will refuse to ingest metrics older than ~2 hours, so we
244+
# should avoid sending historical data.
245+
metric_age_mn = (
246+
datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
247+
).total_seconds() / 60
248+
if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
249+
logging.warning(
250+
f"Job {job['name']} from workflow {build_id} dropped due"
251+
+ f" to staleness: {metric_age_mn}mn old."
252+
)
253+
continue
254+
255+
metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
256+
workflow_id = build_id
257+
workflow_name = "Github pull requests"
258+
output.append(
259+
JobMetrics(
260+
job_name,
261+
queue_time,
262+
run_time,
263+
status,
264+
metric_timestamp_ns,
265+
workflow_id,
266+
workflow_name,
267+
)
268+
)
269+
270+
return output, incomplete_now
271+
73272
def github_get_metrics(
74273
github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
75274
) -> tuple[list[JobMetrics], int]:
@@ -195,7 +394,7 @@ def github_get_metrics(
195394
datetime.datetime.now(datetime.timezone.utc) - completed_at
196395
).total_seconds() / 60
197396
if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
198-
logging.info(
397+
logging.warning(
199398
f"Job {job.id} from workflow {task.id} dropped due"
200399
+ f" to staleness: {metric_age_mn}mn old."
201400
)
@@ -292,23 +491,33 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
292491
def main():
293492
# Authenticate with Github
294493
github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
494+
buildkite_token = os.environ["BUILDKITE_TOKEN"]
295495
grafana_api_key = os.environ["GRAFANA_API_KEY"]
296496
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
297497

298498
# The last workflow this script processed.
299499
# Because the Github queries are broken, we'll simply log a 'processed'
300500
# bit for the last COUNT_TO_PROCESS workflows.
301501
gh_last_workflows_seen_as_completed = set()
502+
# Stores the list of pending/running builds in BuildKite we need to check
503+
# at the next iteration.
504+
bk_incomplete = set()
302505

303506
# Enter the main loop. Every five minutes we wake up and dump metrics for
304507
# the relevant jobs.
305508
while True:
306509
github_object = Github(auth=github_auth)
307510
github_repo = github_object.get_repo("llvm/llvm-project")
308511

309-
metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
512+
gh_metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
310513
github_repo, gh_last_workflows_seen_as_completed
311514
)
515+
516+
bk_metrics, bk_incomplete = buildkite_get_metrics(
517+
buildkite_token, bk_incomplete
518+
)
519+
520+
metrics = gh_metrics + bk_metrics
312521
upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
313522
logging.info(f"Uploaded {len(metrics)} metrics")
314523

0 commit comments

Comments
 (0)