1
1
import collections
2
2
import datetime
3
+ import dateutil
3
4
import github
5
+ import json
4
6
import logging
5
7
import os
6
8
import requests
53
55
# by trial and error).
54
56
GRAFANA_METRIC_MAX_AGE_MN = 120
55
57
58
# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
# the metric name in Grafana. Keeping the Grafana metric name stable is
# important: it preserves the metrics history even if the BuildKite job is
# ever renamed.
BUILDKITE_WORKFLOW_TO_TRACK: dict[str, str] = {
    ":linux: Linux x64": "buildkite_linux",
    ":windows: Windows x64": "buildkite_windows",
}

# Number of builds to fetch per page. Since we scrape regularly, this can
# remain small.
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE: int = 50
69
+
56
70
@dataclass
57
71
class JobMetrics :
58
72
job_name : str
@@ -70,6 +84,191 @@ class GaugeMetric:
70
84
time_ns : int
71
85
72
86
87
def buildkite_fetch_page_build_list(
    buildkite_token: str, after_cursor: str = None
) -> list[dict[str, str]]:
    """Fetches a page of the build list using the GraphQL BuildKite API.

    Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE most recent
    not-yet-finished builds (creating/running/failing/canceling), or the
    BUILDKITE_GRAPHQL_BUILDS_PER_PAGE such builds older than the one pointed
    to by |after_cursor| if provided.
    The |after_cursor| value is taken from the previous page returned by the
    API.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      after_cursor: cursor after which to start the page fetch.

    Returns:
      The most recent builds after cursor (if set) with the following format:
      [
        {
          "cursor": <value>,
          "number": <build-number>,
        }
      ]
      Returns an empty list if the GraphQL request reported errors.
    """

    BUILDKITE_GRAPHQL_QUERY = """
    query OrganizationShowQuery {{
      organization(slug: "llvm-project") {{
        pipelines(search: "Github pull requests", first: 1) {{
          edges {{
            node {{
              builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
                edges {{
                  cursor
                  node {{
                    number
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = BUILDKITE_GRAPHQL_QUERY.format(
        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
    )
    query = json.dumps({"query": query})
    url = "https://graphql.buildkite.com/v1"
    headers = {
        "Authorization": "Bearer " + buildkite_token,
        "Content-Type": "application/json",
    }
    # requests' default is to wait forever; a hung connection would stall the
    # whole scraping loop, so cap the request time.
    data = requests.post(url, data=query, headers=headers, timeout=60).json()
    # De-nest the build list.
    if "errors" in data:
        logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
        return []
    builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
        "edges"
    ]
    # Fold cursor info into the node dictionary.
    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
152
+
153
+
154
def buildkite_get_build_info(build_number: str) -> dict:
    """Returns all the info associated with the provided build number.

    Note: for unknown reasons, graphql returns no jobs for a given build,
    while this endpoint does, hence why this uses this API instead of graphql.

    Args:
      build_number: which build number to fetch info for.

    Returns:
      The info for the target build, a JSON dictionary.
    """

    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
    # Cap the request time: requests' default is to wait forever, which would
    # stall the scraping loop on a hung connection.
    return requests.get(URL.format(build_number), timeout=60).json()
169
+
170
+
171
def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
    """Returns all the running/pending BuildKite builds.

    Pages through the GraphQL build list until an empty page is returned,
    accumulating every not-yet-finished build.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.

    Returns:
      The list of build descriptors (dicts with "number" and "cursor" keys)
      for every running/pending build.
    """
    output = []
    cursor = None
    while True:
        page = buildkite_fetch_page_build_list(buildkite_token, cursor)
        if not page:
            break
        # The cursor of the last entry is where the next page starts.
        cursor = page[-1]["cursor"]
        output += page
    return output
187
+
188
+
189
def buildkite_get_metrics(
    buildkite_token: str, previously_incomplete: set[int]
) -> (list[JobMetrics], set[int]):
    """Returns a tuple with:

    - the metrics recorded for newly completed workflow jobs.
    - the set of workflows that are still running now.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      previously_incomplete: the set of running workflows the last time this
        function was called.
    """

    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
    incomplete_now = set([x["number"] for x in running_builds])
    output = []

    for build_id in previously_incomplete:
        # Still running: it will be processed on a later iteration, once
        # it has completed.
        if build_id in incomplete_now:
            continue

        info = buildkite_get_build_info(build_id)
        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
        for job in info["jobs"]:
            # This workflow is not interesting to us.
            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
                continue

            # Lifecycle timestamps can be missing (e.g. a job canceled before
            # being scheduled/started); fall back to the previous stage so the
            # computed durations degrade to zero instead of crashing.
            created_at = dateutil.parser.isoparse(job["created_at"])
            scheduled_at = (
                created_at
                if job["scheduled_at"] is None
                else dateutil.parser.isoparse(job["scheduled_at"])
            )
            started_at = (
                scheduled_at
                if job["started_at"] is None
                else dateutil.parser.isoparse(job["started_at"])
            )
            if job["canceled_at"] is None:
                finished_at = (
                    started_at
                    if job["finished_at"] is None
                    else dateutil.parser.isoparse(job["finished_at"])
                )
            else:
                finished_at = dateutil.parser.isoparse(job["canceled_at"])

            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
            # Use total_seconds() rather than timedelta.seconds: .seconds
            # drops the days component, so any duration of 24h or more (or a
            # negative delta) would be silently wrong.
            queue_time = int((started_at - scheduled_at).total_seconds())
            run_time = int((finished_at - started_at).total_seconds())
            status = bool(job["passed"])

            # Grafana will refuse to ingest metrics older than ~2 hours, so we
            # should avoid sending historical data.
            metric_age_mn = (
                datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
            ).total_seconds() / 60
            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
                logging.warning(
                    f"Job {job['name']} from workflow {build_id} dropped due"
                    + f" to staleness: {metric_age_mn} mn old."
                )
                continue

            metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
            workflow_id = build_id
            workflow_name = "Github pull requests"
            output.append(
                JobMetrics(
                    job_name,
                    queue_time,
                    run_time,
                    status,
                    metric_timestamp_ns,
                    workflow_id,
                    workflow_name,
                )
            )

    return output, incomplete_now
271
+
73
272
def github_get_metrics (
74
273
github_repo : github .Repository , last_workflows_seen_as_completed : set [int ]
75
274
) -> tuple [list [JobMetrics ], int ]:
@@ -195,7 +394,7 @@ def github_get_metrics(
195
394
datetime .datetime .now (datetime .timezone .utc ) - completed_at
196
395
).total_seconds () / 60
197
396
if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN :
198
- logging .info (
397
+ logging .warning (
199
398
f"Job { job .id } from workflow { task .id } dropped due"
200
399
+ f" to staleness: { metric_age_mn } mn old."
201
400
)
@@ -292,23 +491,33 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
292
491
def main ():
293
492
# Authenticate with Github
294
493
github_auth = Auth .Token (os .environ ["GITHUB_TOKEN" ])
494
+ buildkite_token = os .environ ["BUILDKITE_TOKEN" ]
295
495
grafana_api_key = os .environ ["GRAFANA_API_KEY" ]
296
496
grafana_metrics_userid = os .environ ["GRAFANA_METRICS_USERID" ]
297
497
298
498
# The last workflow this script processed.
299
499
# Because the Github queries are broken, we'll simply log a 'processed'
300
500
# bit for the last COUNT_TO_PROCESS workflows.
301
501
gh_last_workflows_seen_as_completed = set ()
502
+ # Stores the list of pending/running builds in BuildKite we need to check
503
+ # at the next iteration.
504
+ bk_incomplete = set ()
302
505
303
506
# Enter the main loop. Every five minutes we wake up and dump metrics for
304
507
# the relevant jobs.
305
508
while True :
306
509
github_object = Github (auth = github_auth )
307
510
github_repo = github_object .get_repo ("llvm/llvm-project" )
308
511
309
- metrics , gh_last_workflows_seen_as_completed = github_get_metrics (
512
+ gh_metrics , gh_last_workflows_seen_as_completed = github_get_metrics (
310
513
github_repo , gh_last_workflows_seen_as_completed
311
514
)
515
+
516
+ bk_metrics , bk_incomplete = buildkite_get_metrics (
517
+ buildkite_token , bk_incomplete
518
+ )
519
+
520
+ metrics = gh_metrics + bk_metrics
312
521
upload_metrics (metrics , grafana_metrics_userid , grafana_api_key )
313
522
logging .info (f"Uploaded { len (metrics )} metrics" )
314
523
0 commit comments