Skip to content

Commit 315ceb4

Browse files
cagedmantis authored and gopherbot committed
cmd/watchflakes: detect and report broken bots
This change automates issue creation for bots requiring human intervention. When a bot enters a failed state needing manual resolution, it checks for an existing GitHub issue. If none exists, it creates a new issue in the 'Broken Bots' project and tags the bot's port owners.

Fixes golang/go#68790

Change-Id: I3a64d3a06f80e948fc895a2b15d89ea8d6f37a03
Reviewed-on: https://go-review.googlesource.com/c/build/+/646838
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
1 parent 4cc202a commit 315ceb4

File tree

3 files changed

+263
-3
lines changed

3 files changed

+263
-3
lines changed

cmd/watchflakes/github.go

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ var (
4040
repo *github.Repo
4141
labels map[string]*github.Label
4242
testFlakes *github.Project
43+
brokenBots *github.Project
4344
)
4445

4546
// readIssues reads the GitHub issues in the Test Flakes project.
@@ -109,6 +110,71 @@ func readIssues(old []*Issue) ([]*Issue, error) {
109110
return issues, nil
110111
}
111112

113+
// readBuilderIssues reads the GitHub issues in the Broken Bots project.
114+
// It also sets up the repo, labels, and testFlakes variables for
115+
// use by other functions below.
116+
func readBuilderIssues() ([]*Issue, error) {
117+
// Find repo.
118+
r, err := gh.Repo("golang", "go")
119+
if err != nil {
120+
return nil, err
121+
}
122+
repo = r
123+
124+
var builderLabel *github.Label
125+
126+
// Find labels.
127+
list, err := gh.SearchLabels("golang", "go", "")
128+
if err != nil {
129+
return nil, err
130+
}
131+
for _, label := range list {
132+
if label.Name == "Builders" {
133+
builderLabel = label
134+
break
135+
}
136+
}
137+
if builderLabel == nil {
138+
return nil, fmt.Errorf("cannot find builder label")
139+
}
140+
141+
labels = make(map[string]*github.Label)
142+
for _, label := range list {
143+
labels[label.Name] = label
144+
}
145+
146+
// Find Test Flakes project.
147+
ps, err := gh.Projects("golang", "")
148+
if err != nil {
149+
return nil, err
150+
}
151+
for _, p := range ps {
152+
if p.Title == "Broken Bots" {
153+
brokenBots = p
154+
break
155+
}
156+
}
157+
if brokenBots == nil {
158+
return nil, fmt.Errorf("cannot find Broken Bots project")
159+
}
160+
161+
// Read all issues in Test Flakes.
162+
var issues []*Issue
163+
items, err := gh.ProjectItems(brokenBots)
164+
if err != nil {
165+
return nil, err
166+
}
167+
for _, item := range items {
168+
if item.Issue != nil {
169+
issues = append(issues, &Issue{Issue: item.Issue, NewBody: true, Stale: true})
170+
}
171+
}
172+
sort.Slice(issues, func(i, j int) bool {
173+
return issues[i].Number < issues[j].Number
174+
})
175+
return issues, nil
176+
}
177+
112178
// findScripts finds the scripts in the issues,
113179
// initializing issue.Script and .ScriptText or else .Error
114180
// in each issue.
@@ -329,7 +395,7 @@ func readComments(issue *Issue) {
329395
}
330396

331397
// postNew creates a new issue with the given title and body,
332-
// setting the NeedsInvestigation label and placing the issue int
398+
// setting the NeedsInvestigation label and placing the issue in
333399
// the Test Flakes project.
334400
// It automatically adds signature to the body.
335401
func postNew(title, body string) *github.Issue {
@@ -346,6 +412,24 @@ func postNew(title, body string) *github.Issue {
346412
return issue
347413
}
348414

415+
// postNewBrokenBot creates a new issue with the given title and body,
416+
// setting the NeedsInvestigation label and placing the issue in
417+
// the Broken Bots project.
418+
// It automatically adds signature to the body.
419+
func postNewBrokenBot(title, body string) (*github.Issue, error) {
420+
var args []any
421+
if lab := labels["NeedsInvestigation"]; lab != nil {
422+
args = append(args, lab)
423+
}
424+
if lab := labels["Builders"]; lab != nil {
425+
args = append(args, lab)
426+
}
427+
428+
args = append(args, brokenBots)
429+
issue, err := gh.CreateIssue(repo, title, body+signature, args...)
430+
return issue, err
431+
}
432+
349433
// postComment posts a new comment on the issue.
350434
// It automatically adds signature to the comment.
351435
func postComment(issue *Issue, body string) error {

cmd/watchflakes/luci.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"go.chromium.org/luci/grpc/prpc"
2424
rdbpb "go.chromium.org/luci/resultdb/proto/v1"
2525
spb "go.chromium.org/luci/swarming/proto/api_v2"
26+
goluci "golang.org/x/build/internal/luci"
2627
"golang.org/x/sync/errgroup"
2728
"google.golang.org/protobuf/types/known/fieldmaskpb"
2829
"google.golang.org/protobuf/types/known/timestamppb"
@@ -147,6 +148,8 @@ type Failure struct {
147148

148149
// Bot describes a swarming bot reported by ListBrokenBots, which only
// records bots that are dead or quarantined.
type Bot struct {
149150
ID string // bot ID, copied from the swarming BotInfo
151+
Goos string // GOOS half of the result of platform for this bot
152+
Goarch string // GOARCH half of the result of platform for this bot
150153
Dead bool // copied from the swarming BotInfo IsDead field
151154
Quarantined bool // copied from the swarming BotInfo Quarantined field
152155
}
@@ -686,7 +689,8 @@ func (c *LUCIClient) fetchLogsForBuild(r *BuildResult) {
686689
// from the list of returned bots.
687690
type filter func(bot *spb.BotInfo) bool
688691

689-
// filterOutDarwin filters out darwin machines which no longer exist.
692+
// filterOutDarwin filters out darwin bots which no longer exist but are still
693+
// listed as valid bots.
690694
func filterOutDarwin(bot *spb.BotInfo) bool {
691695
return strings.HasPrefix(bot.BotId, "darwin-")
692696
}
@@ -724,7 +728,12 @@ nextCursor:
724728
continue
725729
}
726730
if bot.IsDead || bot.Quarantined {
727-
brokenBots = append(brokenBots, Bot{ID: bot.BotId, Dead: bot.IsDead, Quarantined: bot.Quarantined})
731+
goos, goarch, err := platform(bot)
732+
if err != nil {
733+
fmt.Printf("failed to determine platform for %s: %s\n", bot.GetBotId(), err)
734+
continue
735+
}
736+
brokenBots = append(brokenBots, Bot{ID: bot.BotId, Dead: bot.IsDead, Quarantined: bot.Quarantined, Goos: goos, Goarch: goarch})
728737
}
729738
}
730739
if resp.GetCursor() != "" {
@@ -734,6 +743,40 @@ nextCursor:
734743
return brokenBots, nil
735744
}
736745

746+
// platform determines the platform that the bot is running.
747+
func platform(bot *spb.BotInfo) (string, string, error) {
748+
var goos, goarch, targetGoos, targetGoarch string
749+
var err error
750+
751+
for _, d := range bot.GetDimensions() {
752+
key := d.GetKey()
753+
switch key {
754+
case "cipd_platform":
755+
val := d.GetValue()
756+
if len(val) == 0 {
757+
return "", "", fmt.Errorf("invalid cipd_platform value: %+v", val)
758+
}
759+
goos, goarch, err = goluci.PlatformToGoValues(val[0])
760+
if err != nil {
761+
return "", "", fmt.Errorf("unable to parse cipd_platform value %q: %w", val[0], err)
762+
}
763+
case "target_goarch":
764+
if val := d.GetValue(); len(val) == 1 {
765+
targetGoarch = val[0]
766+
}
767+
case "target_goos":
768+
if val := d.GetValue(); len(val) == 1 {
769+
targetGoos = val[0]
770+
}
771+
default:
772+
}
773+
}
774+
if targetGoarch != "" && targetGoos != "" {
775+
goos, goarch = targetGoos, targetGoarch
776+
}
777+
return goos, goarch, nil
778+
}
779+
737780
func fetchURL(url string) string {
738781
resp, err := http.Get(url)
739782
if err != nil {

cmd/watchflakes/main.go

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,16 @@
77
package main
88

99
import (
10+
"bytes"
1011
"context"
12+
"encoding/json"
1113
"flag"
1214
"fmt"
15+
"io"
1316
"log"
17+
"net/http"
1418
"os"
19+
"regexp"
1520
"runtime"
1621
"strconv"
1722
"strings"
@@ -21,6 +26,7 @@ import (
2126
rdbpb "go.chromium.org/luci/resultdb/proto/v1"
2227
"golang.org/x/build/buildenv"
2328
"golang.org/x/build/cmd/watchflakes/internal/script"
29+
"golang.org/x/build/devapp/owners"
2430
"golang.org/x/build/internal/secret"
2531
"rsc.io/github"
2632
)
@@ -106,6 +112,7 @@ func main() {
106112
Repeat:
107113
startTime := time.Now()
108114
ctx, cancel := context.WithTimeout(context.Background(), timeout)
115+
reportBrokenBots(ctx, c)
109116
var boards []*Dashboard
110117
if *build == "" {
111118
// fetch the dashboard
@@ -302,6 +309,132 @@ Repeat:
302309
}
303310
}
304311

312+
func reportBrokenBots(ctx context.Context, c *LUCIClient) {
313+
// query for broken bots
314+
brokenBots, err := c.ListBrokenBots(ctx, filterOutDarwin)
315+
if err != nil {
316+
log.Printf("failed to query for bots: %s", err)
317+
return
318+
}
319+
// query for existing broken bot issues
320+
existingIssues, err := readBuilderIssues()
321+
if err != nil {
322+
log.Printf("failed querying for existing builder issues: %s", err)
323+
return
324+
}
325+
// map used as set to check for existing issues for a bot ID.
326+
// botID -> issue number
327+
botIssues := make(map[string]int)
328+
for _, issue := range existingIssues {
329+
if botID, ok := botIDFromIssueBody(issue.Body); ok {
330+
fmt.Printf("found existing issue: %+v for %s\n", issue, botID)
331+
botIssues[botID] = issue.Number
332+
}
333+
}
334+
po, err := getPlatformOwners()
335+
if err != nil {
336+
log.Printf("failed to query for platform owners: %s", err)
337+
}
338+
// for each broken bot, is there an existing open issue?
339+
for _, bot := range brokenBots {
340+
if issueID, ok := botIssues[bot.ID]; ok {
341+
fmt.Printf("issue #%d found for broken bot %s\n", issueID, bot.ID)
342+
continue
343+
}
344+
title := brokenBotIssueTitle(bot.ID)
345+
var botOwners []string
346+
if v, ok := po[bot.Goos]; ok {
347+
for _, bo := range v {
348+
botOwners = append(botOwners, "@"+bo)
349+
}
350+
}
351+
if v, ok := po[bot.Goarch]; ok {
352+
for _, bo := range v {
353+
botOwners = append(botOwners, "@"+bo)
354+
}
355+
}
356+
if !*post {
357+
fmt.Printf("dry-run: skipped posting a new broken bot issue for %s\n", bot.ID)
358+
continue
359+
}
360+
i, err := postNewBrokenBot(title, brokenBotIssueBody(bot, botOwners))
361+
if err != nil {
362+
log.Printf("failed to post broken bot issue: %s", err)
363+
continue
364+
}
365+
fmt.Printf("Posted new broken bot issue for %s, issue: %s\n", bot.ID, i.ID)
366+
}
367+
}
368+
369+
func getPlatformOwners() (map[string][]string, error) {
370+
url := "https://dev.golang.org/owners"
371+
var o owners.Request
372+
o.Payload.Platform = true
373+
374+
body, err := json.Marshal(o)
375+
if err != nil {
376+
return nil, fmt.Errorf("unable to marshal json: %s", err)
377+
}
378+
r, err := http.Post(url, "application/json", bytes.NewBuffer(body))
379+
if err != nil {
380+
return nil, fmt.Errorf("failed to query for platform owners: %s", err)
381+
}
382+
defer r.Body.Close()
383+
if r.StatusCode != http.StatusOK {
384+
return nil, fmt.Errorf("failed to query for platform owners, response code=%d", r.StatusCode)
385+
}
386+
b, err := io.ReadAll(r.Body)
387+
if err != nil {
388+
return nil, fmt.Errorf("failed to read http response body: %w", err)
389+
}
390+
var response owners.Response
391+
err = json.Unmarshal(b, &response)
392+
if err != nil {
393+
return nil, fmt.Errorf("unable to unmarshal owners response: %s", err)
394+
}
395+
m := map[string][]string{}
396+
for k, v := range response.Payload.Platforms {
397+
if len(v.Primary) == 0 {
398+
continue
399+
}
400+
var primaries []string
401+
for _, p := range v.Primary {
402+
primaries = append(primaries, p.GitHubUsername)
403+
}
404+
m[k] = primaries
405+
}
406+
return m, nil
407+
}
408+
409+
// issueFooter matches the hidden HTML comment that brokenBotIssueBody
// appends to an issue body; the single capture group holds the bot ID.
var issueFooter = regexp.MustCompile(`<!-- DO NOT EDIT: (.*?) -->`)

// botIDFromIssueBody extracts the bot ID from the first footer marker found
// in body. The boolean result is false when body contains no marker.
func botIDFromIssueBody(body string) (string, bool) {
	if m := issueFooter.FindStringSubmatch(body); len(m) == 2 {
		return m[1], true
	}
	return "", false
}
418+
419+
// brokenBotIssueTitle returns the title used for the GitHub issue that
// reports the bot with the given ID as broken.
func brokenBotIssueTitle(botID string) string {
	return "x/build: bot " + botID + " reported as broken"
}
422+
423+
func brokenBotIssueBody(bot Bot, owners []string) string {
424+
var state string
425+
if bot.Dead {
426+
state = "dead"
427+
} else if bot.Quarantined {
428+
state = "quarantined"
429+
} else {
430+
state = "unknown"
431+
}
432+
botURL := fmt.Sprintf("https://chromium-swarm.appspot.com/bot?id=%s", bot.ID)
433+
body := "The bot [%s](%s) has been reported as broken. It is currently in %q state. Please work to resolve the issue.\n\n%s\n%s"
434+
footer := fmt.Sprintf("<!-- DO NOT EDIT: %s -->", bot.ID)
435+
return fmt.Sprintf(body, bot.ID, botURL, state, strings.Join(owners, "\n"), footer)
436+
}
437+
305438
const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure
306439

307440
// skipBrokenCommits identifies broken commits,

0 commit comments

Comments
 (0)