Skip to content

Commit 9699138

Browse files
nstrayerwesmclaude
authored
feat: add backup agent failover for review jobs (#283)
## Human summary I found myself hitting codex limits really quickly and then dumping back to copilot in this case. I wanted a way to automate this so I didnt have to keep switching the config.toml. This PR adds a `*backup_agent` setting as a fallback in case the review fails. Coincidentally while making this PR I ran out of codex and the fallback worked perfectly! ## Summary - Add per-workflow and default backup agent configuration (`default_backup_agent`, `review_backup_agent`, etc.) with the same repo-overrides-global priority as primary agents - Implement failover in the worker pool: when an agent error exhausts retries, the worker resolves the backup agent from config and requeues the job with the new agent - Distinguish agent errors (eligible for failover) from non-agent errors (prompt build failures) in the worker's retry logic - No schema migration — backup agent is resolved from config at failover time, not stored per-job ## Changes - `internal/config/config.go` — backup agent fields on `Config`/`RepoConfig`, `ResolveBackupAgentForWorkflow` resolution - `internal/storage/jobs.go` — `FailoverJob(jobID, workerID, backupAgent)` atomically swaps agent and requeues - `internal/daemon/worker.go` — `resolveBackupAgent` reads config at failover time, canonicalizes (verify installed, skip if same as primary), `failOrRetryAgent` path with failover split from `failOrRetry` - Tests for config resolution, storage failover, worker-level backup agent resolution, and edge cases ## Test plan - [x] `go test ./...` passes - [ ] Verify failover works end-to-end: configure `backup_agent`, trigger a review with a failing primary agent, confirm job retries with the backup - [ ] Verify no failover when `backup_agent` is unset (existing behavior unchanged) - [ ] Verify second failover with same backup is a no-op (agent already swapped) --------- Co-authored-by: Wes McKinney <wesmckinn+git@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7d4040c commit 9699138

File tree

11 files changed

+773
-68
lines changed

11 files changed

+773
-68
lines changed

internal/config/config.go

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ type Config struct {
4949
ReviewContextCount int `toml:"review_context_count"`
5050
DefaultAgent string `toml:"default_agent"`
5151
DefaultModel string `toml:"default_model"` // Default model for agents (format varies by agent)
52+
DefaultBackupAgent string `toml:"default_backup_agent"`
5253
JobTimeoutMinutes int `toml:"job_timeout_minutes"`
5354

5455
// Workflow-specific agent/model configuration
@@ -92,7 +93,15 @@ type Config struct {
9293
DesignModelFast string `toml:"design_model_fast"`
9394
DesignModelStandard string `toml:"design_model_standard"`
9495
DesignModelThorough string `toml:"design_model_thorough"`
95-
AllowUnsafeAgents *bool `toml:"allow_unsafe_agents"` // nil = not set, allows commands to choose their own default
96+
97+
// Backup agents for failover
98+
ReviewBackupAgent string `toml:"review_backup_agent"`
99+
RefineBackupAgent string `toml:"refine_backup_agent"`
100+
FixBackupAgent string `toml:"fix_backup_agent"`
101+
SecurityBackupAgent string `toml:"security_backup_agent"`
102+
DesignBackupAgent string `toml:"design_backup_agent"`
103+
104+
AllowUnsafeAgents *bool `toml:"allow_unsafe_agents"` // nil = not set, allows commands to choose their own default
96105

97106
// Agent commands
98107
CodexCmd string `toml:"codex_cmd"`
@@ -348,6 +357,7 @@ type RepoCIConfig struct {
348357
type RepoConfig struct {
349358
Agent string `toml:"agent"`
350359
Model string `toml:"model"` // Model for agents (format varies by agent)
360+
BackupAgent string `toml:"backup_agent"`
351361
ReviewContextCount int `toml:"review_context_count"`
352362
ReviewGuidelines string `toml:"review_guidelines"`
353363
JobTimeoutMinutes int `toml:"job_timeout_minutes"`
@@ -402,6 +412,13 @@ type RepoConfig struct {
402412
DesignModelStandard string `toml:"design_model_standard"`
403413
DesignModelThorough string `toml:"design_model_thorough"`
404414

415+
// Backup agents for failover
416+
ReviewBackupAgent string `toml:"review_backup_agent"`
417+
RefineBackupAgent string `toml:"refine_backup_agent"`
418+
FixBackupAgent string `toml:"fix_backup_agent"`
419+
SecurityBackupAgent string `toml:"security_backup_agent"`
420+
DesignBackupAgent string `toml:"design_backup_agent"`
421+
405422
// Hooks configuration (per-repo)
406423
Hooks []HookConfig `toml:"hooks"`
407424

@@ -729,6 +746,55 @@ func ResolveModelForWorkflow(cli, repoPath string, globalCfg *Config, workflow,
729746
return getWorkflowValue(repoCfg, globalCfg, workflow, level, false)
730747
}
731748

749+
// ResolveBackupAgentForWorkflow returns the backup agent for a workflow,
750+
// or empty string if none is configured.
751+
// Priority:
752+
// 1. Repo {workflow}_backup_agent
753+
// 2. Repo backup_agent (generic)
754+
// 3. Global {workflow}_backup_agent
755+
// 4. Global default_backup_agent
756+
// 5. "" (no backup)
757+
func ResolveBackupAgentForWorkflow(repoPath string, globalCfg *Config, workflow string) string {
758+
repoCfg, _ := LoadRepoConfig(repoPath)
759+
760+
// Repo layer: workflow-specific > generic
761+
if repoCfg != nil {
762+
if s := lookupFieldByTag(reflect.ValueOf(*repoCfg), workflow+"_backup_agent"); s != "" {
763+
return s
764+
}
765+
if s := strings.TrimSpace(repoCfg.BackupAgent); s != "" {
766+
return s
767+
}
768+
}
769+
770+
// Global layer: workflow-specific > default
771+
if globalCfg != nil {
772+
if s := lookupFieldByTag(reflect.ValueOf(*globalCfg), workflow+"_backup_agent"); s != "" {
773+
return s
774+
}
775+
if s := strings.TrimSpace(globalCfg.DefaultBackupAgent); s != "" {
776+
return s
777+
}
778+
}
779+
780+
return ""
781+
}
782+
783+
// lookupFieldByTag finds a struct field by its TOML tag and returns its trimmed value.
784+
func lookupFieldByTag(v reflect.Value, key string) string {
785+
t := v.Type()
786+
for i := 0; i < t.NumField(); i++ {
787+
tag := t.Field(i).Tag.Get("toml")
788+
if tag == "" {
789+
continue
790+
}
791+
if strings.Split(tag, ",")[0] == key {
792+
return strings.TrimSpace(v.Field(i).String())
793+
}
794+
}
795+
return ""
796+
}
797+
732798
// getWorkflowValue looks up agent or model config following Option A priority.
733799
func getWorkflowValue(repo *RepoConfig, global *Config, workflow, level string, isAgent bool) string {
734800
// Repo layer: level-specific > workflow-specific > generic

internal/config/config_test.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"os"
66
"os/exec"
77
"path/filepath"
8+
"reflect"
89
"strings"
910
"testing"
1011

@@ -1227,6 +1228,101 @@ func TestResolveModelForWorkflow(t *testing.T) {
12271228
}
12281229
}
12291230

1231+
func TestResolveBackupAgentForWorkflow(t *testing.T) {
1232+
tests := []struct {
1233+
name string
1234+
repo map[string]string
1235+
global map[string]string
1236+
workflow string
1237+
expect string
1238+
}{
1239+
// No backup configured
1240+
{"empty config", nil, nil, "review", ""},
1241+
{"only primary agent configured", M{"review_agent": "claude"}, nil, "review", ""},
1242+
1243+
// Global backup agent
1244+
{"global backup only", nil, M{"review_backup_agent": "test"}, "review", "test"},
1245+
{"global backup for refine", nil, M{"refine_backup_agent": "claude"}, "refine", "claude"},
1246+
{"global backup for fix", nil, M{"fix_backup_agent": "codex"}, "fix", "codex"},
1247+
{"global backup for security", nil, M{"security_backup_agent": "gemini"}, "security", "gemini"},
1248+
{"global backup for design", nil, M{"design_backup_agent": "droid"}, "design", "droid"},
1249+
1250+
// Repo backup agent overrides global
1251+
{"repo overrides global", M{"review_backup_agent": "repo-test"}, M{"review_backup_agent": "global-test"}, "review", "repo-test"},
1252+
{"repo backup only", M{"review_backup_agent": "test"}, nil, "review", "test"},
1253+
1254+
// Different workflows resolve independently
1255+
{"review backup doesn't affect refine", M{"review_backup_agent": "claude"}, nil, "refine", ""},
1256+
{"each workflow has own backup", M{"review_backup_agent": "claude", "refine_backup_agent": "codex"}, nil, "review", "claude"},
1257+
{"each workflow has own backup - refine", M{"review_backup_agent": "claude", "refine_backup_agent": "codex"}, nil, "refine", "codex"},
1258+
1259+
// Unknown workflow returns empty
1260+
{"unknown workflow", M{"review_backup_agent": "test"}, nil, "unknown", ""},
1261+
1262+
// No reasoning level support for backup agents
1263+
{"no level variants recognized", M{"review_backup_agent_fast": "claude"}, nil, "review", ""},
1264+
{"backup agent doesn't use levels", M{"review_backup_agent": "claude"}, nil, "review", "claude"},
1265+
1266+
// Default/generic backup agent fallback
1267+
{"global default_backup_agent", nil, M{"default_backup_agent": "test"}, "review", "test"},
1268+
{"global default_backup_agent for any workflow", nil, M{"default_backup_agent": "test"}, "fix", "test"},
1269+
{"global workflow-specific overrides default", nil, M{"default_backup_agent": "test", "review_backup_agent": "claude"}, "review", "claude"},
1270+
{"global default used when workflow not set", nil, M{"default_backup_agent": "test", "review_backup_agent": "claude"}, "fix", "test"},
1271+
{"repo backup_agent generic", M{"backup_agent": "repo-fallback"}, nil, "review", "repo-fallback"},
1272+
{"repo backup_agent generic for any workflow", M{"backup_agent": "repo-fallback"}, nil, "refine", "repo-fallback"},
1273+
{"repo workflow-specific overrides repo generic", M{"backup_agent": "generic", "review_backup_agent": "specific"}, nil, "review", "specific"},
1274+
{"repo generic overrides global workflow-specific", M{"backup_agent": "repo"}, M{"review_backup_agent": "global"}, "review", "repo"},
1275+
{"repo generic overrides global default", M{"backup_agent": "repo"}, M{"default_backup_agent": "global"}, "review", "repo"},
1276+
}
1277+
1278+
for _, tt := range tests {
1279+
t.Run(tt.name, func(t *testing.T) {
1280+
// Create temp dir for repo config
1281+
repoDir := t.TempDir()
1282+
1283+
// Write repo config if provided
1284+
if tt.repo != nil {
1285+
writeRepoConfig(t, repoDir, tt.repo)
1286+
}
1287+
1288+
// Create global config if provided
1289+
var globalCfg *Config
1290+
if tt.global != nil {
1291+
globalCfg = &Config{}
1292+
populateConfigFromMap(globalCfg, tt.global)
1293+
}
1294+
1295+
// Test the function
1296+
result := ResolveBackupAgentForWorkflow(repoDir, globalCfg, tt.workflow)
1297+
1298+
if result != tt.expect {
1299+
t.Errorf("ResolveBackupAgentForWorkflow(%q, global, %q) = %q, want %q",
1300+
repoDir, tt.workflow, result, tt.expect)
1301+
}
1302+
})
1303+
}
1304+
}
1305+
1306+
// populateConfigFromMap is a helper to set config fields from a map
1307+
func populateConfigFromMap(cfg *Config, m map[string]string) {
1308+
v := reflect.ValueOf(cfg).Elem()
1309+
t := v.Type()
1310+
1311+
for i := 0; i < t.NumField(); i++ {
1312+
field := t.Field(i)
1313+
tag := field.Tag.Get("toml")
1314+
if tag == "" {
1315+
continue
1316+
}
1317+
tagName := strings.Split(tag, ",")[0]
1318+
if val, ok := m[tagName]; ok {
1319+
if field.Type.Kind() == reflect.String {
1320+
v.Field(i).SetString(val)
1321+
}
1322+
}
1323+
}
1324+
}
1325+
12301326
// M is a shorthand type for map[string]string to keep test tables compact
12311327
type M = map[string]string
12321328

internal/daemon/ci_poller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ func (p *CIPoller) processPR(ctx context.Context, ghRepo string, pr ghPR, cfg *c
427427
resolvedAgent = resolved.Name()
428428
}
429429

430-
// Resolve model through workflow config when not explicitly set
430+
// Resolve model through workflow config
431431
resolvedModel := config.ResolveModelForWorkflow(cfg.CI.Model, repo.RootPath, cfg, workflow, reasoning)
432432

433433
job, err := p.db.EnqueueJob(storage.EnqueueOpts{

internal/daemon/ci_poller_test.go

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,7 +1278,9 @@ func initGitRepoWithOrigin(t *testing.T) (dir string, runGit func(args ...string
12781278
runGit("init", "-b", "main")
12791279
runGit("config", "user.email", "test@test.com")
12801280
runGit("config", "user.name", "Test")
1281-
os.WriteFile(filepath.Join(dir, "README.md"), []byte("init"), 0644)
1281+
if err := os.WriteFile(filepath.Join(dir, "README.md"), []byte("init"), 0644); err != nil {
1282+
t.Fatalf("write README.md: %v", err)
1283+
}
12821284
runGit("add", "-A")
12831285
runGit("commit", "-m", "initial")
12841286
runGit("remote", "add", "origin", dir)
@@ -1291,8 +1293,10 @@ func TestLoadCIRepoConfig_LoadsFromDefaultBranch(t *testing.T) {
12911293
dir, runGit := initGitRepoWithOrigin(t)
12921294

12931295
// Commit .roborev.toml on main with CI agents override
1294-
os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1295-
[]byte("[ci]\nagents = [\"claude\"]\n"), 0644)
1296+
if err := os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1297+
[]byte("[ci]\nagents = [\"claude\"]\n"), 0644); err != nil {
1298+
t.Fatalf("write .roborev.toml: %v", err)
1299+
}
12961300
runGit("add", ".roborev.toml")
12971301
runGit("commit", "-m", "add config")
12981302
runGit("fetch", "origin")
@@ -1313,8 +1317,10 @@ func TestLoadCIRepoConfig_FallsBackWhenNoConfigOnDefaultBranch(t *testing.T) {
13131317
dir, _ := initGitRepoWithOrigin(t)
13141318

13151319
// No .roborev.toml on origin/main, but put one in the working tree
1316-
os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1317-
[]byte("[ci]\nagents = [\"codex\"]\n"), 0644)
1320+
if err := os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1321+
[]byte("[ci]\nagents = [\"codex\"]\n"), 0644); err != nil {
1322+
t.Fatalf("write .roborev.toml: %v", err)
1323+
}
13181324

13191325
cfg, err := loadCIRepoConfig(dir)
13201326
if err != nil {
@@ -1332,15 +1338,19 @@ func TestLoadCIRepoConfig_PropagatesParseError(t *testing.T) {
13321338
dir, runGit := initGitRepoWithOrigin(t)
13331339

13341340
// Commit invalid TOML on main
1335-
os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1336-
[]byte("this is not valid toml [[["), 0644)
1341+
if err := os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1342+
[]byte("this is not valid toml [[["), 0644); err != nil {
1343+
t.Fatalf("write .roborev.toml: %v", err)
1344+
}
13371345
runGit("add", ".roborev.toml")
13381346
runGit("commit", "-m", "add bad config")
13391347
runGit("fetch", "origin")
13401348

1341-
// Also put valid config in working tree — should NOT be used
1342-
os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1343-
[]byte("[ci]\nagents = [\"codex\"]\n"), 0644)
1349+
// Also put valid config in working tree -- should NOT be used
1350+
if err := os.WriteFile(filepath.Join(dir, ".roborev.toml"),
1351+
[]byte("[ci]\nagents = [\"codex\"]\n"), 0644); err != nil {
1352+
t.Fatalf("write .roborev.toml: %v", err)
1353+
}
13441354

13451355
cfg, err := loadCIRepoConfig(dir)
13461356
if err == nil {
@@ -1388,6 +1398,9 @@ func TestCIPollerProcessPR_SetsPendingCommitStatus(t *testing.T) {
13881398
if sc.state != "pending" {
13891399
t.Errorf("state=%q, want pending", sc.state)
13901400
}
1401+
if sc.desc != "Review in progress" {
1402+
t.Errorf("desc=%q, want %q", sc.desc, "Review in progress")
1403+
}
13911404
}
13921405

13931406
func TestCIPollerPostBatchResults_SetsSuccessStatus(t *testing.T) {

internal/daemon/server_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1591,7 +1591,7 @@ func TestHandleRerunJob(t *testing.T) {
15911591
commit, _ := db.GetOrCreateCommit(repo.ID, "rerun-failed", "Author", "Subject", time.Now())
15921592
job, _ := db.EnqueueJob(storage.EnqueueOpts{RepoID: repo.ID, CommitID: commit.ID, GitRef: "rerun-failed", Agent: "test"})
15931593
db.ClaimJob("worker-1")
1594-
db.FailJob(job.ID, "some error")
1594+
db.FailJob(job.ID, "", "some error")
15951595

15961596
req := testutil.MakeJSONRequest(t, http.MethodPost, "/api/job/rerun", RerunJobRequest{JobID: job.ID})
15971597
w := httptest.NewRecorder()

0 commit comments

Comments
 (0)