forked from gastownhall/gastown
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdaemon.go
More file actions
executable file
·2498 lines (2185 loc) · 90.9 KB
/
daemon.go
File metadata and controls
executable file
·2498 lines (2185 loc) · 90.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package daemon
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/gofrs/flock"
beadsdk "github.com/steveyegge/beads"
"gopkg.in/natefinch/lumberjack.v2"
"github.com/steveyegge/gastown/internal/beads"
"github.com/steveyegge/gastown/internal/boot"
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/doltserver"
"github.com/steveyegge/gastown/internal/events"
"github.com/steveyegge/gastown/internal/feed"
gitpkg "github.com/steveyegge/gastown/internal/git"
"github.com/steveyegge/gastown/internal/mayor"
"github.com/steveyegge/gastown/internal/polecat"
"github.com/steveyegge/gastown/internal/refinery"
"github.com/steveyegge/gastown/internal/rig"
"github.com/steveyegge/gastown/internal/session"
"github.com/steveyegge/gastown/internal/telemetry"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
"github.com/steveyegge/gastown/internal/wisp"
"github.com/steveyegge/gastown/internal/witness"
)
// Daemon is the town-level background service.
// It ensures patrol agents (Deacon, Witnesses) are running and detects failures.
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
type Daemon struct {
	config       *Config             // daemon file locations (log, PID) and town root
	patrolConfig *DaemonPatrolConfig // parsed mayor/daemon.json; may be nil when absent
	tmux         *tmux.Tmux          // tmux client used to manage agent sessions
	logger       *log.Logger         // writes to the rotating daemon log file
	ctx          context.Context     // canceled to stop the daemon main loop
	cancel       context.CancelFunc  // cancels ctx
	curator      *feed.Curator       // feed curator goroutine; started in Run
	convoyManager *ConvoyManager     // convoy lifecycle manager; started in Run
	beadsStores  map[string]beadsdk.Storage // open beads stores keyed by rig; may be empty until Dolt is up
	doltServer   *DoltServerManager  // nil unless Dolt server management is configured
	krcPruner    *KRCPruner          // ephemeral-data pruner; started in Run
	// disabledPatrols is loaded from town settings (disabled_patrols field).
	// Provides a simple way to disable individual patrol dogs without editing
	// mayor/daemon.json. Checked by isPatrolActive alongside patrolConfig.
	disabledPatrols map[string]bool
	// Mass death detection: track recent session deaths
	deathsMu     sync.Mutex
	recentDeaths []sessionDeath
	// Deacon startup tracking: prevents race condition where newly started
	// sessions are immediately killed by the heartbeat check.
	// See: https://github.com/steveyegge/gastown/issues/567
	// Note: Only accessed from heartbeat loop goroutine - no sync needed.
	deaconLastStarted time.Time
	// syncFailures tracks consecutive git pull failures per workdir.
	// Used to escalate logging from WARN to ERROR after repeated failures.
	// Only accessed from heartbeat loop goroutine - no sync needed.
	syncFailures map[string]int
	// PATCH-006: Resolved binary paths to avoid PATH issues in subprocesses.
	gtPath string
	bdPath string
	// Boot spawn cooldown: prevents Boot from spawning on every heartbeat tick.
	// Only accessed from heartbeat loop goroutine - no sync needed.
	bootLastSpawned time.Time
	// Restart tracking with exponential backoff to prevent crash loops
	restartTracker *RestartTracker
	// telemetry exports metrics and logs to VictoriaMetrics / VictoriaLogs.
	// Nil when telemetry is disabled (GT_OTEL_METRICS_URL / GT_OTEL_LOGS_URL not set).
	otelProvider *telemetry.Provider
	metrics      *daemonMetrics
	// jsonlPushFailures tracks consecutive git push failures for JSONL backup.
	// Only accessed from heartbeat loop goroutine - no sync needed.
	jsonlPushFailures int
	// lastDoctorMolTime tracks when the last mol-dog-doctor molecule was poured.
	// Option B throttling: only pour when anomaly detected AND cooldown elapsed.
	// Only accessed from heartbeat loop goroutine - no sync needed.
	lastDoctorMolTime time.Time
	// lastMaintenanceRun tracks when scheduled maintenance last ran.
	// Only accessed from heartbeat loop goroutine - no sync needed.
	lastMaintenanceRun time.Time
	// mayorZombieCount tracks consecutive patrol cycles where the Mayor tmux
	// session exists but the agent process is not detected. A count >= 3
	// triggers a zombie restart, debouncing transient gaps during handoffs.
	// Only accessed from heartbeat loop goroutine - no sync needed.
	mayorZombieCount int
}
// sessionDeath records a detected session death for mass death analysis.
type sessionDeath struct {
	sessionName string    // tmux session name that died
	timestamp   time.Time // when the death was detected
}

// Mass death detection parameters — these are fallback defaults.
// Prefer config.OperationalConfig.GetDaemonConfig() accessors when
// a TownSettings is available (loaded via d.loadOperationalConfig()).
const (
	massDeathWindow    = 30 * time.Second // Time window to detect mass death
	massDeathThreshold = 3                // Number of deaths to trigger alert
	// doctorMolCooldown is the minimum interval between mol-dog-doctor molecules.
	// Configurable via operational.daemon.doctor_mol_cooldown.
	doctorMolCooldown = 5 * time.Minute
)
// New creates a new daemon instance.
//
// It prepares the rotating log file, loads patrol and town-level
// configuration, propagates Dolt connection info into the process
// environment (so spawned agent sessions inherit it), resolves the gt/bd
// binary paths, and initializes restart tracking plus optional telemetry.
// The returned Daemon is ready for Run.
//
// NOTE(review): the parameter name shadows the imported internal/config
// package within this function. The package is not referenced here, so this
// compiles, but keep it in mind when editing.
func New(config *Config) (*Daemon, error) {
	// Ensure daemon directory exists
	daemonDir := filepath.Dir(config.LogFile)
	if err := os.MkdirAll(daemonDir, 0755); err != nil {
		return nil, fmt.Errorf("creating daemon directory: %w", err)
	}
	// Open log file with rotation (100MB max, 3 backups, 7 days, compressed)
	logWriter := &lumberjack.Logger{
		Filename:   config.LogFile,
		MaxSize:    100, // megabytes
		MaxBackups: 3,
		MaxAge:     7, // days
		Compress:   true,
	}
	logger := log.New(logWriter, "", log.LstdFlags)
	ctx, cancel := context.WithCancel(context.Background())
	// Initialize session prefix and agent registries from town root.
	if err := session.InitRegistry(config.TownRoot); err != nil {
		logger.Printf("Warning: failed to initialize town registry: %v", err)
	}
	// Set GT_TOWN_ROOT in tmux global environment so run-shell subprocesses
	// (e.g., gt cycle next/prev) can find the workspace even when CWD is $HOME.
	// Non-fatal: tmux server may not be running yet — daemon creates sessions shortly.
	t := tmux.NewTmux()
	if err := t.SetGlobalEnvironment("GT_TOWN_ROOT", config.TownRoot); err != nil {
		logger.Printf("Warning: failed to set GT_TOWN_ROOT in tmux global env: %v", err)
	}
	// Load patrol config from mayor/daemon.json, ensuring lifecycle defaults
	// are populated for any missing data maintenance tickers. Without this,
	// opt-in patrols (compactor, reaper, doctor, JSONL backup, dolt backup)
	// remain disabled if the file was created before they were implemented.
	if err := EnsureLifecycleConfigFile(config.TownRoot); err != nil {
		logger.Printf("Warning: failed to ensure lifecycle config: %v", err)
	}
	patrolConfig := LoadPatrolConfig(config.TownRoot)
	if patrolConfig != nil {
		logger.Printf("Loaded patrol config from %s", PatrolConfigFile(config.TownRoot))
		// Propagate env vars from daemon.json to this process and all spawned sessions.
		for k, v := range patrolConfig.Env {
			os.Setenv(k, v)
			// NOTE(review): this logs the raw value; if daemon.json env ever
			// carries secrets (API keys), consider redacting here.
			logger.Printf("Set env %s=%s from daemon.json", k, v)
		}
	}
	// Load disabled_patrols from town settings (settings/config.json).
	// This provides a simpler way to disable patrols than editing daemon.json.
	disabledPatrols := loadDisabledPatrolsFromTownSettings(config.TownRoot)
	if len(disabledPatrols) > 0 {
		names := make([]string, 0, len(disabledPatrols))
		for k := range disabledPatrols {
			names = append(names, k)
		}
		logger.Printf("Patrols disabled via town settings: %v", names)
	}
	// Initialize Dolt server manager if configured
	var doltServer *DoltServerManager
	if patrolConfig != nil && patrolConfig.Patrols != nil && patrolConfig.Patrols.DoltServer != nil {
		doltServer = NewDoltServerManager(config.TownRoot, patrolConfig.Patrols.DoltServer, logger.Printf)
		if doltServer.IsEnabled() {
			logger.Printf("Dolt server management enabled (port %d)", patrolConfig.Patrols.DoltServer.Port)
			// Propagate Dolt connection info to process env so AgentEnv() passes it to
			// all spawned agent sessions. Without this, bd in agent sessions
			// auto-starts rogue Dolt instances or connects to localhost. (GH#2412)
			portStr := strconv.Itoa(patrolConfig.Patrols.DoltServer.Port)
			os.Setenv("GT_DOLT_PORT", portStr)
			os.Setenv("BEADS_DOLT_PORT", portStr)
			if patrolConfig.Patrols.DoltServer.Host != "" {
				os.Setenv("GT_DOLT_HOST", patrolConfig.Patrols.DoltServer.Host)
				os.Setenv("BEADS_DOLT_SERVER_HOST", patrolConfig.Patrols.DoltServer.Host)
			}
		}
	}
	// Fallback: if GT_DOLT_PORT still isn't set (no DoltServerManager, daemon
	// started independently of gt up), detect the port from dolt config.
	// This ensures AgentEnv() always has the port for spawned sessions. (GH#2412)
	if os.Getenv("GT_DOLT_PORT") == "" {
		doltCfg := doltserver.DefaultConfig(config.TownRoot)
		if doltCfg.Port > 0 {
			portStr := strconv.Itoa(doltCfg.Port)
			os.Setenv("GT_DOLT_PORT", portStr)
			os.Setenv("BEADS_DOLT_PORT", portStr)
			logger.Printf("Set GT_DOLT_PORT=%s from Dolt config (fallback)", portStr)
		}
	}
	// Propagate Dolt host to process env so bd doesn't fall back to 127.0.0.1
	// when the server runs on a remote machine (e.g., mini2 over Tailscale).
	if os.Getenv("BEADS_DOLT_SERVER_HOST") == "" {
		doltCfg := doltserver.DefaultConfig(config.TownRoot)
		if doltCfg.Host != "" {
			os.Setenv("BEADS_DOLT_SERVER_HOST", doltCfg.Host)
			logger.Printf("Set BEADS_DOLT_SERVER_HOST=%s from Dolt config", doltCfg.Host)
		}
	}
	// PATCH-006: Resolve binary paths at startup.
	// On lookup failure, fall back to the bare name so subprocesses still
	// resolve via PATH (best effort).
	gtPath, err := exec.LookPath("gt")
	if err != nil {
		gtPath = "gt"
		logger.Printf("Warning: gt not found in PATH, subprocess calls may fail")
	}
	bdPath, err := exec.LookPath("bd")
	if err != nil {
		bdPath = "bd"
		logger.Printf("Warning: bd not found in PATH, subprocess calls may fail")
	}
	// Initialize restart tracker with exponential backoff.
	// Parameters are configurable via patrols.restart_tracker in daemon.json.
	var rtCfg RestartTrackerConfig
	if patrolConfig != nil && patrolConfig.Patrols != nil && patrolConfig.Patrols.RestartTracker != nil {
		rtCfg = *patrolConfig.Patrols.RestartTracker
	}
	restartTracker := NewRestartTracker(config.TownRoot, rtCfg)
	if err := restartTracker.Load(); err != nil {
		logger.Printf("Warning: failed to load restart state: %v", err)
	}
	// Initialize OpenTelemetry (best-effort — telemetry failure never blocks startup).
	// Activate by setting GT_OTEL_METRICS_URL and/or GT_OTEL_LOGS_URL.
	otelProvider, otelErr := telemetry.Init(ctx, "gastown-daemon", "")
	if otelErr != nil {
		logger.Printf("Warning: telemetry init failed: %v", otelErr)
	}
	var dm *daemonMetrics
	if otelProvider != nil {
		dm, err = newDaemonMetrics()
		if err != nil {
			logger.Printf("Warning: failed to register daemon metrics: %v", err)
			dm = nil
		} else {
			metricsURL := os.Getenv(telemetry.EnvMetricsURL)
			if metricsURL == "" {
				metricsURL = telemetry.DefaultMetricsURL
			}
			logsURL := os.Getenv(telemetry.EnvLogsURL)
			if logsURL == "" {
				logsURL = telemetry.DefaultLogsURL
			}
			logger.Printf("Telemetry active (metrics → %s, logs → %s)",
				metricsURL, logsURL)
		}
	}
	return &Daemon{
		config:          config,
		patrolConfig:    patrolConfig,
		disabledPatrols: disabledPatrols,
		tmux:            tmux.NewTmux(),
		logger:          logger,
		ctx:             ctx,
		cancel:          cancel,
		doltServer:      doltServer,
		gtPath:          gtPath,
		bdPath:          bdPath,
		restartTracker:  restartTracker,
		otelProvider:    otelProvider,
		metrics:         dm,
	}, nil
}
// Run starts the daemon main loop.
//
// It acquires a process-wide file lock (single-daemon guarantee), writes the
// PID file, starts background subsystems (feed curator, convoy manager, KRC
// pruner) and the optional patrol tickers, then blocks in a select loop
// dispatching heartbeats, ticker work, and signals. Returns only on a
// shutdown signal or context cancellation, via d.shutdown.
func (d *Daemon) Run() error {
	d.logger.Printf("Daemon starting (PID %d)", os.Getpid())
	// Acquire exclusive lock to prevent multiple daemons from running.
	// This prevents the TOCTOU race condition where multiple concurrent starts
	// can all pass the IsRunning() check before any writes the PID file.
	// Uses gofrs/flock for cross-platform compatibility (Unix + Windows).
	lockFile := filepath.Join(d.config.TownRoot, "daemon", "daemon.lock")
	fileLock := flock.New(lockFile)
	// Try to acquire exclusive lock (non-blocking)
	locked, err := fileLock.TryLock()
	if err != nil {
		return fmt.Errorf("acquiring lock: %w", err)
	}
	if !locked {
		return fmt.Errorf("daemon already running (lock held by another process)")
	}
	defer func() { _ = fileLock.Unlock() }()
	// Pre-flight check: all rigs must be on Dolt backend.
	if err := d.checkAllRigsDolt(); err != nil {
		return err
	}
	// Repair metadata.json for all rigs on startup.
	// This ensures all rigs have proper Dolt server configuration.
	if _, errs := doltserver.EnsureAllMetadata(d.config.TownRoot); len(errs) > 0 {
		for _, e := range errs {
			d.logger.Printf("Warning: metadata repair: %v", e)
		}
	}
	// Write PID file with nonce for ownership verification
	if _, err := writePIDFile(d.config.PidFile, os.Getpid()); err != nil {
		return fmt.Errorf("writing PID file: %w", err)
	}
	defer func() { _ = os.Remove(d.config.PidFile) }() // best-effort cleanup
	// Update state
	state := &State{
		Running:   true,
		PID:       os.Getpid(),
		StartedAt: time.Now(),
	}
	if err := SaveState(d.config.TownRoot, state); err != nil {
		d.logger.Printf("Warning: failed to save state: %v", err)
	}
	// Handle signals
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, daemonSignals()...)
	// Fixed recovery-focused heartbeat (no activity-based backoff)
	// Normal wake is handled by feed subscription (bd activity --follow)
	timer := time.NewTimer(d.recoveryHeartbeatInterval())
	defer timer.Stop()
	d.logger.Printf("Daemon running, recovery heartbeat interval %v", d.recoveryHeartbeatInterval())
	// Start feed curator goroutine
	d.curator = feed.NewCurator(d.config.TownRoot)
	if err := d.curator.Start(); err != nil {
		d.logger.Printf("Warning: failed to start feed curator: %v", err)
	} else {
		d.logger.Println("Feed curator started")
	}
	// Start convoy manager (event-driven + periodic stranded scan)
	// Try opening beads stores eagerly; if Dolt isn't ready yet,
	// pass the opener as a callback for lazy retry on each poll tick.
	d.beadsStores = d.openBeadsStores()
	isRigParked := func(rigName string) bool {
		ok, _ := d.isRigOperational(rigName)
		return !ok
	}
	var storeOpener func() map[string]beadsdk.Storage
	if len(d.beadsStores) == 0 {
		storeOpener = d.openBeadsStores
	}
	d.convoyManager = NewConvoyManager(d.config.TownRoot, d.logger.Printf, d.gtPath, 0, d.beadsStores, storeOpener, isRigParked)
	if err := d.convoyManager.Start(); err != nil {
		d.logger.Printf("Warning: failed to start convoy manager: %v", err)
	} else {
		d.logger.Println("Convoy manager started")
	}
	// Wire a recovery callback so that when Dolt transitions from unhealthy
	// back to healthy, the convoy manager runs a sweep to catch any convoys
	// that completed during the outage and were missed by the event poller.
	if d.doltServer != nil {
		cm := d.convoyManager
		d.doltServer.SetRecoveryCallback(func() {
			d.logger.Printf("Dolt recovery detected: triggering convoy recovery sweep")
			cm.scan()
		})
	}
	// Start KRC pruner for automatic ephemeral data cleanup
	krcPruner, err := NewKRCPruner(d.config.TownRoot, d.logger.Printf)
	if err != nil {
		d.logger.Printf("Warning: failed to create KRC pruner: %v", err)
	} else {
		d.krcPruner = krcPruner
		if err := d.krcPruner.Start(); err != nil {
			d.logger.Printf("Warning: failed to start KRC pruner: %v", err)
		} else {
			d.logger.Println("KRC pruner started")
		}
	}
	// Patrol tickers below follow a common pattern: a channel variable stays
	// nil when the patrol is disabled, and a receive from a nil channel blocks
	// forever, so its case in the select loop simply never fires.
	//
	// Start dedicated Dolt health check ticker if Dolt server is configured.
	// This runs at a much higher frequency (default 30s) than the general
	// heartbeat (3 min) so Dolt crashes are detected quickly.
	var doltHealthTicker *time.Ticker
	var doltHealthChan <-chan time.Time
	if d.doltServer != nil && d.doltServer.IsEnabled() {
		interval := d.doltServer.HealthCheckInterval()
		doltHealthTicker = time.NewTicker(interval)
		doltHealthChan = doltHealthTicker.C
		defer doltHealthTicker.Stop()
		d.logger.Printf("Dolt health check ticker started (interval %v)", interval)
	}
	// Start dedicated Dolt remotes push ticker if configured.
	// This runs at a lower frequency (default 15 min) than the heartbeat (3 min)
	// to periodically push databases to their git remotes.
	var doltRemotesTicker *time.Ticker
	var doltRemotesChan <-chan time.Time
	if d.isPatrolActive("dolt_remotes") {
		interval := doltRemotesInterval(d.patrolConfig)
		doltRemotesTicker = time.NewTicker(interval)
		doltRemotesChan = doltRemotesTicker.C
		defer doltRemotesTicker.Stop()
		d.logger.Printf("Dolt remotes push ticker started (interval %v)", interval)
	}
	// Start dedicated Dolt backup ticker if configured.
	// Runs filesystem backup sync (dolt backup sync) for production databases.
	var doltBackupTicker *time.Ticker
	var doltBackupChan <-chan time.Time
	if d.isPatrolActive("dolt_backup") {
		interval := doltBackupInterval(d.patrolConfig)
		doltBackupTicker = time.NewTicker(interval)
		doltBackupChan = doltBackupTicker.C
		defer doltBackupTicker.Stop()
		d.logger.Printf("Dolt backup ticker started (interval %v)", interval)
	}
	// Start JSONL git backup ticker if configured.
	// Exports issues to JSONL, scrubs ephemeral data, pushes to git repo.
	var jsonlGitBackupTicker *time.Ticker
	var jsonlGitBackupChan <-chan time.Time
	if d.isPatrolActive("jsonl_git_backup") {
		interval := jsonlGitBackupInterval(d.patrolConfig)
		jsonlGitBackupTicker = time.NewTicker(interval)
		jsonlGitBackupChan = jsonlGitBackupTicker.C
		defer jsonlGitBackupTicker.Stop()
		d.logger.Printf("JSONL git backup ticker started (interval %v)", interval)
	}
	// Start wisp reaper ticker if configured.
	// Closes stale wisps (abandoned molecule steps, old patrol data) across all databases.
	var wispReaperTicker *time.Ticker
	var wispReaperChan <-chan time.Time
	if d.isPatrolActive("wisp_reaper") {
		interval := wispReaperInterval(d.patrolConfig)
		wispReaperTicker = time.NewTicker(interval)
		wispReaperChan = wispReaperTicker.C
		defer wispReaperTicker.Stop()
		d.logger.Printf("Wisp reaper ticker started (interval %v)", interval)
	}
	// Start doctor dog ticker if configured.
	// Health monitor: TCP check, latency, DB count, gc, zombie detection, backup/disk checks.
	var doctorDogTicker *time.Ticker
	var doctorDogChan <-chan time.Time
	if d.isPatrolActive("doctor_dog") {
		interval := doctorDogInterval(d.patrolConfig)
		doctorDogTicker = time.NewTicker(interval)
		doctorDogChan = doctorDogTicker.C
		defer doctorDogTicker.Stop()
		d.logger.Printf("Doctor dog ticker started (interval %v)", interval)
	}
	// Start compactor dog ticker if configured.
	// Flattens Dolt commit history to reclaim graph storage (daily).
	var compactorDogTicker *time.Ticker
	var compactorDogChan <-chan time.Time
	if d.isPatrolActive("compactor_dog") {
		interval := compactorDogInterval(d.patrolConfig)
		compactorDogTicker = time.NewTicker(interval)
		compactorDogChan = compactorDogTicker.C
		defer compactorDogTicker.Stop()
		d.logger.Printf("Compactor dog ticker started (interval %v)", interval)
	}
	// Start checkpoint dog ticker if configured.
	// Auto-commits WIP changes in active polecat worktrees to prevent data loss.
	var checkpointDogTicker *time.Ticker
	var checkpointDogChan <-chan time.Time
	if d.isPatrolActive("checkpoint_dog") {
		interval := checkpointDogInterval(d.patrolConfig)
		checkpointDogTicker = time.NewTicker(interval)
		checkpointDogChan = checkpointDogTicker.C
		defer checkpointDogTicker.Stop()
		d.logger.Printf("Checkpoint dog ticker started (interval %v)", interval)
	}
	// Start scheduled maintenance ticker if configured.
	// Checks periodically whether we're in the maintenance window and
	// runs `gt maintain --force` when commit counts exceed threshold.
	var scheduledMaintenanceTicker *time.Ticker
	var scheduledMaintenanceChan <-chan time.Time
	if d.isPatrolActive("scheduled_maintenance") {
		interval := maintenanceCheckInterval(d.patrolConfig)
		scheduledMaintenanceTicker = time.NewTicker(interval)
		scheduledMaintenanceChan = scheduledMaintenanceTicker.C
		defer scheduledMaintenanceTicker.Stop()
		window := maintenanceWindow(d.patrolConfig)
		d.logger.Printf("Scheduled maintenance ticker started (check interval %v, window %s)", interval, window)
	}
	// Start main-branch test runner ticker if configured.
	// Periodically runs quality gates on each rig's main branch to catch regressions.
	var mainBranchTestTicker *time.Ticker
	var mainBranchTestChan <-chan time.Time
	if d.isPatrolActive("main_branch_test") {
		interval := mainBranchTestInterval(d.patrolConfig)
		mainBranchTestTicker = time.NewTicker(interval)
		mainBranchTestChan = mainBranchTestTicker.C
		defer mainBranchTestTicker.Stop()
		d.logger.Printf("Main branch test ticker started (interval %v)", interval)
	}
	// Start quota dog ticker if configured.
	// Scans for rate-limited sessions and automatically rotates credentials.
	var quotaDogTicker *time.Ticker
	var quotaDogChan <-chan time.Time
	if d.isPatrolActive("quota_dog") {
		interval := quotaDogInterval(d.patrolConfig)
		quotaDogTicker = time.NewTicker(interval)
		quotaDogChan = quotaDogTicker.C
		defer quotaDogTicker.Stop()
		d.logger.Printf("Quota dog ticker started (interval %v)", interval)
	}
	// Note: PATCH-010 uses per-session hooks in deacon/manager.go (SetAutoRespawnHook).
	// Global pane-died hooks don't fire reliably in tmux 3.2a, so we rely on the
	// per-session approach which has been tested to work for continuous recovery.
	// Initial heartbeat
	d.heartbeat(state)
	for {
		select {
		case <-d.ctx.Done():
			d.logger.Println("Daemon context canceled, shutting down")
			return d.shutdown(state)
		case sig := <-sigChan:
			if isLifecycleSignal(sig) {
				// Lifecycle signal: immediate lifecycle processing (from gt handoff)
				d.logger.Println("Received lifecycle signal, processing lifecycle requests immediately")
				d.processLifecycleRequests()
			} else if isReloadRestartSignal(sig) {
				// Reload restart tracker from disk (from 'gt daemon clear-backoff')
				d.logger.Println("Received reload-restart signal, reloading restart tracker from disk")
				if d.restartTracker != nil {
					if err := d.restartTracker.Load(); err != nil {
						d.logger.Printf("Warning: failed to reload restart tracker: %v", err)
					}
				}
			} else {
				d.logger.Printf("Received signal %v, shutting down", sig)
				return d.shutdown(state)
			}
		case <-doltHealthChan:
			// Dedicated Dolt health check — fast crash detection independent
			// of the 3-minute general heartbeat.
			if !d.isShutdownInProgress() {
				d.ensureDoltServerRunning()
			}
		case <-doltRemotesChan:
			// Periodic Dolt remote push — pushes databases to their configured
			// git remotes on a 15-minute cadence (independent of heartbeat).
			if !d.isShutdownInProgress() {
				d.pushDoltRemotes()
			}
		case <-doltBackupChan:
			// Periodic Dolt filesystem backup — syncs production databases to
			// local backup directory on a 15-minute cadence.
			if !d.isShutdownInProgress() {
				d.syncDoltBackups()
			}
		case <-jsonlGitBackupChan:
			// Periodic JSONL git backup — exports issues, scrubs ephemeral data,
			// commits and pushes to git repo.
			if !d.isShutdownInProgress() {
				d.syncJsonlGitBackup()
			}
		case <-wispReaperChan:
			// Periodic wisp reaper — closes stale wisps (abandoned molecule steps,
			// old patrol data) to prevent unbounded table growth (Clown Show audit).
			if !d.isShutdownInProgress() {
				d.reapWisps()
			}
		case <-doctorDogChan:
			// Doctor dog — comprehensive Dolt health monitor: connectivity, latency,
			// gc, zombie detection, backup staleness, and disk usage checks.
			if !d.isShutdownInProgress() {
				d.runDoctorDog()
			}
		case <-compactorDogChan:
			// Compactor dog — flattens Dolt commit history on production databases.
			// Reclaims commit graph storage, then runs gc to reclaim chunks.
			if !d.isShutdownInProgress() {
				d.runCompactorDog()
			}
		case <-checkpointDogChan:
			// Checkpoint dog — auto-commits WIP changes in active polecat
			// worktrees to prevent data loss from session crashes.
			if !d.isShutdownInProgress() {
				d.runCheckpointDog()
			}
		case <-scheduledMaintenanceChan:
			// Scheduled maintenance — checks if we're in the maintenance window
			// and runs `gt maintain --force` when commit counts exceed threshold.
			if !d.isShutdownInProgress() {
				d.runScheduledMaintenance()
			}
		case <-mainBranchTestChan:
			// Main branch test runner — periodically runs quality gates on each
			// rig's main branch to catch regressions from merges or direct pushes.
			if !d.isShutdownInProgress() {
				d.runMainBranchTests()
			}
		case <-quotaDogChan:
			// Quota dog — scans for rate-limited sessions and automatically
			// rotates credentials to available accounts via keychain swap.
			if !d.isShutdownInProgress() {
				d.runQuotaDog()
			}
		case <-timer.C:
			d.heartbeat(state)
			// Fixed recovery interval (no activity-based backoff)
			timer.Reset(d.recoveryHeartbeatInterval())
		}
	}
}
// recoveryHeartbeatInterval returns the config-driven recovery heartbeat interval.
// Normal wake is handled by feed subscription (bd activity --follow); the
// daemon heartbeat is a safety net for dead sessions, GUPP violations, and
// orphaned work. Default: 3 minutes — fast enough to detect stuck agents promptly.
func (d *Daemon) recoveryHeartbeatInterval() time.Duration {
	opCfg := d.loadOperationalConfig()
	daemonCfg := opCfg.GetDaemonConfig()
	return daemonCfg.RecoveryHeartbeatIntervalD()
}
// heartbeat performs one heartbeat cycle.
// The daemon is recovery-focused: it ensures agents are running and detects failures.
// Normal wake is handled by feed subscription (bd activity --follow).
// The daemon is the safety net for edge cases:
// - Dead sessions that need restart
// - Agents with work-on-hook not progressing (GUPP violation)
// - Orphaned work (assigned to dead agents)
func (d *Daemon) heartbeat(state *State) {
// Skip heartbeat if shutdown is in progress.
// This prevents the daemon from fighting shutdown by auto-restarting killed agents.
// The shutdown.lock file is created by gt down before terminating sessions.
if d.isShutdownInProgress() {
d.logger.Println("Shutdown in progress, skipping heartbeat")
return
}
d.metrics.recordHeartbeat(d.ctx)
d.logger.Println("Heartbeat starting (recovery-focused)")
// 0a. Reload prefix registry so new/changed rigs get correct session names.
// Without this, rigs added after daemon startup get the "gt" default prefix,
// causing ghost sessions like gt-witness instead of ti-witness. (hq-ouz, hq-eqf, hq-3i4)
if err := session.InitRegistry(d.config.TownRoot); err != nil {
d.logger.Printf("Warning: failed to reload prefix registry: %v", err)
}
// 0b. Kill ghost sessions left over from stale registry (default "gt" prefix).
d.killDefaultPrefixGhosts()
// 0. Ensure Dolt server is running (if configured)
// This must happen before beads operations that depend on Dolt.
d.ensureDoltServerRunning()
// 1. Ensure Deacon is running (restart if dead)
// Check patrol config - can be disabled in mayor/daemon.json
if d.isPatrolActive("deacon") {
d.ensureDeaconRunning()
} else {
d.logger.Printf("Deacon patrol disabled in config, skipping")
// Kill leftover deacon/boot sessions from before patrol was disabled.
// Without this, a stale deacon keeps running its own patrol loop,
// spawning witnesses and refineries despite daemon config. (hq-2mstj)
d.killDeaconSessions()
}
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
// Boot handles nuanced "is Deacon responsive" decisions
// Only run if Deacon patrol is enabled
if d.isPatrolActive("deacon") {
d.ensureBootRunning()
}
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
// Boot may not detect all stuck states; this provides a fallback
// Only run if Deacon patrol is enabled
if d.isPatrolActive("deacon") {
d.checkDeaconHeartbeat()
}
// 4. Ensure Witnesses are running for all rigs (restart if dead)
// Check patrol config - can be disabled in mayor/daemon.json
if d.isPatrolActive("witness") {
d.ensureWitnessesRunning()
} else {
d.logger.Printf("Witness patrol disabled in config, skipping")
// Kill leftover witness sessions from before patrol was disabled. (hq-2mstj)
d.killWitnessSessions()
}
// 5. Ensure Refineries are running for all rigs (restart if dead)
// Check patrol config - can be disabled in mayor/daemon.json
// Pressure-gated: refineries consume API credits, defer when system is loaded.
if d.isPatrolActive("refinery") {
if p := d.checkPressure("refinery"); !p.OK {
d.logger.Printf("Deferring refinery spawn: %s", p.Reason)
} else {
d.ensureRefineriesRunning()
}
} else {
d.logger.Printf("Refinery patrol disabled in config, skipping")
// Kill leftover refinery sessions from before patrol was disabled. (hq-2mstj)
d.killRefinerySessions()
}
// 6. Ensure Mayor is running (restart if dead)
d.ensureMayorRunning()
// 6.5. Handle Dog lifecycle: cleanup stuck dogs and dispatch plugins
// Pressure-gated: dog dispatch spawns new agent sessions.
if d.isPatrolActive("handler") {
if p := d.checkPressure("dog"); !p.OK {
d.logger.Printf("Deferring dog dispatch: %s", p.Reason)
// Still run cleanup phases (stuck/stale/idle) — only skip dispatch
d.handleDogsCleanupOnly()
} else {
d.handleDogs()
}
} else {
d.logger.Printf("Handler patrol disabled in config, skipping")
}
// 7. Process lifecycle requests
d.processLifecycleRequests()
// 9. (Removed) Stale agent check - violated "discover, don't track"
// 10. Check for GUPP violations (agents with work-on-hook not progressing)
d.checkGUPPViolations()
// 11. Check for orphaned work (assigned to dead agents)
d.checkOrphanedWork()
// 12. Check polecat session health (proactive crash detection)
// This validates tmux sessions are still alive for polecats with work-on-hook
d.checkPolecatSessionHealth()
// 12b. Reap idle polecat sessions to prevent API slot burn.
// Polecats transition to IDLE after gt done but sessions stay alive.
// Kill sessions that have been idle longer than the configured threshold.
d.reapIdlePolecats()
// 13. Clean up orphaned claude subagent processes (memory leak prevention)
// These are Task tool subagents that didn't clean up after completion.
// This is a safety net - Deacon patrol also does this more frequently.
d.cleanupOrphanedProcesses()
// 13b. Prune stale local polecat tracking branches across all rig clones.
// When polecats push branches to origin, other clones create local tracking
// branches via git fetch. After merge, remote branches are deleted but local
// branches persist indefinitely. This cleans them up periodically.
d.pruneStaleBranches()
// 14. Dispatch scheduled work (capacity-controlled polecat dispatch).
// Shells out to `gt scheduler run` to avoid circular import between daemon and cmd.
// Pressure-gated: polecats are the primary resource consumers.
if p := d.checkPressure("polecat"); !p.OK {
d.logger.Printf("Deferring polecat dispatch: %s", p.Reason)
} else {
d.dispatchQueuedWork()
}
// 15. Rotate oversized Dolt logs (copytruncate for child process fds).
// daemon.log uses lumberjack for automatic rotation; this handles Dolt server logs.
d.rotateOversizedLogs()
// Update state
state.LastHeartbeat = time.Now()
state.HeartbeatCount++
if err := SaveState(d.config.TownRoot, state); err != nil {
d.logger.Printf("Warning: failed to save state: %v", err)
}
d.logger.Printf("Heartbeat complete (#%d)", state.HeartbeatCount)
}
// rotateOversizedLogs scans the Dolt server log files and rotates any that
// exceed the size threshold. Rotation is copytruncate-style, which is safe
// for logs held open by child processes. Invoked every heartbeat; the cost
// is just a handful of stat calls.
func (d *Daemon) rotateOversizedLogs() {
	res := RotateLogs(d.config.TownRoot)
	for _, rotatedPath := range res.Rotated {
		d.logger.Printf("log_rotation: rotated %s", rotatedPath)
	}
	for _, rotationErr := range res.Errors {
		d.logger.Printf("log_rotation: error: %v", rotationErr)
	}
}
// ensureDoltServerRunning ensures the Dolt SQL server is running if configured.
// This provides the backend for beads database access in server mode.
// Option B throttling: a mol-dog-doctor molecule is poured only when the
// health check reports warnings, rate-limited by a cooldown so a flapping
// server does not generate wisp spam.
func (d *Daemon) ensureDoltServerRunning() {
	srv := d.doltServer
	if srv == nil || !srv.IsEnabled() {
		return
	}
	if err := srv.EnsureRunning(); err != nil {
		d.logger.Printf("Error ensuring Dolt server is running: %v", err)
	}

	// Option B throttling: pour mol-dog-doctor only on anomaly with cooldown.
	warnings := srv.LastWarnings()
	if len(warnings) > 0 && time.Since(d.lastDoctorMolTime) >= doctorMolCooldown {
		d.lastDoctorMolTime = time.Now()
		go d.pourDoctorMolecule(warnings)
	}

	// Push the latest Dolt health snapshot into the OTel gauges.
	if d.metrics == nil {
		return
	}
	health := doltserver.GetHealthMetrics(d.config.TownRoot)
	d.metrics.updateDoltHealth(
		int64(health.Connections),
		int64(health.MaxConnections),
		float64(health.QueryLatency.Milliseconds()),
		health.DiskUsageBytes,
		health.Healthy,
	)
}
// pourDoctorMolecule creates a mol-dog-doctor molecule to track a health anomaly.
// Runs asynchronously — molecule lifecycle is observability, not control flow.
func (d *Daemon) pourDoctorMolecule(warnings []string) {
	mol := d.pourDogMolecule(constants.MolDogDoctor, map[string]string{
		"port": strconv.Itoa(d.doltServer.config.Port),
	})
	defer mol.close()

	// Steps 1-2: probe (connectivity already passed — that's how we got here)
	// and inspect (the resource checks that produced these warnings).
	for _, step := range []string{"probe", "inspect"} {
		mol.closeStep(step)
	}

	// Step 3: report — log the warning summary.
	d.logger.Printf("Doctor molecule: %d warning(s): %s",
		len(warnings), strings.Join(warnings, "; "))
	mol.closeStep("report")
}
// checkAllRigsDolt verifies all rigs are using the Dolt backend.
// It inspects the town-level beads store plus every registered rig and
// returns one aggregated error naming each offender, or nil when all are
// on Dolt.
func (d *Daemon) checkAllRigsDolt() error {
	var problems []string
	// flag records one non-Dolt beads store with remediation instructions.
	flag := func(label, backend, path string) {
		problems = append(problems, fmt.Sprintf(
			"Rig %q is using %s backend.\n Gas Town requires Dolt. Run: cd %s && bd migrate dolt",
			label, backend, path))
	}

	// Town-level beads store.
	if backend := readBeadsBackend(filepath.Join(d.config.TownRoot, ".beads")); backend != "" && backend != "dolt" {
		flag("town-root", backend, d.config.TownRoot)
	}

	// Each registered rig's beads store.
	for _, rigName := range d.getKnownRigs() {
		beadsDir := filepath.Join(d.config.TownRoot, rigName, "mayor", "rig", ".beads")
		if backend := readBeadsBackend(beadsDir); backend != "" && backend != "dolt" {
			flag(rigName, backend, filepath.Join(d.config.TownRoot, rigName))
		}
	}

	if len(problems) == 0 {
		return nil
	}
	return fmt.Errorf("daemon startup blocked: %d rig(s) not on Dolt backend\n\n %s",
		len(problems), strings.Join(problems, "\n\n "))
}
// readBeadsBackend reads the backend field from metadata.json in a beads directory.
// Returns the empty string when the directory or metadata file is missing, or
// when the file does not parse as JSON.
func readBeadsBackend(beadsDir string) string {
	raw, err := os.ReadFile(filepath.Join(beadsDir, "metadata.json"))
	if err != nil {
		return ""
	}
	var meta struct {
		Backend string `json:"backend"`
	}
	if unmarshalErr := json.Unmarshal(raw, &meta); unmarshalErr != nil {
		return ""
	}
	return meta.Backend
}
// DeaconRole is the role name for the Deacon's handoff bead.
// Used wherever a role identifier string is required for the Deacon.
const DeaconRole = "deacon"
// getDeaconSessionName returns the Deacon session name for the daemon's town,
// delegating to the session package's canonical naming helper.
func (d *Daemon) getDeaconSessionName() string {
	name := session.DeaconSessionName()
	return name
}
// bootSpawnCooldown returns the config-driven boot spawn cooldown.
// Boot triage runs are expensive (AI reasoning); if one just ran, skip.
func (d *Daemon) bootSpawnCooldown() time.Duration {
	return d.loadOperationalConfig().GetDaemonConfig().BootSpawnCooldownD()
}

// ensureBootRunning spawns Boot to triage the Deacon.
// Boot is a fresh-each-tick watchdog that decides whether to start/wake/nudge
// the Deacon, centralizing the "when to wake" decision in an agent.
// In degraded mode (no tmux), falls back to mechanical checks.
//
// NOTE: this doc comment previously sat above bootSpawnCooldown's comment,
// so godoc attached it to the wrong declaration; it now precedes its function.
func (d *Daemon) ensureBootRunning() {
	// Cooldown gate: skip if Boot was spawned recently (fixes #2084)
	if !d.bootLastSpawned.IsZero() && time.Since(d.bootLastSpawned) < d.bootSpawnCooldown() {
		d.logger.Printf("Boot spawned %s ago, within cooldown (%s), skipping",
			time.Since(d.bootLastSpawned).Round(time.Second), d.bootSpawnCooldown())
		return
	}
	b := boot.New(d.config.TownRoot)

	// Check for degraded mode
	degraded := os.Getenv("GT_DEGRADED") == "true"
	if degraded || !d.tmux.IsAvailable() {
		// In degraded mode, run mechanical triage directly
		d.logger.Println("Degraded mode: running mechanical Boot triage")
		d.runDegradedBootTriage(b)
		return
	}

	// Spawn Boot in a fresh tmux session
	d.logger.Println("Spawning Boot for triage...")
	if err := b.Spawn(""); err != nil {
		d.logger.Printf("Error spawning Boot: %v, falling back to direct Deacon check", err)
		// Fallback: ensure Deacon is running directly
		d.ensureDeaconRunning()
		return
	}
	d.bootLastSpawned = time.Now()
	d.logger.Println("Boot spawned successfully")
}
// runDegradedBootTriage performs mechanical Boot logic without AI reasoning.
// This is for degraded mode when tmux is unavailable.
func (d *Daemon) runDegradedBootTriage(b *boot.Boot) {