Skip to content

Commit af83e8a

Browse files
committed
fix(runner): drop dead-on-arrival toolbox HTTP readiness check
waitForToolboxReady was introduced in fc88aa0 (2026-06-05, "feat: add agent-ready runtime catalog"), which then landed on main via PR #715 on 2026-06-10. It HTTP-polls http://127.0.0.1:<hostPort>/version, expecting a daemon on guest TCP port 2280 — an interface that dates from the Daytona-daemon era and was never reimplemented after the Rust guest agent rewrite landed in dbb11ec (2026-04-01). The new agent binds **vsock://2695** for gRPC and notifies the host via vsock://2696; nothing inside the VM listens on TCP:2280, so libkrun's port-forward accepts the SYN and then immediately resets the connection ("connection reset by peer"), and every CREATE_BOX times out after 30 s. Production data from a Tokyo runner: in 24 h, 490 CREATE_BOX events, 0 toolbox-ready successes, 181 toolbox-ready failures. The exec path that fires immediately afterward (via the same vsock gRPC channel) **always succeeds** — confirming that the box VM is healthy and that the readiness check itself is the bug. Remove the dead probe: drop the waitForToolboxReady calls from Create and Start in client.go, drop the function and its TCP/HTTP imports, drop the toolboxReadyTimeout field, drop the ToolboxReadyTimeout/DaemonStartTimeoutSec config plumbing in main.go and config.go, and drop the two now-unreachable tests. Box readiness is now signalled by bx.Start(ctx) returning (which itself blocks on the vsock notification from the guest). Branched off chore/e2e-required-merge-gate (PR #724) so the e2e-cloud stack picks this up on the next dispatch.
1 parent 31f0de0 commit af83e8a

5 files changed

Lines changed: 0 additions & 128 deletions

File tree

apps/runner/cmd/runner/config/config.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ type Config struct {
3737
AWSSecretAccessKey string `envconfig:"AWS_SECRET_ACCESS_KEY"`
3838
AWSDefaultBucket string `envconfig:"AWS_DEFAULT_BUCKET"`
3939
ResourceLimitsDisabled bool `envconfig:"RESOURCE_LIMITS_DISABLED"`
40-
DaemonStartTimeoutSec int `envconfig:"DAEMON_START_TIMEOUT_SEC"`
4140
BoxStartTimeoutSec int `envconfig:"BOX_START_TIMEOUT_SEC"`
4241
UseSnapshotEntrypoint bool `envconfig:"USE_SNAPSHOT_ENTRYPOINT"`
4342
Domain string `envconfig:"RUNNER_DOMAIN" validate:"omitempty,hostname|ip"`

apps/runner/cmd/runner/main.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ func run() int {
118118
VolumeCleanupInterval: cfg.VolumeCleanupInterval,
119119
VolumeCleanupDryRun: cfg.VolumeCleanupDryRun,
120120
VolumeCleanupExclusionPeriod: cfg.VolumeCleanupExclusionPeriod,
121-
ToolboxReadyTimeout: time.Duration(cfg.DaemonStartTimeoutSec) * time.Second,
122121
})
123122
if err != nil {
124123
logger.Error("Error creating BoxLite client", "error", err)

apps/runner/pkg/boxlite/client.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ type Client struct {
3737
volumeMutexesMutex sync.Mutex
3838
volumeCleanupMutex sync.Mutex
3939
toolboxPortMutex sync.Mutex
40-
toolboxReadyTimeout time.Duration
4140
lastVolumeCleanup time.Time
4241
volumeCleanup volumeCleanupConfig
4342
}
@@ -56,7 +55,6 @@ type ClientConfig struct {
5655
VolumeCleanupInterval time.Duration
5756
VolumeCleanupDryRun bool
5857
VolumeCleanupExclusionPeriod time.Duration
59-
ToolboxReadyTimeout time.Duration
6058
}
6159

6260
func networkSpec(blockAll *bool, allowList *string) boxlite.NetworkSpec {
@@ -136,11 +134,6 @@ func buildImageRegistries(insecureRegistries []string, ghcrUsername, ghcrToken s
136134

137135
// NewClient creates a new BoxLite client backed by the BoxLite VM runtime.
138136
func NewClient(ctx context.Context, config ClientConfig) (*Client, error) {
139-
toolboxReadyTimeout := config.ToolboxReadyTimeout
140-
if toolboxReadyTimeout <= 0 {
141-
toolboxReadyTimeout = 30 * time.Second
142-
}
143-
144137
var opts []boxlite.RuntimeOption
145138
if config.HomeDir != "" {
146139
opts = append(opts, boxlite.WithHomeDir(config.HomeDir))
@@ -171,7 +164,6 @@ func NewClient(ctx context.Context, config ClientConfig) (*Client, error) {
171164
awsAccessKeyId: config.AWSAccessKeyId,
172165
awsSecretAccessKey: config.AWSSecretAccessKey,
173166
volumeMutexes: make(map[string]*sync.Mutex),
174-
toolboxReadyTimeout: toolboxReadyTimeout,
175167
volumeCleanup: volumeCleanupConfig{
176168
interval: config.VolumeCleanupInterval,
177169
dryRun: config.VolumeCleanupDryRun,
@@ -305,9 +297,6 @@ func (c *Client) Create(ctx context.Context, boxDto dto.CreateBoxDTO) (string, s
305297
if err := bx.Start(ctx); err != nil {
306298
return bx.ID(), "", fmt.Errorf("failed to start box: %w", err)
307299
}
308-
if err := c.waitForToolboxReady(ctx, boxDto.Id); err != nil {
309-
return bx.ID(), "", err
310-
}
311300
}
312301

313302
return bx.ID(), "boxlite", nil
@@ -326,9 +315,6 @@ func (c *Client) Start(ctx context.Context, boxId string, authToken *string, met
326315
if err := bx.Start(ctx); err != nil {
327316
return "", err
328317
}
329-
if err := c.waitForToolboxReady(ctx, boxId); err != nil {
330-
return "", err
331-
}
332318
return "boxlite", nil
333319
}
334320

apps/runner/pkg/boxlite/toolbox_ports.go

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,11 @@ import (
77
"context"
88
"encoding/json"
99
"fmt"
10-
"io"
1110
"net"
12-
"net/http"
1311
"os"
1412
"path/filepath"
1513
"strconv"
1614
"strings"
17-
"time"
1815
)
1916

2017
const (
@@ -62,55 +59,6 @@ func (c *Client) ToolboxHostPort(boxID string) (int, error) {
6259
return c.readToolboxHostPort(boxID)
6360
}
6461

65-
func (c *Client) waitForToolboxReady(ctx context.Context, boxID string) error {
66-
hostPort, err := c.ToolboxHostPort(boxID)
67-
if err != nil {
68-
return fmt.Errorf("toolbox host port not available for box %s: %w", boxID, err)
69-
}
70-
71-
timeout := c.toolboxReadyTimeout
72-
if timeout <= 0 {
73-
timeout = 30 * time.Second
74-
}
75-
readyCtx, cancel := context.WithTimeout(ctx, timeout)
76-
defer cancel()
77-
78-
url := fmt.Sprintf("http://127.0.0.1:%d/version", hostPort)
79-
client := http.Client{Timeout: time.Second}
80-
ticker := time.NewTicker(200 * time.Millisecond)
81-
defer ticker.Stop()
82-
83-
var lastErr error
84-
for {
85-
req, reqErr := http.NewRequestWithContext(readyCtx, http.MethodGet, url, nil)
86-
if reqErr != nil {
87-
return reqErr
88-
}
89-
90-
resp, reqErr := client.Do(req)
91-
if reqErr == nil {
92-
_, _ = io.Copy(io.Discard, resp.Body)
93-
_ = resp.Body.Close()
94-
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
95-
c.logger.InfoContext(ctx, "box toolbox is ready", "box", boxID, "hostPort", hostPort)
96-
return nil
97-
}
98-
lastErr = fmt.Errorf("unexpected status %d from %s", resp.StatusCode, url)
99-
} else {
100-
lastErr = reqErr
101-
}
102-
103-
select {
104-
case <-readyCtx.Done():
105-
if lastErr != nil {
106-
return fmt.Errorf("box toolbox not ready after %s (box=%s hostPort=%d): %w", timeout, boxID, hostPort, lastErr)
107-
}
108-
return fmt.Errorf("box toolbox not ready after %s (box=%s hostPort=%d)", timeout, boxID, hostPort)
109-
case <-ticker.C:
110-
}
111-
}
112-
}
113-
11462
func (c *Client) removeToolboxPortRecord(ctx context.Context, boxID string) error {
11563
c.toolboxPortMutex.Lock()
11664
defer c.toolboxPortMutex.Unlock()

apps/runner/pkg/boxlite/toolbox_ports_test.go

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@ package boxlite
66
import (
77
"io"
88
"log/slog"
9-
"net"
10-
"net/http"
119
"testing"
12-
"time"
1310
)
1411

1512
func TestReserveToolboxHostPortPersistsRecord(t *testing.T) {
@@ -55,60 +52,3 @@ func TestRemoveToolboxPortRecord(t *testing.T) {
5552
}
5653
}
5754

58-
func TestWaitForToolboxReadyReturnsAfterVersionEndpointResponds(t *testing.T) {
59-
listener, err := net.Listen("tcp", "127.0.0.1:0")
60-
if err != nil {
61-
t.Fatalf("listen: %v", err)
62-
}
63-
mux := http.NewServeMux()
64-
mux.HandleFunc("/version", func(w http.ResponseWriter, _ *http.Request) {
65-
w.WriteHeader(http.StatusOK)
66-
_, _ = w.Write([]byte(`{"version":"test"}`))
67-
})
68-
server := &http.Server{Handler: mux}
69-
go func() {
70-
_ = server.Serve(listener)
71-
}()
72-
defer server.Close()
73-
74-
port := listener.Addr().(*net.TCPAddr).Port
75-
client := &Client{
76-
homeDir: t.TempDir(),
77-
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
78-
toolboxReadyTimeout: time.Second,
79-
}
80-
if err := client.writeToolboxPortRecord(toolboxPortRecord{
81-
BoxID: "box-1",
82-
GuestPort: ToolboxGuestPort,
83-
HostPort: port,
84-
}); err != nil {
85-
t.Fatalf("writeToolboxPortRecord: %v", err)
86-
}
87-
88-
if err := client.waitForToolboxReady(t.Context(), "box-1"); err != nil {
89-
t.Fatalf("waitForToolboxReady: %v", err)
90-
}
91-
}
92-
93-
func TestWaitForToolboxReadyTimesOutWhenEndpointDoesNotRespond(t *testing.T) {
94-
port, err := findAvailableLocalPort()
95-
if err != nil {
96-
t.Fatalf("findAvailableLocalPort: %v", err)
97-
}
98-
client := &Client{
99-
homeDir: t.TempDir(),
100-
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
101-
toolboxReadyTimeout: 20 * time.Millisecond,
102-
}
103-
if err := client.writeToolboxPortRecord(toolboxPortRecord{
104-
BoxID: "box-1",
105-
GuestPort: ToolboxGuestPort,
106-
HostPort: port,
107-
}); err != nil {
108-
t.Fatalf("writeToolboxPortRecord: %v", err)
109-
}
110-
111-
if err := client.waitForToolboxReady(t.Context(), "box-1"); err == nil {
112-
t.Fatal("expected waitForToolboxReady to time out")
113-
}
114-
}

0 commit comments

Comments
 (0)