Skip to content

Commit 1ef92a1

Browse files
authored
fix(worker-sizing): no-sizing request must return the nil default profile (#693)
Regression from #692. resolveWorkerProfile resolved EVERY request to a concrete profile (e.g. "8|16Gi|false"), including no-GUC / gate-off traffic. On a deploy with SHARED_WARM_TARGET>0 the concrete-default key never matches a neutral warm worker's key ("||false"), so default connections can neither reuse a warm worker nor be served by warm replenishment (which spawns neutral workers) — they fail with "no warm Duckgres worker is currently available". Confirmed on mw-dev: default connections timed out and the warm pool churned neutral workers. Fix forward: return nil (the default sentinel that matches the neutral warm pool) when client sizing is gated off OR no worker_cpu/worker_memory/worker_ttl GUC is set — restoring pre-#692 default behavior exactly. A concrete profile is returned only when the client explicitly opts in (gate on + a sizing GUC). The default worker SIZE continues to come from the pool-global WorkerCPURequest/MemoryRequest. Concrete sizing only becomes fully functional once the warm pool is removed (docs/design/worker-ttl-pool.md); until then it is opt-in via the gate. Regression guard: TestResolveWorkerProfileSizing_NoSizingIsNil. The per-PR e2e runs SHARED_WARM_TARGET=0 so cannot reproduce this; the standing deploy runs >0.
1 parent 1edabc8 commit 1ef92a1

2 files changed

Lines changed: 90 additions & 54 deletions

File tree

controlplane/worker_profile.go

Lines changed: 55 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -25,61 +25,75 @@ const (
2525
)
2626

2727
// resolveWorkerProfile turns the client's worker_cpu / worker_memory / worker_ttl
28-
// startup options into a concrete *WorkerProfile (a worker size + hot-idle TTL).
28+
// startup options into a *WorkerProfile (a worker size + hot-idle TTL), or nil.
2929
//
30-
// Every request resolves to a concrete size: the deployment default
31-
// (WorkerCPURequest / WorkerMemoryRequest, defaultWorkerTTL) when a field is
32-
// unset or client sizing is gated off, otherwise the clamped client value.
33-
// Client cpu/mem are clamped to [min,max] and ttl to [0,maxTTL]; an out-of-range
34-
// value is clamped with a warning, an unparseable/negative one errors (rejecting
35-
// the connection). Returns (profile, warns, nil) on success.
30+
// - (nil, nil, nil) => the DEFAULT profile: gate off, or no sizing GUC set.
31+
// nil is the sentinel that matches the neutral/default worker pool (the warm
32+
// pool spawns neutral workers with no profile), so default traffic reuses
33+
// warm workers exactly as before. The default worker SIZE then comes from the
34+
// pool-global WorkerCPURequest/MemoryRequest, not from this profile.
35+
// - (profile, warns, nil) => the client explicitly requested a size/ttl. cpu/mem
36+
// default to the pool-global request (else built-in 8/16Gi) for any field the
37+
// client omitted, clamped to [min,max]; ttl to [0,WorkerMaxTTL], default 20m.
38+
// - (nil, nil, err) => an unparseable/negative value (rejects the connection).
39+
//
40+
// IMPORTANT: a no-sizing request MUST return nil, not a concrete-default profile.
41+
// Returning concrete here regresses warm-pool reuse — the concrete key (e.g.
42+
// "8|16Gi|false") never matches a neutral warm worker's key ("||false"), so the
43+
// request can neither reuse a warm worker nor be served by warm replenishment
44+
// (which spawns neutral workers), and fails with "no warm worker available".
45+
// Concrete sizing only becomes fully functional once the warm pool is removed
46+
// (see docs/design/worker-ttl-pool.md) — until then it is opt-in via the gate.
3647
func (cp *ControlPlane) resolveWorkerProfile(opts map[string]string) (*WorkerProfile, []string, error) {
3748
k := cp.cfg.K8s
49+
if !k.AllowClientWorkerProfile {
50+
return nil, nil, nil
51+
}
52+
53+
rawCPU := strings.TrimSpace(opts[gucWorkerCPU])
54+
rawMem := strings.TrimSpace(opts[gucWorkerMemory])
55+
rawTTL := strings.TrimSpace(opts[gucWorkerTTL])
56+
if rawCPU == "" && rawMem == "" && rawTTL == "" {
57+
return nil, nil, nil // no client sizing -> default (nil) profile
58+
}
3859

3960
cpu := firstNonEmpty(strings.TrimSpace(k.WorkerCPURequest), defaultWorkerCPU)
4061
mem := firstNonEmpty(strings.TrimSpace(k.WorkerMemoryRequest), defaultWorkerMemory)
4162
ttl := defaultWorkerTTL
4263

4364
var warns []string
44-
if k.AllowClientWorkerProfile {
45-
rawCPU := strings.TrimSpace(opts[gucWorkerCPU])
46-
rawMem := strings.TrimSpace(opts[gucWorkerMemory])
47-
rawTTL := strings.TrimSpace(opts[gucWorkerTTL])
48-
49-
if rawCPU != "" {
50-
v, warn, err := sizeField(gucWorkerCPU, rawCPU, k.WorkerProfileMinCPU, k.WorkerProfileMaxCPU)
51-
if err != nil {
52-
return nil, nil, err
53-
}
54-
cpu = v
55-
if warn != "" {
56-
warns = append(warns, warn)
57-
}
65+
if rawCPU != "" {
66+
v, warn, err := sizeField(gucWorkerCPU, rawCPU, k.WorkerProfileMinCPU, k.WorkerProfileMaxCPU)
67+
if err != nil {
68+
return nil, nil, err
5869
}
59-
if rawMem != "" {
60-
v, warn, err := sizeField(gucWorkerMemory, rawMem, k.WorkerProfileMinMemory, k.WorkerProfileMaxMemory)
61-
if err != nil {
62-
return nil, nil, err
63-
}
64-
mem = v
65-
if warn != "" {
66-
warns = append(warns, warn)
67-
}
70+
cpu = v
71+
if warn != "" {
72+
warns = append(warns, warn)
73+
}
74+
}
75+
if rawMem != "" {
76+
v, warn, err := sizeField(gucWorkerMemory, rawMem, k.WorkerProfileMinMemory, k.WorkerProfileMaxMemory)
77+
if err != nil {
78+
return nil, nil, err
79+
}
80+
mem = v
81+
if warn != "" {
82+
warns = append(warns, warn)
83+
}
84+
}
85+
if rawTTL != "" {
86+
v, warn, err := ttlField(rawTTL, k.WorkerMaxTTL)
87+
if err != nil {
88+
return nil, nil, err
6889
}
69-
if rawTTL != "" {
70-
v, warn, err := ttlField(rawTTL, k.WorkerMaxTTL)
71-
if err != nil {
72-
return nil, nil, err
73-
}
74-
ttl = v
75-
if warn != "" {
76-
warns = append(warns, warn)
77-
}
90+
ttl = v
91+
if warn != "" {
92+
warns = append(warns, warn)
7893
}
7994
}
8095

81-
// Normalize the (possibly default) sizes so reuse-matching is canonical
82-
// (e.g. "16Gi" vs "16384Mi" compare equal once normalized).
96+
// Normalize the sizes so reuse-matching is canonical ("16Gi" == "16384Mi").
8397
cpuN, _, err := sizeField(gucWorkerCPU, cpu, "", "")
8498
if err != nil {
8599
return nil, nil, fmt.Errorf("invalid default worker cpu: %w", err)

controlplane/worker_profile_test.go

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,16 @@ func TestResolveWorkerProfileSizing(t *testing.T) {
2525
tests := []struct {
2626
name string
2727
opts map[string]string
28-
wantKey string // CPU|Memory|Colocate
28+
wantNil bool // expect the default (nil) profile — matches the neutral warm pool
29+
wantKey string // CPU|Memory|Colocate (only when !wantNil)
2930
wantTTL time.Duration
3031
wantErr bool
3132
wantWarn bool
3233
}{
33-
{name: "no opts -> defaults", opts: map[string]string{}, wantKey: "8|16Gi|false", wantTTL: defaultWorkerTTL},
34-
{name: "unrelated opts -> defaults", opts: map[string]string{"search_path": "iceberg.public"}, wantKey: "8|16Gi|false", wantTTL: defaultWorkerTTL},
34+
{name: "no opts -> default (nil)", opts: map[string]string{}, wantNil: true},
35+
{name: "unrelated opts -> default (nil)", opts: map[string]string{"search_path": "iceberg.public"}, wantNil: true},
3536
{name: "client cpu+mem", opts: map[string]string{gucWorkerCPU: "4", gucWorkerMemory: "32Gi"}, wantKey: "4|32Gi|false", wantTTL: defaultWorkerTTL},
36-
{name: "client ttl", opts: map[string]string{gucWorkerTTL: "5m"}, wantKey: "8|16Gi|false", wantTTL: 5 * time.Minute},
37+
{name: "client ttl only -> concrete w/ default size", opts: map[string]string{gucWorkerTTL: "5m"}, wantKey: "8|16Gi|false", wantTTL: 5 * time.Minute},
3738
{name: "cpu over max -> clamp+warn", opts: map[string]string{gucWorkerCPU: "64"}, wantKey: "16|16Gi|false", wantTTL: defaultWorkerTTL, wantWarn: true},
3839
{name: "mem under min -> clamp+warn", opts: map[string]string{gucWorkerMemory: "1Gi"}, wantKey: "8|4Gi|false", wantTTL: defaultWorkerTTL, wantWarn: true},
3940
{name: "ttl over max -> clamp+warn", opts: map[string]string{gucWorkerTTL: "24h"}, wantKey: "8|16Gi|false", wantTTL: time.Hour, wantWarn: true},
@@ -56,8 +57,14 @@ func TestResolveWorkerProfileSizing(t *testing.T) {
5657
if err != nil {
5758
t.Fatalf("unexpected error: %v", err)
5859
}
60+
if tt.wantNil {
61+
if got != nil {
62+
t.Fatalf("expected nil (default) profile, got %s", got.MatchKey())
63+
}
64+
return
65+
}
5966
if got == nil {
60-
t.Fatal("expected a concrete profile, got nil")
67+
t.Fatalf("expected concrete profile %q, got nil", tt.wantKey)
6168
}
6269
if got.MatchKey() != tt.wantKey {
6370
t.Fatalf("MatchKey = %q, want %q", got.MatchKey(), tt.wantKey)
@@ -75,8 +82,8 @@ func TestResolveWorkerProfileSizing(t *testing.T) {
7582
}
7683
}
7784

78-
// Gate off: every GUC is ignored and the deployment defaults are returned (never
79-
// an error, even for garbage values).
85+
// Gate off: every GUC is ignored and the default (nil) profile is returned (never
86+
// an error, even for garbage values) — preserving warm-pool reuse.
8087
func TestResolveWorkerProfileSizing_GateOff(t *testing.T) {
8188
cp := newSizingTestCP()
8289
cp.cfg.K8s.AllowClientWorkerProfile = false
@@ -87,19 +94,34 @@ func TestResolveWorkerProfileSizing_GateOff(t *testing.T) {
8794
if len(warns) != 0 {
8895
t.Fatalf("gate off must not warn, got %v", warns)
8996
}
90-
if got == nil || got.MatchKey() != "8|16Gi|false" || got.TTL != defaultWorkerTTL {
91-
t.Fatalf("gate off must return defaults, got %v / ttl=%v", got, got.TTL)
97+
if got != nil {
98+
t.Fatalf("gate off must return the default (nil) profile, got %s", got.MatchKey())
99+
}
100+
}
101+
102+
// A no-sizing request must return the nil default sentinel so it matches the
103+
// neutral warm pool (regression guard: returning a concrete-default profile here
104+
// broke worker acquisition on a warm-pool-enabled deploy).
105+
func TestResolveWorkerProfileSizing_NoSizingIsNil(t *testing.T) {
106+
cp := newSizingTestCP()
107+
got, _, err := cp.resolveWorkerProfile(map[string]string{})
108+
if err != nil {
109+
t.Fatal(err)
110+
}
111+
if got != nil {
112+
t.Fatalf("no-sizing request must return nil (default), got %s", got.MatchKey())
92113
}
93114
}
94115

95-
// With no configured default size, the built-in 8/16Gi/20m applies.
116+
// When the client DOES set sizing and no default size is configured, the built-in
117+
// 8/16Gi/20m applies for the omitted fields.
96118
func TestResolveWorkerProfileSizing_BuiltinDefaults(t *testing.T) {
97119
cp := &ControlPlane{cfg: ControlPlaneConfig{K8s: K8sConfig{AllowClientWorkerProfile: true}}}
98-
got, _, err := cp.resolveWorkerProfile(map[string]string{})
120+
got, _, err := cp.resolveWorkerProfile(map[string]string{gucWorkerTTL: "1m"})
99121
if err != nil {
100122
t.Fatal(err)
101123
}
102-
if got.CPU != defaultWorkerCPU || got.Memory != defaultWorkerMemory || got.TTL != defaultWorkerTTL {
103-
t.Fatalf("built-in defaults = %s/%s/%s, want %s/%s/%s", got.CPU, got.Memory, got.TTL, defaultWorkerCPU, defaultWorkerMemory, defaultWorkerTTL)
124+
if got == nil || got.CPU != defaultWorkerCPU || got.Memory != defaultWorkerMemory || got.TTL != time.Minute {
125+
t.Fatalf("built-in defaults = %v, want %s/%s/1m", got, defaultWorkerCPU, defaultWorkerMemory)
104126
}
105127
}

0 commit comments

Comments
 (0)