Skip to content

Commit 3c6601e

Browse files
r2k1 and Copilot authored
fix(e2e): harden kube exec against apiserver SPDY hangs (#8627)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent d6e0e58 commit 3c6601e

3 files changed

Lines changed: 38 additions & 6 deletions

File tree

e2e/go.mod

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@ require (
9191
github.com/tidwall/match v1.1.1 // indirect
9292
github.com/tidwall/pretty v1.2.1 // indirect
9393
github.com/x448/float16 v0.8.4 // indirect
94-
golang.org/x/net v0.55.0 // indirect
94+
golang.org/x/net v0.55.0
9595
golang.org/x/oauth2 v0.30.0 // indirect
9696
golang.org/x/sys v0.45.0 // indirect
9797
golang.org/x/term v0.43.0 // indirect

e2e/kube.go

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@ import (
55
"encoding/base64"
66
"encoding/json"
77
"fmt"
8+
"net"
9+
"net/http"
810
"strings"
911
"testing"
1012
"time"
@@ -13,6 +15,7 @@ import (
1315
"github.com/Azure/agentbaker/e2e/toolkit"
1416
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
1517
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
18+
"golang.org/x/net/http2"
1619
appsv1 "k8s.io/api/apps/v1"
1720
corev1 "k8s.io/api/core/v1"
1821
v1 "k8s.io/api/core/v1"
@@ -58,6 +61,25 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana
5861
config.QPS = 200
5962
config.Burst = 400
6063

64+
// Defense-in-depth against silent connection wedges (apiserver SPDY proxy
65+
// hangs, NAT/LB idle timeouts) which manifest as kube exec calls that hang
66+
// indefinitely. Bound the TCP dial and enable HTTP/2 keep-alive pings so
67+
// the transport itself surfaces a dead peer as a connection error,
68+
// triggering retries instead of consuming the caller's timeout budget.
69+
config.Dial = (&net.Dialer{
70+
Timeout: 10 * time.Second,
71+
KeepAlive: 30 * time.Second,
72+
}).DialContext
73+
config.WrapTransport = func(rt http.RoundTripper) http.RoundTripper {
74+
if t, ok := rt.(*http.Transport); ok {
75+
if h2, err := http2.ConfigureTransports(t); err == nil {
76+
h2.ReadIdleTimeout = 30 * time.Second
77+
h2.PingTimeout = 15 * time.Second
78+
}
79+
}
80+
return rt
81+
}
82+
6183
dynamic, err := client.New(config, client.Options{})
6284
if err != nil {
6385
return nil, fmt.Errorf("create dynamic Kubeclient: %w", err)

e2e/validation.go

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ package e2e
33
import (
44
"context"
55
"encoding/json"
6+
"errors"
67
"fmt"
78
"strings"
89
"time"
@@ -300,11 +301,11 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {
300301

301302
checks := []wireServerCheck{
302303
{
303-
cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4",
304+
cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4 --max-time 8",
304305
desc: "wireserver port 80 goalstate",
305306
},
306307
{
307-
cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4",
308+
cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4 --max-time 8",
308309
desc: "wireserver port 32526 vmSettings",
309310
},
310311
}
@@ -313,10 +314,19 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {
313314

314315
for _, check := range checks {
315316
var execResult *podExecResult
316-
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
317-
r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
317+
// Per-attempt cap (15s) prevents a single SPDY/exec hang from consuming the entire
318+
// poll budget. Derived from the poll's inner ctx so it honors both the per-attempt
319+
// cap and the overall poll deadline, whichever fires first.
320+
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) {
321+
attemptCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
322+
defer cancel()
323+
r, execErr := execOnUnprivilegedPod(attemptCtx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
318324
if execErr != nil {
319-
s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
325+
if errors.Is(execErr, context.DeadlineExceeded) {
326+
s.T.Logf("wireserver check %q: exec attempt timed out after 15s (retrying): %v", check.desc, execErr)
327+
} else {
328+
s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
329+
}
320330
return false, nil
321331
}
322332
execResult = r

0 commit comments

Comments
 (0)