Skip to content

Commit 68e6e0f

Browse files
committed
harden the UDN isolation tests to account for network blips
UDN tests are known to be heavy and these tests will fail on the first dropped connection which seems to happen randomly, albeit rarely. test: allow single timeout in UDN KAPI reachability check Hardens e2e test against transient networking blips while preserving strict failure detection for real regressions. Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com>
1 parent 0f44179 commit 68e6e0f

1 file changed

Lines changed: 66 additions & 28 deletions

File tree

test/extended/networking/network_segmentation.go

Lines changed: 66 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -392,42 +392,75 @@ var _ = Describe("[sig-network][OCPFeatureGate:NetworkSegmentation][Feature:User
392392
}
393393

394394
By("asserting UDN pod can reach the kapi service in the default network")
395-
// Use the service name to get test the DNS access
396-
Consistently(func() bool {
397-
_, err := e2ekubectl.RunKubectl(
395+
// This is a positive reachability check that verifies the UDN pod can reach KAPI
396+
// service via DNS. It tolerates one isolated curl timeout (observed on Azure) but
397+
// still fails on consecutive failures or sustained connectivity issues that would
398+
// indicate a UDN route/DNS/connectivity regression.
399+
const (
400+
requiredSuccesses = 3
401+
maxTimeouts = 1
402+
kapiProbeWindow = 30 * time.Second
403+
kapiProbeInterval = 2 * time.Second
404+
)
405+
406+
successCount := 0
407+
timeoutCount := 0
408+
consecutiveFailures := 0
409+
deadline := time.Now().Add(kapiProbeWindow)
410+
411+
for time.Now().Before(deadline) && successCount < requiredSuccesses {
412+
stdout, err := e2ekubectl.RunKubectl(
398413
udnPodConfig.namespace,
399414
"exec",
400415
udnPodConfig.name,
401416
"--",
402417
"curl",
418+
"--silent",
419+
"--show-error",
420+
"--fail",
403421
"--connect-timeout",
404-
// FIXME: We have seen in OCP CI that it can take two seconds or maybe more
405-
// for a single curl to succeed. Example:
406-
// STEP: asserting UDN pod can reach the kapi service in the default network @ 01/20/25 00:38:42.32
407-
// I0120 00:38:42.320808 70120 builder.go:121] Running '/usr/bin/kubectl
408-
// --server=https://api.ci-op-bkg2qwwq-4edbf.XXXXXXXXXXXXXXXXXXXXXX:6443 --kubeconfig=/tmp/kubeconfig-1734723086
409-
// --namespace=e2e-test-network-segmentation-e2e-kzdw7 exec udn-pod -- curl --connect-timeout 2 --insecure https://kubernetes.default/healthz'
410-
// I0120 00:38:44.108334 70120 builder.go:146] stderr: " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 2 100 2 0 0 9 0 --:--:-- --:--:-- --:--:-- 9\r100 2 100 2 0 0 9 0 --:--:-- --:--:-- --:--:-- 9\n"
411-
// I0120 00:38:44.108415 70120 builder.go:147] stdout: "ok" --> 2 seconds later
412-
// I0120 00:38:45.109237 70120 builder.go:121] Running '/usr/bin/kubectl
413-
// --server=https://api.ci-op-bkg2qwwq-4edbf.XXXXXXXXXXXXXXXXXXXXXX:6443 --kubeconfig=/tmp/kubeconfig-1734723086
414-
// --namespace=e2e-test-network-segmentation-e2e-kzdw7 exec udn-pod -- curl --connect-timeout 2 --insecure https://kubernetes.default/healthz'
415-
// I0120 00:38:48.460089 70120 builder.go:135] rc: 28
416-
// around the same time we have observed OVS issues like:
417-
// Jan 20 00:38:45.329999 ci-op-bkg2qwwq-4edbf-xv8kb-worker-b-flqxd ovs-vswitchd[1094]: ovs|03661|timeval|WARN|context switches: 0 voluntary, 695 involuntary
418-
// Jan 20 00:38:45.329967 ci-op-bkg2qwwq-4edbf-xv8kb-worker-b-flqxd ovs-vswitchd[1094]: ovs|03660|timeval|WARN|Unreasonably long 1730ms poll interval (32ms user, 903ms system)
419-
// which might need more investigation. Bumping the timeout to 5seconds can help with this
420-
// but we need to figure out what exactly is causing random timeouts in CI when trying to reach kapi-server
421-
// sometimes we have also seen more than 2seconds being taken for the timeout which also needs to be investigated:
422-
// I0118 13:35:50.419638 87083 builder.go:121] Running '/usr/bin/kubectl
423-
// --server=https://api.ostest.test.metalkube.org:6443 --kubeconfig=/tmp/secret/kubeconfig
424-
// --namespace=e2e-test-network-segmentation-e2e-d4fzk exec udn-pod -- curl --connect-timeout 2 --insecure https://kubernetes.default/healthz'
425-
// I0118 13:35:54.093268 87083 builder.go:135] rc: 28 --> takes close to 4seconds?
426422
"5",
423+
"--max-time",
424+
"10",
427425
"--insecure",
428-
"https://kubernetes.default/healthz")
429-
return err == nil
430-
}, 15*time.Second, 3*time.Second).Should(BeTrue())
426+
"https://kubernetes.default/healthz",
427+
)
428+
429+
if err == nil {
430+
Expect(strings.TrimSpace(stdout)).To(Equal("ok"),
431+
"unexpected response from kapi healthz")
432+
successCount++
433+
consecutiveFailures = 0
434+
framework.Logf("UDN pod reached kapi healthz: success=%d/%d timeoutCount=%d",
435+
successCount, requiredSuccesses, timeoutCount)
436+
if successCount < requiredSuccesses {
437+
time.Sleep(kapiProbeInterval)
438+
}
439+
continue
440+
}
441+
442+
if isCurlExitCode28(err) {
443+
timeoutCount++
444+
consecutiveFailures++
445+
framework.Logf("UDN pod kapi healthz curl timeout: timeoutCount=%d err=%v",
446+
timeoutCount, err)
447+
448+
Expect(timeoutCount).To(BeNumerically("<=", maxTimeouts),
449+
"only one transient timeout is allowed")
450+
Expect(consecutiveFailures).To(BeNumerically("<=", 1),
451+
"consecutive KAPI failures indicate sustained connectivity failure")
452+
453+
time.Sleep(kapiProbeInterval)
454+
continue
455+
}
456+
457+
Expect(err).NotTo(HaveOccurred(),
458+
"non-timeout error reaching kapi healthz, stdout=%q", stdout)
459+
}
460+
461+
Expect(successCount).To(BeNumerically(">=", requiredSuccesses),
462+
"UDN pod did not reach kapi healthz enough times within %s; timeoutCount=%d",
463+
kapiProbeWindow, timeoutCount)
431464

432465
By("asserting UDN pod can't reach default services via default network interface")
433466
// route setup is already done, get kapi IPs
@@ -1285,6 +1318,11 @@ var _ = Describe("[sig-network][OCPFeatureGate:NetworkSegmentation][Feature:User
12851318
})
12861319
})
12871320

1321+
// isCurlExitCode28 checks if the error from RunKubectl indicates a curl timeout (exit code 28).
1322+
func isCurlExitCode28(err error) bool {
1323+
return err != nil && strings.Contains(err.Error(), "rc: 28")
1324+
}
1325+
12881326
// randomNetworkMetaName return pseudo random name for network related objects (NAD,UDN,CUDN).
12891327
// CUDN is cluster-scoped object, in case tests running in parallel, having random names avoids
12901328
// conflicting with other tests.

0 commit comments

Comments
 (0)