@@ -392,42 +392,75 @@ var _ = Describe("[sig-network][OCPFeatureGate:NetworkSegmentation][Feature:User
392392 }
393393
394394 By ("asserting UDN pod can reach the kapi service in the default network" )
395- // Use the service name to get test the DNS access
396- Consistently (func () bool {
397- _ , err := e2ekubectl .RunKubectl (
395+ // This is a positive reachability check that verifies the UDN pod can reach KAPI
396+ // service via DNS. It tolerates one isolated curl timeout (observed on Azure) but
397+ // still fails on consecutive failures or sustained connectivity issues that would
398+ // indicate a UDN route/DNS/connectivity regression.
399+ const (
400+ requiredSuccesses = 3
401+ maxTimeouts = 1
402+ kapiProbeWindow = 30 * time .Second
403+ kapiProbeInterval = 2 * time .Second
404+ )
405+
406+ successCount := 0
407+ timeoutCount := 0
408+ consecutiveFailures := 0
409+ deadline := time .Now ().Add (kapiProbeWindow )
410+
411+ for time .Now ().Before (deadline ) && successCount < requiredSuccesses {
412+ stdout , err := e2ekubectl .RunKubectl (
398413 udnPodConfig .namespace ,
399414 "exec" ,
400415 udnPodConfig .name ,
401416 "--" ,
402417 "curl" ,
418+ "--silent" ,
419+ "--show-error" ,
420+ "--fail" ,
403421 "--connect-timeout" ,
404- // FIXME: We have seen in OCP CI that it can take two seconds or maybe more
405- // for a single curl to succeed. Example:
406- // STEP: asserting UDN pod can reach the kapi service in the default network @ 01/20/25 00:38:42.32
407- // I0120 00:38:42.320808 70120 builder.go:121] Running '/usr/bin/kubectl
408- // --server=https://api.ci-op-bkg2qwwq-4edbf.XXXXXXXXXXXXXXXXXXXXXX:6443 --kubeconfig=/tmp/kubeconfig-1734723086
409- // --namespace=e2e-test-network-segmentation-e2e-kzdw7 exec udn-pod -- curl --connect-timeout 2 --insecure https://kubernetes.default/healthz'
410- // I0120 00:38:44.108334 70120 builder.go:146] stderr: " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 2 100 2 0 0 9 0 --:--:-- --:--:-- --:--:-- 9\r100 2 100 2 0 0 9 0 --:--:-- --:--:-- --:--:-- 9\n"
411- // I0120 00:38:44.108415 70120 builder.go:147] stdout: "ok" --> 2 seconds later
412- // I0120 00:38:45.109237 70120 builder.go:121] Running '/usr/bin/kubectl
413- // --server=https://api.ci-op-bkg2qwwq-4edbf.XXXXXXXXXXXXXXXXXXXXXX:6443 --kubeconfig=/tmp/kubeconfig-1734723086
414- // --namespace=e2e-test-network-segmentation-e2e-kzdw7 exec udn-pod -- curl --connect-timeout 2 --insecure https://kubernetes.default/healthz'
415- // I0120 00:38:48.460089 70120 builder.go:135] rc: 28
416- // around the same time we have observed OVS issues like:
417- // Jan 20 00:38:45.329999 ci-op-bkg2qwwq-4edbf-xv8kb-worker-b-flqxd ovs-vswitchd[1094]: ovs|03661|timeval|WARN|context switches: 0 voluntary, 695 involuntary
418- // Jan 20 00:38:45.329967 ci-op-bkg2qwwq-4edbf-xv8kb-worker-b-flqxd ovs-vswitchd[1094]: ovs|03660|timeval|WARN|Unreasonably long 1730ms poll interval (32ms user, 903ms system)
419- // which might need more investigation. Bumping the timeout to 5seconds can help with this
420- // but we need to figure out what exactly is causing random timeouts in CI when trying to reach kapi-server
421- // sometimes we have also seen more than 2seconds being taken for the timeout which also needs to be investigated:
422- // I0118 13:35:50.419638 87083 builder.go:121] Running '/usr/bin/kubectl
423- // --server=https://api.ostest.test.metalkube.org:6443 --kubeconfig=/tmp/secret/kubeconfig
424- // --namespace=e2e-test-network-segmentation-e2e-d4fzk exec udn-pod -- curl --connect-timeout 2 --insecure https://kubernetes.default/healthz'
425- // I0118 13:35:54.093268 87083 builder.go:135] rc: 28 --> takes close to 4seconds?
426422 "5" ,
423+ "--max-time" ,
424+ "10" ,
427425 "--insecure" ,
428- "https://kubernetes.default/healthz" )
429- return err == nil
430- }, 15 * time .Second , 3 * time .Second ).Should (BeTrue ())
426+ "https://kubernetes.default/healthz" ,
427+ )
428+
429+ if err == nil {
430+ Expect (strings .TrimSpace (stdout )).To (Equal ("ok" ),
431+ "unexpected response from kapi healthz" )
432+ successCount ++
433+ consecutiveFailures = 0
434+ framework .Logf ("UDN pod reached kapi healthz: success=%d/%d timeoutCount=%d" ,
435+ successCount , requiredSuccesses , timeoutCount )
436+ if successCount < requiredSuccesses {
437+ time .Sleep (kapiProbeInterval )
438+ }
439+ continue
440+ }
441+
442+ if isCurlExitCode28 (err ) {
443+ timeoutCount ++
444+ consecutiveFailures ++
445+ framework .Logf ("UDN pod kapi healthz curl timeout: timeoutCount=%d err=%v" ,
446+ timeoutCount , err )
447+
448+ Expect (timeoutCount ).To (BeNumerically ("<=" , maxTimeouts ),
449+ "only one transient timeout is allowed" )
450+ Expect (consecutiveFailures ).To (BeNumerically ("<=" , 1 ),
451+ "consecutive KAPI failures indicate sustained connectivity failure" )
452+
453+ time .Sleep (kapiProbeInterval )
454+ continue
455+ }
456+
457+ Expect (err ).NotTo (HaveOccurred (),
458+ "non-timeout error reaching kapi healthz, stdout=%q" , stdout )
459+ }
460+
461+ Expect (successCount ).To (BeNumerically (">=" , requiredSuccesses ),
462+ "UDN pod did not reach kapi healthz enough times within %s; timeoutCount=%d" ,
463+ kapiProbeWindow , timeoutCount )
431464
432465 By ("asserting UDN pod can't reach default services via default network interface" )
433466 // route setup is already done, get kapi IPs
@@ -1285,6 +1318,11 @@ var _ = Describe("[sig-network][OCPFeatureGate:NetworkSegmentation][Feature:User
12851318 })
12861319})
12871320
// isCurlExitCode28 reports whether err indicates that curl exited with code 28,
// which curl uses for "operation timed out".
//
// If err exposes a structured exit status through an ExitStatus() int method,
// that value decides the result. Otherwise the function falls back to looking
// for the "rc: 28" marker in the error text. A nil err always yields false.
//
// NOTE(review): the structured path assumes the error returned by RunKubectl
// carries the exit status of the exec'd command — confirm against the e2e
// kubectl builder.
func isCurlExitCode28(err error) bool {
	const curlTimeoutExitCode = 28

	if err == nil {
		return false
	}

	// Prefer the structured exit status: matching on the formatted message
	// depends on its exact wording and can miss (or falsely match) a timeout.
	if coded, ok := err.(interface{ ExitStatus() int }); ok {
		return coded.ExitStatus() == curlTimeoutExitCode
	}

	// Fallback for plain errors that only carry the textual marker.
	return strings.Contains(err.Error(), "rc: 28")
}
1325+
12881326// randomNetworkMetaName returns a pseudo-random name for network-related objects (NAD, UDN, CUDN).
12891327// CUDN is a cluster-scoped object; when tests run in parallel, random names avoid
12901328// conflicting with other tests.
0 commit comments