Skip to content

Commit 2b11daa

Browse files
jzdingclaude
andcommitted
CI: check clockclass after cloud-event-proxy crash with improved readiness
Add crash recovery test for OC, TGM, and dual-follower configs. Fix PushInitialEvent hang by using context.WithTimeout. Simplify crash recovery test to use verifyClockClassCurrentState with fresh subscriptions. WaitForConsumerReady now polls the publisher service health URL (ptp-event-publisher-service-<node>.openshift-ptp.svc.cluster.local:9043) from within the consumer pod, then waits for the consumer app to log "waiting for events" before proceeding, ensuring both the publisher is reachable and the consumer has finished initialization. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9ecc996 commit 2b11daa

File tree

2 files changed

+110
-47
lines changed

2 files changed

+110
-47
lines changed

test/conformance/serial/ptp.go

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,11 +1329,25 @@ var _ = Describe("["+strings.ToLower(DesiredMode.String())+"-serial]", Serial, f
13291329
if ptphelper.PtpEventEnabled() != 2 {
13301330
Skip("Skipping: test applies to event API v2 only")
13311331
}
1332-
if fullConfig.PtpModeDiscovered != testconfig.BoundaryClock &&
1333-
fullConfig.PtpModeDiscovered != testconfig.DualNICBoundaryClock &&
1334-
fullConfig.PtpModeDiscovered != testconfig.DualNICBoundaryClockHA {
1335-
Skip("Skipping: test applies to boundary clock configurations only")
1332+
1333+
// Determine expected clock class based on PTP mode
1334+
var expectedClockClass fbprotocol.ClockClass
1335+
switch fullConfig.PtpModeDiscovered {
1336+
case testconfig.OrdinaryClock, testconfig.DualFollowerClock:
1337+
// In 4.21+, OC/DualFollower correctly reports its local clock class (255/SlaveOnly).
1338+
// Before 4.21, OC was incorrectly detected as GM and reported upstream GM's class (6).
1339+
expectedClockClass = fbprotocol.ClockClass6
1340+
if ptphelper.IsPTPOperatorVersionAtLeast("4.21") {
1341+
expectedClockClass = fbprotocol.ClockClassSlaveOnly
1342+
}
1343+
case testconfig.BoundaryClock, testconfig.DualNICBoundaryClock, testconfig.DualNICBoundaryClockHA,
1344+
testconfig.TelcoGrandMasterClock:
1345+
expectedClockClass = fbprotocol.ClockClass6
1346+
default:
1347+
Skip(fmt.Sprintf("Skipping: test does not apply to %s mode", fullConfig.PtpModeDiscovered))
13361348
}
1349+
expectedClockClassStr := strconv.Itoa(int(expectedClockClass))
1350+
logrus.Infof("PtpModeDiscovered: %s, expected clockClass: %s", fullConfig.PtpModeDiscovered, expectedClockClassStr)
13371351

13381352
By("Deploying consumer app for event API v2")
13391353
nodeName := fullConfig.DiscoveredClockUnderTestPod.Spec.NodeName
@@ -1348,28 +1362,29 @@ var _ = Describe("["+strings.ToLower(DesiredMode.String())+"-serial]", Serial, f
13481362
event.PubSub.Close()
13491363
}
13501364
})
1351-
time.Sleep(10 * time.Second)
1365+
if waitErr := event.WaitForConsumerReady(nodeName); waitErr != nil {
1366+
Skip(fmt.Sprintf("Consumer app not ready: %v", waitErr))
1367+
}
1368+
event.InitPubSub()
13521369

1353-
By("Verifying initial clockClass is 6 via metrics")
1354-
checkClockClassState(fullConfig, strconv.Itoa(int(fbprotocol.ClockClass6)))
1370+
By(fmt.Sprintf("Verifying initial clockClass is %s via metrics", expectedClockClassStr))
1371+
checkClockClassState(fullConfig, expectedClockClassStr)
13551372

1356-
By("Verifying initial clockClass is 6 via PMC")
1373+
// PMC gm.ClockClass always reports the upstream GM's class (6),
1374+
// regardless of whether this node is BC or OC.
1375+
By("Verifying initial gm.ClockClass is 6 via PMC")
13571376
checkClockClassViaPMC(fullConfig, strconv.Itoa(int(fbprotocol.ClockClass6)))
13581377

1359-
By("Setting up event monitoring and verifying initial clockClass is 6 via Event API")
1360-
event.InitPubSub()
1361-
term, monErr := event.MonitorPodLogsRegex()
1362-
Expect(monErr).ToNot(HaveOccurred(), "could not start listening to events")
1363-
DeferCleanup(func() { stopMonitor(term) })
1364-
verifyClockClassViaEventAPI(int(fbprotocol.ClockClass6), 60*time.Second)
1378+
By(fmt.Sprintf("Verifying initial clockClass is %s via Event API", expectedClockClassStr))
1379+
verifyClockClassCurrentState(int(expectedClockClass), 60*time.Second)
13651380

13661381
By("Killing cloud-event-proxy process in sidecar container")
13671382
_, _, killErr := pods.ExecCommand(
13681383
client.Client,
13691384
true,
13701385
fullConfig.DiscoveredClockUnderTestPod,
13711386
pkg.EventProxyContainerName,
1372-
[]string{"sh", "-c", "kill -9 $(pgrep -f ^./cloud-event-proxy) || true"},
1387+
[]string{"sh", "-c", "pkill -9 -f ^./cloud-event-proxy || true"},
13731388
)
13741389
Expect(killErr).To(BeNil(), "failed to kill cloud-event-proxy process")
13751390

@@ -1399,14 +1414,14 @@ var _ = Describe("["+strings.ToLower(DesiredMode.String())+"-serial]", Serial, f
13991414
}, 2*time.Minute, 2*time.Second).Should(ContainSubstring("OK"),
14001415
"cloud-event-proxy health endpoint did not recover after restart")
14011416

1402-
By("Verifying clockClass remains 6 via metrics after cloud-event-proxy restart")
1403-
checkClockClassState(fullConfig, strconv.Itoa(int(fbprotocol.ClockClass6)))
1417+
By(fmt.Sprintf("Verifying clockClass remains %s via metrics after cloud-event-proxy restart", expectedClockClassStr))
1418+
checkClockClassState(fullConfig, expectedClockClassStr)
14041419

1405-
By("Verifying clockClass remains 6 via PMC after cloud-event-proxy restart")
1420+
By("Verifying gm.ClockClass remains 6 via PMC after cloud-event-proxy restart")
14061421
checkClockClassViaPMC(fullConfig, strconv.Itoa(int(fbprotocol.ClockClass6)))
14071422

1408-
By("Verifying clockClass is 6 via Event API after cloud-event-proxy restart")
1409-
verifyClockClassViaEventAPI(int(fbprotocol.ClockClass6), 90*time.Second)
1423+
By(fmt.Sprintf("Verifying clockClass is %s via Event API after cloud-event-proxy restart", expectedClockClassStr))
1424+
verifyClockClassCurrentState(int(expectedClockClass), 90*time.Second)
14101425
})
14111426

14121427
Context("Event API version validation", func() {
@@ -3692,7 +3707,10 @@ func setupBCClockClassEvents(nodeName string) bcEventContext {
36923707
logrus.Warnf("PTP events not available: %s; skipping event checks", createErr)
36933708
return ctx
36943709
}
3695-
time.Sleep(10 * time.Second)
3710+
if waitErr := event.WaitForConsumerReady(nodeName); waitErr != nil {
3711+
logrus.Warnf("Consumer app not ready: %s; skipping event checks", waitErr)
3712+
return ctx
3713+
}
36963714
event.InitPubSub()
36973715
var eventCleanup func()
36983716
ctx.subs, eventCleanup = event.SubscribeToGMChangeEvents(100, true, 60*time.Second)
@@ -3725,8 +3743,12 @@ func verifyClockClassViaEvent(evCtx bcEventContext, expectedClass int) {
37253743
verifyMetric(events[ptpEvent.PtpClockClassChange], float64(expectedClass))
37263744
}
37273745

3728-
// verifyClockClassViaEventAPI verifies clock class via Event Fast Notification API getCurrentState
3729-
func verifyClockClassViaEventAPI(expectedClockClass int, timeout time.Duration) {
3746+
// verifyClockClassCurrentState creates a fresh event subscription with pushInitial=true
3747+
// to query the current clock class state from the Event API's getCurrentState endpoint.
3748+
// Unlike verifyClockClassViaEvent (which drains an existing long-lived subscription),
3749+
// this function is safe to call after disruptions such as a cloud-event-proxy restart,
3750+
// where the previous subscription may be stale or disconnected.
3751+
func verifyClockClassCurrentState(expectedClockClass int, timeout time.Duration) {
37303752
const incomingEventsBuffer = 100
37313753
subs, cleanup := event.SubscribeToGMChangeEvents(incomingEventsBuffer, true, timeout)
37323754
defer cleanup()

test/pkg/event/event.go

Lines changed: 65 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ const (
3838
ConsumerSaName = "consumer-sa"
3939
ConsumerServiceName = "consumer-events-subscription-service"
4040
TestSidecarSuccessLogString = "rest service returned healthy status"
41+
ConsumerReadyLogString = "waiting for events"
4142
ConsumerContainerName = "consumer"
4243
sidecarNamespaceDeleteTimeout = time.Minute * 2
4344
ApiBaseV1 = "127.0.0.1:9085/api/ocloudNotifications/v1/"
@@ -322,6 +323,50 @@ func CreateConsumerApp(nodeNameFull string) (err error) {
322323
return nil
323324
}
324325

326+
// WaitForConsumerReady polls the publisher's health endpoint from within the
327+
// consumer pod until it responds OK, then waits for the consumer app to log
328+
// "waiting for events" confirming it has finished initialization.
329+
func WaitForConsumerReady(nodeNameFull string) error {
330+
nodeName := nodeNameFull
331+
if nodeNameFull != "" && strings.Contains(nodeNameFull, ".") {
332+
nodeName = strings.Split(nodeName, ".")[0]
333+
}
334+
publisherHealthURL := fmt.Sprintf("http://ptp-event-publisher-service-%s.%s.svc.cluster.local:9043/api/ocloudNotifications/v2/health", nodeName, ProducerNamespace)
335+
logrus.Infof("Waiting for publisher to become ready at %s", publisherHealthURL)
336+
337+
consumerPod, err := testclient.Client.Pods(ConsumerNamespace).Get(context.TODO(), ConsumerPodName, metav1.GetOptions{})
338+
if err != nil {
339+
return fmt.Errorf("could not get consumer pod: %s", err)
340+
}
341+
342+
deadline := time.Now().Add(1 * time.Minute)
343+
for time.Now().Before(deadline) {
344+
buf, _, execErr := pods.ExecCommand(
345+
testclient.Client,
346+
false,
347+
consumerPod,
348+
ConsumerContainerName,
349+
[]string{"curl", "-s", publisherHealthURL},
350+
)
351+
if execErr == nil && strings.Contains(buf.String(), "OK") {
352+
logrus.Info("Publisher health endpoint is ready")
353+
break
354+
}
355+
time.Sleep(2 * time.Second)
356+
}
357+
if time.Now().After(deadline) {
358+
return fmt.Errorf("publisher health endpoint %s did not return OK within 1 minute", publisherHealthURL)
359+
}
360+
361+
logrus.Info("Waiting for consumer app to finish initialization")
362+
_, err = pods.GetPodLogsRegex(ConsumerNamespace, ConsumerPodName, ConsumerContainerName, ConsumerReadyLogString, true, 1*time.Minute)
363+
if err != nil {
364+
return fmt.Errorf("consumer app did not become ready: %s", err)
365+
}
366+
logrus.Info("Consumer app is ready")
367+
return nil
368+
}
369+
325370
const roleName = "use-privileged"
326371
const roleBindingName = "sidecar-rolebinding"
327372

@@ -523,39 +568,35 @@ func PushInitialEvent(eventType string, timeout time.Duration) (err error) {
523568
TailLines: &count,
524569
}
525570

571+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
572+
defer cancel()
573+
526574
podLogRequest := testclient.Client.CoreV1().Pods(namespace).GetLogs(podName, &podLogOptions)
527-
stream, err := podLogRequest.Stream(context.TODO())
575+
stream, err := podLogRequest.Stream(ctx)
528576
if err != nil {
529577
return fmt.Errorf("could not retrieve log in ns=%s pod=%s, err=%s", namespace, podName, err)
530578
}
531579
defer stream.Close()
532-
start := time.Now()
533-
for {
534-
scanner := bufio.NewScanner(stream)
535-
for scanner.Scan() {
536-
t := time.Now()
537-
elapsed := t.Sub(start)
538-
if elapsed > timeout {
539-
return fmt.Errorf("timedout PushInitialValue, waiting for log in ns=%s pod=%s, looking for = %s", namespace, podName, regex)
580+
581+
r := regexp.MustCompile(regex)
582+
scanner := bufio.NewScanner(stream)
583+
for scanner.Scan() {
584+
line := scanner.Text()
585+
logrus.Trace(line)
586+
587+
matches := r.FindAllStringSubmatch(line, -1)
588+
if len(matches) > 0 {
589+
aStoredEvent, eType, err := createStoredEvent([]byte(matches[0][1]))
590+
if err != nil {
591+
return err
540592
}
541-
line := scanner.Text()
542-
logrus.Trace(line)
543-
544-
r := regexp.MustCompile(regex)
545-
matches := r.FindAllStringSubmatch(line, -1)
546-
if len(matches) > 0 {
547-
aStoredEvent, eType, err := createStoredEvent([]byte(matches[0][1]))
548-
if err != nil {
549-
return err
550-
}
551-
if eType == eventType {
552-
PubSub.Publish(eType, aStoredEvent)
553-
return nil
554-
}
593+
if eType == eventType {
594+
PubSub.Publish(eType, aStoredEvent)
595+
return nil
555596
}
556597
}
557-
558598
}
599+
return fmt.Errorf("timed out PushInitialEvent, waiting for log in ns=%s pod=%s, looking for %s", namespace, podName, regex)
559600
}
560601
func createStoredEvent(data []byte) (aStoredEvent exports.StoredEvent, aType string, err error) {
561602
apiVersion := ptphelper.PtpEventEnabled()

0 commit comments

Comments
 (0)