@@ -64,6 +64,9 @@ const (
var (
	// clockClassPattern matches a single openshift_ptp_clock_class metric line.
	// The config="ptp4l.N.config" label is optional; the three capture groups
	// are the node label, the process label and the integer sample value.
	clockClassPattern = `^openshift_ptp_clock_class\{(?:config="ptp4l\.\d+\.config",)?node="([^"]+)",process="([^"]+)"\}\s+(\d+)`
	// clockClassRe is clockClassPattern compiled once at package scope.
	clockClassRe = regexp.MustCompile(clockClassPattern)

	// pmcClockClassRe captures the decimal value that follows "gm.ClockClass"
	// in the text printed by "pmc GET PARENT_DATA_SET".
	pmcClockClassRe = regexp.MustCompile(`gm\.ClockClass\s+(\d+)`)
)
6871var DesiredMode = testconfig .GetDesiredConfig (true ).PtpModeDesired
6972
@@ -1149,6 +1152,90 @@ var _ = Describe("["+strings.ToLower(DesiredMode.String())+"-serial]", Serial, f
11491152 "Threshold metrics are not detected" )
11501153 })
11511154
1155+ It ("Should recover clockClass via event API after cloud-event-proxy crash" , func () {
1156+ if ptphelper .PtpEventEnabled () != 2 {
1157+ Skip ("Skipping: test applies to event API v2 only" )
1158+ }
1159+ if fullConfig .PtpModeDiscovered != testconfig .BoundaryClock &&
1160+ fullConfig .PtpModeDiscovered != testconfig .DualNICBoundaryClock &&
1161+ fullConfig .PtpModeDiscovered != testconfig .DualNICBoundaryClockHA {
1162+ Skip ("Skipping: test applies to boundary clock configurations only" )
1163+ }
1164+
1165+ By ("Deploying consumer app for event API v2" )
1166+ nodeName := fullConfig .DiscoveredClockUnderTestPod .Spec .NodeName
1167+ Expect (nodeName ).ToNot (BeEmpty (), "clock-under-test pod node is empty" )
1168+ err := event .CreateConsumerApp (nodeName )
1169+ if err != nil {
1170+ Skip (fmt .Sprintf ("Consumer app setup failed: %v" , err ))
1171+ }
1172+ DeferCleanup (func () {
1173+ _ = event .DeleteConsumerNamespace ()
1174+ if event .PubSub != nil {
1175+ event .PubSub .Close ()
1176+ }
1177+ })
1178+ time .Sleep (10 * time .Second )
1179+
1180+ By ("Verifying initial clockClass is 6 via metrics" )
1181+ checkClockClassState (fullConfig , strconv .Itoa (int (fbprotocol .ClockClass6 )))
1182+
1183+ By ("Verifying initial clockClass is 6 via PMC" )
1184+ checkClockClassViaPMC (fullConfig , strconv .Itoa (int (fbprotocol .ClockClass6 )))
1185+
1186+ By ("Setting up event monitoring and verifying initial clockClass is 6 via Event API" )
1187+ event .InitPubSub ()
1188+ term , monErr := event .MonitorPodLogsRegex ()
1189+ Expect (monErr ).ToNot (HaveOccurred (), "could not start listening to events" )
1190+ DeferCleanup (func () { stopMonitor (term ) })
1191+ verifyClockClassViaEventAPI (int (fbprotocol .ClockClass6 ), 60 * time .Second )
1192+
1193+ By ("Killing cloud-event-proxy process in sidecar container" )
1194+ _ , _ , killErr := pods .ExecCommand (
1195+ client .Client ,
1196+ true ,
1197+ fullConfig .DiscoveredClockUnderTestPod ,
1198+ pkg .EventProxyContainerName ,
1199+ []string {"sh" , "-c" , "kill -9 $(pgrep -f ^./cloud-event-proxy) || true" },
1200+ )
1201+ Expect (killErr ).To (BeNil (), "failed to kill cloud-event-proxy process" )
1202+
1203+ By ("Waiting for cloud-event-proxy process to restart" )
1204+ Eventually (func () bool {
1205+ buf , _ , _ := pods .ExecCommand (
1206+ client .Client ,
1207+ true ,
1208+ fullConfig .DiscoveredClockUnderTestPod ,
1209+ pkg .EventProxyContainerName ,
1210+ []string {"sh" , "-c" , "pgrep -f ^./cloud-event-proxy" },
1211+ )
1212+ return strings .TrimSpace (buf .String ()) != ""
1213+ }, 3 * time .Minute , 1 * time .Second ).Should (BeTrue (),
1214+ "cloud-event-proxy process did not restart within 3 minutes" )
1215+
1216+ By ("Waiting for cloud-event-proxy health endpoint to recover" )
1217+ Eventually (func () string {
1218+ buf , _ , _ := pods .ExecCommand (
1219+ client .Client ,
1220+ false ,
1221+ fullConfig .DiscoveredClockUnderTestPod ,
1222+ pkg .EventProxyContainerName ,
1223+ []string {"curl" , path .Join (event .ApiBaseV2 , "health" )},
1224+ )
1225+ return buf .String ()
1226+ }, 2 * time .Minute , 2 * time .Second ).Should (ContainSubstring ("OK" ),
1227+ "cloud-event-proxy health endpoint did not recover after restart" )
1228+
1229+ By ("Verifying clockClass remains 6 via metrics after cloud-event-proxy restart" )
1230+ checkClockClassState (fullConfig , strconv .Itoa (int (fbprotocol .ClockClass6 )))
1231+
1232+ By ("Verifying clockClass remains 6 via PMC after cloud-event-proxy restart" )
1233+ checkClockClassViaPMC (fullConfig , strconv .Itoa (int (fbprotocol .ClockClass6 )))
1234+
1235+ By ("Verifying clockClass is 6 via Event API after cloud-event-proxy restart" )
1236+ verifyClockClassViaEventAPI (int (fbprotocol .ClockClass6 ), 90 * time .Second )
1237+ })
1238+
11521239 Context ("Event API version validation" , func () {
11531240 BeforeEach (func () {
11541241 if ! ptphelper .IsPTPOperatorVersionAtLeast ("4.19" ) {
@@ -2811,6 +2898,30 @@ func checkClockClassState(fullConfig testconfig.TestConfig, expectedState string
28112898 fmt .Sprintf ("Expected ptp4l clock class to eventually be %s for GM" , expectedState ))
28122899}
28132900
2901+ // checkClockClassViaPMC verifies clock class by running "pmc GET PARENT_DATA_SET"
2902+ // and parsing the gm.ClockClass field from the output.
2903+ func checkClockClassViaPMC (fullConfig testconfig.TestConfig , expectedClockClass string ) {
2904+ By (fmt .Sprintf ("Verifying gm.ClockClass is %s via PMC PARENT_DATA_SET" , expectedClockClass ))
2905+ Eventually (func () bool {
2906+ buf , _ , err := pods .ExecCommand (client .Client , true ,
2907+ fullConfig .DiscoveredClockUnderTestPod , pkg .PtpContainerName ,
2908+ []string {"pmc" , "-b" , "0" , "-u" , "-f" , "/var/run/ptp4l.0.config" , "GET PARENT_DATA_SET" })
2909+ if err != nil {
2910+ fmt .Fprintf (GinkgoWriter , "PMC exec error: %v\n " , err )
2911+ return false
2912+ }
2913+ output := buf .String ()
2914+ matches := pmcClockClassRe .FindStringSubmatch (output )
2915+ if len (matches ) < 2 {
2916+ fmt .Fprintf (GinkgoWriter , "PMC: gm.ClockClass not found in output: %s\n " , output )
2917+ return false
2918+ }
2919+ fmt .Fprintf (GinkgoWriter , "PMC: gm.ClockClass=%s (expected %s)\n " , matches [1 ], expectedClockClass )
2920+ return strings .TrimSpace (matches [1 ]) == expectedClockClass
2921+ }, pkg .TimeoutIn3Minutes , 2 * time .Second ).Should (BeTrue (),
2922+ fmt .Sprintf ("Expected gm.ClockClass %s via PMC but did not get it" , expectedClockClass ))
2923+ }
2924+
28142925func checkDPLLFrequencyState (fullConfig testconfig.TestConfig , state string ) {
28152926 /*
28162927 # TODO: Revisit this for 2 card as each card will have its own dpll process
@@ -3301,3 +3412,28 @@ func waitForStateAndCC(subs event.Subscriptions, state ptpEvent.SyncState, cc in
33013412 }
33023413 }
33033414}
3415+
3416+ // verifyClockClassViaEventAPI verifies clock class via Event Fast Notification API getCurrentState
3417+ func verifyClockClassViaEventAPI (expectedClockClass int , timeout time.Duration ) {
3418+ const incomingEventsBuffer = 100
3419+ subs , cleanup := event .SubscribeToGMChangeEvents (incomingEventsBuffer , true , timeout )
3420+ defer cleanup ()
3421+
3422+ timer := time .NewTimer (timeout )
3423+ defer timer .Stop ()
3424+
3425+ for {
3426+ select {
3427+ case <- timer .C :
3428+ Fail (fmt .Sprintf ("Timed out waiting for clockClass %d via Event API" , expectedClockClass ))
3429+ return
3430+ case ev := <- subs .CLOCKCLASS :
3431+ if res , ok := processEvent (ptpEvent .PtpClockClassChange , ev ); ok {
3432+ if v , ok2 := res .Values ["metric" ].(float64 ); ok2 && int (v ) == expectedClockClass {
3433+ fmt .Fprintf (GinkgoWriter , "ClockClass %d verified via Event API\n " , expectedClockClass )
3434+ return
3435+ }
3436+ }
3437+ }
3438+ }
3439+ }
0 commit comments