@@ -1394,9 +1394,9 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
13941394 healthEvents []* protos.HealthEvent
13951395 expectNodeConditions bool
13961396 expectKubernetesEvents bool
1397- expectedConditionCount int
1398- expectedEventCount int
13991397 description string
1398+ expectedConditionType string
1399+ expectedEventType string
14001400 }{
14011401 {
14021402 name : "STORE_ONLY event should not create node condition" ,
@@ -1417,8 +1417,6 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14171417 },
14181418 expectNodeConditions : false ,
14191419 expectKubernetesEvents : false ,
1420- expectedConditionCount : 0 ,
1421- expectedEventCount : 0 ,
14221420 description : "STORE_ONLY fatal event should not create node condition" ,
14231421 },
14241422 {
@@ -1440,8 +1438,6 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14401438 },
14411439 expectNodeConditions : false ,
14421440 expectKubernetesEvents : false ,
1443- expectedConditionCount : 0 ,
1444- expectedEventCount : 0 ,
14451441 description : "STORE_ONLY non-fatal event should not create Kubernetes event" ,
14461442 },
14471443 {
@@ -1463,9 +1459,9 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14631459 },
14641460 expectNodeConditions : true ,
14651461 expectKubernetesEvents : false ,
1466- expectedConditionCount : 1 ,
1467- expectedEventCount : 0 ,
14681462 description : "EXECUTE_REMEDIATION fatal event should create node condition" ,
1463+ expectedConditionType : "GpuXidError" ,
1464+ expectedEventType : "" ,
14691465 },
14701466 {
14711467 name : "Mixed strategies - only EXECUTE_REMEDIATION should be processed" ,
@@ -1499,9 +1495,55 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14991495 },
15001496 expectNodeConditions : true ,
15011497 expectKubernetesEvents : false ,
1502- expectedConditionCount : 1 , // Only GpuThermalWatch should create condition
1503- expectedEventCount : 0 ,
15041498 description : "Only EXECUTE_REMEDIATION events should create conditions" ,
1499+ expectedConditionType : "GpuThermalWatch" ,
1500+ expectedEventType : "" ,
1501+ },
1502+ {
1503+ name : "STORE_ONLY non fatal event should not create Kubernetes event" ,
1504+ healthEvents : []* protos.HealthEvent {
1505+ {
1506+ CheckName : "GpuPowerWatch" ,
1507+ IsHealthy : false ,
1508+ EntitiesImpacted : []* protos.Entity {{EntityType : "GPU" , EntityValue : "0" }},
1509+ ErrorCode : []string {"" },
1510+ IsFatal : false ,
1511+ ProcessingStrategy : protos .ProcessingStrategy_STORE_ONLY ,
1512+ GeneratedTimestamp : timestamppb .New (time .Now ()),
1513+ ComponentClass : "GPU" ,
1514+ RecommendedAction : protos .RecommendedAction_CONTACT_SUPPORT ,
1515+ Message : "Power warning on GPU 0" ,
1516+ NodeName : "store-only-test-node" ,
1517+ },
1518+ },
1519+ expectNodeConditions : false ,
1520+ expectKubernetesEvents : false ,
1521+ description : "STORE_ONLY non fatal event should not create Kubernetes event" ,
1522+ expectedConditionType : "" ,
1523+ expectedEventType : "" ,
1524+ },
1525+ {
1526+ name : "EXECUTE_REMEDIATION non fatal event should create Kubernetes event" ,
1527+ healthEvents : []* protos.HealthEvent {
1528+ {
1529+ CheckName : "GpuPowerWatch" ,
1530+ IsHealthy : false ,
1531+ EntitiesImpacted : []* protos.Entity {{EntityType : "GPU" , EntityValue : "0" }},
1532+ ErrorCode : []string {"" },
1533+ IsFatal : false ,
1534+ ProcessingStrategy : protos .ProcessingStrategy_EXECUTE_REMEDIATION ,
1535+ GeneratedTimestamp : timestamppb .New (time .Now ()),
1536+ ComponentClass : "GPU" ,
1537+ RecommendedAction : protos .RecommendedAction_CONTACT_SUPPORT ,
1538+ Message : "Power warning on GPU 0" ,
1539+ NodeName : "store-only-test-node" ,
1540+ },
1541+ },
1542+ expectNodeConditions : false ,
1543+ expectKubernetesEvents : true ,
1544+ description : "EXECUTE_REMEDIATION non fatal event should create Kubernetes event" ,
1545+ expectedConditionType : "" ,
1546+ expectedEventType : "GpuPowerWatch" ,
15051547 },
15061548 }
15071549
@@ -1548,25 +1590,25 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
15481590 require .NoError (t , err , "Failed to get test node" )
15491591
15501592 // Count NVSentinel-related conditions (excluding standard K8s conditions like NodeReady)
1551- nvsentinelConditions := 0
1593+ var nvsentinelConditions []corev1. NodeCondition
15521594 for _ , condition := range node .Status .Conditions {
15531595 condType := string (condition .Type )
15541596 if condType != string (corev1 .NodeReady ) &&
15551597 condType != string (corev1 .NodeMemoryPressure ) &&
15561598 condType != string (corev1 .NodeDiskPressure ) &&
15571599 condType != string (corev1 .NodePIDPressure ) &&
15581600 condType != string (corev1 .NodeNetworkUnavailable ) {
1559- nvsentinelConditions ++
1601+ nvsentinelConditions = append ( nvsentinelConditions , condition )
15601602 t .Logf ("Found NVSentinel condition: %s" , condType )
15611603 }
15621604 }
15631605
15641606 if tc .expectNodeConditions {
1565- assert .Equal (t , tc .expectedConditionCount , nvsentinelConditions ,
1566- "Expected %d NVSentinel node conditions , got %d " , tc .expectedConditionCount , nvsentinelConditions )
1607+ assert .Equal (t , tc .expectedConditionType , string ( nvsentinelConditions [ 0 ]. Type ) ,
1608+ "Expected condition type %s , got %s " , tc .expectedConditionType , nvsentinelConditions [ 0 ]. Type )
15671609 } else {
1568- assert .Equal (t , 0 , nvsentinelConditions ,
1569- "Expected no NVSentinel node conditions for STORE_ONLY events, got %d" , nvsentinelConditions )
1610+ assert .Equal (t , 0 , len ( nvsentinelConditions ) ,
1611+ "Expected no NVSentinel node conditions for STORE_ONLY events, got %d" , len ( nvsentinelConditions ) )
15701612 }
15711613
15721614 // Verify Kubernetes events
@@ -1576,8 +1618,8 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
15761618 require .NoError (t , err , "Failed to list events" )
15771619
15781620 if tc .expectKubernetesEvents {
1579- assert .Equal (t , tc .expectedEventCount , len ( events .Items ) ,
1580- "Expected %d Kubernetes events , got %d " , tc .expectedEventCount , len ( events .Items ) )
1621+ assert .Equal (t , tc .expectedEventType , events .Items [ 0 ]. Type ,
1622+ "Expected event type %s , got %s " , tc .expectedEventType , events .Items [ 0 ]. Type )
15811623 } else {
15821624 assert .Equal (t , 0 , len (events .Items ),
15831625 "Expected no Kubernetes events for STORE_ONLY events, got %d" , len (events .Items ))
0 commit comments