Skip to content

Commit 404bf55

Browse files
committed
feat: make review comment changes
Signed-off-by: Tanisha goyal <[email protected]>
1 parent 69a01e2 commit 404bf55

File tree

5 files changed

+98
-127
lines changed

5 files changed

+98
-127
lines changed

docs/postgresql-schema.sql

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,7 @@ CREATE TABLE IF NOT EXISTS health_events (
103103

104104
-- Metadata
105105
created_at TIMESTAMPTZ DEFAULT NOW(),
106-
updated_at TIMESTAMPTZ DEFAULT NOW(),
107-
108-
-- Event handling strategy
109-
processing_strategy VARCHAR(50)
106+
updated_at TIMESTAMPTZ DEFAULT NOW()
110107
);
111108

112109
-- Indexes for health_events

fault-quarantine/pkg/initializer/init.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func InitializeAll(ctx context.Context, params InitializationParams) (*Component
6363
}
6464

6565
builder := client.GetPipelineBuilder()
66-
pipeline := builder.BuildAllHealthEventInsertsPipeline()
66+
pipeline := builder.BuildProcessableHealthEventInsertsPipeline()
6767

6868
var tomlCfg config.TomlConfig
6969
if err := configmanager.LoadTOMLConfig(params.TomlConfigPath, &tomlCfg); err != nil {

platform-connectors/pkg/connectors/kubernetes/k8s_platform_connector_test.go

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,9 +1394,9 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
13941394
healthEvents []*protos.HealthEvent
13951395
expectNodeConditions bool
13961396
expectKubernetesEvents bool
1397-
expectedConditionCount int
1398-
expectedEventCount int
13991397
description string
1398+
expectedConditionType string
1399+
expectedEventType string
14001400
}{
14011401
{
14021402
name: "STORE_ONLY event should not create node condition",
@@ -1417,8 +1417,6 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14171417
},
14181418
expectNodeConditions: false,
14191419
expectKubernetesEvents: false,
1420-
expectedConditionCount: 0,
1421-
expectedEventCount: 0,
14221420
description: "STORE_ONLY fatal event should not create node condition",
14231421
},
14241422
{
@@ -1440,8 +1438,6 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14401438
},
14411439
expectNodeConditions: false,
14421440
expectKubernetesEvents: false,
1443-
expectedConditionCount: 0,
1444-
expectedEventCount: 0,
14451441
description: "STORE_ONLY non-fatal event should not create Kubernetes event",
14461442
},
14471443
{
@@ -1463,9 +1459,9 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14631459
},
14641460
expectNodeConditions: true,
14651461
expectKubernetesEvents: false,
1466-
expectedConditionCount: 1,
1467-
expectedEventCount: 0,
14681462
description: "EXECUTE_REMEDIATION fatal event should create node condition",
1463+
expectedConditionType: "GpuXidError",
1464+
expectedEventType: "",
14691465
},
14701466
{
14711467
name: "Mixed strategies - only EXECUTE_REMEDIATION should be processed",
@@ -1499,9 +1495,55 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
14991495
},
15001496
expectNodeConditions: true,
15011497
expectKubernetesEvents: false,
1502-
expectedConditionCount: 1, // Only GpuThermalWatch should create condition
1503-
expectedEventCount: 0,
15041498
description: "Only EXECUTE_REMEDIATION events should create conditions",
1499+
expectedConditionType: "GpuThermalWatch",
1500+
expectedEventType: "",
1501+
},
1502+
{
1503+
name: "STORE_ONLY non fatal event should not create Kubernetes event",
1504+
healthEvents: []*protos.HealthEvent{
1505+
{
1506+
CheckName: "GpuPowerWatch",
1507+
IsHealthy: false,
1508+
EntitiesImpacted: []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}},
1509+
ErrorCode: []string{""},
1510+
IsFatal: false,
1511+
ProcessingStrategy: protos.ProcessingStrategy_STORE_ONLY,
1512+
GeneratedTimestamp: timestamppb.New(time.Now()),
1513+
ComponentClass: "GPU",
1514+
RecommendedAction: protos.RecommendedAction_CONTACT_SUPPORT,
1515+
Message: "Power warning on GPU 0",
1516+
NodeName: "store-only-test-node",
1517+
},
1518+
},
1519+
expectNodeConditions: false,
1520+
expectKubernetesEvents: false,
1521+
description: "STORE_ONLY non fatal event should not create Kubernetes event",
1522+
expectedConditionType: "",
1523+
expectedEventType: "",
1524+
},
1525+
{
1526+
name: "EXECUTE_REMEDIATION non fatal event should create Kubernetes event",
1527+
healthEvents: []*protos.HealthEvent{
1528+
{
1529+
CheckName: "GpuPowerWatch",
1530+
IsHealthy: false,
1531+
EntitiesImpacted: []*protos.Entity{{EntityType: "GPU", EntityValue: "0"}},
1532+
ErrorCode: []string{""},
1533+
IsFatal: false,
1534+
ProcessingStrategy: protos.ProcessingStrategy_EXECUTE_REMEDIATION,
1535+
GeneratedTimestamp: timestamppb.New(time.Now()),
1536+
ComponentClass: "GPU",
1537+
RecommendedAction: protos.RecommendedAction_CONTACT_SUPPORT,
1538+
Message: "Power warning on GPU 0",
1539+
NodeName: "store-only-test-node",
1540+
},
1541+
},
1542+
expectNodeConditions: false,
1543+
expectKubernetesEvents: true,
1544+
description: "EXECUTE_REMEDIATION non fatal event should create Kubernetes event",
1545+
expectedConditionType: "",
1546+
expectedEventType: "GpuPowerWatch",
15051547
},
15061548
}
15071549

@@ -1548,25 +1590,25 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
15481590
require.NoError(t, err, "Failed to get test node")
15491591

15501592
// Count NVSentinel-related conditions (excluding standard K8s conditions like NodeReady)
1551-
nvsentinelConditions := 0
1593+
var nvsentinelConditions []corev1.NodeCondition
15521594
for _, condition := range node.Status.Conditions {
15531595
condType := string(condition.Type)
15541596
if condType != string(corev1.NodeReady) &&
15551597
condType != string(corev1.NodeMemoryPressure) &&
15561598
condType != string(corev1.NodeDiskPressure) &&
15571599
condType != string(corev1.NodePIDPressure) &&
15581600
condType != string(corev1.NodeNetworkUnavailable) {
1559-
nvsentinelConditions++
1601+
nvsentinelConditions = append(nvsentinelConditions, condition)
15601602
t.Logf("Found NVSentinel condition: %s", condType)
15611603
}
15621604
}
15631605

15641606
if tc.expectNodeConditions {
1565-
assert.Equal(t, tc.expectedConditionCount, nvsentinelConditions,
1566-
"Expected %d NVSentinel node conditions, got %d", tc.expectedConditionCount, nvsentinelConditions)
1607+
assert.Equal(t, tc.expectedConditionType, string(nvsentinelConditions[0].Type),
1608+
"Expected condition type %s, got %s", tc.expectedConditionType, nvsentinelConditions[0].Type)
15671609
} else {
1568-
assert.Equal(t, 0, nvsentinelConditions,
1569-
"Expected no NVSentinel node conditions for STORE_ONLY events, got %d", nvsentinelConditions)
1610+
assert.Equal(t, 0, len(nvsentinelConditions),
1611+
"Expected no NVSentinel node conditions for STORE_ONLY events, got %d", len(nvsentinelConditions))
15701612
}
15711613

15721614
// Verify Kubernetes events
@@ -1576,8 +1618,8 @@ func TestProcessHealthEvents_StoreOnlyStrategy(t *testing.T) {
15761618
require.NoError(t, err, "Failed to list events")
15771619

15781620
if tc.expectKubernetesEvents {
1579-
assert.Equal(t, tc.expectedEventCount, len(events.Items),
1580-
"Expected %d Kubernetes events, got %d", tc.expectedEventCount, len(events.Items))
1621+
assert.Equal(t, tc.expectedEventType, events.Items[0].Type,
1622+
"Expected event type %s, got %s", tc.expectedEventType, events.Items[0].Type)
15811623
} else {
15821624
assert.Equal(t, 0, len(events.Items),
15831625
"Expected no Kubernetes events for STORE_ONLY events, got %d", len(events.Items))

tests/fault_quarantine_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,24 @@ func TestFaultQuarantineWithProcessingStrategy(t *testing.T) {
259259
WithProcessingStrategy(int(protos.ProcessingStrategy_STORE_ONLY))
260260
helpers.SendHealthEvent(ctx, t, event)
261261

262+
t.Logf("Node %s should not have condition SysLogsXIDError", testCtx.NodeName)
263+
helpers.EnsureNodeConditionNotPresent(ctx, t, client, testCtx.NodeName, "SysLogsXIDError")
264+
265+
helpers.AssertQuarantineState(ctx, t, client, testCtx.NodeName, helpers.QuarantineAssertion{
266+
ExpectCordoned: false,
267+
ExpectAnnotation: false,
268+
})
269+
270+
event = helpers.NewHealthEvent(testCtx.NodeName).
271+
WithErrorCode("DCGM_FR_CLOCK_THROTTLE_POWER").
272+
WithCheckName("GpuPowerWatch").
273+
WithFatal(false).
274+
WithProcessingStrategy(int(protos.ProcessingStrategy_STORE_ONLY))
275+
helpers.SendHealthEvent(ctx, t, event)
276+
277+
t.Logf("Node %s should not have GpuPowerWatch node event", testCtx.NodeName)
278+
helpers.EnsureNodeEventNotPresent(ctx, t, client, testCtx.NodeName, "GpuPowerWatch", "GpuPowerWatchIsNotHealthy")
279+
262280
helpers.AssertQuarantineState(ctx, t, client, testCtx.NodeName, helpers.QuarantineAssertion{
263281
ExpectCordoned: false,
264282
ExpectAnnotation: false,
@@ -277,6 +295,24 @@ func TestFaultQuarantineWithProcessingStrategy(t *testing.T) {
277295
WithProcessingStrategy(int(protos.ProcessingStrategy_EXECUTE_REMEDIATION))
278296
helpers.SendHealthEvent(ctx, t, event)
279297

298+
t.Logf("Node %s should have condition SysLogsXIDError", testCtx.NodeName)
299+
helpers.CheckNodeConditionExists(ctx, client, testCtx.NodeName, "SysLogsXIDError", "SysLogsXIDErrorIsNotHealthy")
300+
301+
helpers.AssertQuarantineState(ctx, t, client, testCtx.NodeName, helpers.QuarantineAssertion{
302+
ExpectCordoned: true,
303+
ExpectAnnotation: true,
304+
})
305+
306+
event = helpers.NewHealthEvent(testCtx.NodeName).
307+
WithErrorCode("DCGM_FR_CLOCK_THROTTLE_POWER").
308+
WithCheckName("GpuPowerWatch").
309+
WithFatal(false).
310+
WithProcessingStrategy(int(protos.ProcessingStrategy_EXECUTE_REMEDIATION))
311+
helpers.SendHealthEvent(ctx, t, event)
312+
313+
t.Logf("Node %s should have node event GpuPowerWatch", testCtx.NodeName)
314+
helpers.CheckNodeEventExists(ctx, client, testCtx.NodeName, "GpuPowerWatch", "GpuPowerWatchIsNotHealthy")
315+
280316
helpers.AssertQuarantineState(ctx, t, client, testCtx.NodeName, helpers.QuarantineAssertion{
281317
ExpectCordoned: true,
282318
ExpectAnnotation: true,

tests/platform-connector_test.go

Lines changed: 0 additions & 104 deletions
This file was deleted.

0 commit comments

Comments
 (0)