Skip to content

Commit 69a01e2

Browse files
committed
feat: add e2e tests
Signed-off-by: Tanisha goyal <[email protected]>
1 parent a44b5a6 commit 69a01e2

File tree

7 files changed

+208
-2
lines changed

7 files changed

+208
-2
lines changed

distros/kubernetes/nvsentinel/values-tilt-postgresql.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,10 @@ postgresql:
216216
217217
-- Metadata
218218
created_at TIMESTAMPTZ DEFAULT NOW(),
219-
updated_at TIMESTAMPTZ DEFAULT NOW()
219+
updated_at TIMESTAMPTZ DEFAULT NOW(),
220+
221+
-- Event handling strategy
222+
processing_strategy VARCHAR(50)
220223
);
221224
222225
-- Indexes for health_events

docs/postgresql-schema.sql

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,10 @@ CREATE TABLE IF NOT EXISTS health_events (
103103

104104
-- Metadata
105105
created_at TIMESTAMPTZ DEFAULT NOW(),
106-
updated_at TIMESTAMPTZ DEFAULT NOW()
106+
updated_at TIMESTAMPTZ DEFAULT NOW(),
107+
108+
-- Event handling strategy
109+
processing_strategy VARCHAR(50)
107110
);
108111

109112
-- Indexes for health_events

fault-quarantine/pkg/evaluator/rule_evaluator_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ func TestRoundTrip(t *testing.T) {
260260
"nanos": float64(eventTime.GetNanos()),
261261
},
262262
"nodeName": "test-node",
263+
"processingStrategy": float64(0),
263264
"quarantineOverrides": nil,
264265
"drainOverrides": nil,
265266
}

tests/fault_quarantine_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"tests/helpers"
2525

26+
"github.com/nvidia/nvsentinel/data-models/pkg/protos"
2627
"github.com/stretchr/testify/assert"
2728
"github.com/stretchr/testify/require"
2829
v1 "k8s.io/api/core/v1"
@@ -228,3 +229,67 @@ func TestPreCordonedNodeHandling(t *testing.T) {
228229

229230
testEnv.Test(t, feature.Feature())
230231
}
232+
233+
func TestFaultQuarantineWithProcessingStrategy(t *testing.T) {
234+
feature := features.New("TestFaultQuarantineWithProcessingStrategy").
235+
WithLabel("suite", "fault-quarantine-with-processing-strategy")
236+
237+
var testCtx *helpers.QuarantineTestContext
238+
239+
feature.Setup(func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
240+
client, err := c.NewClient()
241+
require.NoError(t, err)
242+
243+
var newCtx context.Context
244+
newCtx, testCtx = helpers.SetupQuarantineTest(ctx, t, c, "data/managed-by-nvsentinel-configmap.yaml")
245+
246+
err = helpers.SetNodeManagedByNVSentinel(newCtx, client, testCtx.NodeName, true)
247+
require.NoError(t, err)
248+
249+
return newCtx
250+
})
251+
252+
feature.Assess("Check that node is not quarantined for STORE_ONLY events", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
253+
client, err := c.NewClient()
254+
require.NoError(t, err)
255+
256+
event := helpers.NewHealthEvent(testCtx.NodeName).
257+
WithErrorCode("79").
258+
WithMessage("XID error occurred").
259+
WithProcessingStrategy(int(protos.ProcessingStrategy_STORE_ONLY))
260+
helpers.SendHealthEvent(ctx, t, event)
261+
262+
helpers.AssertQuarantineState(ctx, t, client, testCtx.NodeName, helpers.QuarantineAssertion{
263+
ExpectCordoned: false,
264+
ExpectAnnotation: false,
265+
})
266+
267+
return ctx
268+
})
269+
270+
feature.Assess("Check that node is quarantined for EXECUTE_REMEDIATION events", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
271+
client, err := c.NewClient()
272+
require.NoError(t, err)
273+
274+
event := helpers.NewHealthEvent(testCtx.NodeName).
275+
WithErrorCode("79").
276+
WithMessage("XID error occurred").
277+
WithProcessingStrategy(int(protos.ProcessingStrategy_EXECUTE_REMEDIATION))
278+
helpers.SendHealthEvent(ctx, t, event)
279+
280+
helpers.AssertQuarantineState(ctx, t, client, testCtx.NodeName, helpers.QuarantineAssertion{
281+
ExpectCordoned: true,
282+
ExpectAnnotation: true,
283+
})
284+
285+
return ctx
286+
})
287+
288+
feature.Teardown(func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
289+
helpers.SendHealthyEvent(ctx, t, testCtx.NodeName)
290+
291+
return helpers.TeardownQuarantineTest(ctx, t, c)
292+
})
293+
294+
testEnv.Test(t, feature.Feature())
295+
}

tests/helpers/healthevent.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ type HealthEventTemplate struct {
4545
Metadata map[string]string `json:"metadata,omitempty"`
4646
QuarantineOverrides *QuarantineOverrides `json:"quarantineOverrides,omitempty"`
4747
NodeName string `json:"nodeName"`
48+
ProcessingStrategy int `json:"processingStrategy,omitempty"`
4849
}
4950

5051
type EntityImpacted struct {
@@ -149,6 +150,11 @@ func (h *HealthEventTemplate) WithRecommendedAction(action int) *HealthEventTemp
149150
return h
150151
}
151152

153+
func (h *HealthEventTemplate) WithProcessingStrategy(strategy int) *HealthEventTemplate {
154+
h.ProcessingStrategy = strategy
155+
return h
156+
}
157+
152158
func (h *HealthEventTemplate) WriteToTempFile() (string, error) {
153159
tempFile, err := os.CreateTemp("", "health-event-*.json")
154160
if err != nil {

tests/helpers/kube.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,30 @@ func WaitForNodeEvent(ctx context.Context, t *testing.T, c klient.Client, nodeNa
384384
}, EventuallyWaitTimeout, WaitInterval, "node %s should have event %v", nodeName, expectedEvent)
385385
}
386386

387+
func EnsureNodeEventNotPresent(ctx context.Context, t *testing.T,
388+
c klient.Client, nodeName string, eventType, eventReason string) {
389+
t.Helper()
390+
391+
require.Never(t, func() bool {
392+
events, err := GetNodeEvents(ctx, c, nodeName, eventType)
393+
if err != nil {
394+
t.Logf("failed to get events for node %s: %v", nodeName, err)
395+
return false
396+
}
397+
398+
for _, event := range events.Items {
399+
if event.Type == eventType && event.Reason == eventReason {
400+
t.Logf("node %s has event %v", nodeName, event)
401+
return true
402+
}
403+
}
404+
405+
t.Logf("node %s does not have event %v", nodeName, eventType)
406+
407+
return false
408+
}, NeverWaitTimeout, WaitInterval, "node %s should not have event %v", nodeName, eventType, eventReason)
409+
}
410+
387411
// SelectTestNodeFromUnusedPool selects an available test node from the cluster.
388412
// Prefers uncordoned nodes but will fall back to the first node if none are available.
389413
func SelectTestNodeFromUnusedPool(ctx context.Context, t *testing.T, client klient.Client) string {

tests/platform-connector_test.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package tests
16+
17+
import (
18+
"context"
19+
"testing"
20+
"tests/helpers"
21+
22+
"github.com/nvidia/nvsentinel/data-models/pkg/protos"
23+
"github.com/stretchr/testify/require"
24+
"sigs.k8s.io/e2e-framework/pkg/envconf"
25+
"sigs.k8s.io/e2e-framework/pkg/features"
26+
)
27+
28+
// PlatformConnectorTestContext carries state shared between the setup, assess
// and teardown steps of the platform-connector e2e test.
type PlatformConnectorTestContext struct {
	// NodeName is the node that health events are sent for.
	NodeName string
	// ConfigMapBackup is not populated by the test shown in this file.
	// NOTE(review): presumably reserved for restoring a ConfigMap — confirm or remove.
	ConfigMapBackup []byte
	// TestNamespace is not populated by the test shown in this file.
	// NOTE(review): confirm intended use or remove.
	TestNamespace string
}
33+
34+
func TestPlatformConnectorWithProcessingStrategy(t *testing.T) {
35+
feature := features.New("TestPlatformConnector").
36+
WithLabel("suite", "platform-connector")
37+
38+
var testCtx *PlatformConnectorTestContext
39+
40+
feature.Setup(func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
41+
client, err := c.NewClient()
42+
require.NoError(t, err)
43+
44+
nodeName := helpers.SelectTestNodeFromUnusedPool(ctx, t, client)
45+
46+
testCtx = &PlatformConnectorTestContext{
47+
NodeName: nodeName,
48+
}
49+
50+
return ctx
51+
})
52+
53+
feature.Assess("Check that platform connector is not applying node events/conditions for STORE_ONLY events", func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
54+
client, err := c.NewClient()
55+
require.NoError(t, err)
56+
57+
event := helpers.NewHealthEvent(testCtx.NodeName).
58+
WithErrorCode(helpers.ERRORCODE_79).
59+
WithMessage("XID error occurred").
60+
WithProcessingStrategy(int(protos.ProcessingStrategy_STORE_ONLY))
61+
helpers.SendHealthEvent(ctx, t, event)
62+
63+
t.Logf("Node %s should not have condition SysLogsXIDError", testCtx.NodeName)
64+
helpers.EnsureNodeConditionNotPresent(ctx, t, client, testCtx.NodeName, "SysLogsXIDError")
65+
66+
event = helpers.NewHealthEvent(testCtx.NodeName).
67+
WithErrorCode(helpers.ERRORCODE_31).
68+
WithMessage("XID error occurred").
69+
WithFatal(false).
70+
WithProcessingStrategy(int(protos.ProcessingStrategy_STORE_ONLY))
71+
helpers.SendHealthEvent(ctx, t, event)
72+
73+
t.Logf("Node %s should not have event SysLogsXIDError", testCtx.NodeName)
74+
helpers.EnsureNodeEventNotPresent(ctx, t, client, testCtx.NodeName, "SysLogsXIDError", "SysLogsXIDErrorIsNotHealthy")
75+
76+
event = helpers.NewHealthEvent(testCtx.NodeName).
77+
WithErrorCode(helpers.ERRORCODE_79).
78+
WithMessage("XID error occurred").
79+
WithProcessingStrategy(int(protos.ProcessingStrategy_EXECUTE_REMEDIATION))
80+
helpers.SendHealthEvent(ctx, t, event)
81+
82+
t.Logf("Node %s should have condition SysLogsXIDError", testCtx.NodeName)
83+
helpers.CheckNodeConditionExists(ctx, client, testCtx.NodeName, "SysLogsXIDError", "SysLogsXIDErrorIsNotHealthy")
84+
85+
event = helpers.NewHealthEvent(testCtx.NodeName).
86+
WithErrorCode(helpers.ERRORCODE_31).
87+
WithMessage("XID error occurred").
88+
WithFatal(false).
89+
WithProcessingStrategy(int(protos.ProcessingStrategy_EXECUTE_REMEDIATION))
90+
helpers.SendHealthEvent(ctx, t, event)
91+
92+
t.Logf("Node %s should have event SysLogsXIDError", testCtx.NodeName)
93+
helpers.CheckNodeEventExists(ctx, client, testCtx.NodeName, "SysLogsXIDError", "SysLogsXIDErrorIsNotHealthy")
94+
95+
return ctx
96+
})
97+
98+
feature.Teardown(func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
99+
helpers.SendHealthyEvent(ctx, t, testCtx.NodeName)
100+
return ctx
101+
})
102+
103+
testEnv.Test(t, feature.Feature())
104+
}

0 commit comments

Comments
 (0)