Skip to content

Commit 01cd387

Browse files
authored
Merge branch 'main' into feat/drainGPUPods
2 parents cda0a2d + 94b3f05 commit 01cd387

9 files changed

Lines changed: 327 additions & 32 deletions

File tree

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/templates/configmap.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,24 @@ data:
5353
{{- if .healthEvent.processingStrategy }}
5454
processingStrategy = {{ .healthEvent.processingStrategy | quote }}
5555
{{- end }}
56+
{{- if .healthEvent.quarantineOverrides }}
57+
[policies.healthEvent.quarantineOverrides]
58+
{{- if hasKey .healthEvent.quarantineOverrides "force" }}
59+
force = {{ .healthEvent.quarantineOverrides.force }}
60+
{{- end }}
61+
{{- if hasKey .healthEvent.quarantineOverrides "skip" }}
62+
skip = {{ .healthEvent.quarantineOverrides.skip }}
63+
{{- end }}
64+
{{- end }}
65+
{{- if .healthEvent.drainOverrides }}
66+
[policies.healthEvent.drainOverrides]
67+
{{- if hasKey .healthEvent.drainOverrides "force" }}
68+
force = {{ .healthEvent.drainOverrides.force }}
69+
{{- end }}
70+
{{- if hasKey .healthEvent.drainOverrides "skip" }}
71+
skip = {{ .healthEvent.drainOverrides.skip }}
72+
{{- end }}
73+
{{- end }}
5674
5775
{{- end }}
5876

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/values.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,14 @@ policies:
5050
recommendedAction: CONTACT_SUPPORT
5151
errorCode:
5252
- NODE_NOT_READY
53+
# Optional behavior overrides for this policy's generated HealthEvents.
54+
# Set either force or skip, never both in the same override block.
55+
# quarantineOverrides:
56+
# force: true # Force node cordon even if the normal quarantine rules would not cordon the node.
57+
# skip: true # Skip node cordon for this health event.
58+
# drainOverrides:
59+
# force: true # Force immediate pod eviction regardless of namespace drain mode.
60+
# skip: true # Skip pod eviction and mark the event as already drained.
5361

5462
# Example: Monitor a custom resource (e.g., a GPU Job)
5563
# Uncomment and modify to monitor your own custom resources
@@ -89,6 +97,8 @@ policies:
8997
# recommendedAction: CONTACT_SUPPORT
9098
# errorCode:
9199
# - GPU_JOB_FAILED
100+
# drainOverrides:
101+
# skip: true
92102

93103
resources:
94104
requests:

docs/configuration/kubernetes-object-monitor.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ kubernetes-object-monitor:
8383
recommendedAction: CONTACT_SUPPORT
8484
errorCode:
8585
- ERROR_CODE
86+
quarantineOverrides:
87+
force: true # Or use skip: true; do not set both
88+
drainOverrides:
89+
skip: true # Or use force: true; do not set both
8690
```
8791
8892
### Parameters
@@ -135,6 +139,12 @@ Action code from health event proto (see [health_event.proto](https://github.com
135139
##### errorCode
136140
Array of error code strings for categorization and filtering.
137141

142+
##### quarantineOverrides
143+
Optional behavior override for fault-quarantine. `force` forces node cordoning regardless of normal rules; `skip` skips node cordoning for the generated health event. Set at most one of `force` or `skip`.
144+
145+
##### drainOverrides
146+
Optional behavior override for node-drainer. `force` forces immediate pod eviction regardless of configured namespace drain modes; `skip` skips pod eviction and marks the event as already drained. Set at most one of `force` or `skip`.
147+
138148
## CEL Expressions
139149

140150
### Predicate Expressions

docs/kubernetes-object-monitor.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,16 @@ healthEvent:
134134
recommendedAction: CONTACT_SUPPORT # Action hint
135135
errorCode:
136136
- NODE_NOT_READY # Error codes for classification
137+
quarantineOverrides: # Optional: override node cordon behavior
138+
force: true # Or use skip: true; do not set both
139+
drainOverrides: # Optional: override pod eviction behavior
140+
skip: true # Or use force: true; do not set both
137141
```
138142

143+
For each override block, `force` and `skip` are mutually exclusive. Use `force`
144+
when this policy should perform the action regardless of normal rules, or `skip`
145+
when this policy should bypass the action.
146+
139147
## Key Features
140148

141149
### Policy-Based Monitoring

health-monitors/kubernetes-object-monitor/pkg/config/loader.go

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -50,25 +50,55 @@ func validate(cfg *Config) error {
5050

5151
policyNames[policy.Name] = true
5252

53-
if policy.Resource.Version == "" {
54-
return fmt.Errorf("policy %q: resource.version is required", policy.Name)
53+
if err := validatePolicy(policy); err != nil {
54+
return err
5555
}
56+
}
5657

57-
if policy.Resource.Kind == "" {
58-
return fmt.Errorf("policy %q: resource.kind is required", policy.Name)
59-
}
58+
return nil
59+
}
6060

61-
if policy.Predicate.Expression == "" {
62-
return fmt.Errorf("policy %q: predicate.expression is required", policy.Name)
63-
}
61+
func validatePolicy(policy Policy) error {
62+
if policy.Resource.Version == "" {
63+
return fmt.Errorf("policy %q: resource.version is required", policy.Name)
64+
}
6465

65-
if policy.HealthEvent.ComponentClass == "" {
66-
return fmt.Errorf("policy %q: healthEvent.componentClass is required", policy.Name)
67-
}
66+
if policy.Resource.Kind == "" {
67+
return fmt.Errorf("policy %q: resource.kind is required", policy.Name)
68+
}
6869

69-
if policy.HealthEvent.Message == "" {
70-
return fmt.Errorf("policy %q: healthEvent.message is required", policy.Name)
71-
}
70+
if policy.Predicate.Expression == "" {
71+
return fmt.Errorf("policy %q: predicate.expression is required", policy.Name)
72+
}
73+
74+
if policy.HealthEvent.ComponentClass == "" {
75+
return fmt.Errorf("policy %q: healthEvent.componentClass is required", policy.Name)
76+
}
77+
78+
if policy.HealthEvent.Message == "" {
79+
return fmt.Errorf("policy %q: healthEvent.message is required", policy.Name)
80+
}
81+
82+
if err := validateBehaviourOverrides(policy.Name, "quarantineOverrides",
83+
policy.HealthEvent.QuarantineOverrides); err != nil {
84+
return err
85+
}
86+
87+
if err := validateBehaviourOverrides(policy.Name, "drainOverrides",
88+
policy.HealthEvent.DrainOverrides); err != nil {
89+
return err
90+
}
91+
92+
return nil
93+
}
94+
95+
func validateBehaviourOverrides(policyName, fieldName string, overrides *BehaviourOverridesSpec) error {
96+
if overrides == nil {
97+
return nil
98+
}
99+
100+
if overrides.Force && overrides.Skip {
101+
return fmt.Errorf("policy %q: healthEvent.%s cannot set both force and skip", policyName, fieldName)
72102
}
73103

74104
return nil
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
package config
15+
16+
import (
17+
"os"
18+
"path/filepath"
19+
"testing"
20+
21+
"github.com/stretchr/testify/require"
22+
)
23+
24+
func TestLoadHealthEventBehaviourOverrides(t *testing.T) {
25+
configPath := filepath.Join(t.TempDir(), "config.toml")
26+
tomlConfig := `
27+
[[policies]]
28+
name = "operator-pod-unhealthy"
29+
enabled = true
30+
31+
[policies.resource]
32+
group = ""
33+
version = "v1"
34+
kind = "Pod"
35+
36+
[policies.predicate]
37+
expression = "true"
38+
39+
[policies.healthEvent]
40+
componentClass = "Software"
41+
isFatal = true
42+
message = "operator pod is unhealthy"
43+
recommendedAction = "CONTACT_SUPPORT"
44+
errorCode = ["OPERATOR_POD_UNHEALTHY"]
45+
46+
[policies.healthEvent.quarantineOverrides]
47+
force = true
48+
49+
[policies.healthEvent.drainOverrides]
50+
skip = true
51+
`
52+
require.NoError(t, os.WriteFile(configPath, []byte(tomlConfig), 0o600))
53+
54+
cfg, err := Load(configPath)
55+
require.NoError(t, err)
56+
require.Len(t, cfg.Policies, 1)
57+
58+
healthEvent := cfg.Policies[0].HealthEvent
59+
require.NotNil(t, healthEvent.QuarantineOverrides)
60+
require.True(t, healthEvent.QuarantineOverrides.Force)
61+
require.False(t, healthEvent.QuarantineOverrides.Skip)
62+
require.NotNil(t, healthEvent.DrainOverrides)
63+
require.False(t, healthEvent.DrainOverrides.Force)
64+
require.True(t, healthEvent.DrainOverrides.Skip)
65+
}
66+
67+
func TestLoadRejectsConflictingBehaviourOverrides(t *testing.T) {
68+
tests := []struct {
69+
name string
70+
overrideTOML string
71+
wantError string
72+
}{
73+
{
74+
name: "quarantine overrides force and skip",
75+
overrideTOML: `
76+
[policies.healthEvent.quarantineOverrides]
77+
force = true
78+
skip = true
79+
`,
80+
wantError: `healthEvent.quarantineOverrides cannot set both force and skip`,
81+
},
82+
{
83+
name: "drain overrides force and skip",
84+
overrideTOML: `
85+
[policies.healthEvent.drainOverrides]
86+
force = true
87+
skip = true
88+
`,
89+
wantError: `healthEvent.drainOverrides cannot set both force and skip`,
90+
},
91+
}
92+
93+
for _, tt := range tests {
94+
t.Run(tt.name, func(t *testing.T) {
95+
configPath := filepath.Join(t.TempDir(), "config.toml")
96+
tomlConfig := `
97+
[[policies]]
98+
name = "operator-pod-unhealthy"
99+
enabled = true
100+
101+
[policies.resource]
102+
group = ""
103+
version = "v1"
104+
kind = "Pod"
105+
106+
[policies.predicate]
107+
expression = "true"
108+
109+
[policies.healthEvent]
110+
componentClass = "Software"
111+
isFatal = true
112+
message = "operator pod is unhealthy"
113+
recommendedAction = "CONTACT_SUPPORT"
114+
errorCode = ["OPERATOR_POD_UNHEALTHY"]
115+
` + tt.overrideTOML
116+
require.NoError(t, os.WriteFile(configPath, []byte(tomlConfig), 0o600))
117+
118+
_, err := Load(configPath)
119+
require.Error(t, err)
120+
require.Contains(t, err.Error(), tt.wantError)
121+
})
122+
}
123+
}

health-monitors/kubernetes-object-monitor/pkg/config/types.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,22 @@ type AssociationSpec struct {
4141
}
4242

4343
type HealthEventSpec struct {
44-
ComponentClass string `toml:"componentClass"`
45-
IsFatal bool `toml:"isFatal"`
46-
Message string `toml:"message"`
47-
RecommendedAction string `toml:"recommendedAction"`
48-
ErrorCode []string `toml:"errorCode"`
44+
ComponentClass string `toml:"componentClass"`
45+
IsFatal bool `toml:"isFatal"`
46+
Message string `toml:"message"`
47+
RecommendedAction string `toml:"recommendedAction"`
48+
ErrorCode []string `toml:"errorCode"`
49+
QuarantineOverrides *BehaviourOverridesSpec `toml:"quarantineOverrides,omitempty"`
50+
DrainOverrides *BehaviourOverridesSpec `toml:"drainOverrides,omitempty"`
4951
// override the processing strategy for the policy
5052
ProcessingStrategy string `toml:"processingStrategy"`
5153
}
5254

55+
// BehaviourOverridesSpec is one override block (quarantineOverrides or
// drainOverrides) of a policy's healthEvent section. Config validation rejects
// a block in which both Force and Skip are true.
type BehaviourOverridesSpec struct {
	// Force requests that the action be performed regardless of normal rules.
	Force bool `toml:"force"`
	// Skip requests that the action be bypassed.
	Skip bool `toml:"skip"`
}
59+
5360
func (r *ResourceSpec) GVK() string {
5461
if r.Group == "" {
5562
return r.Version + "/" + r.Kind

health-monitors/kubernetes-object-monitor/pkg/publisher/publisher.go

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,25 @@ func (p *Publisher) PublishHealthEvent(ctx context.Context,
8080
}
8181
}
8282

83+
quarantineOverrides := behaviourOverridesFromSpec(policy.HealthEvent.QuarantineOverrides)
84+
drainOverrides := behaviourOverridesFromSpec(policy.HealthEvent.DrainOverrides)
85+
8386
event := &pb.HealthEvent{
84-
Version: 1,
85-
Agent: agentName,
86-
CheckName: policy.Name,
87-
ComponentClass: policy.HealthEvent.ComponentClass,
88-
GeneratedTimestamp: timestamppb.New(time.Now()),
89-
Message: policy.HealthEvent.Message,
90-
IsFatal: policy.HealthEvent.IsFatal,
91-
IsHealthy: isHealthy,
92-
NodeName: nodeName,
93-
RecommendedAction: mapRecommendedAction(policy.HealthEvent.RecommendedAction),
94-
ErrorCode: policy.HealthEvent.ErrorCode,
95-
ProcessingStrategy: strategy,
96-
EntitiesImpacted: entitiesImpacted,
87+
Version: 1,
88+
Agent: agentName,
89+
CheckName: policy.Name,
90+
ComponentClass: policy.HealthEvent.ComponentClass,
91+
GeneratedTimestamp: timestamppb.New(time.Now()),
92+
Message: policy.HealthEvent.Message,
93+
IsFatal: policy.HealthEvent.IsFatal,
94+
IsHealthy: isHealthy,
95+
NodeName: nodeName,
96+
RecommendedAction: mapRecommendedAction(policy.HealthEvent.RecommendedAction),
97+
ErrorCode: policy.HealthEvent.ErrorCode,
98+
ProcessingStrategy: strategy,
99+
EntitiesImpacted: entitiesImpacted,
100+
QuarantineOverrides: quarantineOverrides,
101+
DrainOverrides: drainOverrides,
97102
}
98103

99104
healthEvents := &pb.HealthEvents{
@@ -113,3 +118,14 @@ func mapRecommendedAction(action string) pb.RecommendedAction {
113118

114119
return pb.RecommendedAction_CONTACT_SUPPORT
115120
}
121+
122+
func behaviourOverridesFromSpec(spec *config.BehaviourOverridesSpec) *pb.BehaviourOverrides {
123+
if spec == nil {
124+
return nil
125+
}
126+
127+
return &pb.BehaviourOverrides{
128+
Force: spec.Force,
129+
Skip: spec.Skip,
130+
}
131+
}

0 commit comments

Comments
 (0)