Skip to content

Commit faa7768

Browse files
committed
chore: address review comments
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent 17a823a commit faa7768

11 files changed

Lines changed: 220 additions & 235 deletions

File tree

distros/kubernetes/nvsentinel/charts/labeler/templates/_helpers.tpl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ enabled = {{ .enabled }}
6060
groupingLabels = [{{- range $i, $label := . }}{{ if $i }}, {{ end }}{{ $label | quote }}{{- end }}]
6161
{{- end }}
6262
currentExpression = '''
63-
{{ .currentExpression | trimSuffix "\n" }}
63+
{{ trimSuffix "\n" (default "" .currentExpression) }}
6464
'''
6565

6666
[classes.labels]

distros/kubernetes/nvsentinel/charts/labeler/values.yaml

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,10 @@ assumeDriverInstalled: false
4747
# Current counts come from CEL expressions over the node object and associated ResourceSlices.
4848
# Expected counts are learned per grouping-label partition unless an override matches.
4949
expectedDeviceCounts:
50-
enabled: false
50+
enabled: true
5151
classes:
5252
- name: gpu
53-
enabled: false
53+
enabled: true
5454
labels:
5555
current: nvsentinel.dgxc.nvidia.com/gpu.count.current
5656
expected: nvsentinel.dgxc.nvidia.com/gpu.count.expected
@@ -60,30 +60,31 @@ expectedDeviceCounts:
6060
expectedCountOverrides: []
6161
currentExpression: |
6262
int(node.metadata.labels['nvidia.com/gpu.count'])
63-
- name: nic
64-
enabled: false
65-
labels:
66-
current: nvsentinel.dgxc.nvidia.com/nic.count.current
67-
expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
68-
groupingLabels:
69-
- node.kubernetes.io/instance-type
70-
expectedCountOverrides: []
71-
currentExpression: |
72-
sum(resourceSlices
73-
.filter(rs,
74-
has(rs.spec.driver) &&
75-
rs.spec.driver == 'dra.networking.k8s.aws' &&
76-
has(rs.spec.devices)
77-
)
78-
.map(rs, rs.spec.devices
79-
.filter(d,
80-
has(d.attributes) &&
81-
'dra.vpc.amazonaws.com/deviceType' in d.attributes &&
82-
has(d.attributes['dra.vpc.amazonaws.com/deviceType'].string) &&
83-
d.attributes['dra.vpc.amazonaws.com/deviceType'].string == 'roce'
84-
)
85-
.size()
86-
))
63+
# Example AWS RoCE DRA NIC class. Uncomment and adapt per environment.
64+
# - name: nic
65+
# enabled: false
66+
# labels:
67+
# current: nvsentinel.dgxc.nvidia.com/nic.count.current
68+
# expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
69+
# groupingLabels:
70+
# - node.kubernetes.io/instance-type
71+
# expectedCountOverrides: []
72+
# currentExpression: |
73+
# sum(resourceSlices
74+
# .filter(rs,
75+
# has(rs.spec.driver) &&
76+
# rs.spec.driver == 'dra.networking.k8s.aws' &&
77+
# has(rs.spec.devices)
78+
# )
79+
# .map(rs, rs.spec.devices
80+
# .filter(d,
81+
# has(d.attributes) &&
82+
# 'dra.vpc.amazonaws.com/deviceType' in d.attributes &&
83+
# has(d.attributes['dra.vpc.amazonaws.com/deviceType'].string) &&
84+
# d.attributes['dra.vpc.amazonaws.com/deviceType'].string == 'roce'
85+
# )
86+
# .size()
87+
# ))
8788

8889
resources:
8990
requests:

distros/kubernetes/nvsentinel/values-full.yaml

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -960,13 +960,12 @@ labeler:
960960
assumeDriverInstalled: false
961961

962962
# Expected device count labels
963-
# Disabled by default. Enable classes to write current/expected device-count labels
964-
# from node labels or DRA ResourceSlice data.
963+
# Writes current/expected device-count labels from node labels or DRA ResourceSlice data.
965964
expectedDeviceCounts:
966-
enabled: false
965+
enabled: true
967966
classes:
968967
- name: gpu
969-
enabled: false
968+
enabled: true
970969
labels:
971970
current: nvsentinel.dgxc.nvidia.com/gpu.count.current
972971
expected: nvsentinel.dgxc.nvidia.com/gpu.count.expected
@@ -976,30 +975,31 @@ labeler:
976975
expectedCountOverrides: []
977976
currentExpression: |
978977
int(node.metadata.labels['nvidia.com/gpu.count'])
979-
- name: nic
980-
enabled: false
981-
labels:
982-
current: nvsentinel.dgxc.nvidia.com/nic.count.current
983-
expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
984-
groupingLabels:
985-
- node.kubernetes.io/instance-type
986-
expectedCountOverrides: []
987-
currentExpression: |
988-
sum(resourceSlices
989-
.filter(rs,
990-
has(rs.spec.driver) &&
991-
rs.spec.driver == 'dra.networking.k8s.aws' &&
992-
has(rs.spec.devices)
993-
)
994-
.map(rs, rs.spec.devices
995-
.filter(d,
996-
has(d.attributes) &&
997-
'dra.vpc.amazonaws.com/deviceType' in d.attributes &&
998-
has(d.attributes['dra.vpc.amazonaws.com/deviceType'].string) &&
999-
d.attributes['dra.vpc.amazonaws.com/deviceType'].string == 'roce'
1000-
)
1001-
.size()
1002-
))
978+
# Example AWS RoCE DRA NIC class. Uncomment and adapt per environment.
979+
# - name: nic
980+
# enabled: false
981+
# labels:
982+
# current: nvsentinel.dgxc.nvidia.com/nic.count.current
983+
# expected: nvsentinel.dgxc.nvidia.com/nic.count.expected
984+
# groupingLabels:
985+
# - node.kubernetes.io/instance-type
986+
# expectedCountOverrides: []
987+
# currentExpression: |
988+
# sum(resourceSlices
989+
# .filter(rs,
990+
# has(rs.spec.driver) &&
991+
# rs.spec.driver == 'dra.networking.k8s.aws' &&
992+
# has(rs.spec.devices)
993+
# )
994+
# .map(rs, rs.spec.devices
995+
# .filter(d,
996+
# has(d.attributes) &&
997+
# 'dra.vpc.amazonaws.com/deviceType' in d.attributes &&
998+
# has(d.attributes['dra.vpc.amazonaws.com/deviceType'].string) &&
999+
# d.attributes['dra.vpc.amazonaws.com/deviceType'].string == 'roce'
1000+
# )
1001+
# .size()
1002+
# ))
10031003

10041004
# Pod resource limits and requests
10051005
resources:

labeler/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ func run() error {
7979

8080
expectedDeviceCounts, err := loadExpectedDeviceCountsConfig(*expectedDeviceCountsConfigFile)
8181
if err != nil {
82-
return err
82+
return fmt.Errorf("load expected device counts config: %w", err)
8383
}
8484

8585
srv := server.NewServer(

labeler/pkg/devicecounts/device_counts.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ func NewManager(config Config) (*Manager, error) {
102102
for i, classConfig := range config.Classes {
103103
compiledClass, ok, err := compileDeviceCountClass(env, i, classConfig)
104104
if err != nil {
105-
return nil, err
105+
return nil, fmt.Errorf("compile device-count class at index %d: %w", i, err)
106106
}
107107

108108
if !ok {
@@ -126,10 +126,6 @@ func (m *Manager) Enabled() bool {
126126

127127
// ClassCount returns the number of compiled device-count classes.
128128
func (m *Manager) ClassCount() int {
129-
if m == nil {
130-
return 0
131-
}
132-
133129
return len(m.classes)
134130
}
135131

@@ -237,7 +233,7 @@ func compileDeviceCountClass(
237233
}
238234

239235
if err := validateDeviceCountClassConfig(index, classConfig); err != nil {
240-
return compiledClass{}, false, err
236+
return compiledClass{}, false, fmt.Errorf("validate device-count class: %w", err)
241237
}
242238

243239
ast, issues := env.Compile(classConfig.CurrentExpression)

labeler/pkg/devicecounts/resource_slices.go

Lines changed: 0 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@
1515
package devicecounts
1616

1717
import (
18-
"fmt"
19-
"log/slog"
20-
"reflect"
21-
2218
corev1 "k8s.io/api/core/v1"
2319
resourcev1 "k8s.io/api/resource/v1"
2420
"k8s.io/client-go/tools/cache"
@@ -48,56 +44,6 @@ func ResourceSlicesForNode(store cache.Store, node *corev1.Node) []*resourcev1.R
4844
return resourceSlices
4945
}
5046

51-
// NewResourceSliceEventHandlers returns informer handlers that reconcile nodes affected by ResourceSlice changes.
52-
func NewResourceSliceEventHandlers(
53-
allInformersSynced func() bool,
54-
updateNodeLabels func(string) error,
55-
) cache.ResourceEventHandlerFuncs {
56-
return cache.ResourceEventHandlerFuncs{
57-
AddFunc: func(obj any) {
58-
resourceSlice, ok := resourceSliceFromEventObject(obj)
59-
if !ok {
60-
slog.Warn("Skipping ResourceSlice add event with unexpected object type",
61-
"type", fmt.Sprintf("%T", obj))
62-
63-
return
64-
}
65-
66-
handleResourceSliceEvent(allInformersSynced, updateNodeLabels, resourceSlice)
67-
},
68-
UpdateFunc: func(oldObj, newObj any) {
69-
oldResourceSlice, oldOk := resourceSliceFromEventObject(oldObj)
70-
newResourceSlice, newOk := resourceSliceFromEventObject(newObj)
71-
72-
if !oldOk || !newOk {
73-
slog.Warn("Skipping ResourceSlice update event with unexpected object type",
74-
"oldType", fmt.Sprintf("%T", oldObj), "newType", fmt.Sprintf("%T", newObj))
75-
76-
return
77-
}
78-
79-
// Device-count expressions only read ResourceSlice spec; ignore metadata-only churn.
80-
if reflect.DeepEqual(oldResourceSlice.Spec, newResourceSlice.Spec) {
81-
return
82-
}
83-
84-
// Reconcile nodes matched by both old and new specs in case node selection changed.
85-
handleResourceSliceEvent(allInformersSynced, updateNodeLabels, oldResourceSlice, newResourceSlice)
86-
},
87-
DeleteFunc: func(obj any) {
88-
resourceSlice, ok := resourceSliceFromEventObject(obj)
89-
if !ok {
90-
slog.Warn("Skipping ResourceSlice delete event with unexpected object type",
91-
"type", fmt.Sprintf("%T", obj))
92-
93-
return
94-
}
95-
96-
handleResourceSliceEvent(allInformersSynced, updateNodeLabels, resourceSlice)
97-
},
98-
}
99-
}
100-
10147
func resourceSliceBelongsToNode(resourceSlice *resourcev1.ResourceSlice, nodeName string) bool {
10248
resourceSliceNodeName, ok := resourceSliceNodeName(resourceSlice)
10349

@@ -111,52 +57,3 @@ func resourceSliceNodeName(resourceSlice *resourcev1.ResourceSlice) (string, boo
11157

11258
return *resourceSlice.Spec.NodeName, true
11359
}
114-
115-
func handleResourceSliceEvent(
116-
allInformersSynced func() bool,
117-
updateNodeLabels func(string) error,
118-
resourceSlices ...*resourcev1.ResourceSlice,
119-
) {
120-
if !allInformersSynced() {
121-
return
122-
}
123-
124-
for nodeName := range nodeNamesForResourceSlices(resourceSlices...) {
125-
if err := updateNodeLabels(nodeName); err != nil {
126-
slog.Error("Failed to reconcile node labels after ResourceSlice event",
127-
"node", nodeName, "error", err)
128-
}
129-
}
130-
}
131-
132-
func nodeNamesForResourceSlices(resourceSlices ...*resourcev1.ResourceSlice) map[string]struct{} {
133-
nodeNames := map[string]struct{}{}
134-
135-
for _, resourceSlice := range resourceSlices {
136-
nodeName, ok := resourceSliceNodeName(resourceSlice)
137-
if !ok {
138-
continue
139-
}
140-
141-
nodeNames[nodeName] = struct{}{}
142-
}
143-
144-
return nodeNames
145-
}
146-
147-
func resourceSliceFromEventObject(obj any) (*resourcev1.ResourceSlice, bool) {
148-
resourceSlice, ok := obj.(*resourcev1.ResourceSlice)
149-
if ok {
150-
return resourceSlice, true
151-
}
152-
153-
// Delete events can arrive as tombstones when the informer misses the final object state.
154-
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
155-
if !ok {
156-
return nil, false
157-
}
158-
159-
resourceSlice, ok = tombstone.Obj.(*resourcev1.ResourceSlice)
160-
161-
return resourceSlice, ok
162-
}

labeler/pkg/initializer/init.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ func InitializeAll(params InitializationParams) (*Components, error) {
5252

5353
slog.Info("Successfully initialized kubernetes client")
5454

55-
labelerInstance, err := labeler.NewLabelerWithDeviceCounts(
55+
labelerInstance, err := labeler.NewLabeler(
5656
clientSet,
5757
30*time.Second,
5858
params.DCGMAppLabel,

0 commit comments

Comments
 (0)