Skip to content

Commit 536313d

Browse files
committed
fix(binder): sanitize configmap volume names for dotted pod names
Signed-off-by: Julien Semaan <jsemaan@kubex.ai>
1 parent 4cbd3ea commit 536313d

6 files changed

Lines changed: 186 additions & 17 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2323
- **Breaking:** The podgroup produced for JobSet is now produces as a single PodGroup per JobSet with a two-level SubGroup hierarchy (one parent SubGroup per `replicatedJob`, one leaf SubGroup per replica) regardless of `startupPolicyOrder`. The `kai.scheduler/batch-min-member` annotation on the JobSet now overrides the root `minSubGroup`; the same annotation on `replicatedJobs[].template.metadata.annotations` overrides the leaf `minMember` (defaulting to `template.spec.parallelism`). [#1617](https://github.com/kai-scheduler/KAI-Scheduler/pull/1617) [davidLif](https://github.com/davidLif)
2424

2525
### Fixed
26+
- Fixed GPU-sharing pods with dotted pod names generating invalid ConfigMap-backed volume names. Volume names are now sanitized to valid DNS labels while preserving original ConfigMap references used for shared-GPU injection.
2627
- Reduced scheduler heap retention after scheduling cycles by clearing completed session snapshots and callback references, and by releasing the node scoring pool without waiting for finalizers.
2728
- Fixed Helm chart prometheus RBAC always being installed when `prometheus.enabled` is false, and the `kai-prometheus` ClusterRoleBinding referencing the `prometheus` ServiceAccount in hardcoded `kai-scheduler` namespace instead of the Helm release namespace [#1684](https://github.com/kai-scheduler/KAI-Scheduler/pull/1684) [dttung2905](https://github.com/dttung2905)
2829
- Fixed post-delete cleanup hook hardcoding `kai-scheduler` namespace instead of Helm release namespace on `helm uninstall` [#1619](https://github.com/kai-scheduler/KAI-Scheduler/pull/1619) [dttung2905](https://github.com/dttung2905)

pkg/admission/webhook/v1alpha2/gpusharing/gpu_sharing_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package gpusharing
55

66
import (
77
"fmt"
8+
"strings"
89
"testing"
910

1011
v1 "k8s.io/api/core/v1"
@@ -350,6 +351,88 @@ func TestMutate(t *testing.T) {
350351
}
351352
},
352353
},
354+
{
355+
name: "pod with dotted name keeps configmap refs and sanitizes volume name",
356+
pod: &v1.Pod{
357+
ObjectMeta: metav1.ObjectMeta{
358+
Name: "test.pod",
359+
Namespace: "test-namespace",
360+
Annotations: map[string]string{
361+
constants.GpuFraction: "0.5",
362+
},
363+
},
364+
Spec: v1.PodSpec{
365+
Containers: []v1.Container{
366+
{
367+
Name: "test-container",
368+
Resources: v1.ResourceRequirements{
369+
Limits: v1.ResourceList{},
370+
},
371+
},
372+
},
373+
},
374+
},
375+
expectError: false,
376+
validateMutatedPod: func(t *testing.T, pod *v1.Pod) {
377+
configMapPrefix, found := pod.Annotations["runai/shared-gpu-configmap"]
378+
if !found {
379+
t.Fatalf("Expected shared-gpu-configmap annotation to be added")
380+
}
381+
if !strings.Contains(configMapPrefix, ".") {
382+
t.Fatalf("Expected configmap annotation to preserve dotted pod name, got %q", configMapPrefix)
383+
}
384+
385+
expectedCapabilitiesConfigMapName := configMapPrefix + "-0"
386+
expectedEnvVarsConfigMapName := expectedCapabilitiesConfigMapName + "-evar"
387+
388+
var configMapVolume *v1.Volume
389+
for i := range pod.Spec.Volumes {
390+
volume := &pod.Spec.Volumes[i]
391+
if volume.ConfigMap != nil {
392+
configMapVolume = volume
393+
break
394+
}
395+
}
396+
if configMapVolume == nil {
397+
t.Fatalf("Expected ConfigMap volume to be added")
398+
}
399+
if strings.Contains(configMapVolume.Name, ".") {
400+
t.Fatalf("Expected sanitized volume name, got %q", configMapVolume.Name)
401+
}
402+
if configMapVolume.ConfigMap.LocalObjectReference.Name != expectedCapabilitiesConfigMapName {
403+
t.Fatalf("Expected volume configmap ref %q, got %q",
404+
expectedCapabilitiesConfigMapName, configMapVolume.ConfigMap.LocalObjectReference.Name)
405+
}
406+
407+
container := pod.Spec.Containers[0]
408+
for _, env := range container.Env {
409+
if env.ValueFrom == nil || env.ValueFrom.ConfigMapKeyRef == nil {
410+
continue
411+
}
412+
if env.ValueFrom.ConfigMapKeyRef.LocalObjectReference.Name != expectedCapabilitiesConfigMapName {
413+
t.Fatalf("Expected env var configmap ref %q, got %q",
414+
expectedCapabilitiesConfigMapName,
415+
env.ValueFrom.ConfigMapKeyRef.LocalObjectReference.Name)
416+
}
417+
}
418+
419+
foundEnvFrom := false
420+
for _, envFrom := range container.EnvFrom {
421+
if envFrom.ConfigMapRef == nil {
422+
continue
423+
}
424+
foundEnvFrom = true
425+
if envFrom.ConfigMapRef.LocalObjectReference.Name != expectedEnvVarsConfigMapName {
426+
t.Fatalf("Expected EnvFrom configmap ref %q, got %q",
427+
expectedEnvVarsConfigMapName,
428+
envFrom.ConfigMapRef.LocalObjectReference.Name)
429+
}
430+
}
431+
if !foundEnvFrom {
432+
t.Fatalf("Expected ConfigMapRef in EnvFrom")
433+
}
434+
},
435+
},
353436
{
354437
name: "pod with GPU fraction and specific container name",
355438
pod: &v1.Pod{

pkg/binder/common/gpusharingconfigmap/config_map.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"context"
88
"fmt"
99
"strconv"
10+
"strings"
1011

1112
v1 "k8s.io/api/core/v1"
1213
"k8s.io/apimachinery/pkg/api/errors"
@@ -140,6 +141,7 @@ func generateConfigMapNamePrefix(pod *v1.Pod, containerIndex int) string {
140141
if len(baseName) > maxBaseNameLength {
141142
baseName = baseName[:maxBaseNameLength]
142143
}
144+
baseName = strings.TrimRight(baseName, ".-")
143145
return fmt.Sprintf("%v-%v-%v", baseName,
144146
utilrand.String(configMapNameNumRandomChars), gpuSharingConfigMap)
145147
}

pkg/binder/common/gpusharingconfigmap/config_map_name_test.go

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@ package gpusharingconfigmap
66
import (
77
"testing"
88

9+
"github.com/stretchr/testify/assert"
910
v1 "k8s.io/api/core/v1"
1011
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12+
"k8s.io/apimachinery/pkg/util/validation"
1113
)
1214

1315
type configMapNameTest struct {
@@ -27,9 +29,7 @@ func TestGetDesiredConfigMapName(t *testing.T) {
2729
},
2830
},
2931
Spec: v1.PodSpec{
30-
Containers: []v1.Container{
31-
{},
32-
},
32+
Containers: []v1.Container{{}},
3333
},
3434
},
3535
containerIndex: 0,
@@ -44,22 +44,18 @@ func TestGetDesiredConfigMapName(t *testing.T) {
4444
},
4545
},
4646
Spec: v1.PodSpec{
47-
Containers: []v1.Container{
48-
{},
49-
{},
50-
},
51-
Volumes: []v1.Volume{
52-
{
53-
Name: "config-map-vol",
54-
VolumeSource: v1.VolumeSource{
55-
ConfigMap: &v1.ConfigMapVolumeSource{
56-
LocalObjectReference: v1.LocalObjectReference{
57-
Name: "config-map-name-1",
58-
},
47+
Containers: []v1.Container{{}, {}},
48+
Volumes: []v1.Volume{{
49+
Name: "config-map-vol",
50+
VolumeSource: v1.VolumeSource{
51+
ConfigMap: &v1.ConfigMapVolumeSource{
52+
LocalObjectReference: v1.LocalObjectReference{
53+
Name: "config-map-name-1",
5954
},
6055
},
6156
},
6257
}},
58+
},
6359
},
6460
containerIndex: 1,
6561
desiredConfigMapName: "config-map-name-1",
@@ -85,3 +81,52 @@ func TestGetDesiredConfigMapName(t *testing.T) {
8581
}
8682
}
8783
}
84+
85+
func TestSetGpuCapabilitiesConfigMapNameTrimsTrailingDotAfterTruncation(t *testing.T) {
86+
pod := &v1.Pod{
87+
ObjectMeta: metav1.ObjectMeta{
88+
Name: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.b",
89+
},
90+
Spec: v1.PodSpec{
91+
Containers: []v1.Container{{}},
92+
},
93+
}
94+
95+
containerRef := &PodContainerRef{
96+
Container: &pod.Spec.Containers[0],
97+
Index: 0,
98+
Type: RegularContainer,
99+
}
100+
101+
configMapName := SetGpuCapabilitiesConfigMapName(pod, containerRef)
102+
103+
assert.Empty(t, validation.IsDNS1123Subdomain(configMapName))
104+
assert.NotContains(t, configMapName, ".-")
105+
assert.Equal(t, pod.Annotations[gpuSharingConfigMapAnnotation]+"-0", configMapName)
106+
}
107+
108+
func TestSetGpuCapabilitiesConfigMapNameUsesOwnerRefAndTrimsTrailingHyphenAfterTruncation(t *testing.T) {
109+
pod := &v1.Pod{
110+
ObjectMeta: metav1.ObjectMeta{
111+
Name: "fallback-name",
112+
OwnerReferences: []metav1.OwnerReference{{
113+
Name: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-b",
114+
}},
115+
},
116+
Spec: v1.PodSpec{
117+
Containers: []v1.Container{{}},
118+
},
119+
}
120+
121+
containerRef := &PodContainerRef{
122+
Container: &pod.Spec.Containers[0],
123+
Index: 0,
124+
Type: RegularContainer,
125+
}
126+
127+
configMapName := SetGpuCapabilitiesConfigMapName(pod, containerRef)
128+
129+
assert.Empty(t, validation.IsDNS1123Subdomain(configMapName))
130+
assert.NotContains(t, configMapName, "--")
131+
assert.Equal(t, pod.Annotations[gpuSharingConfigMapAnnotation]+"-0", configMapName)
132+
}

pkg/binder/common/volumes.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
package common
55

66
import (
7-
"fmt"
7+
"regexp"
8+
"strings"
89

910
"k8s.io/api/core/v1"
1011
)
@@ -40,6 +41,11 @@ func addConfigMapVolume(podSpec *v1.PodSpec, volumeName string, configMapName st
4041
podSpec.Volumes = append(podSpec.Volumes, volume)
4142
}
4243

44+
var invalidDNSLabelChars = regexp.MustCompile(`[^a-z0-9-]+`)
45+
4346
func GetConfigVolumeName(configMapName string) string {
44-
return fmt.Sprintf("%v-vol", configMapName)
47+
// ConfigMap names may be DNS subdomains, but volume names must be DNS labels.
48+
volumeName := strings.ToLower(configMapName + "-vol")
49+
volumeName = invalidDNSLabelChars.ReplaceAllString(volumeName, "-")
50+
return strings.Trim(volumeName, "-")
4551
}

pkg/binder/common/volumes_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
package common
55

66
import (
7+
"strings"
78
"testing"
89

910
"github.com/stretchr/testify/assert"
1011
v1 "k8s.io/api/core/v1"
12+
"k8s.io/apimachinery/pkg/util/validation"
1113
)
1214

1315
func TestAddConfigMapVolume(t *testing.T) {
@@ -39,3 +41,33 @@ func TestOverrideConfigMapVolume(t *testing.T) {
3941
assert.Equal(t, podSpec.Volumes[0].Name, "name")
4042
assert.Equal(t, podSpec.Volumes[0].ConfigMap.LocalObjectReference.Name, "conf2")
4143
}
44+
45+
func TestGetConfigVolumeName(t *testing.T) {
46+
tests := []struct {
47+
name string
48+
configMapName string
49+
expected string
50+
}{
51+
{
52+
name: "replaces dots and keeps suffix",
53+
configMapName: "test.pod-abc1234-shared-gpu-0",
54+
expected: "test-pod-abc1234-shared-gpu-0-vol",
55+
},
56+
{
57+
name: "normalizes unsupported characters",
58+
configMapName: "test_pod.config@1",
59+
expected: "test-pod-config-1-vol",
60+
},
61+
}
62+
63+
for _, tt := range tests {
64+
t.Run(tt.name, func(t *testing.T) {
65+
actual := GetConfigVolumeName(tt.configMapName)
66+
67+
assert.Equal(t, tt.expected, actual)
68+
assert.True(t, strings.HasSuffix(actual, "-vol"))
69+
assert.NotContains(t, actual, ".")
70+
assert.Empty(t, validation.IsDNS1123Label(actual))
71+
})
72+
}
73+
}

0 commit comments

Comments
 (0)