Skip to content

Commit 628cab2

Browse files
committed
feat(skyhook): temporarily remove skyhook tuning due to bugs
1 parent 0463e2d commit 628cab2

File tree

5 files changed

+110
-38
lines changed

5 files changed

+110
-38
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
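# Skyhook No-Op Customization (temporary placeholder for the H100 EKS Ubuntu tuning)
16+
# Minimal templating - only tolerations/nodeSelectors are dynamic for CLI flag support
17+
#
18+
# This customization applies no node tuning; it stands in for tuning.yaml while that manifest is disabled:
19+
# - Installs a single shellscript package whose apply.sh only echoes a message
20+
# - Makes no GRUB, containerd, or sysctl changes
21+
# - Keeps the Skyhook CR metadata and accelerated-node scheduling fields in place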
22+
{{- $cust := index .Values "skyhook-customizations" }}
23+
{{- if ne (toString (index $cust "enabled")) "false" }}
24+
---
25+
apiVersion: skyhook.nvidia.com/v1alpha1
26+
kind: Skyhook
27+
metadata:
28+
annotations:
29+
# Helm hooks ensure this CR is applied after skyhook-operator CRD is installed
30+
"helm.sh/hook": post-install,post-upgrade
31+
"helm.sh/hook-weight": "10"
32+
"helm.sh/hook-delete-policy": before-hook-creation
33+
labels:
34+
app.kubernetes.io/part-of: skyhook-operator
35+
app.kubernetes.io/created-by: eidos
36+
app.kubernetes.io/managed-by: {{ .Release.Service }}
37+
helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }}
38+
name: no-op
39+
namespace: {{ .Release.Namespace }}
40+
spec:
41+
runtimeRequired: true
42+
# Dynamic: Supports --accelerated-node-toleration CLI flag; falls back to tolerating the "dedicated" key when unset
43+
additionalTolerations:
44+
{{- if $cust.acceleratedTolerations }}
45+
{{- toYaml $cust.acceleratedTolerations | nindent 4 }}
46+
{{- else }}
47+
- key: dedicated
48+
operator: Exists
49+
{{- end }}
50+
# Dynamic: Supports --accelerated-node-selector CLI flag; nodeSelectors is omitted entirely when unset
51+
{{- if $cust.acceleratedNodeSelector }}
52+
nodeSelectors:
53+
matchLabels:
54+
{{- toYaml $cust.acceleratedNodeSelector | nindent 6 }}
55+
{{- end }}
56+
# Static: Placeholder no-op package; the real H100 tuning packages are temporarily disabled
57+
packages:
58+
no-op:
59+
image: ghcr.io/nvidia/skyhook-packages/shellscript
60+
version: "1.1.1"
61+
configMap:
62+
apply.sh: |
63+
#!/bin/bash
64+
echo "No-op package applied"
65+
{{- end }}

recipes/components/skyhook-customizations/manifests/tuning.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,17 @@ spec:
4141
runtimeRequired: true
4242
# Dynamic: Supports --accelerated-node-toleration CLI flag
4343
additionalTolerations:
44-
{{- if $cust.tolerations }}
45-
{{- toYaml $cust.tolerations | nindent 4 }}
44+
{{- if $cust.acceleratedTolerations }}
45+
{{- toYaml $cust.acceleratedTolerations | nindent 4 }}
4646
{{- else }}
4747
- key: dedicated
4848
operator: Exists
4949
{{- end }}
5050
# Dynamic: Supports --accelerated-node-selector CLI flag
51-
{{- if $cust.nodeSelector }}
51+
{{- if $cust.acceleratedNodeSelector }}
5252
nodeSelectors:
5353
matchLabels:
54-
{{- toYaml $cust.nodeSelector | nindent 6 }}
54+
{{- toYaml $cust.acceleratedNodeSelector | nindent 6 }}
5555
{{- end }}
5656
# Static: Baked-in tuning for H100 Ubuntu training
5757
packages:

recipes/overlays/h100-eks-inference.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,15 @@ spec:
3333
value: ">= 1.32.4"
3434

3535
componentRefs:
36+
3637
- name: skyhook-customizations
3738
type: Helm
3839
manifestFiles:
39-
- components/skyhook-customizations/manifests/tuning.yaml
40+
# Temporarily disabled until skyhook-customizations is fixed
41+
- components/skyhook-customizations/manifests/no-op.yaml
4042
overrides:
4143
service: aws
4244
accelerator: h100
4345
intent: inference
4446
dependencyRefs:
45-
- skyhook-operator
47+
- skyhook-operator

recipes/overlays/h100-eks-training.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ spec:
3333
value: ">= 1.32.4"
3434

3535
componentRefs:
36-
3736
# H100-specific GPU Operator overrides (inherits valuesFile from eks-training)
3837
- name: gpu-operator
3938
type: Helm
@@ -46,7 +45,8 @@ spec:
4645
- name: skyhook-customizations
4746
type: Helm
4847
manifestFiles:
49-
- components/skyhook-customizations/manifests/tuning.yaml
48+
# Temporarily disabled until skyhook-customizations is fixed
49+
- components/skyhook-customizations/manifests/no-op.yaml
5050
overrides:
5151
service: aws
5252
accelerator: h100

recipes/registry.yaml

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,41 @@ apiVersion: eidos.nvidia.com/v1alpha1
3939
kind: ComponentRegistry
4040

4141
components:
42+
- name: skyhook-operator
43+
displayName: skyhook
44+
valueOverrideKeys:
45+
- skyhook
46+
helm:
47+
defaultRepository: https://nvidia.github.io/skyhook
48+
defaultChart: skyhook-operator
49+
defaultNamespace: skyhook
50+
nodeScheduling:
51+
system:
52+
nodeSelectorPaths:
53+
- controllerManager.selectors
54+
tolerationPaths:
55+
- controllerManager.tolerations
56+
57+
- name: skyhook-customizations
58+
displayName: skyhook-customizations
59+
valueOverrideKeys:
60+
- skyhookcustomizations
61+
helm:
62+
# Manifest-only component - no external Helm chart, uses manifestFiles
63+
defaultRepository: ""
64+
defaultNamespace: skyhook
65+
nodeScheduling:
66+
system:
67+
nodeSelectorPaths:
68+
- systemNodeSelector
69+
tolerationPaths:
70+
- systemTolerations
71+
accelerated:
72+
nodeSelectorPaths:
73+
- acceleratedNodeSelector
74+
tolerationPaths:
75+
- acceleratedTolerations
76+
4277
- name: gpu-operator
4378
displayName: gpu-operator
4479
valueOverrideKeys:
@@ -119,36 +154,6 @@ components:
119154
- cainjector.tolerations
120155
- startupapicheck.tolerations
121156

122-
- name: skyhook-operator
123-
displayName: skyhook
124-
valueOverrideKeys:
125-
- skyhook
126-
helm:
127-
defaultRepository: https://nvidia.github.io/skyhook
128-
defaultChart: skyhook-operator
129-
defaultNamespace: skyhook
130-
nodeScheduling:
131-
system:
132-
nodeSelectorPaths:
133-
- controllerManager.selectors
134-
tolerationPaths:
135-
- controllerManager.tolerations
136-
137-
- name: skyhook-customizations
138-
displayName: skyhook-customizations
139-
valueOverrideKeys:
140-
- skyhookcustomizations
141-
helm:
142-
# Manifest-only component - no external Helm chart, uses manifestFiles
143-
defaultRepository: ""
144-
defaultNamespace: skyhook
145-
nodeScheduling:
146-
accelerated:
147-
nodeSelectorPaths:
148-
- nodeSelector
149-
tolerationPaths:
150-
- tolerations
151-
152157
- name: nvsentinel
153158
displayName: nvsentinel
154159
valueOverrideKeys:

0 commit comments

Comments
 (0)