Skip to content

Commit ae35dd6

Browse files
committed
feat: add GPU training CI workflow with gang scheduling test
Add a new H100 training test workflow that validates PyTorchJob gang scheduling with KAI scheduler on a 2-GPU runner. This includes: - New overlay h100-kind-training-kubeflow with kubeflow-trainer and dynamo components for kind-based training validation - New workflow gpu-h100-training-test.yaml targeting the linux-amd64-gpu-h100-latest-2 runner - Parameterize intent in gpu-operator-install action (default: inference) so training workflows can pass --intent training Signed-off-by: Davanum Srinivas <dsrinivas@nvidia.com>
1 parent 0463e2d commit ae35dd6

File tree

9 files changed

+601
-4
lines changed

9 files changed

+601
-4
lines changed

.github/actions/gpu-operator-install/action.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@ inputs:
2323
description: 'Accelerator type for recipe generation (bundle mode only, e.g. h100)'
2424
required: false
2525
default: ''
26+
intent:
27+
description: 'Intent for recipe generation (bundle mode only, e.g. inference, training)'
28+
required: false
29+
default: 'inference'
2630
platform:
27-
description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)'
31+
description: 'Platform for recipe generation (bundle mode only, e.g. dynamo, kubeflow)'
2832
required: false
2933
default: ''
3034

@@ -63,7 +67,7 @@ runs:
6367
6468
# --- Bundle mode: eidos recipe → bundle → deploy ---
6569

66-
- name: Generate inference recipe
70+
- name: Generate recipe
6771
if: inputs.method == 'bundle'
6872
shell: bash
6973
run: |
@@ -75,7 +79,7 @@ runs:
7579
--service kind \
7680
--accelerator ${{ inputs.accelerator }} \
7781
--os ubuntu \
78-
--intent inference \
82+
--intent ${{ inputs.intent }} \
7983
${PLATFORM_FLAG} \
8084
--output recipe.yaml
8185
echo "--- Recipe ---"
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: GPU Training Test (nvkind + H100 x2)
16+
17+
on:
18+
schedule:
19+
- cron: '0 6,18 * * *' # Every 12 hours (2x daily)
20+
push:
21+
branches:
22+
- "pull-request/[0-9]+"
23+
paths:
24+
- '.github/workflows/gpu-h100-training-test.yaml'
25+
- '.github/actions/gpu-cluster-setup/**'
26+
- '.github/actions/gpu-operator-install/**'
27+
- '.github/actions/eidos-build/**'
28+
- '.github/actions/gpu-test-cleanup/**'
29+
- '.github/actions/load-versions/**'
30+
- 'tests/manifests/gang-scheduling-test.yaml'
31+
- 'tests/chainsaw/ai-conformance/kind-training/**'
32+
- 'recipes/components/dynamo-platform/**'
33+
- 'recipes/overlays/kind.yaml'
34+
- 'recipes/overlays/h100-kind-training-kubeflow.yaml'
35+
workflow_dispatch: {} # Allow manual runs
36+
37+
permissions:
38+
contents: read
39+
40+
concurrency:
41+
group: ${{ github.workflow }}-${{ github.ref }}
42+
cancel-in-progress: true
43+
44+
jobs:
45+
46+
gpu-training-test:
47+
name: GPU Training Test (nvkind + H100 x2)
48+
runs-on: linux-amd64-gpu-h100-latest-2
49+
timeout-minutes: 45
50+
51+
env:
52+
KIND_CLUSTER_NAME: gpu-training-test
53+
54+
steps:
55+
56+
- name: Checkout Code
57+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
58+
with:
59+
persist-credentials: false
60+
61+
- name: Set up GPU cluster
62+
uses: ./.github/actions/gpu-cluster-setup
63+
64+
- name: Build eidos
65+
uses: ./.github/actions/eidos-build
66+
67+
- name: Install GPU operator (bundle)
68+
uses: ./.github/actions/gpu-operator-install
69+
with:
70+
method: bundle
71+
accelerator: h100
72+
intent: training
73+
platform: kubeflow
74+
75+
# --- Kubeflow Training Operator v1.9 (PyTorchJob support) ---
76+
# Installed via kustomize (not bundled) because the eidos bundler is Helm-only.
77+
78+
- name: Install Kubeflow Training Operator v1.9
79+
run: |
80+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply --server-side \
81+
-k "github.com/kubeflow/trainer.git/manifests/overlays/standalone?ref=v1.9.1"
82+
83+
# --- Health checks ---
84+
85+
- name: Install chainsaw
86+
run: |
87+
CHAINSAW_VERSION=$(yq eval '.testing_tools.chainsaw' .settings.yaml)
88+
GOFLAGS= go install "github.com/kyverno/chainsaw@${CHAINSAW_VERSION}"
89+
chainsaw version
90+
91+
- name: Run chainsaw health checks
92+
run: |
93+
chainsaw test \
94+
--test-dir tests/chainsaw/ai-conformance/kind-training \
95+
--config tests/chainsaw/chainsaw-config.yaml
96+
97+
# --- Gang scheduling test with PyTorchJob + KAI scheduler ---
98+
99+
- name: Deploy gang scheduling test
100+
run: |
101+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
102+
-f tests/manifests/gang-scheduling-test.yaml
103+
104+
echo "Waiting for PyTorchJob to complete..."
105+
for i in $(seq 1 60); do
106+
SUCCEEDED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
107+
-n gang-scheduling-test get pytorchjob pytorch-dist-mnist-nccl \
108+
-o jsonpath='{.status.conditions[?(@.type=="Succeeded")].status}' 2>/dev/null)
109+
if [[ "${SUCCEEDED}" == "True" ]]; then
110+
echo "PyTorchJob succeeded!"
111+
break
112+
fi
113+
114+
FAILED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
115+
-n gang-scheduling-test get pytorchjob pytorch-dist-mnist-nccl \
116+
-o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null)
117+
if [[ "${FAILED}" == "True" ]]; then
118+
echo "::error::PyTorchJob failed"
119+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
120+
get pytorchjob pytorch-dist-mnist-nccl -o yaml 2>/dev/null || true
121+
exit 1
122+
fi
123+
124+
echo "Waiting for PyTorchJob completion... (${i}/60)"
125+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
126+
get pods 2>/dev/null || true
127+
sleep 10
128+
done
129+
130+
if [[ "${SUCCEEDED}" != "True" ]]; then
131+
echo "::error::PyTorchJob did not complete within 10 minutes"
132+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
133+
get pytorchjob pytorch-dist-mnist-nccl -o yaml 2>/dev/null || true
134+
exit 1
135+
fi
136+
137+
echo "=== PyTorchJob master logs ==="
138+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
139+
logs pytorch-dist-mnist-nccl-master-0 2>/dev/null || true
140+
echo "=== PyTorchJob worker logs ==="
141+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
142+
logs pytorch-dist-mnist-nccl-worker-0 2>/dev/null || true
143+
144+
# --- Debug diagnostics (before cleanup so resources still exist) ---
145+
146+
- name: Debug diagnostics
147+
if: failure()
148+
run: |
149+
echo "=== KAI scheduler pods ==="
150+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
151+
echo "=== KAI scheduler logs ==="
152+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
153+
logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
154+
echo "=== KAI scheduler queues ==="
155+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
156+
echo "=== KAI scheduler podgroups ==="
157+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
158+
echo "=== Training operator pods ==="
159+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
160+
echo "=== Training operator logs ==="
161+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow \
162+
logs deployment/training-operator --tail=100 2>/dev/null || true
163+
echo "=== PyTorchJob status ==="
164+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
165+
get pytorchjob pytorch-dist-mnist-nccl -o yaml 2>/dev/null || true
166+
echo "=== Gang scheduling test pods ==="
167+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
168+
describe pods 2>/dev/null || true
169+
echo "=== Non-running pods (all namespaces) ==="
170+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
171+
--field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
172+
echo "=== Recent events (gang-scheduling-test) ==="
173+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
174+
get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
175+
echo "=== GPU Operator pods ==="
176+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
177+
echo "=== Node resources ==="
178+
kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
179+
grep -A 20 "Allocated resources" || true
180+
181+
- name: Gang scheduling test cleanup
182+
if: always()
183+
run: |
184+
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
185+
-f tests/manifests/gang-scheduling-test.yaml --ignore-not-found 2>/dev/null || true
186+
187+
- name: GPU Test Cleanup
188+
if: always()
189+
uses: ./.github/actions/gpu-test-cleanup
190+
with:
191+
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
kind: RecipeMetadata
16+
apiVersion: eidos.nvidia.com/v1alpha1
17+
metadata:
18+
name: h100-kind-training-kubeflow
19+
20+
spec:
21+
# Inherits from kind recipe (kind-specific GPU operator + monitoring settings)
22+
# Adds Dynamo inference platform components.
23+
#
24+
# Kubeflow Training Operator v1.9 (PyTorchJob support) is installed separately
25+
# via kustomize in the CI workflow because the eidos bundler only supports Helm
26+
# components. See .github/workflows/gpu-h100-training-test.yaml.
27+
base: kind
28+
29+
criteria:
30+
service: kind
31+
accelerator: h100
32+
intent: training
33+
platform: kubeflow
34+
35+
# DRA requires Kubernetes 1.34+ (GA)
36+
constraints:
37+
- name: K8s.server.version
38+
value: ">= 1.34"
39+
40+
componentRefs:
41+
# Dynamo inference platform (also useful for serving trained models)
42+
- name: dynamo-crds
43+
type: Helm
44+
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
45+
version: "0.8.1"
46+
valuesFile: components/dynamo-crds/values.yaml
47+
48+
- name: dynamo-platform
49+
type: Helm
50+
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
51+
version: "0.8.1"
52+
valuesFile: components/dynamo-platform/values.yaml
53+
dependencyRefs:
54+
- dynamo-crds
55+
- cert-manager
56+
- kube-prometheus-stack
57+
overrides:
58+
# Use kind's local-path-provisioner instead of EBS gp2
59+
etcd:
60+
persistence:
61+
storageClass: standard
62+
nats:
63+
config:
64+
jetstream:
65+
fileStore:
66+
pvc:
67+
storageClassName: standard
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Assert Kubeflow Training Operator is healthy.
16+
# NOTE(review): the CI workflow installs kubeflow/trainer at v1.9.1 via
17+
# kustomize (and its debug step inspects deployment/training-operator),
18+
# but this header cites Helm chart kubeflow-trainer 2.1.0 and the
19+
# deployments asserted below match Trainer v2 naming — confirm which
# install this file is meant to assert. Provides PyTorchJob-based
# distributed training for multi-replica jobs.
# Satisfies CNCF AI Conformance distributed training requirement.
20+
21+
# Kubeflow Trainer controller-manager — manages TrainJob, PyTorchJob lifecycle
22+
apiVersion: apps/v1
23+
kind: Deployment
24+
metadata:
25+
name: kubeflow-trainer-controller-manager
26+
namespace: kubeflow
27+
status:
28+
(conditions[?type == 'Available']):
29+
- status: "True"
30+
---
31+
# JobSet controller — provides multi-replica job primitives for TrainJob
32+
apiVersion: apps/v1
33+
kind: Deployment
34+
metadata:
35+
name: jobset-controller
36+
namespace: kubeflow
37+
status:
38+
(conditions[?type == 'Available']):
39+
- status: "True"
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Assert that critical CRDs are installed by the kind training stack.
16+
# Compared to the cluster (inference) assert-crds:
17+
# Removed: kgateway-crds (Gateway API, Inference Extension) — inference-only
18+
# Added: PyTorchJob CRD from Kubeflow Training Operator v1.9
19+
20+
# ── GPU Operator ───────────────────────────────────────────────────────
21+
# ClusterPolicy CRD — the GPU operator's primary configuration object
22+
apiVersion: apiextensions.k8s.io/v1
23+
kind: CustomResourceDefinition
24+
metadata:
25+
name: clusterpolicies.nvidia.com
26+
---
27+
# ── cert-manager ───────────────────────────────────────────────────────
28+
apiVersion: apiextensions.k8s.io/v1
29+
kind: CustomResourceDefinition
30+
metadata:
31+
name: certificates.cert-manager.io
32+
---
33+
apiVersion: apiextensions.k8s.io/v1
34+
kind: CustomResourceDefinition
35+
metadata:
36+
name: issuers.cert-manager.io
37+
---
38+
apiVersion: apiextensions.k8s.io/v1
39+
kind: CustomResourceDefinition
40+
metadata:
41+
name: clusterissuers.cert-manager.io
42+
---
43+
# ── dynamo-crds ────────────────────────────────────────────────────────
44+
apiVersion: apiextensions.k8s.io/v1
45+
kind: CustomResourceDefinition
46+
metadata:
47+
name: dynamocomponentdeployments.nvidia.com
48+
---
49+
# ── Skyhook ────────────────────────────────────────────────────────────
50+
apiVersion: apiextensions.k8s.io/v1
51+
kind: CustomResourceDefinition
52+
metadata:
53+
name: skyhooks.skyhook.nvidia.com
54+
---
55+
# ── Kubeflow Training Operator v1.9 ───────────────────────────────────
56+
apiVersion: apiextensions.k8s.io/v1
57+
kind: CustomResourceDefinition
58+
metadata:
59+
name: pytorchjobs.kubeflow.org

0 commit comments

Comments
 (0)