Skip to content

Commit 33a495f

Browse files
committed
feat: add GPU training CI workflow with gang scheduling test
Add a new H100 training test workflow that validates PyTorchJob gang scheduling with KAI scheduler on a 2-GPU runner. This includes:
- New overlay h100-kind-training-kubeflow with kubeflow-trainer and dynamo components for kind-based training validation
- New workflow gpu-h100-training-test.yaml targeting the linux-amd64-gpu-h100-latest-2 runner
- Parameterize intent in gpu-operator-install action (default: inference) so training workflows can pass --intent training

Signed-off-by: Davanum Srinivas <[email protected]>
1 parent ed4973b commit 33a495f

File tree

3 files changed

+247
-3
lines changed

3 files changed

+247
-3
lines changed

.github/actions/gpu-operator-install/action.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@ inputs:
2323
description: 'Accelerator type for recipe generation (bundle mode only, e.g. h100)'
2424
required: false
2525
default: ''
26+
intent:
27+
description: 'Intent for recipe generation (bundle mode only, e.g. inference, training)'
28+
required: false
29+
default: 'inference'
2630
platform:
27-
description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)'
31+
description: 'Platform for recipe generation (bundle mode only, e.g. dynamo, kubeflow)'
2832
required: false
2933
default: ''
3034

@@ -63,7 +67,7 @@ runs:
6367
6468
# --- Bundle mode: eidos recipe → bundle → deploy ---
6569

66-
- name: Generate inference recipe
70+
- name: Generate recipe
6771
if: inputs.method == 'bundle'
6872
shell: bash
6973
run: |
@@ -75,7 +79,7 @@ runs:
7579
--service kind \
7680
--accelerator ${{ inputs.accelerator }} \
7781
--os ubuntu \
78-
--intent inference \
82+
--intent ${{ inputs.intent }} \
7983
${PLATFORM_FLAG} \
8084
--output recipe.yaml
8185
echo "--- Recipe ---"
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: GPU Training Test (nvkind + H100 x2)
16+
17+
on:
18+
schedule:
19+
- cron: '0 6,18 * * *' # Every 12 hours (2x daily)
20+
push:
21+
branches:
22+
- "pull-request/[0-9]+"
23+
paths:
24+
- '.github/workflows/gpu-h100-training-test.yaml'
25+
- '.github/actions/gpu-cluster-setup/**'
26+
- '.github/actions/gpu-operator-install/**'
27+
- '.github/actions/eidos-build/**'
28+
- '.github/actions/gpu-test-cleanup/**'
29+
- '.github/actions/load-versions/**'
30+
- 'tests/manifests/gang-scheduling-test.yaml'
31+
- 'recipes/components/kubeflow-trainer/**'
32+
- 'recipes/components/dynamo-platform/**'
33+
- 'recipes/overlays/kind.yaml'
34+
- 'recipes/overlays/h100-kind-training-kubeflow.yaml'
35+
workflow_dispatch: {} # Allow manual runs
36+
37+
permissions:
38+
contents: read
39+
40+
concurrency:
41+
group: ${{ github.workflow }}-${{ github.ref }}
42+
cancel-in-progress: true
43+
44+
jobs:
45+
46+
gpu-training-test:
47+
name: GPU Training Test (nvkind + H100 x2)
48+
runs-on: linux-amd64-gpu-h100-latest-2
49+
timeout-minutes: 45
50+
51+
env:
52+
KIND_CLUSTER_NAME: gpu-training-test
53+
54+
steps:
55+
56+
- name: Checkout Code
57+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
58+
with:
59+
persist-credentials: false
60+
61+
- name: Set up GPU cluster
62+
uses: ./.github/actions/gpu-cluster-setup
63+
64+
- name: Build eidos
65+
uses: ./.github/actions/eidos-build
66+
67+
- name: Install GPU operator (bundle)
68+
uses: ./.github/actions/gpu-operator-install
69+
with:
70+
method: bundle
71+
accelerator: h100
72+
intent: training
73+
platform: kubeflow
74+
75+
# --- Gang scheduling test with PyTorchJob + KAI scheduler ---
76+
77+
- name: Deploy gang scheduling test
78+
run: |
79+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
80+
-f tests/manifests/gang-scheduling-test.yaml
81+
82+
echo "Waiting for PyTorchJob to complete..."
83+
for i in $(seq 1 60); do
84+
SUCCEEDED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
85+
-n gang-scheduling-test get pytorchjob pytorch-dist-mnist-nccl \
86+
-o jsonpath='{.status.conditions[?(@.type=="Succeeded")].status}' 2>/dev/null)
87+
if [[ "${SUCCEEDED}" == "True" ]]; then
88+
echo "PyTorchJob succeeded!"
89+
break
90+
fi
91+
92+
FAILED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
93+
-n gang-scheduling-test get pytorchjob pytorch-dist-mnist-nccl \
94+
-o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null)
95+
if [[ "${FAILED}" == "True" ]]; then
96+
echo "::error::PyTorchJob failed"
97+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
98+
get pytorchjob pytorch-dist-mnist-nccl -o yaml 2>/dev/null || true
99+
exit 1
100+
fi
101+
102+
echo "Waiting for PyTorchJob completion... (${i}/60)"
103+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
104+
get pods 2>/dev/null || true
105+
sleep 10
106+
done
107+
108+
if [[ "${SUCCEEDED}" != "True" ]]; then
109+
echo "::error::PyTorchJob did not complete within 10 minutes"
110+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
111+
get pytorchjob pytorch-dist-mnist-nccl -o yaml 2>/dev/null || true
112+
exit 1
113+
fi
114+
115+
echo "=== PyTorchJob master logs ==="
116+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
117+
logs pytorch-dist-mnist-nccl-master-0 2>/dev/null || true
118+
echo "=== PyTorchJob worker logs ==="
119+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
120+
logs pytorch-dist-mnist-nccl-worker-0 2>/dev/null || true
121+
122+
- name: Gang scheduling test cleanup
123+
if: always()
124+
run: |
125+
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
126+
-f tests/manifests/gang-scheduling-test.yaml --ignore-not-found 2>/dev/null || true
127+
128+
# --- Debug diagnostics ---
129+
130+
- name: Debug diagnostics
131+
if: failure()
132+
run: |
133+
echo "=== ClusterPolicy status ==="
134+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
135+
echo "=== GPU Operator pods ==="
136+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
137+
echo "=== KAI scheduler pods ==="
138+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
139+
echo "=== KAI scheduler logs ==="
140+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
141+
logs deployment/kai-scheduler --tail=100 2>/dev/null || true
142+
echo "=== Kubeflow trainer pods ==="
143+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
144+
echo "=== Kubeflow trainer logs ==="
145+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow \
146+
logs deployment/kubeflow-trainer --tail=100 2>/dev/null || true
147+
echo "=== PyTorchJob status ==="
148+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
149+
get pytorchjob pytorch-dist-mnist-nccl -o yaml 2>/dev/null || true
150+
echo "=== Gang scheduling test pods ==="
151+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
152+
get pods -o wide 2>/dev/null || true
153+
echo "=== Non-running pods (all namespaces) ==="
154+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
155+
--field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
156+
echo "=== Recent events (gang-scheduling-test) ==="
157+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
158+
get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
159+
echo "=== Recent events (gpu-operator) ==="
160+
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
161+
get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
162+
echo "=== Node status ==="
163+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
164+
165+
- name: GPU Test Cleanup
166+
if: always()
167+
uses: ./.github/actions/gpu-test-cleanup
168+
with:
169+
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
kind: RecipeMetadata
16+
apiVersion: eidos.nvidia.com/v1alpha1
17+
metadata:
18+
name: h100-kind-training-kubeflow
19+
20+
spec:
21+
# Inherits from kind recipe (kind-specific GPU operator + monitoring settings)
22+
# Adds Kubeflow Training Operator for distributed training (PyTorchJob, TrainJob)
23+
# and Dynamo inference platform components.
24+
base: kind
25+
26+
criteria:
27+
service: kind
28+
accelerator: h100
29+
intent: training
30+
platform: kubeflow
31+
32+
# DRA requires Kubernetes 1.34+ (GA)
33+
constraints:
34+
- name: K8s.server.version
35+
value: ">= 1.34"
36+
37+
componentRefs:
38+
# Kubeflow Training Operator for TrainJob / PyTorchJob support
39+
- name: kubeflow-trainer
40+
type: Helm
41+
valuesFile: components/kubeflow-trainer/values.yaml
42+
manifestFiles:
43+
- components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
44+
45+
# Dynamo inference platform (also useful for serving trained models)
46+
- name: dynamo-crds
47+
type: Helm
48+
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
49+
version: "0.8.1"
50+
valuesFile: components/dynamo-crds/values.yaml
51+
52+
- name: dynamo-platform
53+
type: Helm
54+
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
55+
version: "0.8.1"
56+
valuesFile: components/dynamo-platform/values.yaml
57+
dependencyRefs:
58+
- dynamo-crds
59+
- cert-manager
60+
- kube-prometheus-stack
61+
overrides:
62+
# Use kind's local-path-provisioner instead of EBS gp2
63+
etcd:
64+
persistence:
65+
storageClass: standard
66+
nats:
67+
config:
68+
jetstream:
69+
fileStore:
70+
pvc:
71+
storageClassName: standard

0 commit comments

Comments (0)