Skip to content

Commit 378d38a

Browse files
committed
review and add gpu trainer example
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent b499369 commit 378d38a

File tree

2 files changed

+43
-1
lines changed

2 files changed

+43
-1
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
# TrainJob example: run GPU-accelerated LAMMPS (Kokkos/CUDA) under the Flux runtime.
#
# This was developed on AWS g4dn.xlarge, Tesla T4 GPUs
# eksctl create cluster --config-file ./eks-config-gpu.yaml
# eksctl delete cluster --config-file ./eks-config-gpu.yaml --wait
#
# Reference cluster config used for development:
# apiVersion: eksctl.io/v1alpha5
# kind: ClusterConfig
# metadata:
#   name: gpu-cluster
#   region: us-east-2
#
# nodeGroups:
#   - name: hpsf-gpu-workers
#     instanceType: g4dn.xlarge
#     minSize: 2
#     maxSize: 2
#     desiredCapacity: 2
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: lammps-flux
spec:
  # Reference the pre-defined runtime by name
  runtimeRef:
    name: flux-runtime
  trainer:
    numNodes: 2
    numProcPerNode: 1
    image: ghcr.io/flux-framework/tutorials:gpu-lammps-hwloc
    # You do not need to write "flux run, etc" here. It will be wrapped.
    # NOTE(review): the original command passed "-in in.reaxff.hns" twice;
    # the duplicate has been removed (LAMMPS only needs the input file once).
    command:
      - "lmp_gpu"
      - "-k"
      - "on"
      - "g"
      - "8"
      - "-sf"
      - "kk"
      - "-pk"
      - "kokkos"
      - "cuda/aware"
      - "off"
      - "newton"
      - "on"
      - "neigh"
      - "half"
      - "-in"
      - "in.reaxff.hns"
      - "-v"
      - "x"
      - "8"
      - "-v"
      - "y"
      - "8"
      - "-v"
      - "z"
      - "16"
      - "-nocite"
    # One Tesla T4 per node (g4dn.xlarge exposes a single GPU).
    resourcesPerNode:
      limits:
        nvidia.com/gpu: "1"
      requests:
        nvidia.com/gpu: "1"
    env:
      # Restrict Open MPI byte-transfer layers to TCP + loopback.
      - name: OMPI_MCA_btl
        value: tcp,self
      # This is how we match the view (operating system and version) of the
      # initContainer to install Flux.
      - name: FLUX_VIEW_IMAGE
        value: ghcr.io/converged-computing/flux-view-ubuntu:tag-jammy

pkg/runtime/framework/plugins/flux/flux.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,12 +427,14 @@ func (f *Flux) generateFluxEntrypoint(trainJob *trainer.TrainJob, info *runtime.
427427

428428
// Resource file for cluster includes GPUs or not
429429
// flux R encode --hosts=${hosts} --cores=0-1 --gpu=0
430-
Rspec := fmt.Sprintf("--cores=0-%d", tasks-1)
430+
coreSpec := generateRange(int32(tasks), 0)
431+
Rspec := fmt.Sprintf("--cores=%s", coreSpec)
431432
if gpus > 0 {
432433
flags = fmt.Sprintf("%s -g %d", flags, gpus)
433434
gpuSpec := generateRange(int32(gpus), 0)
434435
Rspec = fmt.Sprintf("%s --gpu=%s", Rspec, gpuSpec)
435436
}
437+
fmt.Println(Rspec)
436438
return fmt.Sprintf(entrypointTemplate, Rspec, mainHost, flags)
437439
}
438440

0 commit comments

Comments
 (0)