forked from ray-project/kuberay
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpytorch-mnist-rayjob.yaml
More file actions
51 lines (51 loc) · 1.33 KB
/
pytorch-mnist-rayjob.yaml
File metadata and controls
51 lines (51 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# RayJob manifest template for the PyTorch MNIST perf test.
# `{{.Name}}` and `{{.Image}}` are Go-template placeholders; they are quoted so
# the rendered values are always parsed as strings.
# NOTE(review): the templating tool that renders this file is not visible
# here -- confirm it tolerates quoted placeholders.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: "{{.Name}}"
  labels:
    perf-test: rayjob-pytorch-mnist
spec:
  shutdownAfterJobFinishes: true
  entrypoint: python ray_train_pytorch_mnist.py
  # Embedded runtime-environment YAML, passed as a literal string. All values
  # are quoted so they stay strings. The body is kept byte-for-byte, including
  # its inline comment, because it is part of the string value.
  runtimeEnvYAML: |
    env_vars:
      NUM_WORKERS: "2"
      CPUS_PER_WORKER: "1"
      OMP_NUM_THREADS: "1" # Set OMP_NUM_THREADS to avoid KeyErorr race condition.
  rayClusterSpec:
    rayVersion: "2.52.0"
    headGroupSpec:
      template:
        spec:
          containers:
            - name: ray-head
              image: "{{.Image}}"
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
              # Memory request equals the limit; only a CPU request is set
              # (no CPU limit).
              resources:
                limits:
                  memory: "5Gi"
                requests:
                  cpu: "1"
                  memory: "5Gi"
    workerGroupSpecs:
      # Starts with 2 replicas, bounded by minReplicas/maxReplicas below.
      - replicas: 2
        minReplicas: 1
        maxReplicas: 5
        groupName: worker-group
        template:
          spec:
            containers:
              - name: ray-worker
                image: "{{.Image}}"
                # Memory request equals the limit; only a CPU request is set
                # (no CPU limit).
                resources:
                  limits:
                    memory: "4Gi"
                  requests:
                    cpu: "1"
                    memory: "4Gi"