-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_pytorch_xla_spmd.yaml
More file actions
57 lines (57 loc) · 1.35 KB
/
test_pytorch_xla_spmd.yaml
File metadata and controls
57 lines (57 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
---
# Pod that clones the `ddp-benchmark` branch of a PyTorch/XLA fork and runs
# test/spmd/test_train_spmd_imagenet.py with fake data on one A100-80GB GPU.
apiVersion: v1
kind: Pod
metadata:
  labels:
    # Placeholder: replace with your own username before submitting.
    eidf/user: <your-username>
    # Placeholder: replace <project-namespace> so this names your Kueue
    # local queue.
    kueue.x-k8s.io/queue-name: <project-namespace>-user-queue
  # generateName (not name): the API server appends a random suffix, so
  # submit with `kubectl create -f`, not `kubectl apply -f`.
  generateName: pytorch-xla-imagenet-test-
spec:
  containers:
    # The script below is passed to `/bin/bash -c` (see `command`).
    - args:
        - -c
        - |
          set -euo pipefail
          cd /workspace
          git clone -b ddp-benchmark https://github.com/marijaEf/xla.git
          cd xla
          python3 test/spmd/test_train_spmd_imagenet.py \
            --fake_data \
            --sharding batch \
            --batch_size 128 \
            --epochs 2 \
            --log_steps 20
      command:
        - /bin/bash
      env:
        - name: OMP_NUM_THREADS
          value: '1'
        - name: PJRT_DEVICE
          value: 'CUDA'
        # NOTE(review): presumably should track the nvidia.com/gpu count
        # requested below; confirm against the PyTorch/XLA runtime docs.
        - name: XLA_NUM_DEVICES
          value: '1'
      # NOTE(review): `:latest` is a mutable tag; consider pinning a version
      # or digest for reproducible benchmark runs.
      image: herefortheimage/pytorch-xla-2.7.1-mlperf-resnet50:latest
      name: mlperf-resnet
      resources:
        limits:
          cpu: 16
          memory: 64Gi
          nvidia.com/gpu: 1
        requests:
          cpu: 16
          memory: 32Gi
          nvidia.com/gpu: 1
      volumeMounts:
        # NOTE(review): the run uses --fake_data, so this dataset mount is
        # presumably not read by this particular command; confirm.
        - mountPath: /mnt/ceph_rbd
          name: volume
        - mountPath: /experiments
          name: logs
        - mountPath: /dev/shm
          name: devshm
      # The script changes to /workspace immediately, so this only sets the
      # shell's initial directory.
      workingDir: /workspace/ML
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  # Run once; a failed run is not restarted.
  restartPolicy: Never
  volumes:
    - name: volume
      persistentVolumeClaim:
        claimName: imagenet-dataset
    - name: logs
      persistentVolumeClaim:
        claimName: resnet-logs
    # RAM-backed (tmpfs) scratch volume mounted at /dev/shm; data written
    # here counts against the container's memory limit.
    - emptyDir:
        medium: Memory
      name: devshm