   WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   training_4GPU:
     runs-on: [t_cluster]
@@ -23,11 +27,14 @@ jobs:

     - name: training_4GPU
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=\$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_8GPU_ISP:
     runs-on: [t_cluster]
@@ -41,11 +48,14 @@ jobs:

     - name: training_8GPU_ISP
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=ISP-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_8GPU_ISP_CKPT:
     runs-on: [t_cluster]
@@ -59,16 +69,17 @@ jobs:

     - name: training_8GPU_ISP_CKPT
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=ISP_CKPT-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
-        exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py

         jobname=LOAD-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_8GPU_4DP2TP:
     strategy:
@@ -85,11 +96,14 @@ jobs:
     - name: training_8GPU_4DP2TP_T
       if: ${{ matrix.runner == 't_cluster' }}
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_8GPU_4DP2TPSP:
     strategy:
@@ -106,11 +120,13 @@ jobs:
     - name: training_8GPU_4DP2TPSP_T
       if: ${{ matrix.runner == 't_cluster' }}
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
-        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_8GPU_4DP2PP:
     strategy:
@@ -127,11 +143,14 @@ jobs:
    - name: training_8GPU_4DP2PP_T
       if: ${{ matrix.runner == 't_cluster' }}
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -145,11 +164,14 @@ jobs:

     - name: training_8GPU_4DP2PP_ZB
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_16GPU_4DP2TP2PP_MTP:
     strategy:
@@ -166,11 +188,14 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_MTP_T
       if: ${{ matrix.runner == 't_cluster' }}
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
@@ -187,11 +212,14 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_MSP_T
       if: ${{ matrix.runner == 't_cluster' }}
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
@@ -208,11 +236,14 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_FSP_T
       if: ${{ matrix.runner == 't_cluster' }}
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF

   training_llama2:
     strategy:
@@ -228,8 +259,11 @@ jobs:
     - uses: actions/checkout@v3
     - name: training_llama2_T
       run: |
+        ssh ${USER}@${CI_HOST} bash << EOF
+        cd $GITHUB_WORKSPACE
         source activate ${evo_env_torch21_flash2}
         jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
         exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+        EOF
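
A note on the escaping pattern used in the new run: blocks above. Because the heredoc delimiter EOF is unquoted, variables such as $GITHUB_WORKSPACE, ${SLURM_PARTITION} and ${GITHUB_RUN_ID} are expanded by the GitHub runner before the script is piped to the remote shell, while \$jobname and \$exit_code keep their dollar sign and are only evaluated on the CI host. A minimal standalone sketch of the same rule; the host and directory are placeholders, not values from this workflow:

#!/usr/bin/env bash
# Illustration only: HOST is a placeholder, not the workflow's CI_HOST.
HOST=user@ci-host.example
REMOTE_DIR=/tmp

ssh "$HOST" bash << EOF
cd $REMOTE_DIR                      # unescaped: expanded locally before the script is sent
jobname=demo-\$(hostname)           # escaped: \$(hostname) runs on the remote machine
echo "remote job name: \$jobname"   # escaped: reads the variable set remotely
true
exit_code=\$?                       # escaped: captures the remote command's exit status
echo "remote exit code: \$exit_code"
EOF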
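The concurrency block added at the top of the workflow keys each run on the workflow name plus the pull-request number (falling back to github.ref for pushes), and cancel-in-progress: true cancels a superseded run of the same pull request instead of letting it finish. A hypothetical minimal workflow showing the same behaviour; the workflow name and job are made up for illustration:

name: concurrency-demo
on: [pull_request, push]

# Runs that resolve to the same group key replace each other:
# a new push to the same PR (or ref) cancels the older in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - run: echo "only the newest run per PR or ref keeps going"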