
Commit 02ea919

test(ci): update ci (#405)
1 parent 7d03512 commit 02ea919

8 files changed, +275 -96 lines changed

.github/workflows/demo_in_readme.yaml

Lines changed: 41 additions & 2 deletions
@@ -11,6 +11,10 @@ env:
   WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   dataset-preparation:
     runs-on: [t_cluster]
@@ -23,13 +27,23 @@ jobs:

       - name: raw-chinese-data
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
+          EOF

       - name: alpaca-data
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/data/tokenizer_alpaca.sh
+          EOF

   train:
     runs-on: [t_cluster]
@@ -44,28 +58,48 @@ jobs:
       - name: slurm-train
         id: basic_train
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
+          EOF

       - name: load_preset_ckpt
         if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
+          EOF

       - name: load_new_ckpt
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts
+          EOF

       - name: torchrun-train
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts
+          EOF

   convert-model-then-load:
     runs-on: [t_cluster]
@@ -79,13 +113,18 @@ jobs:

       - name: convert-model-then-load
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
+          export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
+          export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/model/convert_to_hf.sh
           cd ./hf_ckpt
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
           exit_code=$?
           cd ..
           rm -rf $GITHUB_WORKSPACE/hf_ckpt
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF
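
The first hunk above adds a workflow-level concurrency group (e2e_test.yaml below gains the identical block), so a newly pushed commit cancels the run still in progress for the same pull request instead of queuing behind it on the cluster. A minimal sketch of how the added block evaluates; the workflow name "demo_in_readme" and PR #405 are illustrative values, not quoted from the repository:

    # pull_request event for PR #405 -> group "demo_in_readme-405"
    # push with no associated PR     -> group "demo_in_readme-refs/heads/develop"
    concurrency:
      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
      cancel-in-progress: true

Because github.event.pull_request.number is empty outside pull_request events, the || expression falls back to github.ref, and cancel-in-progress: true stops every superseded run in the group.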

.github/workflows/e2e_test.yaml

Lines changed: 60 additions & 26 deletions
@@ -10,6 +10,10 @@ env:
   WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   training_4GPU:
     runs-on: [t_cluster]
@@ -23,11 +27,14 @@ jobs:

       - name: training_4GPU
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=\$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_8GPU_ISP:
     runs-on: [t_cluster]
@@ -41,11 +48,14 @@ jobs:

       - name: training_8GPU_ISP
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=ISP-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_8GPU_ISP_CKPT:
     runs-on: [t_cluster]
@@ -59,16 +69,17 @@ jobs:

       - name: training_8GPU_ISP_CKPT
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=ISP_CKPT-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
-          exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py

           jobname=LOAD-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_8GPU_4DP2TP:
     strategy:
@@ -85,11 +96,14 @@ jobs:
       - name: training_8GPU_4DP2TP_T
         if: ${{ matrix.runner == 't_cluster' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_8GPU_4DP2TPSP:
     strategy:
@@ -106,11 +120,13 @@ jobs:
       - name: training_8GPU_4DP2TPSP_T
         if: ${{ matrix.runner == 't_cluster' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
-          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_8GPU_4DP2PP:
     strategy:
@@ -127,11 +143,14 @@ jobs:
       - name: training_8GPU_4DP2PP_T
         if: ${{ matrix.runner == 't_cluster' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -145,11 +164,14 @@ jobs:

       - name: training_8GPU_4DP2PP_ZB
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_16GPU_4DP2TP2PP_MTP:
     strategy:
@@ -166,11 +188,14 @@ jobs:
      - name: training_16GPU_4DP2TP2PP_MTP_T
         if: ${{ matrix.runner == 't_cluster' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
@@ -187,11 +212,14 @@ jobs:
      - name: training_16GPU_4DP2TP2PP_MSP_T
         if: ${{ matrix.runner == 't_cluster' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
@@ -208,11 +236,14 @@ jobs:
      - name: training_16GPU_4DP2TP2PP_FSP_T
         if: ${{ matrix.runner == 't_cluster' }}
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_llama2:
     strategy:
@@ -228,8 +259,11 @@ jobs:
       - uses: actions/checkout@v3
       - name: training_llama2_T
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF
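
Every step converted above now runs its body on the cluster through ssh ${USER}@${CI_HOST} bash << EOF, and the hunks consistently rewrite $jobname and $exit_code as \$jobname and \$exit_code. The reason is heredoc expansion order: in an unquoted heredoc, unescaped variables are substituted by the runner's local shell before the text is sent over SSH, while escaped ones survive and are expanded by the remote shell. A stand-alone demonstration of that behaviour (assumes SSH access to localhost; this is not a script from the repository):

    #!/usr/bin/env bash
    # GITHUB_RUN_ID is set locally, the way the Actions runner provides it.
    GITHUB_RUN_ID=12345

    ssh localhost bash << EOF
    # The unescaped \${GITHUB_RUN_ID} below was already replaced with 12345
    # by the local shell before this text reached the remote host.
    jobname=${GITHUB_RUN_ID}-demo
    echo "expanded locally:  $jobname"     # empty: the local shell expanded it, and jobname is unset there
    echo "expanded remotely: \$jobname"    # prints 12345-demo
    EOF

This is why the srun and check_slurm_cancled.sh lines escape \$jobname and \$exit_code: both are assigned inside the heredoc, so only the remote shell knows their values.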

.github/workflows/monthly_test.yaml

Lines changed: 15 additions & 6 deletions
@@ -21,11 +21,14 @@ jobs:

       - name: training_tp_norm_layer_msp
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_msp" ./tests/test_training/test_norm_weight.py
+          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=\$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_msp" ./tests/test_training/test_norm_weight.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_tp_norm_layer_fsp:
     runs-on: [t_cluster]
@@ -40,11 +43,14 @@ jobs:

       - name: training_tp_norm_layer_fsp
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_fsp" ./tests/test_training/test_norm_weight.py
+          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=\$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_fsp" ./tests/test_training/test_norm_weight.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   training_tp_norm_layer_isp:
     runs-on: [t_cluster]
@@ -59,11 +65,14 @@ jobs:

       - name: training_tp_norm_layer_isp
         run: |
+          ssh ${USER}@${CI_HOST} bash << EOF
+          cd $GITHUB_WORKSPACE
           source activate ${evo_env_torch21_flash2}
           jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_isp" ./tests/test_training/test_norm_weight.py
+          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=\$jobname -N 1 -n 1 --gres=gpu:8 pytest -s -v --color=yes -m "check_norm_isp" ./tests/test_training/test_norm_weight.py
           exit_code=$?
-          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
+          EOF

   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
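
After this change all three workflows share the same remote-execution shape: the step body is sent to the cluster over SSH, values the runner already knows (workspace, partition, run id) are expanded locally before the heredoc is sent, and the Slurm job name assigned inside the heredoc is referenced with an escaped \$. A condensed template of that shape; the step name and pytest marker are placeholders, not entries from the repository:

    - name: example_remote_test
      run: |
        ssh ${USER}@${CI_HOST} bash << EOF
        cd $GITHUB_WORKSPACE
        source activate ${evo_env_torch21_flash2}
        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname pytest -s -v --color=yes -m "example_marker" ./tests/test_training/test_loss.py
        exit_code=$?
        sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
        EOF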
