Skip to content

Commit 56f9b80

Browse files
authored
Use jumpbox runners for MGMN/SLURM jobs (#1103)
Running these jobs on the jumpbox runners lets us control access to the cluster via an IP whitelist.
1 parent 2f73127 commit 56f9b80

File tree

7 files changed

+102
-111
lines changed

7 files changed

+102
-111
lines changed

.github/workflows/_runner_ondemand_slurm.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ on:
2020
jobs:
2121

2222
launch-slurm-runner:
23-
runs-on: ubuntu-latest
23+
runs-on: jumpbox
2424
steps:
2525
- name: Print environment variables
2626
run: env
@@ -58,7 +58,7 @@ jobs:
5858
shell: bash -x -e {0}
5959
run: |
6060
SLURM_JOB_ID_FILE=$(mktemp)
61-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
61+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
6262
sbatch --parsable \
6363
<<"EOF"
6464
#!/bin/bash
@@ -117,5 +117,5 @@ jobs:
117117
if: cancelled()
118118
shell: bash -x -e {0}
119119
run: |
120-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
120+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
121121
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}

.github/workflows/_test_maxtext.yaml

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -37,14 +37,13 @@ jobs:
3737

3838
single-process-multi-device:
3939
strategy:
40+
max-parallel: 1
4041
matrix:
4142
PARALLEL_CONFIG:
4243
- [1, 1, 2, 4]
4344
# - [1, 1, 1, 8] # PP, DP, FSDP, TP
4445
fail-fast: false
45-
46-
runs-on: ubuntu-22.04
47-
46+
runs-on: jumpbox
4847
steps:
4948
- name: Print environment variables
5049
run: env
@@ -88,7 +87,7 @@ jobs:
8887
id: submit
8988
shell: bash -O expand_aliases -x -e {0}
9089
run: |
91-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
90+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
9291
sshx "date && hostname && sinfo"
9392
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
9493
JOB=$(sshx sbatch --parsable << EOF
@@ -149,17 +148,17 @@ jobs:
149148
if: cancelled()
150149
shell: bash -x -e {0}
151150
run: |
152-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
151+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
153152
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
154153
155154
- name: Retrieve training logs and upload to TensorBoard server
156155
shell: bash -x -e {0}
157156
run: |
158157
mkdir output/
159-
rsync -rtz --progress -e 'ssh -p 3000' \
158+
rsync -rtz --progress\
160159
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
161160
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
162-
rsync -rtz --progress -e 'ssh -p 3000' \
161+
rsync -rtz --progress\
163162
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
164163
output/ || true
165164
rsync -rtz --progress \
@@ -184,6 +183,7 @@ jobs:
184183

185184
maxtext-multinode:
186185
strategy:
186+
max-parallel: 1
187187
matrix:
188188
PARALLEL_CONFIG:
189189
- [1, 1, 1, 1]
@@ -193,9 +193,7 @@ jobs:
193193
- [1, 2, 2, 2]
194194
- [1, 4, 2, 2]
195195
fail-fast: false
196-
197-
runs-on: ubuntu-22.04
198-
196+
runs-on: jumpbox
199197
steps:
200198
- name: Print environment variables
201199
run: env
@@ -240,7 +238,7 @@ jobs:
240238
id: submit
241239
shell: bash -O expand_aliases -x -e {0}
242240
run: |
243-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
241+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
244242
sshx "date && hostname && sinfo"
245243
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
246244
JOB=$(sshx sbatch --parsable << EOF
@@ -304,17 +302,17 @@ jobs:
304302
if: cancelled()
305303
shell: bash -x -e {0}
306304
run: |
307-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
305+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
308306
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
309307
310308
- name: Retrieve training logs and upload to TensorBoard server
311309
shell: bash -x -e {0}
312310
run: |
313311
mkdir output/
314-
rsync -rtz --progress -e 'ssh -p 3000' \
312+
rsync -rtz --progress\
315313
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
316314
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
317-
rsync -rtz --progress -e 'ssh -p 3000' \
315+
rsync -rtz --progress\
318316
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
319317
output/ || true
320318
rsync -rtz --progress \

.github/workflows/_test_pax_rosetta.yaml

Lines changed: 30 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -32,17 +32,15 @@ jobs:
3232

3333
single-process-multi-device-te:
3434
strategy:
35+
max-parallel: 1
3536
matrix:
3637
PARALLEL_CONFIG:
3738
- [1, 8, 1, 1]
3839
- [1, 1, 2, 4]
3940
fail-fast: false
40-
41-
runs-on: ubuntu-22.04
42-
41+
runs-on: jumpbox
4342
env:
4443
BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-process-multi-device-te
45-
4644
steps:
4745
- name: Print environment variables
4846
run: env
@@ -87,7 +85,7 @@ jobs:
8785
shell: bash -O expand_aliases -x -e {0}
8886
run: |
8987
cd $GITHUB_WORKSPACE
90-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
88+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
9189
sshx "date && hostname && sinfo"
9290
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
9391
JOB=$(sshx sbatch --parsable << EOF
@@ -146,18 +144,18 @@ jobs:
146144
if: cancelled()
147145
shell: bash -x -e {0}
148146
run: |
149-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
147+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
150148
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
151149
152150
- name: Retrieve training logs and upload to TensorBoard server
153151
shell: bash -x -e {0}
154152
run: |
155153
cd $GITHUB_WORKSPACE
156154
mkdir output/
157-
rsync -rtz --progress -e 'ssh -p 3000' \
155+
rsync -rtz --progress\
158156
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
159157
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
160-
rsync -rtz --progress -e 'ssh -p 3000' \
158+
rsync -rtz --progress\
161159
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
162160
output/ || true
163161
rsync -rtz --progress \
@@ -223,6 +221,7 @@ jobs:
223221

224222
rosetta-pax-multi-node-te:
225223
strategy:
224+
max-parallel: 1
226225
matrix:
227226
include:
228227
- TEST_NAME: 1DP1FSDP1TP1PP_TE
@@ -259,8 +258,7 @@ jobs:
259258
EVALUATE: true
260259
ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
261260
fail-fast: false
262-
263-
runs-on: ubuntu-22.04
261+
runs-on: jumpbox
264262
env:
265263
BADGE_FILENAME_PREFIX: badge-rosetta-pax-multi-node-te
266264
steps:
@@ -308,7 +306,7 @@ jobs:
308306
id: submit
309307
shell: bash -O expand_aliases -x -e {0}
310308
run: |
311-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
309+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
312310
sshx "date && hostname && sinfo"
313311
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
314312
JOB=$(sshx sbatch --parsable << EOF
@@ -372,18 +370,18 @@ jobs:
372370
if: cancelled()
373371
shell: bash -x -e {0}
374372
run: |
375-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
373+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
376374
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
377375
378376
- name: Retrieve training logs and upload to TensorBoard server
379377
shell: bash -x -e {0}
380378
run: |
381379
cd $GITHUB_WORKSPACE
382380
mkdir output/
383-
rsync -rtz --progress -e 'ssh -p 3000' \
381+
rsync -rtz --progress\
384382
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
385383
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
386-
rsync -rtz --progress -e 'ssh -p 3000' \
384+
rsync -rtz --progress\
387385
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
388386
output/ || true
389387
rsync -rtz --progress \
@@ -450,15 +448,15 @@ jobs:
450448
451449
rosetta-pax-multi-node:
452450
strategy:
451+
max-parallel: 1
453452
matrix:
454453
PARALLEL_CONFIG:
455454
- [1, 8, 1, 1]
456455
- [1, 4, 1, 2]
457456
- [4, 2, 1, 1]
458457
- [4, 2, 1, 2]
459458
fail-fast: false
460-
461-
runs-on: ubuntu-22.04
459+
runs-on: jumpbox
462460
env:
463461
BADGE_FILENAME_PREFIX: badge-rosetta-pax-multi-node
464462
steps:
@@ -506,7 +504,7 @@ jobs:
506504
shell: bash -O expand_aliases -x -e {0}
507505
run: |
508506
cd $GITHUB_WORKSPACE
509-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
507+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
510508
sshx "date && hostname && sinfo"
511509
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
512510
JOB=$(sshx sbatch --parsable << EOF
@@ -567,18 +565,18 @@ jobs:
567565
if: cancelled()
568566
shell: bash -x -e {0}
569567
run: |
570-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
568+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
571569
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
572570
573571
- name: Retrieve training logs and upload to TensorBoard server
574572
shell: bash -x -e {0}
575573
run: |
576574
cd $GITHUB_WORKSPACE
577575
mkdir output/
578-
rsync -rtz --progress -e 'ssh -p 3000' \
576+
rsync -rtz --progress\
579577
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
580578
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
581-
rsync -rtz --progress -e 'ssh -p 3000' \
579+
rsync -rtz --progress\
582580
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
583581
output/ || true
584582
rsync -rtz --progress \
@@ -645,12 +643,12 @@ jobs:
645643

646644
rosetta-pax-single-node-dropout-te:
647645
strategy:
646+
max-parallel: 1
648647
matrix:
649648
PARALLEL_CONFIG:
650649
- [1, 8, 1, 1]
651650
fail-fast: false
652-
653-
runs-on: ubuntu-22.04
651+
runs-on: jumpbox
654652
env:
655653
BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-node-dropout-te
656654
steps:
@@ -698,7 +696,7 @@ jobs:
698696
id: submit
699697
shell: bash -O expand_aliases -x -e {0}
700698
run: |
701-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
699+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
702700
sshx "date && hostname && sinfo"
703701
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
704702
JOB=$(sshx sbatch --parsable << EOF
@@ -762,18 +760,18 @@ jobs:
762760
if: cancelled()
763761
shell: bash -x -e {0}
764762
run: |
765-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
763+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
766764
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
767765
768766
- name: Retrieve training logs and upload to TensorBoard server
769767
shell: bash -x -e {0}
770768
run: |
771769
cd $GITHUB_WORKSPACE
772770
mkdir output/
773-
rsync -rtz --progress -e 'ssh -p 3000' \
771+
rsync -rtz --progress\
774772
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
775773
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
776-
rsync -rtz --progress -e 'ssh -p 3000' \
774+
rsync -rtz --progress\
777775
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
778776
output/ || true
779777
rsync -rtz --progress \
@@ -839,12 +837,12 @@ jobs:
839837

840838
single-process-evaluation-te:
841839
strategy:
840+
max-parallel: 1
842841
matrix:
843842
PARALLEL_CONFIG:
844843
- [1, 8, 1, 1]
845844
fail-fast: false
846-
847-
runs-on: ubuntu-22.04
845+
runs-on: jumpbox
848846
env:
849847
BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-process-evaluation-te
850848
steps:
@@ -890,7 +888,7 @@ jobs:
890888
shell: bash -O expand_aliases -x -e {0}
891889
run: |
892890
cd $GITHUB_WORKSPACE
893-
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
891+
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
894892
sshx "date && hostname && sinfo"
895893
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
896894
JOB=$(sshx sbatch --parsable << EOF
@@ -952,18 +950,18 @@ jobs:
952950
if: cancelled()
953951
shell: bash -x -e {0}
954952
run: |
955-
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
953+
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
956954
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
957955
958956
- name: Retrieve training logs and upload to TensorBoard server
959957
shell: bash -x -e {0}
960958
run: |
961959
cd $GITHUB_WORKSPACE
962960
mkdir output/
963-
rsync -rtz --progress -e 'ssh -p 3000' \
961+
rsync -rtz --progress\
964962
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
965963
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
966-
rsync -rtz --progress -e 'ssh -p 3000' \
964+
rsync -rtz --progress\
967965
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
968966
output/ || true
969967
rsync -rtz --progress \

0 commit comments

Comments
 (0)