@@ -32,17 +32,15 @@ jobs:
3232
3333 single-process-multi-device-te :
3434 strategy :
35+ max-parallel : 1
3536 matrix :
3637 PARALLEL_CONFIG :
3738 - [1, 8, 1, 1]
3839 - [1, 1, 2, 4]
3940 fail-fast : false
40-
41- runs-on : ubuntu-22.04
42-
41+ runs-on : jumpbox
4342 env :
4443 BADGE_FILENAME_PREFIX : badge-rosetta-pax-single-process-multi-device-te
45-
4644 steps :
4745 - name : Print environment variables
4846 run : env
8785 shell : bash -O expand_aliases -x -e {0}
8886 run : |
8987 cd $GITHUB_WORKSPACE
90- alias sshx='ssh -p 3000 - o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
88+ alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
9189 sshx "date && hostname && sinfo"
9290 sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
9391 JOB=$(sshx sbatch --parsable << EOF
@@ -146,18 +144,18 @@ jobs:
146144 if : cancelled()
147145 shell : bash -x -e {0}
148146 run : |
149- ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
147+ ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
150148 scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
151149
152150 - name : Retrieve training logs and upload to TensorBoard server
153151 shell : bash -x -e {0}
154152 run : |
155153 cd $GITHUB_WORKSPACE
156154 mkdir output/
157- rsync -rtz --progress -e 'ssh -p 3000' \
155+ rsync -rtz --progress\
158156 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
159157 output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
160- rsync -rtz --progress -e 'ssh -p 3000' \
158+ rsync -rtz --progress\
161159 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
162160 output/ || true
163161 rsync -rtz --progress \
@@ -223,6 +221,7 @@ jobs:
223221
224222 rosetta-pax-multi-node-te :
225223 strategy :
224+ max-parallel : 1
226225 matrix :
227226 include :
228227 - TEST_NAME : 1DP1FSDP1TP1PP_TE
@@ -259,8 +258,7 @@ jobs:
259258 EVALUATE : true
260259 ADDITIONAL_ARGS : " --model-type LLaMA70BProxy --evaluate"
261260 fail-fast : false
262-
263- runs-on : ubuntu-22.04
261+ runs-on : jumpbox
264262 env :
265263 BADGE_FILENAME_PREFIX : badge-rosetta-pax-multi-node-te
266264 steps :
@@ -308,7 +306,7 @@ jobs:
308306 id : submit
309307 shell : bash -O expand_aliases -x -e {0}
310308 run : |
311- alias sshx='ssh -p 3000 - o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
309+ alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
312310 sshx "date && hostname && sinfo"
313311 sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
314312 JOB=$(sshx sbatch --parsable << EOF
@@ -372,18 +370,18 @@ jobs:
372370 if : cancelled()
373371 shell : bash -x -e {0}
374372 run : |
375- ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
373+ ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
376374 scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
377375
378376 - name : Retrieve training logs and upload to TensorBoard server
379377 shell : bash -x -e {0}
380378 run : |
381379 cd $GITHUB_WORKSPACE
382380 mkdir output/
383- rsync -rtz --progress -e 'ssh -p 3000' \
381+ rsync -rtz --progress\
384382 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
385383 output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
386- rsync -rtz --progress -e 'ssh -p 3000' \
384+ rsync -rtz --progress\
387385 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
388386 output/ || true
389387 rsync -rtz --progress \
@@ -450,15 +448,15 @@ jobs:
450448
451449 rosetta-pax-multi-node :
452450 strategy :
451+ max-parallel : 1
453452 matrix :
454453 PARALLEL_CONFIG :
455454 - [1, 8, 1, 1]
456455 - [1, 4, 1, 2]
457456 - [4, 2, 1, 1]
458457 - [4, 2, 1, 2]
459458 fail-fast : false
460-
461- runs-on : ubuntu-22.04
459+ runs-on : jumpbox
462460 env :
463461 BADGE_FILENAME_PREFIX : badge-rosetta-pax-multi-node
464462 steps :
@@ -506,7 +504,7 @@ jobs:
506504 shell : bash -O expand_aliases -x -e {0}
507505 run : |
508506 cd $GITHUB_WORKSPACE
509- alias sshx='ssh -p 3000 - o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
507+ alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
510508 sshx "date && hostname && sinfo"
511509 sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
512510 JOB=$(sshx sbatch --parsable << EOF
@@ -567,18 +565,18 @@ jobs:
567565 if : cancelled()
568566 shell : bash -x -e {0}
569567 run : |
570- ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
568+ ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
571569 scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
572570
573571 - name : Retrieve training logs and upload to TensorBoard server
574572 shell : bash -x -e {0}
575573 run : |
576574 cd $GITHUB_WORKSPACE
577575 mkdir output/
578- rsync -rtz --progress -e 'ssh -p 3000' \
576+ rsync -rtz --progress\
579577 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
580578 output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
581- rsync -rtz --progress -e 'ssh -p 3000' \
579+ rsync -rtz --progress\
582580 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
583581 output/ || true
584582 rsync -rtz --progress \
@@ -645,12 +643,12 @@ jobs:
645643
646644 rosetta-pax-single-node-dropout-te :
647645 strategy :
646+ max-parallel : 1
648647 matrix :
649648 PARALLEL_CONFIG :
650649 - [1, 8, 1, 1]
651650 fail-fast : false
652-
653- runs-on : ubuntu-22.04
651+ runs-on : jumpbox
654652 env :
655653 BADGE_FILENAME_PREFIX : badge-rosetta-pax-single-node-dropout-te
656654 steps :
@@ -698,7 +696,7 @@ jobs:
698696 id : submit
699697 shell : bash -O expand_aliases -x -e {0}
700698 run : |
701- alias sshx='ssh -p 3000 - o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
699+ alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
702700 sshx "date && hostname && sinfo"
703701 sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
704702 JOB=$(sshx sbatch --parsable << EOF
@@ -762,18 +760,18 @@ jobs:
762760 if : cancelled()
763761 shell : bash -x -e {0}
764762 run : |
765- ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
763+ ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
766764 scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
767765
768766 - name : Retrieve training logs and upload to TensorBoard server
769767 shell : bash -x -e {0}
770768 run : |
771769 cd $GITHUB_WORKSPACE
772770 mkdir output/
773- rsync -rtz --progress -e 'ssh -p 3000' \
771+ rsync -rtz --progress\
774772 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
775773 output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
776- rsync -rtz --progress -e 'ssh -p 3000' \
774+ rsync -rtz --progress\
777775 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
778776 output/ || true
779777 rsync -rtz --progress \
@@ -839,12 +837,12 @@ jobs:
839837
840838 single-process-evaluation-te :
841839 strategy :
840+ max-parallel : 1
842841 matrix :
843842 PARALLEL_CONFIG :
844843 - [1, 8, 1, 1]
845844 fail-fast : false
846-
847- runs-on : ubuntu-22.04
845+ runs-on : jumpbox
848846 env :
849847 BADGE_FILENAME_PREFIX : badge-rosetta-pax-single-process-evaluation-te
850848 steps :
@@ -890,7 +888,7 @@ jobs:
890888 shell : bash -O expand_aliases -x -e {0}
891889 run : |
892890 cd $GITHUB_WORKSPACE
893- alias sshx='ssh -p 3000 - o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
891+ alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
894892 sshx "date && hostname && sinfo"
895893 sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
896894 JOB=$(sshx sbatch --parsable << EOF
@@ -952,18 +950,18 @@ jobs:
952950 if : cancelled()
953951 shell : bash -x -e {0}
954952 run : |
955- ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
953+ ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
956954 scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
957955
958956 - name : Retrieve training logs and upload to TensorBoard server
959957 shell : bash -x -e {0}
960958 run : |
961959 cd $GITHUB_WORKSPACE
962960 mkdir output/
963- rsync -rtz --progress -e 'ssh -p 3000' \
961+ rsync -rtz --progress\
964962 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
965963 output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
966- rsync -rtz --progress -e 'ssh -p 3000' \
964+ rsync -rtz --progress\
967965 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
968966 output/ || true
969967 rsync -rtz --progress \
0 commit comments