From 08903d402955205e9da46d463e4c986fd3a6f14e Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Tue, 16 Jul 2024 10:51:56 -0700
Subject: [PATCH] Temp WAR for ssh to Slurm Cluster (#947)

Co-authored-by: Olli Lupton
---
 .github/workflows/_runner_ondemand_slurm.yaml |  4 +-
 .github/workflows/_test_maxtext.yaml          | 16 ++++----
 .github/workflows/_test_pax_rosetta.yaml      | 40 +++++++++----------
 .github/workflows/_test_slurm_pyxis.yaml      | 10 ++---
 .github/workflows/_test_t5x_rosetta.yaml      | 28 ++++++-------
 .github/workflows/_test_upstream_pax.yaml     | 24 +++++------
 .github/workflows/_test_upstream_t5x.yaml     | 14 +++----
 .../workflows/scripts/wait_for_slurm_job.sh   |  2 +-
 8 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml
index a9bf08ec9..13576dd42 100644
--- a/.github/workflows/_runner_ondemand_slurm.yaml
+++ b/.github/workflows/_runner_ondemand_slurm.yaml
@@ -58,7 +58,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           SLURM_JOB_ID_FILE=$(mktemp)
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
             sbatch --parsable \
             <<"EOF"
           #!/bin/bash
@@ -117,5 +117,5 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
diff --git a/.github/workflows/_test_maxtext.yaml b/.github/workflows/_test_maxtext.yaml
index 6f50997a5..c46eb62f3 100644
--- a/.github/workflows/_test_maxtext.yaml
+++ b/.github/workflows/_test_maxtext.yaml
@@ -88,7 +88,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -149,17 +149,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -240,7 +240,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -304,17 +304,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
index 22223bf63..abdc1dd13 100644
--- a/.github/workflows/_test_pax_rosetta.yaml
+++ b/.github/workflows/_test_pax_rosetta.yaml
@@ -87,7 +87,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -146,7 +146,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -154,10 +154,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -308,7 +308,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -372,7 +372,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -380,10 +380,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -506,7 +506,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -567,7 +567,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -575,10 +575,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -698,7 +698,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -762,7 +762,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -770,10 +770,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -890,7 +890,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -952,7 +952,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -960,10 +960,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
diff --git a/.github/workflows/_test_slurm_pyxis.yaml b/.github/workflows/_test_slurm_pyxis.yaml
index ca11d1dca..35379ae38 100644
--- a/.github/workflows/_test_slurm_pyxis.yaml
+++ b/.github/workflows/_test_slurm_pyxis.yaml
@@ -123,7 +123,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias SSH='ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}'
+          alias SSH='ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}'
           SSH mkdir -p ${{ steps.meta.outputs.OUTPUT_PATH }}
           SLURM_JOB_ID=$(SSH sbatch --parsable <<"EOF"
           #!/bin/bash
@@ -179,7 +179,7 @@ jobs:
         shell: bash -exu -o pipefail {0}
         run: |
           JOB_INFO=$(
-            ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
+            ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
               sacct -j ${{ steps.submit.outputs.SLURM_JOB_ID }} --format=JobID,JobName,State,Exitcode --parsable2 --noheader |\
               grep -E '^[0-9]+\|'
           )
@@ -196,7 +196,7 @@ jobs:
           echo "******************** TAIL OF SLURM LOG BEG ********************"
           echo "***************************************************************"
           echo "***************************************************************"
-          ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
+          ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
           echo "***************************************************************"
           echo "***************************************************************"
           echo "******************** TAIL OF SLURM LOG END ********************"
@@ -212,7 +212,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           function rsync-down() {
-            rsync -rtz --progress ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}:$1 $2
+            rsync -rtz --progress -e 'ssh -p 3000' ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}:$1 $2
           }
           mkdir -p artifacts/
           rsync-down ${{ steps.meta.outputs.LOG_FILE }} artifacts/
@@ -243,5 +243,5 @@ jobs:
         if: always() && steps.exit-info.outputs.SLURM_EXITCODE != 0
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
+          ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
            scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
diff --git a/.github/workflows/_test_t5x_rosetta.yaml b/.github/workflows/_test_t5x_rosetta.yaml
index 9e3d7a670..66c55dced 100644
--- a/.github/workflows/_test_t5x_rosetta.yaml
+++ b/.github/workflows/_test_t5x_rosetta.yaml
@@ -92,7 +92,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -151,7 +151,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -159,10 +159,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -287,7 +287,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -347,7 +347,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -355,10 +355,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -472,7 +472,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -524,10 +524,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -643,7 +643,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -697,10 +697,10 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
diff --git a/.github/workflows/_test_upstream_pax.yaml b/.github/workflows/_test_upstream_pax.yaml
index eee957ec4..b53a4a1b2 100644
--- a/.github/workflows/_test_upstream_pax.yaml
+++ b/.github/workflows/_test_upstream_pax.yaml
@@ -82,7 +82,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -141,17 +141,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -258,7 +258,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -320,17 +320,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -406,7 +406,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -468,17 +468,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
diff --git a/.github/workflows/_test_upstream_t5x.yaml b/.github/workflows/_test_upstream_t5x.yaml
index e1ff55597..2b4e1cb59 100644
--- a/.github/workflows/_test_upstream_t5x.yaml
+++ b/.github/workflows/_test_upstream_t5x.yaml
@@ -88,7 +88,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -151,10 +151,10 @@ jobs:
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -234,7 +234,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -292,7 +292,7 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
@@ -300,10 +300,10 @@ jobs:
         run: |
           mkdir output/
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress \
+          rsync -rtz --progress -e 'ssh -p 3000' \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
diff --git a/.github/workflows/scripts/wait_for_slurm_job.sh b/.github/workflows/scripts/wait_for_slurm_job.sh
index cec54c1f7..9801909cb 100755
--- a/.github/workflows/scripts/wait_for_slurm_job.sh
+++ b/.github/workflows/scripts/wait_for_slurm_job.sh
@@ -11,7 +11,7 @@ function wait_for_slurm_job() {
   check_every=${3:-15}
 
   while true; do
-    status=$(ssh $host squeue --job $job_id --noheader --format=%T 2>/dev/null || echo "SSH error: $?")
+    status=$(ssh -p 3000 $host squeue --job $job_id --noheader --format=%T 2>/dev/null || echo "SSH error: $?")
     echo "[$(date)] job $job_id: $status"
 
     if [ -z "$status" ]; then