Skip to content

Commit

Permalink
Temp WAR (workaround) for ssh to Slurm Cluster (#947)
Browse files (browse the repository at this point in the history)
Co-authored-by: Olli Lupton <[email protected]>
  • Loading branch information
DwarKapex and olupton committed Jul 16, 2024
1 parent 95021b4 commit 08903d4
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 69 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/_runner_ondemand_slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ jobs:
shell: bash -x -e {0}
run: |
SLURM_JOB_ID_FILE=$(mktemp)
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
sbatch --parsable \
<<"EOF"
#!/bin/bash
Expand Down Expand Up @@ -117,5 +117,5 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
16 changes: 8 additions & 8 deletions .github/workflows/_test_maxtext.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ jobs:
id: submit
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -149,17 +149,17 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -240,7 +240,7 @@ jobs:
id: submit
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -304,17 +304,17 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down
40 changes: 20 additions & 20 deletions .github/workflows/_test_pax_rosetta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -146,18 +146,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -308,7 +308,7 @@ jobs:
id: submit
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -372,18 +372,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -506,7 +506,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -567,18 +567,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -698,7 +698,7 @@ jobs:
id: submit
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -762,18 +762,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -890,7 +890,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -952,18 +952,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/_test_slurm_pyxis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ jobs:
id: submit
shell: bash -O expand_aliases -x -e {0}
run: |
alias SSH='ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}'
alias SSH='ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}'
SSH mkdir -p ${{ steps.meta.outputs.OUTPUT_PATH }}
SLURM_JOB_ID=$(SSH sbatch --parsable <<"EOF"
#!/bin/bash
Expand Down Expand Up @@ -179,7 +179,7 @@ jobs:
shell: bash -exu -o pipefail {0}
run: |
JOB_INFO=$(
ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
sacct -j ${{ steps.submit.outputs.SLURM_JOB_ID }} --format=JobID,JobName,State,Exitcode --parsable2 --noheader |\
grep -E '^[0-9]+\|'
)
Expand All @@ -196,7 +196,7 @@ jobs:
echo "******************** TAIL OF SLURM LOG BEG ********************"
echo "***************************************************************"
echo "***************************************************************"
ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
echo "***************************************************************"
echo "***************************************************************"
echo "******************** TAIL OF SLURM LOG END ********************"
Expand All @@ -212,7 +212,7 @@ jobs:
shell: bash -x -e {0}
run: |
function rsync-down() {
rsync -rtz --progress ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}:$1 $2
rsync -rtz --progress -e 'ssh -p 3000' ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}:$1 $2
}
mkdir -p artifacts/
rsync-down ${{ steps.meta.outputs.LOG_FILE }} artifacts/
Expand Down Expand Up @@ -243,5 +243,5 @@ jobs:
if: always() && steps.exit-info.outputs.SLURM_EXITCODE != 0
shell: bash -x -e {0}
run: |
ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
ssh -p 3000 ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
28 changes: 14 additions & 14 deletions .github/workflows/_test_t5x_rosetta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -151,18 +151,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -287,7 +287,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -347,18 +347,18 @@ jobs:
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Retrieve training logs and upload to TensorBoard server
shell: bash -x -e {0}
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -472,7 +472,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -524,10 +524,10 @@ jobs:
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down Expand Up @@ -643,7 +643,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
cd $GITHUB_WORKSPACE
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
Expand Down Expand Up @@ -697,10 +697,10 @@ jobs:
run: |
cd $GITHUB_WORKSPACE
mkdir output/
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
rsync -rtz --progress \
rsync -rtz --progress -e 'ssh -p 3000' \
${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
output/ || true
rsync -rtz --progress \
Expand Down
Loading

0 comments on commit 08903d4

Please sign in to comment.