Skip to content

Commit

Permalink
update uno for cloudmesh-sbatch
Browse files Browse the repository at this point in the history
  • Loading branch information
laszewsk committed Jul 24, 2023
1 parent e6d5543 commit b62da1e
Show file tree
Hide file tree
Showing 8 changed files with 692 additions and 0 deletions.
128 changes: 128 additions & 0 deletions benchmarks/uno/target/rivanna/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Recipes run under bash (the `setup` recipe relies on `source`).
SHELL=/bin/bash
# AWS CLI prefix for unsigned (anonymous) S3 access against the STFC Echo endpoint.
AWS_S3=aws s3 --no-sign-request --endpoint-url https://s3.echo.stfc.ac.uk
# Per-user scratch root; ${USER} is taken from the environment.
USER_SCRATCH=/scratch/${USER}
# NOTE(review): this still points at the cloudmask benchmark tree — confirm that is intended for this copy of the Makefile.
PROJECT_DIR=${USER_SCRATCH}/mlcommons/benchmarks/cloudmask
# Destination directory for the `data` target's downloads.
PROJECT_DATA=${USER_SCRATCH}/data
# Benchmark name; no rule visible in this file references it.
NAME=cloudmask

# Every target listed here names a command rather than a file it produces, so
# it has to be phony.  Without this, a same-named file or directory (the
# generated `project/`, `localscratch/` and `simple/` trees, or the existing
# `image-singularity/` directory) makes the target look up to date and make
# silently skips it.  `image-docker` is kept from the original declaration.
.PHONY: all download requirements data project setup generate run submit \
        localscratch simple kill stop inspect watch status clean \
        image image-singularity image-docker run-singularity shell-singularity \
        run-localscratch shell-localscratch shell-rivanna push shell

# Default goal: install the Python requirements, then download the data set.
all: requirements data


# Clone the mlcommons repository over SSH into the current directory.
# The remote had been reduced to an "[email protected]" placeholder by e-mail
# obfuscation, which git cannot parse; the scp-style SSH remote for a GitHub
# repository is git@github.com:<owner>/<repo>.git.
download:
	git clone git@github.com:laszewsk/mlcommons.git

# Install the Python dependencies listed for the rivanna experiments and
# report how long the installation took.
requirements:
	time pip install -r $(PROJECT_DIR)/experiments/rivanna/requirements.txt

# Download the two parts of the cloud_slstr_ds1 data set (one-day, ssts) into
# ${PROJECT_DATA} using anonymous S3 access.
data:
	mkdir -p ${PROJECT_DATA}/ssts
	mkdir -p ${PROJECT_DATA}/one-day
	echo -n "Downloading first portion of data..."
	# `&&` instead of `;`: if the cd fails, do not sync into whatever directory make was started from.
	cd ${PROJECT_DATA} && ${AWS_S3} sync s3://sciml-datasets/es/cloud_slstr_ds1/one-day ./one-day --cli-read-timeout 0
	echo -n "Downloading second portion of data..."
	cd ${PROJECT_DATA} && ${AWS_S3} sync s3://sciml-datasets/es/cloud_slstr_ds1/ssts ./ssts --cli-read-timeout 0


# Rebuild the experiment set from scratch: clean, regenerate project.json, then the job script.
# NOTE(review): the left-to-right order of these prerequisites is only guaranteed in a serial run; under `make -j` the clean step may overlap generation.
project: clean project.json generate

# Run the environment/YAML setup helper, then install the requirements into
# the ~/ENV3 virtual environment.
setup:
	python setup_env_and_yaml.py
	# Activation and install share one shell line because every recipe line runs in its own shell.
	# ${PROJECT_DIR} replaces the previously duplicated hard-coded scratch path (same value by default).
	source ~/ENV3/bin/activate && pip install -r ${PROJECT_DIR}/experiments/rivanna/requirements.txt

# Build the submission script for the `project` experiment set.
generate: jobs-project.sh

# Execute the generated submission script with sh; the leading `-` tells make
# to carry on even when the script fails or is missing.
submit:
	-sh jobs-project.sh

# `run` submits the generated jobs.
run: submit

# Build the localscratch experiment description.
localscratch: localscratch.json


# Turn an experiment description (<name>.json) into the shell script that
# submits its jobs (jobs-<name>.sh).
#
# The output is written to a temporary file and only renamed onto the target
# when `cms` succeeds.  Redirecting straight into $@ would leave an empty or
# truncated script behind on failure, and because that file is newer than its
# prerequisite make would consider it up to date on the next run.
jobs-%.sh: %.json
	cms sbatch generate submit --name=$< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@


# Generate the `simple` experiment tree from simple.in.slurm.
#
# The recipe passes `--config=$<`, and $< is the first prerequisite.  The rule
# used to have none, so `--config=` was passed empty.  config.in.yaml is the
# configuration the sibling %.json rule uses, so it is declared here too.
# NOTE(review): confirm config.in.yaml is the intended configuration for `simple`.
simple: config.in.yaml
	cms sbatch generate \
	       --source=simple.in.slurm \
	       --config=$< \
	       --name=$(basename $@) \
	       --noos \
	       --os=USER,HOME \
	       --nocm \
	       --output_dir=./$(basename $@) \
	       --source_dir=. \
	       --verbose


# Expand config.in.yaml and the cloudmask_v2.in.slurm template into the
# experiment set named by the target's stem (project.json -> ./project).
# $* is the stem matched by `%`, i.e. the target name without `.json`.
%.json: config.in.yaml
	cms sbatch generate \
	       --source=cloudmask_v2.in.slurm \
	       --config=$< \
	       --name=$* \
	       --noos \
	       --os=USER,HOME \
	       --nocm \
	       --output_dir=./$* \
	       --source_dir=. \
	       --copycode="cloudmask_v2.py,data_loader.py,model.py" \
	       --verbose

# `kill` is an alias for `stop`.
kill: stop

# Cancel every job the current user has in the queue.
# awk skips squeue's header line (NR>1) and prints the first column.
# The command substitution is deliberately left unquoted so the loop iterates
# once per job id and not at all when the queue is empty; quoted, it ran
# exactly once even with no jobs and called `scancel` without arguments.
stop:
	for i in $$(squeue --user $$USER | awk 'NR>1{print $$1}'); do scancel $$i ; done

# Open config.yaml and job.slurm of the first experiment directory under
# project/ in emacs.
#
# The lookup is done by the shell inside one recipe line.  The previous form
# used $(eval ...)/$(shell ...): `$(ls -1)` there was an (empty) make variable
# reference, not a command, and `$(shell emacs ...)` started the editor while
# make was expanding the recipe, with its stdout captured by make.
inspect:
	D=$$(ls project/ | head -n 1); \
	echo $$D; \
	emacs project/$$D/config.yaml project/$$D/job.slurm

# Continuously display the current user's queue with a wide job-name column.
# The backslash-escaped quotes are intentional: they reach `watch` as literal
# quote characters, so the format string is still one argument when `watch`
# re-parses its command line.
status:
	watch squeue --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\" --me

# `watch` is an alias for `status`.
watch: status


# Remove everything the generate targets create (experiment trees, their
# .json descriptions and job scripts), plus Python caches and editor backups.
# `@` hides the commands; `-` keeps make going if a removal fails.
clean:
	@-rm -rf localscratch localscratch.json jobs-localscratch.sh \
	         project project.json jobs-project.sh \
	         '__pycache__' \
	         *~
	@-rm -f rivanna.slurm


# image


# Build the Singularity image from its definition file.
image-singularity:
	cms rivanna singularity build image-singularity/cloudmask.def

# The targets below delegate to the Makefile inside image-singularity/.
# They use `$(MAKE) -C` rather than `cd image-singularity; make ...` because:
#   * if the directory is missing, `cd ...; make run` would run `make run` in
#     THIS directory and recurse into this Makefile; `-C` fails cleanly;
#   * $(MAKE) passes command-line flags and the jobserver to the sub-make.
run-singularity:
	$(MAKE) -C image-singularity run

shell-singularity:
	$(MAKE) -C image-singularity shell

run-localscratch:
	$(MAKE) -C image-singularity run-localscratch

shell-localscratch:
	$(MAKE) -C image-singularity shell-localscratch

shell-rivanna:
	$(MAKE) -C image-singularity shell-rivanna

# NOTE(review): `run` is also declared with prerequisite `submit` earlier in
# this file; make merges both prerequisite lists, so `make run` submits the
# jobs AND runs the container — confirm that is intended.
run: run-singularity

# `image` is an alias for building the Singularity image.
image: image-singularity

# Checkout on rivanna that `push` refreshes.  The default keeps the path that
# was previously hard-coded; other users can override it, e.g.
#   make push RIVANNA_PROJECT_DIR=/scratch/$USER/mlcommons/benchmarks/cloudmask
RIVANNA_PROJECT_DIR ?= /scratch/thf2bn/mlcommons/benchmarks/cloudmask

# Push local commits (failure ignored), then pull them into the rivanna checkout.
# `cd ... &&` makes sure `git pull` never runs outside the checkout when the
# directory is missing; `ssh-add` may fail without stopping the pull, as before.
push:
	-git push
	ssh -tt rivanna "cd $(RIVANNA_PROJECT_DIR) && { ssh-add; git pull; }"


# `shell` is an alias for `shell-singularity`.
shell: shell-singularity
File renamed without changes.
188 changes: 188 additions & 0 deletions benchmarks/uno/target/rivanna/cloudmask_v2.in.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#!/usr/bin/env bash
#
# SLURM batch template for the cloudmask benchmark.  Values written in
# curly-brace placeholder form are filled in when the template is rendered
# into a concrete job script.
#
#SBATCH --job-name={experiment.card_name}-cloudmask-{experiment.epoch}-{experiment.repeat}
#SBATCH --output={experiment.card_name}-{experiment.epoch}-{experiment.repeat}-cloudmask-%u-%j.out
#SBATCH --error={experiment.card_name}-{experiment.epoch}-{experiment.repeat}-cloudmask-%u-%j.err
{slurm.sbatch}
# NOTE(review): -c is the short form of --cpus-per-task, so the request on the
# next line and the fixed --cpus-per-task=1 further down contradict each
# other; confirm which value is intended and drop the other.
#SBATCH -c {experiment.cpu_num}
#SBATCH --mem={experiment.mem}
#SBATCH --gres=gpu:{experiment.card_name}:{experiment.gpu_count}
#SBATCH --cpus-per-task=1
# The address had been replaced by an e-mail obfuscation placeholder; restored.
# NOTE(review): domain reconstructed as virginia.edu (rivanna) — confirm.  Also
# verify that sbatch expands the user pattern in a mail address at all.
#SBATCH --mail-user=%u@virginia.edu
#SBATCH --mail-type=ALL
#SBATCH --time={sbatch.time}


# xSBATCH --partition=gpu
# xSBATCH --mem=64GB


# Print a three-line progress banner to stdout.
#   $1  status word (the script uses "running" and "done")
#   $2  progress value (the script passes integers from 1 to 100)
# The middle line also carries the PID of this shell.
PROGRESS () {
    echo "# ###########################################"
    # $1 now stays inside the quotes; it used to sit between two quoted
    # strings, where it was subject to word splitting and glob expansion.
    echo "# cloudmesh status=$1 progress=$2 pid=$$"
    echo "# ###########################################"
}

# Progress checkpoint 1: the job script has started.
PROGRESS "running" 1

echo "# ==================================="
echo "# SLURM info"
echo "# ==================================="

# Record the rendered experiment parameters in the job log.  The placeholder
# values are fixed when the template is rendered; SLURM_JOB_ID and the final
# USER line are read from the environment at run time.
echo USER {os.USER}
echo HOME {os.HOME}
echo cardname {experiment.card_name}
echo gpu count {experiment.gpu_count}
echo epoc {experiment.epoch}
echo repeat {experiment.repeat}
echo jobno $SLURM_JOB_ID
# NOTE(review): if this placeholder renders to text starting with '#', the
# shell treats the rest of the line as a comment and echo prints an empty line.
echo {slurm.sbatch}
echo cpu num {experiment.cpu_num}
echo mem {experiment.mem}
echo USER $USER

# Progress checkpoint 2: parameters logged.
PROGRESS "running" 2

echo "# ==================================="
echo "# Set up file system"
echo "# ==================================="

#
# PYTHON with cms on rivanna
#

# Virtual environment that is activated later in this script.
export PYTHON_DIR=$HOME/ENV3
#export PYTHON_DIR=$USER_SCRATCH/ENV3

#
# CODE
#
# All code paths hang off the per-user scratch directory.
export USER_SCRATCH=/scratch/$USER
export PROJECT_DIR=$USER_SCRATCH/mlcommons/benchmarks/cloudmask
export CODE_DIR=$PROJECT_DIR/target/rivanna
# The container image (cloudmask.sif) is looked up in this directory.
export CONTAINERDIR=${CODE_DIR}

# Per-experiment output directory; it is created further down with mkdir -p.
export OUTPUTS_DIR="${CODE_DIR}/project/{sbatch.identifier}/outputs"

#
# DATA
#

# Shared project storage holding the benchmark data.
export PROJECT_DATA=/project/bii_dsc_community/mlcommons/data/cloudmask/



# Progress checkpoint 3: paths exported.
PROGRESS "running" 3

# set -uxe

# Work out where this script lives.  Inside a SLURM job, ask scontrol for the
# job's Command= field; otherwise resolve the path this script was started with.
#
# The test must be quoted: with an empty variable the unquoted form collapses
# to `[ -n ]`, which is always true, so the else branch could never run.
# SLURM_JOB_ID is used throughout (the original mixed it with SLURM_JOBID).
if [ -n "${SLURM_JOB_ID:-}" ] ; then
    THEPATH=$(scontrol show job "$SLURM_JOB_ID" | awk -F= '/Command=/{print $2}')
else
    THEPATH=$(realpath "$0")
fi
# Quoted so that an empty or space-containing path is passed as one argument.
LOCATION=$(dirname "$THEPATH")

echo "LOCATION:", $LOCATION
echo "THEPATH:", $THEPATH
echo
echo "USER_SCRATCH: $USER_SCRATCH"
echo "PROJECT_DIR:  $PROJECT_DIR"
echo "PYTHON_DIR:   $PYTHON_DIR"
echo "PROJECT_DATA: $PROJECT_DATA"
echo "CONTAINERDIR: $CONTAINERDIR"


# Create the per-experiment output directory (no error if it already exists).
mkdir -p "$OUTPUTS_DIR"

PROGRESS "running" 4


# ####################################################################################################
# MODULE LOAD
# ####################################################################################################

# Progress is reported only through PROGRESS in this section.  The two
# hand-written "cloudmesh status=running progress=N" echo lines that used to
# sit here printed stale values (2 and 5) after higher checkpoints had already
# been emitted, making the reported progress go backwards.  The checkpoint
# after the module load was also a second "4"; it is now 5, so the sequence is
# strictly increasing: 4 (before this section), 5, 6, 7.

module purge
module load singularity

PROGRESS "running" 5

# Activate the Python virtual environment.
source $PYTHON_DIR/bin/activate

which python

PROGRESS "running" 6

# ####################################################################################################
# PROJECT ENVIRONMENT
# ####################################################################################################

# Log where the job runs and which code/interpreter versions are in use.
echo "Working in Directory:      $(pwd)"
echo "Repository Revision:       $(git rev-parse HEAD)"
echo "Python Version:            $(python -V)"
echo "Running on host:           $(hostname -a)"

PROGRESS "running" 7

# ####################################################################################################
# GPU environment
# ####################################################################################################

# Log the GPUs visible to this job.
nvidia-smi

PROGRESS "running" 8

echo "# ==================================="
echo "# go to codedir"
echo "# ==================================="

# The directory change is disabled, so everything below runs in the directory
# the job was started in.
# cd $CODE_DIR


PROGRESS "running" 9

echo "# ==================================="
echo "# check filesystem"
echo "# ==================================="
pwd
ls
# Smoke test: list the working directory from Python inside the container,
# which exercises both the image and the view of the working directory.
singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\""

PROGRESS "running" 10

# ####################################################################################################
# CLOUDMASK
# ####################################################################################################

PROGRESS "running" 20


echo "# ==================================="
echo "# start gpu log"
echo "# ==================================="

# Sample GPU 0 every 0.5 s in the background for the duration of the run.
# NOTE(review): the log path is relative to the current directory — confirm it
# matches the directory the job is started in.
cms gpu watch --gpu=0 --delay=0.5 --dense > project/{sbatch.identifier}/gpu0-{experiment.card_name}-$USER-$SLURM_JOB_ID.log &

PROGRESS "running" 21

echo "# ==================================="
echo "# start cloudmask"
echo "# ==================================="

singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python cloudmask_v2.py --config=config.yaml"
# Keep the benchmark's exit status.  The script used to end with an
# unconditional `exit 0`, so SLURM recorded a crashed run as successful.
RUN_STATUS=$?

PROGRESS "running" 99

seff $SLURM_JOB_ID

PROGRESS "done" 100

if [ "$RUN_STATUS" -eq 0 ]; then
    echo "Execution Complete"
else
    echo "Execution failed: cloudmask_v2.py exited with status $RUN_STATUS" >&2
fi

# Hand the benchmark's status back to SLURM (0 on success, as before).
exit $RUN_STATUS

Loading

0 comments on commit b62da1e

Please sign in to comment.