-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
692 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Makefile for running the MLCommons cloudmask benchmark on Rivanna.
#
# Typical flow:
#   make download      - clone the mlcommons repository
#   make requirements  - install the python dependencies
#   make data          - fetch the training data from the STFC Echo S3 store
#   make project       - regenerate config/jobs and submit them
#   make image         - build the singularity image

SHELL := /bin/bash
# Unsigned (public) access to the STFC Echo S3 endpoint hosting the dataset.
AWS_S3 := aws s3 --no-sign-request --endpoint-url https://s3.echo.stfc.ac.uk
USER_SCRATCH := /scratch/${USER}
PROJECT_DIR := ${USER_SCRATCH}/mlcommons/benchmarks/cloudmask
PROJECT_DATA := ${USER_SCRATCH}/data
NAME := cloudmask

# All command-style targets. Without .PHONY a stray file of the same name
# would silently disable the target. (The original declared only three.)
.PHONY: all download requirements data project setup generate run submit \
        localscratch simple kill stop inspect watch status clean \
        image image-singularity image-docker run-singularity \
        shell-singularity run-localscratch shell-localscratch \
        shell-rivanna push shell

all: requirements data

# NOTE(review): the clone URL was mangled by an email-protection scrape;
# restored to the canonical SSH form — confirm against the upstream repo.
download:
	git clone git@github.com:laszewsk/mlcommons.git

requirements:
	time pip install -r ${PROJECT_DIR}/experiments/rivanna/requirements.txt

# Download both portions of the cloud_slstr_ds1 dataset into ${PROJECT_DATA}.
# --cli-read-timeout 0 disables the read timeout for these large transfers.
data:
	mkdir -p ${PROJECT_DATA}/ssts
	mkdir -p ${PROJECT_DATA}/one-day
	echo -n "Downloading first portion of data..."
	cd ${PROJECT_DATA}; ${AWS_S3} sync s3://sciml-datasets/es/cloud_slstr_ds1/one-day ./one-day --cli-read-timeout 0
	echo -n "Downloading second portion of data..."
	cd ${PROJECT_DATA}; ${AWS_S3} sync s3://sciml-datasets/es/cloud_slstr_ds1/ssts ./ssts --cli-read-timeout 0

# Full regeneration: wipe previous output, rebuild project.json, emit jobs.
project: clean project.json generate

setup:
	python setup_env_and_yaml.py
	source ~/ENV3/bin/activate && pip install -r ${PROJECT_DIR}/experiments/rivanna/requirements.txt

generate: jobs-project.sh

# NOTE(review): the original defined "run" twice ("run: submit" and
# "run: run-singularity"); GNU make merges prerequisite-only rules, so the
# effective dependency set is preserved by listing both here explicitly.
run: submit run-singularity

# Leading '-' : ignore a failing submission so make does not abort.
submit:
	-sh jobs-project.sh

localscratch: localscratch.json

# Generate the submit script for a given experiment configuration.
jobs-%.sh: %.json
	cms sbatch generate submit --name=$< > $@

# BUG(fixed): "simple" had no prerequisite, so --config=$< expanded to the
# empty string; it now depends on config.in.yaml like the %.json rule does.
simple: config.in.yaml
	cms sbatch generate \
	    --source=simple.in.slurm \
	    --config=$< \
	    --name=$(basename $@) \
	    --noos \
	    --os=USER,HOME \
	    --nocm \
	    --output_dir=./$(basename $@) \
	    --source_dir=. \
	    --verbose

%.json: config.in.yaml
	cms sbatch generate \
	    --source=cloudmask_v2.in.slurm \
	    --config=$< \
	    --name=$(basename $@) \
	    --noos \
	    --os=USER,HOME \
	    --nocm \
	    --output_dir=./$(basename $@) \
	    --source_dir=. \
	    --copycode="cloudmask_v2.py,data_loader.py,model.py" \
	    --verbose

kill: stop

# Cancel every queued/running job of the current user.
# $$ passes a literal $ to the shell; awk skips the squeue header line.
stop:
	for i in $$(squeue --user $$USER | awk 'NR>1{print $$1}'); do scancel $$i ; done

# Open the config and slurm file of the first generated project job.
# BUG(fixed): the original used $(eval)/$(shell), which ran ls/emacs at make
# expansion time and relied on an empty $(ls -1) expansion; plain recipe now.
inspect:
	D=$$(ls -1 project/ | head -n 1); echo $$D; emacs project/$$D/config.yaml project/$$D/job.slurm

watch: status

status:
	watch squeue --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\" --me

# Remove only artifacts this Makefile generated; @ silences, - tolerates absence.
clean:
	@-rm -rf localscratch localscratch.json jobs-localscratch.sh
	@-rm -rf project project.json jobs-project.sh
	@-rm -f rivanna.slurm
	@-rm -rf '__pycache__'
	@-rm -rf *~

# ----------------------------------------------------------------------
# image
# ----------------------------------------------------------------------

image-singularity:
	cms rivanna singularity build image-singularity/cloudmask.def

# Recursive invocations use $(MAKE) so -j/-n flags propagate correctly.
run-singularity:
	cd image-singularity && $(MAKE) run

shell-singularity:
	cd image-singularity && $(MAKE) shell

run-localscratch:
	cd image-singularity && $(MAKE) run-localscratch

shell-localscratch:
	cd image-singularity && $(MAKE) shell-localscratch

shell-rivanna:
	cd image-singularity && $(MAKE) shell-rivanna

image: image-singularity

# NOTE(review): remote path hard-codes user thf2bn; adjust for other users.
push:
	-git push
	ssh -tt rivanna "cd /scratch/thf2bn/mlcommons/benchmarks/cloudmask; ssh-add; git pull"

shell: shell-singularity
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
#!/usr/bin/env bash
#
# Slurm batch-script template for the cloudmask benchmark.
# The {section.key} placeholders are substituted by "cms sbatch generate"
# before submission; %u/%j in filename patterns are expanded by Slurm.

#SBATCH --job-name={experiment.card_name}-cloudmask-{experiment.epoch}-{experiment.repeat}
#SBATCH --output={experiment.card_name}-{experiment.epoch}-{experiment.repeat}-cloudmask-%u-%j.out
#SBATCH --error={experiment.card_name}-{experiment.epoch}-{experiment.repeat}-cloudmask-%u-%j.err
{slurm.sbatch}
#SBATCH -c {experiment.cpu_num}
#SBATCH --mem={experiment.mem}
#SBATCH --gres=gpu:{experiment.card_name}:{experiment.gpu_count}
#SBATCH --cpus-per-task=1
# NOTE(review): mail address was mangled by an email-protection scrape;
# restored assuming "%u@virginia.edu" (Rivanna is a UVA cluster) — confirm.
#SBATCH --mail-user=%u@virginia.edu
#SBATCH --mail-type=ALL
#SBATCH --time={sbatch.time}

# Disabled alternatives, kept for reference (xSBATCH is ignored by Slurm):
# xSBATCH --partition=gpu
# xSBATCH --mem=64GB

# Emit a machine-readable progress marker that cloudmesh scrapes from the log.
#   $1 = status string ("running"/"done"), $2 = integer progress percentage
PROGRESS () {
    echo "# ###########################################"
    echo "# cloudmesh status="$1" progress=$2 pid=$$"
    echo "# ###########################################"
}

PROGRESS "running" 1

echo "# ==================================="
echo "# SLURM info"
echo "# ==================================="

echo USER {os.USER}
echo HOME {os.HOME}
echo cardname {experiment.card_name}
echo gpu count {experiment.gpu_count}
echo epoc {experiment.epoch}
echo repeat {experiment.repeat}
echo jobno $SLURM_JOB_ID
echo {slurm.sbatch}
echo cpu num {experiment.cpu_num}
echo mem {experiment.mem}
echo USER $USER

PROGRESS "running" 2

echo "# ==================================="
echo "# Set up file system"
echo "# ==================================="

#
# PYTHON with cms on rivanna
#

export PYTHON_DIR=$HOME/ENV3
#export PYTHON_DIR=$USER_SCRATCH/ENV3

#
# CODE
#
export USER_SCRATCH=/scratch/$USER
export PROJECT_DIR=$USER_SCRATCH/mlcommons/benchmarks/cloudmask
export CODE_DIR=$PROJECT_DIR/target/rivanna
export CONTAINERDIR=${CODE_DIR}

export OUTPUTS_DIR="${CODE_DIR}/project/{sbatch.identifier}/outputs"

#
# DATA
#

export PROJECT_DATA=/project/bii_dsc_community/mlcommons/data/cloudmask/

PROGRESS "running" 3

# set -uxe

# Recover this script's real location: under Slurm, $0 points at a spool
# copy, so read the submitted command path from scontrol instead.
# BUG(fixed): the original test was [ -n $SLURM_JOB_ID ]; unquoted, it is
# always true, so the interactive (realpath) branch was unreachable.
if [ -n "$SLURM_JOB_ID" ] ; then
    THEPATH=$(scontrol show job "$SLURM_JOB_ID" | awk -F= '/Command=/{print $2}')
else
    THEPATH=$(realpath "$0")
fi
LOCATION=$(dirname "$THEPATH")

echo "LOCATION:", $LOCATION
echo "THEPATH:", $THEPATH
echo
echo "USER_SCRATCH: $USER_SCRATCH"
echo "PROJECT_DIR: $PROJECT_DIR"
echo "PYTHON_DIR: $PYTHON_DIR"
echo "PROJECT_DATA: $PROJECT_DATA"
echo "CONTAINERDIR: $CONTAINERDIR"

mkdir -p "$OUTPUTS_DIR"

PROGRESS "running" 4

# ####################################################################################################
# MODULE LOAD
# ####################################################################################################

echo "# cloudmesh status=running progress=2 pid=$$"

module purge
module load singularity

# BUG(fixed): this marker was a duplicate "4"; renumbered to 5 so the
# reported progress stays monotonic.
PROGRESS "running" 5

source $PYTHON_DIR/bin/activate

which python

PROGRESS "running" 6

# ####################################################################################################
# PROJECT ENVIRONMENT
# ####################################################################################################

echo "# cloudmesh status=running progress=5 pid=$$"

echo "Working in Directory: $(pwd)"
echo "Repository Revision: $(git rev-parse HEAD)"
echo "Python Version: $(python -V)"
echo "Running on host: $(hostname -a)"

PROGRESS "running" 7

# ####################################################################################################
# GPU environment
# ####################################################################################################

nvidia-smi

PROGRESS "running" 8

echo "# ==================================="
echo "# go to codedir"
echo "# ==================================="

# cd $CODE_DIR

PROGRESS "running" 9

echo "# ==================================="
echo "# check filesystem"
echo "# ==================================="
pwd
ls
# Sanity check: list the working directory as seen from inside the container.
singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\""

PROGRESS "running" 10

# ####################################################################################################
# CLOUDMASK
# ####################################################################################################

PROGRESS "running" 20

echo "# ==================================="
echo "# start gpu log"
echo "# ==================================="

# Background GPU sampler; writes a per-job utilization log alongside outputs.
cms gpu watch --gpu=0 --delay=0.5 --dense > project/{sbatch.identifier}/gpu0-{experiment.card_name}-$USER-$SLURM_JOB_ID.log &

PROGRESS "running" 21

echo "# ==================================="
echo "# start cloudmask"
echo "# ==================================="

singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python cloudmask_v2.py --config=config.yaml"

PROGRESS "running" 99

# Report job efficiency (CPU/memory) for the completed allocation.
seff $SLURM_JOB_ID

PROGRESS "done" 100

echo "Execution Complete"

#
exit 0
|
Oops, something went wrong.