From b62da1e2f2a08246549f763ebbb1eb92a8e74e51 Mon Sep 17 00:00:00 2001 From: Gregor von Laszewski Date: Mon, 24 Jul 2023 17:19:13 -0400 Subject: [PATCH] update uno for cloudmesh-sbatch --- benchmarks/uno/target/rivanna/Makefile | 128 ++++++++++++ benchmarks/uno/{ => target}/rivanna/README.md | 0 .../uno/target/rivanna/cloudmask_v2.in.slurm | 188 ++++++++++++++++++ benchmarks/uno/target/rivanna/config.in.yaml | 106 ++++++++++ .../target/rivanna/image-singularity/Makefile | 60 ++++++ .../rivanna/image-singularity/cloudmask.def | 46 +++++ .../uno/target/rivanna/requirements.txt | 16 ++ benchmarks/uno/target/rivanna/simple.slurm | 148 ++++++++++++++ 8 files changed, 692 insertions(+) create mode 100644 benchmarks/uno/target/rivanna/Makefile rename benchmarks/uno/{ => target}/rivanna/README.md (100%) create mode 100644 benchmarks/uno/target/rivanna/cloudmask_v2.in.slurm create mode 100644 benchmarks/uno/target/rivanna/config.in.yaml create mode 100644 benchmarks/uno/target/rivanna/image-singularity/Makefile create mode 100644 benchmarks/uno/target/rivanna/image-singularity/cloudmask.def create mode 100644 benchmarks/uno/target/rivanna/requirements.txt create mode 100644 benchmarks/uno/target/rivanna/simple.slurm diff --git a/benchmarks/uno/target/rivanna/Makefile b/benchmarks/uno/target/rivanna/Makefile new file mode 100644 index 00000000..9788f158 --- /dev/null +++ b/benchmarks/uno/target/rivanna/Makefile @@ -0,0 +1,128 @@ +SHELL=/bin/bash +AWS_S3=aws s3 --no-sign-request --endpoint-url https://s3.echo.stfc.ac.uk +USER_SCRATCH=/scratch/${USER} +PROJECT_DIR=${USER_SCRATCH}/mlcommons/benchmarks/cloudmask +PROJECT_DATA=${USER_SCRATCH}/data +NAME=cloudmask + +.PHONY: image-singularity image-docker project simple # simple and project are also output directory names + +all: requirements data + + +download: + git clone git@github.com:laszewsk/mlcommons.git + +requirements: + time pip install -r ${PROJECT_DIR}/experiments/rivanna/requirements.txt + +data: + mkdir -p ${PROJECT_DATA}/ssts + mkdir -p ${PROJECT_DATA}/one-day + echo -n 
"Downloading first portion of data..." + cd ${PROJECT_DATA} && ${AWS_S3} sync s3://sciml-datasets/es/cloud_slstr_ds1/one-day ./one-day --cli-read-timeout 0 + echo -n "Downloading second portion of data..." + cd ${PROJECT_DATA} && ${AWS_S3} sync s3://sciml-datasets/es/cloud_slstr_ds1/ssts ./ssts --cli-read-timeout 0 + + +project: clean project.json generate + +setup: + python setup_env_and_yaml.py + source ~/ENV3/bin/activate && pip install -r /scratch/${USER}/mlcommons/benchmarks/cloudmask/experiments/rivanna/requirements.txt + +generate: jobs-project.sh + +run: submit + +submit: + -sh jobs-project.sh + +localscratch: localscratch.json + + +jobs-%.sh: %.json + cms sbatch generate submit --name=$< > $@ + + +simple: config.in.yaml + cms sbatch generate \ + --source=simple.in.slurm \ + --config=$< \ + --name=$(basename $@) \ + --noos \ + --os=USER,HOME \ + --nocm \ + --output_dir=./$(basename $@) \ + --source_dir=. 
\ + --copycode="cloudmask_v2.py,data_loader.py,model.py" \ + --verbose + +kill: stop + +stop: # cancels every job squeue lists for the current user + for i in $$(squeue --user $$USER | awk 'NR>1{print $$1}'); do scancel $$i ; done + +inspect: + $(eval D=$(shell ls project/$(ls -1) | head -n 1)) + echo ${D} + $(shell emacs project/${D}/config.yaml project/${D}/job.slurm) + +watch: status + +status: + watch squeue --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\" --me + + +clean: + @-rm -rf localscratch localscratch.json jobs-localscratch.sh + @-rm -rf project project.json jobs-project.sh + @-rm -f rivanna.slurm + @-rm -rf '__pycache__' + @-rm -rf *~ + + +# image: the targets below delegate to image-singularity/Makefile + + +image-singularity: + cms rivanna singularity build image-singularity/cloudmask.def + +run-singularity: + $(MAKE) -C image-singularity run + +shell-singularity: + $(MAKE) -C image-singularity shell + +run-localscratch: + $(MAKE) -C image-singularity run-localscratch + +shell-localscratch: + $(MAKE) -C image-singularity shell-localscratch + +shell-rivanna: + $(MAKE) -C image-singularity shell-rivanna + +run: run-singularity + +image: image-singularity + +push: + -git push + ssh -tt rivanna "cd /scratch/thf2bn/mlcommons/benchmarks/cloudmask; ssh-add; git pull" + + +shell: shell-singularity diff --git a/benchmarks/uno/rivanna/README.md b/benchmarks/uno/target/rivanna/README.md similarity index 100% rename from benchmarks/uno/rivanna/README.md rename to benchmarks/uno/target/rivanna/README.md diff --git a/benchmarks/uno/target/rivanna/cloudmask_v2.in.slurm b/benchmarks/uno/target/rivanna/cloudmask_v2.in.slurm new file mode 100644 index 00000000..44bc1c7b --- /dev/null +++ b/benchmarks/uno/target/rivanna/cloudmask_v2.in.slurm @@ -0,0 +1,188 @@ +#!/usr/bin/env bash + +#SBATCH --job-name={experiment.card_name}-cloudmask-{experiment.epoch}-{experiment.repeat} +#SBATCH --output={experiment.card_name}-{experiment.epoch}-{experiment.repeat}-cloudmask-%u-%j.out +#SBATCH --error={experiment.card_name}-{experiment.epoch}-{experiment.repeat}-cloudmask-%u-%j.err 
+{slurm.sbatch} +#SBATCH -c {experiment.cpu_num} +#SBATCH --mem={experiment.mem} +#SBATCH --gres=gpu:{experiment.card_name}:{experiment.gpu_count} +#SBATCH --cpus-per-task={experiment.cpu_num} +#SBATCH --mail-user=%u@virginia.edu +#SBATCH --mail-type=ALL +#SBATCH --time={sbatch.time} + + +# xSBATCH --partition=gpu +# xSBATCH --mem=64GB + + +PROGRESS () { + echo "# ###########################################" + echo "# cloudmesh status=$1 progress=$2 pid=$$" + echo "# ###########################################" +} + +PROGRESS "running" 1 + +echo "# ===================================" +echo "# SLURM info" +echo "# ===================================" + +echo USER {os.USER} +echo HOME {os.HOME} +echo cardname {experiment.card_name} +echo gpu count {experiment.gpu_count} +echo epoc {experiment.epoch} +echo repeat {experiment.repeat} +echo jobno $SLURM_JOB_ID +echo {slurm.sbatch} +echo cpu num {experiment.cpu_num} +echo mem {experiment.mem} +echo USER $USER + +PROGRESS "running" 2 + +echo "# ===================================" +echo "# Set up file system" +echo "# ===================================" + +# +# PYTHON with cms on rivanna +# + +export PYTHON_DIR=$HOME/ENV3 +#export PYTHON_DIR=$USER_SCRATCH/ENV3 + +# +# CODE +# +export USER_SCRATCH=/scratch/$USER +export PROJECT_DIR=$USER_SCRATCH/mlcommons/benchmarks/cloudmask +export CODE_DIR=$PROJECT_DIR/target/rivanna +export CONTAINERDIR=${CODE_DIR} + +export OUTPUTS_DIR="${CODE_DIR}/project/{sbatch.identifier}/outputs" + +# +# DATA +# + +export PROJECT_DATA=/project/bii_dsc_community/mlcommons/data/cloudmask/ + + + +PROGRESS "running" 3 + +# set -uxe + +if [ -n "$SLURM_JOB_ID" ] ; then # quoted: an unquoted empty value made this test always true +THEPATH=$(scontrol show job "$SLURM_JOB_ID" | awk -F= '/Command=/{print $2}') +else +THEPATH=$(realpath "$0") +fi +LOCATION=$(dirname "$THEPATH") + +echo "LOCATION:", "$LOCATION" +echo "THEPATH:", "$THEPATH" +echo +echo "USER_SCRATCH: $USER_SCRATCH" +echo "PROJECT_DIR: $PROJECT_DIR" +echo "PYTHON_DIR: $PYTHON_DIR" +echo "PROJECT_DATA: $PROJECT_DATA" +echo 
"CONTAINERDIR: $CONTAINERDIR" + + +mkdir -p "$OUTPUTS_DIR" + +PROGRESS "running" 4 + + +# #################################################################################################### +# MODULE LOAD +# #################################################################################################### + +echo "# cloudmesh status=running progress=2 pid=$$" + +module purge +module load singularity + +PROGRESS "running" 4 + +source "$PYTHON_DIR/bin/activate" + +which python + +PROGRESS "running" 6 + +# #################################################################################################### +# PROJECT ENVIRONMENT +# #################################################################################################### + +echo "# cloudmesh status=running progress=5 pid=$$" + +echo "Working in Directory: $(pwd)" +echo "Repository Revision: $(git rev-parse HEAD)" +echo "Python Version: $(python -V)" +echo "Running on host: $(hostname -a)" + +PROGRESS "running" 7 + +# #################################################################################################### +# GPU environment +# #################################################################################################### + +nvidia-smi + +PROGRESS "running" 8 + +echo "# ===================================" +echo "# go to codedir" +echo "# ===================================" + +# cd $CODE_DIR + + +PROGRESS "running" 9 + +echo "# ===================================" +echo "# check filesystem" +echo "# ===================================" +pwd +ls +singularity exec --nv "$CONTAINERDIR/cloudmask.sif" bash -c "python -c \"import os; os.system('ls')\"" + +PROGRESS "running" 10 + +# #################################################################################################### +# CLOUDMASK +# #################################################################################################### + +PROGRESS "running" 20 + + +echo "# ===================================" +echo "# start gpu log" +echo "# 
===================================" + +cms gpu watch --gpu=0 --delay=0.5 --dense > project/{sbatch.identifier}/gpu0-{experiment.card_name}-$USER-$SLURM_JOB_ID.log & + +PROGRESS "running" 21 + +echo "# ===================================" +echo "# start cloudmask" +echo "# ===================================" + +singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python cloudmask_v2.py --config=config.yaml" + +PROGRESS "running" 99 + +seff $SLURM_JOB_ID + +PROGRESS "done" 100 + +echo "Execution Complete" + +# +exit 0 + diff --git a/benchmarks/uno/target/rivanna/config.in.yaml b/benchmarks/uno/target/rivanna/config.in.yaml new file mode 100644 index 00000000..8fd17485 --- /dev/null +++ b/benchmarks/uno/target/rivanna/config.in.yaml @@ -0,0 +1,106 @@ +# file: config-new.yaml +# +# This configuration file was created by Gregor von Laszewski laszewski@gmail.com +# The values under image: and some other values have been taken from +# +# SciML-Bench, which is under +# Copyright © 2022 Scientific Machine Learning Research Group +# Scientific Computing Department, Rutherford Appleton Laboratory +# Science and Technology Facilities Council, UK. +# All rights reserved. +# +# Several of those values have also been modified by +# Gregor von Laszewski, Juri Papay +# +# This is a configuration file for the mlcommons Science CloudMask benchmark. 
+# It works with cloudmask_v2.py +# +name: cloudmask-rivanna + +sbatch: + time: "6:00:00" + mode: h + dir: scratch + +run: + fit-verbose: 1 + host: ubuntu + target: rivanna-singularity + filesystem: "/scratch/thf2bn" + venvpath: "/scratch/thf2bn/ENV3/bin/python" + datadir: data + branch: 'main' + +# Submission Information +submission: + name: cloudmask + submitter: Gregor von Laszewski + email: laszewski@gmail.com + org: University of Virginia + division: open + version: mlcommons-cloudmask-v2.0 + github_commit_version: TBD + status: completed + platform: rivanna + accelerators_per_node: 1 + +experiment: +# card_name: a100-dgx + card_name: "a100,v100,p100,k80,rtx2080,rtx3090" + gpu_count: "1" + cpu_num: 1 + mem: "64GB" + repeat: "2" + epoch: "2,10,30,50,70,100" +# epoch: "1" + seed: 1234 + learning_rate: 0.001 + batch_size: 32 + train_split: 0.8 + clip_offset: 15 + no_cache: False + nodes: 1 + gpu: 1 + +system: + host: "rivanna" + python: "3.10.8" + num_cpus: 1 + platform: rivanna + accelerators_per_node: 1 + constraint: "" + reservation: "" + +mask: "float" + +training: + loss: binary_crossentropy + metrics: accuracy + +data: + scratch: "/project/bii_dsc_community/mlcommons/data/cloudmask" +# scratch: "/scratch2/data/cloudmask/data" + training: "{data.scratch}/one-day" + inference: "{data.scratch}/ssts" + model: cloudModel.h5 + output: "./outputs" + +log: + file: c_cloudmask_run.log + mlperf: c_mlperf_cloudmask.log + +image: + # Size of each patch to feed to the network + PATCH_SIZE: 256 + # Original height of the image + IMAGE_H: 1200 + # Original width of the image + IMAGE_W: 1500 + # No. 
of channels + N_CHANNELS: 9 + # Min allowable SST + MIN_SST: 273.15 + # Amount to crop the edges of the images by + CROP_SIZE: 80 + +# \ No newline at end of file diff --git a/benchmarks/uno/target/rivanna/image-singularity/Makefile b/benchmarks/uno/target/rivanna/image-singularity/Makefile new file mode 100644 index 00000000..9d28a3de --- /dev/null +++ b/benchmarks/uno/target/rivanna/image-singularity/Makefile @@ -0,0 +1,60 @@ +NAME=cloudmask +CODE_DIR=.. +BUILD=cloudmask.def +IMAGE=cloudmask.sif +DATA=/scratch2/data/cloudmask/data +BIND=--bind ${DATA}:${DATA} + +DATA_LOCALSCRATCH=/localscratch/${USER}/cloudmask/ +BIND_LOCALSCRATCH=--bind ${DATA}:${DATA} + +image: + time ../bin/singularity-image.py ${NAME} + +queue: watch + +delete: + rm -f *.out *.err + rm -f *_output.ipynb + rm -f *.log + +# TODO +submit: + sbatch simple-a100-singularity.sh + +watch: status + +status: + watch squeue --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\" --me + +run: + cd ${CODE_DIR} && mkdir -p outputs + cd ${CODE_DIR} && singularity exec ${BIND} --nv cloudmask.sif bash -c "python cloudmask_v2.py --config=config-new.yaml" + +#singularity exec --nv ${NAME}.sif papermill ${NAME}.ipynb ${NAME}_output.ipynb + +shell: # --bind is an option of the shell subcommand, so it must follow it + singularity shell ${BIND} --nv ${IMAGE} + +run-localscratch: + cd ${CODE_DIR} && mkdir -p outputs + cd ${CODE_DIR} && singularity exec ${BIND_LOCALSCRATCH} --nv cloudmask.sif bash -c "python cloudmask_v2.py --config=config-new.yaml" + +#singularity exec --nv ${NAME}.sif papermill ${NAME}.ipynb ${NAME}_output.ipynb + +shell-localscratch: + singularity shell ${BIND_LOCALSCRATCH} --nv ${IMAGE} + +shell-rivanna: + singularity shell --nv ${IMAGE} + + +cancel: stop + +stop: # cancels every job squeue lists for the current user + for i in $$(squeue --user $$USER | awk 'NR>1{print $$1}'); do scancel $$i ; done + +view: + watch tail -n 50 *.err + +all: delete image submit view # this Makefile defines no clean rule; delete does the cleanup diff --git a/benchmarks/uno/target/rivanna/image-singularity/cloudmask.def b/benchmarks/uno/target/rivanna/image-singularity/cloudmask.def new file mode 
100644 index 00000000..a465b62a --- /dev/null +++ b/benchmarks/uno/target/rivanna/image-singularity/cloudmask.def @@ -0,0 +1,46 @@ +#Bootstrap: localimage +#From: /share/resources/containers/singularity/tensorflow-2.10.0.sif +#From: /share/resources/containers/singularity/tensorflow-2.4.1.sif +#From: /share/resources/containers/singularity/tensorflow_23.03-tf1-py3.sif +#From: /share/resources/containers/singularity/tensorflow_23.03-tf2-py3.sif + +Bootstrap: docker +From: nvcr.io/nvidia/tensorflow:22.10-tf2-py3 + +%post + apt update + apt install -y python3-venv git + pip install pip -U ; python --version + # install the core packages explicitly (this does not read the rivanna requirements.txt) + pip install protobuf==3.20.0 numpy tensorflow cloudmesh-gpu cloudmesh-common cloudmesh-sbatch scikit-learn h5py pyyaml awscli + pip install git+https://github.com/mlperf/logging.git@1.0.0 + + + + # protobuf solution + + # # module purge ; module load gcc/11.2 openmpi/4.1.4 python/3.11.1 ; python -m venv ./EQVENV + # # source ./EQVENV/bin/activate ; + # # pip install --upgrade protobuf + # # python -m site + # # cp ~/.local/lib/python3.8/site-packages/google/protobuf/internal/builder.py ./builder.py + # pip install protobuf==3.20.3 + # # cp ./builder.py ~/.local/lib/python3.8/site-packages/google/protobuf/internal/builder.py + + + + # pip install git+https://github.com/mlperf/logging.git@1.0.0 + # # pip install googleapis-common-protos + # # pip install --upgrade protobuf + + pip install humanize + pip install numpy matplotlib pandas scikit-learn jupyter jupyterlab papermill + + pip install tensorflow_datasets tensorflow-metadata + pip install tqdm wheel ipywidgets jupyter-autotime ipython-autotime + + pip install cloudmesh-common cloudmesh-gpu + + + + diff --git a/benchmarks/uno/target/rivanna/requirements.txt b/benchmarks/uno/target/rivanna/requirements.txt new file mode 100644 index 00000000..56935bad --- /dev/null +++ b/benchmarks/uno/target/rivanna/requirements.txt @@ -0,0 +1,16 @@ +matplotlib 
+#tensorflow==2.9.0 +protobuf==3.20.1 +numpy +tqdm +tensorflow +tensorflow-addons +cloudmesh-gpu +cloudmesh-common +cloudmesh-sbatch +scikit-learn +h5py +pyyaml +awscli +# git+https://github.com/mlperf/logging.git@2.1.0 +git+https://github.com/mlperf/logging.git@1.0.0 diff --git a/benchmarks/uno/target/rivanna/simple.slurm b/benchmarks/uno/target/rivanna/simple.slurm new file mode 100644 index 00000000..295684ba --- /dev/null +++ b/benchmarks/uno/target/rivanna/simple.slurm @@ -0,0 +1,148 @@ +#!/bin/bash + +#SBATCH --job-name=simple-cloudmask-gpu-rivanna +#SBATCH --nodes=1 +#SBATCH --gres=gpu:v100:1 +#SBATCH --time=06:00:00 +#SBATCH --mem=64G +#SBATCH -o outputs/simple-v100-%u-%j.out +#SBATCH -e outputs/simple-v100-%u-%j.err +#SBATCH --partition=bii-gpu +#SBATCH --account=bii_dsc_community + +PROGRESS () { + echo "# ###########################################" + echo "# cloudmesh status=$1 progress=$2 pid=$$" + echo "# ###########################################" +} + + +PROGRESS "running" 1 + + +echo "# ===================================" +echo "# SLURM info" +echo "# ===================================" + +# echo USER {os.USER} +# echo HOME {os.HOME} +# echo cardname {experiment.card_name} +# echo gpu count {experiment.gpu_count} +# echo epoc {experiment.epoch} +# echo repeat {experiment.repeat} +echo jobno $SLURM_JOB_ID +# echo partition {system.partition} +# echo allocation {system.allocation} +# echo reservation {system.reservation} +# echo constraint {system.constraint} +# echo cpu num {experiment.cpu_num} +# echo mem {experiment.mem} +echo $USER + +PROGRESS "running" 2 + +echo "# ===================================" +echo "# Set up file system" +echo "# ===================================" + + +export USER_SCRATCH=/scratch/$USER +export PROJECT_DIR=$USER_SCRATCH/mlcommons/benchmarks/cloudmask +# export PYTHON_DIR=$HOME/ENV3 +export PYTHON_DIR=$USER_SCRATCH/ENV3 # was misspelled USER_SCRACTH, which expanded to /ENV3 +export PROJECT_DATA=/project/bii_dsc_community/thf2bn/data/cloudmask +export CONTAINERDIR=. 
+ +export CODE_DIR=$PROJECT_DIR/target/rivanna + +PROGRESS "running" 3 + + +# set -uxe + +if [ -n "$SLURM_JOB_ID" ] ; then # quoted: an unquoted empty value made this test always true +THEPATH=$(scontrol show job "$SLURM_JOB_ID" | awk -F= '/Command=/{print $2}') +else +THEPATH=$(realpath "$0") +fi +LOCATION=$(dirname "$THEPATH") + +echo "LOCATION:", "$LOCATION" +echo "THEPATH:", "$THEPATH" +echo +echo "USER_SCRATCH: $USER_SCRATCH" +echo "PROJECT_DIR: $PROJECT_DIR" +echo "PYTHON_DIR: $PYTHON_DIR" +echo "PROJECT_DATA: $PROJECT_DATA" +echo "CONTAINERDIR: $CONTAINERDIR" + +PROGRESS "running" 4 + +# #################################################################################################### +# MODULE LOAD +# #################################################################################################### + +echo "# cloudmesh status=running progress=2 pid=$$" + +module purge +module load singularity + +# module load gcc/9.2.0 cuda/11.0.228 openmpi/3.1.6 python/3.8.8 +# module load singularity tensorflow/2.8.0 + +PROGRESS "running" 4 + +source "$PYTHON_DIR/bin/activate" + +which python + +nvidia-smi + +PROGRESS "running" 8 + + +echo "# ===================================" +echo "# go to codedir" +echo "# ===================================" + +cd "$CODE_DIR" || exit 1 # the image, outputs/ and config paths below are relative to this directory + +PROGRESS "running" 9 + +echo "# ===================================" +echo "# check filesystem" +echo "# ===================================" +pwd +ls +singularity exec --nv ./cloudmask.sif bash -c "cd ${CODE_DIR} ; python -c \"import os; os.system('ls')\"" + +PROGRESS "running" 10 + + + +echo "# ===================================" +echo "# start gpu log" +echo "# ===================================" + +cms gpu watch --gpu=0 --delay=0.5 --dense > outputs/simple-$USER-$SLURM_JOB_ID-gpu0.log & + +PROGRESS "running" 21 + + +echo "# ===================================" +echo "# start cloudmask" +echo "# ===================================" + +singularity exec --nv ./cloudmask.sif bash -c "cd ${CODE_DIR} ; python cloudmask_v2.py --config=config-new.yaml" + +PROGRESS "running" 99 + 
+seff $SLURM_JOB_ID + +PROGRESS "done" 100 + +echo "Execution Complete" + +# +exit 0 + 