From fea7b9db751e0edb6d6543f9a51b08531ee3343f Mon Sep 17 00:00:00 2001 From: Gregor von Laszewski Date: Thu, 19 Oct 2023 13:40:39 -0400 Subject: [PATCH] update with cloudmesh timers --- .../cloudmask/target/forJunqiCloud/Makefile | 26 ++++++++ .../target/forJunqiCloud/cloud_GPU_1.job | 1 + .../target/forJunqiCloud/cloud_GPU_12.job | 1 + .../target/forJunqiCloud/cloud_GPU_18.job | 1 + .../target/forJunqiCloud/cloud_GPU_2.job | 4 +- .../target/forJunqiCloud/cloud_GPU_24.job | 1 + .../target/forJunqiCloud/cloud_GPU_30.job | 1 + .../target/forJunqiCloud/cloud_GPU_4.job | 1 + .../target/forJunqiCloud/cloud_GPU_6.job | 1 + .../target/forJunqiCloud/slstr_cloud.py | 59 ++++++++++++++++--- 10 files changed, 86 insertions(+), 10 deletions(-) diff --git a/benchmarks/cloudmask/target/forJunqiCloud/Makefile b/benchmarks/cloudmask/target/forJunqiCloud/Makefile index b06d8668..9a7c7561 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/Makefile +++ b/benchmarks/cloudmask/target/forJunqiCloud/Makefile @@ -4,35 +4,61 @@ mkdir -p outputs-18/h5 bsub cloud_GPU_18.job +w18: + tail -f outputs-18/cloudmask_gpu_18-*.log + + +all: 2 4 6 12 18 24 w +# 30 + 2: -rm -rf outputs-2 mkdir -p outputs-2/h5 bsub cloud_GPU_2.job +w2: + tail -f outputs-2/cloudmask_gpu_2-*.log + 4: -rm -rf outputs-4 mkdir -p outputs-4/h5 bsub cloud_GPU_4.job +w4: + tail -f outputs-4/cloudmask_gpu_4-*.log + 6: -rm -rf outputs-6 mkdir -p outputs-6/h5 bsub cloud_GPU_6.job +w6: + tail -f outputs-6/cloudmask_gpu_6-*.log + + 12: -rm -rf outputs-12 mkdir -p outputs-12/h5 bsub cloud_GPU_12.job +w12: + tail -f outputs-12/cloudmask_gpu_12-*.log + 24: -rm -rf outputs-24 mkdir -p outputs-24/h5 bsub cloud_GPU_24.job +w24: + tail -f outputs-24/cloudmask_gpu_24-*.log + 30: -rm -rf outputs-30 mkdir -p outputs-30/h5 bsub cloud_GPU_30.job +w30: + tail -f outputs-30/cloudmask_gpu_30-*.log + w: watch bjobs diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job 
index a9c25e13..62910483 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job @@ -27,3 +27,4 @@ jsrun -n1 -a1 -r1 -c1 -g1 python slstr_cloud.py --config ./cloudMaskConfig_GPU_ #./cloudMaskConfig.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job index 9c3953e5..b734179e 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job @@ -25,3 +25,4 @@ echo "***************************" #jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml jsrun -n2 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job index 25f7faae..ef9b5858 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job @@ -28,3 +28,4 @@ echo "***************************" #jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml jsrun -n3 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_18.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job index 3bebc279..8118b486 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job @@ -1,4 +1,4 @@ -l#!/bin/bash +#!/bin/bash #BSUB -W 1:59 #BSUB -nnodes 1 #BSUB -P GEN150_bench @@ -25,3 +25,5 @@ echo "***************************" #jsrun -n1 -a2 -r1 -c1 -g2 python slstr_cloud.py --config ./cloudMaskConfig_GPU_2.yaml jsrun -n1 -r1 -c1 -g2 python slstr_cloud.py --config ./cloudMaskConfig_GPU_2.yaml 
+ +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job index 6c2e3577..4eb2d35f 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job @@ -26,3 +26,4 @@ echo "***************************" #jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml jsrun -n4 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_24.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job index b7599715..560387d0 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job @@ -26,3 +26,4 @@ echo "***************************" #jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml jsrun -n5 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_30.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job index 1dc16691..32d8b2d3 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job @@ -27,3 +27,4 @@ echo "***************************" jsrun -n1 -r1 -c1 -g4 python slstr_cloud.py --config ./cloudMaskConfig_GPU_4.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job index c54ab7ea..58240cc5 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job +++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job @@ -26,3 +26,4 @@ echo "***************************" #jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_6.yaml jsrun 
-n1 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_6.yaml +# scontrol show job -d diff --git a/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py b/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py index 6944f84e..ee658e5d 100644 --- a/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py +++ b/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py @@ -17,6 +17,7 @@ import numpy as np from data_loader import SLSTRDataLoader from cloudmesh.common.util import banner +from cloudmesh.common.StopWatch import StopWatch # Loss function def weighted_cross_entropy(beta): @@ -95,9 +96,11 @@ def cloud_inference(args)-> None: # reconstructed. data_loader = SLSTRDataLoader(args, file_paths, single_image=True, crop_size=CROP_SIZE) dataset = data_loader.to_dataset() - + + counter = 0 # Inference Loop for patches, file_name in dataset: + counter = counter + 1 file_name = Path(file_name.numpy().decode('utf-8')) #print(f"Processing file {file_name}") @@ -109,15 +112,21 @@ def cloud_inference(args)-> None: mask_patches = model.predict_on_batch(patches) # crop edge artifacts - mask_patches = tf.image.crop_to_bounding_box(mask_patches, CROP_SIZE // 2, CROP_SIZE // 2, PATCH_SIZE - CROP_SIZE, PATCH_SIZE - CROP_SIZE) + mask_patches = tf.image.crop_to_bounding_box( + mask_patches, + CROP_SIZE // 2, + CROP_SIZE // 2, + PATCH_SIZE - CROP_SIZE, + PATCH_SIZE - CROP_SIZE) # reconstruct patches back to full size image mask_patches = tf.reshape(mask_patches, (n, ny, nx, PATCH_SIZE - CROP_SIZE, PATCH_SIZE - CROP_SIZE, 1)) mask = reconstruct_from_patches(args, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE) output_dir = os.path.expanduser(args['output_dir']) # mask_name = output_dir + file_name.name + '.h5' - mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", str(file_name.name) + ".h5")) - print('MMMM mask_name: ', mask_name) + # mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", str(file_name.name) + ".h5")) + mask_name = 
os.path.abspath(os.path.join(str(output_dir), "h5", f"{counter:03}_" + str(file_name.name) + ".h5")) + print(counter, 'MMMM mask_name: ', mask_name) with h5py.File(mask_name, 'w') as handle: handle.create_dataset('mask', data=mask) @@ -146,10 +155,31 @@ def cloud_training(args)-> None: with mirrored_strategy.scope(): # create U-Net model - model = unet(input_shape=(args['PATCH_SIZE'], args['PATCH_SIZE'], args['N_CHANNELS'])) - model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy']) - history = model.fit(train_dataset, validation_data=test_dataset, epochs=args['epochs'], verbose=1) - + model = unet(input_shape=(args['PATCH_SIZE'], + args['PATCH_SIZE'], + args['N_CHANNELS'])) + model.compile(optimizer=optimizer, + loss='binary_crossentropy', + metrics=['accuracy']) + history = model.fit(train_dataset, + validation_data=test_dataset, + epochs=args['epochs'], + verbose=1) + + print("Loss history:") + print(history.history['loss']) + + print("Accuracy history:") + print(history.history['accuracy']) + + accuracy_history = history.history['accuracy'] + + best_accuracy = max(accuracy_history) + best_epoch = accuracy_history.index(best_accuracy) + print("Best Accuracy:", best_accuracy) + print("Epoch with Best Accuracy:", best_epoch + 1) # Adding 1 to convert zero-based index to epoch number + + # Close file descriptors # atexit.register(mirrored_strategy._extended._collective_ops._pool.close) @@ -167,6 +197,8 @@ def cloud_training(args)-> None: # Running the benchmark: python slstr_cloud.py --config ./cloudMaskConfig.yaml def main(): + + StopWatch.start("total") banner("read commandline") # Read command line arguments parser = argparse.ArgumentParser(description='CloudMask command line arguments',\ @@ -177,7 +209,8 @@ def main(): configFile = os.path.abspath(os.path.expanduser(command_line_args.config)) print ("CCCC configFile", configFile) - + + StopWatch.start("read yaml") banner("read yaml") # Read YAML file with open(configFile, 'r') as 
stream: @@ -185,11 +218,15 @@ def main(): print("AAA", args) log_file = os.path.abspath(os.path.expanduser(args['log_file'])) print("LLLL log_file", log_file) + StopWatch.stop("read yaml") + banner ("Training") # Training start = time.time() + StopWatch.start("training") samples = cloud_training(args) + StopWatch.stop("training") print ("TTTT") diff = time.time() - start elapsedTime = decimal.Decimal(diff) @@ -205,7 +242,9 @@ def main(): banner("Inference") # Inference start = time.time() + StopWatch.start("inference") number_inferences = cloud_inference(args) + StopWatch.stop("inference") diff = time.time() - start elapsedTime = decimal.Decimal(diff) time_per_inference = elapsedTime/number_inferences @@ -213,6 +252,8 @@ def main(): print("number_inferences: ", number_inferences) with open(log_file, "a") as logfile: logfile.write(f"CloudMask inference, inferences={number_inferences}, bs={args['batch_size']}, nodes={args['nodes']}, gpus={args['gpu']}, time_per_inference={time_per_inference_str}\n") + StopWatch.stop("total") + StopWatch.benchmark() if __name__ == "__main__": main()