From fea7b9db751e0edb6d6543f9a51b08531ee3343f Mon Sep 17 00:00:00 2001
From: Gregor von Laszewski <laszewski@gmail.com>
Date: Thu, 19 Oct 2023 13:40:39 -0400
Subject: [PATCH] update with cloudmesh timers

---
 .../cloudmask/target/forJunqiCloud/Makefile   | 26 ++++++++
 .../target/forJunqiCloud/cloud_GPU_1.job      |  1 +
 .../target/forJunqiCloud/cloud_GPU_12.job     |  1 +
 .../target/forJunqiCloud/cloud_GPU_18.job     |  1 +
 .../target/forJunqiCloud/cloud_GPU_2.job      |  4 +-
 .../target/forJunqiCloud/cloud_GPU_24.job     |  1 +
 .../target/forJunqiCloud/cloud_GPU_30.job     |  1 +
 .../target/forJunqiCloud/cloud_GPU_4.job      |  1 +
 .../target/forJunqiCloud/cloud_GPU_6.job      |  1 +
 .../target/forJunqiCloud/slstr_cloud.py       | 59 ++++++++++++++++---
 10 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/benchmarks/cloudmask/target/forJunqiCloud/Makefile b/benchmarks/cloudmask/target/forJunqiCloud/Makefile
index b06d8668..9a7c7561 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/Makefile
+++ b/benchmarks/cloudmask/target/forJunqiCloud/Makefile
@@ -4,35 +4,61 @@
 	mkdir -p outputs-18/h5
 	bsub  cloud_GPU_18.job
 
+w18:
+	tail -f outputs-18/cloudmask_gpu_18-*.log
+
+
+all: 2 4 6 12 18 24 w
+# 30
+
 2:
 	-rm -rf outputs-2
 	mkdir -p outputs-2/h5
 	bsub  cloud_GPU_2.job
 
+w2:
+	tail -f outputs-2/cloudmask_gpu_2-*.log
+
 4: 
 	-rm -rf outputs-4
 	mkdir -p outputs-4/h5
 	bsub  cloud_GPU_4.job
 
+w4:
+	tail -f outputs-4/cloudmask_gpu_4-*.log
+
 6: 
 	-rm -rf outputs-6
 	mkdir -p outputs-6/h5
 	bsub  cloud_GPU_6.job
 
+w6:
+	tail -f outputs-6/cloudmask_gpu_6-*.log
+
+
 12: 
 	-rm -rf outputs-12
 	mkdir -p outputs-12/h5
 	bsub  cloud_GPU_12.job
 
+w12:
+	tail -f outputs-12/cloudmask_gpu_12-*.log
+
 24: 
 	-rm -rf outputs-24
 	mkdir -p outputs-24/h5
 	bsub  cloud_GPU_24.job
 
+w24:
+	tail -f outputs-24/cloudmask_gpu_24-*.log
+
 30: 
 	-rm -rf outputs-30
 	mkdir -p outputs-30/h5
 	bsub  cloud_GPU_30.job
 
+w30:
+	tail -f outputs-30/cloudmask_gpu_30-*.log
+
 w:
 	watch bjobs
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job
index a9c25e13..62910483 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job
@@ -27,3 +27,4 @@ jsrun  -n1 -a1 -r1 -c1 -g1 python slstr_cloud.py --config ./cloudMaskConfig_GPU_
 #./cloudMaskConfig.yaml 
 
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job
index 9c3953e5..b734179e 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job
@@ -25,3 +25,4 @@ echo "***************************"
 #jsrun  -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml 
 jsrun  -n2 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job
index 25f7faae..ef9b5858 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job
@@ -28,3 +28,4 @@ echo "***************************"
 #jsrun  -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml 
 jsrun  -n3 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_18.yaml
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job
index 3bebc279..8118b486 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job
@@ -1,4 +1,4 @@
-l#!/bin/bash
+#!/bin/bash
 #BSUB -W 1:59
 #BSUB -nnodes 1
 #BSUB -P GEN150_bench
@@ -25,3 +25,5 @@ echo "***************************"
 
 #jsrun  -n1 -a2 -r1 -c1 -g2 python slstr_cloud.py --config ./cloudMaskConfig_GPU_2.yaml
 jsrun  -n1 -r1 -c1 -g2 python slstr_cloud.py --config ./cloudMaskConfig_GPU_2.yaml 
+
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job
index 6c2e3577..4eb2d35f 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job
@@ -26,3 +26,4 @@ echo "***************************"
 #jsrun  -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml 
 jsrun  -n4 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_24.yaml
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job
index b7599715..560387d0 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job
@@ -26,3 +26,4 @@ echo "***************************"
 #jsrun  -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml 
 jsrun  -n5 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_30.yaml
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job
index 1dc16691..32d8b2d3 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job
@@ -27,3 +27,4 @@ echo "***************************"
 jsrun  -n1 -r1 -c1 -g4 python slstr_cloud.py --config ./cloudMaskConfig_GPU_4.yaml
 
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job
index c54ab7ea..58240cc5 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job
+++ b/benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job
@@ -26,3 +26,4 @@ echo "***************************"
 #jsrun  -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_6.yaml 
 jsrun  -n1 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_6.yaml
 
+# scontrol show job -d
diff --git a/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py b/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py
index 6944f84e..ee658e5d 100644
--- a/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py
+++ b/benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py
@@ -17,6 +17,7 @@
 import numpy as np
 from data_loader import SLSTRDataLoader
 from cloudmesh.common.util import banner
+from cloudmesh.common.StopWatch import StopWatch
 
 # Loss function
 def weighted_cross_entropy(beta):
@@ -95,9 +96,11 @@ def cloud_inference(args)-> None:
     # reconstructed.
     data_loader = SLSTRDataLoader(args, file_paths, single_image=True, crop_size=CROP_SIZE)
     dataset = data_loader.to_dataset()
-    
+
+    counter = 0
     # Inference Loop
     for patches, file_name in dataset:
+        counter = counter + 1
         file_name = Path(file_name.numpy().decode('utf-8'))
         #print(f"Processing file {file_name}")
 
@@ -109,15 +112,21 @@ def cloud_inference(args)-> None:
         mask_patches = model.predict_on_batch(patches)
 
         # crop edge artifacts
-        mask_patches = tf.image.crop_to_bounding_box(mask_patches, CROP_SIZE // 2, CROP_SIZE // 2, PATCH_SIZE - CROP_SIZE, PATCH_SIZE - CROP_SIZE)
+        mask_patches = tf.image.crop_to_bounding_box(
+            mask_patches,
+            CROP_SIZE // 2,
+            CROP_SIZE // 2,
+            PATCH_SIZE - CROP_SIZE,
+            PATCH_SIZE - CROP_SIZE)
 
         # reconstruct patches back to full size image
         mask_patches = tf.reshape(mask_patches, (n, ny, nx, PATCH_SIZE - CROP_SIZE, PATCH_SIZE - CROP_SIZE, 1))
         mask = reconstruct_from_patches(args, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE)
         output_dir = os.path.expanduser(args['output_dir'])
         # mask_name = output_dir + file_name.name + '.h5'
-        mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", str(file_name.name) + ".h5"))
-        print('MMMM mask_name: ', mask_name)
+        # mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", str(file_name.name) + ".h5"))
+        mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", f"{counter:03}_" + str(file_name.name) + ".h5"))
+        print(counter, 'MMMM mask_name: ', mask_name)
 
         with h5py.File(mask_name, 'w') as handle:
             handle.create_dataset('mask', data=mask)
@@ -146,10 +155,31 @@ def cloud_training(args)-> None:
     
     with mirrored_strategy.scope():
         # create U-Net model
-        model = unet(input_shape=(args['PATCH_SIZE'], args['PATCH_SIZE'], args['N_CHANNELS']))
-        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
-        history = model.fit(train_dataset, validation_data=test_dataset, epochs=args['epochs'], verbose=1)
-
+        model = unet(input_shape=(args['PATCH_SIZE'],
+                                  args['PATCH_SIZE'],
+                                  args['N_CHANNELS']))
+        model.compile(optimizer=optimizer,
+                      loss='binary_crossentropy',
+                      metrics=['accuracy'])
+        history = model.fit(train_dataset,
+                            validation_data=test_dataset,
+                            epochs=args['epochs'],
+                            verbose=1)
+
+        print("Loss history:")
+        print(history.history['loss'])
+
+        print("Accuracy history:")
+        print(history.history['accuracy'])
+
+        accuracy_history = history.history['accuracy']
+
+        best_accuracy = max(accuracy_history)
+        best_epoch = accuracy_history.index(best_accuracy)
+        print("Best Accuracy:", best_accuracy)
+        print("Epoch with Best Accuracy:", best_epoch + 1)  # Adding 1 to convert zero-based index to epoch number
+
+        
     # Close file descriptors
    # atexit.register(mirrored_strategy._extended._collective_ops._pool.close)
 
@@ -167,6 +197,8 @@ def cloud_training(args)-> None:
 # Running the benchmark: python slstr_cloud.py --config ./cloudMaskConfig.yaml
 def main():
 
+
+    StopWatch.start("total")
     banner("read commandline")
     # Read command line arguments
     parser = argparse.ArgumentParser(description='CloudMask command line arguments',\
@@ -177,7 +209,8 @@ def main():
 
     configFile = os.path.abspath(os.path.expanduser(command_line_args.config))
     print ("CCCC configFile", configFile)
-    
+
+    StopWatch.start("read yaml")
     banner("read yaml")
     # Read YAML file
     with open(configFile, 'r') as stream:
@@ -185,11 +218,15 @@ def main():
     print("AAA", args)
     log_file = os.path.abspath(os.path.expanduser(args['log_file']))
     print("LLLL log_file", log_file)
+    StopWatch.stop("read yaml")
+
 
     banner ("Training")
     # Training
     start = time.time()
+    StopWatch.start("training")
     samples = cloud_training(args)
+    StopWatch.stop("training")  # stop, not restart: benchmark() needs a closed timer
     print ("TTTT")
     diff = time.time() - start
     elapsedTime = decimal.Decimal(diff)
@@ -205,7 +242,9 @@ def main():
     banner("Inference")
     # Inference
     start = time.time()
+    StopWatch.start("inference")    
     number_inferences = cloud_inference(args)
+    StopWatch.stop("inference")    
     diff = time.time() - start
     elapsedTime = decimal.Decimal(diff)
     time_per_inference = elapsedTime/number_inferences
@@ -213,6 +252,8 @@ def main():
     print("number_inferences: ", number_inferences)
     with open(log_file, "a") as logfile:
         logfile.write(f"CloudMask inference, inferences={number_inferences}, bs={args['batch_size']}, nodes={args['nodes']}, gpus={args['gpu']}, time_per_inference={time_per_inference_str}\n")
+    StopWatch.stop("total")
+    StopWatch.benchmark()
     
 if __name__ == "__main__":
     main()