Skip to content

Commit

Permalink
update with cloudmesh timers
Browse files Browse the repository at this point in the history
  • Loading branch information
laszewsk committed Oct 19, 2023
1 parent 49ca628 commit fea7b9d
Show file tree
Hide file tree
Showing 10 changed files with 86 additions and 10 deletions.
26 changes: 26 additions & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,61 @@
mkdir -p outputs-18/h5
bsub cloud_GPU_18.job

w18:
tail -f outputs-18/cloudmask_gpu_18-*.log


all: 2 4 6 12 18 24 w
# 30

2:
-rm -rf outputs-2
mkdir -p outputs-2/h5
bsub cloud_GPU_2.job

w2:
tail -f outputs-2/cloudmask_gpu_2-*.log

4:
-rm -rf outputs-4
mkdir -p outputs-4/h5
bsub cloud_GPU_4.job

w4:
tail -f outputs-4/cloudmask_gpu_4-*.log

6:
-rm -rf outputs-6
mkdir -p outputs-6/h5
bsub cloud_GPU_6.job

w6:
tail -f outputs-6/cloudmask_gpu_6-*.log


12:
-rm -rf outputs-12
mkdir -p outputs-12/h5
bsub cloud_GPU_12.job

w12:
tail -f outputs-12/cloudmask_gpu_12-*.log

24:
-rm -rf outputs-24
mkdir -p outputs-24/h5
bsub cloud_GPU_24.job

w24:
tail -f outputs-24/cloudmask_gpu_24-*.log

30:
-rm -rf outputs-30
mkdir -p outputs-30/h5
bsub cloud_GPU_30.job

w30:
tail -f outputs-30/cloudmask_gpu_30-*.log

w:
watch bjobs
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_1.job
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ jsrun -n1 -a1 -r1 -c1 -g1 python slstr_cloud.py --config ./cloudMaskConfig_GPU_
#./cloudMaskConfig.yaml


# scontrol show job -d
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_12.job
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ echo "***************************"
#jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml
jsrun -n2 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml

# scontrol show job -d
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_18.job
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ echo "***************************"
#jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml
jsrun -n3 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_18.yaml

# scontrol show job -d
4 changes: 3 additions & 1 deletion benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_2.job
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
l#!/bin/bash
#!/bin/bash
#BSUB -W 1:59
#BSUB -nnodes 1
#BSUB -P GEN150_bench
Expand All @@ -25,3 +25,5 @@ echo "***************************"

#jsrun -n1 -a2 -r1 -c1 -g2 python slstr_cloud.py --config ./cloudMaskConfig_GPU_2.yaml
jsrun -n1 -r1 -c1 -g2 python slstr_cloud.py --config ./cloudMaskConfig_GPU_2.yaml

# scontrol show job -d
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_24.job
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ echo "***************************"
#jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml
jsrun -n4 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_24.yaml

# scontrol show job -d
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_30.job
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ echo "***************************"
#jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_12.yaml
jsrun -n5 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_30.yaml

# scontrol show job -d
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_4.job
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ echo "***************************"
jsrun -n1 -r1 -c1 -g4 python slstr_cloud.py --config ./cloudMaskConfig_GPU_4.yaml


# scontrol show job -d
1 change: 1 addition & 0 deletions benchmarks/cloudmask/target/forJunqiCloud/cloud_GPU_6.job
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ echo "***************************"
#jsrun -n1 -a6 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_6.yaml
jsrun -n1 -r1 -c1 -g6 python slstr_cloud.py --config ./cloudMaskConfig_GPU_6.yaml

# scontrol show job -d
59 changes: 50 additions & 9 deletions benchmarks/cloudmask/target/forJunqiCloud/slstr_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import numpy as np
from data_loader import SLSTRDataLoader
from cloudmesh.common.util import banner
from cloudmesh.common.StopWatch import StopWatch

# Loss function
def weighted_cross_entropy(beta):
Expand Down Expand Up @@ -95,9 +96,11 @@ def cloud_inference(args)-> None:
# reconstructed.
data_loader = SLSTRDataLoader(args, file_paths, single_image=True, crop_size=CROP_SIZE)
dataset = data_loader.to_dataset()


counter = 0
# Inference Loop
for patches, file_name in dataset:
counter = counter + 1
file_name = Path(file_name.numpy().decode('utf-8'))
#print(f"Processing file {file_name}")

Expand All @@ -109,15 +112,21 @@ def cloud_inference(args)-> None:
mask_patches = model.predict_on_batch(patches)

# crop edge artifacts
mask_patches = tf.image.crop_to_bounding_box(mask_patches, CROP_SIZE // 2, CROP_SIZE // 2, PATCH_SIZE - CROP_SIZE, PATCH_SIZE - CROP_SIZE)
mask_patches = tf.image.crop_to_bounding_box(
mask_patches,
CROP_SIZE // 2,
CROP_SIZE // 2,
PATCH_SIZE - CROP_SIZE,
PATCH_SIZE - CROP_SIZE)

# reconstruct patches back to full size image
mask_patches = tf.reshape(mask_patches, (n, ny, nx, PATCH_SIZE - CROP_SIZE, PATCH_SIZE - CROP_SIZE, 1))
mask = reconstruct_from_patches(args, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE)
output_dir = os.path.expanduser(args['output_dir'])
# mask_name = output_dir + file_name.name + '.h5'
mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", str(file_name.name) + ".h5"))
print('MMMM mask_name: ', mask_name)
# mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", str(file_name.name) + ".h5"))
mask_name = os.path.abspath(os.path.join(str(output_dir), "h5", f"{counter:03}_" + str(file_name.name) + ".h5"))
print(counter, 'MMMM mask_name: ', mask_name)

with h5py.File(mask_name, 'w') as handle:
handle.create_dataset('mask', data=mask)
Expand Down Expand Up @@ -146,10 +155,31 @@ def cloud_training(args)-> None:

with mirrored_strategy.scope():
# create U-Net model
model = unet(input_shape=(args['PATCH_SIZE'], args['PATCH_SIZE'], args['N_CHANNELS']))
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_dataset, validation_data=test_dataset, epochs=args['epochs'], verbose=1)

model = unet(input_shape=(args['PATCH_SIZE'],
args['PATCH_SIZE'],
args['N_CHANNELS']))
model.compile(optimizer=optimizer,
loss='binary_crossentropy',
metrics=['accuracy'])
history = model.fit(train_dataset,
validation_data=test_dataset,
epochs=args['epochs'],
verbose=1)

print("Loss history:")
print(history.history['loss'])

print("Accuracy history:")
print(history.history['accuracy'])

accuracy_history = history.history['accuracy']

best_accuracy = max(accuracy_history)
best_epoch = accuracy_history.index(best_accuracy)
print("Best Accuracy:", best_accuracy)
print("Epoch with Best Accuracy:", best_epoch + 1) # Adding 1 to convert zero-based index to epoch number


# Close file descriptors
# atexit.register(mirrored_strategy._extended._collective_ops._pool.close)

Expand All @@ -167,6 +197,8 @@ def cloud_training(args)-> None:
# Running the benchmark: python slstr_cloud.py --config ./cloudMaskConfig.yaml
def main():


StopWatch.start("total")
banner("read commandline")
# Read command line arguments
parser = argparse.ArgumentParser(description='CloudMask command line arguments',\
Expand All @@ -177,19 +209,24 @@ def main():

configFile = os.path.abspath(os.path.expanduser(command_line_args.config))
print ("CCCC configFile", configFile)


StopWatch.start("read yaml")
banner("read yaml")
# Read YAML file
with open(configFile, 'r') as stream:
args = yaml.safe_load(stream)
print("AAA", args)
log_file = os.path.abspath(os.path.expanduser(args['log_file']))
print("LLLL log_file", log_file)
StopWatch.stop("read yaml")


banner ("Training")
# Training
start = time.time()
# Bracket the training call with a matching start/stop pair. The timer was
# previously started a second time after training instead of being stopped,
# leaving "training" unclosed when StopWatch.benchmark() is called at the end
# of main(). This now mirrors the "read yaml" and "inference" timers.
StopWatch.start("training")
samples = cloud_training(args)
StopWatch.stop("training")
print ("TTTT")
diff = time.time() - start
elapsedTime = decimal.Decimal(diff)
Expand All @@ -205,14 +242,18 @@ def main():
banner("Inference")
# Inference
start = time.time()
StopWatch.start("inference")
number_inferences = cloud_inference(args)
StopWatch.stop("inference")
diff = time.time() - start
elapsedTime = decimal.Decimal(diff)
time_per_inference = elapsedTime/number_inferences
time_per_inference_str = f"{time_per_inference:.2f}"
print("number_inferences: ", number_inferences)
with open(log_file, "a") as logfile:
logfile.write(f"CloudMask inference, inferences={number_inferences}, bs={args['batch_size']}, nodes={args['nodes']}, gpus={args['gpu']}, time_per_inference={time_per_inference_str}\n")
StopWatch.stop("total")
StopWatch.benchmark()

if __name__ == "__main__":
main()

0 comments on commit fea7b9d

Please sign in to comment.