From 4e2aae1e4c4ae1a9f073aeb6aa28252266a9bc75 Mon Sep 17 00:00:00 2001 From: "Edwards, Brandon" Date: Fri, 25 Oct 2024 17:53:59 -0700 Subject: [PATCH] enabling dampening of the train_completion with admin control --- .../fl_post/fl/mlcube/workspace/training_config.yaml | 7 +++++++ examples/fl_post/fl/project/src/runner_nnunetv1.py | 10 +++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/examples/fl_post/fl/mlcube/workspace/training_config.yaml b/examples/fl_post/fl/mlcube/workspace/training_config.yaml index cb1bbf476..bd9daf629 100644 --- a/examples/fl_post/fl/mlcube/workspace/training_config.yaml +++ b/examples/fl_post/fl/mlcube/workspace/training_config.yaml @@ -24,12 +24,19 @@ aggregator : min: 10 # 10 seconds max: 86400 # one day value: 86400 # one day + train_completion_dampener: # train_completed -> (train_completed)**(train_completion_dampener) + admin_settable: True + min: 1e-2 # shifts non 0.0 completion rates much closer to 1.0 + max: 1.0 # leaves completion rates as is + value: 1.0 + aggregated_model_validation: val_cutoff_time: admin_settable: True min: 10 # 10 seconds max: 86400 # one day value: 86400 # one day + weights_alpha: *weights_alpha collaborator : diff --git a/examples/fl_post/fl/project/src/runner_nnunetv1.py b/examples/fl_post/fl/project/src/runner_nnunetv1.py index db84b0acd..96cb257cc 100644 --- a/examples/fl_post/fl/project/src/runner_nnunetv1.py +++ b/examples/fl_post/fl/project/src/runner_nnunetv1.py @@ -143,7 +143,7 @@ def write_tensors_into_checkpoint(self, tensor_dict, with_opt_vars): return epoch - def train(self, col_name, round_num, input_tensor_dict, epochs, val_cutoff_time, train_cutoff_time, **kwargs): + def train(self, col_name, round_num, input_tensor_dict, epochs, val_cutoff_time, train_cutoff_time, train_completion_dampener, **kwargs): # TODO: Figure out the right name to use for this method and the default assigner """Perform training for a specified number of epochs.""" @@ -169,6 +169,14 @@ def 
train(self, col_name, round_num, input_tensor_dict, epochs, val_cutoff_time, val_epoch=True, train_epoch=True) + # dampen the train_completion + """ + Values lie in the range (0, 1]; values near 0.0 shift all non-zero train_completion rates much closer to 1.0, thus keeping the + trained model update weighting during aggregation closer to the plain data size weighting. + Specifically, update_weight = train_data_size / train_completed**train_completion_dampener + """ + train_completed = train_completed**train_completion_dampener + # update amount of task completed + self.task_completed['train'] = train_completed + self.task_completed['locally_tuned_model_validation'] = val_completed