enabling dampening of the train_completion with admin control

hasan7n · Oct 26, 2024 · 4e2aae1 · 4e2aae1
1 parent b5b6fa4
commit 4e2aae1
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 1 deletion.
diff --git a/examples/fl_post/fl/mlcube/workspace/training_config.yaml b/examples/fl_post/fl/mlcube/workspace/training_config.yaml
@@ -24,12 +24,19 @@ aggregator :
           min: 10    # 10 seconds
           max: 86400 # one day
           value: 86400   # one day
+        train_completion_dampener: # train_completed -> (train_completed)**(train_completion_dampener)
+          admin_settable: True
+          min: 1e-2    # shifts nonzero completion rates much closer to 1.0
+          max: 1.0 # leaves completion rates as is
+          value: 1.0
+
       aggregated_model_validation:
         val_cutoff_time:
           admin_settable: True
           min: 10    # 10 seconds
           max: 86400 # one day
           value: 86400   # one day
+        weights_alpha: *weights_alpha
 
 
 collaborator :

diff --git a/examples/fl_post/fl/project/src/runner_nnunetv1.py b/examples/fl_post/fl/project/src/runner_nnunetv1.py
@@ -143,7 +143,7 @@ def write_tensors_into_checkpoint(self, tensor_dict, with_opt_vars):
         return epoch
 
 
-    def train(self, col_name, round_num, input_tensor_dict, epochs, val_cutoff_time, train_cutoff_time, **kwargs):
+    def train(self, col_name, round_num, input_tensor_dict, epochs, val_cutoff_time, train_cutoff_time, train_completion_dampener, **kwargs):
         # TODO: Figure out the right name to use for this method and the default assigner
         """Perform training for a specified number of epochs."""
 
@@ -169,6 +169,14 @@ def train(self, col_name, round_num, input_tensor_dict, epochs, val_cutoff_time,
                                                       val_epoch=True,
                                                       train_epoch=True)
 
+        # dampen the train_completion
+        """
+        train_completion_dampener takes values in the range (0, 1]. Values near 0.0 shift all nonzero train_completion rates
+        nearer to 1.0, which keeps the trained model update weighting during aggregation closer to the plain data size weighting.
+        Specifically, update_weight = train_data_size / train_completed**train_completion_dampener
+        """
+        train_completed = train_completed**train_completion_dampener
+
         # update amount of task completed
         self.task_completed['train'] = train_completed
         self.task_completed['locally_tuned_model_validation'] = val_completed