post review with Micah
brandon-edwards committed Oct 17, 2024
1 parent f0e0170 commit a98b2ef
Showing 7 changed files with 30 additions and 40 deletions.
11 changes: 6 additions & 5 deletions examples/fl_post/fl/mlcube/workspace/training_config.yaml
@@ -30,11 +30,12 @@ task_runner :
 defaults : plan/defaults/task_runner.yaml
 template : src.runner_nnunetv1.PyTorchNNUNetCheckpointTaskRunner
 settings :
-    device : cuda
-    gpu_num_string : '0'
-    nnunet_task : Task537_FLPost
-    train_cutoff : 100
-    val_cutoff : 3
+    device : cuda
+    gpu_num_string : '0'
+    nnunet_task : Task537_FLPost
+    train_cutoff : 100
+    val_cutoff : 3
+    actual_max_num_epochs : 1000

 network :
 defaults : plan/defaults/network.yaml
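For context on where the new key lands: an OpenFL-style plan instantiates the task runner from the `template` above and passes everything under `settings` as keyword arguments, which is why `actual_max_num_epochs` also appears as a constructor parameter in `runner_nnunetv1.py` below. A minimal sketch of that plumbing, using an illustrative stub class rather than the real runner:

import yaml

plan_yaml = """
task_runner:
  settings:
    device: cuda
    gpu_num_string: '0'
    nnunet_task: Task537_FLPost
    train_cutoff: 100
    val_cutoff: 3
    actual_max_num_epochs: 1000
"""

class StubRunner:
    """Illustrative stand-in for PyTorchNNUNetCheckpointTaskRunner."""
    def __init__(self, device='cuda', gpu_num_string='0', nnunet_task=None,
                 train_cutoff=100, val_cutoff=3, actual_max_num_epochs=1000, **kwargs):
        self.actual_max_num_epochs = actual_max_num_epochs

# Every key under task_runner/settings becomes a constructor keyword argument.
settings = yaml.safe_load(plan_yaml)['task_runner']['settings']
runner = StubRunner(**settings)
assert runner.actual_max_num_epochs == 1000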
4 changes: 2 additions & 2 deletions examples/fl_post/fl/project/be_Dockerfile
@@ -6,14 +6,14 @@ ENV CUDA_VISIBLE_DEVICES="0"
 # ENV https_proxy="http://proxy-us.intel.com:912"
 ENV no_proxy=localhost,spr-gpu01.jf.intel.com

-ENV no_proxy________________="http://proxy-us.intel.com:912"
+ENV no_proxy_________________________________________________________________________="http://proxy-us.intel.com:912"

 # install project dependencies
 RUN apt-get update && apt-get install --no-install-recommends -y git zlib1g-dev libffi-dev libgl1 libgtk2.0-dev gcc g++

 RUN pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121

-COPY ./requirements.txt /mlcube_project/requirements.txt
+COPY ./be_requirements.txt /mlcube_project/requirements.txt
 RUN pip install --no-cache-dir -r /mlcube_project/requirements.txt

 # Create similar env with cuda118
2 changes: 1 addition & 1 deletion examples/fl_post/fl/project/nnunet_setup.py
@@ -28,7 +28,7 @@ def main(postopp_pardir,
          plans_path=None,
          local_plans_identifier=local_plans_identifier,
          shared_plans_identifier=shared_plans_identifier,
-         overwrite_nnunet_datadirs=False,
+         overwrite_nnunet_datadirs=True,
          timestamp_selection='all',
          cuda_device='0',
          verbose=False):
2 changes: 1 addition & 1 deletion examples/fl_post/fl/project/requirements.txt
@@ -1,4 +1,4 @@
 onnx==1.13.0
 typer==0.9.0
-git+https://github.com/brandon-edwards/nnUNet_v1.7.1_local.git@supporting_partial_epochs
+git+https://github.com/brandon-edwards/nnUNet_v1.7.1_local.git@main#egg=nnunet
 numpy==1.26.4
16 changes: 4 additions & 12 deletions examples/fl_post/fl/project/src/nnunet_dummy_dataloader.py
@@ -12,24 +12,16 @@
 import os

 class NNUNetDummyDataLoader():
-    def __init__(self, data_path, p_train, partial_epoch=1.0):
+    def __init__(self, data_path, p_train):
         self.task_name = data_path
         data_base_path = os.path.join(os.environ['nnUNet_preprocessed'], self.task_name)
         with open(f'{data_base_path}/dataset.json', 'r') as f:
             data_config = json.load(f)
         data_size = data_config['numTraining']

-        # NOTE: Intended use with PyTorchNNUNetCheckpointTaskRunner where partial_epoch scales down num_train_batches_per_epoch
-        # and num_val_batches_per_epoch. NNUnet loaders sample batches with replacement. Ignoring rounding (int()),
-        # the 'data sizes' below are divided by batch_size to obtain the number of batches used per epoch.
-        # These 'data sizes' therefore establish correct relative weights for train and val result aggregation over collaborators
-        # due to the fact that batch_size is equal across all collaborators. In addition, over many rounds each data point
-        # at a particular collaborator informs the results with equal measure. In particular, the average number of times (over
-        # repeated runs of the federation) that a particular sample is used for a training or val result
-        # over the course of the whole federation is given by the 'data sizes' below.
-        # TODO: determine how nnunet validation splits round
-        self.train_data_size = int(partial_epoch * p_train * data_size)
-        self.valid_data_size = int(partial_epoch * (1 - p_train) * data_size)
+        self.train_data_size = int(p_train * data_size)
+        self.valid_data_size = data_size - self.train_data_size

     def get_feature_shape(self):
         return [1,1,1]
@@ -41,4 +33,4 @@ def get_valid_data_size(self):
         return self.valid_data_size

     def get_task_name(self):
-        return self.task_name
\ No newline at end of file
+        return self.task_name
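A quick arithmetic check of the new split (illustrative numbers, not from the repo): the old pair of int() truncations could drop a case, while the new subtraction guarantees train + valid == numTraining.

# Illustrative check of the new split arithmetic in NNUNetDummyDataLoader.
data_size = 100          # data_config['numTraining']
p_train = 0.8

train_data_size = int(p_train * data_size)       # 80
valid_data_size = data_size - train_data_size    # 20

# The old double truncation could lose a case, e.g. with data_size = 99:
assert int(0.8 * 99) + int(0.2 * 99) == 98                # one case dropped
assert int(0.8 * 99) + (99 - int(0.8 * 99)) == 99         # new code keeps all cases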
9 changes: 5 additions & 4 deletions examples/fl_post/fl/project/src/nnunet_v1.py
@@ -54,7 +54,7 @@ def seed_everything(seed=1234):
     torch.backends.cudnn.deterministic = True


-def train_nnunet(TOTAL_max_num_epochs,
+def train_nnunet(actual_max_num_epochs,
                  epochs,
                  current_epoch,
                  val_epoch=True,
@@ -83,7 +83,8 @@ def train_nnunet(TOTAL_max_num_epochs,
                  pretrained_weights=None):

     """
-    TOTAL_max_num_epochs (int): Provides the total number of epochs intended to be trained (this needs to be held constant outside of individual calls to this function during the course of federated training)
+    actual_max_num_epochs (int): Provides the number of epochs intended to be trained
+        (this needs to be held constant across individual calls to this function, while max_num_epochs is set to one more than the current epoch on each call)
     epochs (int): Number of epochs to train on top of current epoch
     current_epoch (int): Which epoch will be used to grab the model
     val_epoch (bool) : Will validation be performed
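One reason this value must stay fixed across calls is that nnU-Net's polynomial learning-rate decay is computed from the total planned epochs; if each one-epoch federated round instead reported max_num_epochs = current_epoch + 1, the learning rate would collapse toward zero at the end of every round. A sketch of the schedule, written from memory of nnU-Net v1's poly_lr rather than quoted from the fork:

def poly_lr(epoch, max_epochs, initial_lr, exponent=0.9):
    # Polynomial decay as used by nnU-Net v1 trainers: decays smoothly
    # from initial_lr at epoch 0 toward 0 at epoch == max_epochs.
    return initial_lr * (1 - epoch / max_epochs) ** exponent

# Held constant (the new actual_max_num_epochs), epoch 500 of 1000:
print(poly_lr(500, 1000, 1e-2))   # ~5.4e-3, mid-schedule
# If max were instead set to current_epoch + 1 each round:
print(poly_lr(500, 501, 1e-2))    # ~3.7e-5, LR nearly zero every round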
@@ -211,7 +212,7 @@ def __init__(self, **kwargs):
     trainer = trainer_class(
         plans_file,
         fold,
-        TOTAL_max_num_epochs=TOTAL_max_num_epochs,
+        actual_max_num_epochs=actual_max_num_epochs,
         output_folder=output_folder_name,
         dataset_directory=dataset_directory,
         batch_dice=batch_dice,
@@ -259,7 +260,7 @@ def __init__(self, **kwargs):
         return

     if find_lr:
-        trainer.find_lr(num_iters=self.TOTAL_max_num_epochs)
+        trainer.find_lr(num_iters=self.actual_max_num_epochs)
     else:
         if not validation_only:
             if args.continue_training:
26 changes: 11 additions & 15 deletions examples/fl_post/fl/project/src/runner_nnunetv1.py
@@ -37,7 +37,7 @@ def __init__(self,
                  val_cutoff=np.inf,
                  nnunet_task=None,
                  config_path=None,
-                 TOTAL_max_num_epochs=1000,
+                 actual_max_num_epochs=1000,
                  **kwargs):
         """Initialize.
@@ -46,7 +46,7 @@ def __init__(self,
         val_cutoff (int) : Total time (in seconds) allowed for iterating over val batches (plus or minus one iteration since the check will occur once per iteration).
         nnunet_task (str) : Task string used to identify the data and model folders
         config_path (str) : Path to the configuration file used by the training and validation script.
-        TOTAL_max_num_epochs (int) : Total number of epochs for which this collaborator's model will be trained, should match the total rounds of federation in which this runner is participating
+        actual_max_num_epochs (int) : Number of epochs for which this collaborator's model will be trained; should match the total rounds of the federation in which this runner is participating
         kwargs : Additional keyword arguments (will be passed to rebuild_model, initialize_tensor_key_functions, TODO: <Fill this in>).
         TODO:
         """
@@ -80,7 +80,7 @@ def __init__(self,
         self.train_cutoff = train_cutoff
         self.val_cutoff = val_cutoff
         self.config_path = config_path
-        self.TOTAL_max_num_epochs = TOTAL_max_num_epochs
+        self.actual_max_num_epochs = actual_max_num_epochs

         # self.task_completed is a dictionary of task to amount completed as a float in [0,1]
         # Values will be dynamically updated
@@ -172,7 +172,7 @@ def train(self, col_name, round_num, input_tensor_dict, epochs, **kwargs):
         this_val_eval_metrics_C1, \
         this_val_eval_metrics_C2, \
         this_val_eval_metrics_C3, \
-        this_val_eval_metrics_C4 = train_nnunet(TOTAL_max_num_epochs=self.TOTAL_max_num_epochs,
+        this_val_eval_metrics_C4 = train_nnunet(actual_max_num_epochs=self.actual_max_num_epochs,
                                                 epochs=epochs,
                                                 current_epoch=current_epoch,
                                                 train_cutoff=self.train_cutoff,
@@ -227,7 +227,7 @@ def compare_tensor_dicts(td_1, td_2, tag="", epsilon=0.1, verbose=True):
         this_val_eval_metrics_C1, \
         this_val_eval_metrics_C2, \
         this_val_eval_metrics_C3, \
-        this_val_eval_metrics_C4 = train_nnunet(TOTAL_max_num_epochs=self.TOTAL_max_num_epochs,
+        this_val_eval_metrics_C4 = train_nnunet(actual_max_num_epochs=self.actual_max_num_epochs,
                                                 epochs=1,
                                                 current_epoch=current_epoch,
                                                 train_cutoff=0,
@@ -286,7 +286,7 @@ def load_metrics(self, filepath):
"""


def get_train_data_size(self, task_dependent=False, task_name=None):
def get_train_data_size(self, task_name=None):
"""Get the number of training examples.
It will be used for weighted averaging in aggregation.
@@ -296,16 +296,14 @@ def get_train_data_size(self, task_dependent=False, task_name=None):
         Returns:
             int: The number of training examples, weighted by how much of the task got completed, then cast to int to satisfy the proto schema
         """
-        if not task_dependent:
+        if not task_name:
             return self.data_loader.get_train_data_size()
-        elif not task_name:
-            raise ValueError(f"If using task dependent data size, must provide task_name.")
         else:
             # self.task_completed is a dictionary of task_name to amount completed as a float in [0,1]
-            return int(np.ceil(self.task_completed[task_name] * self.data_loader.get_train_data_size()))
+            return int(np.ceil(self.task_completed[task_name]**(-1) * self.data_loader.get_train_data_size()))


-    def get_valid_data_size(self, task_dependent=False, task_name=None):
+    def get_valid_data_size(self, task_name=None):
         """Get the number of validation examples.
         It will be used for weighted averaging in aggregation.
@@ -315,10 +313,8 @@
         Returns:
             int: The number of validation examples, weighted by how much of the task got completed, then cast to int to satisfy the proto schema
         """
-        if not task_dependent:
+        if not task_name:
             return self.data_loader.get_valid_data_size()
-        elif not task_name:
-            raise ValueError(f"If using task dependent data size, must provide task_name.")
         else:
             # self.task_completed is a dictionary of task_name to amount completed as a float in [0,1]
-            return int(np.ceil(self.task_completed[task_name] * self.data_loader.get_valid_data_size()))
+            return int(np.ceil(self.task_completed[task_name]**(-1) * self.data_loader.get_valid_data_size()))
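Note that the weighting direction also flipped in this hunk: the old code multiplied the loader's size by the completed fraction, while the new code divides by it. Illustrative numbers, assuming a loader that reports 100 training cases:

import numpy as np

task_completed = 0.5        # half of the time-budgeted task finished
loader_train_size = 100     # self.data_loader.get_train_data_size()

old_weight = int(np.ceil(task_completed * loader_train_size))          # 50
new_weight = int(np.ceil(task_completed ** (-1) * loader_train_size))  # 200

# Under the new scheme, a collaborator that completed *less* of the task
# reports a *larger* data size, i.e. a larger weight in aggregation.
print(old_weight, new_weight)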
