post review with Micah
brandon-edwards committed Oct 17, 2024
1 parent f0e0170 commit a98b2ef
Showing 7 changed files with 30 additions and 40 deletions.
11 changes: 6 additions & 5 deletions examples/fl_post/fl/mlcube/workspace/training_config.yaml
@@ -30,11 +30,12 @@ task_runner :
 defaults : plan/defaults/task_runner.yaml
 template : src.runner_nnunetv1.PyTorchNNUNetCheckpointTaskRunner
 settings :
-    device : cuda
-    gpu_num_string : '0'
-    nnunet_task : Task537_FLPost
-    train_cutoff : 100
-    val_cutoff : 3
+    device : cuda
+    gpu_num_string : '0'
+    nnunet_task : Task537_FLPost
+    train_cutoff : 100
+    val_cutoff : 3
+    actual_max_num_epochs : 1000

 network :
 defaults : plan/defaults/network.yaml
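For context on where the new key lands: an OpenFL-style plan instantiates the task runner from the `template` above and passes everything under `settings` as keyword arguments, which is why `actual_max_num_epochs` also appears as a constructor parameter in `runner_nnunetv1.py` below. A minimal sketch of that plumbing, using an illustrative stub class rather than the real runner:

import yaml

plan_yaml = """
task_runner:
  settings:
    device: cuda
    gpu_num_string: '0'
    nnunet_task: Task537_FLPost
    train_cutoff: 100
    val_cutoff: 3
    actual_max_num_epochs: 1000
"""

class StubRunner:
    """Illustrative stand-in for PyTorchNNUNetCheckpointTaskRunner."""
    def __init__(self, device='cuda', gpu_num_string='0', nnunet_task=None,
                 train_cutoff=100, val_cutoff=3, actual_max_num_epochs=1000, **kwargs):
        self.actual_max_num_epochs = actual_max_num_epochs

# Every key under task_runner/settings becomes a constructor keyword argument.
settings = yaml.safe_load(plan_yaml)['task_runner']['settings']
runner = StubRunner(**settings)
assert runner.actual_max_num_epochs == 1000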
4 changes: 2 additions & 2 deletions examples/fl_post/fl/project/be_Dockerfile
@@ -6,14 +6,14 @@ ENV CUDA_VISIBLE_DEVICES="0"
 # ENV https_proxy="http://proxy-us.intel.com:912"
 ENV no_proxy=localhost,spr-gpu01.jf.intel.com

-ENV no_proxy________________="http://proxy-us.intel.com:912"
+ENV no_proxy_________________________________________________________________________="http://proxy-us.intel.com:912"

 # install project dependencies
 RUN apt-get update && apt-get install --no-install-recommends -y git zlib1g-dev libffi-dev libgl1 libgtk2.0-dev gcc g++

 RUN pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121

-COPY ./requirements.txt /mlcube_project/requirements.txt
+COPY ./be_requirements.txt /mlcube_project/requirements.txt
 RUN pip install --no-cache-dir -r /mlcube_project/requirements.txt

 # Create similar env with cuda118
2 changes: 1 addition & 1 deletion examples/fl_post/fl/project/nnunet_setup.py
@@ -28,7 +28,7 @@ def main(postopp_pardir,
          plans_path=None,
          local_plans_identifier=local_plans_identifier,
          shared_plans_identifier=shared_plans_identifier,
-         overwrite_nnunet_datadirs=False,
+         overwrite_nnunet_datadirs=True,
          timestamp_selection='all',
          cuda_device='0',
          verbose=False):
2 changes: 1 addition & 1 deletion examples/fl_post/fl/project/requirements.txt
@@ -1,4 +1,4 @@
 onnx==1.13.0
 typer==0.9.0
-git+https://github.com/brandon-edwards/nnUNet_v1.7.1_local.git@supporting_partial_epochs
+git+https://github.com/brandon-edwards/nnUNet_v1.7.1_local.git@main#egg=nnunet
 numpy==1.26.4
16 changes: 4 additions & 12 deletions examples/fl_post/fl/project/src/nnunet_dummy_dataloader.py
@@ -12,24 +12,16 @@
 import os

 class NNUNetDummyDataLoader():
-    def __init__(self, data_path, p_train, partial_epoch=1.0):
+    def __init__(self, data_path, p_train):
         self.task_name = data_path
         data_base_path = os.path.join(os.environ['nnUNet_preprocessed'], self.task_name)
         with open(f'{data_base_path}/dataset.json', 'r') as f:
             data_config = json.load(f)
         data_size = data_config['numTraining']

-        # NOTE: Intended use with PyTorchNNUNetCheckpointTaskRunner where partial_epoch scales down num_train_batches_per_epoch
-        # and num_val_batches_per_epoch. NNUnet loaders sample batches with replacement. Ignoring rounding (int()),
-        # the 'data sizes' below are divided by batch_size to obtain the number of batches used per epoch.
-        # These 'data sizes' therefore establish correct relative weights for train and val result aggregation over collaborators
-        # due to the fact that batch_size is equal across all collaborators. In addition, over many rounds each data point
-        # at a particular collaborator informs the results with equal measure. In particular, the average number of times (over
-        # repeated runs of the federation) that a particular sample is used for a training or val result
-        # over the course of the whole federation is given by the 'data sizes' below.
-        # TODO: determine how nnunet validation splits round
-        self.train_data_size = int(partial_epoch * p_train * data_size)
-        self.valid_data_size = int(partial_epoch * (1 - p_train) * data_size)
+        self.train_data_size = int(p_train * data_size)
+        self.valid_data_size = data_size - self.train_data_size

     def get_feature_shape(self):
         return [1,1,1]
@@ -41,4 +33,4 @@ def get_valid_data_size(self):
         return self.valid_data_size

     def get_task_name(self):
-        return self.task_name
\ No newline at end of file
+        return self.task_name
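A quick arithmetic check of the new split (illustrative numbers, not from the repo): the old pair of int() truncations could drop a case, while the new subtraction guarantees train + valid == numTraining.

# Illustrative check of the new split arithmetic in NNUNetDummyDataLoader.
data_size = 100          # data_config['numTraining']
p_train = 0.8

train_data_size = int(p_train * data_size)       # 80
valid_data_size = data_size - train_data_size    # 20

# The old double truncation could lose a case, e.g. with data_size = 99:
assert int(0.8 * 99) + int(0.2 * 99) == 98                # one case dropped
assert int(0.8 * 99) + (99 - int(0.8 * 99)) == 99         # new code keeps all cases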
9 changes: 5 additions & 4 deletions examples/fl_post/fl/project/src/nnunet_v1.py
@@ -54,7 +54,7 @@ def seed_everything(seed=1234):
     torch.backends.cudnn.deterministic = True


-def train_nnunet(TOTAL_max_num_epochs,
+def train_nnunet(actual_max_num_epochs,
                  epochs,
                  current_epoch,
                  val_epoch=True,
@@ -83,7 +83,8 @@ def train_nnunet(TOTAL_max_num_epochs,
                  pretrained_weights=None):

     """
-    TOTAL_max_num_epochs (int): Provides the total number of epochs intended to be trained (this needs to be held constant outside of individual calls to this function during the course of federated training)
+    actual_max_num_epochs (int): Provides the number of epochs intended to be trained
+        (this needs to be held constant across individual calls to this function, while max_num_epochs is set to one more than the current epoch on each call)
     epochs (int): Number of epochs to train on top of current epoch
     current_epoch (int): Which epoch will be used to grab the model
     val_epoch (bool) : Will validation be performed
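One reason this value must stay fixed across calls is that nnU-Net's polynomial learning-rate decay is computed from the total planned epochs; if each one-epoch federated round instead reported max_num_epochs = current_epoch + 1, the learning rate would collapse toward zero at the end of every round. A sketch of the schedule, written from memory of nnU-Net v1's poly_lr rather than quoted from the fork:

def poly_lr(epoch, max_epochs, initial_lr, exponent=0.9):
    # Polynomial decay as used by nnU-Net v1 trainers: decays smoothly
    # from initial_lr at epoch 0 toward 0 at epoch == max_epochs.
    return initial_lr * (1 - epoch / max_epochs) ** exponent

# Held constant (the new actual_max_num_epochs), epoch 500 of 1000:
print(poly_lr(500, 1000, 1e-2))   # ~5.4e-3, mid-schedule
# If max were instead set to current_epoch + 1 each round:
print(poly_lr(500, 501, 1e-2))    # ~3.7e-5, LR nearly zero every round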
@@ -211,7 +212,7 @@ def __init__(self, **kwargs):
     trainer = trainer_class(
         plans_file,
         fold,
-        TOTAL_max_num_epochs=TOTAL_max_num_epochs,
+        actual_max_num_epochs=actual_max_num_epochs,
         output_folder=output_folder_name,
         dataset_directory=dataset_directory,
         batch_dice=batch_dice,
@@ -259,7 +260,7 @@ def __init__(self, **kwargs):
         return

     if find_lr:
-        trainer.find_lr(num_iters=self.TOTAL_max_num_epochs)
+        trainer.find_lr(num_iters=self.actual_max_num_epochs)
     else:
         if not validation_only:
             if args.continue_training:
26 changes: 11 additions & 15 deletions examples/fl_post/fl/project/src/runner_nnunetv1.py
@@ -37,7 +37,7 @@ def __init__(self,
                  val_cutoff=np.inf,
                  nnunet_task=None,
                  config_path=None,
-                 TOTAL_max_num_epochs=1000,
+                 actual_max_num_epochs=1000,
                  **kwargs):
         """Initialize.
@@ -46,7 +46,7 @@ def __init__(self,
         val_cutoff (int) : Total time (in seconds) allowed for iterating over val batches (plus or minus one iteration since the check will occur once per iteration).
         nnunet_task (str) : Task string used to identify the data and model folders
         config_path (str) : Path to the configuration file used by the training and validation script.
-        TOTAL_max_num_epochs (int) : Total number of epochs for which this collaborator's model will be trained, should match the total rounds of federation in which this runner is participating
+        actual_max_num_epochs (int) : Number of epochs for which this collaborator's model will be trained; should match the total rounds of the federation in which this runner is participating
         kwargs : Additional keyword arguments (will be passed to rebuild_model, initialize_tensor_key_functions, TODO: <Fill this in>).
         TODO:
         """
@@ -80,7 +80,7 @@ def __init__(self,
         self.train_cutoff = train_cutoff
         self.val_cutoff = val_cutoff
         self.config_path = config_path
-        self.TOTAL_max_num_epochs = TOTAL_max_num_epochs
+        self.actual_max_num_epochs = actual_max_num_epochs

         # self.task_completed is a dictionary of task to amount completed as a float in [0,1]
         # Values will be dynamically updated
@@ -172,7 +172,7 @@ def train(self, col_name, round_num, input_tensor_dict, epochs, **kwargs):
         this_val_eval_metrics_C1, \
         this_val_eval_metrics_C2, \
         this_val_eval_metrics_C3, \
-        this_val_eval_metrics_C4 = train_nnunet(TOTAL_max_num_epochs=self.TOTAL_max_num_epochs,
+        this_val_eval_metrics_C4 = train_nnunet(actual_max_num_epochs=self.actual_max_num_epochs,
                                                 epochs=epochs,
                                                 current_epoch=current_epoch,
                                                 train_cutoff=self.train_cutoff,
@@ -227,7 +227,7 @@ def compare_tensor_dicts(td_1, td_2, tag="", epsilon=0.1, verbose=True):
         this_val_eval_metrics_C1, \
         this_val_eval_metrics_C2, \
         this_val_eval_metrics_C3, \
-        this_val_eval_metrics_C4 = train_nnunet(TOTAL_max_num_epochs=self.TOTAL_max_num_epochs,
+        this_val_eval_metrics_C4 = train_nnunet(actual_max_num_epochs=self.actual_max_num_epochs,
                                                 epochs=1,
                                                 current_epoch=current_epoch,
                                                 train_cutoff=0,
@@ -286,7 +286,7 @@ def load_metrics(self, filepath):
"""


def get_train_data_size(self, task_dependent=False, task_name=None):
def get_train_data_size(self, task_name=None):
"""Get the number of training examples.
It will be used for weighted averaging in aggregation.
@@ -296,16 +296,14 @@ def get_train_data_size(self, task_dependent=False, task_name=None):
         Returns:
             int: The number of training examples, weighted by how much of the task got completed, then cast to int to satisfy the proto schema
         """
-        if not task_dependent:
+        if not task_name:
             return self.data_loader.get_train_data_size()
-        elif not task_name:
-            raise ValueError(f"If using task dependent data size, must provide task_name.")
         else:
             # self.task_completed is a dictionary of task_name to amount completed as a float in [0,1]
-            return int(np.ceil(self.task_completed[task_name] * self.data_loader.get_train_data_size()))
+            return int(np.ceil(self.task_completed[task_name]**(-1) * self.data_loader.get_train_data_size()))


-    def get_valid_data_size(self, task_dependent=False, task_name=None):
+    def get_valid_data_size(self, task_name=None):
         """Get the number of validation examples.
         It will be used for weighted averaging in aggregation.
@@ -315,10 +313,8 @@
         Returns:
             int: The number of validation examples, weighted by how much of the task got completed, then cast to int to satisfy the proto schema
         """
-        if not task_dependent:
+        if not task_name:
             return self.data_loader.get_valid_data_size()
-        elif not task_name:
-            raise ValueError(f"If using task dependent data size, must provide task_name.")
         else:
             # self.task_completed is a dictionary of task_name to amount completed as a float in [0,1]
-            return int(np.ceil(self.task_completed[task_name] * self.data_loader.get_valid_data_size()))
+            return int(np.ceil(self.task_completed[task_name]**(-1) * self.data_loader.get_valid_data_size()))
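Note that the weighting direction also flipped in this hunk: the old code multiplied the loader's size by the completed fraction, while the new code divides by it. Illustrative numbers, assuming a loader that reports 100 training cases:

import numpy as np

task_completed = 0.5        # half of the time-budgeted task finished
loader_train_size = 100     # self.data_loader.get_train_data_size()

old_weight = int(np.ceil(task_completed * loader_train_size))          # 50
new_weight = int(np.ceil(task_completed ** (-1) * loader_train_size))  # 200

# Under the new scheme, a collaborator that completed *less* of the task
# reports a *larger* data size, i.e. a larger weight in aggregation.
print(old_weight, new_weight)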
