From c8b5abb29eb6c432ca3591035dfd24669b58b208 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 24 Apr 2024 13:03:38 -0700 Subject: [PATCH] pc --- scripts/train/train.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 66e369cad0..a930ca0263 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -114,7 +114,19 @@ def validate_config(cfg: DictConfig): f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.' ) + def _initialize_gloo_and_nccl(dist_timeout: Union[int, float]): + """Initialize a GLOO process group (immediately destroyed) and a device + process group. + + We have experienced an issue where the first barrier with NCCL does not time out properly, + and can hang forever if something is wrong. To attempt to mitigate this, we will first + initialize with a gloo process group and test a barrier, then destroy the process group before initializing the device process group. + + Args: + dist_timeout (Union[int, float]): Timeout for initializing the process group + """ + # First, initialize with a gloo process group and test a barrier log.debug('Initializing dist with cpu...') dist.initialize_dist('cpu', timeout=dist_timeout) @@ -131,6 +143,7 @@ def _initialize_gloo_and_nccl(dist_timeout: Union[int, float]): dist.barrier() log.debug('Barrier test passed with device.') + def main(cfg: DictConfig) -> Trainer: # Run user provided code if specified code_paths = pop_config(cfg, @@ -204,10 +217,6 @@ def main(cfg: DictConfig) -> Trainer: logging.getLogger(__name__).setLevel( python_log_level.upper()) # Train script - # We have experienced an issue where the first barrier with NCCL does not timeout properly, - # and can hang forever if something is wrong. 
To attempt to mitigate this, we will first - # initialize with a gloo process group and test a barrier, then destroy the process group - _initialize_gloo_and_nccl(dist_timeout=dist_timeout) # Get global and device batch size information from distributed/single node setting