Skip to content

Commit

Permalink
pc
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg committed Apr 24, 2024
1 parent 62b0907 commit c8b5abb
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions scripts/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,19 @@ def validate_config(cfg: DictConfig):
f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.'
)


def _initialize_gloo_and_nccl(dist_timeout: Union[int, float]):
"""Initialize a GLOO process group (immediately destroyed) and a device
process group.
We have experienced an issue where the first barrier with NCCL does not time out properly,
and can hang forever if something is wrong. To attempt to mitigate this, we will first
initialize with a gloo process group and test a barrier, then destroy the process group.
Args:
dist_timeout (Union[int, float]): Timeout for initializing the process group
"""

# First, initialize with a gloo process group and test a barrier
log.debug('Initializing dist with cpu...')
dist.initialize_dist('cpu', timeout=dist_timeout)
Expand All @@ -131,6 +143,7 @@ def _initialize_gloo_and_nccl(dist_timeout: Union[int, float]):
dist.barrier()
log.debug('Barrier test passed with device.')


def main(cfg: DictConfig) -> Trainer:
# Run user provided code if specified
code_paths = pop_config(cfg,
Expand Down Expand Up @@ -204,10 +217,6 @@ def main(cfg: DictConfig) -> Trainer:
logging.getLogger(__name__).setLevel(
python_log_level.upper()) # Train script

# We have experienced an issue where the first barrier with NCCL does not time out properly,
# and can hang forever if something is wrong. To attempt to mitigate this, we will first
# initialize with a gloo process group and test a barrier, then destroy the process group.

_initialize_gloo_and_nccl(dist_timeout=dist_timeout)

# Get global and device batch size information from distributed/single node setting
Expand Down

0 comments on commit c8b5abb

Please sign in to comment.