diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b62b3d3e58..72d25cde04 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1986,7 +1986,7 @@ def _get_autoresume_checkpoint( signal_file_path = os.path.join( os.path.dirname(latest_checkpoint_path), - f'.node_{dist.get_node_rank()}_local_rank0_completed_autoresume', + dist.get_node_signal_file_name(), ) if dist.get_local_rank() == 0: os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index f9ad516724..8bf39729e4 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -819,7 +819,7 @@ def download_checkpoint( if not checkpoint_is_sharded: signal_file_path = os.path.join( node_checkpoint_folder, - f'.node_{dist.get_node_rank()}_local_rank0_completed', + dist.get_node_signal_file_name(), ) if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 95a95835f4..2178ce2dd5 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -644,7 +644,7 @@ def get_node_signal_file_name(rng: Optional[random.Random] = None) -> str: random_string = ''.join(rng.choices(string.ascii_letters + string.digits, k=6)) node_rank = get_node_rank() file_name_list = [f'._signal_file_node{node_rank}_{random_string}'] - dist.broadcast_object_list(file_name_list, src=0) + broadcast_object_list(file_name_list, src=0) return file_name_list[0]