Skip to content

Commit

Permalink
Revert "Autoresume Validation with Max Duration (#3358)" (#3364)
Browse files: browse the repository at this point in the history
This reverts commit f0eae8a.
  • Loading branch information
mvpatel2000 authored and bigning committed Jun 5, 2024
1 parent 49e129b commit dd510a4
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 42 deletions.
16 changes: 1 addition & 15 deletions composer/trainer/trainer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1723,12 +1723,9 @@ def __init__(
# Load Checkpoint
self._rng_state = None
# If autoresume is enabled, first check for existing checkpoints to load
self.autoresume = autoresume
if self.autoresume:
if autoresume:
log.info('Searching for a previous checkpoint to autoresume')
error_message = ''
if max_duration is None:
error_message += 'The `max_duration` must be specified on trainer.__init__ when autoresume is enabled. '
if save_folder is None:
error_message += 'The `save_folder` must be specified when autoresume is enabled. '
if save_overwrite:
Expand Down Expand Up @@ -2191,21 +2188,10 @@ def fit(

# Reset Time
if reset_time:
if self.autoresume:
raise ValueError(
'Cannot specify `reset_time=True` when autoresume is enabled. Please instead '
'specify `load_ignore_keys` when constructing the Trainer, which will only '
'run on the initial load and not any subsequent autoresumptions.',
)
self.state.timestamp = Timestamp()

# Max Duration
if duration is not None:
if self.autoresume:
raise ValueError(
'`duration` cannot be specified when autoresume is enabled. Please instead '
'specify `max_duration` when constructing the Trainer.',
)
duration = ensure_time(duration, TimeUnit.EPOCH)
if duration.unit == TimeUnit.SECOND:
raise ValueError('Wall clock time not an allowed time unit.')
Expand Down
34 changes: 7 additions & 27 deletions tests/trainer/test_checkpoint.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -667,7 +667,6 @@ def get_trainer(
max_duration: str = '2ep',
latest_filename: str = 'latest-rank{rank}.pt',
file_extension: str = '.pt',
use_scheduler: bool = True,
**kwargs,
):
if model is None:
Expand Down Expand Up @@ -705,7 +704,7 @@ def get_trainer(
save_filename='ep{epoch}' + file_extension,
max_duration=max_duration,
optimizers=optimizer,
schedulers=ExponentialScheduler(gamma=0.9) if use_scheduler else None,
schedulers=ExponentialScheduler(gamma=0.9),
callbacks=callbacks,
**kwargs,
)
Expand Down Expand Up @@ -1213,43 +1212,24 @@ def test_load_weights_object_store(self, tmp_path):
)

@pytest.mark.parametrize(
'run_name,save_folder,save_overwrite,latest_filename,max_duration',
'run_name,save_folder,save_overwrite,latest_filename',
[
[None, 'first', False, 'latest-rank{rank}.pt', '2ep'],
['big-chungus', None, False, 'latest-rank{rank}.pt', '2ep'],
['big-chungus', 'first', True, 'latest-rank{rank}.pt', '2ep'],
['big-chungus', 'first', False, None, '2ep'],
['big-chungus', 'first', False, 'latest-rank{rank}.pt', None],
[None, 'first', False, 'latest-rank{rank}.pt'],
['big-chungus', None, False, 'latest-rank{rank}.pt'],
['big-chungus', 'first', True, 'latest-rank{rank}.pt'],
['big-chungus', 'first', False, None],
],
)
def test_autoresume_fail_init(self, run_name, save_folder, save_overwrite, latest_filename, max_duration):
def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_filename):
with pytest.raises(ValueError):
self.get_trainer(
latest_filename=latest_filename,
save_overwrite=save_overwrite,
save_folder=save_folder,
run_name=run_name,
max_duration=max_duration,
autoresume=True,
use_scheduler=False,
)

@pytest.mark.parametrize(
'duration,reset_time',
[
['1ep', False],
[None, True],
],
)
def test_autoresume_fail_fit(self, duration: Optional[str], reset_time: bool):
trainer = self.get_trainer(
run_name='bigtrainer',
save_folder='first',
autoresume=True,
)
with pytest.raises(ValueError):
trainer.fit(duration=duration, reset_time=reset_time)

def test_different_run_names(self):

trainer_1 = self.get_trainer(
Expand Down

0 comments on commit dd510a4

Please sign in to comment.