Skip to content

Commit

Permalink
Fix HF checkpointer + mlflow bugs (#1125)
Browse files · Browse the repository at this point in the history
  • Loading branch information
dakinggg committed Apr 23, 2024
1 parent 4952183 commit c53622e
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 12 deletions.
30 changes: 19 additions & 11 deletions llmfoundry/callbacks/hf_checkpointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
from composer.core import Callback, Event, State, Time, TimeUnit
Expand Down Expand Up @@ -160,8 +161,6 @@ def __init__(
if mlflow_logging_config is None:
mlflow_logging_config = {}
if self.mlflow_registered_model_name is not None:
import numpy as np

# Both the metadata and the task are needed in order for mlflow
# and databricks optimized model serving to work
passed_metadata = mlflow_logging_config.get('metadata', {})
Expand All @@ -171,18 +170,17 @@ def __init__(
default_input_example = {
'prompt': np.array(['What is Machine Learning?'])
}
is_chat = mlflow_logging_config['task'].endswith(
'chat') or mlflow_logging_config['metadata'].get(
'task', '').endswith('chat')
is_chat = mlflow_logging_config['task'].endswith('chat') or (
mlflow_logging_config['metadata'] is not None and
mlflow_logging_config['metadata'].get('task',
'').endswith('chat'))
if is_chat:
default_input_example = {
'messages':
np.array([{
'role': 'user',
'content': 'What is Machine Learning?'
}])
'messages': [{
'role': 'user',
'content': 'What is Machine Learning?'
}]
}
mlflow_logging_config.setdefault('example_no_conversion', True)
mlflow_logging_config.setdefault('input_example',
default_input_example)

Expand Down Expand Up @@ -260,6 +258,16 @@ def _is_last_batch(self, state: State):
return True

assert state.max_duration is not None # for pyright

epoch_complete = state.dataloader_len == state.timestamp.batch_in_epoch
second_to_last_epoch = state.max_duration.unit == TimeUnit.EPOCH and (
state.timestamp.epoch == state.max_duration.value - 1)
# If the save interval is specified as exactly the same number of batches as the total duration,
# but the max duration is specified in epochs, we need a special case to identify we are on the last batch
# and should write the mlflow checkpoint. This should occur on the last batch of the final epoch.
if self.save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete:
return True

# If the save interval is specified as 1dur, and the max duration is in epoch units
# we need a special case to identify we are on the last batch and should write the mlflow checkpoint
if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@

install_requires = [
'mosaicml[libcloud,wandb,oci,gcs]>=0.21.3,<0.22',
'mlflow>=2.10,<2.12',
'mlflow>=2.12.1,<2.13',
'accelerate>=0.25,<0.26', # for HF inference `device_map`
'transformers>=4.40,<4.41',
'mosaicml-streaming>=0.7.5,<0.8',
Expand Down

0 comments on commit c53622e

Please sign in to comment.