Skip to content

Commit

Permalink
Merge branch 'dev' into log-image-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
milocress committed Jun 12, 2024
2 parents afa880b + ba82cc9 commit bc44224
Show file tree
Hide file tree
Showing 17 changed files with 82 additions and 70 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/codeql-analysis
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/coverage
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
daily-pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand Down Expand Up @@ -100,7 +100,7 @@ jobs:
download-path: artifacts

daily-pytest-gpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
# Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
pytest-gpu-1:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand All @@ -35,7 +35,7 @@ jobs:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}

pytest-gpu-2:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand All @@ -62,7 +62,7 @@ jobs:


pytest-gpu-4:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/smoketest
with:
Expand Down
10 changes: 6 additions & 4 deletions composer/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,8 +707,10 @@ def train_dataloader(self, train_dataloader: Optional[Union[Iterable, DataLoader
train_dataloader (Iterable | DataLoader, optional): The dataloader.
"""
self._train_dataloader = train_dataloader
# Load dataset state from checkpoint when train_dataloader is set
if self.dataset_state:
# Load dataset state from checkpoint when train_dataloader is set. This occurs if
# dataset_state was loaded from checkpoint and train_dataloader has not already
# consumed dataset_state['train'] to resume.
if self.dataset_state is not None and self.dataset_state.get('train') is not None:
dataset = self._dataset_of(self._train_dataloader)
if hasattr(dataset, 'load_state_dict'):
dataset.load_state_dict(self.dataset_state['train']) # pyright: ignore
Expand Down Expand Up @@ -1278,14 +1280,14 @@ def _load_dataset_state(self, obj: dict[str, Any]) -> None:
Args:
obj (dict[str, Any]): The state to load.
"""
self.dataset_state = obj

dataset = self._dataset_of(self.train_dataloader)
if hasattr(dataset, 'load_state_dict'):
dataset.load_state_dict(obj['train']) # pyright: ignore
obj['train'] = None
self.dataset_resumption['train'] = True

self.dataset_state = obj

def load_model_state(
self,
state_dict: dict[str, Any],
Expand Down
26 changes: 1 addition & 25 deletions composer/distributed/dist_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,36 +328,12 @@ def sync_hook(*args):

mixed_precision = fsdp_config.mixed_precision
keep_low_precision_grads = fsdp_config.keep_low_precision_grads
mixed_precision, param_dtype, _, _ = get_mixed_precision(
mixed_precision, _, _, _ = get_mixed_precision(
precision,
mixed_precision=mixed_precision,
keep_low_precision_grads=keep_low_precision_grads,
)

# Note: FSDP does support the use of torch.float32 with sharding.
# They just never expected a user to pass in torch.float32 into mixed_precision as a param_dtype.
# See: https://github.com/pytorch/pytorch/issues/90584
# The PR fixing this bug is merged into PyTorch, but it hasn't made its way into a release yet.
# Instead a user needs to pass in `None` as param_dtype to have the parameters as torch.float32.
# TODO: remove these checks when PyTorch has a release that includes the fix.
if sharding_map_key != 'NO_SHARD':
if (
precision == Precision.AMP_FP16 and param_dtype not in [torch.float16, None] or
precision == Precision.AMP_BF16 and param_dtype not in [torch.bfloat16, None]
):
raise ValueError(
f'FSDP in PyTorch 1.13 does not support precision `{precision}` with sharding strategy `{sharding_strategy}` '
f'and param_dtype `{param_dtype}.` Consider using one of the predefined mixed_precision strategies '
"(choose: `'FULL'`, `'DEFAULT'`, `'PURE'`)",
)

if param_dtype == torch.float32:
raise ValueError(
f'FSDP in PyTorch 1.13 does not support param_dtype `{param_dtype}` with sharding_strategy `{sharding_map_key}` '
f'Consider using `amp` or `bf16` for precision or setting param_dtype in mixed_precision to `None` '
f'with sharding strategy `{sharding_map_key}.`',
)

process_group = None
if fsdp_config.process_group is not None:
process_group_dict = {'process_group': fsdp_config.process_group}
Expand Down
11 changes: 10 additions & 1 deletion composer/loggers/mlflow_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ def __init__(
) -> None:
try:
import mlflow
from databricks.sdk import WorkspaceClient
from mlflow import MlflowClient
except ImportError as e:
raise MissingConditionalImportError(
Expand Down Expand Up @@ -143,9 +142,19 @@ def __init__(
DEFAULT_MLFLOW_EXPERIMENT_NAME,
)
assert self.experiment_name is not None # type hint

if os.getenv('DATABRICKS_TOKEN') is not None and not self.experiment_name.startswith('/Users/'):
try:
from databricks.sdk import WorkspaceClient
except ImportError as e:
raise MissingConditionalImportError(
extra_deps_group='mlflow',
conda_package='databricks-sdk',
conda_channel='conda-forge',
) from e
databricks_username = WorkspaceClient().current_user.me().user_name or ''
self.experiment_name = '/' + os.path.join('Users', databricks_username, self.experiment_name)

self._mlflow_client = MlflowClient(self.tracking_uri)
# Set experiment
env_exp_id = os.getenv(
Expand Down
24 changes: 24 additions & 0 deletions composer/metrics/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,27 @@ def compute(self) -> Tensor:
"""Returns torch.exp() of the LanguageCrossEntropy."""
avg_loss = super().compute()
return torch.exp(avg_loss)


# For backward compatibility
class InContextLearningMetric:
    """InContextLearningMetric only exists for backwards compatibility of checkpoints that contain pickled metrics."""

    def __init__(self):
        # Direct construction is intentionally forbidden: this stub cannot compute
        # anything. It exists solely so that old checkpoints which pickled the
        # removed in-context-learning metrics can still be unpickled.
        # Note: was an f-string with no placeholders (ruff F541); plain string is correct.
        raise RuntimeError(
            'This class only exists for maintaining backward compatibility for checkpoints that contain pickled metrics. Please instead use https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/README.md.',
        )

    def __getstate__(self):
        """Return ``None`` so pickling this stub stores no state."""
        return None

    def __setstate__(self, state):
        """Ignore any pickled state; unpickling bypasses ``__init__`` and succeeds silently."""
        pass


# Alias every removed in-context-learning metric class to the backward-compatibility
# stub above, so that unpickling an old checkpoint that references any of these
# names by module path still resolves. All aliases are the SAME class object;
# none of them is instantiable (the stub's __init__ raises RuntimeError).
InContextLearningCodeEvalAccuracy = InContextLearningMetric
InContextLearningLMAccuracy = InContextLearningMetric
InContextLearningLMExpectedCalibrationError = InContextLearningMetric
InContextLearningMCExpectedCalibrationError = InContextLearningMetric
InContextLearningQAAccuracy = InContextLearningMetric
InContextLearningMultipleChoiceAccuracy = InContextLearningMetric
4 changes: 2 additions & 2 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the
<!-- BEGIN_COMPOSER_BUILD_MATRIX -->
| Composer Version | CUDA Support | Docker Tag |
|--------------------|----------------|----------------------------------------------------------------|
| 0.23.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.1` |
| 0.23.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.1_cpu` |
| 0.23.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.2` |
| 0.23.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.2_cpu` |
<!-- END_COMPOSER_BUILD_MATRIX -->

**Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually
Expand Down
12 changes: 6 additions & 6 deletions docker/build_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@
TORCHVISION_VERSION: 0.16.2
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2
CUDA_VERSION: 12.1.1
IMAGE_NAME: composer-0-23-1
IMAGE_NAME: composer-0-23-2
MOFED_VERSION: latest-23.10
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
Expand All @@ -231,23 +231,23 @@
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.3.1
TAGS:
- mosaicml/composer:0.23.1
- mosaicml/composer:0.23.2
- mosaicml/composer:latest
TARGET: composer_stage
TORCHVISION_VERSION: 0.18.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2
CUDA_VERSION: ''
IMAGE_NAME: composer-0-23-1-cpu
IMAGE_NAME: composer-0-23-2-cpu
MOFED_VERSION: latest-23.10
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.3.1
TAGS:
- mosaicml/composer:0.23.1_cpu
- mosaicml/composer:0.23.2_cpu
- mosaicml/composer:latest_cpu
TARGET: composer_stage
TORCHVISION_VERSION: 0.18.1
2 changes: 1 addition & 1 deletion docker/generate_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def _main():
composer_entries = []

# The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images
composer_versions = ['0.23.1'] # Only build images for the latest composer version
composer_versions = ['0.23.2'] # Only build images for the latest composer version
composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest

for product in itertools.product(composer_python_versions, composer_versions, cuda_options):
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def package_files(prefix: str, directory: str, extension: str):
'coolname>=1.1.0,<3',
'tabulate==0.9.0', # for auto-generating tables
'py-cpuinfo>=8.0.0,<10',
'packaging>=21.3.0,<24.1',
'packaging>=21.3.0,<24.2',
'importlib-metadata>=5.0.0,<7',
'mosaicml-cli>=0.5.25,<0.7',
]
Expand Down Expand Up @@ -139,7 +139,7 @@ def package_files(prefix: str, directory: str, extension: str):
'GitPython==3.1.43',
'moto[s3]>=4.0.1,<5',
'mock-ssh-server==0.9.1',
'cryptography==42.0.6',
'cryptography==42.0.8',
'pytest-httpserver>=1.0.4,<1.1',
'setuptools<=59.5.0',
'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile
Expand Down
37 changes: 19 additions & 18 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,24 +306,25 @@ def test_logging(
monkeypatch: pytest.MonkeyPatch,
):
"""Test that engine logs statements as expected"""
caplog.set_level(logging.DEBUG, logger=Engine.__module__)
# Include a callback, since most logging happens around callback events
dummy_state.callbacks = [EventCounterCallback()]

monkeypatch.setenv('ENGINE_DEBUG', '1')
engine = Engine(dummy_state, dummy_logger)
engine.run_event('INIT')
engine.close()

# Validate that we have the expected log entries
assert caplog.record_tuples == [
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running event'),
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running callback EventCounterCallback'),
('composer.core.engine', 10, 'Closing the engine.'),
('composer.core.engine', 10, 'Closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Post-closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Engine closed.'),
]
caplog.clear()
with caplog.at_level(logging.DEBUG, logger=Engine.__module__):
# Include a callback, since most logging happens around callback events
dummy_state.callbacks = [EventCounterCallback()]

monkeypatch.setenv('ENGINE_DEBUG', '1')
engine = Engine(dummy_state, dummy_logger)
engine.run_event('INIT')
engine.close()

# Validate that we have the expected log entries
assert caplog.record_tuples == [
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running event'),
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running callback EventCounterCallback'),
('composer.core.engine', 10, 'Closing the engine.'),
('composer.core.engine', 10, 'Closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Post-closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Engine closed.'),
]


def _worker():
Expand Down

0 comments on commit bc44224

Please sign in to comment.