Merge branch 'mosaicml:main' into tp-checkpoint
eitanturok authored Sep 4, 2024
2 parents 642e497 + 998198d commit 23477b5
Showing 23 changed files with 200 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
@@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
- ref: v0.0.9
+ ref: v0.2.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yaml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
- ref: v0.0.9
+ ref: v0.2.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/coverage
with:
86 changes: 47 additions & 39 deletions .github/workflows/daily.yaml
@@ -13,7 +13,10 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
daily-pytest-cpu:
- uses: mosaicml/ci-testing/.github/workflows/[email protected]
+ name: ${{ matrix.name }}
+ runs-on: ubuntu-latest
+ container: ${{ matrix.container }}
+ if: github.repository_owner == 'mosaicml'
strategy:
matrix:
include:
@@ -67,26 +70,26 @@ jobs:
markers: daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
- name: ${{ matrix.name }}
- if: github.repository_owner == 'mosaicml'
- with:
- container: ${{ matrix.container }}
- name: ${{ matrix.name }}
- pip_deps: "[all]"
- pytest-command: ${{ matrix.pytest_command }}
- pytest-markers: ${{ matrix.markers }}
- composer_package_name: ${{ matrix.composer_package_name }}
- safe_directory: composer
- secrets:
- aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
- code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
- code-eval-url: ${{ secrets.CODE_EVAL_URL }}
- code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
- gcs-key: ${{ secrets.GCS_KEY }}
- gcs-secret: ${{ secrets.GCS_SECRET }}
- azure-account-name: ${{ secrets.AZURE_ACCOUNT_NAME }}
- azure-account-access-key: ${{ secrets.AZURE_ACCOUNT_ACCESS_KEY }}
+ steps:
+ - name: Run PR CPU Tests
+ uses: mosaicml/ci-testing/.github/actions/[email protected]
+ with:
+ name: ${{ matrix.name }}
+ pip_deps: "[all]"
+ pytest_command: ${{ matrix.pytest_command }}
+ pytest_markers: ${{ matrix.markers }}
+ safe_directory: composer
+ composer_package_name: ${{ matrix.composer_package_name }}
+ container: ${{ inputs.container }}
+ aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ code_eval_device: ${{ secrets.CODE_EVAL_DEVICE }}
+ code_eval_url: ${{ secrets.CODE_EVAL_URL }}
+ code_eval_apikey: ${{ secrets.CODE_EVAL_APIKEY }}
+ gcs_key: ${{ secrets.GCS_KEY }}
+ gcs_secret: ${{ secrets.GCS_SECRET }}
+ azure_account_name: ${{ secrets.AZURE_ACCOUNT_NAME }}
+ azure_account_access_key: ${{ secrets.AZURE_ACCOUNT_ACCESS_KEY }}
coverage:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
@@ -96,12 +99,14 @@ jobs:
download-path: artifacts

daily-pytest-gpu:
- uses: mosaicml/ci-testing/.github/workflows/[email protected]
+ name: ${{ matrix.name }}
+ runs-on: ubuntu-latest
+ if: github.repository_owner == 'mosaicml'
strategy:
matrix:
- include:
+ # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
+ # on MCLOUD and not eat up all GPUs at once
+ include:
- name: "gpu-3.11-2.2-1-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -156,19 +161,22 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
- name: ${{ matrix.name }}
- if: github.repository_owner == 'mosaicml'
- with:
- composer_package_name: ${{ matrix.composer_package_name }}
- container: ${{ matrix.container }}
- git_repo: mosaicml/composer
- mcloud-timeout: 5400
- name: ${{ matrix.name }}
- pip_deps: "[all]"
- pytest-command: ${{ matrix.pytest_command }}
- pytest-markers: ${{ matrix.markers }}
- python-version: 3.11
- gpu_num: ${{ matrix.gpu_num }}
- gha-timeout: 5400
- secrets:
- mcloud-api-key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v3
+ - name: Run PR GPU Tests
+ uses: mosaicml/ci-testing/.github/actions/[email protected]
+ with:
+ name: ${{ matrix.name }}
+ composer_package_name: ${{ matrix.composer_package_name }}
+ container: ${{ matrix.container }}
+ git_repo: mosaicml/composer
+ mcloud_timeout: 5400
+ pip_deps: "[all]"
+ pytest_command: ${{ matrix.pytest_command }}
+ pytest_markers: ${{ matrix.markers }}
+ python_version: 3.11
+ gpu_num: ${{ matrix.gpu_num }}
+ mcloud_api_key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
+ gha_timeout: 5400
+ ci_repo_gpu_test_ref: v0.1.2
2 changes: 1 addition & 1 deletion .github/workflows/pr-cpu.yaml
@@ -34,7 +34,7 @@ jobs:
pytest_command: coverage run -m pytest tests/test_docs.py
steps:
- name: Run PR CPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.2.2
with:
name: ${{ matrix.name }}
pip_deps: "[all]"
18 changes: 9 additions & 9 deletions .github/workflows/pr-gpu.yaml
@@ -24,7 +24,7 @@ jobs:
- name: Checkout code
uses: actions/checkout@v3
- name: Run PR GPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.2
with:
name: ${{ matrix.name }}
composer_package_name: ${{ matrix.composer_package_name }}
@@ -34,10 +34,10 @@
pip_deps: "[all]"
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
- python_version: 3.9
+ python_version: 3.11
gpu_num: 1
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
- ci_repo_gpu_test_ref: v0.1.2
+ ci_repo_gpu_test_ref: v0.2.2
pytest-gpu-2:
name: ${{ matrix.name }}
runs-on: ubuntu-latest
@@ -54,7 +54,7 @@
- name: Checkout code
uses: actions/checkout@v3
- name: Run PR GPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.2
with:
name: ${{ matrix.name }}
composer_package_name: ${{ matrix.composer_package_name }}
@@ -64,10 +64,10 @@
pip_deps: "[all]"
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
- python_version: 3.9
+ python_version: 3.11
gpu_num: 2
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
- ci_repo_gpu_test_ref: v0.1.2
+ ci_repo_gpu_test_ref: v0.2.2
pytest-gpu-4:
name: ${{ matrix.name }}
runs-on: ubuntu-latest
@@ -84,7 +84,7 @@
- name: Checkout code
uses: actions/checkout@v3
- name: Run PR GPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.2
with:
name: ${{ matrix.name }}
composer_package_name: ${{ matrix.composer_package_name }}
@@ -94,7 +94,7 @@
pip_deps: "[all]"
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
- python_version: 3.9
+ python_version: 3.11
gpu_num: 4
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
- ci_repo_gpu_test_ref: v0.1.2
+ ci_repo_gpu_test_ref: v0.2.2
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
@@ -33,7 +33,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
- ref: v0.0.9
+ ref: v0.2.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/smoketest
with:
2 changes: 1 addition & 1 deletion composer/_version.py
@@ -3,4 +3,4 @@

"""The Composer Version."""

- __version__ = '0.24.0'
+ __version__ = '0.25.0.dev0'
2 changes: 2 additions & 0 deletions composer/algorithms/selective_backprop/selective_backprop.py
@@ -272,6 +272,8 @@ def apply(self, event: Event, state: State, logger: Optional[Logger] = None) ->
raise RuntimeError('Model must be of type ComposerModel')
self._loss_fn = state.model.loss
return

+ state.batch = state.device.batch_to_device(state.batch)
input, target = state.batch_get_item(key=self.input_key), state.batch_get_item(key=self.target_key)
assert isinstance(input, torch.Tensor) and isinstance(target, torch.Tensor), \
'Multiple tensors not supported for this method yet.'
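Why this line is needed: the trainer hunks below defer the host-to-device copy until the microbatch loop, so an algorithm that touches batch tensors before that point must move the batch itself (the seq_length_warmup hunk below makes the same fix). A minimal sketch of the pattern for a custom algorithm; the class here is hypothetical and not part of this commit:

```python
from composer.core import Algorithm, Event, State

class BatchInspector(Algorithm):  # hypothetical example, not in this commit
    def match(self, event: Event, state: State) -> bool:
        return event == Event.AFTER_DATALOADER

    def apply(self, event: Event, state: State, logger=None) -> None:
        # The trainer no longer guarantees the batch is on the device here,
        # so move it explicitly; batch_to_device is cheap for tensors that
        # are already on the target device.
        state.batch = state.device.batch_to_device(state.batch)
```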
2 changes: 2 additions & 0 deletions composer/algorithms/seq_length_warmup/seq_length_warmup.py
@@ -292,6 +292,8 @@ def _activate_model(self, state: State, logger: Logger) -> None:
while True:
model_inputs = {k: v[:state.device_train_microbatch_size] for k, v in batch_clone.items()}

+ model_inputs = state.device.batch_to_device(model_inputs)

found_cuda_oom = 0 # int since bool BOR not supported on all torch.distributed backends
try:
# Start by running a forward and backward pass
2 changes: 1 addition & 1 deletion composer/core/data_spec.py
@@ -258,7 +258,7 @@ def _default_get_num_samples_in_batch(self, batch: Batch) -> int:
'`get_num_samples_in_batch(your_batch) -> int` method.',
)
dim0_sizes.append(t.shape[0])
- elif isinstance(batch, dict):
+ elif isinstance(batch, Mapping):
for t in batch.values():
if isinstance(t, torch.Tensor):
dim0_sizes.append(t.shape[0])
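Context for this one-word change: `Mapping` from `collections.abc` covers dict-like containers that do not subclass `dict`, so sample counting no longer falls through to the unsupported-type error for them. A small illustration (the `TokenBatch` class is hypothetical):

```python
from collections.abc import Mapping

import torch

class TokenBatch(Mapping):  # hypothetical dict-like batch container
    def __init__(self, data: dict):
        self._data = data
    def __getitem__(self, key):
        return self._data[key]
    def __iter__(self):
        return iter(self._data)
    def __len__(self):
        return len(self._data)

batch = TokenBatch({'input_ids': torch.zeros(8, 128)})
isinstance(batch, dict)     # False -- the old check skipped this batch
isinstance(batch, Mapping)  # True  -- the new check reads its dim-0 size (8)
```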
4 changes: 2 additions & 2 deletions composer/trainer/trainer.py
@@ -2659,7 +2659,6 @@ def _train_loop(self) -> None:
self._rng_state = None
continue

- self.state.batch = self.state.device.batch_to_device(self.state.batch)
self.state.batch = self._train_data_spec.device_transforms(self.state.batch)
rank_num_samples = self._train_data_spec.get_num_samples_in_batch(self.state.batch)
rank_num_tokens = self._train_data_spec.get_num_tokens_in_batch(self.state.batch)
@@ -3071,6 +3070,7 @@ def _train_microbatches(
current_batch = self.state.batch

for microbatch_idx, self.state.batch in enumerate(microbatches):
+ self.state.batch = self.state.device.batch_to_device(self.state.batch)
is_final_microbatch = microbatch_idx + 1 == len(microbatches)
microbatch_loss_dict = self._train_microbatch(use_grad_scaling, current_batch_size, is_final_microbatch)

@@ -3619,7 +3619,6 @@ def _eval_loop(
)

for self.state.batch in self._iter_dataloader(TrainerMode.EVAL):
- self.state.batch = self.state.device.batch_to_device(self.state.batch)
self.state.batch = data_spec.device_transforms(self.state.batch)

# Count the batch size and num tokens before any events run
@@ -3649,6 +3648,7 @@
try:
microbatches = data_spec.split_batch(device_batch, evaluator.device_eval_microbatch_size)
for i, self.state.batch in enumerate(microbatches):
+ self.state.batch = self.state.device.batch_to_device(self.state.batch)
last_microbatch = i == len(microbatches) - 1
skip_metric_update = False
# Distributed samplers pad batches to be the same size. If using a
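Taken together, these four trainer hunks move the host-to-device copy from the top of the train/eval loops into the microbatch loops: the full batch stays on the host while it is split, and each microbatch is copied to the device only when it is about to run. A schematic sketch of the new ordering (a simplified pseudo-trainer, not the actual code; the peak-memory benefit is the apparent intent, not stated in the diff):

```python
# Old: batch -> device, then split (every microbatch on the GPU at once).
# New: split on the host, then move one microbatch at a time.
batch = next(dataloader_iter)                     # still on the host
batch = data_spec.device_transforms(batch)
microbatches = data_spec.split_batch(batch, device_train_microbatch_size)
for i, micro in enumerate(microbatches):
    micro = device.batch_to_device(micro)         # per-microbatch copy
    loss = train_one_microbatch(micro)            # hypothetical helper
```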
2 changes: 1 addition & 1 deletion composer/utils/device.py
@@ -30,7 +30,7 @@ def get_device(device: Optional[Union[str, 'Device']] = None) -> 'Device':
"""
from composer.devices import DeviceCPU, DeviceGPU, DeviceHPU, DeviceMPS, DeviceNeuron, DeviceTPU

- if not device:
+ if device is None:
device = DeviceGPU() if torch.cuda.is_available() else DeviceCPU()
elif isinstance(device, str):
if device.lower() == 'cpu':
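The distinction matters because `not device` is true for any falsy value, not just `None`; with `is None`, only a genuinely omitted argument selects the CUDA-based default. A behavior sketch, inferred from the hunk rather than the docs:

```python
get_device(None)   # -> DeviceGPU() if torch.cuda.is_available() else DeviceCPU()
get_device('')     # old: silently returned the default device
                   # new: falls through to the string branch as an invalid name
get_device('gpu')  # unchanged: explicit string handling
```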
6 changes: 3 additions & 3 deletions composer/utils/dist.py
@@ -498,7 +498,7 @@ def is_initialized():
return dist.is_initialized()


- def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None:
+ def initialize_dist(device: Optional[Union[str, Device]] = None, timeout: float = 300.0) -> None:
"""Initialize the default PyTorch distributed process group.
This function assumes that the following environment variables are set:
@@ -517,9 +517,9 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None:
.. seealso:: :func:`torch.distributed.init_process_group`
Args:
device (str | Device): The device from which the distributed backend is
device (Optional[str | Device] ): The device from which the distributed backend is
interpreted. Either a string corresponding to a device (one of ``'cpu'``,
``'gpu'``, ``'mps'``, or ``'tpu'``) or a :class:`.Device`.
``'gpu'``, ``'mps'``, or ``'tpu'``) or a :class:`.Device`. (default: ``None``)
timeout (float, optional): The timeout for operations executed against the process
group, expressed in seconds. (default: ``300.0``).
"""
23 changes: 21 additions & 2 deletions composer/utils/parallelism.py
@@ -3,7 +3,7 @@

"""Parallelism configs."""

- from dataclasses import dataclass
+ from dataclasses import dataclass, field
from typing import Any, Optional

from torch.distributed._tensor.device_mesh import DeviceMesh
@@ -23,7 +23,6 @@ class FSDPConfig:
cpu_offload: bool = False
data_parallel_shard_degree: int = -1
data_parallel_replicate_degree: Optional[int] = None
- device_mesh: Optional[DeviceMesh] = None
forward_prefetch: bool = False
forward_prefetch_limit: int = 1
ignored_modules: Optional[Any] = None
@@ -41,6 +40,26 @@ class FSDPConfig:
use_orig_params: bool = True
verbose: bool = False

+ _device_mesh: Optional[DeviceMesh] = field(default=None, init=False, repr=False)
+
+ def __init__(self, **kwargs):
+     if 'device_mesh' in kwargs or '_device_mesh' in kwargs:
+         raise ValueError(
+             f'Directly specifying device mesh for FSDP was deprecated in Composer version 0.24.0. ' +
+             f"Please specify 'data_parallel_shard_degree' and/or 'data_parallel_replicate_degree' instead.",
+         )
+
+     for k, v in kwargs.items():
+         setattr(self, k, v)
+
+ @property
+ def device_mesh(self) -> Optional[DeviceMesh]:
+     return self._device_mesh
+
+ @device_mesh.setter
+ def device_mesh(self, value: Optional[DeviceMesh]):
+     self._device_mesh = value
+

@dataclass
class TPConfig:
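What the new block does: `FSDPConfig` stays keyword-constructible, but a directly passed `device_mesh` now raises instead of being accepted, and the mesh lives in a private `_device_mesh` field exposed through a property so Composer can populate it internally from the shard/replicate degrees. A usage sketch (the degree value is illustrative; the import path matches this file):

```python
from composer.utils.parallelism import FSDPConfig

cfg = FSDPConfig(data_parallel_shard_degree=8)  # shard across 8 ranks
print(cfg.device_mesh)  # None until Composer builds and assigns the mesh

# FSDPConfig(device_mesh=mesh)  # would raise ValueError: deprecated since
#                               # 0.24.0; use the shard/replicate degrees
```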
4 changes: 2 additions & 2 deletions docker/README.md
@@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the
<!-- BEGIN_COMPOSER_BUILD_MATRIX -->
| Composer Version | CUDA Support | Docker Tag |
|--------------------|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
- | 0.24.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.24.0` |
- | 0.24.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.24.0_cpu` |
+ | 0.24.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.24.1` |
+ | 0.24.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.24.1_cpu` |
<!-- END_COMPOSER_BUILD_MATRIX -->

**Note**: For a lightweight installation, we recommend using a [MosaicML PyTorch Image](#pytorch-images) and manually