Merge branch 'mosaicml:main' into tp-checkpoint
eitanturok authored Sep 4, 2024
2 parents 642e497 + 998198d commit 23477b5
Showing 23 changed files with 200 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
@@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
- ref: v0.0.9
+ ref: v0.2.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yaml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
- ref: v0.0.9
+ ref: v0.2.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/coverage
with:
86 changes: 47 additions & 39 deletions .github/workflows/daily.yaml
@@ -13,7 +13,10 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
daily-pytest-cpu:
- uses: mosaicml/ci-testing/.github/workflows/[email protected]
+ name: ${{ matrix.name }}
+ runs-on: ubuntu-latest
+ container: ${{ matrix.container }}
+ if: github.repository_owner == 'mosaicml'
strategy:
matrix:
include:
@@ -67,26 +70,26 @@ jobs:
markers: daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
- name: ${{ matrix.name }}
- if: github.repository_owner == 'mosaicml'
- with:
- container: ${{ matrix.container }}
- name: ${{ matrix.name }}
- pip_deps: "[all]"
- pytest-command: ${{ matrix.pytest_command }}
- pytest-markers: ${{ matrix.markers }}
- composer_package_name: ${{ matrix.composer_package_name }}
- safe_directory: composer
- secrets:
- aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
- code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
- code-eval-url: ${{ secrets.CODE_EVAL_URL }}
- code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
- gcs-key: ${{ secrets.GCS_KEY }}
- gcs-secret: ${{ secrets.GCS_SECRET }}
- azure-account-name: ${{ secrets.AZURE_ACCOUNT_NAME }}
- azure-account-access-key: ${{ secrets.AZURE_ACCOUNT_ACCESS_KEY }}
+ steps:
+ - name: Run PR CPU Tests
+ uses: mosaicml/ci-testing/.github/actions/[email protected]
+ with:
+ name: ${{ matrix.name }}
+ pip_deps: "[all]"
+ pytest_command: ${{ matrix.pytest_command }}
+ pytest_markers: ${{ matrix.markers }}
+ safe_directory: composer
+ composer_package_name: ${{ matrix.composer_package_name }}
+ container: ${{ inputs.container }}
+ aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ code_eval_device: ${{ secrets.CODE_EVAL_DEVICE }}
+ code_eval_url: ${{ secrets.CODE_EVAL_URL }}
+ code_eval_apikey: ${{ secrets.CODE_EVAL_APIKEY }}
+ gcs_key: ${{ secrets.GCS_KEY }}
+ gcs_secret: ${{ secrets.GCS_SECRET }}
+ azure_account_name: ${{ secrets.AZURE_ACCOUNT_NAME }}
+ azure_account_access_key: ${{ secrets.AZURE_ACCOUNT_ACCESS_KEY }}
coverage:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
@@ -96,12 +99,14 @@ jobs:
download-path: artifacts

daily-pytest-gpu:
- uses: mosaicml/ci-testing/.github/workflows/[email protected]
+ name: ${{ matrix.name }}
+ runs-on: ubuntu-latest
+ if: github.repository_owner == 'mosaicml'
strategy:
matrix:
- include:
+ # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
+ # on MCLOUD and not eat up all GPUs at once
+ include:
- name: "gpu-3.11-2.2-1-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -156,19 +161,22 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
- name: ${{ matrix.name }}
- if: github.repository_owner == 'mosaicml'
- with:
- composer_package_name: ${{ matrix.composer_package_name }}
- container: ${{ matrix.container }}
- git_repo: mosaicml/composer
- mcloud-timeout: 5400
- name: ${{ matrix.name }}
- pip_deps: "[all]"
- pytest-command: ${{ matrix.pytest_command }}
- pytest-markers: ${{ matrix.markers }}
- python-version: 3.11
- gpu_num: ${{ matrix.gpu_num }}
- gha-timeout: 5400
- secrets:
- mcloud-api-key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v3
+ - name: Run PR GPU Tests
+ uses: mosaicml/ci-testing/.github/actions/[email protected]
+ with:
+ name: ${{ matrix.name }}
+ composer_package_name: ${{ matrix.composer_package_name }}
+ container: ${{ matrix.container }}
+ git_repo: mosaicml/composer
+ mcloud_timeout: 5400
+ pip_deps: "[all]"
+ pytest_command: ${{ matrix.pytest_command }}
+ pytest_markers: ${{ matrix.markers }}
+ python_version: 3.11
+ gpu_num: ${{ matrix.gpu_num }}
+ mcloud_api_key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
+ gha_timeout: 5400
+ ci_repo_gpu_test_ref: v0.1.2
2 changes: 1 addition & 1 deletion .github/workflows/pr-cpu.yaml
@@ -34,7 +34,7 @@ jobs:
pytest_command: coverage run -m pytest tests/test_docs.py
steps:
- name: Run PR CPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.2.2
with:
name: ${{ matrix.name }}
pip_deps: "[all]"
18 changes: 9 additions & 9 deletions .github/workflows/pr-gpu.yaml
@@ -24,7 +24,7 @@ jobs:
- name: Checkout code
uses: actions/checkout@v3
- name: Run PR GPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.2
with:
name: ${{ matrix.name }}
composer_package_name: ${{ matrix.composer_package_name }}
@@ -34,10 +34,10 @@
pip_deps: "[all]"
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
- python_version: 3.9
+ python_version: 3.11
gpu_num: 1
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
- ci_repo_gpu_test_ref: v0.1.2
+ ci_repo_gpu_test_ref: v0.2.2
pytest-gpu-2:
name: ${{ matrix.name }}
runs-on: ubuntu-latest
@@ -54,7 +54,7 @@
- name: Checkout code
uses: actions/checkout@v3
- name: Run PR GPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.2
with:
name: ${{ matrix.name }}
composer_package_name: ${{ matrix.composer_package_name }}
@@ -64,10 +64,10 @@
pip_deps: "[all]"
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
- python_version: 3.9
+ python_version: 3.11
gpu_num: 2
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
- ci_repo_gpu_test_ref: v0.1.2
+ ci_repo_gpu_test_ref: v0.2.2
pytest-gpu-4:
name: ${{ matrix.name }}
runs-on: ubuntu-latest
@@ -84,7 +84,7 @@
- name: Checkout code
uses: actions/checkout@v3
- name: Run PR GPU Tests
- uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2
+ uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.2.2
with:
name: ${{ matrix.name }}
composer_package_name: ${{ matrix.composer_package_name }}
@@ -94,7 +94,7 @@
pip_deps: "[all]"
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
- python_version: 3.9
+ python_version: 3.11
gpu_num: 4
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
- ci_repo_gpu_test_ref: v0.1.2
+ ci_repo_gpu_test_ref: v0.2.2
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
@@ -33,7 +33,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
- ref: v0.0.9
+ ref: v0.2.2
path: ./ci-testing
- uses: ./ci-testing/.github/actions/smoketest
with:
2 changes: 1 addition & 1 deletion composer/_version.py
@@ -3,4 +3,4 @@

"""The Composer Version."""

- __version__ = '0.24.0'
+ __version__ = '0.25.0.dev0'
2 changes: 2 additions & 0 deletions composer/algorithms/selective_backprop/selective_backprop.py
@@ -272,6 +272,8 @@ def apply(self, event: Event, state: State, logger: Optional[Logger] = None) ->
raise RuntimeError('Model must be of type ComposerModel')
self._loss_fn = state.model.loss
return

+ state.batch = state.device.batch_to_device(state.batch)
input, target = state.batch_get_item(key=self.input_key), state.batch_get_item(key=self.target_key)
assert isinstance(input, torch.Tensor) and isinstance(target, torch.Tensor), \
'Multiple tensors not supported for this method yet.'
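Why this line is needed: the trainer hunks below defer the host-to-device copy until the microbatch loop, so an algorithm that touches batch tensors before that point must move the batch itself (the seq_length_warmup hunk below makes the same fix). A minimal sketch of the pattern for a custom algorithm; the class here is hypothetical and not part of this commit:

```python
from composer.core import Algorithm, Event, State

class BatchInspector(Algorithm):  # hypothetical example, not in this commit
    def match(self, event: Event, state: State) -> bool:
        return event == Event.AFTER_DATALOADER

    def apply(self, event: Event, state: State, logger=None) -> None:
        # The trainer no longer guarantees the batch is on the device here,
        # so move it explicitly; batch_to_device is cheap for tensors that
        # are already on the target device.
        state.batch = state.device.batch_to_device(state.batch)
```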
2 changes: 2 additions & 0 deletions composer/algorithms/seq_length_warmup/seq_length_warmup.py
@@ -292,6 +292,8 @@ def _activate_model(self, state: State, logger: Logger) -> None:
while True:
model_inputs = {k: v[:state.device_train_microbatch_size] for k, v in batch_clone.items()}

+ model_inputs = state.device.batch_to_device(model_inputs)

found_cuda_oom = 0 # int since bool BOR not supported on all torch.distributed backends
try:
# Start by running a forward and backward pass
2 changes: 1 addition & 1 deletion composer/core/data_spec.py
@@ -258,7 +258,7 @@ def _default_get_num_samples_in_batch(self, batch: Batch) -> int:
'`get_num_samples_in_batch(your_batch) -> int` method.',
)
dim0_sizes.append(t.shape[0])
- elif isinstance(batch, dict):
+ elif isinstance(batch, Mapping):
for t in batch.values():
if isinstance(t, torch.Tensor):
dim0_sizes.append(t.shape[0])
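Context for this one-word change: `Mapping` from `collections.abc` covers dict-like containers that do not subclass `dict`, so sample counting no longer falls through to the unsupported-type error for them. A small illustration (the `TokenBatch` class is hypothetical):

```python
from collections.abc import Mapping

import torch

class TokenBatch(Mapping):  # hypothetical dict-like batch container
    def __init__(self, data: dict):
        self._data = data
    def __getitem__(self, key):
        return self._data[key]
    def __iter__(self):
        return iter(self._data)
    def __len__(self):
        return len(self._data)

batch = TokenBatch({'input_ids': torch.zeros(8, 128)})
isinstance(batch, dict)     # False -- the old check skipped this batch
isinstance(batch, Mapping)  # True  -- the new check reads its dim-0 size (8)
```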
4 changes: 2 additions & 2 deletions composer/trainer/trainer.py
@@ -2659,7 +2659,6 @@ def _train_loop(self) -> None:
self._rng_state = None
continue

- self.state.batch = self.state.device.batch_to_device(self.state.batch)
self.state.batch = self._train_data_spec.device_transforms(self.state.batch)
rank_num_samples = self._train_data_spec.get_num_samples_in_batch(self.state.batch)
rank_num_tokens = self._train_data_spec.get_num_tokens_in_batch(self.state.batch)
@@ -3071,6 +3070,7 @@ def _train_microbatches(
current_batch = self.state.batch

for microbatch_idx, self.state.batch in enumerate(microbatches):
+ self.state.batch = self.state.device.batch_to_device(self.state.batch)
is_final_microbatch = microbatch_idx + 1 == len(microbatches)
microbatch_loss_dict = self._train_microbatch(use_grad_scaling, current_batch_size, is_final_microbatch)

@@ -3619,7 +3619,6 @@ def _eval_loop(
)

for self.state.batch in self._iter_dataloader(TrainerMode.EVAL):
- self.state.batch = self.state.device.batch_to_device(self.state.batch)
self.state.batch = data_spec.device_transforms(self.state.batch)

# Count the batch size and num tokens before any events run
@@ -3649,6 +3648,7 @@
try:
microbatches = data_spec.split_batch(device_batch, evaluator.device_eval_microbatch_size)
for i, self.state.batch in enumerate(microbatches):
+ self.state.batch = self.state.device.batch_to_device(self.state.batch)
last_microbatch = i == len(microbatches) - 1
skip_metric_update = False
# Distributed samplers pad batches to be the same size. If using a
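Taken together, these four trainer hunks move the host-to-device copy from the top of the train/eval loops into the microbatch loops: the full batch stays on the host while it is split, and each microbatch is copied to the device only when it is about to run. A schematic sketch of the new ordering (a simplified pseudo-trainer, not the actual code; the peak-memory benefit is the apparent intent, not stated in the diff):

```python
# Old: batch -> device, then split (every microbatch on the GPU at once).
# New: split on the host, then move one microbatch at a time.
batch = next(dataloader_iter)                     # still on the host
batch = data_spec.device_transforms(batch)
microbatches = data_spec.split_batch(batch, device_train_microbatch_size)
for i, micro in enumerate(microbatches):
    micro = device.batch_to_device(micro)         # per-microbatch copy
    loss = train_one_microbatch(micro)            # hypothetical helper
```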
2 changes: 1 addition & 1 deletion composer/utils/device.py
@@ -30,7 +30,7 @@ def get_device(device: Optional[Union[str, 'Device']] = None) -> 'Device':
"""
from composer.devices import DeviceCPU, DeviceGPU, DeviceHPU, DeviceMPS, DeviceNeuron, DeviceTPU

- if not device:
+ if device is None:
device = DeviceGPU() if torch.cuda.is_available() else DeviceCPU()
elif isinstance(device, str):
if device.lower() == 'cpu':
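The distinction matters because `not device` is true for any falsy value, not just `None`; with `is None`, only a genuinely omitted argument selects the CUDA-based default. A behavior sketch, inferred from the hunk rather than the docs:

```python
get_device(None)   # -> DeviceGPU() if torch.cuda.is_available() else DeviceCPU()
get_device('')     # old: silently returned the default device
                   # new: falls through to the string branch as an invalid name
get_device('gpu')  # unchanged: explicit string handling
```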
6 changes: 3 additions & 3 deletions composer/utils/dist.py
@@ -498,7 +498,7 @@ def is_initialized():
return dist.is_initialized()


- def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None:
+ def initialize_dist(device: Optional[Union[str, Device]] = None, timeout: float = 300.0) -> None:
"""Initialize the default PyTorch distributed process group.
This function assumes that the following environment variables are set:
@@ -517,9 +517,9 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None:
.. seealso:: :func:`torch.distributed.init_process_group`
Args:
device (str | Device): The device from which the distributed backend is
device (Optional[str | Device] ): The device from which the distributed backend is
interpreted. Either a string corresponding to a device (one of ``'cpu'``,
``'gpu'``, ``'mps'``, or ``'tpu'``) or a :class:`.Device`.
``'gpu'``, ``'mps'``, or ``'tpu'``) or a :class:`.Device`. (default: ``None``)
timeout (float, optional): The timeout for operations executed against the process
group, expressed in seconds. (default: ``300.0``).
"""
23 changes: 21 additions & 2 deletions composer/utils/parallelism.py
@@ -3,7 +3,7 @@

"""Parallelism configs."""

- from dataclasses import dataclass
+ from dataclasses import dataclass, field
from typing import Any, Optional

from torch.distributed._tensor.device_mesh import DeviceMesh
@@ -23,7 +23,6 @@ class FSDPConfig:
cpu_offload: bool = False
data_parallel_shard_degree: int = -1
data_parallel_replicate_degree: Optional[int] = None
- device_mesh: Optional[DeviceMesh] = None
forward_prefetch: bool = False
forward_prefetch_limit: int = 1
ignored_modules: Optional[Any] = None
@@ -41,6 +40,26 @@ class FSDPConfig:
use_orig_params: bool = True
verbose: bool = False

+ _device_mesh: Optional[DeviceMesh] = field(default=None, init=False, repr=False)
+
+ def __init__(self, **kwargs):
+     if 'device_mesh' in kwargs or '_device_mesh' in kwargs:
+         raise ValueError(
+             f'Directly specifying device mesh for FSDP was deprecated in Composer version 0.24.0. ' +
+             f"Please specify 'data_parallel_shard_degree' and/or 'data_parallel_replicate_degree' instead.",
+         )
+
+     for k, v in kwargs.items():
+         setattr(self, k, v)
+
+ @property
+ def device_mesh(self) -> Optional[DeviceMesh]:
+     return self._device_mesh
+
+ @device_mesh.setter
+ def device_mesh(self, value: Optional[DeviceMesh]):
+     self._device_mesh = value
+

@dataclass
class TPConfig:
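What the new block does: `FSDPConfig` stays keyword-constructible, but a directly passed `device_mesh` now raises instead of being accepted, and the mesh lives in a private `_device_mesh` field exposed through a property so Composer can populate it internally from the shard/replicate degrees. A usage sketch (the degree value is illustrative; the import path matches this file):

```python
from composer.utils.parallelism import FSDPConfig

cfg = FSDPConfig(data_parallel_shard_degree=8)  # shard across 8 ranks
print(cfg.device_mesh)  # None until Composer builds and assigns the mesh

# FSDPConfig(device_mesh=mesh)  # would raise ValueError: deprecated since
#                               # 0.24.0; use the shard/replicate degrees
```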
4 changes: 2 additions & 2 deletions docker/README.md
@@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the
<!-- BEGIN_COMPOSER_BUILD_MATRIX -->
| Composer Version | CUDA Support | Docker Tag |
|--------------------|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
- | 0.24.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.24.0` |
- | 0.24.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.24.0_cpu` |
+ | 0.24.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.24.1` |
+ | 0.24.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.24.1_cpu` |
<!-- END_COMPOSER_BUILD_MATRIX -->

**Note**: For a lightweight installation, we recommend using a [MosaicML PyTorch Image](#pytorch-images) and manually