Add Torch 2.4 Tests (mosaicml#3549)

mvpatel2000 · Aug 14, 2024 · ec792aa · ec792aa
1 parent 6664382
commit ec792aa
Show file tree

Hide file tree

Showing 11 changed files with 120 additions and 45 deletions.
diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
@@ -22,18 +22,23 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
-        - name: cpu-3.11-2.2-composer
-          container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-          markers: not daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: composer
         - name: cpu-3.11-2.3
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
+        - name: cpu-3.11-2.4
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+          markers: not daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: mosaicml
+        - name: cpu-3.11-2.4-composer
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+          markers: not daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: composer
         - name: cpu-doctest
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+          container: mosaicml/pytorch:2.4.0_cpu-python3.10-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
@@ -42,18 +47,23 @@ jobs:
           markers: daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
-        - name: daily-cpu-3.11-2.2-composer
-          container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
+        - name: daily-cpu-3.11-2.3
+          container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
-          composer_package_name: composer
-        - name: daily-cpu-3.11-2.3-composer
-          container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+          composer_package_name: mosaicml
+        - name: daily-cpu-3.11-2.4
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+          markers: daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: mosaicml
+        - name: daily-cpu-3.11-2.4-composer
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: composer
         - name: daily-cpu-doctest
-          container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
@@ -104,6 +114,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 1
+        - name: "gpu-3.11-2.4-1-gpu"
+          container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+          pytest_command: "coverage run -m pytest"
+          composer_package_name: "mosaicml"
+          gpu_num: 1
         - name: "gpu-3.11-2.2-2-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -116,6 +132,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 2
+        - name: "gpu-3.11-2.4-2-gpu"
+          container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+          pytest_command: "coverage run -m pytest"
+          composer_package_name: "mosaicml"
+          gpu_num: 2
         - name: "gpu-3.11-2.2-4-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -128,6 +150,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 4
+        - name: "gpu-3.11-2.4-4-gpu"
+          container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+          pytest_command: "coverage run -m pytest"
+          composer_package_name: "mosaicml"
+          gpu_num: 4
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
     with:

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -21,8 +21,12 @@ jobs:
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest
           pytest_command: coverage run -m pytest
+        - name: cpu-3.11-2.4
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+          markers: not daily and not remote and not gpu and not doctest
+          pytest_command: coverage run -m pytest
         - name: cpu-doctest
-          container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
     name: ${{ matrix.name }}

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -13,8 +13,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: gpu-3.11-2.3-1
-          container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+        - name: gpu-3.11-2.4-1
+          container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
           markers: not daily and not remote and gpu and (doctest or not doctest)
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
@@ -39,8 +39,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: gpu-3.11-2.3-2
-          container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+        - name: gpu-3.11-2.4-2
+          container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
           markers: not daily and not remote and gpu and (doctest or not doctest)
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
@@ -66,8 +66,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: gpu-3.11-2.3-4
-          container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+        - name: gpu-3.11-2.4-4
+          container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
           markers: not daily and not remote and gpu and (doctest or not doctest)
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml

diff --git a/composer/core/precision.py b/composer/core/precision.py
@@ -26,9 +26,9 @@ class Precision(StringEnum):
 
     Attributes:
         FP32: Use 32-bit floating-point precision. Compatible with CPUs and GPUs.
-        AMP_FP16: Use :mod:`torch.cuda.amp` with 16-bit floating-point precision. Only compatible
+        AMP_FP16: Use :mod:`torch.amp` with 16-bit floating-point precision. Only compatible
             with GPUs.
-        AMP_BF16: Use :mod:`torch.cuda.amp` with 16-bit BFloat precision.
+        AMP_BF16: Use :mod:`torch.amp` with 16-bit BFloat precision.
         AMP_FP8: Use :mod:`transformer_engine.pytorch.fp8_autocast` with 8-bit FP8 precision.
     """
     FP32 = 'fp32'
@@ -60,15 +60,15 @@ def get_precision_context(
     precision = Precision(precision)
     if precision == Precision.FP32:
         if torch.cuda.is_available():
-            with torch.cuda.amp.autocast(False):
+            with torch.autocast('cuda', enabled=False):
                 yield
         else:
             # Yield here to avoid warnings about cuda not being available
             yield
     elif precision == Precision.AMP_FP16:
         # Prefer CUDA autocast; fall back to XLA autocast when CUDA is unavailable
         if torch.cuda.is_available():
-            with torch.cuda.amp.autocast(True):
+            with torch.autocast('cuda', enabled=True):
                 yield
         elif is_xla_installed():
             with torch.autocast('xla', dtype=torch.float16):
@@ -77,7 +77,7 @@ def get_precision_context(
             yield
     elif precision == Precision.AMP_BF16:
         if torch.cuda.is_available():
-            with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
+            with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True):
                 yield
         elif is_xla_installed():
             with torch.autocast('xla', dtype=torch.bfloat16):

diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
@@ -1725,7 +1725,7 @@ def __init__(
 
         # Suppressing GradScaler warnings as they are always created
         # self._use_grad_scaling() will raise a RuntimeError if grad scaling is not available when it is required
-        warnings.filterwarnings(action='ignore', message='torch.cuda.amp.GradScaler')
+        warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.GradScaler.*')
         self.state.scaler = ClosureGradScaler() if self._use_closures() else GradScaler()
 
         if self.state.fsdp_config is not None:
@@ -2442,6 +2442,17 @@ def fit(
         self.first_batch_complete = False
         self._train_loop()
 
+        # Zero gradients at the end of fit so the same model/optimizer can be reused for further
+        # training with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415
+        for optimizer in self.state.optimizers:
+            try:
+                try:
+                    optimizer.zero_grad(set_to_none=True)
+                except TypeError:
+                    # Presumably for optimizers whose zero_grad() lacks `set_to_none` -- TODO confirm
+                    optimizer.zero_grad()
+            except Exception:
+                # Best-effort cleanup: log and continue, but do not swallow
+                # KeyboardInterrupt / SystemExit as a bare `except:` would.
+                log.exception('Failed to zero out optimizer at end of fit')
+
     def close(self):
         """Shutdown the trainer.
 

diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py
@@ -16,6 +16,7 @@
 import os
 import sys
 import tempfile
+import warnings
 from typing import Any
 from typing import Callable as Callable
 from urllib.parse import urlparse
@@ -53,10 +54,16 @@
 from composer.loggers import Logger as Logger
 from composer.loggers import RemoteUploaderDownloader
 from composer.models import ComposerModel as ComposerModel
-from composer.optim.scheduler import ConstantScheduler
+from composer.optim import ConstantScheduler, DecoupledSGDW
 from composer.utils import LibcloudObjectStore, RemoteUploader
 from composer.utils import ensure_tuple as ensure_tuple
 
+# Ignore certain warnings for doctest
+warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*')  # Expected
+warnings.filterwarnings(action='ignore', message='.*Some weights of Bert*')  # Expected
+warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*')  # DeepSpeed
+warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*')  # DeepSpeed
+
 try:
     import wandb
     _WANDB_INSTALLED = True
@@ -117,7 +124,7 @@
 
 model = SimpleModel(num_channels, num_classes)
 
-optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+optimizer = DecoupledSGDW(model.parameters(), lr=0.001)
 
 scheduler = CosineAnnealingLR(optimizer, T_max=1)
 
@@ -188,7 +195,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any):
     if 'model' not in kwargs:
         kwargs['model'] = model
     if 'optimizers' not in kwargs:
-        kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01)
+        kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01)
     if 'schedulers' not in kwargs:
         kwargs['schedulers'] = ConstantScheduler()
     if 'max_duration' not in kwargs:

diff --git a/docs/source/trainer/checkpointing.rst b/docs/source/trainer/checkpointing.rst
@@ -531,10 +531,10 @@ object stores like WandB or LibCloud, you must still specify a ``load_object_sto
     :skipif: not _LIBCLOUD_INSTALLED
 
     new_trainer = Trainer(
-    model=model,
-    train_dataloader=train_dataloader,
-    max_duration="10ep",
-    load_path="s3://checkpoint-debugging/checkpoints/ep1.pt",
+        model=model,
+        train_dataloader=train_dataloader,
+        max_duration="10ep",
+        load_path="s3://checkpoint-debugging/checkpoints/ep1.pt",
     )
 
     new_trainer.fit()
@@ -547,10 +547,10 @@ Similarly for OCI:
     :skipif: not _LIBCLOUD_INSTALLED
 
     new_trainer = Trainer(
-    model=model,
-    train_dataloader=train_dataloader,
-    max_duration="10ep",
-    load_path="oci://checkpoint-debugging/checkpoints/ep1.pt",
+        model=model,
+        train_dataloader=train_dataloader,
+        max_duration="10ep",
+        load_path="oci://checkpoint-debugging/checkpoints/ep1.pt",
     )
 
     new_trainer.fit()
@@ -564,10 +564,10 @@ Similarly for GCS:
     :skipif: not _LIBCLOUD_INSTALLED
 
     new_trainer = Trainer(
-    model=model,
-    train_dataloader=train_dataloader,
-    max_duration="10ep",
-    load_path="gs://checkpoint-debugging/checkpoints/ep1.pt",
+        model=model,
+        train_dataloader=train_dataloader,
+        max_duration="10ep",
+        load_path="gs://checkpoint-debugging/checkpoints/ep1.pt",
     )
 
     new_trainer.fit()

diff --git a/pyproject.toml b/pyproject.toml
@@ -164,6 +164,10 @@ filterwarnings = [
     '''ignore:The 'transformers' MLflow Models integration.*:UserWarning''',
     # Ignore our own deprecation warnings,
     '''ignore::composer.utils.warnings.VersionedDeprecationWarning''',
+    # Ignore deprecation warning for torch.load
+    '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''',
+    # Ignore deprecation warning as DeepSpeed uses old path
+    '''ignore:.*torch.cuda.amp.custom.*:FutureWarning''',
 ]
 
 # Coverage

diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py
@@ -174,7 +174,18 @@ def test_autoload(
             context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*')
         # Excluding some algorithms leads to errors when loading
         elif exclude:
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0'):
+                if algo_name in [
+                    'BlurPool',
+                    'Factorize',
+                    'GatedLinearUnits',
+                    'GhostBatchNorm',
+                    'SqueezeExcite',
+                ]:
+                    context = pytest.raises(KeyError)  # Optimizer loading is strict
+                elif algo_name == 'Alibi':
+                    context = pytest.raises(RuntimeError)  # Alibi has shape issues
+            elif version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
                 if algo_name in [
                     'Alibi',
                     'BlurPool',

diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py
@@ -996,7 +996,9 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool):
         last_checkpoint = os.path.join('first', 'ep2.pt')
         if missing_key or unexpected_key:
             message = r'Error\(s\) in loading state_dict'
-            if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized():
+            if version.parse(torch.__version__) < version.parse('2.2.3') or (
+                version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()
+            ):
                 # Composer implements strict for older torch versions
                 message = 'Failed to load checkpoint due to'
             error_context = pytest.raises(RuntimeError, match=message)
@@ -1354,7 +1356,9 @@ def test_autoload_algorithm_old_checkpoint(self):
         NoOpModel.__init__ = lambda self, x: None  # type: ignore
         NoOpModel.__repr__ = lambda self: 'NoOpModel(3)'
         error_context = pytest.raises(KeyError, match='module.0.weight')
-        if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized():
+        if version.parse(torch.__version__) < version.parse('2.2.3') or (
+            version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()
+        ):
             error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*')
         with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context:
             trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt'))

diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py
@@ -6,6 +6,7 @@
 import pytest
 import torch
 import torch.nn as nn
+from packaging import version
 from torch import Tensor
 from torch.utils.data import DataLoader
 
@@ -45,7 +46,11 @@ def loss(self, output: Tensor, target: Tensor):
 @pytest.mark.parametrize(
     'ddp_sync_strategy,expected_grads',
     [
-        pytest.param('single_auto_sync', ([-1, None, None], [-1, -1.5, None], [-1, -1.5, None]), id='single_auto_sync'),
+        pytest.param(
+            'single_auto_sync',
+            ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]),
+            id='single_auto_sync',
+        ),
         pytest.param(
             'multi_auto_sync',
             ([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]),
@@ -61,8 +66,9 @@ def test_ddp_sync_strategy(
     rank_zero_seed: int,
     request: pytest.FixtureRequest,
 ):
+    # Only single_auto_sync changed behavior in PyTorch 2.4; keep the other strategies running on older versions.
+    if ddp_sync_strategy == 'single_auto_sync' and version.parse(torch.__version__) < version.parse('2.4.0'):
+        pytest.skip('Before PyTorch 2.4, single_auto_sync did not properly run on last microbatch')
     original_model = MinimalConditionalModel()
-    # ddp = DDP(backend="gloo", find_unused_parameters=True, sync_strategy=ddp_sync_strategy, timeout=5.)
     optimizer = torch.optim.SGD(original_model.parameters(), 0.1)
     device = None
     for item in request.session.items: