Skip to content

Commit

Permalink
Add Torch 2.4 Tests (mosaicml#3549)
Browse files Browse the repository at this point in the history
  • Loading branch information
mvpatel2000 committed Aug 14, 2024
1 parent 6664382 commit ec792aa
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 45 deletions.
52 changes: 40 additions & 12 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,23 @@ jobs:
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.2-composer
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: cpu-3.11-2.3
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.4
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.4-composer
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: cpu-doctest
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
container: mosaicml/pytorch:2.4.0_cpu-python3.10-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
Expand All @@ -42,18 +47,23 @@ jobs:
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.2-composer
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
- name: daily-cpu-3.11-2.3
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: daily-cpu-3.11-2.3-composer
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.4
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.4-composer
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: daily-cpu-doctest
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
Expand Down Expand Up @@ -104,6 +114,12 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.4-1-gpu"
container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.2-2-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -116,6 +132,12 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.4-2-gpu"
container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.2-4-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -128,6 +150,12 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
- name: "gpu-3.11-2.4-4-gpu"
container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ jobs:
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-3.11-2.4
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-doctest
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
name: ${{ matrix.name }}
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ jobs:
strategy:
matrix:
include:
- name: gpu-3.11-2.3-1
container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
- name: gpu-3.11-2.4-1
container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand All @@ -39,8 +39,8 @@ jobs:
strategy:
matrix:
include:
- name: gpu-3.11-2.3-2
container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
- name: gpu-3.11-2.4-2
container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand All @@ -66,8 +66,8 @@ jobs:
strategy:
matrix:
include:
- name: gpu-3.11-2.3-4
container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
- name: gpu-3.11-2.4-4
container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down
10 changes: 5 additions & 5 deletions composer/core/precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ class Precision(StringEnum):
Attributes:
FP32: Use 32-bit floating-point precision. Compatible with CPUs and GPUs.
AMP_FP16: Use :mod:`torch.cuda.amp` with 16-bit floating-point precision. Only compatible
AMP_FP16: Use :mod:`torch.amp` with 16-bit floating-point precision. Only compatible
with GPUs.
AMP_BF16: Use :mod:`torch.cuda.amp` with 16-bit BFloat precision.
AMP_BF16: Use :mod:`torch.amp` with 16-bit BFloat precision.
AMP_FP8: Use :mod:`transformer_engine.pytorch.fp8_autocast` with 8-bit FP8 precision.
"""
FP32 = 'fp32'
Expand Down Expand Up @@ -60,15 +60,15 @@ def get_precision_context(
precision = Precision(precision)
if precision == Precision.FP32:
if torch.cuda.is_available():
with torch.cuda.amp.autocast(False):
with torch.autocast('cuda', enabled=False):
yield
else:
# Yield here to avoid warnings about cuda not being available
yield
elif precision == Precision.AMP_FP16:
# Retain compatibility with PyTorch < 1.10
if torch.cuda.is_available():
with torch.cuda.amp.autocast(True):
with torch.autocast('cuda', enabled=True):
yield
elif is_xla_installed():
with torch.autocast('xla', dtype=torch.float16):
Expand All @@ -77,7 +77,7 @@ def get_precision_context(
yield
elif precision == Precision.AMP_BF16:
if torch.cuda.is_available():
with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True):
yield
elif is_xla_installed():
with torch.autocast('xla', dtype=torch.bfloat16):
Expand Down
13 changes: 12 additions & 1 deletion composer/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1725,7 +1725,7 @@ def __init__(

# Suppressing GradScaler warnings as they are always created
# self._use_grad_scaling() will raise a RuntimeError if grad scaling is not available when it is required
warnings.filterwarnings(action='ignore', message='torch.cuda.amp.GradScaler')
warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.GradScaler.*')
self.state.scaler = ClosureGradScaler() if self._use_closures() else GradScaler()

if self.state.fsdp_config is not None:
Expand Down Expand Up @@ -2442,6 +2442,17 @@ def fit(
self.first_batch_complete = False
self._train_loop()

# Zero gradients at the end of fit so same model/optimizer can be used for further training
# with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415
for optimizer in self.state.optimizers:
try:
try:
optimizer.zero_grad(set_to_none=True)
except TypeError:
optimizer.zero_grad()
except:
log.exception('Failed to zero out optimizer at end of fit')

def close(self):
"""Shutdown the trainer.
Expand Down
13 changes: 10 additions & 3 deletions docs/source/doctest_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import os
import sys
import tempfile
import warnings
from typing import Any
from typing import Callable as Callable
from urllib.parse import urlparse
Expand Down Expand Up @@ -53,10 +54,16 @@
from composer.loggers import Logger as Logger
from composer.loggers import RemoteUploaderDownloader
from composer.models import ComposerModel as ComposerModel
from composer.optim.scheduler import ConstantScheduler
from composer.optim import ConstantScheduler, DecoupledSGDW
from composer.utils import LibcloudObjectStore, RemoteUploader
from composer.utils import ensure_tuple as ensure_tuple

# Ignore certain warnings for doctest
warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*') # Expected
warnings.filterwarnings(action='ignore', message='.*Some weights of Bert*') # Expected
warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') # DeepSpeed
warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*') # DeepSpeed

try:
import wandb
_WANDB_INSTALLED = True
Expand Down Expand Up @@ -117,7 +124,7 @@

model = SimpleModel(num_channels, num_classes)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer = DecoupledSGDW(model.parameters(), lr=0.001)

scheduler = CosineAnnealingLR(optimizer, T_max=1)

Expand Down Expand Up @@ -188,7 +195,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any):
if 'model' not in kwargs:
kwargs['model'] = model
if 'optimizers' not in kwargs:
kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01)
kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01)
if 'schedulers' not in kwargs:
kwargs['schedulers'] = ConstantScheduler()
if 'max_duration' not in kwargs:
Expand Down
24 changes: 12 additions & 12 deletions docs/source/trainer/checkpointing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -531,10 +531,10 @@ object stores like WandB or LibCloud, you must still specify a ``load_object_sto
:skipif: not _LIBCLOUD_INSTALLED

new_trainer = Trainer(
model=model,
train_dataloader=train_dataloader,
max_duration="10ep",
load_path="s3://checkpoint-debugging/checkpoints/ep1.pt",
model=model,
train_dataloader=train_dataloader,
max_duration="10ep",
load_path="s3://checkpoint-debugging/checkpoints/ep1.pt",
)

new_trainer.fit()
Expand All @@ -547,10 +547,10 @@ Similarly for OCI:
:skipif: not _LIBCLOUD_INSTALLED

new_trainer = Trainer(
model=model,
train_dataloader=train_dataloader,
max_duration="10ep",
load_path="oci://checkpoint-debugging/checkpoints/ep1.pt",
model=model,
train_dataloader=train_dataloader,
max_duration="10ep",
load_path="oci://checkpoint-debugging/checkpoints/ep1.pt",
)

new_trainer.fit()
Expand All @@ -564,10 +564,10 @@ Similarly for GCS:
:skipif: not _LIBCLOUD_INSTALLED

new_trainer = Trainer(
model=model,
train_dataloader=train_dataloader,
max_duration="10ep",
load_path="gs://checkpoint-debugging/checkpoints/ep1.pt",
model=model,
train_dataloader=train_dataloader,
max_duration="10ep",
load_path="gs://checkpoint-debugging/checkpoints/ep1.pt",
)

new_trainer.fit()
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ filterwarnings = [
'''ignore:The 'transformers' MLflow Models integration.*:UserWarning''',
# Ignore our own deprecation warnings,
'''ignore::composer.utils.warnings.VersionedDeprecationWarning''',
# Ignore deprecation warning for torch.load
'''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''',
# Ignore deprecation warning as DeepSpeed uses old path
'''ignore:.*torch.cuda.amp.custom.*:FutureWarning''',
]

# Coverage
Expand Down
13 changes: 12 additions & 1 deletion tests/algorithms/test_required_on_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,18 @@ def test_autoload(
context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*')
# Excluding some algorithms leads to errors when loading
elif exclude:
if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
if version.parse(torch.__version__) >= version.parse('2.4.0'):
if algo_name in [
'BlurPool',
'Factorize',
'GatedLinearUnits',
'GhostBatchNorm',
'SqueezeExcite',
]:
context = pytest.raises(KeyError) # Optimizer loading is strict
elif algo_name == 'Alibi':
context = pytest.raises(RuntimeError) # Alibi has shape issues
elif version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
if algo_name in [
'Alibi',
'BlurPool',
Expand Down
8 changes: 6 additions & 2 deletions tests/trainer/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,7 +996,9 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool):
last_checkpoint = os.path.join('first', 'ep2.pt')
if missing_key or unexpected_key:
message = r'Error\(s\) in loading state_dict'
if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized():
if version.parse(torch.__version__) < version.parse('2.2.3') or (
version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()
):
# Composer implements strict for older torch versions
message = 'Failed to load checkpoint due to'
error_context = pytest.raises(RuntimeError, match=message)
Expand Down Expand Up @@ -1354,7 +1356,9 @@ def test_autoload_algorithm_old_checkpoint(self):
NoOpModel.__init__ = lambda self, x: None # type: ignore
NoOpModel.__repr__ = lambda self: 'NoOpModel(3)'
error_context = pytest.raises(KeyError, match='module.0.weight')
if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized():
if version.parse(torch.__version__) < version.parse('2.2.3') or (
version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()
):
error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*')
with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context:
trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt'))
Expand Down
10 changes: 8 additions & 2 deletions tests/trainer/test_ddp_sync_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest
import torch
import torch.nn as nn
from packaging import version
from torch import Tensor
from torch.utils.data import DataLoader

Expand Down Expand Up @@ -45,7 +46,11 @@ def loss(self, output: Tensor, target: Tensor):
@pytest.mark.parametrize(
'ddp_sync_strategy,expected_grads',
[
pytest.param('single_auto_sync', ([-1, None, None], [-1, -1.5, None], [-1, -1.5, None]), id='single_auto_sync'),
pytest.param(
'single_auto_sync',
([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]),
id='single_auto_sync',
),
pytest.param(
'multi_auto_sync',
([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]),
Expand All @@ -61,8 +66,9 @@ def test_ddp_sync_strategy(
rank_zero_seed: int,
request: pytest.FixtureRequest,
):
if version.parse(torch.__version__) < version.parse('2.4.0'):
pytest.skip('Before PyTorch 2.4, single_auto_sync did not properly run on last microbatch')
original_model = MinimalConditionalModel()
# ddp = DDP(backend="gloo", find_unused_parameters=True, sync_strategy=ddp_sync_strategy, timeout=5.)
optimizer = torch.optim.SGD(original_model.parameters(), 0.1)
device = None
for item in request.session.items:
Expand Down

0 comments on commit ec792aa

Please sign in to comment.