Skip to content

Commit

Permalink
Merge branch 'main' into mlflow-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
irenedea authored Sep 10, 2024
2 parents 11b5306 + d8236db commit 0f6f0ae
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 76 deletions.
18 changes: 9 additions & 9 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@ jobs:
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.4
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.4-composer
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: cpu-doctest
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
Expand All @@ -56,17 +56,17 @@ jobs:
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.4
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.4-composer
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: daily-cpu-doctest
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
Expand Down Expand Up @@ -120,7 +120,7 @@ jobs:
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.4-1-gpu"
container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
Expand All @@ -138,7 +138,7 @@ jobs:
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.4-2-gpu"
container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
Expand All @@ -156,7 +156,7 @@ jobs:
composer_package_name: "mosaicml"
gpu_num: 4
- name: "gpu-3.11-2.4-4-gpu"
container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ jobs:
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-3.11-2.4
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-doctest
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
steps:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
matrix:
include:
- name: gpu-3.11-2.4-1
container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down Expand Up @@ -45,7 +45,7 @@ jobs:
matrix:
include:
- name: gpu-3.11-2.4-2
container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down Expand Up @@ -75,7 +75,7 @@ jobs:
matrix:
include:
- name: gpu-3.11-2.4-4
container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down
70 changes: 39 additions & 31 deletions composer/utils/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,42 +608,50 @@ def dist_cp_load(
load_planner: Optional[LoadPlanner] = None,
):
if version.parse(torch.__version__) >= version.parse('2.4.0'):
from torch.distributed.checkpoint.utils import CheckpointException
try:
dist_cp.load(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
)
except CheckpointException as e:
checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata
if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata:
# Torch 2.4 changed how the state dict is flattened, which broke backward compatibility.
# Torch issue: https://github.com/pytorch/pytorch/issues/133923.
# We override traverse_state_dict so that the load planner can
# use the old way of flattening the state dict.
log.debug('Trying to load checkpointing saved before torch 2.4')

import torch.distributed.checkpoint._nested_dict as nested_dict
import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util
from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0

from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse

nested_dict.traverse_state_dict = backward_compatible_traverse
sharded_tensor_util.traverse_state_dict = backward_compatible_traverse

if version.parse(torch.__version__) < version.parse('2.4.1'):
# PyTorch 2.4.0
from torch.distributed.checkpoint.utils import CheckpointException
try:
dist_cp.load(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
)
# Revert the override
nested_dict.traverse_state_dict = traverse_2_4_0
sharded_tensor_util.traverse_state_dict = traverse_2_4_0
else:
raise e

except CheckpointException as e:
checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata
if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata:
# Torch 2.4 changed how the state dict is flattened, which broke backward compatibility.
# Torch issue: https://github.com/pytorch/pytorch/issues/133923.
# We override traverse_state_dict so that the load planner can
# use the old way of flattening the state dict.
log.debug('Trying to load checkpointing saved before torch 2.4')

import torch.distributed.checkpoint._nested_dict as nested_dict
import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util
from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0

from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse

nested_dict.traverse_state_dict = backward_compatible_traverse
sharded_tensor_util.traverse_state_dict = backward_compatible_traverse

dist_cp.load(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
)
# Revert the override
nested_dict.traverse_state_dict = traverse_2_4_0
sharded_tensor_util.traverse_state_dict = traverse_2_4_0
else:
raise e
else:
# PyTorch 2.4.1 and later (this branch handles every version >= 2.4.1)
dist_cp.load(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
)
else:
dist_cp.load_state_dict(
state_dict=state_dict,
Expand Down
6 changes: 3 additions & 3 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ To install composer, once inside the image, run `pip install mosaicml`.
<!-- BEGIN_PYTORCH_BUILD_MATRIX -->
| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags |
|----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.4.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` |
Expand Down
38 changes: 19 additions & 19 deletions docker/build_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,54 @@
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
CUDA_VERSION: 12.4.1
IMAGE_NAME: torch-2-4-0-cu124
IMAGE_NAME: torch-2-4-1-cu124
MOFED_VERSION: latest-23.10
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.4.0
PYTORCH_VERSION: 2.4.1
TAGS:
- mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
- ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
- mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
- ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
- mosaicml/pytorch:latest
- ghcr.io/databricks-mosaic/pytorch:latest
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.19.0
TORCHVISION_VERSION: 0.19.1
- AWS_OFI_NCCL_VERSION: v1.11.0-aws
BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
CUDA_VERSION: 12.4.1
IMAGE_NAME: torch-2-4-0-cu124-aws
IMAGE_NAME: torch-2-4-1-cu124-aws
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.4.0
PYTORCH_VERSION: 2.4.1
TAGS:
- mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
- ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
- mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
- ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
- mosaicml/pytorch:latest-aws
- ghcr.io/databricks-mosaic/pytorch:latest-aws
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.19.0
TORCHVISION_VERSION: 0.19.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
CUDA_VERSION: ''
IMAGE_NAME: torch-2-4-0-cpu
IMAGE_NAME: torch-2-4-1-cpu
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.4.0
PYTORCH_VERSION: 2.4.1
TAGS:
- mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
- ghcr.io/databricks-mosaic/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
- mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
- ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
- mosaicml/pytorch:latest_cpu
- ghcr.io/databricks-mosaic/pytorch:latest_cpu
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.19.0
TORCHVISION_VERSION: 0.19.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.1
Expand Down Expand Up @@ -202,14 +202,14 @@
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.4.0
PYTORCH_VERSION: 2.4.1
TAGS:
- mosaicml/composer:0.24.1
- ghcr.io/databricks-mosaic/composer:0.24.1
- mosaicml/composer:latest
- ghcr.io/databricks-mosaic/composer:latest
TARGET: composer_stage
TORCHVISION_VERSION: 0.19.0
TORCHVISION_VERSION: 0.19.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.24.1
Expand All @@ -220,11 +220,11 @@
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.4.0
PYTORCH_VERSION: 2.4.1
TAGS:
- mosaicml/composer:0.24.1_cpu
- ghcr.io/databricks-mosaic/composer:0.24.1_cpu
- mosaicml/composer:latest_cpu
- ghcr.io/databricks-mosaic/composer:latest_cpu
TARGET: composer_stage
TORCHVISION_VERSION: 0.19.0
TORCHVISION_VERSION: 0.19.1
10 changes: 5 additions & 5 deletions docker/generate_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
import yaml

PRODUCTION_PYTHON_VERSION = '3.11'
PRODUCTION_PYTORCH_VERSION = '2.4.0'
PRODUCTION_PYTORCH_VERSION = '2.4.1'


def _get_torchvision_version(pytorch_version: str):
if pytorch_version == '2.4.0':
return '0.19.0'
if pytorch_version == '2.4.1':
return '0.19.1'
if pytorch_version == '2.3.1':
return '0.18.1'
if pytorch_version == '2.2.2':
Expand All @@ -45,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
# From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
if not use_cuda:
return ''
if pytorch_version == '2.4.0':
if pytorch_version == '2.4.1':
return '12.4.1'
if pytorch_version == '2.3.1':
return '12.1.1'
Expand Down Expand Up @@ -180,7 +180,7 @@ def _write_table(table_tag: str, table_contents: str):


def _main():
python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
python_pytorch_versions = [('3.11', '2.4.1'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
cuda_options = [True, False]
stages = ['pytorch_stage']
interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS
Expand Down
8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str):
'tqdm>=4.62.3,<5',
'torchmetrics>=1.4.0.post0,<1.4.1',
'torch_optimizer>=0.3.0,<0.4',
'torchvision>=0.14.0,<0.19.1',
'torch>=2.2.0,<2.4.1',
'torchvision>=0.14.0,<0.19.2',
'torch>=2.2.0,<2.4.2',
'requests>=2.26.0,<3',
'numpy>=1.21.5,<2.2.0',
'psutil>=5.8.0,<7',
Expand Down Expand Up @@ -225,13 +225,13 @@ def package_files(prefix: str, directory: str, extension: str):

extra_deps['mlflow'] = [
'mlflow>=2.14.1,<3.0',
'databricks-sdk==0.31.1',
'databricks-sdk==0.32.0',
'pynvml>=11.5.0,<12',
]

extra_deps['pandas'] = ['pandas>=2.0.0,<3.0']

extra_deps['databricks'] = ['databricks-sdk==0.31.1']
extra_deps['databricks'] = ['databricks-sdk==0.32.0']

extra_deps['all'] = {dep for deps in extra_deps.values() for dep in deps}

Expand Down

0 comments on commit 0f6f0ae

Please sign in to comment.