diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 38d7927ad4..320c1a5fe6 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -34,7 +34,7 @@ jobs: pytest_command: coverage run -m pytest composer_package_name: composer - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -59,7 +59,7 @@ jobs: pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-3.11-2.3-composer - container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer @@ -119,7 +119,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 1 - name: "gpu-3.11-2.3-1-gpu" - container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" @@ -137,7 +137,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 2 - name: "gpu-3.11-2.3-2-gpu" - container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" @@ -155,7 +155,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 4 - name: "gpu-3.11-2.3-4-gpu" - container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 78aee76241..f32a589160 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,11 +22,11 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index fdc9f014f5..3cb434ca58 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -14,7 +14,7 @@ jobs: matrix: include: - name: gpu-3.11-2.3-1 - container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -40,7 +40,7 @@ jobs: matrix: include: - name: gpu-3.11-2.3-2 - container: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -67,7 +67,7 @@ jobs: matrix: include: - name: gpu-3.11-2.3-4 - container: 
mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index b9416c445f..e4269cebd3 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -104,6 +104,25 @@ def patch_pytorch(): DeviceMesh.__getitem__ = device_mesh__getitem__ DeviceMesh.__init__ = device_mesh__init__ + elif version.parse(torch.__version__) < version.parse('2.3.2'): + # Monkey patch for torch < 2.3.2, i.e. torch == 2.3.1 + + # Issue: https://github.com/pytorch/pytorch/issues/122946 + # - PR: https://github.com/pytorch/pytorch/pull/125336 + from torch.distributed.checkpoint import state_dict + + state_dict._verify_options = _verify_options + state_dict._get_model_state_dict = _get_model_state_dict + state_dict._load_model_state_dict = _load_model_state_dict + + # Monkeypatch for ND child submeshes + # PR: https://github.com/pytorch/pytorch/pull/119752 + from torch.distributed.device_mesh import DeviceMesh, _MeshEnv + + _MeshEnv.create_child_mesh = create_child_mesh + DeviceMesh.__getitem__ = device_mesh__getitem__ + DeviceMesh.__init__ = device_mesh__init__ + def build_metadata( self, diff --git a/setup.py b/setup.py index a17a722a19..0b40fe0c72 100644 --- a/setup.py +++ b/setup.py @@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str): 'tqdm>=4.62.3,<5', 'torchmetrics>=0.10.0,<1.3.3', 'torch_optimizer>=0.3.0,<0.4', - 'torchvision>=0.13.1,<0.18.1', - 'torch>=2.1.2,<2.3.1', + 'torchvision>=0.13.1,<0.18.2', + 'torch>=2.1.2,<2.3.2', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<1.27.0', 'psutil>=5.8.0,<6',