mosaicml · bigning · Sep 10, 2024 · Sep 9, 2024 · Sep 10, 2024
diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py
@@ -608,42 +608,48 @@ def dist_cp_load(
     load_planner: Optional[LoadPlanner] = None,
 ):
     if version.parse(torch.__version__) >= version.parse('2.4.0'):
-        from torch.distributed.checkpoint.utils import CheckpointException
-        try:
-            dist_cp.load(
-                state_dict=state_dict,
-                storage_reader=storage_reader,
-                planner=load_planner,
-            )
-        except CheckpointException as e:
-            checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata
-            if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata:
-                # Torch 2.4 changed the way how state dict is flattened. It broke backward compatibility.
-                # Torch issue: https://github.com/pytorch/pytorch/issues/133923.
-                # We override the traverse_state_dict so that the load planner could
-                # use the old way of flattening the state dict
-                log.debug('Trying to load checkpointing saved before torch 2.4')
-
-                import torch.distributed.checkpoint._nested_dict as nested_dict
-                import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util
-                from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0
-
-                from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse
-
-                nested_dict.traverse_state_dict = backward_compatible_traverse
-                sharded_tensor_util.traverse_state_dict = backward_compatible_traverse
-
+        if version.parse(torch.__version__) < version.parse('2.4.1'):
+            from torch.distributed.checkpoint.utils import CheckpointException
+            try:
                 dist_cp.load(
                     state_dict=state_dict,
                     storage_reader=storage_reader,
                     planner=load_planner,
                 )
-                # Revert the override
-                nested_dict.traverse_state_dict = traverse_2_4_0
-                sharded_tensor_util.traverse_state_dict = traverse_2_4_0
-            else:
-                raise e
-
+            except CheckpointException as e:
+                checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata
+                if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata:
+                    # Torch 2.4 changed how the state dict is flattened, which broke backward compatibility.
+                    # Torch issue: https://github.com/pytorch/pytorch/issues/133923.
+                    # We temporarily override traverse_state_dict so that the load planner can
+                    # flatten the state dict the old (pre-2.4) way.
+                    log.debug('Trying to load checkpointing saved before torch 2.4')
+
+                    import torch.distributed.checkpoint._nested_dict as nested_dict
+                    import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util
+                    from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0
+
+                    from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse
+
+                    nested_dict.traverse_state_dict = backward_compatible_traverse
+                    sharded_tensor_util.traverse_state_dict = backward_compatible_traverse
+
+                    dist_cp.load(
+                        state_dict=state_dict,
+                        storage_reader=storage_reader,
+                        planner=load_planner,
+                    )
+                    # Revert the override
+                    nested_dict.traverse_state_dict = traverse_2_4_0
+                    sharded_tensor_util.traverse_state_dict = traverse_2_4_0
+                else:
+                    raise e
+        else:
+            dist_cp.load(
+                state_dict=state_dict,
+                storage_reader=storage_reader,
+                planner=load_planner,
+            )
     else:
         dist_cp.load_state_dict(
             state_dict=state_dict,

@@ -30,9 +30,9 @@ To install composer, once inside the image, run `pip install mosaicml`.
 <!-- BEGIN_PYTORCH_BUILD_MATRIX -->
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                                                                                                                                                          |
 |----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`                 |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` |
-| Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`             |
+| Ubuntu 20.04   | Base     | 2.4.1             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04`                 |
+| Ubuntu 20.04   | Base     | 2.4.1             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
+| Ubuntu 20.04   | Base     | 2.4.1             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04`             |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                                                                        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                                                                |
 | Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`                                                                                            |

@@ -2,54 +2,54 @@
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   CUDA_VERSION: 12.4.1
-  IMAGE_NAME: torch-2-4-0-cu124
+  IMAGE_NAME: torch-2-4-1-cu124
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.0
+  PYTORCH_VERSION: 2.4.1
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   - ghcr.io/databricks-mosaic/pytorch:latest
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.19.0
+  TORCHVISION_VERSION: 0.19.1
 - AWS_OFI_NCCL_VERSION: v1.11.0-aws
   BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   CUDA_VERSION: 12.4.1
-  IMAGE_NAME: torch-2-4-0-cu124-aws
+  IMAGE_NAME: torch-2-4-1-cu124-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.0
+  PYTORCH_VERSION: 2.4.1
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
-  - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
+  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   - ghcr.io/databricks-mosaic/pytorch:latest-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.19.0
+  TORCHVISION_VERSION: 0.19.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-4-0-cpu
+  IMAGE_NAME: torch-2-4-1-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.0
+  PYTORCH_VERSION: 2.4.1
   TAGS:
-  - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest_cpu
   - ghcr.io/databricks-mosaic/pytorch:latest_cpu
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.19.0
+  TORCHVISION_VERSION: 0.19.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
@@ -202,14 +202,14 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.0
+  PYTORCH_VERSION: 2.4.1
   TAGS:
   - mosaicml/composer:0.24.1
   - ghcr.io/databricks-mosaic/composer:0.24.1
   - mosaicml/composer:latest
   - ghcr.io/databricks-mosaic/composer:latest
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.19.0
+  TORCHVISION_VERSION: 0.19.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.24.1
@@ -220,11 +220,11 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.0
+  PYTORCH_VERSION: 2.4.1
   TAGS:
   - mosaicml/composer:0.24.1_cpu
   - ghcr.io/databricks-mosaic/composer:0.24.1_cpu
   - mosaicml/composer:latest_cpu
   - ghcr.io/databricks-mosaic/composer:latest_cpu
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.19.0
+  TORCHVISION_VERSION: 0.19.1
@@ -20,12 +20,12 @@
 import yaml
 
 PRODUCTION_PYTHON_VERSION = '3.11'
-PRODUCTION_PYTORCH_VERSION = '2.4.0'
+PRODUCTION_PYTORCH_VERSION = '2.4.1'
 
 
 def _get_torchvision_version(pytorch_version: str):
-    if pytorch_version == '2.4.0':
-        return '0.19.0'
+    if pytorch_version == '2.4.1':
+        return '0.19.1'
     if pytorch_version == '2.3.1':
         return '0.18.1'
     if pytorch_version == '2.2.2':
@@ -45,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
     if not use_cuda:
         return ''
-    if pytorch_version == '2.4.0':
+    if pytorch_version == '2.4.1':
         return '12.4.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
@@ -180,7 +180,7 @@ def _write_table(table_tag: str, table_contents: str):
 
 
 def _main():
-    python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
+    python_pytorch_versions = [('3.11', '2.4.1'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
     cuda_options = [True, False]
     stages = ['pytorch_stage']
     interconnects = ['mellanox', 'EFA']  # mellanox is default, EFA needed for AWS

@@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str):
     'tqdm>=4.62.3,<5',
     'torchmetrics>=1.4.0.post0,<1.4.1',
     'torch_optimizer>=0.3.0,<0.4',
-    'torchvision>=0.14.0,<0.19.1',
-    'torch>=2.2.0,<2.4.1',
+    'torchvision>=0.14.0,<0.19.2',
+    'torch>=2.2.0,<2.4.2',
     'requests>=2.26.0,<3',
     'numpy>=1.21.5,<2.2.0',
     'psutil>=5.8.0,<7',