From 7dfd4d321cadbfe567bcb8bd0ce7b38f564ff1f3 Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 6 Sep 2024 13:38:28 -0700 Subject: [PATCH 1/9] Reduce system metrics logging frequency (#3604) --- composer/loggers/mlflow_logger.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index eb2f917a92..660c315c8e 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -174,9 +174,9 @@ def __init__( if log_system_metrics: # Set system metrics sampling interval and samples before logging so that system metrics - # are collected every 5s, and aggregated over 3 samples before being logged - # (logging per 15s). - mlflow.set_system_metrics_samples_before_logging(3) + # are collected every 5s, and aggregated over 6 samples before being logged + # (logging per 30s). + mlflow.set_system_metrics_samples_before_logging(6) mlflow.set_system_metrics_sampling_interval(5) self._rank_zero_only = rank_zero_only @@ -545,7 +545,11 @@ def register_model_with_run_id( """ if self._enabled: from mlflow.exceptions import MlflowException - from mlflow.protos.databricks_pb2 import ALREADY_EXISTS, RESOURCE_ALREADY_EXISTS, ErrorCode + from mlflow.protos.databricks_pb2 import ( + ALREADY_EXISTS, + RESOURCE_ALREADY_EXISTS, + ErrorCode, + ) full_name = f'{self.model_registry_prefix}.{name}' if len(self.model_registry_prefix) > 0 else name From baaeeef8bb84d5ca902f2dda7f1da3e7ec255fdc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Sep 2024 20:46:06 -0700 Subject: [PATCH 2/9] Bump databricks-sdk from 0.31.1 to 0.32.0 (#3608) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 26c44738e4..e8095d3f4e 100644 --- a/setup.py +++ b/setup.py @@ -225,13 +225,13 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['mlflow'] = [ 'mlflow>=2.14.1,<3.0', - 'databricks-sdk==0.31.1', + 'databricks-sdk==0.32.0', 'pynvml>=11.5.0,<12', ] extra_deps['pandas'] = ['pandas>=2.0.0,<3.0'] -extra_deps['databricks'] = ['databricks-sdk==0.31.1'] +extra_deps['databricks'] = ['databricks-sdk==0.32.0'] extra_deps['all'] = {dep for deps in extra_deps.values() for dep in deps} From d6e55203a2684556dd766a26373f428280502cca Mon Sep 17 00:00:00 2001 From: bigning Date: Tue, 10 Sep 2024 11:21:56 -0700 Subject: [PATCH 3/9] torch2.4.1 (#3609) Co-authored-by: Mihir Patel --- composer/utils/checkpoint.py | 70 ++++++++++++++++++--------------- docker/README.md | 6 +-- docker/build_matrix.yaml | 38 +++++++++--------- docker/generate_build_matrix.py | 10 ++--- setup.py | 4 +- 5 files changed, 68 insertions(+), 60 deletions(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index 8c0caea0f4..c6f5af15ca 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -608,42 +608,50 @@ def dist_cp_load( load_planner: Optional[LoadPlanner] = None, ): if version.parse(torch.__version__) >= version.parse('2.4.0'): - from torch.distributed.checkpoint.utils import CheckpointException - try: - dist_cp.load( - state_dict=state_dict, - storage_reader=storage_reader, - planner=load_planner, - ) - except CheckpointException as e: - checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata - if 'state.metadata' in checkpoint_metadata and 
'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata: - # Torch 2.4 changed the way how state dict is flattened. It broke backward compatibility. - # Torch issue: https://github.com/pytorch/pytorch/issues/133923. - # We override the traverse_state_dict so that the load planner could - # use the old way of flattening the state dict - log.debug('Trying to load checkpointing saved before torch 2.4') - - import torch.distributed.checkpoint._nested_dict as nested_dict - import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util - from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0 - - from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse - - nested_dict.traverse_state_dict = backward_compatible_traverse - sharded_tensor_util.traverse_state_dict = backward_compatible_traverse - + if version.parse(torch.__version__) < version.parse('2.4.1'): + # PyTorch 2.4.0 + from torch.distributed.checkpoint.utils import CheckpointException + try: dist_cp.load( state_dict=state_dict, storage_reader=storage_reader, planner=load_planner, ) - # Revert the override - nested_dict.traverse_state_dict = traverse_2_4_0 - sharded_tensor_util.traverse_state_dict = traverse_2_4_0 - else: - raise e - + except CheckpointException as e: + checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata + if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata: + # Torch 2.4 changed the way how state dict is flattened. It broke backward compatibility. + # Torch issue: https://github.com/pytorch/pytorch/issues/133923. + # We override the traverse_state_dict so that the load planner could + # use the old way of flattening the state dict + log.debug('Trying to load checkpointing saved before torch 2.4') + + import torch.distributed.checkpoint._nested_dict as nested_dict + import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util + from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0 + + from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse + + nested_dict.traverse_state_dict = backward_compatible_traverse + sharded_tensor_util.traverse_state_dict = backward_compatible_traverse + + dist_cp.load( + state_dict=state_dict, + storage_reader=storage_reader, + planner=load_planner, + ) + # Revert the override + nested_dict.traverse_state_dict = traverse_2_4_0 + sharded_tensor_util.traverse_state_dict = traverse_2_4_0 + else: + raise e + else: + # PyTorch 2.4.1 + dist_cp.load( + state_dict=state_dict, + storage_reader=storage_reader, + planner=load_planner, + ) else: dist_cp.load_state_dict( state_dict=state_dict, diff --git a/docker/README.md b/docker/README.md index 57c4dc8000..a561d1237d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -30,9 +30,9 @@ To install composer, once inside the image, run `pip install mosaicml`. 
| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.4.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 40b3d6e85f..40edd23992 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -2,54 +2,54 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 - IMAGE_NAME: torch-2-4-0-cu124 + IMAGE_NAME: torch-2-4-1-cu124 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 - mosaicml/pytorch:latest - ghcr.io/databricks-mosaic/pytorch:latest TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 - IMAGE_NAME: torch-2-4-0-cu124-aws + IMAGE_NAME: torch-2-4-1-cu124-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws - mosaicml/pytorch:latest-aws - ghcr.io/databricks-mosaic/pytorch:latest-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-4-0-cpu + IMAGE_NAME: torch-2-4-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' 
PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 - mosaicml/pytorch:latest_cpu - ghcr.io/databricks-mosaic/pytorch:latest_cpu TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 @@ -202,14 +202,14 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - mosaicml/composer:0.24.1 - ghcr.io/databricks-mosaic/composer:0.24.1 - mosaicml/composer:latest - ghcr.io/databricks-mosaic/composer:latest TARGET: composer_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.24.1 @@ -220,11 +220,11 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - mosaicml/composer:0.24.1_cpu - ghcr.io/databricks-mosaic/composer:0.24.1_cpu - mosaicml/composer:latest_cpu - ghcr.io/databricks-mosaic/composer:latest_cpu TARGET: composer_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index c908df52a4..9e47662a4b 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -20,12 +20,12 @@ import yaml PRODUCTION_PYTHON_VERSION = '3.11' -PRODUCTION_PYTORCH_VERSION = '2.4.0' +PRODUCTION_PYTORCH_VERSION = '2.4.1' def _get_torchvision_version(pytorch_version: str): - if pytorch_version == '2.4.0': - return '0.19.0' + if pytorch_version == '2.4.1': + return '0.19.1' if pytorch_version == '2.3.1': return '0.18.1' if pytorch_version == '2.2.2': @@ -45,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool): # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/ if not use_cuda: return '' - if pytorch_version == '2.4.0': + if pytorch_version == '2.4.1': return '12.4.1' if pytorch_version == '2.3.1': return '12.1.1' @@ -180,7 +180,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')] + python_pytorch_versions = [('3.11', '2.4.1'), ('3.11', '2.3.1'), ('3.11', '2.2.2')] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS diff --git a/setup.py b/setup.py index e8095d3f4e..befb663b98 100644 --- a/setup.py +++ b/setup.py @@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str): 'tqdm>=4.62.3,<5', 'torchmetrics>=1.4.0.post0,<1.4.1', 'torch_optimizer>=0.3.0,<0.4', - 'torchvision>=0.14.0,<0.19.1', - 'torch>=2.2.0,<2.4.1', + 'torchvision>=0.14.0,<0.19.2', + 'torch>=2.2.0,<2.4.2', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<2.2.0', 'psutil>=5.8.0,<7', From d8236dbc5ce2cadfbb1640cb502f2f9df4ca6c83 Mon Sep 17 00:00:00 2001 From: bigning Date: Tue, 10 Sep 2024 11:48:26 -0700 Subject: [PATCH 4/9] Test with staging image (#3610) --- .github/workflows/daily.yaml | 18 +++++++++--------- .github/workflows/pr-cpu.yaml | 4 ++-- .github/workflows/pr-gpu.yaml | 6 +++--- 3 files changed, 14 insertions(+), 14 
deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 401c3a7e22..a35c6d42c4 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -31,17 +31,17 @@ jobs: pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.11-2.4 - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.11-2.4-composer - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -56,17 +56,17 @@ jobs: pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.11-2.4 - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.11-2.4-composer - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -120,7 +120,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 1 - name: "gpu-3.11-2.4-1-gpu" - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" @@ -138,7 +138,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 2 - name: "gpu-3.11-2.4-2-gpu" - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" @@ -156,7 +156,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 4 - name: "gpu-3.11-2.4-4-gpu" - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 41572cf190..38ebe9d2c7 100644 --- a/.github/workflows/pr-cpu.yaml +++ 
b/.github/workflows/pr-cpu.yaml @@ -25,11 +25,11 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.4 - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py steps: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index e7c55cbe95..447f824e67 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -16,7 +16,7 @@ jobs: matrix: include: - name: gpu-3.11-2.4-1 - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -45,7 +45,7 @@ jobs: matrix: include: - name: gpu-3.11-2.4-2 - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -75,7 +75,7 @@ jobs: matrix: include: - name: gpu-3.11-2.4-4 - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml From a9cd76852080ed0fb4834b7de685c089940ffec0 Mon Sep 17 00:00:00 2001 From: bigning Date: Tue, 10 Sep 2024 16:13:04 -0700 Subject: [PATCH 5/9] fix 2.4.1 test (#3612) Co-authored-by: Mihir Patel --- composer/trainer/_patch_pytorch.py | 94 ++++++++++++++++-------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index a6056eae62..fcca94d73a 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -946,51 +946,7 @@ def unshard_with_sync(self): if version.parse(torch.__version__) >= version.parse('2.4.0') and version.parse( torch.__version__, ) < version.parse('2.4.1'): - # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks - from torch.distributed.fsdp._flat_param import FlatParamHandle - original_unshard = FlatParamHandle.unshard - - @no_type_check - def unshard_with_sync(self): - """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`. - - This prevents deadlocks when some ranks OOM after the alloc call and others do not. - This is a patched method from pytorch, meant to be called when automicrobatching - turns on hooks in its search process for the optimal non-OOMing microbatch size. - This includes all-gathering the flat parameter - and switching to using the unsharded flat parameter. If the handle does - not need unsharding, then this only switches to using the unsharded - flat parameter. For ``NO_SHARD``, this is a no-op. 
- If FSDP is in :meth:`summon_full_params` and the handle uses parameter - mixed precision, then the parameter is forced to full precision. - """ - if not self.needs_unshard(): - # Even when not needing an unshard, we should switch to using - # the unsharded flat parameter - unsharded_flat_param = ( - self._get_padded_unsharded_flat_param() - if self.uses_sharded_strategy - else self.flat_param - ) - self._use_unsharded_flat_param(unsharded_flat_param) - return - unsharded_flat_param = self._alloc_padded_unsharded_flat_param() - - # Check if any other rank hit an OOM - found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) - - dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX') - found_cuda_oom = found_cuda_oom_tensor.item() - # Signal current rank is still in batch - all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) - - dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN') - - if found_cuda_oom == 1: - raise RuntimeError('CUDA out of memory encountered on a different rank') - padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param) - self._use_unsharded_flat_param(padded_unsharded_flat_param) - + # 2.4.0 only patch # PyTorch issue: https://github.com/pytorch/pytorch/issues/133923 from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE from typing import Mapping, Collection @@ -1046,3 +1002,51 @@ def _traverse_obj(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None: for key, value in state_dict.items(): _traverse_obj((str(key),), value) + +if version.parse(torch.__version__) >= version.parse('2.4.0') and version.parse( + torch.__version__, +) < version.parse('2.4.2'): + # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks + from torch.distributed.fsdp._flat_param import FlatParamHandle + original_unshard = FlatParamHandle.unshard + + @no_type_check + def unshard_with_sync(self): + """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`. + + This prevents deadlocks when some ranks OOM after the alloc call and others do not. + This is a patched method from pytorch, meant to be called when automicrobatching + turns on hooks in its search process for the optimal non-OOMing microbatch size. + This includes all-gathering the flat parameter + and switching to using the unsharded flat parameter. If the handle does + not need unsharding, then this only switches to using the unsharded + flat parameter. For ``NO_SHARD``, this is a no-op. + If FSDP is in :meth:`summon_full_params` and the handle uses parameter + mixed precision, then the parameter is forced to full precision. 
+ """ + if not self.needs_unshard(): + # Even when not needing an unshard, we should switch to using + # the unsharded flat parameter + unsharded_flat_param = ( + self._get_padded_unsharded_flat_param() + if self.uses_sharded_strategy + else self.flat_param + ) + self._use_unsharded_flat_param(unsharded_flat_param) + return + unsharded_flat_param = self._alloc_padded_unsharded_flat_param() + + # Check if any other rank hit an OOM + found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) + + dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX') + found_cuda_oom = found_cuda_oom_tensor.item() + # Signal current rank is still in batch + all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) + + dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN') + + if found_cuda_oom == 1: + raise RuntimeError('CUDA out of memory encountered on a different rank') + padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param) + self._use_unsharded_flat_param(padded_unsharded_flat_param) From fea4a88002639bec3498e9a592d4e1a35814db9b Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 11 Sep 2024 06:30:09 -0700 Subject: [PATCH 6/9] Remove tensor option for _global_exception_occured (#3611) Co-authored-by: Mihir Patel --- composer/loggers/mlflow_logger.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 660c315c8e..3da777a0ec 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -312,10 +312,7 @@ def init(self, state: State, logger: Logger) -> None: if self.run_name is None: self.run_name = state.run_name - if hasattr(state, 'device'): - self._global_exception_occurred = state.device.tensor_to_device(torch.tensor([0], dtype=torch.uint8),) - else: - self._global_exception_occurred = 0 + self._global_exception_occurred = 0 # Store the Composer run name in the MLFlow run tags so it can be retrieved for autoresume self.tags['run_name'] = os.environ.get('RUN_NAME', state.run_name) @@ -615,10 +612,7 @@ def post_close(self): if hasattr(self, 'monitor_process'): # Check if there is an uncaught exception, which means `post_close()` is triggered # due to program crash. - if isinstance(self._global_exception_occurred, torch.Tensor): - finish_with_exception = (self._global_exception_occurred == 1).item() - else: - finish_with_exception = (self._global_exception_occurred == 1) + finish_with_exception = self._global_exception_occurred == 1 if finish_with_exception: self.monitor_process.crash() return From 893f398b348402015d0b69da88d23fe79972cf15 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 16 Sep 2024 10:38:09 -0700 Subject: [PATCH 7/9] Update error message for overwrite to be more user friendly (#3619) --- composer/utils/remote_uploader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/composer/utils/remote_uploader.py b/composer/utils/remote_uploader.py index 33793e7c91..da8b2da3dc 100644 --- a/composer/utils/remote_uploader.py +++ b/composer/utils/remote_uploader.py @@ -92,7 +92,10 @@ def upload_file(retry_index: int = 0): # Good! It shouldn't exist. pass else: - raise FileExistsError(f'Object {remote_file_name} already exists, but overwrite was set to False.') + raise FileExistsError( + f'Object {remote_file_name} already exists, but overwrite was set to False. 
' + 'Please set `save_overwrite` to `True` in Trainer to overwrite the existing file.', + ) log.info(f'Uploading file {local_file_path} to {remote_file_name}') object_store.upload_object( object_name=remote_file_name, From 129dcbe4d2aef094b8b9a9d61a9fa2a3f0476995 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Sep 2024 22:21:00 +0000 Subject: [PATCH 8/9] Update wandb requirement from <0.18,>=0.13.2 to >=0.13.2,<0.19 (#3615) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Saaketh Narayan Co-authored-by: Mihir Patel --- setup.py | 2 +- tests/loggers/test_wandb_logger.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index befb663b98..6cc65702a7 100644 --- a/setup.py +++ b/setup.py @@ -160,7 +160,7 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['wandb'] = [ - 'wandb>=0.13.2,<0.18', + 'wandb>=0.13.2,<0.19', ] extra_deps['comet_ml'] = [ diff --git a/tests/loggers/test_wandb_logger.py b/tests/loggers/test_wandb_logger.py index e190e39663..b0462fc842 100644 --- a/tests/loggers/test_wandb_logger.py +++ b/tests/loggers/test_wandb_logger.py @@ -269,10 +269,10 @@ def test_wandb_log_metrics(test_wandb_logger): eval_metrics_cross_entropy_count = all_run_text.count('metrics/eval/CrossEntropy') train_loss_count = all_run_text.count('loss/train/total') - expected_number_train_loss_count = (dataset_size / batch_size) + 1 # wandb includes it in the file one extra time + expected_number_train_loss_count = (dataset_size / batch_size) * 2 # wandb includes it twice per step expected_number_train_metrics_count = ( dataset_size / batch_size - ) + 2 # wandb includes it in the file two extra times + ) * 2 + 2 # wandb includes it twice per step plus two extra times expected_number_eval_metrics_count = 2 # wandb includes it in the file twice assert train_metrics_accuracy_count == expected_number_train_metrics_count assert train_loss_count == expected_number_train_loss_count From 7597ab6e873e4c77caa5cb87ec06282123af1e60 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 20 Sep 2024 20:22:59 -0700 Subject: [PATCH 9/9] Fix RNG key checking (#3623) --- composer/utils/checkpoint.py | 17 ++++++++++++++++- tests/trainer/test_checkpoint.py | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index c6f5af15ca..b966c918c5 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -148,10 +148,25 @@ def _get_write_mode(name: str) -> str: raise ValueError(f'{name} does not end with a valid tarfile extension.') +def _is_rng_key(key: str, value: tuple) -> bool: + """Check if the key is an RNG key. + + We expect the RNG key to be of the form 'rng.{rank}.cuda|torch|python|numpy'. + This function ensures that we don't accidentally pick up other keys. 
+ """ + starts_with_rng = key.startswith('rng') + ends_with_expected = key.endswith(('cuda', 'torch', 'python', 'numpy')) + three_parts = isinstance(value, tuple) and len(value) == 3 + if starts_with_rng and ends_with_expected and three_parts: + return True + + return False + + def _get_num_ranks_that_saved_rng(metadata: Metadata): rng_inds = [] for field_name, field_value in metadata.planner_data.items(): - if 'rng' in field_name: + if _is_rng_key(field_name, field_value): _, rng_rank_index, _ = field_value rng_inds.append(rng_rank_index) rng_inds = set(rng_inds) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 82629d245b..c2e4929535 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -35,6 +35,7 @@ _COMPOSER_STATES_FILENAME, PartialFilePath, _ensure_valid_checkpoint, + _is_rng_key, _write_checkpoint_file, glob_filter, ) @@ -130,6 +131,23 @@ def _assert_checkpoints_equivalent(file1, file2, atol=0.0, rtol=0.0): assert all(keys_in) or not any(keys_in) +@pytest.mark.parametrize( + 'key,value,expected_result', + [ + ('rng.0.cuda', ('rng', '0', 'cuda'), True), + ('rng.0.torch', ('rng', '0', 'torch'), True), + ('rng.0.numpy', ('rng', '0', 'numpy'), True), + ('rng.0.python', ('rng', '0', 'python'), True), + ('rng.0', ('rng', '0'), False), + ('test.test.rng', ('test', 'test', 'rng'), False), + ('test.rng.test', ('test', 'rng', 'test'), False), + ('test.notatuple.test', 0, False), + ], +) +def test_is_rng_key(key: str, value: tuple, expected_result: bool): + assert _is_rng_key(key, value) == expected_result + + @pytest.mark.parametrize( 'remove_field_paths,filter_params', [
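
For context on the key check above: PATCH 9/9 replaces the previous substring test (`if 'rng' in field_name`) with a structural check on both the flattened key and its planner-data value. The sketch below is a minimal, self-contained illustration of that logic; it mirrors `_is_rng_key` and the way `_get_num_ranks_that_saved_rng` consumes it. The small `planner_data` dict is a hypothetical stand-in for `Metadata.planner_data` from `torch.distributed.checkpoint`, built from the same key/value shapes exercised by the new unit test.

# Standalone sketch of the RNG-key check introduced in PATCH 9/9 ("Fix RNG key checking").
# The planner_data dict is hypothetical; key/value shapes and expected outcomes
# follow the test cases added in tests/trainer/test_checkpoint.py.


def _is_rng_key(key: str, value: object) -> bool:
    """Return True only for keys of the form 'rng.{rank}.cuda|torch|python|numpy'.

    The save planner stores a 3-tuple such as ('rng', '0', 'cuda') for these keys,
    so both the key and the value are inspected to avoid matching unrelated
    entries that merely contain the substring 'rng'.
    """
    starts_with_rng = key.startswith('rng')
    ends_with_expected = key.endswith(('cuda', 'torch', 'python', 'numpy'))
    three_parts = isinstance(value, tuple) and len(value) == 3
    return starts_with_rng and ends_with_expected and three_parts


def _num_ranks_that_saved_rng(planner_data: dict) -> int:
    """Count distinct ranks with saved RNG state (mirrors _get_num_ranks_that_saved_rng)."""
    rng_ranks = {
        value[1]  # middle element is the rank index, e.g. '0' in ('rng', '0', 'cuda')
        for key, value in planner_data.items()
        if _is_rng_key(key, value)
    }
    return len(rng_ranks)


if __name__ == '__main__':
    planner_data = {
        'rng.0.cuda': ('rng', '0', 'cuda'),        # RNG key -> counted
        'rng.0.torch': ('rng', '0', 'torch'),      # same rank -> counted once
        'rng.1.numpy': ('rng', '1', 'numpy'),      # second rank
        'rng.0': ('rng', '0'),                     # only two parts -> ignored
        'test.test.rng': ('test', 'test', 'rng'),  # wrong prefix -> ignored
        'test.notatuple.test': 0,                  # value is not a tuple -> ignored
    }
    assert _is_rng_key('rng.0.cuda', ('rng', '0', 'cuda'))
    assert not _is_rng_key('test.test.rng', ('test', 'test', 'rng'))
    assert _num_ranks_that_saved_rng(planner_data) == 2

Checking the value shape as well as the key is what prevents the false positives the old substring test produced for entries such as 'test.test.rng', while still counting each rank only once across its cuda/torch/python/numpy sub-keys.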