From 7dfd4d321cadbfe567bcb8bd0ce7b38f564ff1f3 Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 6 Sep 2024 13:38:28 -0700 Subject: [PATCH 1/9] Reduce system metrics logging frequency (#3604) --- composer/loggers/mlflow_logger.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index eb2f917a92..660c315c8e 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -174,9 +174,9 @@ def __init__( if log_system_metrics: # Set system metrics sampling interval and samples before logging so that system metrics - # are collected every 5s, and aggregated over 3 samples before being logged - # (logging per 15s). - mlflow.set_system_metrics_samples_before_logging(3) + # are collected every 5s, and aggregated over 6 samples before being logged + # (logging per 30s). + mlflow.set_system_metrics_samples_before_logging(6) mlflow.set_system_metrics_sampling_interval(5) self._rank_zero_only = rank_zero_only @@ -545,7 +545,11 @@ def register_model_with_run_id( """ if self._enabled: from mlflow.exceptions import MlflowException - from mlflow.protos.databricks_pb2 import ALREADY_EXISTS, RESOURCE_ALREADY_EXISTS, ErrorCode + from mlflow.protos.databricks_pb2 import ( + ALREADY_EXISTS, + RESOURCE_ALREADY_EXISTS, + ErrorCode, + ) full_name = f'{self.model_registry_prefix}.{name}' if len(self.model_registry_prefix) > 0 else name From baaeeef8bb84d5ca902f2dda7f1da3e7ec255fdc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 8 Sep 2024 20:46:06 -0700 Subject: [PATCH 2/9] Bump databricks-sdk from 0.31.1 to 0.32.0 (#3608) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 26c44738e4..e8095d3f4e 100644 --- a/setup.py +++ b/setup.py @@ -225,13 +225,13 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['mlflow'] = [ 'mlflow>=2.14.1,<3.0', - 'databricks-sdk==0.31.1', + 'databricks-sdk==0.32.0', 'pynvml>=11.5.0,<12', ] extra_deps['pandas'] = ['pandas>=2.0.0,<3.0'] -extra_deps['databricks'] = ['databricks-sdk==0.31.1'] +extra_deps['databricks'] = ['databricks-sdk==0.32.0'] extra_deps['all'] = {dep for deps in extra_deps.values() for dep in deps} From d6e55203a2684556dd766a26373f428280502cca Mon Sep 17 00:00:00 2001 From: bigning Date: Tue, 10 Sep 2024 11:21:56 -0700 Subject: [PATCH 3/9] torch2.4.1 (#3609) Co-authored-by: Mihir Patel --- composer/utils/checkpoint.py | 70 ++++++++++++++++++--------------- docker/README.md | 6 +-- docker/build_matrix.yaml | 38 +++++++++--------- docker/generate_build_matrix.py | 10 ++--- setup.py | 4 +- 5 files changed, 68 insertions(+), 60 deletions(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index 8c0caea0f4..c6f5af15ca 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -608,42 +608,50 @@ def dist_cp_load( load_planner: Optional[LoadPlanner] = None, ): if version.parse(torch.__version__) >= version.parse('2.4.0'): - from torch.distributed.checkpoint.utils import CheckpointException - try: - dist_cp.load( - state_dict=state_dict, - storage_reader=storage_reader, - planner=load_planner, - ) - except CheckpointException as e: - checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata - if 'state.metadata' in checkpoint_metadata and 
'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata: - # Torch 2.4 changed the way how state dict is flattened. It broke backward compatibility. - # Torch issue: https://github.com/pytorch/pytorch/issues/133923. - # We override the traverse_state_dict so that the load planner could - # use the old way of flattening the state dict - log.debug('Trying to load checkpointing saved before torch 2.4') - - import torch.distributed.checkpoint._nested_dict as nested_dict - import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util - from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0 - - from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse - - nested_dict.traverse_state_dict = backward_compatible_traverse - sharded_tensor_util.traverse_state_dict = backward_compatible_traverse - + if version.parse(torch.__version__) < version.parse('2.4.1'): + # PyTorch 2.4.0 + from torch.distributed.checkpoint.utils import CheckpointException + try: dist_cp.load( state_dict=state_dict, storage_reader=storage_reader, planner=load_planner, ) - # Revert the override - nested_dict.traverse_state_dict = traverse_2_4_0 - sharded_tensor_util.traverse_state_dict = traverse_2_4_0 - else: - raise e - + except CheckpointException as e: + checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata + if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata: + # Torch 2.4 changed the way how state dict is flattened. It broke backward compatibility. + # Torch issue: https://github.com/pytorch/pytorch/issues/133923. + # We override the traverse_state_dict so that the load planner could + # use the old way of flattening the state dict + log.debug('Trying to load checkpointing saved before torch 2.4') + + import torch.distributed.checkpoint._nested_dict as nested_dict + import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util + from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0 + + from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse + + nested_dict.traverse_state_dict = backward_compatible_traverse + sharded_tensor_util.traverse_state_dict = backward_compatible_traverse + + dist_cp.load( + state_dict=state_dict, + storage_reader=storage_reader, + planner=load_planner, + ) + # Revert the override + nested_dict.traverse_state_dict = traverse_2_4_0 + sharded_tensor_util.traverse_state_dict = traverse_2_4_0 + else: + raise e + else: + # PyTorch 2.4.1 + dist_cp.load( + state_dict=state_dict, + storage_reader=storage_reader, + planner=load_planner, + ) else: dist_cp.load_state_dict( state_dict=state_dict, diff --git a/docker/README.md b/docker/README.md index 57c4dc8000..a561d1237d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -30,9 +30,9 @@ To install composer, once inside the image, run `pip install mosaicml`. 
| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.4.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 40b3d6e85f..40edd23992 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -2,54 +2,54 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 - IMAGE_NAME: torch-2-4-0-cu124 + IMAGE_NAME: torch-2-4-1-cu124 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 - mosaicml/pytorch:latest - ghcr.io/databricks-mosaic/pytorch:latest TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 CUDA_VERSION: 12.4.1 - IMAGE_NAME: torch-2-4-0-cu124-aws + IMAGE_NAME: torch-2-4-1-cu124-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws - mosaicml/pytorch:latest-aws - ghcr.io/databricks-mosaic/pytorch:latest-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-4-0-cpu + IMAGE_NAME: torch-2-4-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' 
PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 - mosaicml/pytorch:latest_cpu - ghcr.io/databricks-mosaic/pytorch:latest_cpu TARGET: pytorch_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 @@ -202,14 +202,14 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - mosaicml/composer:0.24.1 - ghcr.io/databricks-mosaic/composer:0.24.1 - mosaicml/composer:latest - ghcr.io/databricks-mosaic/composer:latest TARGET: composer_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.24.1 @@ -220,11 +220,11 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.4.0 + PYTORCH_VERSION: 2.4.1 TAGS: - mosaicml/composer:0.24.1_cpu - ghcr.io/databricks-mosaic/composer:0.24.1_cpu - mosaicml/composer:latest_cpu - ghcr.io/databricks-mosaic/composer:latest_cpu TARGET: composer_stage - TORCHVISION_VERSION: 0.19.0 + TORCHVISION_VERSION: 0.19.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index c908df52a4..9e47662a4b 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -20,12 +20,12 @@ import yaml PRODUCTION_PYTHON_VERSION = '3.11' -PRODUCTION_PYTORCH_VERSION = '2.4.0' +PRODUCTION_PYTORCH_VERSION = '2.4.1' def _get_torchvision_version(pytorch_version: str): - if pytorch_version == '2.4.0': - return '0.19.0' + if pytorch_version == '2.4.1': + return '0.19.1' if pytorch_version == '2.3.1': return '0.18.1' if pytorch_version == '2.2.2': @@ -45,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool): # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/ if not use_cuda: return '' - if pytorch_version == '2.4.0': + if pytorch_version == '2.4.1': return '12.4.1' if pytorch_version == '2.3.1': return '12.1.1' @@ -180,7 +180,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')] + python_pytorch_versions = [('3.11', '2.4.1'), ('3.11', '2.3.1'), ('3.11', '2.2.2')] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS diff --git a/setup.py b/setup.py index e8095d3f4e..befb663b98 100644 --- a/setup.py +++ b/setup.py @@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str): 'tqdm>=4.62.3,<5', 'torchmetrics>=1.4.0.post0,<1.4.1', 'torch_optimizer>=0.3.0,<0.4', - 'torchvision>=0.14.0,<0.19.1', - 'torch>=2.2.0,<2.4.1', + 'torchvision>=0.14.0,<0.19.2', + 'torch>=2.2.0,<2.4.2', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<2.2.0', 'psutil>=5.8.0,<7', From d8236dbc5ce2cadfbb1640cb502f2f9df4ca6c83 Mon Sep 17 00:00:00 2001 From: bigning Date: Tue, 10 Sep 2024 11:48:26 -0700 Subject: [PATCH 4/9] Test with staging image (#3610) --- .github/workflows/daily.yaml | 18 +++++++++--------- .github/workflows/pr-cpu.yaml | 4 ++-- .github/workflows/pr-gpu.yaml | 6 +++--- 3 files changed, 14 insertions(+), 14 
deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 401c3a7e22..a35c6d42c4 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -31,17 +31,17 @@ jobs: pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.11-2.4 - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: cpu-3.11-2.4-composer - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -56,17 +56,17 @@ jobs: pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.11-2.4 - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: daily-cpu-3.11-2.4-composer - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -120,7 +120,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 1 - name: "gpu-3.11-2.4-1-gpu" - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" @@ -138,7 +138,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 2 - name: "gpu-3.11-2.4-2-gpu" - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" @@ -156,7 +156,7 @@ jobs: composer_package_name: "mosaicml" gpu_num: 4 - name: "gpu-3.11-2.4-4-gpu" - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 41572cf190..38ebe9d2c7 100644 --- a/.github/workflows/pr-cpu.yaml +++ 
b/.github/workflows/pr-cpu.yaml @@ -25,11 +25,11 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.4 - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py steps: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index e7c55cbe95..447f824e67 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -16,7 +16,7 @@ jobs: matrix: include: - name: gpu-3.11-2.4-1 - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -45,7 +45,7 @@ jobs: matrix: include: - name: gpu-3.11-2.4-2 - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -75,7 +75,7 @@ jobs: matrix: include: - name: gpu-3.11-2.4-4 - container: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml From a9cd76852080ed0fb4834b7de685c089940ffec0 Mon Sep 17 00:00:00 2001 From: bigning Date: Tue, 10 Sep 2024 16:13:04 -0700 Subject: [PATCH 5/9] fix 2.4.1 test (#3612) Co-authored-by: Mihir Patel --- composer/trainer/_patch_pytorch.py | 94 ++++++++++++++++-------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index a6056eae62..fcca94d73a 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -946,51 +946,7 @@ def unshard_with_sync(self): if version.parse(torch.__version__) >= version.parse('2.4.0') and version.parse( torch.__version__, ) < version.parse('2.4.1'): - # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks - from torch.distributed.fsdp._flat_param import FlatParamHandle - original_unshard = FlatParamHandle.unshard - - @no_type_check - def unshard_with_sync(self): - """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`. - - This prevents deadlocks when some ranks OOM after the alloc call and others do not. - This is a patched method from pytorch, meant to be called when automicrobatching - turns on hooks in its search process for the optimal non-OOMing microbatch size. - This includes all-gathering the flat parameter - and switching to using the unsharded flat parameter. If the handle does - not need unsharding, then this only switches to using the unsharded - flat parameter. For ``NO_SHARD``, this is a no-op. 
- If FSDP is in :meth:`summon_full_params` and the handle uses parameter - mixed precision, then the parameter is forced to full precision. - """ - if not self.needs_unshard(): - # Even when not needing an unshard, we should switch to using - # the unsharded flat parameter - unsharded_flat_param = ( - self._get_padded_unsharded_flat_param() - if self.uses_sharded_strategy - else self.flat_param - ) - self._use_unsharded_flat_param(unsharded_flat_param) - return - unsharded_flat_param = self._alloc_padded_unsharded_flat_param() - - # Check if any other rank hit an OOM - found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) - - dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX') - found_cuda_oom = found_cuda_oom_tensor.item() - # Signal current rank is still in batch - all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) - - dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN') - - if found_cuda_oom == 1: - raise RuntimeError('CUDA out of memory encountered on a different rank') - padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param) - self._use_unsharded_flat_param(padded_unsharded_flat_param) - + # 2.4.0 only patch # PyTorch issue: https://github.com/pytorch/pytorch/issues/133923 from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE from typing import Mapping, Collection @@ -1046,3 +1002,51 @@ def _traverse_obj(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None: for key, value in state_dict.items(): _traverse_obj((str(key),), value) + +if version.parse(torch.__version__) >= version.parse('2.4.0') and version.parse( + torch.__version__, +) < version.parse('2.4.2'): + # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks + from torch.distributed.fsdp._flat_param import FlatParamHandle + original_unshard = FlatParamHandle.unshard + + @no_type_check + def unshard_with_sync(self): + """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`. + + This prevents deadlocks when some ranks OOM after the alloc call and others do not. + This is a patched method from pytorch, meant to be called when automicrobatching + turns on hooks in its search process for the optimal non-OOMing microbatch size. + This includes all-gathering the flat parameter + and switching to using the unsharded flat parameter. If the handle does + not need unsharding, then this only switches to using the unsharded + flat parameter. For ``NO_SHARD``, this is a no-op. + If FSDP is in :meth:`summon_full_params` and the handle uses parameter + mixed precision, then the parameter is forced to full precision. 
+ """ + if not self.needs_unshard(): + # Even when not needing an unshard, we should switch to using + # the unsharded flat parameter + unsharded_flat_param = ( + self._get_padded_unsharded_flat_param() + if self.uses_sharded_strategy + else self.flat_param + ) + self._use_unsharded_flat_param(unsharded_flat_param) + return + unsharded_flat_param = self._alloc_padded_unsharded_flat_param() + + # Check if any other rank hit an OOM + found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) + + dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX') + found_cuda_oom = found_cuda_oom_tensor.item() + # Signal current rank is still in batch + all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True) + + dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN') + + if found_cuda_oom == 1: + raise RuntimeError('CUDA out of memory encountered on a different rank') + padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param) + self._use_unsharded_flat_param(padded_unsharded_flat_param) From fea4a88002639bec3498e9a592d4e1a35814db9b Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 11 Sep 2024 06:30:09 -0700 Subject: [PATCH 6/9] Remove tensor option for _global_exception_occured (#3611) Co-authored-by: Mihir Patel --- composer/loggers/mlflow_logger.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 660c315c8e..3da777a0ec 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -312,10 +312,7 @@ def init(self, state: State, logger: Logger) -> None: if self.run_name is None: self.run_name = state.run_name - if hasattr(state, 'device'): - self._global_exception_occurred = state.device.tensor_to_device(torch.tensor([0], dtype=torch.uint8),) - else: - self._global_exception_occurred = 0 + self._global_exception_occurred = 0 # Store the Composer run name in the MLFlow run tags so it can be retrieved for autoresume self.tags['run_name'] = os.environ.get('RUN_NAME', state.run_name) @@ -615,10 +612,7 @@ def post_close(self): if hasattr(self, 'monitor_process'): # Check if there is an uncaught exception, which means `post_close()` is triggered # due to program crash. - if isinstance(self._global_exception_occurred, torch.Tensor): - finish_with_exception = (self._global_exception_occurred == 1).item() - else: - finish_with_exception = (self._global_exception_occurred == 1) + finish_with_exception = self._global_exception_occurred == 1 if finish_with_exception: self.monitor_process.crash() return From 893f398b348402015d0b69da88d23fe79972cf15 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 16 Sep 2024 10:38:09 -0700 Subject: [PATCH 7/9] Update error message for overwrite to be more user friendly (#3619) --- composer/utils/remote_uploader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/composer/utils/remote_uploader.py b/composer/utils/remote_uploader.py index 33793e7c91..da8b2da3dc 100644 --- a/composer/utils/remote_uploader.py +++ b/composer/utils/remote_uploader.py @@ -92,7 +92,10 @@ def upload_file(retry_index: int = 0): # Good! It shouldn't exist. pass else: - raise FileExistsError(f'Object {remote_file_name} already exists, but overwrite was set to False.') + raise FileExistsError( + f'Object {remote_file_name} already exists, but overwrite was set to False. 
' + 'Please set `save_overwrite` to `True` in Trainer to overwrite the existing file.', + ) log.info(f'Uploading file {local_file_path} to {remote_file_name}') object_store.upload_object( object_name=remote_file_name, From 129dcbe4d2aef094b8b9a9d61a9fa2a3f0476995 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Sep 2024 22:21:00 +0000 Subject: [PATCH 8/9] Update wandb requirement from <0.18,>=0.13.2 to >=0.13.2,<0.19 (#3615) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Saaketh Narayan Co-authored-by: Mihir Patel --- setup.py | 2 +- tests/loggers/test_wandb_logger.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index befb663b98..6cc65702a7 100644 --- a/setup.py +++ b/setup.py @@ -160,7 +160,7 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['wandb'] = [ - 'wandb>=0.13.2,<0.18', + 'wandb>=0.13.2,<0.19', ] extra_deps['comet_ml'] = [ diff --git a/tests/loggers/test_wandb_logger.py b/tests/loggers/test_wandb_logger.py index e190e39663..b0462fc842 100644 --- a/tests/loggers/test_wandb_logger.py +++ b/tests/loggers/test_wandb_logger.py @@ -269,10 +269,10 @@ def test_wandb_log_metrics(test_wandb_logger): eval_metrics_cross_entropy_count = all_run_text.count('metrics/eval/CrossEntropy') train_loss_count = all_run_text.count('loss/train/total') - expected_number_train_loss_count = (dataset_size / batch_size) + 1 # wandb includes it in the file one extra time + expected_number_train_loss_count = (dataset_size / batch_size) * 2 # wandb includes it twice per step expected_number_train_metrics_count = ( dataset_size / batch_size - ) + 2 # wandb includes it in the file two extra times + ) * 2 + 2 # wandb includes it twice per step plus two extra times expected_number_eval_metrics_count = 2 # wandb includes it in the file twice assert train_metrics_accuracy_count == expected_number_train_metrics_count assert train_loss_count == expected_number_train_loss_count From 7597ab6e873e4c77caa5cb87ec06282123af1e60 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 20 Sep 2024 20:22:59 -0700 Subject: [PATCH 9/9] Fix RNG key checking (#3623) --- composer/utils/checkpoint.py | 17 ++++++++++++++++- tests/trainer/test_checkpoint.py | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index c6f5af15ca..b966c918c5 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -148,10 +148,25 @@ def _get_write_mode(name: str) -> str: raise ValueError(f'{name} does not end with a valid tarfile extension.') +def _is_rng_key(key: str, value: tuple) -> bool: + """Check if the key is an RNG key. + + We expect the RNG key to be of the form 'rng.{rank}.cuda|torch|python|numpy'. + This function ensures that we don't accidentally pick up other keys. 
+ """ + starts_with_rng = key.startswith('rng') + ends_with_expected = key.endswith(('cuda', 'torch', 'python', 'numpy')) + three_parts = isinstance(value, tuple) and len(value) == 3 + if starts_with_rng and ends_with_expected and three_parts: + return True + + return False + + def _get_num_ranks_that_saved_rng(metadata: Metadata): rng_inds = [] for field_name, field_value in metadata.planner_data.items(): - if 'rng' in field_name: + if _is_rng_key(field_name, field_value): _, rng_rank_index, _ = field_value rng_inds.append(rng_rank_index) rng_inds = set(rng_inds) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 82629d245b..c2e4929535 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -35,6 +35,7 @@ _COMPOSER_STATES_FILENAME, PartialFilePath, _ensure_valid_checkpoint, + _is_rng_key, _write_checkpoint_file, glob_filter, ) @@ -130,6 +131,23 @@ def _assert_checkpoints_equivalent(file1, file2, atol=0.0, rtol=0.0): assert all(keys_in) or not any(keys_in) +@pytest.mark.parametrize( + 'key,value,expected_result', + [ + ('rng.0.cuda', ('rng', '0', 'cuda'), True), + ('rng.0.torch', ('rng', '0', 'torch'), True), + ('rng.0.numpy', ('rng', '0', 'numpy'), True), + ('rng.0.python', ('rng', '0', 'python'), True), + ('rng.0', ('rng', '0'), False), + ('test.test.rng', ('test', 'test', 'rng'), False), + ('test.rng.test', ('test', 'rng', 'test'), False), + ('test.notatuple.test', 0, False), + ], +) +def test_is_rng_key(key: str, value: tuple, expected_result: bool): + assert _is_rng_key(key, value) == expected_result + + @pytest.mark.parametrize( 'remove_field_paths,filter_params', [
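
For context on the key check above: PATCH 9/9 replaces the previous substring test (`if 'rng' in field_name`) with a structural check on both the flattened key and its planner-data value. The sketch below is a minimal, self-contained illustration of that logic; it mirrors `_is_rng_key` and the way `_get_num_ranks_that_saved_rng` consumes it. The small `planner_data` dict is a hypothetical stand-in for `Metadata.planner_data` from `torch.distributed.checkpoint`, built from the same key/value shapes exercised by the new unit test.

# Standalone sketch of the RNG-key check introduced in PATCH 9/9 ("Fix RNG key checking").
# The planner_data dict is hypothetical; key/value shapes and expected outcomes
# follow the test cases added in tests/trainer/test_checkpoint.py.


def _is_rng_key(key: str, value: object) -> bool:
    """Return True only for keys of the form 'rng.{rank}.cuda|torch|python|numpy'.

    The save planner stores a 3-tuple such as ('rng', '0', 'cuda') for these keys,
    so both the key and the value are inspected to avoid matching unrelated
    entries that merely contain the substring 'rng'.
    """
    starts_with_rng = key.startswith('rng')
    ends_with_expected = key.endswith(('cuda', 'torch', 'python', 'numpy'))
    three_parts = isinstance(value, tuple) and len(value) == 3
    return starts_with_rng and ends_with_expected and three_parts


def _num_ranks_that_saved_rng(planner_data: dict) -> int:
    """Count distinct ranks with saved RNG state (mirrors _get_num_ranks_that_saved_rng)."""
    rng_ranks = {
        value[1]  # middle element is the rank index, e.g. '0' in ('rng', '0', 'cuda')
        for key, value in planner_data.items()
        if _is_rng_key(key, value)
    }
    return len(rng_ranks)


if __name__ == '__main__':
    planner_data = {
        'rng.0.cuda': ('rng', '0', 'cuda'),        # RNG key -> counted
        'rng.0.torch': ('rng', '0', 'torch'),      # same rank -> counted once
        'rng.1.numpy': ('rng', '1', 'numpy'),      # second rank
        'rng.0': ('rng', '0'),                     # only two parts -> ignored
        'test.test.rng': ('test', 'test', 'rng'),  # wrong prefix -> ignored
        'test.notatuple.test': 0,                  # value is not a tuple -> ignored
    }
    assert _is_rng_key('rng.0.cuda', ('rng', '0', 'cuda'))
    assert not _is_rng_key('test.test.rng', ('test', 'test', 'rng'))
    assert _num_ranks_that_saved_rng(planner_data) == 2

Checking the value shape as well as the key is what prevents the false positives the old substring test produced for entries such as 'test.test.rng', while still counting each rank only once across its cuda/torch/python/numpy sub-keys.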