Skip to content

Commit

Permalink
Merge branch 'dev' into log-image-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
milocress committed Jun 12, 2024
2 parents afa880b + ba82cc9 commit bc44224
Show file tree
Hide file tree
Showing 17 changed files with 82 additions and 70 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/codeql-analysis
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/coverage
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
daily-pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand Down Expand Up @@ -100,7 +100,7 @@ jobs:
download-path: artifacts

daily-pytest-gpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
# Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
pytest-gpu-1:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand All @@ -35,7 +35,7 @@ jobs:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}

pytest-gpu-2:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand All @@ -62,7 +62,7 @@ jobs:


pytest-gpu-4:
uses: mosaicml/ci-testing/.github/workflows/[email protected].7
uses: mosaicml/ci-testing/.github/workflows/[email protected].8
strategy:
matrix:
include:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/code-quality
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: mosaicml/ci-testing
ref: v0.0.7
ref: v0.0.8
path: ./ci-testing
- uses: ./ci-testing/.github/actions/smoketest
with:
Expand Down
10 changes: 6 additions & 4 deletions composer/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,8 +707,10 @@ def train_dataloader(self, train_dataloader: Optional[Union[Iterable, DataLoader
train_dataloader (Iterable | DataLoader, optional): The dataloader.
"""
self._train_dataloader = train_dataloader
# Load dataset state from checkpoint when train_dataloader is set
if self.dataset_state:
# Load dataset state from checkpoint when train_dataloader is set. This occurs if
# dataset_state was loaded from checkpoint and train_dataloader has not already
# consumed dataset_state['train'] to resume.
if self.dataset_state is not None and self.dataset_state.get('train') is not None:
dataset = self._dataset_of(self._train_dataloader)
if hasattr(dataset, 'load_state_dict'):
dataset.load_state_dict(self.dataset_state['train']) # pyright: ignore
Expand Down Expand Up @@ -1278,14 +1280,14 @@ def _load_dataset_state(self, obj: dict[str, Any]) -> None:
Args:
obj (dict[str, Any]): The state to load.
"""
self.dataset_state = obj

dataset = self._dataset_of(self.train_dataloader)
if hasattr(dataset, 'load_state_dict'):
dataset.load_state_dict(obj['train']) # pyright: ignore
obj['train'] = None
self.dataset_resumption['train'] = True

self.dataset_state = obj

def load_model_state(
self,
state_dict: dict[str, Any],
Expand Down
26 changes: 1 addition & 25 deletions composer/distributed/dist_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,36 +328,12 @@ def sync_hook(*args):

mixed_precision = fsdp_config.mixed_precision
keep_low_precision_grads = fsdp_config.keep_low_precision_grads
mixed_precision, param_dtype, _, _ = get_mixed_precision(
mixed_precision, _, _, _ = get_mixed_precision(
precision,
mixed_precision=mixed_precision,
keep_low_precision_grads=keep_low_precision_grads,
)

# Note: FSDP does support the use of torch.float32 with sharding.
# They just never expected a user to pass in torch.float32 into mixed_precision as a param_dtype.
# See: https://github.com/pytorch/pytorch/issues/90584
# The PR fixing this bug is merged into PyTorch, but it hasn't made its way into a release yet.
# Instead a user needs to pass in `None` as param_dtype to have the parameters as torch.float32.
# TODO: remove these checks when PyTorch has a release that includes the fix.
if sharding_map_key != 'NO_SHARD':
if (
precision == Precision.AMP_FP16 and param_dtype not in [torch.float16, None] or
precision == Precision.AMP_BF16 and param_dtype not in [torch.bfloat16, None]
):
raise ValueError(
f'FSDP in PyTorch 1.13 does not support precision `{precision}` with sharding strategy `{sharding_strategy}` '
f'and param_dtype `{param_dtype}.` Consider using one of the predefined mixed_precision strategies '
"(choose: `'FULL'`, `'DEFAULT'`, `'PURE'`)",
)

if param_dtype == torch.float32:
raise ValueError(
f'FSDP in PyTorch 1.13 does not support param_dtype `{param_dtype}` with sharding_strategy `{sharding_map_key}` '
f'Consider using `amp` or `bf16` for precision or setting param_dtype in mixed_precision to `None` '
f'with sharding strategy `{sharding_map_key}.`',
)

process_group = None
if fsdp_config.process_group is not None:
process_group_dict = {'process_group': fsdp_config.process_group}
Expand Down
11 changes: 10 additions & 1 deletion composer/loggers/mlflow_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ def __init__(
) -> None:
try:
import mlflow
from databricks.sdk import WorkspaceClient
from mlflow import MlflowClient
except ImportError as e:
raise MissingConditionalImportError(
Expand Down Expand Up @@ -143,9 +142,19 @@ def __init__(
DEFAULT_MLFLOW_EXPERIMENT_NAME,
)
assert self.experiment_name is not None # type hint

if os.getenv('DATABRICKS_TOKEN') is not None and not self.experiment_name.startswith('/Users/'):
try:
from databricks.sdk import WorkspaceClient
except ImportError as e:
raise MissingConditionalImportError(
extra_deps_group='mlflow',
conda_package='databricks-sdk',
conda_channel='conda-forge',
) from e
databricks_username = WorkspaceClient().current_user.me().user_name or ''
self.experiment_name = '/' + os.path.join('Users', databricks_username, self.experiment_name)

self._mlflow_client = MlflowClient(self.tracking_uri)
# Set experiment
env_exp_id = os.getenv(
Expand Down
24 changes: 24 additions & 0 deletions composer/metrics/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,27 @@ def compute(self) -> Tensor:
"""Returns torch.exp() of the LanguageCrossEntropy."""
avg_loss = super().compute()
return torch.exp(avg_loss)


# For backward compatibility
class InContextLearningMetric:
    """InContextLearningMetric only exists for backwards compatibility of checkpoints that contain pickled metrics."""

    def __init__(self):
        # Direct construction is intentionally forbidden: this stub cannot compute
        # anything. It exists solely so that old checkpoints which pickled the
        # removed in-context-learning metrics can still be unpickled.
        # Note: was an f-string with no placeholders (ruff F541); plain string is correct.
        raise RuntimeError(
            'This class only exists for maintaining backward compatibility for checkpoints that contain pickled metrics. Please instead use https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/README.md.',
        )

    def __getstate__(self):
        """Return ``None`` so pickling this stub stores no state."""
        return None

    def __setstate__(self, state):
        """Ignore any pickled state; unpickling bypasses ``__init__`` and succeeds silently."""
        pass


# Alias every removed in-context-learning metric class to the backward-compatibility
# stub above, so that unpickling an old checkpoint that references any of these
# names by module path still resolves. All aliases are the SAME class object;
# none of them is instantiable (the stub's __init__ raises RuntimeError).
InContextLearningCodeEvalAccuracy = InContextLearningMetric
InContextLearningLMAccuracy = InContextLearningMetric
InContextLearningLMExpectedCalibrationError = InContextLearningMetric
InContextLearningMCExpectedCalibrationError = InContextLearningMetric
InContextLearningQAAccuracy = InContextLearningMetric
InContextLearningMultipleChoiceAccuracy = InContextLearningMetric
4 changes: 2 additions & 2 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the
<!-- BEGIN_COMPOSER_BUILD_MATRIX -->
| Composer Version | CUDA Support | Docker Tag |
|--------------------|----------------|----------------------------------------------------------------|
| 0.23.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.1` |
| 0.23.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.1_cpu` |
| 0.23.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.2` |
| 0.23.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.2_cpu` |
<!-- END_COMPOSER_BUILD_MATRIX -->

**Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually
Expand Down
12 changes: 6 additions & 6 deletions docker/build_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@
TORCHVISION_VERSION: 0.16.2
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2
CUDA_VERSION: 12.1.1
IMAGE_NAME: composer-0-23-1
IMAGE_NAME: composer-0-23-2
MOFED_VERSION: latest-23.10
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
Expand All @@ -231,23 +231,23 @@
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.3.1
TAGS:
- mosaicml/composer:0.23.1
- mosaicml/composer:0.23.2
- mosaicml/composer:latest
TARGET: composer_stage
TORCHVISION_VERSION: 0.18.1
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2
CUDA_VERSION: ''
IMAGE_NAME: composer-0-23-1-cpu
IMAGE_NAME: composer-0-23-2-cpu
MOFED_VERSION: latest-23.10
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
PYTORCH_VERSION: 2.3.1
TAGS:
- mosaicml/composer:0.23.1_cpu
- mosaicml/composer:0.23.2_cpu
- mosaicml/composer:latest_cpu
TARGET: composer_stage
TORCHVISION_VERSION: 0.18.1
2 changes: 1 addition & 1 deletion docker/generate_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def _main():
composer_entries = []

# The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images
composer_versions = ['0.23.1'] # Only build images for the latest composer version
composer_versions = ['0.23.2'] # Only build images for the latest composer version
composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest

for product in itertools.product(composer_python_versions, composer_versions, cuda_options):
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def package_files(prefix: str, directory: str, extension: str):
'coolname>=1.1.0,<3',
'tabulate==0.9.0', # for auto-generating tables
'py-cpuinfo>=8.0.0,<10',
'packaging>=21.3.0,<24.1',
'packaging>=21.3.0,<24.2',
'importlib-metadata>=5.0.0,<7',
'mosaicml-cli>=0.5.25,<0.7',
]
Expand Down Expand Up @@ -139,7 +139,7 @@ def package_files(prefix: str, directory: str, extension: str):
'GitPython==3.1.43',
'moto[s3]>=4.0.1,<5',
'mock-ssh-server==0.9.1',
'cryptography==42.0.6',
'cryptography==42.0.8',
'pytest-httpserver>=1.0.4,<1.1',
'setuptools<=59.5.0',
'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile
Expand Down
37 changes: 19 additions & 18 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,24 +306,25 @@ def test_logging(
monkeypatch: pytest.MonkeyPatch,
):
"""Test that engine logs statements as expected"""
caplog.set_level(logging.DEBUG, logger=Engine.__module__)
# Include a callback, since most logging happens around callback events
dummy_state.callbacks = [EventCounterCallback()]

monkeypatch.setenv('ENGINE_DEBUG', '1')
engine = Engine(dummy_state, dummy_logger)
engine.run_event('INIT')
engine.close()

# Validate that we have the expected log entries
assert caplog.record_tuples == [
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running event'),
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running callback EventCounterCallback'),
('composer.core.engine', 10, 'Closing the engine.'),
('composer.core.engine', 10, 'Closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Post-closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Engine closed.'),
]
caplog.clear()
with caplog.at_level(logging.DEBUG, logger=Engine.__module__):
# Include a callback, since most logging happens around callback events
dummy_state.callbacks = [EventCounterCallback()]

monkeypatch.setenv('ENGINE_DEBUG', '1')
engine = Engine(dummy_state, dummy_logger)
engine.run_event('INIT')
engine.close()

# Validate that we have the expected log entries
assert caplog.record_tuples == [
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running event'),
('composer.core.engine', 10, '[ep=0][ba=0][event=INIT]: Running callback EventCounterCallback'),
('composer.core.engine', 10, 'Closing the engine.'),
('composer.core.engine', 10, 'Closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Post-closing callback EventCounterCallback'),
('composer.core.engine', 10, 'Engine closed.'),
]


def _worker():
Expand Down

0 comments on commit bc44224

Please sign in to comment.