Merge remote-tracking branch 'mosaicml/dev' into batch_code_eval

mosaicml · Feb 9, 2024 · b6dc973 · b6dc973
2 parents 7137d04 + 4238884
commit b6dc973
Show file tree

Hide file tree

Showing 48 changed files with 825 additions and 129 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -20,8 +20,8 @@
 /composer/algorithms/ @mosaicml/composer-team-eng
 /composer/cli/ @mosaicml/composer-team-eng
 /composer/datasets/ @mosaicml/composer-team-eng
-/composer/functional/ @mosaicml/composer-team-eng @dblalock
-/composer/loggers/ @mosaicml/composer-team-eng @eracah @dakinggg
+/composer/functional/ @mosaicml/composer-team-eng
+/composer/loggers/ @mosaicml/composer-team-eng
 /composer/loss/ @mosaicml/composer-team-eng
 /composer/metrics/ @mosaicml/composer-team-eng
 /composer/models/ @mosaicml/composer-team-eng

diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
@@ -33,6 +33,11 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: composer
+        - name: cpu-3.11-2.2
+          container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
+          markers: not daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: mosaicml
         - name: cpu-doctest
           container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and doctest
@@ -53,6 +58,11 @@ jobs:
           markers: daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: composer
+        - name: daily-cpu-3.11-2.2
+          container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
+          markers: daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: mosaicml
         - name: daily-cpu-doctest
           container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and doctest
@@ -73,7 +83,6 @@ jobs:
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
       wandb-api-key: ${{ secrets.WANDB_API_KEY }}
-      slack-notifications-bot-token: ${{ secrets.SLACK_NOTIFICATIONS_BOT_TOKEN }}
       code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
       code-eval-url: ${{ secrets.CODE_EVAL_URL }}
       code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
@@ -100,7 +109,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
         - name: "gpu-3.10-2.1"
-          container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
+          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+          pytest_command: "coverage run -m pytest"
+          composer_package_name: "mosaicml"
+        - name: "gpu-3.11-2.2"  # name encodes <python>-<torch> of the container below
+          container: mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
@@ -116,4 +130,3 @@ jobs:
       python-version: 3.9
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
-      slack-notifications-bot-token: ${{ secrets.SLACK_NOTIFICATIONS_BOT_TOKEN }}
diff --git a/.github/workflows/pytest-cpu.yaml b/.github/workflows/pytest-cpu.yaml
@@ -89,14 +89,3 @@ jobs:
       with:
         name: coverage-${{ github.sha }}-${{ inputs.name }}
         path: .coverage
-    - name: Notify slack fail
-      if: >
-        failure() && !cancelled() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev') &&
-        (github.event_name != 'pull_request' && github.event_name != 'pull_request_target')
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.slack-notifications-bot-token }}
-      uses: voxmedia/github-action-slack-notify-build@v1
-      with:
-        channel: composer-issues
-        status: FAILED
-        color: danger
diff --git a/.github/workflows/pytest-gpu.yaml b/.github/workflows/pytest-gpu.yaml
@@ -87,14 +87,3 @@ jobs:
         python .github/mcli/mcli_pytest.py --image '${{ inputs.container }}' --pip_package_name \
           '${{ inputs.composer_package_name }}' --pytest_markers '${{ inputs.pytest-markers }}' --pytest_command \
           '${{ inputs.pytest-command }}' --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS}
-    - name: Notify slack fail
-      if: >
-        failure() && !cancelled() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev') &&
-        (github.event_name != 'pull_request' && github.event_name != 'pull_request_target')
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.slack-notifications-bot-token }}
-      uses: voxmedia/github-action-slack-notify-build@v1
-      with:
-        channel: composer-issues
-        status: FAILED
-        color: danger
diff --git a/.gitignore b/.gitignore
@@ -136,6 +136,9 @@ venv/
 # WandB
 wandb/
 
+# Neptune
+.neptune/
+
 # Spacemacs
 ._#*
 .#*

diff --git a/README.md b/README.md
@@ -105,7 +105,7 @@ Composer is built to automate away low-level pain points and headaches so you ca
 Integrate with the tools you know and love for experiment tracking and data streaming.
 
 - **Cloud integrations**: Our Checkpointing and logging features have first-class support for remote storage and loading from Cloud bucket (OCI, GCP, AWS S3).
-- **********Experiment tracking:********** Weights and Biases, MLFlow, and CometML — the choice is yours, easily log your data to your favorite platform.
+- **Experiment tracking:** Weights and Biases, MLFlow, CometML, and neptune.ai — the choice is yours, easily log your data to your favorite platform.
 
 # **🚀 Getting Started**
 

diff --git a/composer/_version.py b/composer/_version.py
@@ -3,4 +3,4 @@
 
 """The Composer Version."""
 
-__version__ = '0.19.0'
+__version__ = '0.19.1'
diff --git a/composer/algorithms/low_precision_groupnorm/low_precision_groupnorm.py b/composer/algorithms/low_precision_groupnorm/low_precision_groupnorm.py
@@ -52,8 +52,10 @@ class LowPrecisionGroupNorm(Algorithm):
 
     def __init__(self, apply_at: Event = Event.INIT):
         self.apply_at = apply_at
-        if self.apply_at not in {Event.INIT, Event.AFTER_LOAD}:
-            raise ValueError('LowPrecisionGroupNorm only supports application on Event.INIT and Event.AFTER_LOAD.')
+        if self.apply_at not in {Event.INIT, Event.BEFORE_LOAD, Event.AFTER_LOAD}:
+            raise ValueError(
+                'LowPrecisionGroupNorm only supports application on Event.INIT, Event.BEFORE_LOAD, and Event.AFTER_LOAD.'
+            )
 
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}(apply_at={self.apply_at})'

diff --git a/composer/algorithms/low_precision_layernorm/low_precision_layernorm.py b/composer/algorithms/low_precision_layernorm/low_precision_layernorm.py
@@ -52,8 +52,10 @@ class LowPrecisionLayerNorm(Algorithm):
 
     def __init__(self, apply_at: Event = Event.INIT):
         self.apply_at = apply_at
-        if self.apply_at not in {Event.INIT, Event.AFTER_LOAD}:
-            raise ValueError('LowPrecisionLayerNorm only supports application on Event.INIT and Event.AFTER_LOAD.')
+        if self.apply_at not in {Event.INIT, Event.BEFORE_LOAD, Event.AFTER_LOAD}:
+            raise ValueError(
+                'LowPrecisionLayerNorm only supports application on Event.INIT, Event.BEFORE_LOAD, and Event.AFTER_LOAD.'
+            )
 
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}(apply_at={self.apply_at})'

diff --git a/composer/callbacks/image_visualizer.py b/composer/callbacks/image_visualizer.py
@@ -46,7 +46,7 @@ class ImageVisualizer(Callback):
     +---------------------------------------------+---------------------------------------+
 
         .. note::
-            This callback only works with wandb logging for now.
+            This callback only works with wandb and Neptune logging for now.
 
     Args:
         interval (int | str | Time, optional): Time string specifying how often to log train images. For example, ``interval='1ep'``

diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py
@@ -4,6 +4,7 @@
 """Log memory snapshot during training."""
 import logging
 import os
+import pickle
 import warnings
 from typing import Optional, Union
 
@@ -50,10 +51,10 @@ class MemorySnapshot(Callback):
         max_entries (int, optional): Maximum number of memory alloc/free events to record. Defaults to 100000.
         folder (str, optional): A format string describing the folder containing the memory snapshot files.
             Defaults to ``'{{run_name}}/torch_traces'``.
-        filename (str, optional): A format string describing how to name the memory snapshot files.
-            Defaults to ``'rank{{rank}}.{{batch}}.pickle'``.
-        remote_file_name (str, optional): A format string for the memory snapshot remote file name.
-            Defaults to ``'{{run_name}}/torch_traces/rank{{rank}}.{{batch}}.pickle'``.
+        filename (str, optional): A format string describing the prefix used to name the memory snapshot files.
+            Defaults to ``'rank{{rank}}.{{batch}}.memory_snapshot'``.
+        remote_file_name (str, optional): A format string describing the prefix for the memory snapshot remote file name.
+            Defaults to ``'{{run_name}}/torch_memory_traces/rank{{rank}}.{{batch}}.memory_snapshot'``.
 
             Whenever a trace file is saved, it is also uploaded as a file according to this format string.
             The same format variables as for ``filename`` are available.
@@ -74,9 +75,8 @@ def __init__(
         interval: Union[int, str, Time] = '3ba',
         max_entries: int = 100000,
         folder: str = '{run_name}/torch_traces',
-        filename: str = 'rank{rank}.{batch}.pt.trace.memory_snapshot.html',
-        remote_file_name: Optional[
-            str] = '{run_name}/torch_memory_traces/rank{rank}.{batch}.pt.trace.memory_snapshot.html',
+        filename: str = 'rank{rank}.{batch}.memory_snapshot',
+        remote_file_name: Optional[str] = '{run_name}/torch_memory_traces/rank{rank}.{batch}.memory_snapshot',
         overwrite: bool = False,
     ) -> None:
         self.batches_left_to_skip = skip_batches
@@ -157,26 +157,33 @@ def export_memory_snapshot(self, state: State, logger: Logger) -> None:
             self.folder_name,
             format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=state.timestamp))
         try:
-            log.info(f'Saving memory snapshot to local file: {filename}')
+            snapshot_file = filename + '.pickle'
+            trace_plot_file = filename + '.html'
+            log.info(f'Saving memory snapshot files')
+
             snapshot = torch.cuda.memory._snapshot()
             # No data was recorded - avoids a `ValueError` in `trace_plot`
             if all(len(t) == 0 for t in snapshot['device_traces']):
                 log.info(f'No allocation is recorded in memory snapshot)')
                 return
-            with open(filename, 'w+') as fd:
-                fd.write(torch.cuda._memory_viz.trace_plot(snapshot, device=None, plot_segments=False))  # type: ignore
+
+            with open(snapshot_file, 'wb') as fd:
+                pickle.dump(snapshot, fd)
+
+            with open(trace_plot_file, 'w+') as fd:
+                fd.write(torch.cuda._memory_viz.trace_plot(snapshot))  # type: ignore
+
+            log.info(f'Saved memory snapshot to local files with prefix = {filename}')
+
+            if self.remote_path_in_bucket is not None:
+                for f in [snapshot_file, trace_plot_file]:
+                    remote_file_name = (self.remote_path_in_bucket + os.path.basename(f)).lstrip('/')
+                    log.info(f'Uploading memory snapshot to remote: {remote_file_name} from {f}')
+                    try:
+                        logger.upload_file(remote_file_name=remote_file_name, file_path=f, overwrite=self.overwrite)
+                    except FileExistsError as e:
+                        raise FileExistsError(
+                            f'Uploading memory snapshot failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite memory snapshot with Trainer, set `overwrite` to True.'
+                        ) from e
         except Exception as e:
             log.error(f'Failed to capture memory snapshot {e}')
-            return
-        if self.remote_path_in_bucket is not None:
-            remote_file_name = format_name_with_dist_and_time(self.remote_path_in_bucket,
-                                                              run_name=state.run_name,
-                                                              timestamp=state.timestamp)
-            remote_file_name = remote_file_name.lstrip('/')
-            log.info(f'Uploading memory snapshot to remote: {remote_file_name} from {filename}')
-            try:
-                logger.upload_file(remote_file_name=remote_file_name, file_path=filename, overwrite=self.overwrite)
-            except FileExistsError as e:
-                raise FileExistsError(
-                    f'Uploading memory snapshot failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite memory snapshot with Trainer, set save_overwrite to True.'
-                ) from e
diff --git a/composer/core/callback.py b/composer/core/callback.py
@@ -105,6 +105,16 @@ def init(self, state: State, logger: Logger) -> None:
         del state, logger  # unused
         pass
 
+    def before_load(self, state: State, logger: Logger) -> None:
+        """Called on the :attr:`.Event.BEFORE_LOAD` event.
+
+        Args:
+            state (State): The training state.
+            logger (Logger): The logger.
+        """
+        del state, logger  # unused
+        pass
+
     def after_load(self, state: State, logger: Logger) -> None:
         """Called on the :attr:`.Event.AFTER_LOAD` event.
 

diff --git a/composer/core/engine.py b/composer/core/engine.py
@@ -351,11 +351,13 @@ def register_pass(self, algorithm_pass: passes.AlgorithmPass, index: int = -1):
     def _assert_dataloader_and_duration_set(state: State, event: Event):
         # correctness checks that dataloader and max duration need to be set for certain events
 
-        # dataloader should be set on all events expect INIT/AFTER_LOAD/EVAL_STANDALONE_START/EVAL_STANDALONE_END
-        if event not in {Event.INIT, Event.AFTER_LOAD, Event.EVAL_STANDALONE_START, Event.EVAL_STANDALONE_END}:
+        # dataloader should be set on all events except INIT/BEFORE_LOAD/AFTER_LOAD/EVAL_STANDALONE_START/EVAL_STANDALONE_END
+        if event not in {
+                Event.INIT, Event.BEFORE_LOAD, Event.AFTER_LOAD, Event.EVAL_STANDALONE_START, Event.EVAL_STANDALONE_END
+        }:
             assert state.dataloader is not None, f'The trainer should have set state.dataloader for event {event}.'
 
-        if event != Event.INIT and event != Event.AFTER_LOAD and not event.is_predict and not event.is_eval:
+        if event != Event.INIT and event != Event.BEFORE_LOAD and event != Event.AFTER_LOAD and not event.is_predict and not event.is_eval:
             assert state.max_duration is not None, f'The trainer should have set state.max_duration for event {event}.'
 
     def _run_algorithms(

diff --git a/composer/core/event.py b/composer/core/event.py
@@ -18,6 +18,7 @@ class Event(StringEnum):
     .. code-block:: python
 
         # <INIT>
+        # <BEFORE_LOAD>
         # <AFTER_LOAD>
         # <FIT_START>
         for epoch in range(NUM_EPOCHS):
@@ -93,6 +94,7 @@ class Event(StringEnum):
     Attributes:
         INIT: Invoked in the constructor of :class:`~.trainer.Trainer`. Model surgery (see
             :mod:`~composer.utils.module_surgery`) typically occurs here.
+        BEFORE_LOAD: Immediately before the checkpoint is loaded in :class:`~.trainer.Trainer`.
         AFTER_LOAD: Immediately after checkpoint is loaded in constructor of :class:`~.trainer.Trainer`.
         FIT_START: Invoked at the beginning of each call to :meth:`.Trainer.fit`. Dataset transformations typically
             occur here.
@@ -142,6 +144,7 @@ class Event(StringEnum):
     """
 
     INIT = 'init'
+    BEFORE_LOAD = 'before_load'
     AFTER_LOAD = 'after_load'
     FIT_START = 'fit_start'
 
@@ -243,12 +246,12 @@ def is_eval(self) -> bool:
         return self.value.startswith('eval')
 
 
-_BEFORE_EVENTS = (Event.FIT_START, Event.EPOCH_START, Event.BEFORE_DATALOADER, Event.BATCH_START,
+_BEFORE_EVENTS = (Event.BEFORE_LOAD, Event.FIT_START, Event.EPOCH_START, Event.BEFORE_DATALOADER, Event.BATCH_START,
                   Event.BEFORE_TRAIN_BATCH, Event.BEFORE_FORWARD, Event.BEFORE_LOSS, Event.BEFORE_BACKWARD,
                   Event.EVAL_BEFORE_ALL, Event.EVAL_START, Event.EVAL_BATCH_START, Event.EVAL_BEFORE_FORWARD,
                   Event.PREDICT_START, Event.PREDICT_BATCH_START, Event.PREDICT_BEFORE_FORWARD,
                   Event.EVAL_STANDALONE_START)
-_AFTER_EVENTS = (Event.EPOCH_END, Event.BATCH_END, Event.AFTER_DATALOADER, Event.AFTER_TRAIN_BATCH, Event.AFTER_FORWARD,
-                 Event.AFTER_LOSS, Event.AFTER_BACKWARD, Event.EVAL_AFTER_ALL, Event.EVAL_END, Event.EVAL_BATCH_END,
-                 Event.EVAL_AFTER_FORWARD, Event.FIT_END, Event.PREDICT_END, Event.PREDICT_BATCH_END,
-                 Event.PREDICT_AFTER_FORWARD, Event.EVAL_STANDALONE_END)
+_AFTER_EVENTS = (Event.AFTER_LOAD, Event.EPOCH_END, Event.BATCH_END, Event.AFTER_DATALOADER, Event.AFTER_TRAIN_BATCH,
+                 Event.AFTER_FORWARD, Event.AFTER_LOSS, Event.AFTER_BACKWARD, Event.EVAL_AFTER_ALL, Event.EVAL_END,
+                 Event.EVAL_BATCH_END, Event.EVAL_AFTER_FORWARD, Event.FIT_END, Event.PREDICT_END,
+                 Event.PREDICT_BATCH_END, Event.PREDICT_AFTER_FORWARD, Event.EVAL_STANDALONE_END)
diff --git a/composer/loggers/__init__.py b/composer/loggers/__init__.py
@@ -20,6 +20,7 @@
 from composer.loggers.logger_destination import LoggerDestination
 from composer.loggers.mlflow_logger import MLFlowLogger
 from composer.loggers.mosaicml_logger import MosaicMLLogger
+from composer.loggers.neptune_logger import NeptuneLogger
 from composer.loggers.progress_bar_logger import ProgressBarLogger
 from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
 from composer.loggers.slack_logger import SlackLogger
@@ -32,6 +33,7 @@
     'LoggerDestination',
     'FileLogger',
     'InMemoryLogger',
+    'NeptuneLogger',
     'ProgressBarLogger',
     'WandBLogger',
     'RemoteUploaderDownloader',

diff --git a/composer/loggers/cometml_logger.py b/composer/loggers/cometml_logger.py
@@ -98,7 +98,12 @@ def init(self, state: State, logger: Logger) -> None:
             assert self.experiment is not None
             self.experiment.set_name(self.name)
 
-    def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
+    def log_table(self,
+                  columns: List[str],
+                  rows: List[List[Any]],
+                  name: str = 'Table',
+                  step: Optional[int] = None) -> None:
+        del step
         if self._enabled:
             assert self.experiment is not None
             try:

diff --git a/composer/loggers/console_logger.py b/composer/loggers/console_logger.py
@@ -77,7 +77,12 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
         # Lazy logging of hyperparameters.
         self.hparams.update(hyperparameters)
 
-    def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
+    def log_table(self,
+                  columns: List[str],
+                  rows: List[List[Any]],
+                  name: str = 'Table',
+                  step: Optional[int] = None) -> None:
+        del step
         try:
             import pandas as pd
         except ImportError as e:

diff --git a/composer/loggers/file_logger.py b/composer/loggers/file_logger.py
@@ -185,7 +185,12 @@ def log_traces(self, traces: Dict[str, Any]):
                     trace_str + '\n',
                 )
 
-    def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
+    def log_table(self,
+                  columns: List[str],
+                  rows: List[List[Any]],
+                  name: str = 'Table',
+                  step: Optional[int] = None) -> None:
+        del step
         try:
             import pandas as pd
         except ImportError as e:

diff --git a/composer/loggers/in_memory_logger.py b/composer/loggers/in_memory_logger.py
@@ -72,7 +72,12 @@ def __init__(self) -> None:
     def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
         self.hyperparameters.update(hyperparameters)
 
-    def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
+    def log_table(self,
+                  columns: List[str],
+                  rows: List[List[Any]],
+                  name: str = 'Table',
+                  step: Optional[int] = None) -> None:
+        del step
         try:
             import pandas as pd
         except ImportError as e: