Skip to content

Commit

Permalink
Merge remote-tracking branch 'mosaicml/dev' into batch_code_eval
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi-mosaic committed Feb 9, 2024
2 parents 7137d04 + 4238884 commit b6dc973
Show file tree
Hide file tree
Showing 48 changed files with 825 additions and 129 deletions.
4 changes: 2 additions & 2 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
/composer/algorithms/ @mosaicml/composer-team-eng
/composer/cli/ @mosaicml/composer-team-eng
/composer/datasets/ @mosaicml/composer-team-eng
/composer/functional/ @mosaicml/composer-team-eng @dblalock
/composer/loggers/ @mosaicml/composer-team-eng @eracah @dakinggg
/composer/functional/ @mosaicml/composer-team-eng
/composer/loggers/ @mosaicml/composer-team-eng
/composer/loss/ @mosaicml/composer-team-eng
/composer/metrics/ @mosaicml/composer-team-eng
/composer/models/ @mosaicml/composer-team-eng
Expand Down
19 changes: 16 additions & 3 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ jobs:
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-doctest
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and doctest
Expand All @@ -53,6 +58,11 @@ jobs:
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: composer
- name: daily-cpu-3.11-2.2
container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-doctest
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and doctest
Expand All @@ -73,7 +83,6 @@ jobs:
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
wandb-api-key: ${{ secrets.WANDB_API_KEY }}
slack-notifications-bot-token: ${{ secrets.SLACK_NOTIFICATIONS_BOT_TOKEN }}
code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
code-eval-url: ${{ secrets.CODE_EVAL_URL }}
code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
Expand All @@ -100,7 +109,12 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
- name: "gpu-3.10-2.1"
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
# Job names encode <device>-<python>-<torch>; this image ships Python 3.11 with PyTorch 2.2.
- name: "gpu-3.11-2.2"
  container: mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04
  markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
  pytest_command: "coverage run -m pytest"
  composer_package_name: "mosaicml"
Expand All @@ -116,4 +130,3 @@ jobs:
python-version: 3.9
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
slack-notifications-bot-token: ${{ secrets.SLACK_NOTIFICATIONS_BOT_TOKEN }}
11 changes: 0 additions & 11 deletions .github/workflows/pytest-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,3 @@ jobs:
with:
name: coverage-${{ github.sha }}-${{ inputs.name }}
path: .coverage
- name: Notify slack fail
if: >
failure() && !cancelled() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev') &&
(github.event_name != 'pull_request' && github.event_name != 'pull_request_target')
env:
SLACK_BOT_TOKEN: ${{ secrets.slack-notifications-bot-token }}
uses: voxmedia/github-action-slack-notify-build@v1
with:
channel: composer-issues
status: FAILED
color: danger
11 changes: 0 additions & 11 deletions .github/workflows/pytest-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,3 @@ jobs:
python .github/mcli/mcli_pytest.py --image '${{ inputs.container }}' --pip_package_name \
'${{ inputs.composer_package_name }}' --pytest_markers '${{ inputs.pytest-markers }}' --pytest_command \
'${{ inputs.pytest-command }}' --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS}
- name: Notify slack fail
if: >
failure() && !cancelled() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev') &&
(github.event_name != 'pull_request' && github.event_name != 'pull_request_target')
env:
SLACK_BOT_TOKEN: ${{ secrets.slack-notifications-bot-token }}
uses: voxmedia/github-action-slack-notify-build@v1
with:
channel: composer-issues
status: FAILED
color: danger
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ venv/
# WandB
wandb/

# Neptune
.neptune/

# Spacemacs
._#*
.#*
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ Composer is built to automate away low-level pain points and headaches so you ca
Integrate with the tools you know and love for experiment tracking and data streaming.

- **Cloud integrations**: Our checkpointing and logging features have first-class support for remote storage and for loading from cloud buckets (OCI, GCP, AWS S3).
- **********Experiment tracking:********** Weights and Biases, MLFlow, and CometML — the choice is yours, easily log your data to your favorite platform.
- **Experiment tracking:** Weights and Biases, MLFlow, CometML, and neptune.ai — the choice is yours, easily log your data to your favorite platform.

# **🚀 Getting Started**

Expand Down
2 changes: 1 addition & 1 deletion composer/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

"""The Composer Version."""

__version__ = '0.19.0'
__version__ = '0.19.1'
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ class LowPrecisionGroupNorm(Algorithm):

def __init__(self, apply_at: Event = Event.INIT):
self.apply_at = apply_at
if self.apply_at not in {Event.INIT, Event.AFTER_LOAD}:
raise ValueError('LowPrecisionGroupNorm only supports application on Event.INIT and Event.AFTER_LOAD.')
if self.apply_at not in {Event.INIT, Event.BEFORE_LOAD, Event.AFTER_LOAD}:
raise ValueError(
'LowPrecisionGroupNorm only supports application on Event.INIT, Event.BEFORE_LOAD, and Event.AFTER_LOAD.'
)

def __repr__(self) -> str:
return f'{self.__class__.__name__}(apply_at={self.apply_at})'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ class LowPrecisionLayerNorm(Algorithm):

def __init__(self, apply_at: Event = Event.INIT):
self.apply_at = apply_at
if self.apply_at not in {Event.INIT, Event.AFTER_LOAD}:
raise ValueError('LowPrecisionLayerNorm only supports application on Event.INIT and Event.AFTER_LOAD.')
if self.apply_at not in {Event.INIT, Event.BEFORE_LOAD, Event.AFTER_LOAD}:
raise ValueError(
'LowPrecisionLayerNorm only supports application on Event.INIT, Event.BEFORE_LOAD, and Event.AFTER_LOAD.'
)

def __repr__(self) -> str:
return f'{self.__class__.__name__}(apply_at={self.apply_at})'
Expand Down
2 changes: 1 addition & 1 deletion composer/callbacks/image_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class ImageVisualizer(Callback):
+---------------------------------------------+---------------------------------------+
.. note::
This callback only works with wandb logging for now.
This callback only works with wandb and Neptune logging for now.
Args:
interval (int | str | Time, optional): Time string specifying how often to log train images. For example, ``interval='1ep'``
Expand Down
53 changes: 30 additions & 23 deletions composer/callbacks/memory_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""Log memory snapshot during training."""
import logging
import os
import pickle
import warnings
from typing import Optional, Union

Expand Down Expand Up @@ -50,10 +51,10 @@ class MemorySnapshot(Callback):
max_entries (int, optional): Maximum number of memory alloc/free events to record. Defaults to 100000.
folder (str, optional): A format string describing the folder containing the memory snapshot files.
Defaults to ``'{{run_name}}/torch_traces'``.
filename (str, optional): A format string describing how to name the memory snapshot files.
Defaults to ``'rank{{rank}}.{{batch}}.pickle'``.
remote_file_name (str, optional): A format string for the memory snapshot remote file name.
Defaults to ``'{{run_name}}/torch_traces/rank{{rank}}.{{batch}}.pickle'``.
filename (str, optional): A format string describing the prefix used to name the memory snapshot files.
Defaults to ``'rank{{rank}}.{{batch}}.memory_snapshot'``.
remote_file_name (str, optional): A format string describing the prefix for the memory snapshot remote file name.
Defaults to ``'{{run_name}}/torch_memory_traces/rank{{rank}}.{{batch}}.memory_snapshot'``.
Whenever a trace file is saved, it is also uploaded as a file according to this format string.
The same format variables as for ``filename`` are available.
Expand All @@ -74,9 +75,8 @@ def __init__(
interval: Union[int, str, Time] = '3ba',
max_entries: int = 100000,
folder: str = '{run_name}/torch_traces',
filename: str = 'rank{rank}.{batch}.pt.trace.memory_snapshot.html',
remote_file_name: Optional[
str] = '{run_name}/torch_memory_traces/rank{rank}.{batch}.pt.trace.memory_snapshot.html',
filename: str = 'rank{rank}.{batch}.memory_snapshot',
remote_file_name: Optional[str] = '{run_name}/torch_memory_traces/rank{rank}.{batch}.memory_snapshot',
overwrite: bool = False,
) -> None:
self.batches_left_to_skip = skip_batches
Expand Down Expand Up @@ -157,26 +157,33 @@ def export_memory_snapshot(self, state: State, logger: Logger) -> None:
self.folder_name,
format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=state.timestamp))
try:
log.info(f'Saving memory snapshot to local file: {filename}')
snapshot_file = filename + '.pickle'
trace_plot_file = filename + '.html'
log.info(f'Saving memory snapshot files')

snapshot = torch.cuda.memory._snapshot()
# No data was recorded - avoids a `ValueError` in `trace_plot`
if all(len(t) == 0 for t in snapshot['device_traces']):
log.info(f'No allocation is recorded in memory snapshot)')
return
with open(filename, 'w+') as fd:
fd.write(torch.cuda._memory_viz.trace_plot(snapshot, device=None, plot_segments=False)) # type: ignore

with open(snapshot_file, 'wb') as fd:
pickle.dump(snapshot, fd)

with open(trace_plot_file, 'w+') as fd:
fd.write(torch.cuda._memory_viz.trace_plot(snapshot)) # type: ignore

log.info(f'Saved memory snapshot to local files with prefix = {filename}')

if self.remote_path_in_bucket is not None:
for f in [snapshot_file, trace_plot_file]:
remote_file_name = (self.remote_path_in_bucket + os.path.basename(f)).lstrip('/')
log.info(f'Uploading memory snapshot to remote: {remote_file_name} from {f}')
try:
logger.upload_file(remote_file_name=remote_file_name, file_path=f, overwrite=self.overwrite)
except FileExistsError as e:
raise FileExistsError(
f'Uploading memory snapshot failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite memory snapshot with Trainer, set `overwrite` to True.'
) from e
except Exception as e:
log.error(f'Failed to capture memory snapshot {e}')
return
if self.remote_path_in_bucket is not None:
remote_file_name = format_name_with_dist_and_time(self.remote_path_in_bucket,
run_name=state.run_name,
timestamp=state.timestamp)
remote_file_name = remote_file_name.lstrip('/')
log.info(f'Uploading memory snapshot to remote: {remote_file_name} from {filename}')
try:
logger.upload_file(remote_file_name=remote_file_name, file_path=filename, overwrite=self.overwrite)
except FileExistsError as e:
raise FileExistsError(
f'Uploading memory snapshot failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite memory snapshot with Trainer, set save_overwrite to True.'
) from e
10 changes: 10 additions & 0 deletions composer/core/callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ def init(self, state: State, logger: Logger) -> None:
del state, logger # unused
pass

def before_load(self, state: State, logger: Logger) -> None:
    """Hook invoked on the :attr:`.Event.BEFORE_LOAD` event.

    This implementation does nothing with its arguments.

    Args:
        state (State): The training state.
        logger (Logger): The logger.
    """
    # Both arguments are intentionally discarded by this no-op hook.
    del logger, state

def after_load(self, state: State, logger: Logger) -> None:
"""Called on the :attr:`.Event.AFTER_LOAD` event.
Expand Down
8 changes: 5 additions & 3 deletions composer/core/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,13 @@ def register_pass(self, algorithm_pass: passes.AlgorithmPass, index: int = -1):
def _assert_dataloader_and_duration_set(state: State, event: Event):
# correctness checks that dataloader and max duration need to be set for certain events

# dataloader should be set on all events expect INIT/AFTER_LOAD/EVAL_STANDALONE_START/EVAL_STANDALONE_END
if event not in {Event.INIT, Event.AFTER_LOAD, Event.EVAL_STANDALONE_START, Event.EVAL_STANDALONE_END}:
# dataloader should be set on all events except INIT/BEFORE_LOAD/AFTER_LOAD/EVAL_STANDALONE_START/EVAL_STANDALONE_END
if event not in {
Event.INIT, Event.BEFORE_LOAD, Event.AFTER_LOAD, Event.EVAL_STANDALONE_START, Event.EVAL_STANDALONE_END
}:
assert state.dataloader is not None, f'The trainer should have set state.dataloader for event {event}.'

if event != Event.INIT and event != Event.AFTER_LOAD and not event.is_predict and not event.is_eval:
if event != Event.INIT and event != Event.BEFORE_LOAD and event != Event.AFTER_LOAD and not event.is_predict and not event.is_eval:
assert state.max_duration is not None, f'The trainer should have set state.max_duration for event {event}.'

def _run_algorithms(
Expand Down
13 changes: 8 additions & 5 deletions composer/core/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Event(StringEnum):
.. code-block:: python
# <INIT>
# <BEFORE_LOAD>
# <AFTER_LOAD>
# <FIT_START>
for epoch in range(NUM_EPOCHS):
Expand Down Expand Up @@ -93,6 +94,7 @@ class Event(StringEnum):
Attributes:
INIT: Invoked in the constructor of :class:`~.trainer.Trainer`. Model surgery (see
:mod:`~composer.utils.module_surgery`) typically occurs here.
BEFORE_LOAD: Immediately before the checkpoint is loaded in :class:`~.trainer.Trainer`.
AFTER_LOAD: Immediately after checkpoint is loaded in constructor of :class:`~.trainer.Trainer`.
FIT_START: Invoked at the beginning of each call to :meth:`.Trainer.fit`. Dataset transformations typically
occur here.
Expand Down Expand Up @@ -142,6 +144,7 @@ class Event(StringEnum):
"""

INIT = 'init'
BEFORE_LOAD = 'before_load'
AFTER_LOAD = 'after_load'
FIT_START = 'fit_start'

Expand Down Expand Up @@ -243,12 +246,12 @@ def is_eval(self) -> bool:
return self.value.startswith('eval')


_BEFORE_EVENTS = (Event.FIT_START, Event.EPOCH_START, Event.BEFORE_DATALOADER, Event.BATCH_START,
_BEFORE_EVENTS = (Event.BEFORE_LOAD, Event.FIT_START, Event.EPOCH_START, Event.BEFORE_DATALOADER, Event.BATCH_START,
Event.BEFORE_TRAIN_BATCH, Event.BEFORE_FORWARD, Event.BEFORE_LOSS, Event.BEFORE_BACKWARD,
Event.EVAL_BEFORE_ALL, Event.EVAL_START, Event.EVAL_BATCH_START, Event.EVAL_BEFORE_FORWARD,
Event.PREDICT_START, Event.PREDICT_BATCH_START, Event.PREDICT_BEFORE_FORWARD,
Event.EVAL_STANDALONE_START)
_AFTER_EVENTS = (Event.EPOCH_END, Event.BATCH_END, Event.AFTER_DATALOADER, Event.AFTER_TRAIN_BATCH, Event.AFTER_FORWARD,
Event.AFTER_LOSS, Event.AFTER_BACKWARD, Event.EVAL_AFTER_ALL, Event.EVAL_END, Event.EVAL_BATCH_END,
Event.EVAL_AFTER_FORWARD, Event.FIT_END, Event.PREDICT_END, Event.PREDICT_BATCH_END,
Event.PREDICT_AFTER_FORWARD, Event.EVAL_STANDALONE_END)
_AFTER_EVENTS = (Event.AFTER_LOAD, Event.EPOCH_END, Event.BATCH_END, Event.AFTER_DATALOADER, Event.AFTER_TRAIN_BATCH,
Event.AFTER_FORWARD, Event.AFTER_LOSS, Event.AFTER_BACKWARD, Event.EVAL_AFTER_ALL, Event.EVAL_END,
Event.EVAL_BATCH_END, Event.EVAL_AFTER_FORWARD, Event.FIT_END, Event.PREDICT_END,
Event.PREDICT_BATCH_END, Event.PREDICT_AFTER_FORWARD, Event.EVAL_STANDALONE_END)
2 changes: 2 additions & 0 deletions composer/loggers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from composer.loggers.logger_destination import LoggerDestination
from composer.loggers.mlflow_logger import MLFlowLogger
from composer.loggers.mosaicml_logger import MosaicMLLogger
from composer.loggers.neptune_logger import NeptuneLogger
from composer.loggers.progress_bar_logger import ProgressBarLogger
from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
from composer.loggers.slack_logger import SlackLogger
Expand All @@ -32,6 +33,7 @@
'LoggerDestination',
'FileLogger',
'InMemoryLogger',
'NeptuneLogger',
'ProgressBarLogger',
'WandBLogger',
'RemoteUploaderDownloader',
Expand Down
7 changes: 6 additions & 1 deletion composer/loggers/cometml_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,12 @@ def init(self, state: State, logger: Logger) -> None:
assert self.experiment is not None
self.experiment.set_name(self.name)

def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
def log_table(self,
columns: List[str],
rows: List[List[Any]],
name: str = 'Table',
step: Optional[int] = None) -> None:
del step
if self._enabled:
assert self.experiment is not None
try:
Expand Down
7 changes: 6 additions & 1 deletion composer/loggers/console_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,12 @@ def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
# Lazy logging of hyperparameters.
self.hparams.update(hyperparameters)

def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
def log_table(self,
columns: List[str],
rows: List[List[Any]],
name: str = 'Table',
step: Optional[int] = None) -> None:
del step
try:
import pandas as pd
except ImportError as e:
Expand Down
7 changes: 6 additions & 1 deletion composer/loggers/file_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,12 @@ def log_traces(self, traces: Dict[str, Any]):
trace_str + '\n',
)

def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
def log_table(self,
columns: List[str],
rows: List[List[Any]],
name: str = 'Table',
step: Optional[int] = None) -> None:
del step
try:
import pandas as pd
except ImportError as e:
Expand Down
7 changes: 6 additions & 1 deletion composer/loggers/in_memory_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ def __init__(self) -> None:
def log_hyperparameters(self, hyperparameters: Dict[str, Any]):
self.hyperparameters.update(hyperparameters)

def log_table(self, columns: List[str], rows: List[List[Any]], name: str = 'Table') -> None:
def log_table(self,
columns: List[str],
rows: List[List[Any]],
name: str = 'Table',
step: Optional[int] = None) -> None:
del step
try:
import pandas as pd
except ImportError as e:
Expand Down
Loading

0 comments on commit b6dc973

Please sign in to comment.