Skip to content

Commit

Permalink
Merge branch 'dev' into mvpatel2000/no-rng-dedup
Browse files (browse the repository at this point in the history)
  • Loading branch information
mvpatel2000 committed Feb 13, 2024
2 parents (dec0056 + 157af10); commit b1ebe23
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 148 deletions.
1 change: 1 addition & 0 deletions composer/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2086,6 +2086,7 @@ def _train_loop(self) -> None:
# asserted to be not None when Trainer.fit() is called
raise RuntimeError('max_duration must be specified when initializing the Trainer')

log.debug('Starting training loop')
while self.state.timestamp < self.state.max_duration:
if int(self.state.timestamp.batch_in_epoch) == 0:
self.engine.run_event(Event.EPOCH_START)
Expand Down
1 change: 1 addition & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ To install composer, once inside the image, run `pip install mosaicml`.
|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.2.0 | cpu | 3.11 | `mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04` |
Expand Down
27 changes: 27 additions & 0 deletions docker/build_matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,33 @@
- mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.15.2
- AWS_OFI_NCCL_VERSION: v1.7.4-aws
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-10-aws
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
PYTHON_VERSION: '3.10'
PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121
PYTORCH_NIGHTLY_VERSION: dev20240110+cu121
PYTORCH_VERSION: 2.3.0
TAGS:
- mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.18.0
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
Expand Down
17 changes: 17 additions & 0 deletions docker/generate_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,23 @@ def _main():

pytorch_entries.append(entry)

nightly_entry_310_aws = {
'AWS_OFI_NCCL_VERSION': 'v1.7.4-aws',
'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04',
'CUDA_VERSION': '12.1.0',
'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-10-aws',
'MOFED_VERSION': '',
'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'),
'PYTHON_VERSION': '3.10',
'PYTORCH_VERSION': '2.3.0',
'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121',
'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121',
'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws'],
'TARGET': 'pytorch_stage',
'TORCHVISION_VERSION': '0.18.0'
}
pytorch_entries.append(nightly_entry_310_aws)

nightly_entry_310 = {
'AWS_OFI_NCCL_VERSION': '',
'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04',
Expand Down
148 changes: 0 additions & 148 deletions scripts/ffcv/create_ffcv_datasets.py

This file was deleted.

0 comments on commit b1ebe23

Please sign in to comment.