# Source: GitHub Actions run page ("Workflow file for this run")
# Run: Fix the FSDP.optim_state_dict_to_load OOM (#3184) #1375

# Daily test workflow.
# Triggers: a nightly schedule, pushes to the long-lived branches listed
# below, and manual dispatch.
name: Daily
on:
  schedule:
    # GitHub Actions evaluates cron expressions in UTC.
    - cron: "30 2 * * *"  # 02:30 every day
  push:
    branches:
      - dev
      - main
      - release/**
  # Intentionally empty: enables manual runs with no inputs.
  workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
concurrency:
  # github.event.pull_request.number is empty for non-pull_request events,
  # in which case the group key falls back to github.ref.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  # Runs on main and dev are never cancelled; every other ref is.
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
jobs:
  # CPU test matrix, delegated to the shared reusable workflow in
  # mosaicml/ci-testing. Each matrix entry supplies the container image, the
  # pytest marker expression, the pytest command, and the package name.
  daily-pytest-cpu:
    uses: mosaicml/ci-testing/.github/workflows/[email protected]
    strategy:
      matrix:
        # Entries prefixed with "daily-" select tests marked `daily`; the
        # others exclude them.
        # NOTE(review): clauses such as `(remote or not remote)` are always
        # true; presumably they are spelled out to override a default marker
        # filter in the pytest configuration - confirm.
        include:
          - name: cpu-3.10-2.0
            container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
            markers: not daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
          - name: cpu-3.10-2.1
            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
            markers: not daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
          - name: cpu-3.10-2.1-composer
            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
            markers: not daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: composer
          - name: cpu-3.11-2.2
            container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
            markers: not daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
          - name: cpu-doctest
            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
            markers: not daily and (remote or not remote) and not gpu and doctest
            pytest_command: coverage run -m pytest tests/test_docs.py
            composer_package_name: mosaicml
          - name: daily-cpu-3.10-2.0
            container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
            markers: daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
          - name: daily-cpu-3.10-2.1
            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
            markers: daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
          - name: daily-cpu-3.10-2.1-composer
            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
            markers: daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: composer
          - name: daily-cpu-3.11-2.2
            container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
            markers: daily and (remote or not remote) and not gpu and not doctest
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
          - name: daily-cpu-doctest
            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
            markers: daily and (remote or not remote) and not gpu and doctest
            pytest_command: coverage run -m pytest tests/test_docs.py
            composer_package_name: mosaicml
    name: ${{ matrix.name }}
    # Skip on forks, which do not have the secrets passed below.
    if: github.repository_owner == 'mosaicml'
    with:
      container: ${{ matrix.container }}
      name: ${{ matrix.name }}
      # Quoted because a leading "[" would otherwise start a YAML flow sequence.
      pip_deps: "[all]"
      pytest-command: ${{ matrix.pytest_command }}
      pytest-markers: ${{ matrix.markers }}
      composer_package_name: ${{ matrix.composer_package_name }}
      pytest-wandb-entity: "mosaicml-public-integration-tests"
      pytest-wandb-project: "integration-tests-${{ github.sha }}"
      safe_directory: composer
    secrets:
      aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
      wandb-api-key: ${{ secrets.WANDB_API_KEY }}
      code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
      code-eval-url: ${{ secrets.CODE_EVAL_URL }}
      code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
      gcs-key: ${{ secrets.GCS_KEY }}
      gcs-secret: ${{ secrets.GCS_SECRET }}
      azure-account-name: ${{ secrets.AZURE_ACCOUNT_NAME }}
      azure-account-access-key: ${{ secrets.AZURE_ACCOUNT_ACCESS_KEY }}

  # Calls the repository-local coverage workflow once every CPU matrix entry
  # has finished.
  coverage:
    uses: ./.github/workflows/coverage.yaml
    name: Coverage Results
    if: github.repository_owner == 'mosaicml'
    needs: [daily-pytest-cpu]
    with:
      download-path: artifacts

  # GPU test matrix, delegated to the shared reusable GPU workflow in
  # mosaicml/ci-testing.
  daily-pytest-gpu:
    uses: mosaicml/ci-testing/.github/workflows/[email protected]
    strategy:
      matrix:
        # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
        # on MCLOUD and not eat up all GPUs at once
        include:
          - name: "gpu-3.10-2.0"
            container: mosaicml/pytorch_vision:2.0.1_cu117-python3.10-ubuntu20.04
            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
            pytest_command: "coverage run -m pytest"
            composer_package_name: "mosaicml"
          - name: "gpu-3.10-2.1"
            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
            pytest_command: "coverage run -m pytest"
            composer_package_name: "mosaicml"
          # Named after the container's Python version (3.11), matching the
          # cpu-3.11-2.2 entry in the CPU matrix.
          - name: "gpu-3.11-2.2"
            container: mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04
            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
            pytest_command: "coverage run -m pytest"
            composer_package_name: "mosaicml"
    name: ${{ matrix.name }}
    if: github.repository_owner == 'mosaicml'
    with:
      composer_package_name: ${{ matrix.composer_package_name }}
      container: ${{ matrix.container }}
      git_repo: mosaicml/composer
      mcloud-timeout: 3600
      name: ${{ matrix.name }}
      pip_deps: "[all]"
      pytest-command: ${{ matrix.pytest_command }}
      pytest-markers: ${{ matrix.markers }}
      # Quoted so the version is passed as a string, not parsed as a float.
      # NOTE(review): this differs from the container Python versions
      # (3.10/3.11); presumably it is the Python used by the launching
      # runner - confirm against the reusable workflow.
      python-version: "3.9"
    secrets:
      mcloud-api-key: ${{ secrets.MCLOUD_DAILY_API_KEY }}