diff --git a/.ci/FILE_HEADER b/.ci/FILE_HEADER
deleted file mode 100644
index 9ae76d4864..0000000000
--- a/.ci/FILE_HEADER
+++ /dev/null
@@ -1,2 +0,0 @@
-Copyright 2022 MosaicML Composer authors
-SPDX-License-Identifier: Apache-2.0
diff --git a/CODEOWNERS b/.github/CODEOWNERS
similarity index 91%
rename from CODEOWNERS
rename to .github/CODEOWNERS
index a183caa01f..b193288b3e 100644
--- a/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -20,8 +20,8 @@
/composer/algorithms/ @mosaicml/composer-team-eng
/composer/cli/ @mosaicml/composer-team-eng
/composer/datasets/ @mosaicml/composer-team-eng
-/composer/functional/ @mosaicml/composer-team-eng @dblalock
-/composer/loggers/ @mosaicml/composer-team-eng @eracah @dakinggg
+/composer/functional/ @mosaicml/composer-team-eng
+/composer/loggers/ @mosaicml/composer-team-eng
/composer/loss/ @mosaicml/composer-team-eng
/composer/metrics/ @mosaicml/composer-team-eng
/composer/models/ @mosaicml/composer-team-eng
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index b433af6b87..825fe27053 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,8 +5,8 @@
version: 2
updates:
- - package-ecosystem: "pip" # See documentation for possible values
- directory: "/" # Location of package manifests
- schedule:
- interval: "weekly"
- open-pull-requests-limit: 5
+- package-ecosystem: "pip" # See documentation for possible values
+ directory: "/" # Location of package manifests
+ schedule:
+ interval: "weekly"
+ open-pull-requests-limit: 5
diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
index 317173e094..20bbf327b7 100644
--- a/.github/workflows/code-quality.yaml
+++ b/.github/workflows/code-quality.yaml
@@ -1,42 +1,30 @@
name: Code Quality Checks
on:
- push:
- branches:
- - dev
- - main
- - release/**
- pull_request:
workflow_call:
- workflow_dispatch:
-# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
- cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
+ inputs:
+ python_version:
+ required: true
+ type: string
+ pip_deps:
+ required: true
+ type: string
defaults:
run:
working-directory: .
jobs:
code-quality:
runs-on: ubuntu-20.04
- timeout-minutes: 10
- strategy:
- matrix:
- python_version:
- - "3.8"
- - "3.9"
- - "3.10"
- pip_deps:
- - "[dev]"
+ timeout-minutes: 15
steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python_version }}
- - name: Setup
- run: |
- set -ex
- python -m pip install --upgrade 'pip<23' wheel
- python -m pip install --upgrade .${{ matrix.pip_deps }}
- - name: Run checks
- run: |
- pre-commit run --all-files
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ inputs.python_version }}
+ - name: Setup
+ run: |
+ set -ex
+ python -m pip install --upgrade 'pip<23' wheel
+ python -m pip install --upgrade .${{ inputs.pip_deps }}
+ - name: Run checks
+ run: |
+ pre-commit run --all-files
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 1b1ccfbade..151179d524 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -13,12 +13,12 @@ name: "CodeQL"
on:
push:
- branches: [ dev, main ]
+ branches: [dev, main]
pull_request:
# The branches below must be a subset of the branches above
- branches: [ dev, main ]
+ branches: [dev, main]
schedule:
- - cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM)
+ - cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM)
jobs:
analyze:
@@ -32,9 +32,11 @@ jobs:
strategy:
fail-fast: false
matrix:
- language: [ 'python' ]
- # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
- # Learn more about CodeQL language support at https://git.io/codeql-language-support
+ language: ["python"]
+ # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript',
+ # 'python', 'ruby' ]
+ # Learn more about CodeQL language support at
+ # https://git.io/codeql-language-support
steps:
- name: Checkout repository
@@ -45,24 +47,28 @@ jobs:
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
- # If you wish to specify custom queries, you can do so here or in a config file.
- # By default, queries listed here will override any specified in a config file.
- # Prefix the list here with "+" to use these queries and those in the config file.
+ # If you wish to specify custom queries, you can do so here or in a
+ # config file.
+ # By default, queries listed here will override any specified in a
+ # config file.
+ # Prefix the list here with "+" to use these queries and those in the
+ # config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
- # If this step fails, then you should remove it and run the build manually (see below)
+ # If this step fails, then you should remove it and run the build manually
+ # (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v2
  # ℹ️ Command-line programs to run using the OS shell.
  # 📚 https://git.io/JvXDl
- # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
- # and modify them (or add more) to build your code if your project
- # uses a compiled language
+ # ✏️ If the Autobuild fails above, remove it and uncomment the following
+ # three lines and modify them (or add more) to build your code if your
+ # project uses a compiled language
- #- run: |
+ # - run: |
# make bootstrap
# make release
diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml
index 724497b1d7..f89d67ec39 100644
--- a/.github/workflows/coverage.yaml
+++ b/.github/workflows/coverage.yaml
@@ -10,23 +10,23 @@ jobs:
timeout-minutes: 5
runs-on: ubuntu-latest
steps:
- - name: Checkout Repo
- uses: actions/checkout@v3
- - name: Setup
- run: |
- set -ex
- python -m pip install --upgrade 'pip<23' wheel
- pip install coverage[toml]==6.5.0
- - name: Download artifacts
- uses: actions/download-artifact@v3
- with:
- path: ${{ inputs.download-path }}
- - name: Generate coverage report
- run: |
- set -ex
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+ - name: Setup
+ run: |
+ set -ex
+ python -m pip install --upgrade 'pip<23' wheel
+ pip install coverage[toml]==6.5.0
+ - name: Download artifacts
+ uses: actions/download-artifact@v3
+ with:
+ path: ${{ inputs.download-path }}
+ - name: Generate coverage report
+ run: |
+ set -ex
- # Flatten the coverage files
- ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done
+ # Flatten the coverage files
+ ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done
- python -m coverage combine
- python -m coverage report
+ python -m coverage combine
+ python -m coverage report
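
The flattening loop above renames each downloaded artifact's `.coverage` file into the working directory so that `coverage combine` can merge them. Here is a minimal Python sketch of the same step, assuming the one-`.coverage`-file-per-artifact-directory layout that the `download-artifact` step produces:

```python
from pathlib import Path

def flatten_coverage(download_path: str) -> None:
    """Move each artifact's .coverage file to .coverage.<artifact-name> in the CWD."""
    for artifact_dir in Path(download_path).iterdir():
        src = artifact_dir / ".coverage"
        if src.is_file():
            # `coverage combine` picks up every file matching .coverage.*
            src.rename(Path(f".coverage.{artifact_dir.name}"))

flatten_coverage("artifacts")  # matches the `download-path` input used above
```
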
diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index 588572c18f..3c65b0f4fa 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -1,12 +1,12 @@
name: Daily
on:
schedule:
- - cron: '30 2 * * *' # 2:30 every day
+ - cron: "30 2 * * *" # 2:30 every day
push:
branches:
- - dev
- - main
- - release/**
+ - dev
+ - main
+ - release/**
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
concurrency:
@@ -18,66 +18,56 @@ jobs:
strategy:
matrix:
include:
- - name: 'cpu-3.10-1.13'
- container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-3.10-2.0'
- container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-3.10-2.1'
- container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
- markers: 'not daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-3.10-2.1-composer'
- container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
- markers: 'not daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'composer'
- - name: 'cpu-vision'
- container: mosaicml/pytorch_vision:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and (remote or not remote) and not gpu and vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-doctest'
- container: mosaicml/pytorch_vision:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and (remote or not remote) and not gpu and not vision and doctest'
- pytest_command: 'coverage run -m pytest tests/test_docs.py'
- composer_package_name: 'mosaicml'
- - name: 'daily-cpu-3.10-1.13'
- container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'daily-cpu-3.10-2.0'
- container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
- markers: 'daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'daily-cpu-3.10-2.1'
- container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
- markers: 'daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'daily-cpu-3.10-2.1-composer'
- container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
- markers: 'daily and (remote or not remote) and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'composer'
- - name: 'daily-cpu-vision'
- container: mosaicml/pytorch_vision:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'daily and (remote or not remote) and not gpu and vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'daily-cpu-doctest'
- container: mosaicml/pytorch_vision:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'daily and (remote or not remote) and not gpu and not vision and doctest'
- pytest_command: 'coverage run -m pytest tests/test_docs.py'
- composer_package_name: 'mosaicml'
+ - name: cpu-3.10-2.0
+ container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
+ markers: not daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: cpu-3.10-2.1
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: not daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: cpu-3.10-2.1-composer
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: not daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: composer
+ - name: cpu-3.11-2.2
+ container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
+ markers: not daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: cpu-doctest
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: not daily and (remote or not remote) and not gpu and doctest
+ pytest_command: coverage run -m pytest tests/test_docs.py
+ composer_package_name: mosaicml
+ - name: daily-cpu-3.10-2.0
+ container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
+ markers: daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: daily-cpu-3.10-2.1
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: daily-cpu-3.10-2.1-composer
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: composer
+ - name: daily-cpu-3.11-2.2
+ container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
+ markers: daily and (remote or not remote) and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: daily-cpu-doctest
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: daily and (remote or not remote) and not gpu and doctest
+ pytest_command: coverage run -m pytest tests/test_docs.py
+ composer_package_name: mosaicml
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
@@ -86,24 +76,25 @@ jobs:
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
composer_package_name: ${{ matrix.composer_package_name }}
- pytest-s3-bucket: 'mosaicml-internal-integration-testing'
- pytest-wandb-entity: 'mosaicml-public-integration-tests'
+ pytest-s3-bucket: "mosaicml-internal-integration-testing"
+ pytest-wandb-entity: "mosaicml-public-integration-tests"
pytest-wandb-project: "integration-tests-${{ github.sha }}"
secrets:
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
wandb-api-key: ${{ secrets.WANDB_API_KEY }}
- slack-notifications-bot-token: ${{ secrets.SLACK_NOTIFICATIONS_BOT_TOKEN }}
code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
code-eval-url: ${{ secrets.CODE_EVAL_URL }}
code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
gcs-key: ${{ secrets.GCS_KEY }}
gcs-secret: ${{ secrets.GCS_SECRET }}
+ azure-account-name: ${{ secrets.AZURE_ACCOUNT_NAME }}
+ azure-account-access-key: ${{ secrets.AZURE_ACCOUNT_ACCESS_KEY }}
coverage:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
if: github.repository_owner == 'mosaicml'
- needs: [ daily-pytest-cpu ]
+ needs: [daily-pytest-cpu]
with:
download-path: artifacts
@@ -114,21 +105,21 @@ jobs:
# Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
# on MCLOUD and not eat up all GPUs at once
include:
- - name: 'gpu-3.10-1.13'
- container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
- markers: '(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'gpu-3.10-2.0'
- container: mosaicml/pytorch_vision:2.0.1_cu117-python3.10-ubuntu20.04
- markers: '(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'gpu-3.10-2.1'
- container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
- markers: '(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
+ - name: "gpu-3.10-2.0"
+ container: mosaicml/pytorch_vision:2.0.1_cu117-python3.10-ubuntu20.04
+ markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+ pytest_command: "coverage run -m pytest"
+ composer_package_name: "mosaicml"
+ - name: "gpu-3.10-2.1"
+ container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+ markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+ pytest_command: "coverage run -m pytest"
+ composer_package_name: "mosaicml"
+ - name: "gpu-3.10-2.2"
+ container: mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04
+ markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+ pytest_command: "coverage run -m pytest"
+ composer_package_name: "mosaicml"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
@@ -141,4 +132,3 @@ jobs:
python-version: 3.9
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_DAILY_API_KEY }}
- slack-notifications-bot-token: ${{ secrets.SLACK_NOTIFICATIONS_BOT_TOKEN }}
diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml
index 8ae2705700..2b6bf4893d 100644
--- a/.github/workflows/docker-configure-build-push.yaml
+++ b/.github/workflows/docker-configure-build-push.yaml
@@ -38,58 +38,58 @@ jobs:
configure-build-push:
runs-on: ubuntu-latest
steps:
- - name: Maximize Build Space on Worker
- uses: easimon/maximize-build-space@v4
- with:
- overprovision-lvm: true
- remove-dotnet: true
- remove-android: true
- remove-haskell: true
+ - name: Maximize Build Space on Worker
+ uses: easimon/maximize-build-space@v4
+ with:
+ overprovision-lvm: true
+ remove-dotnet: true
+ remove-android: true
+ remove-haskell: true
- - name: Checkout
- uses: actions/checkout@v3
+ - name: Checkout
+ uses: actions/checkout@v3
- - name: Setup QEMU
- uses: docker/setup-qemu-action@v2
+ - name: Setup QEMU
+ uses: docker/setup-qemu-action@v2
- - name: Setup Docker Buildx
- uses: docker/setup-buildx-action@v2
+ - name: Setup Docker Buildx
+ uses: docker/setup-buildx-action@v2
- - name: Login to DockerHub
- uses: docker/login-action@v2
- with:
- username: ${{ secrets.username }}
- password: ${{ secrets.password }}
+ - name: Login to DockerHub
+ uses: docker/login-action@v2
+ with:
+ username: ${{ secrets.username }}
+ password: ${{ secrets.password }}
- - name: Calculate Docker Image Variables
- run: |
- set -euo pipefail
+ - name: Calculate Docker Image Variables
+ run: |
+ set -euo pipefail
- ###################
- # Calculate the tag
- ###################
- if [ "${{ inputs.staging }}" = "true" ]; then
- STAGING_REPO=${{ inputs.staging-repo }}
- IMAGE_TAG=${STAGING_REPO}:${{ inputs.image-uuid }}
- IMAGE_CACHE="${STAGING_REPO}:${{ inputs.image-name }}-buildcache"
- else
- IMAGE_TAG=${{ inputs.tags }}
- IMAGE_CACHE="${IMAGE_TAG/,*/}-buildcache"
- fi
+ ###################
+ # Calculate the tag
+ ###################
+ if [ "${{ inputs.staging }}" = "true" ]; then
+ STAGING_REPO=${{ inputs.staging-repo }}
+ IMAGE_TAG=${STAGING_REPO}:${{ inputs.image-uuid }}
+ IMAGE_CACHE="${STAGING_REPO}:${{ inputs.image-name }}-buildcache"
+ else
+ IMAGE_TAG=${{ inputs.tags }}
+ IMAGE_CACHE="${IMAGE_TAG/,*/}-buildcache"
+ fi
- echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV}
- echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV}
+ echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV}
+ echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV}
- - name: IMAGE_TAG = ${{ env.IMAGE_TAG }}
- run: echo ${{ env.IMAGE_TAG }}
+ - name: IMAGE_TAG = ${{ env.IMAGE_TAG }}
+ run: echo ${{ env.IMAGE_TAG }}
- - name: Build and Push the Docker Image
- uses: docker/build-push-action@v3
- with:
- context: ${{ inputs.context }}
- tags: ${{ env.IMAGE_TAG }}
- target: ${{ inputs.target }}
- push: ${{ inputs.push }}
- cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
- cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
- build-args: ${{ inputs.build-args }}
+ - name: Build and Push the Docker Image
+ uses: docker/build-push-action@v3
+ with:
+ context: ${{ inputs.context }}
+ tags: ${{ env.IMAGE_TAG }}
+ target: ${{ inputs.target }}
+ push: ${{ inputs.push }}
+ cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
+ cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
+ build-args: ${{ inputs.build-args }}
diff --git a/.github/workflows/pr-code-quality.yaml b/.github/workflows/pr-code-quality.yaml
new file mode 100644
index 0000000000..26d2546e75
--- /dev/null
+++ b/.github/workflows/pr-code-quality.yaml
@@ -0,0 +1,28 @@
+name: PR Code Quality Checks
+on:
+ push:
+ branches:
+ - dev
+ - main
+ - release/**
+ pull_request:
+ workflow_dispatch:
+# Cancel old runs when a new commit is pushed to the same branch if not on main
+# or dev
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
+jobs:
+ code-quality:
+ uses: ./.github/workflows/code-quality.yaml
+ strategy:
+ matrix:
+ python_version:
+ - "3.9"
+ - "3.10"
+ - "3.11"
+ pip_deps:
+ - "[dev]"
+ with:
+ python_version: ${{ matrix.python_version }}
+ pip_deps: ${{ matrix.pip_deps }}
diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
index 989b4ded43..6eee54cb0b 100644
--- a/.github/workflows/pr-cpu.yaml
+++ b/.github/workflows/pr-cpu.yaml
@@ -2,7 +2,8 @@ name: PR CPU tests
on:
pull_request:
workflow_dispatch:
-# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
+# Cancel old runs when a new commit is pushed to the same branch if not on main
+# or dev
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
@@ -12,31 +13,21 @@ jobs:
strategy:
matrix:
include:
- - name: 'cpu-3.10-1.13'
- container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and not remote and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-3.10-2.0'
- container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and not remote and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-3.10-2.1'
- container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
- markers: 'not daily and not remote and not gpu and not vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-vision'
- container: mosaicml/pytorch_vision:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and not remote and not gpu and vision and not doctest'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
- - name: 'cpu-doctest'
- container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
- markers: 'not daily and not remote and not gpu and not vision and doctest'
- pytest_command: 'coverage run -m pytest tests/test_docs.py'
- composer_package_name: 'mosaicml'
+ - name: cpu-3.10-2.0
+ container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
+ markers: not daily and not remote and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: cpu-3.10-2.1
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: not daily and not remote and not gpu and not doctest
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
+ - name: cpu-doctest
+ container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+ markers: not daily and not remote and not gpu and doctest
+ pytest_command: coverage run -m pytest tests/test_docs.py
+ composer_package_name: mosaicml
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
@@ -49,6 +40,6 @@ jobs:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
if: github.repository_owner == 'mosaicml'
- needs: [ pytest-cpu ]
+ needs: [pytest-cpu]
with:
download-path: artifacts
diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml
index 52e009e68f..93f0b51be1 100644
--- a/.github/workflows/pr-docker.yaml
+++ b/.github/workflows/pr-docker.yaml
@@ -2,14 +2,14 @@ name: PR Docker
on:
pull_request:
branches:
- - dev
- - main
- - release/**
+ - dev
+ - main
+ - release/**
paths:
- - .github/bin/gen_docker_matrix.py
- - .github/workflows/docker-configure-build-push.yaml
- - .github/workflows/pr-docker.yaml
- - docker/**
+ - .github/bin/gen_docker_matrix.py
+ - .github/workflows/docker-configure-build-push.yaml
+ - .github/workflows/pr-docker.yaml
+ - docker/**
workflow_dispatch:
defaults:
run:
@@ -22,21 +22,23 @@ jobs:
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- - uses: actions/setup-python@v4
- with:
- python-version: 3.9
- - uses: actions/checkout@v3
- - id: set-matrix
- run: |
- # Install yaml dependency
- pip install pyyaml
+ - uses: actions/setup-python@v4
+ with:
+ python-version: 3.9
+ - uses: actions/checkout@v3
+ - id: set-matrix
+ run: |
+ # Install yaml dependency
+ pip install pyyaml
- # Override package install command for Composer image
- COMPOSER_INSTALL_COMMAND="mosaicml[all]@git+https://github.com/mosaicml/composer.git@${{ github.sha }}"
+ # Override package install command for Composer image
+ COMPOSER_INSTALL_COMMAND="mosaicml[all]@git+https://github.com/mosaicml/composer.git@${{ github.sha }}"
- # Generate build matrix
- BUILD_MATRIX=$(python .github/bin/gen_docker_matrix.py docker/build_matrix.yaml -b COMPOSER_INSTALL_COMMAND=$COMPOSER_INSTALL_COMMAND)
- echo $BUILD_MATRIX >> $GITHUB_OUTPUT
+ # Generate build matrix
+ BUILD_MATRIX=$(python .github/bin/gen_docker_matrix.py docker/build_matrix.yaml -b \
+ COMPOSER_INSTALL_COMMAND=$COMPOSER_INSTALL_COMMAND)
+
+ echo $BUILD_MATRIX >> $GITHUB_OUTPUT
stage-docker-build:
needs: build-image-matrix
uses: ./.github/workflows/docker-configure-build-push.yaml
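
`gen_docker_matrix.py` itself is not shown in this diff, so the following is only a hypothetical sketch of what such a generator might look like: it reads `docker/build_matrix.yaml`, applies `-b KEY=VALUE` overrides, and prints a `matrix=<json>` line for `$GITHUB_OUTPUT`. The real script's flags, schema, and output shape may differ.

```python
import argparse
import json

import yaml  # provided by the `pip install pyyaml` step above

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("build_matrix", help="path to docker/build_matrix.yaml")
    parser.add_argument("-b", "--build-args", nargs="*", default=[],
                        help="KEY=VALUE overrides applied to each entry")
    args = parser.parse_args()

    with open(args.build_matrix) as f:
        entries = yaml.safe_load(f)  # assumed: a list of image entries

    # Fold the overrides into every image entry (hypothetical schema)
    overrides = dict(kv.split("=", 1) for kv in args.build_args)
    for entry in entries:
        entry.update(overrides)

    # The workflow appends this line to $GITHUB_OUTPUT
    print(f"matrix={json.dumps({'include': entries})}")

if __name__ == "__main__":
    main()
```
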
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index 2c818b7229..1b02fc9c51 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -2,7 +2,8 @@ name: PR GPU tests
on:
pull_request_target:
workflow_dispatch:
-# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
+# Cancel old runs when a new commit is pushed to the same branch if not on main
+# or dev
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
@@ -12,11 +13,11 @@ jobs:
strategy:
matrix:
include:
- - name: 'gpu-3.10-2.1'
- container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
- markers: 'not daily and not remote and gpu and (doctest or not doctest)'
- pytest_command: 'coverage run -m pytest'
- composer_package_name: 'mosaicml'
+ - name: gpu-3.10-2.1
+ container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+ markers: not daily and not remote and gpu and (doctest or not doctest)
+ pytest_command: coverage run -m pytest
+ composer_package_name: mosaicml
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
diff --git a/.github/workflows/pytest-cpu.yaml b/.github/workflows/pytest-cpu.yaml
index 152501ad64..af95f8918f 100644
--- a/.github/workflows/pytest-cpu.yaml
+++ b/.github/workflows/pytest-cpu.yaml
@@ -45,55 +45,53 @@ on:
required: false
gcs-secret:
required: false
+ azure-account-name:
+ required: false
+ azure-account-access-key:
+ required: false
jobs:
pytest-cpu:
timeout-minutes: 30
runs-on: ubuntu-latest
container: ${{ inputs.container }}
steps:
- - name: Checkout Repo
- uses: actions/checkout@v3
- - name: Setup
- run: |
- set -ex
- export PATH=/composer-python:$PATH
- export COMPOSER_PACKAGE_NAME='${{ inputs.composer_package_name }}'
- python -m pip install --upgrade 'pip<23' wheel
- python -m pip install --upgrade .[all]
- - name: Run Tests
- id: tests
- run: |
- set -ex
- export PATH=/composer-python:$PATH
- export WANDB_API_KEY='${{ secrets.wandb-api-key }}'
- export WANDB_ENTITY='${{ inputs.pytest-wandb-entity }}'
- export WANDB_PROJECT='${{ inputs.pytest-wandb-project }}'
- export AWS_ACCESS_KEY_ID='${{ secrets.aws-access-key-id }}'
- export AWS_SECRET_ACCESS_KEY='${{ secrets.aws-secret-access-key }}'
- export CODE_EVAL_DEVICE='${{ secrets.code-eval-device }}'
- export CODE_EVAL_URL='${{ secrets.code-eval-url }}'
- export CODE_EVAL_APIKEY='${{ secrets.code-eval-apikey }}'
- export GCS_KEY='${{ secrets.gcs-key }}'
- export GCS_SECRET='${{ secrets.gcs-secret }}'
- export S3_BUCKET='${{ inputs.pytest-s3-bucket }}'
- export COMMON_ARGS="-v --durations=20 -m '${{ inputs.pytest-markers }}' --s3_bucket '$S3_BUCKET' -o tmp_path_retention_policy=none"
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+ - name: Setup
+ run: |
+ set -ex
+ export PATH=/composer-python:$PATH
+ export COMPOSER_PACKAGE_NAME='${{ inputs.composer_package_name }}'
+ python -m pip install --upgrade 'pip<23' wheel
+ python -m pip install --upgrade .[all]
+ - name: Run Tests
+ id: tests
+ run: |
+ set -ex
+ export PATH=/composer-python:$PATH
+ export WANDB_API_KEY='${{ secrets.wandb-api-key }}'
+ export WANDB_ENTITY='${{ inputs.pytest-wandb-entity }}'
+ export WANDB_PROJECT='${{ inputs.pytest-wandb-project }}'
+ export AWS_ACCESS_KEY_ID='${{ secrets.aws-access-key-id }}'
+ export AWS_SECRET_ACCESS_KEY='${{ secrets.aws-secret-access-key }}'
+ export CODE_EVAL_DEVICE='${{ secrets.code-eval-device }}'
+ export CODE_EVAL_URL='${{ secrets.code-eval-url }}'
+ export CODE_EVAL_APIKEY='${{ secrets.code-eval-apikey }}'
+ export GCS_KEY='${{ secrets.gcs-key }}'
+ export GCS_SECRET='${{ secrets.gcs-secret }}'
+ export AZURE_ACCOUNT_NAME='${{ secrets.azure-account-name }}'
+ export AZURE_ACCOUNT_ACCESS_KEY='${{ secrets.azure-account-access-key }}'
+ export S3_BUCKET='${{ inputs.pytest-s3-bucket }}'
+ export COMMON_ARGS="-v --durations=20 -m '${{ inputs.pytest-markers }}' --s3_bucket '$S3_BUCKET' \
+ -o tmp_path_retention_policy=none"
- # Necessary to run git diff for doctests
- git config --global --add safe.directory /__w/composer/composer
- make test PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS --codeblocks"
- make test-dist PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2
+ # Necessary to run git diff for doctests
+ git config --global --add safe.directory /__w/composer/composer
+ make test PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS --codeblocks"
+ make test-dist PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2
- python -m coverage combine
- - uses: actions/upload-artifact@v3
- with:
- name: coverage-${{ github.sha }}-${{ inputs.name }}
- path: .coverage
- - name: Notify slack fail
- if: failure() && !cancelled() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev') && (github.event_name != 'pull_request' && github.event_name != 'pull_request_target')
- env:
- SLACK_BOT_TOKEN: ${{ secrets.slack-notifications-bot-token }}
- uses: voxmedia/github-action-slack-notify-build@v1
- with:
- channel: composer-issues
- status: FAILED
- color: danger
+ python -m coverage combine
+ - uses: actions/upload-artifact@v3
+ with:
+ name: coverage-${{ github.sha }}-${{ inputs.name }}
+ path: .coverage
diff --git a/.github/workflows/pytest-gpu.yaml b/.github/workflows/pytest-gpu.yaml
index 100de255e8..550a306746 100644
--- a/.github/workflows/pytest-gpu.yaml
+++ b/.github/workflows/pytest-gpu.yaml
@@ -38,58 +38,52 @@ on:
required: false
jobs:
pytest-gpu:
- timeout-minutes: 60 # ${{ inputs.gha-timeout }}; for some reason this cannot be made an input
+ timeout-minutes: 60  # ${{ inputs.gha-timeout }}; for some reason this cannot be made an input
runs-on: ubuntu-latest
env:
MOSAICML_API_KEY: ${{ secrets.mcloud-api-key }}
steps:
- - name: Checkout Repo
- uses: actions/checkout@v3
- - name: Setup Python
- uses: actions/setup-python@v4
- with:
- python-version: ${{ inputs.python-version }}
- - name: Cache pip
- uses: actions/cache@v3
- with:
- # This path is specific to Ubuntu
- path: ~/.cache/pip
- # Look to see if there is a cache hit for the corresponding requirements file
- key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
- restore-keys: |
- ${{ runner.os }}-pip-
- ${{ runner.os }}-
- - name: Setup MCLI
- run: |
- set -ex
- python -m pip install mosaicml-cli
- mcli version
- - name: Submit Run
- id: tests
- run: |
- set -ex
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ inputs.python-version }}
+ - name: Cache pip
+ uses: actions/cache@v3
+ with:
+ # This path is specific to Ubuntu
+ path: ~/.cache/pip
+ # Look to see if there is a cache hit for the corresponding requirements file
+ key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
+ restore-keys: |
+ ${{ runner.os }}-pip-
+ ${{ runner.os }}-
+ - name: Setup MCLI
+ run: |
+ set -ex
+ python -m pip install mosaicml-cli
+ mcli version
+ - name: Submit Run
+ id: tests
+ run: |
+ set -ex
- PR_NUMBER="$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH")"
- REF_ARGS=""
+ PR_NUMBER="$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH")"
+ REF_ARGS=""
- # Use the PR number if it exists, commit SHA for protected branches and the branch name otherwise
- if [ -z "$PR_NUMBER" ] || [ "$PR_NUMBER" = "null" ]; then
- if [[ "$GITHUB_REF" =~ "refs/heads/dev" || "$GITHUB_REF" =~ "refs/heads/main" || "$GITHUB_REF" =~ "refs/heads/release" ]]; then
- REF_ARGS="--git_commit $GITHUB_SHA"
- else
- REF_ARGS="--git_branch $GITHUB_REF_NAME"
- fi
+ # Use the PR number if it exists, commit SHA for protected branches and the branch name otherwise
+ if [ -z "$PR_NUMBER" ] || [ "$PR_NUMBER" = "null" ]; then
+ if [[ "$GITHUB_REF" =~ "refs/heads/dev" || "$GITHUB_REF" =~ "refs/heads/main" || \
+ "$GITHUB_REF" =~ "refs/heads/release" ]]; then
+ REF_ARGS="--git_commit $GITHUB_SHA"
else
- REF_ARGS="--pr_number $PR_NUMBER"
+ REF_ARGS="--git_branch $GITHUB_REF_NAME"
fi
+ else
+ REF_ARGS="--pr_number $PR_NUMBER"
+ fi
- python .github/mcli/mcli_pytest.py --image '${{ inputs.container }}' --pip_package_name '${{ inputs.composer_package_name }}' --pytest_markers '${{ inputs.pytest-markers }}' --pytest_command '${{ inputs.pytest-command }}' --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS}
- - name: Notify slack fail
- if: failure() && !cancelled() && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev') && (github.event_name != 'pull_request' && github.event_name != 'pull_request_target')
- env:
- SLACK_BOT_TOKEN: ${{ secrets.slack-notifications-bot-token }}
- uses: voxmedia/github-action-slack-notify-build@v1
- with:
- channel: composer-issues
- status: FAILED
- color: danger
+ python .github/mcli/mcli_pytest.py --image '${{ inputs.container }}' --pip_package_name \
+ '${{ inputs.composer_package_name }}' --pytest_markers '${{ inputs.pytest-markers }}' --pytest_command \
+ '${{ inputs.pytest-command }}' --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS}
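
For readability, here is the branch-selection logic of the "Submit Run" step above rendered as a Python sketch; the workflow itself does this in shell with `jq`, and `ref_args` is a name introduced here purely for illustration:

```python
import json
import os

def ref_args(event_path: str) -> list:
    """Pick mcli args: PR number if present, commit SHA on protected branches, else branch name."""
    with open(event_path) as f:
        event = json.load(f)
    pr_number = event.get("pull_request", {}).get("number")
    if pr_number is not None:
        return ["--pr_number", str(pr_number)]
    ref = os.environ["GITHUB_REF"]
    # Protected branches pin the run to the exact commit
    if any(b in ref for b in ("refs/heads/dev", "refs/heads/main", "refs/heads/release")):
        return ["--git_commit", os.environ["GITHUB_SHA"]]
    return ["--git_branch", os.environ["GITHUB_REF_NAME"]]
```
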
diff --git a/.github/workflows/release-docker.yaml b/.github/workflows/release-docker.yaml
index 17a718021e..e992663994 100644
--- a/.github/workflows/release-docker.yaml
+++ b/.github/workflows/release-docker.yaml
@@ -21,18 +21,18 @@ jobs:
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- - uses: actions/setup-python@v4
- with:
- python-version: 3.9
- - uses: actions/checkout@v3
- - id: set-matrix
- run: |
- # Install yaml dependency
- pip install pyyaml
+ - uses: actions/setup-python@v4
+ with:
+ python-version: 3.9
+ - uses: actions/checkout@v3
+ - id: set-matrix
+ run: |
+ # Install yaml dependency
+ pip install pyyaml
- # Generate build matrix
- BUILD_MATRIX=$(python .github/bin/gen_docker_matrix.py docker/build_matrix.yaml)
- echo $BUILD_MATRIX >> $GITHUB_OUTPUT
+ # Generate build matrix
+ BUILD_MATRIX=$(python .github/bin/gen_docker_matrix.py docker/build_matrix.yaml)
+ echo $BUILD_MATRIX >> $GITHUB_OUTPUT
stage-docker-build:
needs: build-image-matrix
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 05d78b4832..50032973ca 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -3,88 +3,99 @@ name: Release
on:
push:
tags:
- - "v*"
+ - "v*"
workflow_dispatch:
jobs:
code-quality:
uses: ./.github/workflows/code-quality.yaml
+ strategy:
+ matrix:
+ python_version:
+ - "3.9"
+ - "3.10"
+ - "3.11"
+ pip_deps:
+ - "[dev]"
+ with:
+ python_version: ${{ matrix.python_version }}
+ pip_deps: ${{ matrix.pip_deps }}
pypi-packaging:
name: Build and Publish mosaicml PyPI Package
needs:
- - code-quality
+ - code-quality
runs-on: ubuntu-latest
steps:
- - name: Checkout source
- uses: actions/checkout@v3
+ - name: Checkout source
+ uses: actions/checkout@v3
- - name: Set up Python
- uses: actions/setup-python@v3
- with:
- python-version: "3.9"
+ - name: Set up Python
+ uses: actions/setup-python@v3
+ with:
+ python-version: "3.9"
- - name: Build source and wheel distributions
- run: |
- if [[ "${{ github.ref }}" =~ refs\/tags\/v ]]; then
- PYPI_PACKAGE_NAME="mosaicml"
- else
- PYPI_PACKAGE_NAME="mosaicml-test-$(date +%Y%m%d%H%M%S)"
- fi
+ - name: Build source and wheel distributions
+ run: |
+ if [[ "${{ github.ref }}" =~ refs\/tags\/v ]]; then
+ PYPI_PACKAGE_NAME="mosaicml"
+ else
+ PYPI_PACKAGE_NAME="mosaicml-test-$(date +%Y%m%d%H%M%S)"
+ fi
- python -m pip install --upgrade build twine
- COMPOSER_PACKAGE_NAME=$PYPI_PACKAGE_NAME python -m build
- twine check --strict dist/*
+ python -m pip install --upgrade build twine
+ COMPOSER_PACKAGE_NAME=$PYPI_PACKAGE_NAME python -m build
+ twine check --strict dist/*
- - name: Publish 📦 to PyPI
- uses: pypa/gh-action-pypi-publish@release/v1
- if: contains(github.ref, 'refs/tags/v')
- with:
- user: __token__
- password: ${{ secrets.PROD_PYPI_API_TOKEN }}
+ - name: Publish 📦 to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ if: contains(github.ref, 'refs/tags/v')
+ with:
+ user: __token__
+ password: ${{ secrets.PROD_PYPI_API_TOKEN }}
- - name: Publish distribution 📦 to Test PyPI
- uses: pypa/gh-action-pypi-publish@release/v1
- if: contains(github.ref, 'refs/heads/') || contains(github.ref, 'refs/pull/')
- with:
- user: __token__
- password: ${{ secrets.TEST_PYPI_API_TOKEN }}
- repository_url: https://test.pypi.org/legacy/
+ - name: Publish distribution 📦 to Test PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ if: contains(github.ref, 'refs/heads/') || contains(github.ref, 'refs/pull/')
+ with:
+ user: __token__
+ password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+ repository_url: https://test.pypi.org/legacy/
pypi-composer-packaging:
name: Build and Publish composer PyPI Package
needs:
- - code-quality
+ - code-quality
if: contains(github.ref, 'refs/tags/v')
runs-on: ubuntu-latest
steps:
- - name: Checkout source
- uses: actions/checkout@v3
+ - name: Checkout source
+ uses: actions/checkout@v3
- - name: Set up Python
- uses: actions/setup-python@v3
- with:
- python-version: "3.9"
+ - name: Set up Python
+ uses: actions/setup-python@v3
+ with:
+ python-version: "3.9"
- - name: Build source and wheel distributions
- run: |
- PYPI_PACKAGE_NAME="composer"
+ - name: Build source and wheel distributions
+ run: |
+ PYPI_PACKAGE_NAME="composer"
- python -m pip install --upgrade build twine
- COMPOSER_PACKAGE_NAME=$PYPI_PACKAGE_NAME python -m build
- twine check --strict dist/*
+ python -m pip install --upgrade build twine
+ COMPOSER_PACKAGE_NAME=$PYPI_PACKAGE_NAME python -m build
+ twine check --strict dist/*
- - name: Publish 📦 to PyPI
- uses: pypa/gh-action-pypi-publish@release/v1
- with:
- user: __token__
- password: ${{ secrets.PROD_COMPOSER_PYPI_API_TOKEN }}
+ - name: Publish 📦 to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ user: __token__
+ password: ${{ secrets.PROD_COMPOSER_PYPI_API_TOKEN }}
production-docker-images:
name: Build and Push Production Docker Images
needs:
- - pypi-packaging
- - pypi-composer-packaging
+ - pypi-packaging
+ - pypi-composer-packaging
uses: ./.github/workflows/release-docker.yaml
if: contains(github.ref, 'refs/tags/v')
secrets:
diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml
index 00121f935b..429cc40b1d 100644
--- a/.github/workflows/smoketest.yaml
+++ b/.github/workflows/smoketest.yaml
@@ -2,13 +2,14 @@ name: Smoketest
on:
push:
branches:
- - dev
- - main
- - release/**
+ - dev
+ - main
+ - release/**
pull_request:
workflow_call:
workflow_dispatch:
-# Cancel old runs when a new commit is pushed to the same branch if not on main or dev
+# Cancel old runs when a new commit is pushed to the same branch if not on main
+# or dev
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
@@ -22,20 +23,20 @@ jobs:
strategy:
matrix:
python_version:
- - "3.8"
- - "3.9"
- - "3.10"
+ - "3.9"
+ - "3.10"
+ - "3.11"
steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python_version }}
- - name: Setup
- run: |
- set -ex
- python -m pip install --upgrade 'pip<23' wheel
- python -m pip install --upgrade .
- python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1
- - name: Run checks
- run: |
- pytest tests/test_smoketest.py
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python_version }}
+ - name: Setup
+ run: |
+ set -ex
+ python -m pip install --upgrade 'pip<23' wheel
+ python -m pip install --upgrade .
+ python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1
+ - name: Run checks
+ run: |
+ pytest tests/test_smoketest.py
diff --git a/.gitignore b/.gitignore
index 9b66bf52db..789c75183b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,6 +136,9 @@ venv/
# WandB
wandb/
+# Neptune
+.neptune/
+
# Spacemacs
._#*
.#*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d0f8595580..4f89154571 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,125 +1,135 @@
default_language_version:
python: python3
repos:
- - repo: https://github.com/astral-sh/ruff-pre-commit
- # Ruff version.
- rev: v0.0.282
- hooks:
- - id: ruff
- args: [--fix, --exit-non-zero-on-fix]
+- repo: https://github.com/astral-sh/ruff-pre-commit
+ # Ruff version.
+ rev: v0.0.282
+ hooks:
+ - id: ruff
+ args: [--fix, --exit-non-zero-on-fix]
- - repo: https://github.com/google/yapf
- rev: v0.32.0
- hooks:
- - id: yapf
- name: yapf
- description: "A formatter for Python files."
- entry: yapf
- args: [-i, -vv, -p] #inplace
- language: python
- types: [python]
- additional_dependencies:
- - "toml"
- - repo: https://github.com/pycqa/isort
- hooks:
- - id: isort
- rev: 5.12.0
- # - repo: https://github.com/pycqa/pylint
- # hooks:
- # - id: pylint
- # entry: pylint
- # args: ['composer', 'examples', 'tests']
- # language: python
- # types: [python]
- # require_serial: true
- # rev: v2.12.2
- - repo: https://github.com/PyCQA/pydocstyle
- hooks:
- - id: pydocstyle
- name: pydocstyle
- entry: pydocstyle
- language: python
- types: [python]
- exclude: '(?:tests|.ci|composer\/algorithms|composer\/datasets|composer\/models)\/.*|composer\/trainer\/activation_checkpointing.py'
- additional_dependencies:
- - "toml"
- rev: 6.1.1
- - repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.1.0
- hooks:
- - id: check-added-large-files
- - id: check-ast
- - id: check-builtin-literals
- - id: check-case-conflict
- - id: check-docstring-first
- - id: check-executables-have-shebangs
- - id: check-json
- - id: check-shebang-scripts-are-executable
- - id: pretty-format-json
- args:
- - --autofix
- - --no-sort-keys
- - --indent=1
- - --no-ensure-ascii
- - id: check-merge-conflict
- - id: check-symlinks
- - id: check-toml
- - id: check-vcs-permalinks
- - id: check-xml
- - id: check-yaml
- - id: debug-statements
- - id: destroyed-symlinks
- - id: double-quote-string-fixer
- - id: end-of-file-fixer
- - id: fix-byte-order-marker
- - id: mixed-line-ending
- - id: trailing-whitespace
- - repo: https://github.com/Lucas-C/pre-commit-hooks
- rev: v1.1.13
- hooks:
- - id: insert-license
- args:
- - --license-filepath
- - .ci/FILE_HEADER
- - --comment-style
- - "#"
- types: [python]
- exclude: 'composer\/trainer\/activation_checkpointing.py'
+- repo: https://github.com/google/yapf
+ rev: v0.32.0
+ hooks:
+ - id: yapf
+ name: yapf
+ description: "A formatter for Python files."
+ entry: yapf
+ args: [-i, -vv, -p] # inplace
+ language: python
+ types: [python]
+ additional_dependencies:
+ - "toml"
+- repo: https://github.com/pycqa/isort
+ hooks:
+ - id: isort
+ rev: 5.12.0
+# - repo: https://github.com/pycqa/pylint
+# hooks:
+# - id: pylint
+# entry: pylint
+# args: ['composer', 'examples', 'tests']
+# language: python
+# types: [python]
+# require_serial: true
+# rev: v2.12.2
+- repo: https://github.com/PyCQA/pydocstyle
+ hooks:
+ - id: pydocstyle
+ name: pydocstyle
+ entry: pydocstyle
+ language: python
+ types: [python]
+ exclude: "(?:tests|.ci|composer\/algorithms|composer\/datasets|composer\/models)\/.*|composer\/trainer\/activation_checkpointing.py"
+ additional_dependencies:
+ - "toml"
+ rev: 6.1.1
+- repo: https://github.com/adrienverge/yamllint.git
+ rev: v1.28.0
+ hooks:
+ - id: yamllint
+ name: yamllint
+ description: This hook runs yamllint.
+ entry: yamllint
+ language: python
+ types: [file, yaml]
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.1.0
+ hooks:
+ - id: check-added-large-files
+ - id: check-ast
+ - id: check-builtin-literals
+ - id: check-case-conflict
+ - id: check-docstring-first
+ - id: check-executables-have-shebangs
+ - id: check-json
+ - id: check-shebang-scripts-are-executable
+ - id: pretty-format-json
+ args:
+ - --autofix
+ - --no-sort-keys
+ - --indent=1
+ - --no-ensure-ascii
+ - id: check-merge-conflict
+ - id: check-symlinks
+ - id: check-toml
+ - id: check-vcs-permalinks
+ - id: check-xml
+ - id: check-yaml
+ - id: debug-statements
+ - id: destroyed-symlinks
+ - id: double-quote-string-fixer
+ - id: end-of-file-fixer
+ - id: fix-byte-order-marker
+ - id: mixed-line-ending
+ - id: trailing-whitespace
+- repo: https://github.com/Lucas-C/pre-commit-hooks
+ rev: v1.5.4
+ hooks:
+ - id: insert-license
+ args:
+ - --license-filepath
+ - .pre-commit/FILE_HEADER
+ - --comment-style
+ - "#"
+ - --allow-past-years
+ types: [python]
+ exclude: "composer\/trainer\/activation_checkpointing.py"
- - repo: https://github.com/kynan/nbstripout
- rev: 0.5.0
- hooks:
- - id: nbstripout
- types:
- - "jupyter"
- args:
- # Strip all the metadata that vscode or colab may add to a notebook
- - --strip-empty-cells
- - --extra-keys
- - >
- metadata.colab metadata.interpreter metadata.accelerator
- metadata.kernelspec metadata.language_info.version
- cell.metadata.heading_collapsed metadata.name metadata.nbconvert_exporter
- metadata.version metadata.vscode
- - repo: local
- hooks:
- - id: pyright
- name: pyright
- entry: pyright
- language: node
- types: [python]
- pass_filenames: false
- args: [--warnings]
- additional_dependencies: ["pyright@1.1.256"]
- - repo: https://github.com/trufflesecurity/trufflehog.git
- rev: v3.40.0
- hooks:
- - id: trufflehog
- name: secret scan
- entry: trufflehog filesystem ./
- args:
- - --only-verified
- - --fail
- - --exclude-paths=./.github/secrets/exclude.yaml
+- repo: https://github.com/kynan/nbstripout
+ rev: 0.5.0
+ hooks:
+ - id: nbstripout
+ types:
+ - "jupyter"
+ args:
+ # Strip all the metadata that vscode or colab may add to a notebook
+ - --strip-empty-cells
+ - --extra-keys
+ - >
+ metadata.colab metadata.interpreter metadata.accelerator
+ metadata.kernelspec metadata.language_info.version
+ cell.metadata.heading_collapsed metadata.name metadata.nbconvert_exporter
+ metadata.version metadata.vscode
+- repo: local
+ hooks:
+ - id: pyright
+ name: pyright
+ entry: pyright
+ language: node
+ types: [python]
+ pass_filenames: false
+ args: [--warnings]
+ additional_dependencies: ["pyright@1.1.310"]
+- repo: https://github.com/trufflesecurity/trufflehog.git
+ rev: v3.40.0
+ hooks:
+ - id: trufflehog
+ name: secret scan
+ entry: trufflehog filesystem ./
+ args:
+ - --only-verified
+ - --fail
+ - --exclude-paths=./.github/secrets/exclude.yaml
exclude: .ci\/release_tests\/.*
diff --git a/.pre-commit/FILE_HEADER b/.pre-commit/FILE_HEADER
new file mode 100644
index 0000000000..a37a1c719f
--- /dev/null
+++ b/.pre-commit/FILE_HEADER
@@ -0,0 +1,2 @@
+Copyright 2024 MosaicML Composer authors
+SPDX-License-Identifier: Apache-2.0
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 6ebcd8535e..f8a390536d 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -7,17 +7,17 @@ version: 2
# Specify build system and tool dependencies
build:
- os: "ubuntu-20.04"
- tools:
- python: "3.8"
+ os: "ubuntu-20.04"
+ tools:
+ python: "3.10"
# Build documentation in the docs/ directory with Sphinx
sphinx:
- builder: html
- configuration: docs/source/conf.py
+ builder: html
+ configuration: docs/source/conf.py
# Optionally set the version of Python and requirements required to build your docs
python:
- install:
- - method: pip
- path: .[all]
+ install:
+ - method: pip
+ path: .[all]
diff --git a/.yamllint.yaml b/.yamllint.yaml
index 3e760e6ae0..8d8617a1cc 100644
--- a/.yamllint.yaml
+++ b/.yamllint.yaml
@@ -1,11 +1,11 @@
yaml-files:
- - "*.yaml"
- - "*.yml"
- - .yamllint
+- "*.yaml"
+- "*.yml"
+- .yamllint
ignore: |
wandb
- *
+ docker/build_matrix.yaml
rules:
braces:
@@ -14,8 +14,7 @@ rules:
forbid: false
colons: enable
commas: enable
- comments:
- level: warning
+ comments: enable
comments-indentation: enable
document-end:
present: false
@@ -26,12 +25,12 @@ rules:
hyphens: enable
indentation:
spaces: 2
- indent-sequences: true
+ indent-sequences: false
check-multi-line-strings: false
key-duplicates: enable
key-ordering: disable
line-length:
- max: 200
+ max: 120
allow-non-breakable-words: true
allow-non-breakable-inline-mappings: true
new-line-at-end-of-file: enable
diff --git a/README.md b/README.md
index 17a6e41cfd..9ab992be3a 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,7 @@ Composer is built to automate away low-level pain points and headaches so you ca
Integrate with the tools you know and love for experiment tracking and data streaming.
- **Cloud integrations**: Our Checkpointing and logging features have first-class support for remote storage and loading from Cloud bucket (OCI, GCP, AWS S3).
- **********Experiment tracking:********** Weights and Biases, MLFlow, and CometML — the choice is yours, easily log your data to your favorite platform.
+- **********Experiment tracking:********** Weights and Biases, MLFlow, CometML, and neptune.ai — the choice is yours, easily log your data to your favorite platform.
# **🚀 Getting Started**
@@ -135,26 +135,55 @@ Here is a code snippet demonstrating our Trainer on the MNIST dataset.
```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from composer import Trainer
-from composer.models import mnist_model
+from composer.models import ComposerClassifier
from composer.algorithms import LabelSmoothing, CutMix, ChannelsLast
+class Model(nn.Module):
+ """Toy convolutional neural network architecture in pytorch for MNIST."""
+
+ def __init__(self, num_classes: int = 10):
+ super().__init__()
+
+ self.num_classes = num_classes
+
+ self.conv1 = nn.Conv2d(1, 16, (3, 3), padding=0)
+ self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=0)
+ self.bn = nn.BatchNorm2d(32)
+ self.fc1 = nn.Linear(32 * 16, 32)
+ self.fc2 = nn.Linear(32, num_classes)
+
+ def forward(self, x):
+ out = self.conv1(x)
+ out = F.relu(out)
+ out = self.conv2(out)
+ out = self.bn(out)
+ out = F.relu(out)
+ out = F.adaptive_avg_pool2d(out, (4, 4))
+ out = torch.flatten(out, 1, -1)
+ out = self.fc1(out)
+ out = F.relu(out)
+ return self.fc2(out)
+
transform = transforms.Compose([transforms.ToTensor()])
dataset = datasets.MNIST("data", train=True, download=True, transform=transform)
train_dataloader = DataLoader(dataset, batch_size=128)
trainer = Trainer(
- model=mnist_model(num_classes=10),
+ model=ComposerClassifier(module=Model(), num_classes=10),
train_dataloader=train_dataloader,
max_duration="2ep",
algorithms=[
LabelSmoothing(smoothing=0.1),
CutMix(alpha=1.0),
ChannelsLast(),
- ]
+ ],
)
trainer.fit()
```
diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md
index 274c10ce9c..4943a9db58 100644
--- a/STYLE_GUIDE.md
+++ b/STYLE_GUIDE.md
@@ -227,22 +227,23 @@ All imports in composer should be absolute -- that is, they do not begin with a
1. If a dependency is not core to Composer (e.g. it is for a model, dataset, algorithm, or some callbacks):
1. It must be specified in a entry of the `extra_deps` dictionary of [setup.py](setup.py).
This dictionary groups dependencies that can be conditionally installed. An entry named `foo`
- can be installed with `pip install 'mosaicml[foo]'`. For example, running `pip install 'mosaicml[unet]'`
- will install everything in `install_requires`, along with `monai` and `scikit-learn`.
+ can be installed with `pip install 'mosaicml[foo]'`. For example, running `pip install 'mosaicml[system_metrics_monitor]'`
+ will install everything in `install_requires`, along with `pynvml`.
   1. It must also be specified in the `run_constrained` and the `test.requires` sections.
1. The import must be conditionally imported in the code. For example:
```python
+ from composer import Callback
from composer.utils import MissingConditionalImportError
- def unet():
+ class SystemMetricsMonitor(Callback):
try:
- import monai
+ import pynvml
except ImportError as e:
- raise MissingConditionalImportError(extra_deps_group="unet",
- conda_package="monai",
+ raise MissingConditionalImportError(extra_deps_group="system_metrics_monitor",
+ conda_package="pynvml",
conda_channel="conda-forge",) from e
```
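
Since the snippet above is abbreviated, here is a fuller, runnable sketch of the conditional-import pattern; the callback body is illustrative, not Composer's actual `SystemMetricsMonitor` implementation:

```python
from composer import Callback
from composer.utils import MissingConditionalImportError

class SystemMetricsMonitor(Callback):

    def __init__(self) -> None:
        try:
            import pynvml
        except ImportError as e:
            # Points users at `pip install 'mosaicml[system_metrics_monitor]'`
            raise MissingConditionalImportError(
                extra_deps_group='system_metrics_monitor',
                conda_package='pynvml',
                conda_channel='conda-forge',
            ) from e
        pynvml.nvmlInit()  # illustrative use of the optional dependency
```
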
diff --git a/composer/_version.py b/composer/_version.py
index a41361e246..6a46c95e08 100644
--- a/composer/_version.py
+++ b/composer/_version.py
@@ -3,4 +3,4 @@
"""The Composer Version."""
-__version__ = '0.17.2'
+__version__ = '0.19.1'
diff --git a/composer/algorithms/alibi/attention_surgery_functions/__init__.py b/composer/algorithms/alibi/attention_surgery_functions/__init__.py
index cb27f89f2d..207f958b58 100644
--- a/composer/algorithms/alibi/attention_surgery_functions/__init__.py
+++ b/composer/algorithms/alibi/attention_surgery_functions/__init__.py
@@ -6,7 +6,8 @@
from composer.utils import MissingConditionalImportError
try:
- from composer.algorithms.alibi.attention_surgery_functions import _bert, _gpt2 # pyright: reportUnusedImport=none
+ from composer.algorithms.alibi.attention_surgery_functions import _bert # pyright: ignore[reportUnusedImport]
+ from composer.algorithms.alibi.attention_surgery_functions import _gpt2 # pyright: ignore[reportUnusedImport]
from composer.algorithms.alibi.attention_surgery_functions.utils import policy_registry
except ImportError as e:
raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='transformers') from e
diff --git a/composer/algorithms/alibi/attention_surgery_functions/_bert.py b/composer/algorithms/alibi/attention_surgery_functions/_bert.py
index 915e940cad..c2a7bb3bd5 100644
--- a/composer/algorithms/alibi/attention_surgery_functions/_bert.py
+++ b/composer/algorithms/alibi/attention_surgery_functions/_bert.py
@@ -1,6 +1,7 @@
# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0
+import copy
import math
from types import MethodType
from typing import Optional, Tuple
@@ -20,13 +21,14 @@ def bert_embedding_converter(module: torch.nn.Module, module_index: int, max_seq
"""
assert isinstance(module, (BertEmbeddings, RobertaEmbeddings))
del module_index # unused
- zero_and_freeze_expand_position_embeddings(module,
+ new_module = copy.deepcopy(module)
+ zero_and_freeze_expand_position_embeddings(new_module,
max_sequence_length,
position_embedding_attribute='position_embeddings')
- module_device = next(module.parameters()).device
- module.register_buffer('position_ids', torch.arange(max_sequence_length).expand((1, -1)).to(module_device))
- return module
+ module_device = next(new_module.parameters()).device
+ new_module.register_buffer('position_ids', torch.arange(max_sequence_length).expand((1, -1)).to(module_device))
+ return new_module
@policy_registry.register(BertSelfAttention, RobertaSelfAttention)
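
A minimal, self-contained illustration of why the `deepcopy` introduced above matters: the surgery function now returns a modified copy rather than mutating the caller's module in place. `safe_converter` is a hypothetical stand-in for the real `bert_embedding_converter`:

```python
import copy

import torch

def safe_converter(module: torch.nn.Module, max_sequence_length: int) -> torch.nn.Module:
    """Return a modified copy, leaving the caller's module untouched."""
    new_module = copy.deepcopy(module)
    new_module.register_buffer('position_ids',
                               torch.arange(max_sequence_length).expand((1, -1)))
    return new_module

original = torch.nn.Linear(4, 4)
converted = safe_converter(original, max_sequence_length=8)
assert not hasattr(original, 'position_ids')  # the original was not mutated
assert hasattr(converted, 'position_ids')
```
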
diff --git a/composer/algorithms/blurpool/README.md b/composer/algorithms/blurpool/README.md
index f99e1fb275..24b25d221a 100644
--- a/composer/algorithms/blurpool/README.md
+++ b/composer/algorithms/blurpool/README.md
@@ -56,9 +56,7 @@ def training_loop(model, train_loader):
-```python
-from composer.models import composer_deeplabv3
-
-model = composer_deeplabv3(num_classes=150,
- backbone_arch="resnet101",
- backbone_weights="IMAGENET1K_V2",
- sync_bn=False
-)
-```
-
-## Architecture
-
-Based on [Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611)
-
-
-
-
-
-
-- **Backbone network**: converts the input image into a feature map.
-  * Usually ResNet-101 with the strided convolutions converted to dilated convolutions in stages 3 and 4.
-  * The 3x3 convolutions in stages 3 and 4 have dilation sizes of 2 and 4, respectively, to compensate for the decreased receptive field.
- * The average pooling and classification layer are ignored.
-- **Spatial Pyramid Pooling**: extracts multi-resolution features from the stage 4 backbone feature map.
- * The backbone feature map is processed with four parallel convolution layers with dilations {1, 12, 24, 36} and kernel sizes {1x1, 3x3, 3x3, 3x3}.
- * In parallel to the convolutions, global average pool the backbone feature map, then bilinearly upsample to be the same spatial dimension as the feature map.
- * Concatenate the outputs from the convolutions and global average pool, then process with a 1x1 convolution.
- * The 3x3 convolutions are implemented as depth-wise convolutions to reduce memory and computation cost.
-- **Decoder**: converts the output of spatial pyramid pooling (SPP) to class predictions of the same spatial dimension as the input image.
- * SPP output is bilinearly upsampled to be the same spatial dimension as the output from the first stage in the backbone network.
- * A 1x1 convolution is applied to the first stage activations, then this is concatenated with the upsampled SPP output.
- * The concatenation is processed by a 3x3 convolution with dropout followed by a classification layer.
- * The predictions are bilinearly upsampled to be the same resolution as the input image.
-
-## Training Hyperparameters
-
-We tested two sets of hyperparameters for DeepLabv3+ trained on the ADE20k dataset.
-
-### Typical ADE20k Model Hyperparameters
-
-- Model: deeplabv3:
- - Initializers: kaiming_normal, bn_ones
- - Number of classes: 150
- - Backbone weights: IMAGENET1K_V1
- - Sync BatchNorm
-- Optimizer: SGD
- - Learning rate: 0.01
- - Momentum: 0.9
- - Weight decay: 5.0e-4
- - Dampening: 0
- - Nesterov: false
-- LR schedulers:
- - Polynomial:
- - Alpha_f: 0.01
- - Power: 0.9
-- Number of epochs: 127
-- Batch size: 16
-- Precision: amp
-
-| Model | mIoU | Time-to-Train on 8xA100 |
-| --- | --- | --- |
-| ResNet101-DeepLabv3+ | 44.17 +/- 0.17 | 6.385 hr |
-
-### Composer ADE20k Model Hyperparameters
-
-- Model: deeplabv3:
- - Initializers: kaiming_normal, bn_ones
- - Number of classes: 150
- - Backbone Architecture: resnet101
- - Sync BatchNorm
- - Backbone weights: IMAGENET1K_V2
-- Optimizer: Decoupled SGDW
- - Learning rate: 0.01
- - Momentum: 0.9
- - Weight decay: 2.0e-5
- - Dampening: 0
- - Nesterov: false
-- LR schedulers:
- - Cosine decay, t_max: 1dur
-- Number of epochs: 128
-- Batch size: 32
-- Precision: amp
-
-| Model | mIoU | Time-to-Train on 8xA100 |
-| --- | --- | --- |
-| ResNet101-DeepLabv3+ | 45.764 +/- 0.29 | 4.67 hr |
-
-Improvements:
-
-- New PyTorch pretrained weights
-- Cosine decay
-- Decoupled Weight Decay
-- Increase batch size to 32
-- Decrease weight decay to 2e-5
-
-## Attribution
-
-[Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) by Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam
-
-[OpenMMLab Semantic Segmentation Toolbox and Benchmark](https://github.com/open-mmlab/mmsegmentation)
-
-[How to Train State-Of-The-Art Models Using TorchVision's Latest Primitives](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/) by Vasilis Vryniotis
-
-## API Reference
-
-```{eval-rst}
-.. autoclass:: composer.models.deeplabv3.composer_deeplabv3
- :noindex:
-```
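The deleted README describes the spatial pyramid pooling head in prose; a simplified PyTorch sketch of that scheme (an illustration only, not the mmseg `DepthwiseSeparableASPPHead` the model actually used):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MiniASPP(nn.Module):
    """Four parallel dilated branches plus global pooling, fused by a 1x1 conv."""

    def __init__(self, in_channels: int = 2048, channels: int = 512):
        super().__init__()
        # 1x1 branch (dilation 1) and three 3x3 branches with dilations 12, 24, 36.
        self.branches = nn.ModuleList(
            [nn.Conv2d(in_channels, channels, kernel_size=1)] +
            [nn.Conv2d(in_channels, channels, kernel_size=3, padding=d, dilation=d) for d in (12, 24, 36)])
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.pool_conv = nn.Conv2d(in_channels, channels, kernel_size=1)
        self.fuse = nn.Conv2d(channels * 5, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        outs = [branch(x) for branch in self.branches]
        # Global average pool, then bilinearly upsample back to the feature-map size.
        pooled = self.pool_conv(self.global_pool(x))
        pooled = F.interpolate(pooled, size=x.shape[-2:], mode='bilinear', align_corners=False)
        return self.fuse(torch.cat(outs + [pooled], dim=1))
```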
diff --git a/composer/models/deeplabv3/__init__.py b/composer/models/deeplabv3/__init__.py
deleted file mode 100644
index e3473a3015..0000000000
--- a/composer/models/deeplabv3/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""DeepLabV3 for image segmentation."""
-from composer.models.deeplabv3.model import composer_deeplabv3 as composer_deeplabv3
-
-__all__ = ['composer_deeplabv3']
diff --git a/composer/models/deeplabv3/model.py b/composer/models/deeplabv3/model.py
deleted file mode 100644
index 7e58847708..0000000000
--- a/composer/models/deeplabv3/model.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""DeepLabV3 model extending :class:`.ComposerClassifier`."""
-
-import functools
-import textwrap
-import warnings
-from typing import Dict, Optional, Sequence
-
-import torch
-import torch.distributed as torch_dist
-import torch.nn.functional as F
-import torchvision
-from packaging import version
-from torchmetrics import MetricCollection
-from torchvision.models import _utils, resnet
-
-from composer.loss import DiceLoss, soft_cross_entropy
-from composer.metrics import CrossEntropy, MIoU
-from composer.models.initializers import Initializer
-from composer.models.tasks import ComposerClassifier
-from composer.utils import dist
-
-__all__ = ['deeplabv3', 'composer_deeplabv3']
-
-
-class SimpleSegmentationModel(torch.nn.Module):
-
- def __init__(self, backbone, classifier):
- warnings.warn(DeprecationWarning('SimpleSegmentationModel is deprecated and will be removed in v0.18'))
-
- super().__init__()
- self.backbone = backbone
- self.classifier = classifier
-
- def forward(self, x):
- input_shape = x.shape[-2:]
- features = self.backbone(x)
- logits = self.classifier(tuple(features.values()))
- logits = F.interpolate(logits,
- size=input_shape,
- mode='bilinear',
- align_corners=False,
- recompute_scale_factor=False)
- return logits
-
-
-def deeplabv3(num_classes: int,
- backbone_arch: str = 'resnet101',
- backbone_weights: Optional[str] = None,
- sync_bn: bool = True,
- use_plus: bool = True,
- initializers: Sequence[Initializer] = ()):
- """Helper function to build a mmsegmentation DeepLabV3 model.
-
- Args:
- num_classes (int): Number of classes in the segmentation task.
- backbone_arch (str, optional): The architecture to use for the backbone. Must be either
- [``'resnet50'``, ``'resnet101'``]. Default: ``'resnet101'``.
- backbone_weights (str, optional): If specified, the PyTorch pre-trained weights to load for the backbone.
- Currently, only ['IMAGENET1K_V1', 'IMAGENET1K_V2'] are supported. Default: ``None``.
- sync_bn (bool, optional): If ``True``, replace all BatchNorm layers with SyncBatchNorm layers.
- Default: ``True``.
- use_plus (bool, optional): If ``True``, use DeepLabv3+ head instead of DeepLabv3. Default: ``True``.
- initializers (Sequence[Initializer], optional): Initializers for the model. ``()`` for no initialization.
- Default: ``()``.
-
- Returns:
- deeplabv3: A DeepLabV3 :class:`torch.nn.Module`.
-
- Example:
-
- .. code-block:: python
-
- from composer.models.deeplabv3.deeplabv3 import deeplabv3
-
- pytorch_model = deeplabv3(num_classes=150, backbone_arch='resnet101', backbone_weights=None)
- """
- warnings.warn(DeprecationWarning('deeplabv3 is deprecated and will be removed in v0.18'))
-
- # check that the specified architecture is in the resnet module
- if not hasattr(resnet, backbone_arch):
- raise ValueError(f'backbone_arch must be part of the torchvision resnet module, got value: {backbone_arch}')
-
- # change the model weight url if specified
- if version.parse(torchvision.__version__) < version.parse('0.13.0'):
- pretrained = False
- if backbone_weights:
- pretrained = True
- if backbone_weights == 'IMAGENET1K_V1':
- resnet.model_urls[backbone_arch] = 'https://download.pytorch.org/models/resnet101-63fe2227.pth'
- elif backbone_weights == 'IMAGENET1K_V2':
- resnet.model_urls[backbone_arch] = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth'
- else:
- ValueError(
- textwrap.dedent(f"""\
- `backbone_weights` must be either "IMAGENET1K_V1" or "IMAGENET1K_V2"
- if torchvision.__version__ < 0.13.0. `backbone_weights` was {backbone_weights}."""))
- backbone = getattr(resnet, backbone_arch)(pretrained=pretrained,
- replace_stride_with_dilation=[False, True, True])
- else:
- backbone = getattr(resnet, backbone_arch)(weights=backbone_weights,
- replace_stride_with_dilation=[False, True, True])
-
- # specify which layers to extract activations from
- return_layers = {'layer1': 'layer1', 'layer4': 'layer4'} if use_plus else {'layer4': 'layer4'}
- backbone = _utils.IntermediateLayerGetter(backbone, return_layers=return_layers)
-
- try:
- from mmseg.models import ASPPHead, DepthwiseSeparableASPPHead
- except ImportError as e:
- raise ImportError(
- textwrap.dedent("""\
- Either mmcv or mmsegmentation is not installed. To install mmcv, please run pip install mmcv-full==1.4.4 -f
- https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html where {cu_version} and
- {torch_version} refer to your CUDA and PyTorch versions, respectively. To install mmsegmentation, please
- run pip install mmsegmentation==0.22.0 on command-line.""")) from e
-
- world_size = dist.get_world_size()
- if sync_bn and world_size == 1:
- warnings.warn('sync_bn was true, but only one process is present for training. sync_bn will be ignored.')
-
- norm_type = 'SyncBN' if sync_bn and world_size > 1 else 'BN'
- norm_cfg = {'type': norm_type, 'requires_grad': True}
- if use_plus:
- # mmseg config:
- # https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/deeplabv3plus_r50-d8.py
- head = DepthwiseSeparableASPPHead(in_channels=2048,
- in_index=-1,
- channels=512,
- dilations=(1, 12, 24, 36),
- c1_in_channels=256,
- c1_channels=48,
- dropout_ratio=0.1,
- num_classes=num_classes,
- norm_cfg=norm_cfg,
- align_corners=False)
- else:
- # mmseg config:
- # https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/deeplabv3_r50-d8.py
- head = ASPPHead(in_channels=2048,
- in_index=-1,
- channels=512,
- dilations=(1, 12, 24, 36),
- dropout_ratio=0.1,
- num_classes=num_classes,
- norm_cfg=norm_cfg,
- align_corners=False)
-
- model = SimpleSegmentationModel(backbone, head)
-
- if initializers:
- for initializer in initializers:
- initializer_fn = Initializer(initializer).get_initializer()
-
- # Only apply initialization to classifier head if pre-trained weights are used
- if backbone_weights is None:
- model.apply(initializer_fn)
- else:
- model.classifier.apply(initializer_fn)
-
- if sync_bn and world_size > 1:
- local_world_size = dist.get_local_world_size()
-
- # List of ranks for each node, assumes that each node has the same number of ranks
- num_nodes = world_size // local_world_size
- process_group = None
- if num_nodes > 1:
- ranks_per_node = [
- list(range(node * local_world_size, (node + 1) * local_world_size)) for node in range(num_nodes)
- ]
- process_groups = [torch_dist.new_group(ranks) for ranks in ranks_per_node]
- process_group = process_groups[dist.get_node_rank()]
-
- model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=process_group)
-
- return model
-
-
-def composer_deeplabv3(num_classes: int,
- backbone_arch: str = 'resnet101',
- backbone_weights: Optional[str] = None,
- sync_bn: bool = True,
- use_plus: bool = True,
- ignore_index: int = -1,
- cross_entropy_weight: float = 1.0,
- dice_weight: float = 0.0,
- initializers: Sequence[Initializer] = ()):
- """Helper function to create a :class:`.ComposerClassifier` with a DeepLabv3(+) model. Logs
- Mean Intersection over Union (MIoU) and Cross Entropy during training and validation.
-
-    From `Rethinking Atrous Convolution for Semantic Image Segmentation <https://arxiv.org/abs/1706.05587>`_
- (Chen et al, 2017).
-
- Args:
- num_classes (int): Number of classes in the segmentation task.
- backbone_arch (str, optional): The architecture to use for the backbone. Must be either
- [``'resnet50'``, ``'resnet101'``]. Default: ``'resnet101'``.
- backbone_weights (str, optional): If specified, the PyTorch pre-trained weights to load for the backbone.
- Currently, only ['IMAGENET1K_V1', 'IMAGENET1K_V2'] are supported. Default: ``None``.
- sync_bn (bool, optional): If ``True``, replace all BatchNorm layers with SyncBatchNorm layers.
- Default: ``True``.
- use_plus (bool, optional): If ``True``, use DeepLabv3+ head instead of DeepLabv3. Default: ``True``.
- ignore_index (int): Class label to ignore when calculating the loss and other metrics. Default: ``-1``.
- cross_entropy_weight (float): Weight to scale the cross entropy loss. Default: ``1.0``.
- dice_weight (float): Weight to scale the dice loss. Default: ``0.0``.
- initializers (List[Initializer], optional): Initializers for the model. ``[]`` for no initialization.
- Default: ``[]``.
-
-
- Returns:
- ComposerModel: instance of :class:`.ComposerClassifier` with a DeepLabv3(+) model.
-
- Example:
-
- .. code-block:: python
-
- from composer.models import composer_deeplabv3
-
- model = composer_deeplabv3(num_classes=150, backbone_arch='resnet101', backbone_weights=None)
- """
- warnings.warn(DeprecationWarning('composer_deeplabv3 is deprecated and will be removed in v0.18'))
-
- model = deeplabv3(backbone_arch=backbone_arch,
- backbone_weights=backbone_weights,
- use_plus=use_plus,
- num_classes=num_classes,
- sync_bn=sync_bn,
- initializers=initializers)
-
- train_metrics = MetricCollection(
- [CrossEntropy(ignore_index=ignore_index),
- MIoU(num_classes, ignore_index=ignore_index)])
- val_metrics = MetricCollection(
- [CrossEntropy(ignore_index=ignore_index),
- MIoU(num_classes, ignore_index=ignore_index)])
-
- ce_loss_fn = functools.partial(soft_cross_entropy, ignore_index=ignore_index)
- dice_loss_fn = DiceLoss(softmax=True, batch=True, ignore_absent_classes=True)
-
- def _combo_loss(output, target) -> Dict[str, torch.Tensor]:
- loss = {'total': torch.zeros(1, device=output.device, dtype=output.dtype)}
- if cross_entropy_weight:
- loss['cross_entropy'] = ce_loss_fn(output, target)
- loss['total'] += loss['cross_entropy'] * cross_entropy_weight
- if dice_weight:
- loss['dice'] = dice_loss_fn(output, target)
- loss['total'] += loss['dice'] * dice_weight
- return loss
-
- composer_model = ComposerClassifier(module=model,
- train_metrics=train_metrics,
- val_metrics=val_metrics,
- loss_fn=_combo_loss)
- return composer_model
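The multi-node SyncBatchNorm branch in the deleted `model.py` builds one process group per node so batch-norm statistics only synchronize within a node. A small worked example of the rank bookkeeping (plain Python; no `torch.distributed` needed to see the shape of it):

```python
# For 2 nodes with 4 ranks each (world_size=8, local_world_size=4), the list
# comprehension from the deleted code yields one contiguous rank list per node.
world_size, local_world_size = 8, 4
num_nodes = world_size // local_world_size
ranks_per_node = [
    list(range(node * local_world_size, (node + 1) * local_world_size)) for node in range(num_nodes)
]
assert ranks_per_node == [[0, 1, 2, 3], [4, 5, 6, 7]]
# Each rank list then becomes its own torch.distributed process group, and
# convert_sync_batchnorm receives the group for the current node.
```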
diff --git a/composer/models/efficientnetb0/README.md b/composer/models/efficientnetb0/README.md
deleted file mode 100644
index 9cb1096bc6..0000000000
--- a/composer/models/efficientnetb0/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# EfficientNet
-[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference)
-
-`Vision` / `Image Classification`
-
-The EfficientNet model family is a set of convolutional neural networks that can be used as the basis for a variety of vision tasks, but were initially designed for image classification. The model family was designed to reach the highest accuracy for a given computation budget during inference by simultaneously scaling model depth, model width, and image resolution according to an empirically determined scaling law.
-
-## Example
-
-```python
-from composer.models import composer_efficientnetb0
-
-model = composer_efficientnetb0(num_classes=1000, drop_connect_rate=0.2)
-```
-
-## Architecture
-
-The table below from Tan and Le specifies the EfficientNet baseline architecture broken up into separate stages. MBConv indicates a mobile inverted bottleneck with a specific expansion size and kernel size. Resolution is the expected input resolution of the current stage. Number of channels is the number of output channels of the current stage. Number of layers indicates the number of repeated blocks in each stage. Subsequent EfficientNet family members scale the resolution, number of channels, and number of layers according to the resolution, width, and depth scaling parameters defined by Tan and Le.
-
-![efficientnet_arch.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/efficientnet_arch.png)
-
-## Family members
-
-Tan and Le included 8 members in their model family. The goal was for each family member to have approximately double the FLOPs of the previous family member. Currently, we only support EfficientNet-B0.
-
-| Model Family Member | Parameter Count | TPU Repo Accuracy* | Our Accuracy** | Training Time on 8x3080 |
-|---------------------|-----------------|--------------------|----------------|-------------------------|
-| EfficientNet-B0 | 5.3M | 77.1% | 77.22% | 23.3 hr |
-| EfficientNet-B1 | 7.8M | 79.1% | TBA | TBA |
-| EfficientNet-B2 | 9.2M | 80.1% | TBA | TBA |
-| EfficientNet-B3 | 12M | 81.6% | TBA | TBA |
-| EfficientNet-B4 | 19M | 82.9% | TBA | TBA |
-| EfficientNet-B5 | 30M | 83.6% | TBA | TBA |
-| EfficientNet-B6 | 43M | 84.0% | TBA | TBA |
-| EfficientNet-B7 | 66M | 84.3% | TBA | TBA |
-
-*Includes label smoothing, sample-wise stochastic depth, and AutoAugment
-
-**Includes label smoothing and sample-wise stochastic depth
-
-## Default Training Hyperparameters
-
-We use the following default hyperparameters from the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet):
-
-```yaml
-optimizer:
- rmsprop:
- lr: 0.08
- momentum: 0.9
- alpha: 0.9
- eps: 0.01
- weight_decay: 1.0e-5
-schedulers:
- - cosine_decay_with_warmup:
- t_warmup: "16ep"
-train_batch_size: 4096
-max_duration: 400ep
-```
-
-Our implementation differs from the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet) in that we:
-
-- Apply weight decay to batch normalization trainable parameters
-- Use `momentum = 0.1` and `eps = 1e-5` as batch normalization parameters
-
-## Attribution
-
-Paper: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan and Quoc V. Le
-
-Code: [gen-efficientnet-pytorch Github repository](https://github.com/rwightman/gen-efficientnet-pytorch) by Ross Wightman
-
-Hyperparameters: [DeepLearningExamples Github repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/efficientnet) by Nvidia
-
-## API Reference
-
-```{eval-rst}
-.. autoclass:: composer.models.efficientnetb0.composer_efficientnetb0
- :noindex:
-```
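For reference, the optimizer block in the YAML above corresponds roughly to the following `torch.optim.RMSprop` construction (a sketch only; the real hyperparameter plumbing goes through Composer's config system, and the model here is a stand-in):

```python
import torch

model = torch.nn.Linear(8, 2)  # stand-in model for illustration
optimizer = torch.optim.RMSprop(
    model.parameters(),
    lr=0.08,
    momentum=0.9,
    alpha=0.9,  # smoothing constant, `alpha` in the YAML
    eps=0.01,
    weight_decay=1.0e-5,
)
```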
diff --git a/composer/models/efficientnetb0/__init__.py b/composer/models/efficientnetb0/__init__.py
deleted file mode 100644
index d1101f595c..0000000000
--- a/composer/models/efficientnetb0/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""The EfficientNet model family is a set of convolutional neural networks that can be used as the basis for a variety
-of vision tasks, but were initially designed for image classification. The model family was designed to reach the
-highest accuracy for a given computation budget during inference by simultaneously scaling model depth, model width, and
-image resolution according to an empirically determined scaling law.
-
-See the :doc:`Model Card ` for more details.
-"""
-from composer.models.efficientnetb0.model import composer_efficientnetb0 as composer_efficientnetb0
-
-__all__ = ['composer_efficientnetb0']
-
-_task = 'Image Classification'
-_dataset = 'ImageNet'
-_name = 'EfficientNet-B0'
-_quality = '76.63'
-_metric = 'Top-1 Accuracy'
-_ttt = '21h 48m'
-_hparams = 'efficientnetb0.yaml'
diff --git a/composer/models/efficientnetb0/_layers.py b/composer/models/efficientnetb0/_layers.py
deleted file mode 100644
index ab12aec9c3..0000000000
--- a/composer/models/efficientnetb0/_layers.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Callable, Optional
-
-import torch
-from torch import nn as nn
-
-
-def round_channels(
- channels: float,
- width_multiplier: float,
- divisor: int = 8,
- min_value: Optional[int] = None,
-) -> int:
- """Round number of channels after scaling with width multiplier.
-
-    This function ensures that channel counts halfway between multiples of the divisor are rounded up.
-
- Args:
- channels (float): Number to round.
- width_multiplier (float): Amount to scale `channels`.
- divisor (int): Number to make the output divisible by.
- min_value (int, optional): Minimum value the output can be. If not specified, defaults
- to the ``divisor``.
- """
- if not width_multiplier:
- return int(channels)
- channels *= width_multiplier
-
- min_value = min_value or divisor
- new_channels = max(min_value, int(channels + divisor / 2) // divisor * divisor)
- if new_channels < 0.9 * channels: # increase channels if rounding decreases by >10%
- new_channels += divisor
- return new_channels
-
-
-def calculate_same_padding(kernel_size, dilation, stride):
- """Calculates the amount of padding to use to get the "SAME" functionality in Tensorflow."""
- return ((stride - 1) + dilation * (kernel_size - 1)) // 2
-
-
-def drop_connect(inputs: torch.Tensor, drop_connect_rate: float, training: bool):
- """Randomly mask a set of samples. Provides similar regularization as stochastic depth.
-
- Args:
-        inputs (torch.Tensor): Input tensor to mask.
-        drop_connect_rate (float): Probability of dropping each sample.
-        training (bool): Whether or not the model is training.
- """
- if not training:
- return inputs
-
- keep_prob = 1 - drop_connect_rate
- rand_tensor = keep_prob + torch.rand(
- [inputs.size()[0], 1, 1, 1],
- dtype=inputs.dtype,
- device=inputs.device,
- )
- rand_tensor.floor_() # binarize
- output = inputs.div(keep_prob) * rand_tensor
- return output
-
-
-class SqueezeExcite(nn.Module):
- """Squeeze Excite Layer.
-
- Args:
- in_channels (int): Number of channels in the input tensor.
- latent_channels (int): Number of hidden channels.
- act_layer (torch.nn.Module): Activation layer to use in block.
- """
-
- def __init__(
- self,
- in_channels: int,
- latent_channels: int,
- act_layer: Callable[..., nn.Module] = nn.ReLU,
- ):
- super().__init__()
-
- self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
- self.conv_reduce = nn.Conv2d(in_channels, latent_channels, kernel_size=1, bias=True)
- self.act1 = act_layer(inplace=True)
- self.conv_expand = nn.Conv2d(latent_channels, in_channels, kernel_size=1, bias=True)
- self.gate_fn = torch.nn.Sigmoid()
-
- def forward(self, x: torch.Tensor):
- out = self.global_avg_pool(x)
- out = self.conv_reduce(out)
- out = self.act1(out)
- out = self.conv_expand(out)
- out = x * self.gate_fn(out)
- return out
-
-
-class DepthwiseSeparableConv(nn.Module):
- """Depthwise Separable Convolution layer.
-
- Args:
- in_channels (int): Number of channels in the input tensor.
- out_channels (int): Number of channels in the output tensor.
- kernel_size (int): Size of the convolving kernel.
- stride (int): Stride of the convolution.
- se_ratio (float): How much to scale `in_channels` for the hidden layer
- dimensionality of the squeeze-excite module.
- drop_connect_rate (float): Probability of dropping a sample before the
- identity connection, provides regularization similar to stochastic
- depth.
- act_layer (torch.nn.Module): Activation layer to use in block.
- norm_kwargs (dict): Normalization layer's keyword arguments.
- norm_layer (torch.nn.Module): Normalization layer to use in block.
- """
-
- def __init__(self,
- in_channels: int,
- out_channels: int,
- kernel_size: int,
- stride: int,
- se_ratio: float,
- drop_connect_rate: float,
- act_layer: Callable[..., nn.Module],
- norm_kwargs: dict,
- norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d):
- super().__init__()
- self.drop_connect_rate = drop_connect_rate
- self.has_residual = (in_channels == out_channels and stride == 1)
- self.has_se = se_ratio is not None and se_ratio > 0.0
-
- padding = calculate_same_padding(kernel_size, dilation=1, stride=stride)
- self.conv_depthwise = nn.Conv2d(in_channels=in_channels,
- out_channels=in_channels,
- groups=in_channels,
- kernel_size=kernel_size,
- stride=stride,
- padding=padding,
- bias=False)
- self.bn1 = norm_layer(in_channels, **norm_kwargs)
- self.act1 = act_layer(inplace=True)
-
- if self.has_se:
- latent_channels = max(1, int(in_channels * se_ratio))
- self.se = SqueezeExcite(in_channels, latent_channels, act_layer)
-
- self.conv_pointwise = nn.Conv2d(
- in_channels=in_channels,
- out_channels=out_channels,
- kernel_size=1,
- bias=False,
- )
- self.bn2 = norm_layer(out_channels, **norm_kwargs)
- self.act2 = act_layer(inplace=True)
-
- def forward(self, input: torch.Tensor):
- residual = input
-
- out = self.conv_depthwise(input)
- out = self.bn1(out)
- out = self.act1(out)
-
- if self.has_se:
- out = self.se(out)
-
- out = self.conv_pointwise(out)
- out = self.bn2(out)
- out = self.act2(out)
-
- if self.has_residual:
- if self.drop_connect_rate > 0.0:
- out = drop_connect(out, self.drop_connect_rate, self.training)
- out += residual
- return out
-
-
-class MBConvBlock(nn.Module):
- """Mobile Inverted Residual Bottleneck Block.
-
-    This block is implemented as defined in
-    `MobileNetV2: Inverted Residuals and Linear Bottlenecks <https://arxiv.org/abs/1801.04381>`_ (Sandler et al, 2018).
-
- Args:
- in_channels (int): Number of channels in the input tensor.
- out_channels (int): Number of channels in the output tensor.
- kernel_size (int): Size of the convolving kernel.
- stride (int): Stride of the convolution.
- expand_ratio (int): How much to expand the input channels for the
- depthwise convolution.
- se_ratio (float): How much to scale `in_channels` for the hidden layer
- dimensionality of the squeeze-excite module.
- drop_connect_rate (float): Probability of dropping a sample before the
- identity connection, provides regularization similar to stochastic
- depth.
- act_layer (torch.nn.Module): Activation layer to use in block.
- norm_kwargs (dict): Normalization layer's keyword arguments.
- norm_layer (torch.nn.Module): Normalization layer to use in block.
- """
-
- def __init__(self,
- in_channels: int,
- out_channels: int,
- kernel_size: int,
- stride: int,
- expand_ratio: int,
- se_ratio: float,
- drop_connect_rate: float,
- act_layer: Callable[..., nn.Module],
- norm_kwargs: dict,
- norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d):
- super().__init__()
- self.drop_connect_rate = drop_connect_rate
- self.has_residual = (in_channels == out_channels and stride == 1)
- self.has_se = se_ratio is not None and se_ratio > 0.0
-
- mid_channels = round_channels(in_channels, expand_ratio)
-
- # Point-wise convolution expansion
- self.conv1x1_expand = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False)
- self.bn1 = norm_layer(mid_channels, **norm_kwargs)
- self.act1 = act_layer(inplace=True)
-
- # Depth-wise Convolution
- padding = calculate_same_padding(kernel_size, dilation=1, stride=stride)
- self.conv_depthwise = nn.Conv2d(in_channels=mid_channels,
- out_channels=mid_channels,
- groups=mid_channels,
- kernel_size=kernel_size,
- stride=stride,
- padding=padding,
- bias=False)
- self.bn2 = norm_layer(mid_channels, **norm_kwargs)
- self.act2 = act_layer(inplace=True)
-
- # Squeeze and Excitation layer, if specified
- if self.has_se:
- latent_channels = max(1, int(in_channels * se_ratio))
- self.se = SqueezeExcite(mid_channels, latent_channels, act_layer)
-
- # Point-wise convolution contraction
- self.conv1x1_contract = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False)
- self.bn3 = norm_layer(out_channels, **norm_kwargs)
-
- def forward(self, input: torch.Tensor):
- residual = input
-
- out = self.conv1x1_expand(input)
- out = self.bn1(out)
- out = self.act1(out)
-
- out = self.conv_depthwise(out)
- out = self.bn2(out)
- out = self.act2(out)
-
- if self.has_se:
- out = self.se(out)
-
- out = self.conv1x1_contract(out)
- out = self.bn3(out)
-
- if self.has_residual:
- if self.drop_connect_rate:
- out = drop_connect(out, self.drop_connect_rate, self.training)
- out += residual
- return out
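A couple of worked examples of the helper arithmetic in the deleted `_layers.py` (the function is restated locally so the snippet runs standalone):

```python
def calculate_same_padding(kernel_size, dilation, stride):
    # Same formula as the deleted helper above.
    return ((stride - 1) + dilation * (kernel_size - 1)) // 2


# A 3x3 convolution at stride 1 needs padding 1 to preserve spatial size,
assert calculate_same_padding(kernel_size=3, dilation=1, stride=1) == 1
# and the stride-2 stem convolution also works out to padding 1.
assert calculate_same_padding(kernel_size=3, dilation=1, stride=2) == 1

# round_channels(32, 1.2) traces as: 32 * 1.2 = 38.4, which rounds to the
# nearest multiple of 8 (40); 40 >= 0.9 * 38.4, so 40 is returned as-is.
```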
diff --git a/composer/models/efficientnetb0/efficientnets.py b/composer/models/efficientnetb0/efficientnets.py
deleted file mode 100644
index 7c544a5143..0000000000
--- a/composer/models/efficientnetb0/efficientnets.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""EfficientNet model.
-
-Adapted from `(Generic) EfficientNets for PyTorch <https://github.com/rwightman/gen-efficientnet-pytorch>`_.
-"""
-
-import math
-import re
-import warnings
-from typing import Callable, Optional
-
-import torch
-import torch.nn as nn
-
-from composer.models.efficientnetb0._layers import (DepthwiseSeparableConv, MBConvBlock, calculate_same_padding,
- round_channels)
-
-__all__ = ['EfficientNet']
-
-
-class EfficientNet(nn.Module):
- """EfficientNet model based on (`Tan et al, 2019 `_).
-
- Args:
- num_classes (int): Size of the EfficientNet output, typically viewed
- as the number of classes in a classification task.
- width_multiplier (float, optional): How much to scale the EfficientNet-B0 channel
- dimension throughout the model. Default: ``1.0``.
-        depth_multiplier (float, optional): How much to scale the EfficientNet-B0 depth. Default: ``1.0``.
- drop_rate (float, optional): Dropout probability for the penultimate activations. Default: ``0.2``.
- drop_connect_rate (float, optional): Probability of dropping a sample before the
- identity connection, provides regularization similar to stochastic
- depth. Default: ``0.2``.
- act_layer (torch.nn.Module, optional): Activation layer to use in the model. Default: ``nn.SiLU``.
- norm_kwargs (dict, optional): Normalization layer's keyword arguments. Default: ``{"momentum": 0.1, "eps": 1e-5}``.
- norm_layer (torch.nn.Module, optional): Normalization layer to use in the model. Default: ``nn.BatchNorm2d``.
- """
-
- # EfficientNet-B0 architecture specification.
- # block_strings are decoded into block level hyperparameters.
- # r=repeat, k=kernel_size, s=stride, e=expand_ratio, i=in_channels, o=out_channels, se=se_ratio.
- _blocks_strings = [
- 'r1_k3_s1_e1_i32_o16_se0.25',
- 'r2_k3_s2_e6_i16_o24_se0.25',
- 'r2_k5_s2_e6_i24_o40_se0.25',
- 'r3_k3_s2_e6_i40_o80_se0.25',
- 'r3_k5_s1_e6_i80_o112_se0.25',
- 'r4_k5_s2_e6_i112_o192_se0.25',
- 'r1_k3_s1_e6_i192_o320_se0.25',
- ]
-
- def __init__(self,
- num_classes: int,
- width_multiplier: float = 1.0,
- depth_multiplier: float = 1.0,
- drop_rate: float = 0.2,
- drop_connect_rate: float = 0.2,
- act_layer: Callable[..., nn.Module] = nn.SiLU,
- norm_kwargs: Optional[dict] = None,
- norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d):
- warnings.warn(DeprecationWarning('EfficientNet is deprecated and will be removed in v0.18'))
-
- super(EfficientNet, self).__init__()
- self.num_classes = num_classes
-
- if norm_kwargs is None:
- norm_kwargs = {'momentum': 0.1, 'eps': 1e-5}
-
- in_channels = 3
- out_channels = round_channels(32, width_multiplier)
- padding = calculate_same_padding(kernel_size=3, dilation=1, stride=2)
- self.conv_stem = nn.Conv2d(
- in_channels,
- out_channels,
- kernel_size=3,
- stride=2,
- padding=padding,
- bias=False,
- )
- self.bn1 = norm_layer(num_features=out_channels, **norm_kwargs)
- self.act1 = act_layer(inplace=True)
-
- # Count the number of blocks in the model
- block_count = 0.
- for block_string in self._blocks_strings:
- _, num_repeat = self._decode_block_string(block_string)
- block_count += num_repeat
-
- # Decode block strings and add blocks
- block_idx = 0.
- blocks = []
- block_args = {}
- for block_string in self._blocks_strings:
- block_args, num_repeat = self._decode_block_string(block_string)
- # Scale channels and number of repeated blocks based on multipliers
- block_args['in_channels'] = round_channels(
- block_args['in_channels'],
- width_multiplier,
- )
- block_args['out_channels'] = round_channels(
- block_args['out_channels'],
- width_multiplier,
- )
- num_repeat = int(math.ceil(depth_multiplier * num_repeat))
-
- # Add activation, normalization layers, and drop connect
- block_args['act_layer'] = act_layer
- block_args['norm_kwargs'] = norm_kwargs
- block_args['norm_layer'] = norm_layer
-
- # Delete expand_ratio when set to 1 to use depthwise separable convolution layer
- if block_args['expand_ratio'] == 1:
- del block_args['expand_ratio']
-
- for i in range(num_repeat):
- # Linearly decay drop_connect_rate across model depth
- block_args['drop_connect_rate'] = drop_connect_rate * block_idx / block_count
-
- if 'expand_ratio' not in block_args:
- blocks.append(DepthwiseSeparableConv(**block_args))
- else:
- blocks.append(MBConvBlock(**block_args))
- block_idx += 1
-
- # Only the first block in a stage can have stride != 1
- if i == 0:
- block_args['stride'] = 1
- block_args['in_channels'] = block_args['out_channels']
-
- self.blocks = nn.Sequential(*blocks)
-
- in_channels = block_args['out_channels']
- out_channels = round_channels(1280, width_multiplier)
- self.conv_head = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
- self.bn2 = norm_layer(out_channels, **norm_kwargs)
- self.act2 = act_layer(inplace=True)
-
- self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
- self.dropout = nn.Dropout(drop_rate)
- self.classifier = nn.Linear(out_channels, num_classes)
-
- # Initialization from gen-efficientnet-pytorch repo
- for m in self.modules():
- if isinstance(m, torch.nn.Conv2d):
- fan_out = (m.kernel_size[0] * m.kernel_size[1] * m.out_channels) // m.groups
- m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
- if m.bias is not None:
- m.bias.data.zero_()
- elif isinstance(m, torch.nn.BatchNorm2d):
- m.weight.data.fill_(1.0)
- m.bias.data.zero_()
- elif isinstance(m, torch.nn.Linear):
- fan_out = m.weight.size(0)
- init_range = 1.0 / math.sqrt(fan_out)
- m.weight.data.uniform_(-init_range, init_range)
- m.bias.data.zero_()
-
- def extract_features(self, input: torch.Tensor):
- out = self.conv_stem(input)
- out = self.bn1(out)
- out = self.act1(out)
- out = self.blocks(out)
- out = self.conv_head(out)
- out = self.bn2(out)
- out = self.act2(out)
- out = self.global_avg_pool(out)
- return out.flatten(1)
-
- def forward(self, input: torch.Tensor):
- out = self.extract_features(input)
- out = self.dropout(out)
- return self.classifier(out)
-
- @staticmethod
- def get_model_from_name(model_name: str, num_classes, drop_connect_rate: float):
- """Instantiate an EfficientNet model family member based on the model_name string.
-
- Args:
- model_name: (str): One of ``'efficientnet-b0'`` through ``'efficientnet-b7'``.
- num_classes (int): Size of the EfficientNet output, typically viewed as the number of classes in a classification task.
- drop_connect_rate (float): Probability of dropping a sample before the identity connection,
- provides regularization similar to stochastic depth.
- """
-
- # Coefficients: width, depth, res, dropout
- model_arch = {
- 'efficientnet-b0': (1.0, 1.0, 224, 0.2),
- 'efficientnet-b1': (1.0, 1.1, 240, 0.2),
- 'efficientnet-b2': (1.1, 1.2, 260, 0.3),
- 'efficientnet-b3': (1.2, 1.4, 300, 0.3),
- 'efficientnet-b4': (1.4, 1.8, 380, 0.4),
- 'efficientnet-b5': (1.6, 2.2, 456, 0.4),
- 'efficientnet-b6': (1.8, 2.6, 528, 0.5),
- 'efficientnet-b7': (2.0, 3.1, 600, 0.5),
- }
-
- model_params = model_arch[model_name]
- width_multiplier = model_params[0]
- depth_multiplier = model_params[1]
- drop_rate = model_params[3]
- return EfficientNet(num_classes=num_classes,
- width_multiplier=width_multiplier,
- depth_multiplier=depth_multiplier,
- drop_rate=drop_rate,
- drop_connect_rate=drop_connect_rate)
-
- def _decode_block_string(self, block_string: str):
- """Decodes an EfficientNet block specification string into a dictionary of keyword arguments for a block in the
- architecture."""
-
- arg_strings = block_string.split('_')
- args = {}
- for arg_string in arg_strings:
- splits = re.split(r'(\d.*)', arg_string)
- if len(splits) >= 2:
- key, value = splits[:2]
- args[key] = value
- num_repeat = int(args['r'])
- block_args = {
- 'kernel_size': int(args['k']),
- 'stride': int(args['s']),
- 'expand_ratio': int(args['e']),
- 'in_channels': int(args['i']),
- 'out_channels': int(args['o']),
- 'se_ratio': float(args['se']) if 'se' in args else None,
- }
- return block_args, num_repeat
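To make the block-string grammar concrete, here is what `_decode_block_string` produces for the first EfficientNet-B0 stage string, `'r1_k3_s1_e1_i32_o16_se0.25'` (values traced by hand from the parsing code above):

```python
# The string splits on '_' into r1, k3, s1, e1, i32, o16, se0.25, and
# re.split(r'(\d.*)', 'se0.25') yields ('se', '0.25'), so keys are the alpha prefixes.
expected_block_args = {
    'kernel_size': 3,
    'stride': 1,
    'expand_ratio': 1,  # expand_ratio == 1, so this stage becomes a DepthwiseSeparableConv
    'in_channels': 32,
    'out_channels': 16,
    'se_ratio': 0.25,
}
expected_num_repeat = 1
```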
diff --git a/composer/models/efficientnetb0/model.py b/composer/models/efficientnetb0/model.py
deleted file mode 100644
index 67ae193895..0000000000
--- a/composer/models/efficientnetb0/model.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A :class:`.ComposerClassifier` wrapper around the EfficientNet-b0 architecture."""
-
-import warnings
-
-from composer.models.efficientnetb0.efficientnets import EfficientNet
-from composer.models.tasks import ComposerClassifier
-
-__all__ = ['composer_efficientnetb0']
-
-
-def composer_efficientnetb0(num_classes: int = 1000, drop_connect_rate: float = 0.2) -> ComposerClassifier:
- """Helper function to create a :class:`.ComposerClassifier` with an EfficientNet-b0 architecture.
-
-    See `Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_
- (Tan et al, 2019) for more details.
-
- Args:
- num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``.
- drop_connect_rate (float, optional): Probability of dropping a sample within a block before identity
- connection. Default: ``0.2``.
-
- Returns:
- ComposerModel: instance of :class:`.ComposerClassifier` with a EfficientNet-B0 model.
-
-
- Example:
-
- .. testcode::
-
- from composer.models import composer_efficientnetb0
-
- model = composer_efficientnetb0() # creates EfficientNet-b0 for image classification
- """
- warnings.warn(DeprecationWarning('composer_efficientnetb0 is deprecated and will be removed in v0.18'))
- model = EfficientNet.get_model_from_name(model_name='efficientnet-b0',
- num_classes=num_classes,
- drop_connect_rate=drop_connect_rate)
-
- composer_model = ComposerClassifier(module=model)
- return composer_model
diff --git a/composer/models/gpt2/README.md b/composer/models/gpt2/README.md
deleted file mode 100644
index 52ee26a97f..0000000000
--- a/composer/models/gpt2/README.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# GPT-2
-[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference)
-
-`NLP` / `Autoregressive Language Modeling`
-
-The GPT-2 model family is a set of transformer-based networks for autoregressive language modeling at various scales. This family was originally proposed by OpenAI, and is trained on the OpenWebText dataset. It is useful for downstream language generation tasks, such as summarization, translation, and dialog.
-
-Our codebase builds on the Hugging Face *[Transformers](https://huggingface.co/transformers/)* library. We initialize Hugging Face's GPT-2 model with one of our configurations.
-
-## Example
-
-
-
-```python
-import transformers
-from composer.models import GPT2Model
-
-model = GPT2Model(module=transformers.AutoModelForCausalLM.from_pretrained("gpt2"),
- config=transformers.GPT2Config.from_pretrained("gpt2"),
- tokenizer_name="gpt2")
-```
-
-## Architecture
-
-GPT-2 consists of a decoder-only Transformer parameterized by $n_{layer}$, $d_{model}$, $d_{ff}$, $d_{attn}$ and $n_{heads}$. The parameters for each model family member can be seen below:
-
-| Name | $n_{layer}$ | $d_{model}$ | $d_{ff}$ | $d_{attn}$ | $n_{heads}$ |
-|------------|-------------|-------------|----------|------------|-------------|
-| GPT-2 52M  | 8           | 512         | 2048     | 512        | 8           |
-| GPT-2 83M | 10 | 640 | 2560 | 640 | 10 |
-| GPT-2 125M | 12 | 768 | 3072 | 768 | 12 |
-
-## Family Members
-
-We implement three members of this family at different scales: GPT 52M, GPT 83M, and GPT 125M. These models are named after their parameter counts. We selected these particular configurations because (1) they represent points on the Pareto frontier of the scaling law for language models as described by [Kaplan et al. at OpenAI](https://arxiv.org/abs/2001.08361) and (2) they are small enough to rapidly iterate on methods using a single GPU node.
-
-| Model Family Member | Parameters | Training Hours on 8xA100s | Training Tokens | Final Loss | Predicted Perplexity | Actual Perplexity |
-|---------------------|------------|---------------------------|-----------------|------------|----------------------|-------------------|
-| GPT-2 52M | 53.9M | 02:44 | 4.6B | 3.43 | 32.54 | 30.88 |
-| GPT-2 83M | 85.8M | 04:52 | 5.5B | 3.28 | 27.84 | 26.57 |
-| GPT-2 125M | 114M | 08:25 | 6.7B | 3.18 | 24.64 | 24.04 |
-
-
-There are two ways to vary the time or cost of training a model: change the size of the model, or change the number of steps (and therefore the amount of data) on which it is trained. With the GPT family of models, we explore both of these axes. To develop methods for these models, we generally begin with the smallest members of the family for initial experimentation and scale up once the ideas have been refined.
-
-To explore tradeoffs between quality and the number of training steps, we ablated both the number of training steps and the number of data points trained on, checkpointing the model throughout training.
-
-To explore tradeoffs between quality and the size of the model, we use [Scaling Laws for Neural Language Models](https://arxiv.org/abs/2001.08361) to provide suggestions on model capacity and dataset size, and then sweep hyperparameters such as learning rate and batch size to minimize loss.
-
-
-## Attribution
-
-The GPT model family is described in *[Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)* by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever.
-
-The Scaling Law that we use to choose the members of this model family is described in *[Scaling Laws for Neural Language Models](https://arxiv.org/abs/2001.08361)* by Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B. Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei.
-
-## Default Training Hyperparameters
-
-Below are hyperparameters we used to train GPT-2 125M on [OpenWebText](https://huggingface.co/datasets/openwebtext).
-
-```yaml
-optimizer:
- adamw:
- lr: 6.0e-4
- betas:
- - 0.9
- - 0.999
- eps: 1.0e-08
- weight_decay: 0.0
-schedulers:
- - cosine_decay_with_warmup:
- t_warmup: 140ba
-train_batch_size: 512
-```
-
-## API Reference
-
-```{eval-rst}
-.. autoclass:: composer.models.gpt2.GPT2Model
- :noindex:
-```
diff --git a/composer/models/gpt2/__init__.py b/composer/models/gpt2/__init__.py
deleted file mode 100644
index 1ae37b122a..0000000000
--- a/composer/models/gpt2/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""The GPT-2 model family is set of transformer-based networks for autoregressive language modeling at various scales.
-This family was originally proposed by OpenAI, and is trained on the OpenWebText dataset. It is useful for downstream
-language generation tasks, such as summarization, translation, and dialog.
-
-See the :doc:`Model Card ` for more details.
-"""
-
-from composer.models.gpt2.model import create_gpt2 as create_gpt2
-
-__all__ = ['create_gpt2']
-
-_metadata = {
- 'gpt2': {
- '_task': 'Language Modeling',
- '_dataset': 'OpenWebText',
- '_name': 'GPT-2 52M',
- '_quality': '30.88',
- '_metric': 'Perplexity',
- '_ttt': '02:44',
- '_hparams': 'gpt2_52m.yaml'
- },
- 'gpt2 -- TODO RENAME TO GPT2': {
- '_task': 'Language Modeling',
- '_dataset': 'OpenWebText',
- '_name': 'GPT-2 83M',
- '_quality': '26.57',
- '_metric': 'Perplexity',
- '_ttt': '04:52',
- '_hparams': 'gpt2_83m.yaml'
- },
- 'gpt2 --! TODO RENAME TO GPT2': {
- '_task': 'Language Modeling',
- '_dataset': 'OpenWebText',
- '_name': 'GPT-2 125M',
- '_quality': '24.04',
- '_metric': 'Perplexity',
- '_ttt': '08:25',
- '_hparams': 'gpt2_125m.yaml'
- }
-}
diff --git a/composer/models/gpt2/model.py b/composer/models/gpt2/model.py
deleted file mode 100644
index ea924b7b99..0000000000
--- a/composer/models/gpt2/model.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""GPT-2 model based on `Hugging Face GPT-2 `_.
-
-Implemented as a wrapper using :class:`.ComposerTrainer`.
-"""
-
-from __future__ import annotations
-
-import warnings
-from typing import Optional
-
-from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
-from composer.models.huggingface import HuggingFaceModel
-from composer.utils.import_helpers import MissingConditionalImportError
-
-__all__ = ['create_gpt2']
-
-
-def create_gpt2(use_pretrained: Optional[bool] = False,
- pretrained_model_name: Optional[str] = None,
- model_config: Optional[dict] = None,
- tokenizer_name: Optional[str] = None,
- gradient_checkpointing: Optional[bool] = False):
- """Implements :class:`~composer.models.huggingface.HuggingFaceModel` to wrap `Hugging Face GPT-2 \
-    transformers <https://huggingface.co/transformers/>`_. Logs training and
- validation perplexity.
-
-    From `Language Models are Unsupervised Multitask Learners <https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_ (Radford et al, 2019).
-
- Args:
-
- gradient_checkpointing (bool, optional): Use gradient checkpointing. Default: ``False``.
- use_pretrained (bool, optional): Whether to initialize the model with the pretrained weights. Default: ``False``.
- model_config (dict): A dictionary providing a HuggingFace model configuration.
- tokenizer_name (str, optional): Tokenizer name used to preprocess the dataset
-            and validate the model's inputs.
-
- .. code-block::
-
- {
- "_name_or_path": "gpt2",
- "activation_function": "gelu_new",
- "architectures": ["GPT2LMHeadModel"],
- "attn_pdrop": 0.1,
- "bos_token_id": 50256,
- "embd_pdrop": 0.1,
- "eos_token_id": 50256,
- "initializer_range": 0.02,
- "layer_norm_epsilon": 1e-05,
- "model_type": "gpt2",
- "n_ctx": 1024,
- "n_embd": 768,
- "n_head": 12,
- "n_inner": null,
- "n_layer": 12,
- "n_positions": 1024,
- "reorder_and_upcast_attn": false,
- "resid_pdrop": 0.1,
- "scale_attn_by_inverse_layer_idx": false,
- "scale_attn_weights": true,
- "summary_activation": null,
- "summary_first_dropout": 0.1,
- "summary_proj_to_labels": true,
- "summary_type": "cls_index",
- "summary_use_proj": true,
- "task_specific_params": {
- "text-generation": {
- "do_sample": true,
- "max_length": 50 }
- },
- "transformers_version": "4.16.0",
- "use_cache": true,
- "vocab_size": 50257
- }
-
- To create a GPT-2 model for language modeling pretraining:
-
- .. testcode::
-
- from composer.models import create_gpt2
-
- composer_model = create_gpt2()
-
- """
- warnings.warn(DeprecationWarning('create_gpt2 is deprecated and will be removed in v0.18'))
-
- try:
- import transformers
- except ImportError as e:
- raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='transformers') from e
-
- if not model_config:
- model_config = {}
-
- if not pretrained_model_name:
- pretrained_model_name = 'gpt2'
-
- if use_pretrained:
- assert transformers.AutoModelForCausalLM.from_pretrained is not None, 'AutoModelForCausalLM has from_pretrained method'
- model = transformers.AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name,
- **model_config)
- else:
- config = transformers.AutoConfig.from_pretrained(pretrained_model_name, **model_config)
- assert transformers.AutoModelForCausalLM.from_config is not None, 'AutoModelForCausalLM has from_config method'
- model = transformers.AutoModelForCausalLM.from_config(config)
-
- if gradient_checkpointing:
- model.gradient_checkpointing_enable() # type: ignore
-
- # setup the tokenizer
- if tokenizer_name:
- tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
- else:
- tokenizer = None
-
- return HuggingFaceModel(model=model,
- tokenizer=tokenizer,
- metrics=[LanguageCrossEntropy(), LanguagePerplexity()],
- use_logits=True)
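Since `model_config` is forwarded as keyword overrides to `AutoConfig.from_pretrained`, the deleted helper could also build the smaller family members from the README's architecture table. A hedged sketch (the 52M settings below are taken from that table; `transformers` must be installed):

```python
from composer.models import create_gpt2  # as exported before this PR removed it

# GPT-2 52M from the README table: 8 layers, d_model 512, 8 attention heads.
composer_model = create_gpt2(model_config={
    'n_layer': 8,
    'n_embd': 512,
    'n_head': 8,
})
```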
diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py
index 9cf5939bfe..439f8b50fe 100644
--- a/composer/models/huggingface.py
+++ b/composer/models/huggingface.py
@@ -5,6 +5,7 @@
from __future__ import annotations
+import copy
import inspect
import json
import logging
@@ -13,24 +14,31 @@
import string
import tempfile
import textwrap
+import warnings
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, Type, Union
import torch
from torchmetrics import Metric
-from composer.metrics import InContextLearningMetric, InContextLearningQAAccuracy
from composer.models.base import ComposerModel
from composer.utils import MissingConditionalImportError, dist, get_file, import_object, is_model_fsdp, safe_torch_load
+try:
+ from peft import PeftModel, get_peft_model
+ peft_installed = True
+except:
+ peft_installed = False
+
if TYPE_CHECKING:
import transformers
+ from peft import PeftConfig, PeftModel
from transformers import PretrainedConfig
from transformers.models.auto.auto_factory import _BaseAutoModelClass
log = logging.getLogger(__name__)
-__all__ = ['HuggingFaceModel']
+__all__ = ['HuggingFaceModel', 'peft_installed']
class HuggingFaceModel(ComposerModel):
@@ -38,7 +46,7 @@ class HuggingFaceModel(ComposerModel):
    A wrapper class that converts 🤗 Transformers models to composer models.
Args:
-        model (transformers.PreTrainedModel): A 🤗 Transformers model.
+        model (Union[transformers.PreTrainedModel, peft.PeftModel]): A 🤗 Transformers model or a PEFT model.
tokenizer (transformers.PreTrainedTokenizer, optional): The tokenizer used to prepare the dataset. Default ``None``.
.. note:: If the tokenizer is provided, its config will be saved in the composer checkpoint, and it can be reloaded
@@ -48,6 +56,8 @@ class HuggingFaceModel(ComposerModel):
eval_metrics (list[Metric], optional): list of torchmetrics to compute on the eval_dataloader, or be accessible to :class:`Evaluator`s. Default: ``None``.
shift_labels (bool, optional): If True, the batch's labels will be shifted before being used to calculate metrics. This should be set to true for CausalLM models and false otherwise. If not specified, `shift_labels` will be set automatically based on the model class name. Default: ``None``.
allow_embedding_resizing (bool, optional): If True, the model's embeddings will be automatically resized when they are smaller than the tokenizer vocab size. Default: ``False``.
+ peft_config (PeftConfig, optional): Optional PEFT config to apply to the model. If provided, the model will be converted to a PEFT model. Only LoRA is currently supported.
+ should_save_peft_only (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights.
.. note:: To ensure correct behavior, set `shift_labels` manually if using a custom model (i.e., if `model` is not
        an instance of a registered 🤗 Transformers class).
@@ -66,14 +76,16 @@ class HuggingFaceModel(ComposerModel):
"""
def __init__(self,
- model: transformers.PreTrainedModel,
+ model: Union[transformers.PreTrainedModel, 'PeftModel'],
tokenizer: Optional[Union[transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast]] = None,
use_logits: Optional[bool] = False,
metrics: Optional[List[Metric]] = None,
eval_metrics: Optional[List[Metric]] = None,
shift_labels: Optional[bool] = None,
- allow_embedding_resizing: bool = False) -> None:
+ allow_embedding_resizing: bool = False,
+ peft_config: Optional['PeftConfig'] = None,
+ should_save_peft_only: bool = True) -> None:
try:
import transformers
del transformers # unused
@@ -82,71 +94,118 @@ def __init__(self,
conda_package='transformers',
conda_channel='conda-forge') from e
+ if peft_config is not None:
+ if not peft_installed:
+ raise MissingConditionalImportError(extra_deps_group='peft',
+ conda_package='peft',
+ conda_channel='conda-forge')
+
+ if peft_config is not None:
+ # Hugging Face requires the peft type and task type to be upper case, so we do that here
+ # https://github.com/huggingface/peft/blob/ebbff4023ad276cbcb2466fd7e99be7d3ae0ae11/src/peft/utils/peft_types.py#L22-L51
+ if isinstance(peft_config.peft_type, str):
+ peft_config.peft_type = peft_config.peft_type.upper()
+ if isinstance(peft_config.task_type, str):
+ peft_config.task_type = peft_config.task_type.upper()
+
+ if peft_config.peft_type != 'LORA':
+ raise ValueError(
+ f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.')
+
super().__init__()
self.model = model
- self.config = model.config
- self.model_forward_args = inspect.getfullargspec(self.model.forward).args
+ self.config: PretrainedConfig = model.config
+ self.model_forward_args = self._get_model_forward_args()
self.tokenizer = tokenizer
+ self.should_save_peft_only = should_save_peft_only
+ self.use_logits = use_logits
+ self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists
+ self.dummy_forward_called = False # Used to make FSDP generate work, see generate function for more details
+ self.train_metrics: Optional[Dict] = self._get_metric_dict(metrics) if metrics is not None else None
+ self.val_metrics: Optional[Dict] = self._get_metric_dict(
+ eval_metrics) if eval_metrics is not None else copy.deepcopy(self.train_metrics)
+
+ is_causal_lm = _is_registered_causal_lm(self.model)
+ self.shift_labels = is_causal_lm if shift_labels is None else shift_labels
+
+ self._check_tokenizer_and_maybe_resize_embeddings(allow_embedding_resizing)
+
+ if is_causal_lm and not self.shift_labels:
+ log.warning('The shift_labels argument was set to False but the model is an instance of a'
+ ' HuggingFace Causal LM. This may lead to incorrect behavior.')
+ # Note: No warning if shift_labels and not is_causal_lm, since the model may simply be a custom class.
+
+ if peft_config is not None:
+ self.model = _maybe_get_peft_model(peft_config, self.model)
+ self.using_peft = isinstance(self.model, PeftModel) if peft_installed else False
+
+ def _check_tokenizer_and_maybe_resize_embeddings(self, allow_embedding_resizing: bool) -> None:
if self.tokenizer is None:
log.warning(
'The tokenizer was not provided. This means the tokenizer config will not be saved in the checkpoint.')
- if tokenizer is not None and self.config.vocab_size < len(tokenizer):
+ if self.tokenizer is not None and self.config.vocab_size < len(self.tokenizer):
if allow_embedding_resizing:
# when the embedding size is smaller than the tokenizer vocab size,
# the embeddings should get resized to match the tokenizer vocab size
log.warning(f'The number of tokens in the tokenizer is greater than the number of tokens in the model.'
f' This would cause an error during training.'
- f' Resizing the model embeddings to {len(tokenizer)} from {self.config.vocab_size}.')
- self.model.resize_token_embeddings(len(tokenizer))
+ f' Resizing the model embeddings to {len(self.tokenizer)} from {self.config.vocab_size}.')
+ self.model.resize_token_embeddings(len(self.tokenizer))
else:
raise ValueError(
f'The number of tokens in the tokenizer is greater than the number of tokens in the model.'
f' This would cause an error during training.'
- f' You can resize the model embeddings to {len(tokenizer)} from {self.config.vocab_size}'
+ f' You can resize the model embeddings to {len(self.tokenizer)} from {self.config.vocab_size}'
f' by calling `model.resize_token_embeddings(len(tokenizer))` before calling the `HuggingFaceModel`'
f' constructor, or pass `allow_embedding_resizing=True` to have it done automatically.')
- elif tokenizer is not None and self.config.vocab_size > len(tokenizer):
+ elif self.tokenizer is not None and self.config.vocab_size > len(self.tokenizer):
# when the embedding size is greater than the tokenizer vocab size,
# the embeddings do not _need_ to be resized to match the tokenizer vocab size,
# and should be done by the user if desired
log.info(
f'The number of tokens in the tokenizer is less than the number of tokens in the model.'
- f' You may want to resize the model embeddings to {len(tokenizer)} from {self.config.vocab_size}'
+ f' You may want to resize the model embeddings to {len(self.tokenizer)} from {self.config.vocab_size}'
f' by calling `model.resize_token_embeddings(len(tokenizer))` before calling the `HuggingFaceModel`'
f' constructor. The vocab size is sometimes intentionally set to a multiple of 32 or 64 to improve'
f' performance.')
- self.use_logits = use_logits
+ def _get_metric_dict(self, metrics: List[Metric]) -> Dict[str, Metric]:
+ """Returns a dictionary of metrics keyed by their class name."""
+ return {metric.__class__.__name__: metric for metric in metrics}
- self.train_metrics: Optional[Dict] = None
- self.val_metrics: Optional[Dict] = None
+ def _get_model_forward_args(self) -> Set[str]:
+ """Returns the arguments to the model's forward function."""
+ model_forward_args = inspect.signature(maybe_get_underlying_model(self.model).forward).parameters.keys()
- if eval_metrics is not None:
- self.val_metrics = {metric.__class__.__name__: metric for metric in eval_metrics}
- if metrics is not None:
- self.train_metrics = {metric.__class__.__name__: metric for metric in metrics}
- # if eval_metrics is None, use the same metrics as train_metrics
- if eval_metrics is None:
- self.val_metrics = {metric.__class__.__name__: metric for metric in metrics}
+ if not model_forward_args:
+ raise ValueError('Could not determine the forward arguments of the model. Please open a GitHub issue.')
- self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists
+ model_forward_args = set(model_forward_args)
- is_causal_lm = _is_registered_causal_lm(model)
- self.shift_labels = is_causal_lm if shift_labels is None else shift_labels
- if is_causal_lm and not self.shift_labels:
- log.warning('The shift_labels argument was set to False but the model is an instance of a'
- ' HuggingFace Causal LM. This may lead to incorrect behavior.')
- # Note: No warning if shift_labels and not is_causal_lm, since the model may simply be a custom class.
+ return model_forward_args
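+        # e.g. for a typical causal LM this is a set like
+        # {'input_ids', 'attention_mask', 'labels', ...} (exact names vary by model).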
+
+ def state_dict(self, *args, **kwargs) -> Dict[str, Any]:
+ """Returns the state dict of the model."""
+ full_state_dict = super().state_dict(*args, **kwargs)
- self.dummy_forward_called = False
+ if self.using_peft and self.should_save_peft_only:
+ active_adapter = self.model.active_adapter
+ assert isinstance(active_adapter, str)
+ full_state_dict = filter_state_dict_peft(full_state_dict,
+ self.model.peft_config[active_adapter],
+ adapter_name='default',
+ remove_adapter_names=False)
+
+ return full_state_dict
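+    # Behavior sketch: when `using_peft` and `should_save_peft_only` are set, the
+    # returned state dict keeps only the active adapter's weights (e.g. keys
+    # containing 'lora_'), with adapter names left in place
+    # (remove_adapter_names=False), so the keys still match the model's own
+    # parameter names.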
@staticmethod
def load_huggingface_tokenizer_from_saved_state(
- hf_state: Dict[str, Any],
- trust_remote_code: bool = False,
- tokenizer_save_dir: Optional[str] = None) -> Optional[transformers.PreTrainedTokenizer]:
+ hf_state: Dict[str, Any],
+ trust_remote_code: bool = False,
+ tokenizer_save_dir: Optional[str] = None
+ ) -> Optional[transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast]:
"""A helper function that loads a HuggingFace tokenizer from a loaded in hf state.
Args:
@@ -156,7 +215,7 @@ def load_huggingface_tokenizer_from_saved_state(
a folder with a unique suffix will be saved in the current working directory. Defaults to None.
Returns:
- Optional[transformers.PreTrainedTokenizer]: The loaded HuggingFace tokenizer
+ Optional[transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast]: The loaded HuggingFace tokenizer
"""
try:
import transformers
@@ -201,7 +260,7 @@ def load_huggingface_tokenizer_from_saved_state(
raise MissingConditionalImportError(extra_deps_group='sentencepiece',
conda_package='sentencepiece') from e
s = spm.SentencePieceProcessor()
- s.load_from_serialized_proto(saved_content['content'])
+ s.load_from_serialized_proto(saved_content['content']) # pyright: ignore[reportGeneralTypeIssues]
with open(tokenizer_file_path, 'wb') as _f:
_f.write(s.serialized_model_proto())
@@ -265,7 +324,8 @@ def load_huggingface_model_from_saved_state(
# pyright can't tell this isn't a string at this point
if issubclass(
model_instantiation_class, # type: ignore
- transformers.models.auto.auto_factory._BaseAutoModelClass):
+ transformers.models.auto.auto_factory._BaseAutoModelClass # type: ignore
+ ): # pyright: ignore[reportGeneralTypeIssues]
hf_model = model_instantiation_class.from_config(loaded_config) # type: ignore
else:
hf_model = model_instantiation_class(loaded_config) # type: ignore
@@ -291,7 +351,8 @@ def hf_from_composer_checkpoint(
model_config_kwargs: Optional[dict] = None,
local_checkpoint_save_location: Optional[Union[Path, str]] = None,
trust_remote_code: bool = False,
- ) -> Tuple[transformers.PreTrainedModel, Optional[transformers.PreTrainedTokenizer]]:
+ ) -> Tuple[transformers.PreTrainedModel, Optional[Union[transformers.PreTrainedTokenizer,
+ transformers.PreTrainedTokenizerFast]]]:
"""Loads a HuggingFace model (and tokenizer if present) from a composer checkpoint.
.. note:: This function does not load the weights from the checkpoint. It just loads the correctly configured
@@ -353,7 +414,7 @@ def hf_from_composer_checkpoint(
ValueError: If the ``model_instantiation_class``, or the model class saved in the checkpoint, is not able to be imported
Returns:
- Tuple[transformers.PreTrainedModel, Optional[transformers.PreTrainedTokenizer]]: The loaded HuggingFace model and (if present) tokenizer
+ Tuple[transformers.PreTrainedModel, Optional[Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]]]: The loaded HuggingFace model and (if present) tokenizer
"""
# default local path to a tempfile if path is not provided
@@ -413,7 +474,8 @@ def eval_forward(self, batch, outputs: Optional[Any] = None):
**batch.get('generation_kwargs', {}))
        # don't remove the prefix space for sentencepiece models
- if len(self.tokenizer(' a', add_special_tokens=False)['input_ids']) == 1:
+ if len(self.tokenizer(
+ ' a', add_special_tokens=False)['input_ids']) == 1: # pyright: ignore[reportGeneralTypeIssues]
return self.tokenizer.batch_decode(generation[:, batch['input_ids'].shape[1]:],
skip_special_tokens=True)
else:
@@ -429,7 +491,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None):
# HF encoder decoder models like T5 expect either decoder_input_ids or labels,
# so we add decoder_input_ids to the batch if it is missing
- if self.model.config.is_encoder_decoder and 'decoder_input_ids' not in batch:
+ if self.config.is_encoder_decoder and 'decoder_input_ids' not in batch:
if hasattr(self.model, 'prepare_decoder_input_ids_from_labels'):
batch['decoder_input_ids'] = self.model.prepare_decoder_input_ids_from_labels(labels=self.labels)
else:
@@ -469,14 +531,10 @@ def get_metrics(self, is_train: bool = False) -> Dict[str, Metric]:
return metrics if metrics else {}
def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None:
- if isinstance(metric, InContextLearningQAAccuracy):
- assert self.labels is not None
- metric.update(batch=batch, outputs=outputs, labels=self.labels) # pyright: ignore [reportGeneralTypeIssues]
- elif isinstance(metric, InContextLearningMetric):
- assert self.labels is not None
- metric.update(batch, outputs, self.labels) # pyright: ignore [reportGeneralTypeIssues]
+ if getattr(metric, 'needs_batch', False):
+ metric.update(batch=batch, outputs=outputs, labels=self.labels)
else:
- metric.update(outputs, self.labels) # pyright: ignore [reportGeneralTypeIssues]
+ metric.update(outputs, self.labels)
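+    # Design note: metrics that need the raw batch (e.g. the in-context-learning
+    # metrics) now opt in via a `needs_batch = True` class attribute, replacing the
+    # previous per-class isinstance checks.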
def get_metadata(self):
model_output = {}
@@ -485,7 +543,9 @@ def get_metadata(self):
tmp_dir = Path(tmp_dir)
model_dir = tmp_dir / 'model'
tokenizer_dir = tmp_dir / 'tokenizer'
- self.model.config.save_pretrained(model_dir)
+
+ original_model_config: PretrainedConfig = self.config
+ original_model_config.save_pretrained(model_dir)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(tokenizer_dir)
@@ -498,6 +558,19 @@ def get_metadata(self):
'class': f'{self.model.__class__.__module__}.{self.model.__class__.__name__}'
}
+ # Also save PEFT config if the model is a peft model
+ if self.using_peft:
+ active_adapter = self.model.active_adapter
+ assert isinstance(active_adapter, str)
+ self.model.peft_config[active_adapter].save_pretrained(str(model_dir))
+ with open(model_dir / 'adapter_config.json') as _peft_config_file:
+ peft_config = json.load(_peft_config_file)
+
+ model_output['peft_config'] = {
+ 'file_extension': '.json',
+ 'content': peft_config,
+ }
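+            # The stored entry mirrors the on-disk adapter_config.json, roughly
+            # (values illustrative):
+            #   {'file_extension': '.json', 'content': {'peft_type': 'LORA', 'r': 8, ...}}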
+
if self.tokenizer is not None:
for tokenizer_file_name in tokenizer_dir.iterdir():
tokenizer_file_path = tokenizer_dir / tokenizer_file_name
@@ -517,7 +590,8 @@ def get_metadata(self):
except ImportError as e:
raise MissingConditionalImportError(extra_deps_group='sentencepiece',
conda_package='sentencepiece') from e
- s = spm.SentencePieceProcessor(model_file=str(tokenizer_file_path))
+ s = spm.SentencePieceProcessor(
+ model_file=str(tokenizer_file_path)) # pyright: ignore[reportGeneralTypeIssues]
tokenizer_file_content = s.serialized_model_proto()
else:
raise ValueError(
@@ -542,25 +616,7 @@ def generate(self, input_ids: torch.Tensor, **kwargs):
"""
pad_token_id = kwargs.pop('pad_token_id', self.tokenizer.pad_token_id if self.tokenizer is not None else None)
- from composer.utils.misc import using_torch_2
-
- # We need to call forward once in order for FSDP + generate to work
- # This solution works because parameters in the root FSDP module are not freed after forward
- # See https://github.com/huggingface/accelerate/issues/570, https://github.com/huggingface/accelerate/issues/947,
- # and https://github.com/pytorch/pytorch/issues/82461, https://github.com/pytorch/pytorch/issues/100069 for more info
- # Note: This is a solution for Torch 1.13.x, and there is a different solution below for Torch 2.0
- if not using_torch_2() and not self.dummy_forward_called and is_model_fsdp(self.model):
- with torch.no_grad():
- maybe_decoder_input_ids = {}
- if self.model.config.is_encoder_decoder:
- maybe_decoder_input_ids['decoder_input_ids'] = torch.tensor([[0]],
- dtype=torch.long,
- device=input_ids.device)
- self.model(input_ids=torch.tensor([[0]], dtype=torch.long, device=input_ids.device),
- **maybe_decoder_input_ids)
- self.dummy_forward_called = True
-
- if is_model_fsdp(self.model) and using_torch_2():
+ if is_model_fsdp(self.model):
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
# Note: We need to use the FSDP.summon_full_params context manager here because the generate function
@@ -574,7 +630,49 @@ def generate(self, input_ids: torch.Tensor, **kwargs):
return self.model.generate(input_ids=input_ids, pad_token_id=pad_token_id, **kwargs)
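+    # Example call (sketch; extra kwargs are forwarded to HF `generate`):
+    #   composer_model.generate(batch['input_ids'], max_new_tokens=32, do_sample=False)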
-def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool:
+def _maybe_get_peft_model(
+ peft_config: 'PeftConfig',
+ model: Union[transformers.PreTrainedModel, 'PeftModel'],
+) -> 'PeftModel':
+ """Creates a PEFT model if the model is not already a PEFT model.
+
+ Args:
+        peft_config (peft.PeftConfig): The PEFT config to use to create the PEFT model
+ model (Union[transformers.PreTrainedModel, 'PeftModel']): The model to create the PEFT model from
+
+ Returns:
+ PeftModel: The PEFT model
+ """
+ if not peft_installed:
+ raise MissingConditionalImportError(extra_deps_group='peft', conda_package='peft', conda_channel='conda-forge')
+
+ if not isinstance(model, PeftModel):
+ log.info('Creating PEFT model')
+ peft_model = get_peft_model(model, peft_config)
+ assert isinstance(peft_model, PeftModel)
+ return peft_model
+ else:
+ warnings.warn('PEFT model was passed in directly. Ignoring the provided PEFT config.')
+ return model
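+# Usage sketch (assumes peft is installed; config values are illustrative):
+#   from peft import LoraConfig
+#   peft_model = _maybe_get_peft_model(LoraConfig(r=8), hf_model)
+# Passing an existing PeftModel returns it unchanged and warns that the provided
+# config is ignored.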
+
+
+def maybe_get_underlying_model(
+ model: Union[transformers.PreTrainedModel, 'PeftModel']) -> Union[transformers.PreTrainedModel, 'PeftModel']:
+    """Get the underlying PreTrainedModel from a model if it is a PEFT model.
+
+ Args:
+ model (Union[transformers.PreTrainedModel, 'PeftModel']): The model to get the underlying model from
+
+ Returns:
+        Union[transformers.PreTrainedModel, PeftModel]: The underlying model, unwrapped if it was a PEFT model
+ """
+ if peft_installed and isinstance(model, PeftModel):
+ return model.base_model.model
+ else:
+ return model
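+# e.g. maybe_get_underlying_model(peft_model) returns the wrapped
+# transformers.PreTrainedModel (peft_model.base_model.model); any other input is
+# returned as-is.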
+
+
+def _is_registered_causal_lm(model: Union[transformers.PreTrainedModel, 'PeftModel']) -> bool:
"""Return True if model class is either a registered đ¤ Causal LM or a subclass of one"""
try:
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
@@ -583,6 +681,8 @@ def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool:
conda_package='transformers',
conda_channel='conda-forge') from e
+ model_to_check = maybe_get_underlying_model(model)
+
# This try/except is needed until https://github.com/huggingface/transformers/issues/26778
# is resolved in a release. This means that this attempt to automatically detect causal LMs
# does not currently work in an environment with flash attention <2 installed.
@@ -594,7 +694,7 @@ def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool:
return False
else:
raise e
- return any(isinstance(model, causal_lm_class) for causal_lm_class in causal_lm_classes)
+ return any(isinstance(model_to_check, causal_lm_class) for causal_lm_class in causal_lm_classes) # type: ignore
def get_hf_config_from_composer_state_dict(state_dict: Dict[str, Any],
@@ -637,6 +737,30 @@ def get_hf_config_from_composer_state_dict(state_dict: Dict[str, Any],
f'config has a valid `_name_or_path`.')
+def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Optional['PeftConfig']:
+    """Get a PEFT config from a composer state dict.
+
+ Args:
+ state_dict (Dict[str, Any]): The state dict to get the config from
+
+ Returns:
+ Optional[peft.PeftConfig]: The PEFT config. Will be ``None`` if the model is not a PEFT model.
+ """
+ try:
+ import peft
+ except ImportError as e:
+ raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='peft',
+ conda_channel='conda-forge') from e
+
+ hf_model_dict = state_dict['state']['integrations']['huggingface']['model']
+ if 'peft_config' not in hf_model_dict:
+ return None
+
+ peft_config_dict = hf_model_dict['peft_config']['content']
+
+ return peft.get_peft_config(peft_config_dict)
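+# Round-trip sketch (checkpoint filename hypothetical):
+#   composer_state_dict = torch.load('checkpoint.pt')
+#   peft_config = get_peft_config_from_composer_state_dict(composer_state_dict)
+#   if peft_config is not None:
+#       print(peft_config.peft_type)  # e.g. 'LORA'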
+
+
def write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path: Union[Path, str],
output_folder: Union[Path, str],
@@ -713,6 +837,61 @@ def write_huggingface_pretrained_from_composer_checkpoint(
config = get_hf_config_from_composer_state_dict(composer_state_dict)
config.save_pretrained(output_folder)
+ peft_config = get_peft_config_from_composer_state_dict(composer_state_dict)
+ if peft_config is not None:
+ peft_config.save_pretrained(str(output_folder))
+
weights_state_dict = composer_state_dict['state']['model']
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.')
- torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin')
+
+ # NOTE: This only works for default adapter name, not multiple adapters
+ if peft_config is not None:
+ weights_state_dict = filter_state_dict_peft(weights_state_dict, peft_config, adapter_name='default')
+
+ torch.save(weights_state_dict, Path(output_folder) / 'adapter_model.bin')
+ else:
+ torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin')
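+    # Resulting folder layout (sketch): adapter_config.json + adapter_model.bin for
+    # PEFT checkpoints, otherwise config.json + pytorch_model.bin.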
+
+
+def filter_state_dict_peft(state_dict: Dict[str, Any],
+ peft_config: 'PeftConfig',
+ adapter_name: str = 'default',
+ remove_adapter_names: bool = True) -> Dict[str, Any]:
+    """Filter a state dict to only include the weights needed for a PEFT model.
+
+    Note: This function only works with LoRA PEFT models right now.
+
+ Args:
+ state_dict (Dict[str, Any]): The state dict to filter
+ peft_config (PeftConfig): The PEFT config to use to filter the state dict
+ adapter_name (str, optional): The name of the adapter to filter for. Defaults to 'default'.
+ remove_adapter_names (bool, optional): Whether to remove the adapter names from the state dict keys. Defaults to True.
+
+ Returns:
+ Dict[str, Any]: The filtered state dict
+ """
+
+ if peft_config.peft_type != 'LORA':
+ raise NotImplementedError(f'Only LoRA PEFT is supported. Got {peft_config.peft_type}')
+
+ # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116
+ bias = peft_config.bias # type: ignore
+ if bias == 'none':
+ to_return = {k: state_dict[k] for k in state_dict if 'lora_' in k}
+ elif bias == 'all':
+ to_return = {k: state_dict[k] for k in state_dict if 'lora_' in k or 'bias' in k}
+ elif bias == 'lora_only':
+ to_return = {}
+ for k in state_dict:
+ if 'lora_' in k:
+ to_return[k] = state_dict[k]
+ bias_name = k.split('lora_')[0] + 'bias'
+ if bias_name in state_dict:
+ to_return[bias_name] = state_dict[bias_name]
+ else:
+        raise NotImplementedError(f'Unsupported bias type: {bias}')
+ to_return = {k: v for k, v in to_return.items() if (('lora_' in k and adapter_name in k) or ('bias' in k))}
+
+ if remove_adapter_names:
+ to_return = {k.replace(f'.{adapter_name}', ''): v for k, v in to_return.items()}
+ return to_return
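+
+# Illustrative example (key names hypothetical): with bias='none' and the default
+# adapter, a key such as
+#   'model.layers.0.self_attn.q_proj.lora_A.default.weight'
+# is kept and, with remove_adapter_names=True, renamed to
+#   'model.layers.0.self_attn.q_proj.lora_A.weight',
+# while base-model weights without 'lora_' in the key are dropped.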
diff --git a/composer/models/mmdetection.py b/composer/models/mmdetection.py
deleted file mode 100644
index 2e53aac543..0000000000
--- a/composer/models/mmdetection.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A wrapper class that converts mmdet detection models to composer models"""
-
-from __future__ import annotations
-
-import warnings
-from typing import TYPE_CHECKING, Any, List, Optional
-
-import numpy as np
-import torch
-from torchmetrics import Metric
-from torchmetrics.collections import MetricCollection
-
-from composer.models import ComposerModel
-
-if TYPE_CHECKING:
- import mmdet
-
-__all__ = ['MMDetModel']
-
-
-class MMDetModel(ComposerModel):
- """A wrapper class that adapts mmdetection detectors to composer models.
-
- Args:
- model (mmdet.models.detectors.BaseDetector): An MMdetection Detector.
- metrics (list[Metric], optional): list of torchmetrics to apply to the output of `eval_forward`. Default: ``None``.
-
- .. warning:: This wrapper is designed to work with mmdet datasets.
-
- Example:
-
- .. code-block:: python
-
- from mmdet.models import build_model
- from mmcv import ConfigDict
- from composer.models import MMDetModel
-
- yolox_s_config = dict(
- type='YOLOX',
- input_size=(640, 640),
- random_size_range=(15, 25),
- random_size_interval=10,
- backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5),
- neck=dict(type='YOLOXPAFPN', in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1),
- bbox_head=dict(type='YOLOXHead', num_classes=num_classes, in_channels=128, feat_channels=128),
- train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
- test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
- yolox = build_model(ConfigDict(yolox_s_config))
- yolox.init_weights()
- model = MMDetModel(yolox)
- """
-
- def __init__(
- self,
- model: mmdet.models.detectors.BaseDetector, # type: ignore
- metrics: Optional[List[Metric]] = None) -> None:
- warnings.warn(DeprecationWarning('MMDetModel is deprecated and will be removed in v0.18'))
- super().__init__()
- self.model = model
-
- self.train_metrics = None
- self.val_metrics = None
-
- if metrics:
- metric_collection = MetricCollection(metrics)
- self.train_metrics = metric_collection.clone(prefix='train_')
- self.val_metrics = metric_collection.clone(prefix='val_')
-
- def forward(self, batch):
- # this will return a dictionary of losses in train mode and model outputs in test mode.
- return self.model(**batch)
-
- def loss(self, outputs, batch, **kwargs):
- return outputs
-
- def eval_forward(self, batch, outputs: Optional[Any] = None):
- """
- Args:
- batch (dict): a eval batch of the format:
-
-
- ``img`` (List[torch.Tensor]): list of image torch.Tensors of shape (batch, c, h , w).
-
-
- ``img_metas`` (List[Dict]): (1, batch_size) list of ``image_meta`` dicts.
- Returns: model predictions: A batch_size length list of dictionaries containg detection boxes in (x,y, x2, y2) format, class labels, and class probabilities.
- """
- device = batch['img'][0].device
- batch.pop('gt_labels')
- batch.pop('gt_bboxes')
- results = self.model(return_loss=False, rescale=True, **batch) # models behave differently in eval mode
-
- # outputs are a list of bbox results (x, y, x2, y2, score)
- # pack mmdet bounding boxes and labels into the format for torchmetrics MAP expects
- preds = []
- for bbox_result in results:
- boxes_scores = np.vstack(bbox_result)
- boxes, scores = torch.from_numpy(boxes_scores[..., :-1]).to(device), torch.from_numpy(
- boxes_scores[..., -1]).to(device)
- labels = [np.full(result.shape[0], i, dtype=np.int32) for i, result in enumerate(bbox_result)]
- pred = {
- 'labels': torch.from_numpy(np.concatenate(labels)).to(device).long(),
- 'boxes': boxes.float(),
- 'scores': scores.float()
- }
- preds.append(pred)
- return preds
-
- def get_metrics(self, is_train: bool = False):
- if is_train:
- metrics = self.train_metrics
- else:
- metrics = self.val_metrics
- return metrics if metrics else {}
-
- def update_metric(self, batch: Any, outputs: Any, metric: Metric):
- targets_box = batch.pop('gt_bboxes')[0]
- targets_cls = batch.pop('gt_labels')[0]
- targets = []
- for i in range(len(targets_box)):
- t = {'boxes': targets_box[i], 'labels': targets_cls[i]}
- targets.append(t)
- metric.update(outputs, targets)
diff --git a/composer/models/resnet/README.md b/composer/models/resnet/README.md
deleted file mode 100644
index 430dd303b4..0000000000
--- a/composer/models/resnet/README.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# ResNet
-[\[How to Use\]](#how-to-use) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference)
-
-`Vision` / `Image Classification`
-
-The ResNet model family is a set of convolutional neural networks that can be used as a basis for a variety of vision tasks. Our implementation is a simple wrapper on top of the [torchvision ResNet implementation](https://pytorch.org/vision/stable/models.html).
-
-## How to Use
-
-```python
-from composer.models import composer_resnet
-
-model = composer_resnet(
- model_name="resnet50",
- num_classes=1000,
- weights=None
-)
-```
-
-## Architecture
-
-The basic architecture defined in the original papers is as follows:
-
-- The first layer is a 7x7 Convolution with stride 2 and 64 filters.
-- Subsequent layers follow 4 stages with {64, 128, 256, 512} input channels with a varying number of residual blocks at each stage that depends on the family member. At the end of every stage, the resolution is reduced by half using a convolution with stride 2.
-- The final section consists of a global average pooling followed by a linear + softmax layer that outputs values for the specified number of classes.
-
-The below table from [He et al.](https://arxiv.org/abs/1512.03385) details some of the building blocks for ResNets of different sizes.
-
-![resnet.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/resnet.png)
-
-## Family Members
-
-ResNet family members are identified by their number of layers. Parameter count, accuracy, and training time are provided below.
-
-| Model Family Members | Parameter Count | Our Accuracy | Training Time on 8xA100s |
-|----------------------|-----------------|--------------|--------------------------|
-| ResNet-18 | 11.5M | TBA | TBA |
-| ResNet-34 | 21.8M | TBA | TBA |
-| ResNet-50 | 25.6M | 76.5% | 3.83 hrs |
-| ResNet-101 | 44.5M | 78.1% | 5.50 hrs |
-| ResNet-152 | 60.2M | TBA | TBA |
-
-
-> ❗ **Note**: Please see the [CIFAR ResNet model card](https://docs.mosaicml.com/projects/composer/en/stable/model_cards/cifar_resnet.html#architecture) for the differences between CIFAR and ImageNet ResNets.
-
-## Default Training Hyperparameters
-
-- Optimizer: Decoupled SGDW
- - Learning rate: 2.048
- Momentum: 0.875
- Weight_decay: 5.0e-4
-- LR schedulers:
- - Cosine decay with warmup for 8 epochs
-- Batch size: 2048
-- Number of epochs: 90ep
-
-## Attribution
-
-Paper: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-
-Code and hyperparameters: [DeepLearningExamples Github repository](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5) by Nvidia
-
-## API Reference
-
-```{eval-rst}
-.. autofunction:: composer.models.resnet.model.composer_resnet
- :noindex:
-```
diff --git a/composer/models/resnet/__init__.py b/composer/models/resnet/__init__.py
deleted file mode 100644
index e00a37035b..0000000000
--- a/composer/models/resnet/__init__.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""The ResNet model family is a set of convolutional neural networks described in `Deep Residual Learning for Image
-Recognition `_ (He et al, 2015). ResNets can be used as the base for a variety of
-vision tasks. ImageNet ResNets are a subset of the ResNet family which were designed specifically for classification on
-the ImageNet dataset.
-
-See the :doc:`Model Card ` for more details.
-"""
-from composer.models.resnet.model import composer_resnet
-
-__all__ = ['composer_resnet']
-
-_metadata = {
- 'resnet18': {
- '_task': 'Image Classification',
- '_dataset': 'ImageNet',
- '_name': 'ResNet18',
- '_quality': 'TBD',
- '_metric': 'Top-1 Accuracy',
- '_ttt': 'TBD',
- '_hparams': 'resnet18.yaml'
- },
- 'resnet34': {
- '_task': 'Image Classification',
- '_dataset': 'ImageNet',
- '_name': 'ResNet34',
- '_quality': 'TBD',
- '_metric': 'Top-1 Accuracy',
- '_ttt': 'TBD',
- '_hparams': 'resnet34.yaml'
- },
- 'resnet50': {
- '_task': 'Image Classification',
- '_dataset': 'ImageNet',
- '_name': 'ResNet50',
- '_quality': '76.51',
- '_metric': 'Top-1 Accuracy',
- '_ttt': '3h 33m',
- '_hparams': 'resnet50.yaml'
- },
- 'resnet101': {
- '_task': 'Image Classification',
- '_dataset': 'ImageNet',
- '_name': 'ResNet101',
- '_quality': '78.10',
- '_metric': 'Top-1 Accuracy',
- '_ttt': '8h 15m',
- '_hparams': 'resnet101.yaml',
- },
- 'resnet152': {
- '_task': 'Image Classification',
- '_dataset': 'ImageNet',
- '_name': 'ResNet152',
- '_quality': 'TBD',
- '_metric': 'Top-1 Accuracy',
- '_ttt': 'TBD',
- '_hparams': 'resnet152.yaml'
- }
-}
diff --git a/composer/models/resnet/model.py b/composer/models/resnet/model.py
deleted file mode 100644
index 5b023fabcf..0000000000
--- a/composer/models/resnet/model.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A :class:`.ComposerClassifier` wrapper around the torchvision implementations of the ResNet model family."""
-
-import logging
-import warnings
-from typing import List, Optional
-
-from torchmetrics import MetricCollection
-from torchmetrics.classification import MulticlassAccuracy
-from torchvision.models import resnet
-
-from composer.loss import loss_registry
-from composer.metrics import CrossEntropy
-from composer.models.initializers import Initializer
-from composer.models.tasks import ComposerClassifier
-
-__all__ = ['composer_resnet']
-
-log = logging.getLogger(__name__)
-
-valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']
-
-
-def composer_resnet(model_name: str,
- num_classes: int = 1000,
- weights: Optional[str] = None,
- groups: int = 1,
- width_per_group: int = 64,
- initializers: Optional[List[Initializer]] = None,
- loss_name: str = 'soft_cross_entropy') -> ComposerClassifier:
- """Helper function to create a :class:`.ComposerClassifier` with a torchvision ResNet model.
-
- From `Deep Residual Learning for Image Recognition `_ (He et al, 2015).
-
- Args:
- model_name (str): Name of the ResNet model instance. Either [``"resnet18"``, ``"resnet34"``, ``"resnet50"``, ``"resnet101"``,
- ``"resnet152"``].
- num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``.
- weights (str, optional): If provided, pretrained weights can be specified, such as with ``IMAGENET1K_V2``. Default: ``None``.
- groups (int, optional): Number of filter groups for the 3x3 convolution layer in bottleneck blocks. Default: ``1``.
- width_per_group (int, optional): Initial width for each convolution group. Width doubles after each stage.
- Default: ``64``.
- initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization.
- Default: ``None``.
- loss_name (str, optional): Loss function to use. E.g. 'soft_cross_entropy' or
- 'binary_cross_entropy_with_logits'. Loss function must be in
- :mod:`~composer.loss.loss`. Default: ``'soft_cross_entropy'``".
- Returns:
- ComposerModel: instance of :class:`.ComposerClassifier` with a torchvision ResNet model.
-
- Example:
-
- .. testcode::
-
- from composer.models import composer_resnet
-
- model = composer_resnet(model_name='resnet18') # creates a torchvision resnet18 for image classification
- """
- warnings.warn(DeprecationWarning('composer_resnet is deprecated and will be removed in v0.18'))
-
- valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']
- if model_name not in valid_model_names:
- raise ValueError(f'model_name must be one of {valid_model_names} instead of {model_name}.')
-
- if loss_name not in loss_registry.keys():
- raise ValueError(f'Unrecognized loss function: {loss_name}. Please ensure the '
- 'specified loss function is present in composer.loss.loss.py')
-
- if loss_name == 'binary_cross_entropy_with_logits' and (initializers is None or
- Initializer.LINEAR_LOG_CONSTANT_BIAS not in initializers):
- log.warning('UserWarning: Using `binary_cross_entropy_loss_with_logits` '
- 'without using `initializers.linear_log_constant_bias` can degrade '
- 'performance. '
- 'Please ensure you are using `initializers. '
- 'linear_log_constant_bias`.')
-
- if initializers is None:
- initializers = []
-
- # Instantiate model
- model_fn = getattr(resnet, model_name)
- model = model_fn(weights=weights, num_classes=num_classes, groups=groups, width_per_group=width_per_group)
-
- # Grab loss function from loss registry
- loss_fn = loss_registry[loss_name]
-
- # Create metrics for train and validation
- train_metrics = MulticlassAccuracy(num_classes=num_classes, average='micro')
- val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=num_classes, average='micro')])
-
- # Apply Initializers to model
- for initializer in initializers:
- initializer = Initializer(initializer)
- model.apply(initializer.get_initializer())
-
- composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn)
- return composer_model
diff --git a/composer/models/resnet_cifar/README.md b/composer/models/resnet_cifar/README.md
deleted file mode 100644
index 5a32ae03b8..0000000000
--- a/composer/models/resnet_cifar/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# CIFAR ResNet
-[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Family Members\]](#family-members) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference)
-
-`Vision` / `Image Classification`
-
-The ResNet model family is a set of convolutional neural networks that can be used as the basis for a variety of vision tasks. CIFAR ResNet models are a subset of this family designed specifically for the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) and [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) datasets.
-
-## Example
-
-```python
-from composer.models import composer_resnet_cifar
-
-model = composer_resnet_cifar(model_name='resnet_56', num_classes=10)
-```
-
-## Architecture
-
-Residual Networks are feedforward convolutional networks with “residual” connections between non-consecutive layers.
-
-The model architecture is defined by the original paper:
-
-- The network inputs are of dimension 32×32×3.
-- The first layer is 3×3 convolutions
-- The subsequent layers are a stack of 6n layers with 3×3 convolutions on the feature maps of sizes {32,16,8}, with 2n layers for each feature map size. The number of filters are {16,32,64} for the respective feature map sizes. Subsampling is performed by convolutions with a stride of 2
-- The network ends with a global average pooling, a linear layer with the output dimension equal to the number of classes, and softmax function.
-
-There are a total 6n+2 stacked weighted layers. Each family member is specified by the number of layers, for example n=9 corresponds to ResNet56
-
-The biggest differences between CIFAR ResNet models and ImageNet ResNet models are:
-
-- CIFAR ResNet models use fewer filters for each convolution.
-- The ImageNet ResNets contain four stages, while the CIFAR ResNets contain three stages. In addition, CIFAR ResNets uniformly distribute blocks across each stage while ImageNet ResNets have a specific number of blocks for each stage.
-
-## Family Members
-
-| Model Family Members | Parameter Count | Our Accuracy | Training Time on 1x3080 |
-|----------------------|-----------------|--------------|-------------------------|
-| ResNet20 | 0.27M | TBA | TBA |
-| ResNet32 | 0.46M | TBA | TBA |
-| ResNet44 | 0.66M | TBA | TBA |
-| ResNet56 | 0.85M | 93.1% | 35 min |
-| ResNet110 | 1.7M | TBA | TBA |
-## Default Training Hyperparameters
-
-```yaml
-optimizer:
- sgd:
- learning_rate: 1.2
- momentum: 0.9
- weight_decay: 1e-4
-schedulers:
- - multistep_with_warmup:
- t_warmup: "5ep"
- milestones:
- - "80ep"
- - "120ep"
- gamma: 0.1
-train_batch_size: 1024
-max_duration: 160ep
-```
-
-## Attribution
-
-Paper: [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
-
-Note that this paper set the standard for ResNet style architectures for both CIFAR-10/100 and ImageNet
-
-## API Reference
-
-```{eval-rst}
-.. autoclass:: composer.models.resnet_cifar.model.composer_resnet_cifar
- :noindex:
-```
diff --git a/composer/models/resnet_cifar/__init__.py b/composer/models/resnet_cifar/__init__.py
deleted file mode 100644
index 2ea6ac226c..0000000000
--- a/composer/models/resnet_cifar/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A ResNet model family adapted for CIFAR10 image sizes.
-
-See the :doc:`Model Card ` for more details.
-"""
-
-from composer.models.resnet_cifar.model import composer_resnet_cifar as composer_resnet_cifar
-
-__all__ = ['composer_resnet_cifar']
-_metadata = {
- 'resnet9': {
- '_task': 'Image Classification',
- '_dataset': 'CIFAR10',
- '_name': 'ResNet9',
- '_quality': 'tbd',
- '_metric': 'Top-1 Accuracy',
- '_ttt': 'tbd',
- '_hparams': 'resnet9_cifar10.yaml'
- },
- 'resnet20': {
- '_task': 'Image Classification',
- '_dataset': 'CIFAR10',
- '_name': 'ResNet20',
- '_quality': 'tbd',
- '_metric': 'Top-1 Accuracy',
- '_ttt': 'tbd',
- '_hparams': 'resnet20_cifar10.yaml'
- },
- 'resnet56': {
- '_task': 'Image Classification',
- '_dataset': 'CIFAR10',
- '_name': 'ResNet56',
- '_quality': '93.1',
- '_metric': 'Top-1 Accuracy',
- '_ttt': '35m',
- '_hparams': 'resnet56_cifar10.yaml'
- }
-}
diff --git a/composer/models/resnet_cifar/model.py b/composer/models/resnet_cifar/model.py
deleted file mode 100644
index 5bb8660b56..0000000000
--- a/composer/models/resnet_cifar/model.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""ResNet models for CIFAR extending :class:`.ComposerClassifier`."""
-
-import warnings
-from typing import List, Optional
-
-from composer.models.initializers import Initializer
-from composer.models.resnet_cifar.resnets import ResNet9, ResNetCIFAR
-from composer.models.tasks import ComposerClassifier
-
-__all__ = ['composer_resnet_cifar']
-
-
-def composer_resnet_cifar(model_name: str,
- num_classes: int = 10,
- initializers: Optional[List[Initializer]] = None) -> ComposerClassifier:
- """Helper function to create a :class:`.ComposerClassifier` with a CIFAR ResNet models.
-
- From `Deep Residual Learning for Image Recognition `_ (He et al, 2015).
- ResNet9 is based on the model from myrtle.ai `blog`_.
-
- Args:
- model_name (str): ``"resnet_9"``, ``"resnet_20"``, or ``"resnet_56"``.
- num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``10``.
- initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization.
- Default: ``None``.
-
- Returns:
- ComposerModel: instance of :class:`.ComposerClassifier` with a CIFAR ResNet model.
-
- Example:
-
- .. testcode::
-
- from composer.models import composer_resnet_cifar
-
- model = composer_resnet_cifar(model_name="resnet_56") # creates a resnet56 for cifar image classification
-
- .. _blog: https://myrtle.ai/learn/how-to-train-your-resnet-4-architecture/
- """
- warnings.warn(DeprecationWarning('composer_resnet_cifar is deprecated and will be removed in v0.18'))
- if initializers is None:
- initializers = []
-
- if model_name == 'resnet_9':
- model = ResNet9(num_classes) # current initializers don't work with this architecture.
- else:
- model = ResNetCIFAR.get_model_from_name(model_name, initializers, num_classes)
-
- composer_model = ComposerClassifier(module=model, num_classes=num_classes)
- return composer_model
diff --git a/composer/models/resnet_cifar/resnets.py b/composer/models/resnet_cifar/resnets.py
deleted file mode 100644
index b4f1576b46..0000000000
--- a/composer/models/resnet_cifar/resnets.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""The CIFAR ResNet torch module.
-
-See the :doc:`Model Card ` for more details.
-"""
-
-# Code below adapted from https://github.com/facebookresearch/open_lth
-# and https://github.com/pytorch/vision
-
-from typing import List, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torchvision.models.resnet import BasicBlock
-
-from composer.models import Initializer
-
-__all__ = ['ResNetCIFAR', 'ResNet9']
-
-
-class ResNetCIFAR(nn.Module):
- """A residual neural network as originally designed for CIFAR-10."""
-
- class Block(nn.Module):
- """A ResNet block."""
-
- def __init__(self, f_in: int, f_out: int, downsample: bool = False):
- super(ResNetCIFAR.Block, self).__init__()
-
- stride = 2 if downsample else 1
- self.conv1 = nn.Conv2d(f_in, f_out, kernel_size=3, stride=stride, padding=1, bias=False)
- self.bn1 = nn.BatchNorm2d(f_out)
- self.conv2 = nn.Conv2d(f_out, f_out, kernel_size=3, stride=1, padding=1, bias=False)
- self.bn2 = nn.BatchNorm2d(f_out)
- self.relu = nn.ReLU(inplace=True)
-
- # No parameters for shortcut connections.
- if downsample or f_in != f_out:
- self.shortcut = nn.Sequential(
- nn.Conv2d(f_in, f_out, kernel_size=1, stride=2, bias=False),
- nn.BatchNorm2d(f_out),
- )
- else:
- self.shortcut = nn.Sequential()
-
- def forward(self, x: torch.Tensor):
- out = self.relu(self.bn1(self.conv1(x)))
- out = self.bn2(self.conv2(out))
- out += self.shortcut(x)
- return self.relu(out)
-
- def __init__(self, plan: List[Tuple[int, int]], initializers: List[Initializer], outputs: int = 10):
- super(ResNetCIFAR, self).__init__()
- outputs = outputs or 10
-
- self.num_classes = outputs
-
- # Initial convolution.
- current_filters = plan[0][0]
- self.conv = nn.Conv2d(3, current_filters, kernel_size=3, stride=1, padding=1, bias=False)
- self.bn = nn.BatchNorm2d(current_filters)
- self.relu = nn.ReLU(inplace=True)
-
- # The subsequent blocks of the ResNet.
- blocks = []
- for segment_index, (filters, num_blocks) in enumerate(plan):
- for block_index in range(num_blocks):
- downsample = segment_index > 0 and block_index == 0
- blocks.append(ResNetCIFAR.Block(current_filters, filters, downsample))
- current_filters = filters
-
- self.blocks = nn.Sequential(*blocks)
-
- # Final fc layer. Size = number of filters in last segment.
- self.fc = nn.Linear(plan[-1][0], outputs)
- self.criterion = nn.CrossEntropyLoss()
-
- for initializer in initializers:
- initializer = Initializer(initializer)
- self.apply(initializer.get_initializer())
-
- def forward(self, x: torch.Tensor):
- out = self.relu(self.bn(self.conv(x)))
- out = self.blocks(out)
- out = F.avg_pool2d(out, out.size()[3])
- out = out.view(out.size(0), -1)
- out = self.fc(out)
- return out
-
- @staticmethod
- def is_valid_model_name(model_name: str):
- valid_model_names = [f'resnet_{layers}' for layers in (20, 56)]
- return (model_name in valid_model_names)
-
- @staticmethod
- def get_model_from_name(model_name: str, initializers: List[Initializer], outputs: int = 10):
- """The naming scheme for a ResNet is ``'resnet_D[_W]'``.
-
- D is the model depth (e.g. ``'resnet_56'``)
- """
-
- if not ResNetCIFAR.is_valid_model_name(model_name):
- raise ValueError('Invalid model name: {}'.format(model_name))
-
- depth = int(model_name.split('_')[-1]) # for resnet56, depth 56, width 16
- if len(model_name.split('_')) == 2:
- width = 16
- else:
- width = int(model_name.split('_')[3])
-
- if (depth - 2) % 3 != 0:
- raise ValueError('Invalid ResNetCIFAR depth: {}'.format(depth))
- num_blocks = (depth - 2) // 6
-
- model_arch = {
- 56: [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)],
- 20: [(width, num_blocks), (2 * width, num_blocks), (4 * width, num_blocks)],
- }
-
- return ResNetCIFAR(model_arch[depth], initializers, outputs)
-
-
-# adapted from https://raw.githubusercontent.com/matthias-wright/cifar10-resnet/master/model.py
-# under the MIT license
-class ResNet9(nn.Module):
- """A 9-layer residual network, excluding BatchNorms and activation functions.
-
- Based on the myrtle.ai `blog`_ and Deep Residual Learning for Image Recognition (`He et al, 2015`_).
-
- Args:
- num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``10``.
-
- .. _blog: https://myrtle.ai/learn/how-to-train-your-resnet-4-architecture/
- .. _He et al, 2015: https://arxiv.org/abs/1512.03385
- """
-
- def __init__(self, num_classes: int = 10):
- super().__init__()
-
- self.body = nn.Sequential(
- nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False),
- nn.BatchNorm2d(num_features=64, momentum=0.9),
- nn.ReLU(inplace=True),
- nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False),
- nn.BatchNorm2d(num_features=128, momentum=0.9),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=2, stride=2),
- BasicBlock(inplanes=128, planes=128, stride=1),
- nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False),
- nn.BatchNorm2d(num_features=256, momentum=0.9),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=2, stride=2),
- nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False),
- nn.BatchNorm2d(num_features=256, momentum=0.9),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=2, stride=2),
- BasicBlock(inplanes=256, planes=256, stride=1),
- )
-
- self.fc = nn.Linear(in_features=256, out_features=num_classes, bias=True)
-
- def forward(self, x):
- out = self.body(x)
- out = F.avg_pool2d(out, out.size()[3])
- out = out.view(out.size(0), -1)
- out = self.fc(out)
- return out
diff --git a/composer/models/timm/__init__.py b/composer/models/timm/__init__.py
deleted file mode 100644
index b7960b426a..0000000000
--- a/composer/models/timm/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A wrapper around `timm.create_model() `_
-used to create :class:`.ComposerClassifier`."""
-
-from composer.models.timm.model import composer_timm as composer_timm
-
-__all__ = ['composer_timm']
diff --git a/composer/models/timm/model.py b/composer/models/timm/model.py
deleted file mode 100644
index df0ffbca91..0000000000
--- a/composer/models/timm/model.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A wrapper around `timm.create_model() `_
-used to create :class:`.ComposerClassifier`."""
-
-import warnings
-from typing import Optional
-
-from composer.models.tasks import ComposerClassifier
-from composer.utils.import_helpers import MissingConditionalImportError
-
-__all__ = ['composer_timm']
-
-
-def composer_timm(model_name: str,
- pretrained: bool = False,
- num_classes: int = 1000,
- drop_rate: float = 0.0,
- drop_path_rate: Optional[float] = None,
- drop_block_rate: Optional[float] = None,
- global_pool: Optional[str] = None,
- bn_momentum: Optional[float] = None,
- bn_eps: Optional[float] = None) -> ComposerClassifier:
- """A wrapper around `timm.create_model() `_ used to create :class:`.ComposerClassifier`.
-
- Args:
- model_name (str): timm model name e.g: ``"resnet50"``. List of models can be found at
- `PyTorch Image Models `_.
- pretrained (bool, optional): Imagenet pretrained. Default: ``False``.
- num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``.
- drop_rate (float, optional): Dropout rate. Default: ``0.0``.
- drop_path_rate (float, optional): Drop path rate (model default if ``None``). Default: ``None``.
- drop_block_rate (float, optional): Drop block rate (model default if ``None``). Default: ``None``.
- global_pool (str, optional): Global pool type, one of (``"fast"``, ``"avg"``, ``"max"``, ``"avgmax"``, ``"avgmaxc"``). Model default if ``None``. Default: ``None``.
- bn_momentum (float, optional): BatchNorm momentum override (model default if ``None``). Default: ``None``.
- bn_eps (float, optional): BatchNorm epsilon override (model default if ``None``). Default: ``None``.
-
- Returns:
- ComposerModel: instance of :class:`.ComposerClassifier` with the specified TIMM model.
-
- Resnet18 Example:
-
- .. testcode::
-
- from composer.models import composer_timm
-
- model = composer_timm(model_name='resnet18') # creates a timm resnet18
- """
- warnings.warn(DeprecationWarning('composer_timm is deprecated and will be removed in v0.18'))
- try:
- import timm
- except ImportError as e:
- raise MissingConditionalImportError(extra_deps_group='timm', conda_package='timm>=0.5.4',
- conda_channel=None) from e
- model = timm.create_model( # type: ignore (third-party)
- model_name=model_name,
- pretrained=pretrained,
- num_classes=num_classes,
- drop_rate=drop_rate,
- drop_path_rate=drop_path_rate,
- drop_block_rate=drop_block_rate,
- global_pool=global_pool,
- bn_momentum=bn_momentum,
- bn_eps=bn_eps)
-
- composer_model = ComposerClassifier(module=model)
- return composer_model
diff --git a/composer/models/unet/README.md b/composer/models/unet/README.md
deleted file mode 100644
index 530832051b..0000000000
--- a/composer/models/unet/README.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# UNet
-[\[Example\]](#example) · [\[Architecture\]](#architecture) · [\[Default Training Hyperparameters\]](#default-training-hyperparameters) · [\[Attribution\]](#attribution) · [\[API Reference\]](#api-reference)
-
-`Vision` / `Segmentation`
-
-Unet is an architecture used for image segmentation.
-
-## Example
-
-
-
-```python
-from composer.models import UNet
-
-model = UNet()
-```
-
-## Architecture
-
-The figure below ([source](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet)) shows a 3D version of the UNet architecture. Quoting the [Nvidia Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet), "U-Net is composed of a contractive and an expanding path, that aims at building a bottleneck in its centremost part through a combination of convolution, instance norm and leaky relu operations. After this bottleneck, the image is reconstructed through a combination of convolutions and upsampling. Skip connections are added with the goal of helping the backward flow of gradients in order to improve training."
-
-![unet3d.png](https://storage.googleapis.com/docs.mosaicml.com/images/models/unet3d.png)
-
-
-There are 3 main differences between our implementation and the original NVDA DALI implementation.
-
-The first two refer to removing the NVDA DALI pipeline and replacing all transforms with torch implementations. We are omitting the Zoom transform and use a kernel size of 3 for the Gaussian Blur transform.
-
-While NVDA DLE examples reports the training accuracy using an average of 5 folds, we are using only 1 fold in the interest of faster iteration time, so all of our results are reported using fold 0 and 200 epochs.
-
-
-## Default Training Hyperparameters
-
-Below are the hyperparameters we used to train UNet on the [BraTS](http://braintumorsegmentation.org) image segmentation dataset.
-
-```yaml
-optimizer:
- radam:
- lr: 0.001
- betas: [0.9, 0.999]
- eps: 0.00000001
- weight_decay: 0.0001
-schedulers:
- - constant: {}
-train_batch_size: 64
-max_duration: 200ep
-```
-
-
-## Attribution
-
-The UNet model has been introduced in "U-Net: Convolutional Networks for Biomedical Image Segmentation" by Olaf Ronneberger, Philipp Fischer, Thomas Brox in [https://arxiv.org/abs/1505.04597](https://arxiv.org/abs/1505.04597).
-
-We are using the NVDA DLE examples version in
-[https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/nnUNet).
-
-## API Reference
-
-```{eval-rst}
-.. autoclass:: composer.models.unet.UNet
- :noindex:
-```
diff --git a/composer/models/unet/__init__.py b/composer/models/unet/__init__.py
deleted file mode 100644
index 6f26bd4625..0000000000
--- a/composer/models/unet/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""The Unet architecture used in image segmentation. The example we are using is for BRATS medical brain tumor dataset.
-
-See the :doc:`Model Card ` for more details.
-"""
-
-from composer.models.unet.unet import UNet as UNet
-
-__all__ = ['UNet']
-
-_task = 'Image Segmentation'
-_dataset = 'BRATS'
-_name = 'UNet'
-_quality = '69.1'
-_metric = 'Dice'
-_ttt = '21m'
-_hparams = 'unet.yaml'
diff --git a/composer/models/unet/_layers.py b/composer/models/unet/_layers.py
deleted file mode 100644
index 6fae767bf5..0000000000
--- a/composer/models/unet/_layers.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-## Code adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Segmentation/nnUNet/
-
-import numpy as np
-import torch
-import torch.nn as nn
-
-normalizations = {
- 'instancenorm3d': nn.InstanceNorm3d,
- 'instancenorm2d': nn.InstanceNorm2d,
- 'batchnorm3d': nn.BatchNorm3d,
- 'batchnorm2d': nn.BatchNorm2d,
-}
-
-convolutions = {
- 'Conv2d': nn.Conv2d,
- 'Conv3d': nn.Conv3d,
- 'ConvTranspose2d': nn.ConvTranspose2d,
- 'ConvTranspose3d': nn.ConvTranspose3d,
-}
-
-
-def get_norm(name, out_channels):
- if 'groupnorm' in name:
- return nn.GroupNorm(32, out_channels, affine=True)
- return normalizations[name](out_channels, affine=True)
-
-
-def get_conv(in_channels, out_channels, kernel_size, stride, dim, bias=False):
- conv = convolutions[f'Conv{dim}d']
- padding = get_padding(kernel_size, stride)
- return conv(in_channels, out_channels, kernel_size, stride, padding, bias=bias)
-
-
-def get_transp_conv(in_channels, out_channels, kernel_size, stride, dim):
- conv = convolutions[f'ConvTranspose{dim}d']
- padding = get_padding(kernel_size, stride)
- output_padding = get_output_padding(kernel_size, stride, padding)
- return conv(in_channels, out_channels, kernel_size, stride, padding, output_padding, bias=True)
-
-
-def get_padding(kernel_size, stride):
- #kernel_size_np = np.cast(np.ndarray, np.atleast_1d(kernel_size))
- #stride_np = np.cast(np.ndarray, np.atleast_1d(stride))
- kernel_size_np = np.atleast_1d(kernel_size)
- stride_np = np.atleast_1d(stride)
- padding_np = (kernel_size_np - stride_np + 1) / 2 # type: ignore
- padding = tuple(int(p) for p in padding_np) # type: ignore
- return padding if len(padding) > 1 else padding[0]
-
-
-def get_output_padding(kernel_size, stride, padding):
- kernel_size_np = np.atleast_1d(kernel_size)
- stride_np = np.atleast_1d(stride)
- padding_np = np.atleast_1d(padding)
- out_padding_np = 2 * padding_np + stride_np - kernel_size_np
- out_padding = tuple(int(p) for p in out_padding_np)
- return out_padding if len(out_padding) > 1 else out_padding[0]
-
-
-class ConvLayer(nn.Module):
-
- def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
- super(ConvLayer, self).__init__()
- self.conv = get_conv(in_channels, out_channels, kernel_size, stride, kwargs['dim'])
- self.norm = get_norm(kwargs['norm'], out_channels)
- self.lrelu = nn.LeakyReLU(negative_slope=kwargs['negative_slope'], inplace=True)
-
- def forward(self, data):
- out = self.conv(data)
- out = self.norm(out)
- out = self.lrelu(out)
- return out
-
-
-class ConvBlock(nn.Module):
-
- def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
- super(ConvBlock, self).__init__()
- self.conv1 = ConvLayer(in_channels, out_channels, kernel_size, stride, **kwargs)
- self.conv2 = ConvLayer(out_channels, out_channels, kernel_size, 1, **kwargs)
-
- def forward(self, input_data):
- out = self.conv1(input_data)
- out = self.conv2(out)
- return out
-
-
-class ResidBlock(nn.Module):
-
- def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
- super(ResidBlock, self).__init__()
- self.conv1 = ConvLayer(in_channels, out_channels, kernel_size, stride, **kwargs)
- self.conv2 = get_conv(out_channels, out_channels, kernel_size, 1, kwargs['dim'])
- self.norm = get_norm(kwargs['norm'], out_channels)
- self.lrelu = nn.LeakyReLU(negative_slope=kwargs['negative_slope'], inplace=True)
- self.downsample = None
- if max(stride) > 1 or in_channels != out_channels: # type: ignore
- self.downsample = get_conv(in_channels, out_channels, kernel_size, stride, kwargs['dim'])
- self.norm_res = get_norm(kwargs['norm'], out_channels)
-
- def forward(self, input_data):
- residual = input_data
- out = self.conv1(input_data)
- out = self.conv2(out)
- out = self.norm(out)
- if self.downsample is not None:
- residual = self.downsample(residual)
- residual = self.norm_res(residual)
- out = self.lrelu(out + residual)
- return out
-
-
-class UpsampleBlock(nn.Module):
-
- def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
- super(UpsampleBlock, self).__init__()
- self.transp_conv = get_transp_conv(in_channels, out_channels, stride, stride, kwargs['dim'])
- self.conv_block = ConvBlock(2 * out_channels, out_channels, kernel_size, 1, **kwargs)
-
- def forward(self, input_data, skip_data):
- out = self.transp_conv(input_data)
- out = torch.cat((out, skip_data), dim=1)
- out = self.conv_block(out)
- return out
-
-
-class OutputBlock(nn.Module):
-
- def __init__(self, in_channels, out_channels, dim):
- super(OutputBlock, self).__init__()
- self.conv = get_conv(in_channels, out_channels, kernel_size=1, stride=1, dim=dim, bias=True)
- nn.init.constant_(self.conv.bias, 0)
-
- def forward(self, input_data):
- return self.conv(input_data)
diff --git a/composer/models/unet/model.py b/composer/models/unet/model.py
deleted file mode 100644
index 08c49ff57c..0000000000
--- a/composer/models/unet/model.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""The Unet architecture used in image segmentation. The example we are using is for BRATS medical brain tumor dataset.
-
-See the :doc:`Model Card ` for more details.
-"""
-
-import warnings
-
-import torch.nn as nn
-
-from composer.models.unet._layers import ConvBlock, OutputBlock, ResidBlock, UpsampleBlock
-
-__all__ = ['UNet']
-
-
-class UNet(nn.Module):
- """Unet Architecture adapted from NVidia `Deep Learning Examples`_.
-
- .. _Deep Learning Examples: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Segmentation/nnUNet/
-
- Args:
- in_channels (int): Number of input channels.
- n_class (int): Number of output layers.
- kernels (list): Conv layer kernel sizes.
- strides (list): Conv layer strides.
- normalization_layer (str): Normalization layer type, one of (``"batch"``, ``"instance"``).
- negative_slope (float): Leaky relu negative slope.
- residual (bool): Use residual connections.
- dimension (int): Filter dimensions.
- """
-
- def __init__(
- self,
- in_channels,
- n_class,
- kernels,
- strides,
- normalization_layer,
- negative_slope,
- residual,
- dimension,
- ):
- warnings.warn(DeprecationWarning('UNet is deprecated and will be removed in v0.18'))
- super(UNet, self).__init__()
- self.dim = dimension
- self.n_class = n_class
- self.residual = residual
- self.negative_slope = negative_slope
- self.norm = normalization_layer + f'norm{dimension}d'
- self.filters = [min(2**(5 + i), 320 if dimension == 3 else 512) for i in range(len(strides))]
-
- down_block = ResidBlock if self.residual else ConvBlock
- self.input_block = self.get_conv_block(
- conv_block=down_block,
- in_channels=in_channels,
- out_channels=self.filters[0],
- kernel_size=kernels[0],
- stride=strides[0],
- )
- self.downsamples = self.get_module_list(
- conv_block=down_block,
- in_channels=self.filters[:-1],
- out_channels=self.filters[1:],
- kernels=kernels[1:-1],
- strides=strides[1:-1],
- )
- self.bottleneck = self.get_conv_block(
- conv_block=down_block,
- in_channels=self.filters[-2],
- out_channels=self.filters[-1],
- kernel_size=kernels[-1],
- stride=strides[-1],
- )
- self.upsamples = self.get_module_list(
- conv_block=UpsampleBlock,
- in_channels=self.filters[1:][::-1],
- out_channels=self.filters[:-1][::-1],
- kernels=kernels[1:][::-1],
- strides=strides[1:][::-1],
- )
- self.output_block = self.get_output_block(decoder_level=0)
- self.apply(self.initialize_weights)
- self.n_layers = len(self.upsamples) - 1
-
- def forward(self, input_data):
- out = self.input_block(input_data)
- encoder_outputs = [out]
- for downsample in self.downsamples:
- out = downsample(out)
- encoder_outputs.append(out)
- out = self.bottleneck(out)
- for idx, upsample in enumerate(self.upsamples):
- out = upsample(out, encoder_outputs[self.n_layers - idx])
- out = self.output_block(out)
- return out
-
- def get_conv_block(self, conv_block, in_channels, out_channels, kernel_size, stride):
- return conv_block(
- dim=self.dim,
- stride=stride,
- norm=self.norm,
- kernel_size=kernel_size,
- in_channels=in_channels,
- out_channels=out_channels,
- negative_slope=self.negative_slope,
- )
-
- def get_output_block(self, decoder_level):
- return OutputBlock(in_channels=self.filters[decoder_level], out_channels=self.n_class, dim=self.dim)
-
- def get_module_list(self, in_channels, out_channels, kernels, strides, conv_block):
- layers = []
- for in_channel, out_channel, kernel, stride in zip(in_channels, out_channels, kernels, strides):
- conv_layer = self.get_conv_block(conv_block, in_channel, out_channel, kernel, stride)
- layers.append(conv_layer)
- return nn.ModuleList(layers)
-
- def initialize_weights(self, module):
- name = module.__class__.__name__.lower()
- if name in ['conv2d']:
- nn.init.kaiming_normal_(module.weight, a=self.negative_slope)
diff --git a/composer/models/unet/unet.py b/composer/models/unet/unet.py
deleted file mode 100644
index dde555bb4f..0000000000
--- a/composer/models/unet/unet.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""A U-Net model extending :class:`.ComposerModel`."""
-
-import logging
-import warnings
-from typing import Any, Dict, Optional, Sequence, Union
-
-import torch
-import torch.nn as nn
-from torchmetrics import Metric
-
-from composer.metrics.metrics import Dice
-from composer.models.base import ComposerModel
-from composer.models.unet.model import UNet as UNetModel
-from composer.utils.import_helpers import MissingConditionalImportError
-
-log = logging.getLogger(__name__)
-
-__all__ = ['UNet']
-
-
-class UNet(ComposerModel):
- """A U-Net model extending :class:`.ComposerModel`.
-
- See U-Net: Convolutional Networks for Biomedical Image Segmentation (`Ronneberger et al, 2015`_)
- on the U-Net architecture.
-
- Args:
- num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``3``.
-
- .. _Ronneberger et al, 2015: https://arxiv.org/abs/1505.04597
- """
-
- def __init__(self, num_classes: int = 3) -> None:
- warnings.warn(DeprecationWarning('UNet is deprecated and will be removed in v0.18'))
-
- super().__init__()
- try:
- from monai.losses import DiceLoss
- except ImportError as e:
- raise MissingConditionalImportError(extra_deps_group='unet',
- conda_package='monai',
- conda_channel='conda-forge') from e
-
- self.module = self.build_nnunet()
-
- self.dice = Dice(num_classes=num_classes)
- self.dloss = DiceLoss(include_background=False, softmax=True, to_onehot_y=True, batch=True)
- self.closs = nn.CrossEntropyLoss()
-
- def loss(self, outputs: Any, batch: Any, *args, **kwargs) -> Union[torch.Tensor, Sequence[torch.Tensor]]:
- _, y = batch
- y = y.squeeze(1) # type: ignore
- loss = self.dloss(outputs, y)
- loss += self.closs(outputs, y[:, 0].long())
- return loss
-
- @staticmethod
- def metric_mean(name, outputs):
- return torch.stack([out[name] for out in outputs]).mean(dim=0)
-
- def get_metrics(self, is_train: bool = False) -> Dict[str, Metric]:
- return {'Dice': self.dice}
-
- def forward(self, batch: Any) -> torch.Tensor:
- x, _ = batch
- x = x.squeeze(1) # type: ignore
- logits = self.module(x)
- return logits
-
- def inference2d(self, image):
- """Runs inference on a 3D image, by passing each depth slice through the model."""
- batch_modulo = image.shape[2] % 64
- if batch_modulo != 0:
- batch_pad = 64 - batch_modulo
- image = nn.ConstantPad3d((0, 0, 0, 0, batch_pad, 0), 0)(image)
-
- image = torch.transpose(image.squeeze(0), 0, 1)
- preds_shape = (image.shape[0], 4, *image.shape[2:])
- preds = torch.zeros(preds_shape, dtype=image.dtype, device=image.device)
- for start in range(0, image.shape[0] - 64 + 1, 64):
- end = start + 64
- with torch.no_grad():
- pred = self.module(image[start:end])
- preds[start:end] = pred.data
- if batch_modulo != 0:
- preds = preds[batch_pad:] # type: ignore
- return torch.transpose(preds, 0, 1).unsqueeze(0)
-
- def eval_forward(self, batch: Any, outputs: Optional[Any] = None):
- assert self.training is False, 'For validation, model must be in eval mode'
- image, _ = batch
- pred = self.inference2d(image)
- return pred
-
- def build_nnunet(self) -> torch.nn.Module:
- kernels = [[3, 3]] * 6
- strides = [[1, 1]] + [[2, 2]] * 5
- model = UNetModel(in_channels=4,
- n_class=4,
- kernels=kernels,
- strides=strides,
- dimension=2,
- residual=True,
- normalization_layer='batch',
- negative_slope=0.01)
-
- return model
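The `inference2d` method above front-pads the depth axis to a multiple of its 64-slice chunk size before running the volume through the 2D network. The padding arithmetic in isolation (shapes chosen here purely for illustration):

```python
import torch
import torch.nn as nn

image = torch.randn(1, 4, 150, 32, 32)  # (batch, channels, depth, H, W)
batch_modulo = image.shape[2] % 64      # 150 % 64 == 22
if batch_modulo != 0:
    batch_pad = 64 - batch_modulo       # 42 zero slices, prepended to depth
    # ConstantPad3d takes (W_left, W_right, H_top, H_bottom, D_front, D_back)
    image = nn.ConstantPad3d((0, 0, 0, 0, batch_pad, 0), 0)(image)
print(image.shape)  # torch.Size([1, 4, 192, 32, 32]) -- now divisible by 64
```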
diff --git a/composer/models/vit_small_patch16/__init__.py b/composer/models/vit_small_patch16/__init__.py
deleted file mode 100644
index 9992807ade..0000000000
--- a/composer/models/vit_small_patch16/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""ViT Small Patch 16 for image classification."""
-
-from composer.models.vit_small_patch16.model import vit_small_patch16 as vit_small_patch16
-
-__all__ = ['vit_small_patch16']
-
-_task = 'Image Classification'
-_dataset = 'ImageNet'
-_name = 'ViT-Small-Patch16'
-_quality = '74.52'
-_metric = 'Top-1 Accuracy'
-_ttt = '1d 59m'
-_hparams = 'vit_small_patch16.yaml'
diff --git a/composer/models/vit_small_patch16/model.py b/composer/models/vit_small_patch16/model.py
deleted file mode 100644
index dacb9db56a..0000000000
--- a/composer/models/vit_small_patch16/model.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Implements ViT-S/16 as a :class:`.ComposerClassifier`."""
-
-import warnings
-
-from composer.models.tasks import ComposerClassifier
-
-__all__ = ['vit_small_patch16']
-
-
-def vit_small_patch16(num_classes: int = 1000,
- image_size: int = 224,
- channels: int = 3,
- dropout: float = 0.0,
- embedding_dropout: float = 0.0):
- """Helper function to create a :class:`.ComposerClassifier` using a ViT-S/16 model.
-
-    See `Training data-efficient image transformers & distillation through attention <https://arxiv.org/abs/2012.12877>`_
- (Touvron et al, 2021) for details on ViT-S/16.
-
- Args:
- num_classes (int, optional): number of classes for the model. Default: ``1000``.
- image_size (int, optional): input image size. If you have rectangular images, make sure your image
- size is the maximum of the width and height. Default: ``224``.
- channels (int, optional): number of image channels. Default: ``3``.
- dropout (float, optional): 0.0 - 1.0 dropout rate. Default: ``0``.
- embedding_dropout (float, optional): 0.0 - 1.0 embedding dropout rate. Default: ``0``.
-
- Returns:
- ComposerModel: instance of :class:`.ComposerClassifier` with a ViT-S/16 model.
- """
- warnings.warn(DeprecationWarning('vit_small_patch16 is deprecated and will be removed in v0.18'))
-
- from vit_pytorch import ViT
- model = ViT(
- image_size=image_size,
- channels=channels,
- num_classes=num_classes,
- dim=384, # embed dim/width
- patch_size=16,
- depth=12, # layers
- heads=6,
- mlp_dim=1536,
- dropout=dropout,
- emb_dropout=embedding_dropout)
-
- composer_model = ComposerClassifier(module=model)
- return composer_model
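With this helper deleted, an equivalent model can still be assembled by hand. A sketch under the assumption that the optional `vit-pytorch` package is installed; the hyperparameters are copied from the removed function (ViT-S/16: width 384, depth 12, 6 heads, MLP dim 1536), but this is not an official replacement API:

```python
from vit_pytorch import ViT
from composer.models.tasks import ComposerClassifier

model = ViT(image_size=224, channels=3, num_classes=1000,
            dim=384, patch_size=16, depth=12, heads=6, mlp_dim=1536,
            dropout=0.0, emb_dropout=0.0)
composer_model = ComposerClassifier(module=model)
```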
diff --git a/composer/optim/decoupled_weight_decay.py b/composer/optim/decoupled_weight_decay.py
index 35a235cce7..2d20aad286 100644
--- a/composer/optim/decoupled_weight_decay.py
+++ b/composer/optim/decoupled_weight_decay.py
@@ -11,7 +11,7 @@
import logging
import math
-from typing import Iterable, List, Tuple, Union
+from typing import Iterable, List, Optional, Tuple, Union
import torch
from torch.optim import SGD, AdamW
@@ -70,8 +70,9 @@ def __init__(
group['initial_lr'] = group['lr']
@staticmethod
- def sgdw(params: List[torch.Tensor], d_p_list: List[torch.Tensor], momentum_buffer_list: List[torch.Tensor], *,
- weight_decay: float, momentum: float, lr: float, initial_lr: float, dampening: float, nesterov: bool):
+ def sgdw(params: List[torch.Tensor], d_p_list: List[torch.Tensor],
+ momentum_buffer_list: List[Optional[torch.Tensor]], *, weight_decay: float, momentum: float, lr: float,
+ initial_lr: float, dampening: float, nesterov: bool):
r"""Functional API that performs SGDW algorithm computation.
Args:
@@ -109,7 +110,7 @@ def sgdw(params: List[torch.Tensor], d_p_list: List[torch.Tensor], momentum_buff
param.add_(d_p, alpha=-lr)
- @torch.no_grad()
+ @torch.no_grad() # pyright: ignore[reportUntypedFunctionDecorator]
def step(self, closure=None):
"""Performs a single optimization step.
@@ -263,7 +264,7 @@ def adamw(params: List[torch.Tensor], grads: List[torch.Tensor], exp_avgs: List[
param.addcdiv_(exp_avg, denom, value=-step_size)
- @torch.no_grad()
+ @torch.no_grad() # pyright: ignore[reportUntypedFunctionDecorator]
def step(self, closure=None):
"""Performs a single optimization step.
diff --git a/composer/optim/scheduler.py b/composer/optim/scheduler.py
index 294d26ddb4..d8c6c82c6d 100644
--- a/composer/optim/scheduler.py
+++ b/composer/optim/scheduler.py
@@ -18,9 +18,9 @@
import warnings
from typing import TYPE_CHECKING, List, Union
-from torch.optim.lr_scheduler import LambdaLR
+from torch.optim.lr_scheduler import LambdaLR, LRScheduler
-from composer.core import PyTorchScheduler, State, Time, TimeUnit
+from composer.core import State, Time, TimeUnit
if TYPE_CHECKING:
from typing import Protocol
@@ -31,10 +31,21 @@
log = logging.getLogger(__name__)
__all__ = [
- 'ComposerScheduler', 'compile_composer_scheduler', 'StepScheduler', 'MultiStepScheduler', 'ConstantScheduler',
- 'LinearScheduler', 'ExponentialScheduler', 'CosineAnnealingScheduler', 'CosineAnnealingWarmRestartsScheduler',
- 'PolynomialScheduler', 'MultiStepWithWarmupScheduler', 'ConstantWithWarmupScheduler', 'LinearWithWarmupScheduler',
- 'CosineAnnealingWithWarmupScheduler', 'PolynomialWithWarmupScheduler'
+ 'ComposerScheduler',
+ 'compile_composer_scheduler',
+ 'StepScheduler',
+ 'MultiStepScheduler',
+ 'ConstantScheduler',
+ 'LinearScheduler',
+ 'ExponentialScheduler',
+ 'CosineAnnealingScheduler',
+ 'CosineAnnealingWarmRestartsScheduler',
+ 'PolynomialScheduler',
+ 'MultiStepWithWarmupScheduler',
+ 'ConstantWithWarmupScheduler',
+ 'LinearWithWarmupScheduler',
+ 'CosineAnnealingWithWarmupScheduler',
+ 'PolynomialWithWarmupScheduler',
]
@@ -147,7 +158,7 @@ def _convert_time(time: Union[str, Time[int], Time[float]], state: State, ssr: f
return Time(value=int(time.value * ssr), unit=time.unit)
-def compile_composer_scheduler(scheduler: ComposerScheduler, state: State, ssr: float = 1.0) -> PyTorchScheduler:
+def compile_composer_scheduler(scheduler: ComposerScheduler, state: State, ssr: float = 1.0) -> LRScheduler:
"""Converts a stateless scheduler into a PyTorch scheduler object.
While the resulting scheduler provides a ``.step()`` interface similar to other PyTorch schedulers, the scheduler is
@@ -160,7 +171,7 @@ def compile_composer_scheduler(scheduler: ComposerScheduler, state: State, ssr:
state (State): The Composer Trainer's state.
Returns:
- compiled_scheduler (PyTorchScheduler): The scheduler, in a form compatible with PyTorch scheduler interfaces.
+ compiled_scheduler (LRScheduler): The scheduler, in a form compatible with PyTorch scheduler interfaces.
"""
optimizers = state.optimizers
if len(optimizers) != 1:
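A `ComposerScheduler` is a stateless callable that maps the trainer `State` (and an optional scale-schedule ratio) to an LR multiplier, and `compile_composer_scheduler` wraps it in a `LambdaLR`. A hand-rolled analogue of that compilation, assuming step-based time and a single optimizer:

```python
import torch
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(2, 2)
opt = torch.optim.SGD(model.parameters(), lr=1.0)

def halve_every_10_steps(step: int) -> float:
    # Plays the role of scheduler(state, ssr): returns a multiplier on the base LR.
    return 0.5 ** (step // 10)

sched = LambdaLR(opt, lr_lambda=halve_every_10_steps)
for _ in range(25):
    opt.step()
    sched.step()
print(opt.param_groups[0]['lr'])  # 0.25
```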
diff --git a/composer/profiler/profiler.py b/composer/profiler/profiler.py
index c88c1f0912..a3a7127e58 100644
--- a/composer/profiler/profiler.py
+++ b/composer/profiler/profiler.py
@@ -9,6 +9,8 @@
import pathlib
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Tuple, Union
+from composer.core import Callback
+from composer.loggers import Logger
from composer.profiler.json_trace_handler import JSONTraceHandler
from composer.profiler.marker import Marker
from composer.profiler.profiler_action import ProfilerAction
@@ -18,14 +20,14 @@
from composer.utils import ensure_tuple, parse_uri
if TYPE_CHECKING:
- from composer.core import Callback, State
+ from composer.core import State
__all__ = ['Profiler']
log = logging.getLogger(__name__)
-class Profiler:
+class Profiler(Callback):
"""Composer Profiler.
See the :doc:`Profiling Guide ` for additional information.
@@ -118,6 +120,8 @@ def __init__(
self.schedule = schedule
self.state = None
self._callbacks: List[Callback] = []
+ # Used to count skip_first starting from resumption timestamp
+ self.resumption_batch_idx: int = 0
self.remote_filenames: List[str] = []
# First, add each remote file name to self.remote_filenames to create RemoteUploaderDownloader logger in trainer. [s3://bucket/path/to/file]
# Then modify remote file name to be a local path to pass into torch_profiler and system_profiler. e.g: path/to/file
@@ -185,6 +189,7 @@ def bind_to_state(
state (State): The training state.
"""
self.state = state
+ self.state.callbacks.append(self)
self.state.callbacks.extend(self._callbacks)
self.state.callbacks.extend(self._trace_handlers)
@@ -289,3 +294,7 @@ def should_record(state: State) -> bool:
)
self._names_to_markers[name].categories = categories
return self._names_to_markers[name]
+
+ def after_load(self, state: State, logger: Logger) -> None:
+ del logger
+ self.resumption_batch_idx = int(state.timestamp.batch_in_epoch)
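Because `Profiler` is now a `Callback` and appends itself to `state.callbacks` in `bind_to_state`, the `after_load` hook fires once a checkpoint is restored. A toy callback mirroring just that mechanism:

```python
from composer.core import Callback, State
from composer.loggers import Logger

class ResumptionTracker(Callback):
    """Toy callback mirroring what Profiler.after_load does above."""

    def __init__(self) -> None:
        self.resumption_batch_idx: int = 0

    def after_load(self, state: State, logger: Logger) -> None:
        # Non-zero only when resuming mid-epoch from a checkpoint.
        self.resumption_batch_idx = int(state.timestamp.batch_in_epoch)
```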
diff --git a/composer/profiler/profiler_schedule.py b/composer/profiler/profiler_schedule.py
index 02b72b8a50..08d2549c2b 100644
--- a/composer/profiler/profiler_schedule.py
+++ b/composer/profiler/profiler_schedule.py
@@ -23,10 +23,11 @@ def cyclic_schedule(
This function returns a schedule function that uses a cyclic profiling window. The resulting function can be
passed as the ``prof_schedule`` argument to the :class:`.Trainer`.
- The cyclic window skips the first ``skip_first`` batches in every epoch. Then, it performs a cycle of
- skipping ``wait`` batches, warming up for ``warmup`` batches, and recording ``active`` batches.
- It repeats this cycle up to ``repeat`` times per epoch (or for the entire epoch, if ``repeat`` is 0).
- This logic repeats every epoch.
+ The cyclic window skips the first ``skip_first`` + ``resumption_batch_idx`` batches in every epoch.
+ ``resumption_batch_idx`` is accessed from state.profiler. It is the ``state.timestamp.batch_in_epoch``
+ when resuming training. Then, it performs a cycle of skipping ``wait`` batches, warming up for ``warmup``
+ batches, and recording ``active`` batches. It repeats this cycle up to ``repeat`` times per epoch (or
+ for the entire epoch, if ``repeat`` is 0). This logic repeats every epoch.
Args:
skip_first (int, optional): Number of batches to skip profiling at epoch start. Defaults to ``0``.
@@ -46,12 +47,16 @@ def schedule(state: State):
        # do wait, then warmup, then active, up to repeat times per cycle
cycle_len = wait + warmup + active
batch_idx = int(state.timestamp.batch_in_epoch)
- if batch_idx < skip_first:
+ if state.profiler is not None:
+ skip_first_after_resumption = skip_first + state.profiler.resumption_batch_idx
+ else:
+ skip_first_after_resumption = skip_first
+ if batch_idx < skip_first_after_resumption:
return ProfilerAction.SKIP
- if repeat != 0 and batch_idx >= cycle_len * repeat + skip_first:
+ if repeat != 0 and batch_idx >= cycle_len * repeat + skip_first_after_resumption:
# exhausted the repeat
return ProfilerAction.SKIP
- position_in_cycle = (batch_idx - skip_first) % cycle_len
+ position_in_cycle = (batch_idx - skip_first_after_resumption) % cycle_len
if position_in_cycle < wait:
return ProfilerAction.SKIP
if position_in_cycle < wait + warmup:
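A worked example of the cycle arithmetic above, with `skip_first=2`, `wait=1`, `warmup=1`, `active=2`, `repeat=1`, and a `resumption_batch_idx` of 3 (training resumed at batch 3 of the epoch). This simplified pure-Python sketch returns strings rather than `ProfilerAction` values:

```python
skip_first, wait, warmup, active, repeat = 2, 1, 1, 2, 1
resumption_batch_idx = 3
cycle_len = wait + warmup + active                               # 4
skip_first_after_resumption = skip_first + resumption_batch_idx  # 5

def action(batch_idx: int) -> str:
    if batch_idx < skip_first_after_resumption:
        return 'SKIP'
    if repeat != 0 and batch_idx >= cycle_len * repeat + skip_first_after_resumption:
        return 'SKIP'  # repeats exhausted for this epoch
    position_in_cycle = (batch_idx - skip_first_after_resumption) % cycle_len
    if position_in_cycle < wait:
        return 'SKIP'
    if position_in_cycle < wait + warmup:
        return 'WARMUP'
    return 'ACTIVE'

print([action(i) for i in range(10)])
# ['SKIP', 'SKIP', 'SKIP', 'SKIP', 'SKIP', 'SKIP', 'WARMUP', 'ACTIVE', 'ACTIVE', 'SKIP']
```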
diff --git a/composer/profiler/system_profiler.py b/composer/profiler/system_profiler.py
index 3f9c928c23..3bc19fb3ca 100644
--- a/composer/profiler/system_profiler.py
+++ b/composer/profiler/system_profiler.py
@@ -98,7 +98,9 @@ def _stats_thread(self, profiler: Profiler):
})
if self.profile_disk:
- disk_io_counters = cast(Dict[str, psutil._common.sdiskio], psutil.disk_io_counters(perdisk=True))
+ disk_io_counters = cast(
+ Dict[str, psutil._common.sdiskio], # type: ignore
+ psutil.disk_io_counters(perdisk=True))
for disk_name, disk_stats in disk_io_counters.items():
for field_name in ('read_count', 'write_count', 'read_bytes', 'write_bytes', 'read_time',
'write_time', 'busy_time'):
@@ -106,7 +108,9 @@ def _stats_thread(self, profiler: Profiler):
categories=['disk']).counter({'field_name': getattr(disk_stats, field_name)})
if self.profile_net:
- net_io_counters = cast(Dict[str, psutil._common.snetio], psutil.net_io_counters(pernic=True))
+ net_io_counters = cast(
+ Dict[str, psutil._common.snetio], # type: ignore
+ psutil.net_io_counters(pernic=True))
for nic, nic_stats in net_io_counters.items():
profiler.marker(f'network/{nic}/kb_sent',
categories=['net']).counter({'kb_sent': nic_stats.bytes_sent / 2**3})
diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py
index a8c51bb27b..0f8f1f4fb0 100644
--- a/composer/profiler/torch_profiler.py
+++ b/composer/profiler/torch_profiler.py
@@ -259,24 +259,23 @@ def handler_fn(prof: torch.profiler.profiler.profile):
timestamp = state.timestamp
log.info(f'PyTorch Chrome trace profiler enabled: {self.filename if self.filename else False}')
- if self.filename is not None:
- trace_file_name = os.path.join(
- folder_name,
- format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=timestamp),
- )
- trace_file_dirname = os.path.dirname(trace_file_name)
- if trace_file_dirname:
- os.makedirs(trace_file_dirname, exist_ok=True)
- prof.export_chrome_trace(trace_file_name)
- state.profiler.record_chrome_json_trace_file(trace_file_name)
- if self.remote_file_name is not None:
- trace_remote_file_name = format_name_with_dist_and_time(self.remote_file_name,
- run_name=state.run_name,
- timestamp=timestamp)
- trace_remote_file_name = trace_remote_file_name.lstrip('/')
- logger.upload_file(remote_file_name=trace_remote_file_name,
- file_path=trace_file_name,
- overwrite=self.overwrite)
+ trace_file_name = os.path.join(
+ folder_name,
+ format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=timestamp),
+ )
+ trace_file_dirname = os.path.dirname(trace_file_name)
+ if trace_file_dirname:
+ os.makedirs(trace_file_dirname, exist_ok=True)
+ prof.export_chrome_trace(trace_file_name)
+ state.profiler.record_chrome_json_trace_file(trace_file_name)
+ if self.remote_file_name is not None:
+ trace_remote_file_name = format_name_with_dist_and_time(self.remote_file_name,
+ run_name=state.run_name,
+ timestamp=timestamp)
+ trace_remote_file_name = trace_remote_file_name.lstrip('/')
+ logger.upload_file(remote_file_name=trace_remote_file_name,
+ file_path=trace_file_name,
+ overwrite=self.overwrite)
log.info(
f'PyTorch memory timeline profiler enabled: {self.memory_filename if self.memory_filename else False}')
diff --git a/composer/trainer/_deepspeed.py b/composer/trainer/_deepspeed.py
index a3ef6e0ef2..0217770a23 100644
--- a/composer/trainer/_deepspeed.py
+++ b/composer/trainer/_deepspeed.py
@@ -161,7 +161,7 @@ def _fix_batch_precision_for_deepspeed(batch: Batch, precision: Precision) -> Ba
        Batch: The batch with its precision adjusted to the specified precision.
"""
if precision == Precision.AMP_FP16:
- return map_collection(batch, _convert_fp32_tensor_to_fp16) # type: ignore
+ return map_collection(batch, _convert_fp32_tensor_to_fp16)
elif precision == Precision.AMP_BF16:
- return map_collection(batch, _convert_fp32_tensor_to_bf16) # type: ignore
+ return map_collection(batch, _convert_fp32_tensor_to_bf16)
return batch
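For context, `map_collection` applies a function to every leaf of a batch (tensor, tuple, list, or dict). A sketch of the fp16 path, with the converter written out here as an assumption matching the helper's name:

```python
import torch
from composer.utils import map_collection

def _convert_fp32_tensor_to_fp16(t: torch.Tensor) -> torch.Tensor:
    # Only fp32 tensors are downcast; integer/bool tensors pass through.
    return t.half() if t.dtype == torch.float32 else t

batch = {'x': torch.randn(2, 3), 'y': torch.zeros(2, dtype=torch.long)}
out = map_collection(batch, _convert_fp32_tensor_to_fp16)
print(out['x'].dtype, out['y'].dtype)  # torch.float16 torch.int64
```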
diff --git a/composer/trainer/_scale_schedule.py b/composer/trainer/_scale_schedule.py
index 5cdb37da60..cc94caf7c4 100644
--- a/composer/trainer/_scale_schedule.py
+++ b/composer/trainer/_scale_schedule.py
@@ -3,12 +3,11 @@
from collections import Counter
-from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ExponentialLR, MultiStepLR, StepLR
+from torch.optim.lr_scheduler import (CosineAnnealingLR, CosineAnnealingWarmRestarts, ExponentialLR, LRScheduler,
+ MultiStepLR, StepLR)
-from composer.core import PyTorchScheduler
-
-def scale_pytorch_scheduler(scheduler: PyTorchScheduler, ssr: float):
+def scale_pytorch_scheduler(scheduler: LRScheduler, ssr: float):
"""Makes a learning rate schedule take a different number of epochs.
Training for less time is a strong baseline approach to speeding up
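The effect of a scale-schedule ratio on a stock PyTorch scheduler can be shown by hand: with `ssr=0.5`, a `MultiStepLR`'s milestones land at half their original step counts. A sketch of the idea, scaling the milestones up front rather than mutating an existing scheduler in place as the helper does:

```python
import torch
from torch.optim.lr_scheduler import MultiStepLR

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
ssr = 0.5
milestones = [30, 60, 90]
scaled = [int(round(m * ssr)) for m in milestones]
sched = MultiStepLR(opt, milestones=scaled, gamma=0.1)
print(scaled)  # [15, 30, 45]
```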
diff --git a/composer/trainer/dist_strategy.py b/composer/trainer/dist_strategy.py
index f2c8c615b4..8b76f8b1ba 100644
--- a/composer/trainer/dist_strategy.py
+++ b/composer/trainer/dist_strategy.py
@@ -11,6 +11,10 @@
import torch
from packaging import version
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (CheckpointImpl, apply_activation_checkpointing,
+ checkpoint_wrapper)
+from torch.distributed.fsdp import FullyShardedDataParallel
+from torch.distributed.fsdp._common_utils import clean_tensor_name
from torch.nn.parallel import DistributedDataParallel
from torchmetrics import Metric, MetricCollection
@@ -20,7 +24,7 @@
from composer.trainer.mosaic_fsdp import patch_pytorch
from composer.trainer.mosaic_fsdp_utils import (BACKWARD_PREFETCH_MAP, SHARDING_MAP, _set_custom_fsdp_module_kwargs,
get_cpu_offload, get_mixed_precision)
-from composer.utils import StringEnum, dist, ensure_tuple, using_torch_2
+from composer.utils import StringEnum, dist, ensure_tuple
__all__ = ['DDPSyncStrategy', 'ddp_sync_context', 'prepare_ddp_module', 'prepare_fsdp_module']
@@ -178,13 +182,7 @@ def _recreate_fsdp_param_groups_from_unwrapped_opt_info(
Returns a list of param groups, referencing the fsdp parameters
"""
- is_torch_2_0 = using_torch_2()
- if not is_torch_2_0:
- raise RuntimeError('Helper function is only supported in torch 2.0')
-
- from torch.distributed.fsdp._common_utils import clean_tensor_name
-
- # initialize an empty list of parameters for each optimizer group
+ # Initialize an empty list of parameters for each optimizer group
for group_num in group_num_to_optimizer_info.keys():
group_num_to_optimizer_info[group_num]['params'] = []
@@ -217,16 +215,6 @@ def prepare_fsdp_module(
device (Device): The device being used by the Trainer.
auto_microbatching (bool, optional): Whether or not auto microbatching is enabled.
"""
- if version.parse(torch.__version__) < version.parse('1.13.0'):
- raise RuntimeError('To use FSDP with Composer, you must use torch>=1.13.0.')
- is_torch_2_0 = using_torch_2()
- from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (CheckpointImpl,
- apply_activation_checkpointing,
- checkpoint_wrapper)
- from torch.distributed.fsdp import FullyShardedDataParallel
- if not is_torch_2_0:
- from torch.distributed.fsdp.flatten_params_wrapper import FlattenParamsWrapper
-
patch_pytorch()
set_fsdp_default(fsdp_config)
@@ -243,10 +231,6 @@ def prepare_fsdp_module(
'gpu and some ranks are on meta. Either keep all ranks on the same '
"device or set fsdp_config['sync_module_states'] = True. Otherwise, "
'some weights may be randomly initialized when loading a checkpoint.')
- if fsdp_config['sharding_strategy'] in ('HYBRID_SHARD', '_HYBRID_SHARD_ZERO2'):
- raise ValueError('HSDP (HYBRID_SHARD or _HYBRID_SHARD_ZERO2) requires '
- 'fsdp_config["sync_module_states"] = True or different replicas will '
- 'have different weights.')
# Check if other ranks OOMed after forward/backward pass when using auto microbatching. This
# may happen when close to memory limit or with uneven memory usage across ranks. Since we
@@ -265,14 +249,13 @@ def sync_hook(*args):
raise RuntimeError('CUDA out of memory encountered on a different rank')
kwargs = {}
- if is_torch_2_0:
- # Support of new parameter `use_orig_params` in PyTorch 2.0 or higher.
- # Setting this to `True` has FSDP use `module`'s original parameters via method
- # `nn.Module.named_parameters` instead of FSDP's internal class `FlatParameter`. However,
- # setting it to `False` exposes FSDP's internal class `FlatParameter` via method
- # `nn.Module.named_parameters`.
- # Setting it to `True` is mandatory when using `torch.compile()`.
- kwargs['use_orig_params'] = fsdp_config['use_orig_params']
+ if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.2.0'):
+ if 'device_mesh' in fsdp_config:
+ from torch.distributed._tensor import init_device_mesh
+ kwargs['device_mesh'] = init_device_mesh(
+ 'cuda',
+ tuple([int(x) for x in fsdp_config['device_mesh']]),
+ )
# necessary variables for optimizers with multiple param groups in FSDP
num_param_groups = None
@@ -291,9 +274,9 @@ def sync_hook(*args):
num_param_groups = len(optim.param_groups)
if num_param_groups > 1:
- if not (is_torch_2_0 and kwargs['use_orig_params']):
- raise RuntimeError('Multiple optimizer groups with FSDP are only supported on torch 2.0 \
- with use_orig_params=True.')
+ if not fsdp_config['use_orig_params']:
+ raise RuntimeError('Multiple optimizer groups with FSDP are only supported with '
+ 'use_orig_params=True.')
# optimizer.param_groups do not contain parameter names which are needed
# to keep track of the different parameters in each group
# so we use the pointers between model.parameters() and model.named_parameters()
@@ -367,6 +350,7 @@ def sync_hook(*args):
state_dict_type = fsdp_config['state_dict_type']
activation_checkpointing_reentrant = fsdp_config['activation_checkpointing_reentrant']
sharded_ckpt_prefix_dir = fsdp_config['sharded_ckpt_prefix_dir']
+ use_orig_params = fsdp_config['use_orig_params']
# We choose to not wrap the ComposerModel directly, but instead wrap any submodules like `ComposerModel.model`
# This makes it safer to call ComposerModel-specific functions like 'eval_forward' that
@@ -546,19 +530,10 @@ def __auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel:
module.register_full_backward_hook(sync_hook)
return should_be_wrapped
- if is_torch_2_0:
-
- def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
- return __auto_wrap_policy(module, recurse, nonwrapped_numel)
-
- _auto_wrap_policy = _auto_wrap_policy_new
+ def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
+ return __auto_wrap_policy(module, recurse, nonwrapped_numel)
- else:
-
- def _auto_wrap_policy_old(module: torch.nn.Module, recurse: bool, unwrapped_params: int) -> bool:
- return __auto_wrap_policy(module, recurse, unwrapped_params)
-
- _auto_wrap_policy = _auto_wrap_policy_old
+ _auto_wrap_policy = _auto_wrap_policy_new
fsdp_obj = FullyShardedDataParallel(
obj,
@@ -574,6 +549,7 @@ def _auto_wrap_policy_old(module: torch.nn.Module, recurse: bool, unwrapped_para
sync_module_states=sync_module_states,
forward_prefetch=forward_prefetch,
limit_all_gathers=limit_all_gathers,
+ use_orig_params=use_orig_params,
**kwargs,
)
@@ -636,8 +612,6 @@ def _auto_wrap_policy_old(module: torch.nn.Module, recurse: bool, unwrapped_para
# If module has attribute `module._activation_checkpointing = ...`, always respect it
# Otherwise checkpoint if root object `obj.activation_checkpointing_fn(module)` is true
def _check_fn(module: torch.nn.Module) -> bool:
- if not is_torch_2_0 and isinstance(module, FlattenParamsWrapper):
- return False
if isinstance(module, FullyShardedDataParallel):
return False
if hasattr(module, '_activation_checkpointing'):
@@ -657,24 +631,22 @@ def _check_fn(module: torch.nn.Module) -> bool:
# Print FSDP wrapped model and FSDP config if `verbose=True`
if fsdp_config['verbose']:
- print(f'FSDP: Wrapped Model:')
- print(model)
- print(f'FSDP: Using sharding_strategy={sharding_strategy}')
- print(f'FSDP: Using cpu_offload={cpu_offload}')
- print(f'FSDP: Using mixed_precision={mixed_precision}')
- print(f'FSDP: Using backward_prefetch={backward_prefetch}')
- print(f'FSDP: Using activation_checkpointing={activation_checkpointing}')
- print(f'FSDP: Using activation_cpu_offload={activation_cpu_offload}')
- print(f'FSDP: Using sync_module_states={sync_module_states}')
- print(f'FSDP: Using forward_prefetch={forward_prefetch}')
- print(f'FSDP: Using limit_all_gathers={limit_all_gathers}')
- print(f'FSDP: Using state_dict_type={state_dict_type}')
- print(f'FSDP: Using sharded_ckpt_prefix_dir={sharded_ckpt_prefix_dir}')
+ log.info(f'FSDP: Wrapped model: {model}')
+ log.info(f'FSDP: Using sharding_strategy={sharding_strategy}')
+ log.info(f'FSDP: Using cpu_offload={cpu_offload}')
+ log.info(f'FSDP: Using mixed_precision={mixed_precision}')
+ log.info(f'FSDP: Using backward_prefetch={backward_prefetch}')
+ log.info(f'FSDP: Using activation_checkpointing={activation_checkpointing}')
+ log.info(f'FSDP: Using activation_cpu_offload={activation_cpu_offload}')
+ log.info(f'FSDP: Using sync_module_states={sync_module_states}')
+ log.info(f'FSDP: Using forward_prefetch={forward_prefetch}')
+ log.info(f'FSDP: Using limit_all_gathers={limit_all_gathers}')
+ log.info(f'FSDP: Using state_dict_type={state_dict_type}')
+ log.info(f'FSDP: Using sharded_ckpt_prefix_dir={sharded_ckpt_prefix_dir}')
# Rebuild optimizer now that parameters are sharded
if optimizers:
- optimizers_tuple = ensure_tuple(optimizers)
- optim = optimizers_tuple[0]
+ optim = ensure_tuple(optimizers)[0]
optim.param_groups.clear()
assert num_param_groups is not None
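The new torch>=2.2 branch above turns `fsdp_config['device_mesh']` into a 2-D `DeviceMesh`; for example, a config value of `[2, 4]` on 8 ranks builds a 2x4 mesh (replication over sharding for HSDP). The same call in isolation, assuming a process group is already initialized and 8 CUDA ranks are available:

```python
import torch
from packaging import version

fsdp_config = {'device_mesh': [2, 4]}  # hypothetical config value
kwargs = {}
if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.2.0'):
    if 'device_mesh' in fsdp_config:
        from torch.distributed._tensor import init_device_mesh
        kwargs['device_mesh'] = init_device_mesh(
            'cuda',
            tuple(int(x) for x in fsdp_config['device_mesh']),
        )
```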
diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py
index ad0fd0904c..07a4f15fbf 100644
--- a/composer/trainer/mosaic_fsdp.py
+++ b/composer/trainer/mosaic_fsdp.py
@@ -6,30 +6,16 @@
"""Monkey patch FSDPs _auto_wrap to enable module_kwargs and custom process_group cache and ChunkShardingSpec to enable sharding over all gpus."""
+# pyright: reportGeneralTypeIssues=false
import torch
from packaging import version
from torch.distributed._shard.sharding_spec import ChunkShardingSpec
from torch.distributed.fsdp import FullyShardedDataParallel
-from composer.trainer.mosaic_fsdp_utils import (_sharded_pre_load_state_dict_hook, build_metadata,
- custom_auto_wrap_t1p13p1)
-
def patch_pytorch():
"""Monkey patches pytorch functions based on pytorch version."""
- if version.parse(torch.__version__) < version.parse('1.13.1'):
- raise NotImplementedError(f'Not supported for torch < 1.13.1')
-
- elif version.parse(torch.__version__) < version.parse('2.0.0'):
- # Monkey patch for torch < 2.0 ie torch == 1.13.1
-
- # Monkey patch _auto_wrap with _custom_auto_wrap fn
- FullyShardedDataParallel._auto_wrap = custom_auto_wrap_t1p13p1 # type: ignore
-
- elif version.parse(torch.__version__) < version.parse('2.0.1'):
- raise NotImplementedError(f'Not supported for torch == 2.0.0')
-
- elif version.parse(torch.__version__) < version.parse('2.0.2'):
+ if version.parse(torch.__version__) < version.parse('2.0.2'):
# Monkey patch for torch == 2.0.1
# Monkey patch __init__ where __init__ calls the custom _auto_wrap fn
@@ -38,16 +24,23 @@ def patch_pytorch():
FullyShardedDataParallel.__init__ = init_fn_t2p0p1 # type: ignore
# Monkey patch sharding method
+ from composer.trainer.mosaic_fsdp_utils import build_metadata
+
ChunkShardingSpec.build_metadata = build_metadata
elif version.parse(torch.__version__) < version.parse('2.1.1'):
# Monkey patch for torch < 2.1.1 ie torch == 2.1.0
# Monkey patch sharding method
+ from composer.trainer.mosaic_fsdp_utils import build_metadata
+
ChunkShardingSpec.build_metadata = build_metadata
# Monkey patch partial state dict handling
from torch.distributed.fsdp import _state_dict_utils
+
+ from composer.trainer.mosaic_fsdp_utils import _sharded_pre_load_state_dict_hook
+
_state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook)
# Allow 2D HSDP
@@ -61,23 +54,34 @@ def patch_pytorch():
from torch.distributed.fsdp import _runtime_utils
_runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None
- # Better overlap communication and computation
- from composer.trainer.mosaic_fsdp_utils import (_root_pre_forward, _share_state_and_init_handle_attrs_t2p1,
- _wait_for_computation_stream, forward)
- _runtime_utils._share_state_and_init_handle_attrs = _share_state_and_init_handle_attrs_t2p1
- _runtime_utils._wait_for_computation_stream = _wait_for_computation_stream
- _runtime_utils._root_pre_forward = _root_pre_forward
- FullyShardedDataParallel.forward = forward
-
elif version.parse(torch.__version__) < version.parse('2.2.1'):
# Monkey patch for torch < 2.2.1 ie torch == 2.2.0
- # Better overlap communication and computation
+ # Allow 2D HSDP
from torch.distributed.fsdp import _runtime_utils
+ _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None
+
+ elif version.parse(torch.__version__) < version.parse('2.3.1'):
+ # Monkey patch for torch < 2.3.1 ie torch == 2.3.0
+        # Note: this is the same patch as 2.2.0; we are just making a new if branch
+ # for clarity and modularity of changes.
+
+ # Allow 2D HSDP
+ from torch.distributed.fsdp import _runtime_utils
+ _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None
+
+ # Monkeypatch state_dict
+ from composer.trainer.mosaic_fsdp_utils import init_fn_t2p3p0
+ FullyShardedDataParallel.__init__ = init_fn_t2p3p0
+
+ # Monkeypatch state_dict
+ from torch.distributed.checkpoint import state_dict # type: ignore
+
+ from composer.trainer.mosaic_fsdp_utils import _verify_options_t2p3p0
+ state_dict._verify_options = _verify_options_t2p3p0
+
+ # Monkeypatch sharding optim state
+ from torch.distributed.fsdp import _optim_utils
- from composer.trainer.mosaic_fsdp_utils import (_root_pre_forward, _share_state_and_init_handle_attrs_t2p2,
- _wait_for_computation_stream, forward)
- _runtime_utils._share_state_and_init_handle_attrs = _share_state_and_init_handle_attrs_t2p2
- _runtime_utils._wait_for_computation_stream = _wait_for_computation_stream
- _runtime_utils._root_pre_forward = _root_pre_forward
- FullyShardedDataParallel.forward = forward
+ from composer.trainer.mosaic_fsdp_utils import _shard_orig_param_state
+ _optim_utils._shard_orig_param_state = _shard_orig_param_state
diff --git a/composer/trainer/mosaic_fsdp_utils.py b/composer/trainer/mosaic_fsdp_utils.py
index 3cf26d79ec..5b08f4c35f 100644
--- a/composer/trainer/mosaic_fsdp_utils.py
+++ b/composer/trainer/mosaic_fsdp_utils.py
@@ -4,13 +4,18 @@
# Released under BSD 3-Clause License,
# Copyright (c) Facebook, Inc. and its affiliates.
+# yapf: disable
+# isort: skip_file
+
"""Utilities for monkey patching FSDP."""
import functools
import logging
import math
import warnings
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Set, Tuple, Union, cast, no_type_check
+import contextlib
+from dataclasses import asdict
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union, cast, no_type_check
import torch
import torch.distributed._shard.sharded_tensor.metadata as sharded_tensor_meta
@@ -35,6 +40,7 @@
torch.__version__) < version.parse('2.2.0'):
from torch.distributed.fsdp._common_utils import _FSDPState
+
log = logging.getLogger(__name__)
SHARDING_MAP = {
@@ -206,153 +212,6 @@ def _set_custom_fsdp_module_kwargs(module_kwargs: Dict, process_group_cache: Dic
return module_kwargs
-
-def _custom_recursive_wrap_t1p13p1(
- module: nn.Module,
- auto_wrap_policy: Callable,
- wrapper_cls: Callable,
- ignored_modules: Set[nn.Module],
- ignored_params: Set[nn.Parameter],
- process_group_cache: Dict[Tuple[int], Any],
- only_wrap_children: bool = False,
- **kwargs: Any,
-) -> Tuple[nn.Module, int]:
- """Updates FSDPs _recursive_wrap to enable module_kwargs and custom process_group cache.
-
- torch version must be 1.13.1.
-
- modified version of
- https://github.com/pytorch/pytorch/blob/d922c29a22e4bf0fba49526f7536395eb8cd66f4/torch/distributed/fsdp/wrap.py#L353
- which recursively wraps modules as FSDP modules for parameter sharding.
- This modification enables the user to pass custom FSDP arguements for every wrapped module.
- The added process_group_cache enables different FSDP modules to, when appropriate, use the
- same process group instead of instantiating a new process group.
-
- Automatically wrap child modules of *module* that meet the given
- criteria with :func:`auto_wrap`. Does not rely on _ConfigAutoWrap.
-
- Args:
- module (nn.Module):
- module to recursively wrap
- auto_wrap_policy (Callable):
- A callable specifying a policy to recursively wrap layers with FSDP.
- ignored_modules (Set[torch.nn.Module]): Modules to ignore when
- wrapping.
- ignored_params (Set[torch.nn.Parameter]): Parameters to ignore when
- wrapping; these should be the parameters contained in the modules
- in ``ignored_modules``.
- process_group_cache (Dict[Tuple[int], Any]): a cache of process_group to
- use instead of potentially instantiating a new process_group
-
- Returns:
- (nn.Module, int):
- Wrapped module and the number parameters wrapped recursively.
- """
- from torch.distributed.fsdp.wrap import _wrap
-
- assert auto_wrap_policy is not None, 'Must specify auto_wrap_policy.'
- assert wrapper_cls is not None, 'Must specify wrapper_cls'
- # Make sure no child is already wrapped.
- for _, child in module.named_modules():
- if child in ignored_modules:
- continue
- try:
- assert not isinstance(child, cast(type, wrapper_cls))
- except TypeError:
- # wrapper_cls is a function as opposed to a class type, just bypass above check.
- pass
-
- # We count all params, assuming none of them are already wrapped.
- num_params = sum(p.numel() for p in module.parameters() if p not in ignored_params)
-
- assert auto_wrap_policy is not None
- if auto_wrap_policy(module=module, recurse=True, unwrapped_params=num_params):
- total_wrapped_params = 0
- # Iterate through the children, recursively wrap if necessary
- for name, child in module.named_children():
- if child in ignored_modules:
- continue
- wrapped_child, num_wrapped_params = _custom_recursive_wrap_t1p13p1(
- module=child,
- auto_wrap_policy=auto_wrap_policy,
- wrapper_cls=wrapper_cls,
- ignored_modules=ignored_modules,
- ignored_params=ignored_params,
- process_group_cache=process_group_cache,
- **kwargs,
- )
- setattr(module, name, wrapped_child)
- # Keep track of how many parameters have been wrapped
- total_wrapped_params += num_wrapped_params
- # decide if we need to wrap the current module,
- # since the left over parameters exceed the number of params to wrap
- remainder = num_params - total_wrapped_params
- module_kwargs = auto_wrap_policy(module=module, recurse=False, unwrapped_params=remainder)
- if not only_wrap_children and module_kwargs:
- # CHANGE: We modify the original code to support custom FSDP kwargs and add
- # the process_group_cache to avoid instantiating a new process group.
- module_kwargs = module_kwargs if isinstance(module_kwargs, dict) else {}
- module_kwargs = _set_custom_fsdp_module_kwargs(module_kwargs, process_group_cache)
-
- final_kwargs = {**kwargs, **module_kwargs}
-
- # Leaf node or final wrapping of the remainder both happen here.
- return _wrap(module, wrapper_cls, **final_kwargs), num_params
- else:
- return module, total_wrapped_params
- return module, 0
-
-
-def custom_auto_wrap_t1p13p1(
- self,
- auto_wrap_kwargs: Dict[str, Any],
- fsdp_kwargs: Dict[str, Any],
-) -> None:
- """Updates _auto_wrap to enable module_kwargs.
-
- torch version must be 1.13.1.
-
- modified version of
- https://github.com/pytorch/pytorch/blob/d922c29a22e4bf0fba49526f7536395eb8cd66f4/torch/distributed/fsdp/fully_sharded_data_parallel.py#L1252
- FSDP's _auto_wrap recursively wraps modules as FSDP modules for parameter sharding.
- This modification enables the user to pass custom FSDP arguements for every wrapped module.
- The added process_group_cache enables different FSDP modules to, when appropriate, use the
- same process group instead of instantiating a new process group.
-
- Recursively auto wraps the root module given by the key "module" in
- ``auto_wrap_kwargs`` with the arguments in ``auto_wrap_kwargs`` and
- ``fsdp_kwargs``.
- Precondition: ``auto_wrap_policy`` contains the arguments expected by
- ``_recursive_wrap()``, where ``auto_wrap_policy`` is not ``None``.
- ``fsdp_kwargs`` contains all FSDP arguments except ``module``.
- """
- from torch.distributed.fsdp._utils import _contains_batchnorm, _override_batchnorm_mixed_precision
- from torch.distributed.fsdp.wrap import _or_policy, _wrap_batchnorm_individually
-
- auto_wrap_policy = auto_wrap_kwargs['auto_wrap_policy']
- root_module = auto_wrap_kwargs['module']
- assert auto_wrap_policy is not None
- # For auto wrapping, submodules should not already be wrapped with FSDP
- # since double wrapping is not supported
- for module_name, module in root_module.named_modules():
- if isinstance(module, FullyShardedDataParallel):
- raise ValueError(f'Expected {module_name} to NOT be FullyShardedDataParallel '
- 'if using an `auto_wrap_policy`')
- mixed_precision = fsdp_kwargs['mixed_precision']
- if mixed_precision is not None and _contains_batchnorm(root_module):
- _override_batchnorm_mixed_precision(root_module)
- auto_wrap_policy = functools.partial(_or_policy, policies=[_wrap_batchnorm_individually, auto_wrap_policy])
- warnings.warn('Both mixed precision and an `auto_wrap_policy` were specified '
- 'for FSDP, where the wrapped module has batch norm submodules. '
- 'The batch norm submodules will be wrapped as separate FSDP '
- 'instances with mixed precision disabled since some batch norm '
- 'kernels do not support low precision.')
- auto_wrap_kwargs['auto_wrap_policy'] = auto_wrap_policy
- # CHANGE: Add process group cache and call our custom _recursive_wrap
- auto_wrap_kwargs['process_group_cache'] = {}
- _custom_recursive_wrap_t1p13p1(**auto_wrap_kwargs, **fsdp_kwargs)
-
-
def _custom_recursive_wrap_t2p0p1(
module: nn.Module,
auto_wrap_policy: Callable,
@@ -370,7 +229,7 @@ def _custom_recursive_wrap_t2p0p1(
modified version of
https://github.com/pytorch/pytorch/blob/96ca226a7332be0d8f3d6159d0c797e032ab0721/torch/distributed/fsdp/wrap.py#L320
which recursively wraps modules as FSDP modules for parameter sharding.
- This modification enables the user to pass custom FSDP arguements for every wrapped module.
+ This modification enables the user to pass custom FSDP arguments for every wrapped module.
The added process_group_cache enables different FSDP modules to, when appropriate, use the
same process group instead of instantiating a new process group.
@@ -468,7 +327,7 @@ def _custom_auto_wrap_t2p0p1(
modified version of
https://github.com/pytorch/pytorch/blob/96ca226a7332be0d8f3d6159d0c797e032ab0721/torch/distributed/fsdp/_wrap_utils.py#L31
FSDP's _auto_wrap recursively wraps modules as FSDP modules for parameter sharding.
- This modification enables the user to pass custom FSDP arguements for every wrapped module.
+ This modification enables the user to pass custom FSDP arguments for every wrapped module.
The added process_group_cache enables different FSDP modules to, when appropriate, use the
same process group instead of instantiating a new process group.
@@ -756,361 +615,421 @@ def _sharded_pre_load_state_dict_hook(
_enter_unshard_params_ctx(module, fsdp_state, writeback=True)
-def fsdp_state_has_default_pg(state: '_FSDPState') -> bool:
- """Indicates whether FlatParamHandle has the default process group.
-
- Args:
- handle (_FSDPState): FSDP State object
-
- Returns:
- bool: True if the ProcessGroup of the _FSDPState object is the default process group. False
- otherwise.
- """
- if state.process_group is None:
- # If no process group is attached to the _FSDPState, assume it uses default process group.
- return True
- return len(get_process_group_ranks(state.process_group)) == dist.get_world_size()
-
-
-def fsdp_state_pg_ranks(state: '_FSDPState') -> Tuple[int, ...]:
- """Gets the ranks included in the ProcessGroup of an _FSDPState.
-
- Args:
- state (_FSDPState): FSDP State object
-
- Returns:
- Tuple[int]: Ranks for the FSDP State's process group.
- """
- if state.process_group is None:
- # If no process group is attached to the _FSDPState, assume it uses default process group.
- return tuple(range(dist.get_world_size()))
- else:
- return tuple(get_process_group_ranks(state.process_group))
-
-
-def _wait_for_computation_stream(
- computation_stream: torch.Stream,
- root_state: '_FSDPState',
- pre_unshard_stream: torch.Stream,
-):
- """Unshard and pre-unshard streams wait for computation stream.
+if version.parse(torch.__version__) > version.parse('2.2.9') and version.parse(
+ torch.__version__) < version.parse('2.3.1'):
+ import copy
+
+ from torch.distributed._tensor import DeviceMesh, DTensor, Replicate
+ from torch.distributed._tensor import Shard as DShard
+ from torch.distributed.algorithms._comm_hooks import default_hooks
+ from torch.distributed.device_mesh import _mesh_resources
+ from torch.distributed.distributed_c10d import _get_default_group
+ from torch.distributed.fsdp._common_utils import _FSDPState
+ from torch.distributed.fsdp._init_utils import (HYBRID_SHARDING_STRATEGIES, ProcessGroupType,
+ _get_default_comm_hook_state, _init_intra_and_inter_node_groups,
+ _is_valid_hybrid_shard_pg_type, _init_extension)
+ from torch.distributed.fsdp.fully_sharded_data_parallel import (_annotate_modules_for_dynamo, _auto_wrap,
+ _check_orig_params_flattened, _init_buffer_state,
+ _init_core_state, _init_device_handle,
+ _init_ignored_module_states,
+ _init_param_handle_from_module,
+ _init_prefetching_state, _init_runtime_state,
+ _init_state_dict_state,
+ _register_all_state_dict_hooks,
+ _register_flat_param)
+ from torch.distributed.fsdp.wrap import CustomPolicy, ModuleWrapPolicy, _Policy
+ from torch.distributed.tensor.parallel.fsdp import DTensorExtensions
+
+ def all_gather_dtensor_t2p3p0(
+ self,
+ tensor: DTensor,
+ parent_mesh: Optional[DeviceMesh],
+ ) -> torch.Tensor:
+ """All gather a DTensor in its FSDP dimension and return the local tensor."""
+ assert parent_mesh == tensor.device_mesh
+
+ placements = list(copy.deepcopy(tensor.placements))
+ # FSDP + TP: [Shard(0), tp_placement] -> [Replicate(), tp_placement]
+ # HSDP + TP: [Replicate(), Shard(0), tp_placement] -> [Replicate(), Replicate(), tp_placement]
+ for i in range(0, len(placements) - 1):
+ placements[i] = Replicate()
+ tensor = tensor.redistribute(
+ device_mesh=tensor.device_mesh,
+ placements=placements,
+ )
+ return tensor.to_local()
- Has the unshard and pre-unshard streams wait for the computation stream.
- For example, this should be called in the FSDP root's pre-forward to
- respect optimizer step computation.
- """
- # Tracing does not need to wait
- if torch.distributed._functional_collectives.is_torchdynamo_compiling():
- return
- # Ensure all unshard streams wait for the computation stream.
- unshard_streams = set()
- for fsdp_state in root_state._all_fsdp_states:
- unshard_streams.add(fsdp_state._unshard_stream)
- for unshard_stream in unshard_streams:
- unshard_stream.wait_stream(computation_stream) # type: ignore[attr-defined]
- # Having the pre-all-gather stream wait for the current stream even if we
- # do not leverage the pre-all-gather stream is tolerable since this only
- # runs once per iteration
- pre_unshard_stream.wait_stream(computation_stream) # type: ignore[attr-defined]
+ def chunk_dtensor_t2p3p0(
+ self,
+ tensor: torch.Tensor,
+ rank: int,
+ device_mesh: DeviceMesh,
+ ) -> DTensor:
+ """Shard a tensor to chunks along the first dimension.
+        The local rank gets its corresponding chunk as the local tensor to create a DTensor.
+ """
+ parent_mesh = _mesh_resources.get_parent_mesh(device_mesh)
+ if parent_mesh is None:
+ raise RuntimeError('No parent device_mesh is found for FSDP device_mesh.')
+ # if parent_mesh.ndim != 2:
+ # raise RuntimeError(
+ # f"Found parent device_mesh of ndim={parent_mesh.ndim},",
+ # "but only 2D meshes are currently supported.",
+ # )
+
+ # We need to explicitly call .detach() to return a new tensor detached from the current graph.
+ tensor = tensor.clone().detach()
+
+ # When a layer is not involved in TP, then the tensor will not be a DTensor.
+ # e.g. When a layer is not specified in the parallelize_plan, TP will have no effect on the layer.
+ # e.g. When you do PairwiseParallel on a 3 layer model, TP will have no effect on the third layer.
+ if isinstance(tensor, torch.Tensor) and not isinstance(tensor, DTensor):
+
+ # For tensors, it is replicated across tp dimension and sharded across FSDP dimension.
+ # TP is the inner dimension and FSDP is the outer dimension.
+ # Therefore, shard placements for tensor is (Shard(0), Replicate()).
+ replicate_placements = [Replicate() for _ in range(parent_mesh.ndim)]
+ shard_placements = [Replicate() for _ in range(parent_mesh.ndim)]
+ shard_placements[0] = DShard(0) # type: ignore[call-overload]
+
+ return DTensor.from_local(tensor, parent_mesh, replicate_placements).redistribute(
+ device_mesh=parent_mesh,
+ placements=shard_placements,
+ )
-@no_type_check
-def _root_pre_forward(
- state: '_FSDPState',
- module: nn.Module,
- args,
- kwargs,
-) -> None:
- """Runs pre-forward logic specific to the root FSDP instance.
+ else:
+ tp_placements = tensor.placements
+ tp_placement = tp_placements[0]
+
+ tensor = tensor.to_local()
+
+ if parent_mesh.ndim <= 2:
+ # For DTensors, it is sharded across tp dimension first and then sharded across FSDP dimension.
+ # TP is the inner dimension and FSDP is the outer dimension.
+ # Therefore, shard placements for tensor is (Shard(0), tp_placement).
+ replicate_placements = [Replicate() for _ in range(parent_mesh.ndim)]
+ replicate_placements[-1] = tp_placement # type: ignore[call-overload]
+ shard_placements = [DShard(0) for _ in range(parent_mesh.ndim)] # type: ignore[misc]
+ shard_placements[-1] = tp_placement # type: ignore[call-overload]
+
+ elif parent_mesh.ndim == 3:
+ replicate_placements = [Replicate(), Replicate(), tp_placement]
+ shard_placements = [Replicate(), DShard(0), tp_placement] # type: ignore[misc]
+
+ return DTensor.from_local(tensor, parent_mesh, replicate_placements).redistribute(
+ device_mesh=parent_mesh,
+ placements=shard_placements,
+ )
- This should run before any individual module's pre-forward. This starts
- with an attempt at lazy initialization (which only runs non-vacuously once).
- Otherwise, if this is called on a non-root FSDP instance, then it returns
- directly.
- """
- from torch.distributed.fsdp._common_utils import _is_composable
- from torch.distributed.fsdp._runtime_utils import (_cast_buffers_to_dtype_and_device,
- _get_buffers_and_dtypes_for_computation, _lazy_init,
- _reset_flat_param_grad_info_if_needed, _root_cast_forward_input)
- from torch.distributed.utils import _p_assert, _to_kwargs
- with torch.profiler.record_function('FullyShardedDataParallel._root_pre_forward'):
- _lazy_init(state, module)
- _p_assert(state._is_root is not None, 'Expects a root FSDP to have been set')
- if not state._is_root:
- # Always cast forward inputs in the root of this local FSDP unit for mixed
- # precision, as this is where mixed precision could be configed.
- # This is more useful for auto wrapping that is recommended in composable path.
- # For manual wrapping, cast forward inputs on each local FSDP unit root will
- # increase some overhead, so not turned on for model wrapper path right now where
- # manual wrapping is more broadly used.
- if _is_composable(state):
- return _root_cast_forward_input(state, module, args, kwargs)
- return args, kwargs
-
- # We cast buffers back to full precision if we're forcing full precision. Disjointly, we check if buffers
- # are in full precision and if we should cast them back to lower precision, which happens when
- # exiting eval() mode.
- handle = state._handle
- if handle:
- should_cast_buffers_to_full_prec = handle._force_full_precision
+ DTensorExtensions.all_gather_dtensor = all_gather_dtensor_t2p3p0
+ DTensorExtensions.chunk_dtensor = chunk_dtensor_t2p3p0
+
+ def _is_valid_hybrid_shard_device_mesh_t2p3p0(device_mesh: DeviceMesh) -> bool:
+ #parent_mesh = _mesh_resources.get_parent_mesh(device_mesh)
+ #if parent_mesh is not None:
+ # raise RuntimeError(
+ # f"Found device_mesh {device_mesh} passed in has a parent device_mesh {parent_mesh}.",
+ # "Hybrid sharding + TP is not supported yet.",
+ # )
+ return isinstance(device_mesh, DeviceMesh) and device_mesh.ndim == 2
+
+ def _init_process_group_state_for_hybrid_shard_t2p3p0(
+ state: _FSDPState,
+ process_group: ProcessGroupType,
+ device_mesh: DeviceMesh,
+ ) -> _FSDPState:
+ if device_mesh:
+ if _is_valid_hybrid_shard_device_mesh_t2p3p0(device_mesh):
+ state._device_mesh = device_mesh
+ # We currently only allow _inter_node_pg to be the outermost dimension, and the
+ # process_group(intra_node) to be the innermost dimension.
+ state._inter_node_pg = device_mesh.get_group(mesh_dim=0)
+ state.process_group = device_mesh.get_group(mesh_dim=1)
+ else:
+ raise ValueError('Expected device_mesh to have ndim=2 '
+ f'but got {len(device_mesh.get_group())}')
+ elif process_group is None:
+ default_group = _get_default_group()
+ intra_node_group, inter_node_group = _init_intra_and_inter_node_groups(default_group,
+ state._device_handle.device_count())
+ # we shard across intra-node
+ state.process_group = intra_node_group
+ # save _inter_node_pg to allreduce across.
+ state._inter_node_pg = inter_node_group
else:
- should_cast_buffers_to_full_prec = True
+ # Check type and assign state.process_group and state._inter_node_pg.
+ if _is_valid_hybrid_shard_pg_type(process_group):
+ # Assuming that user passed in as intra node group and inter node group
+ # as documented.
+ state.process_group, state._inter_node_pg = process_group
+ else:
+ raise ValueError('Expected process_group to be passed in as either None or '
+ f'Tuple[dist.ProcessGroup, dist.ProcessGroup] but got {type(process_group)}')
+ # Create state for allreduce
+ state._inter_node_state = _get_default_comm_hook_state(process_group=state._inter_node_pg,)
+ return state
+
+ def _init_process_group_state_t2p3p0(
+ state: _FSDPState,
+ process_group: ProcessGroupType,
+ sharding_strategy: ShardingStrategy,
+ policy: Optional[_Policy],
+ device_mesh: Optional[DeviceMesh] = None,
+ ) -> _FSDPState:
+ if process_group is not None and device_mesh is not None:
+ raise ValueError('Cannot pass both process_group and device_mesh at the '
+ 'same time. Please just pass only one of them.')
+ is_hybrid_strategy = sharding_strategy in HYBRID_SHARDING_STRATEGIES
+ if is_hybrid_strategy:
+ if process_group is None and policy is None and device_mesh is None:
+ # Raise an error here, since this is manual wrapping with no process group
+ # passed in, there is no way to ensure all wrapped FSDP instances use the same
+ # process groups.
+ raise ValueError(
+ f'Manual wrapping with {sharding_strategy}',
+ 'requires explicit specification of process group or device_mesh.',
+ )
+ else:
+ state = _init_process_group_state_for_hybrid_shard_t2p3p0(state, process_group, device_mesh)
+ else:
+ if device_mesh:
+ state._device_mesh = device_mesh
+ state.process_group = device_mesh.get_group(mesh_dim=0)
+ else:
+ state.process_group = (process_group if process_group is not None else _get_default_group())
+
+ state.rank = state.process_group.rank()
+ state.world_size = state.process_group.size()
+ data_parallel_world_size = state.world_size
+ if is_hybrid_strategy:
+ data_parallel_world_size *= state._inter_node_pg.size()
+ state._gradient_predivide_factor = (
+ default_hooks.DefaultState._get_gradient_predivide_factor(data_parallel_world_size))
+ state._gradient_postdivide_factor = (data_parallel_world_size / state._gradient_predivide_factor)
+ return state
+
+ def init_fn_t2p3p0(
+ self,
+ module: nn.Module,
+ process_group: ProcessGroupType = None,
+ sharding_strategy: Optional[ShardingStrategy] = None,
+ cpu_offload: Optional[CPUOffload] = None,
+ auto_wrap_policy: Optional[Union[Callable, ModuleWrapPolicy, CustomPolicy]] = None,
+ backward_prefetch: Optional[BackwardPrefetch] = BackwardPrefetch.BACKWARD_PRE,
+ mixed_precision: Optional[MixedPrecision] = None,
+ ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
+ param_init_fn: Optional[Callable[[nn.Module], None]] = None,
+ device_id: Optional[Union[int, torch.device]] = None,
+ sync_module_states: bool = False,
+ forward_prefetch: bool = False,
+ limit_all_gathers: bool = True,
+ use_orig_params: bool = False,
+ ignored_states: Union[Optional[Iterable[torch.nn.Parameter]], Optional[Iterable[torch.nn.Module]]] = None,
+ device_mesh: Optional[DeviceMesh] = None,
+ ):
+ """Docstring for lint."""
+ torch._C._log_api_usage_once('torch.distributed.fsdp')
+ super(FullyShardedDataParallel, self).__init__()
+ _init_ignored_module_states(self, module, ignored_modules, ignored_states)
+ _init_device_handle(self, module, self._ignored_params, device_id)
- if should_cast_buffers_to_full_prec:
- _cast_buffers_to_dtype_and_device(
- buffers=dict(module.named_buffers()).values(),
- buffer_dtypes=list(state._buffer_name_to_orig_dtype.values()),
- device=state.compute_device,
- )
- # This flag is only set when we cast buffers to full precision, to avoid the
- # CPU overhead that can stem from retrieving all buffers and their types in the
- # following else branch.
- state._needs_buffer_dtype_restore_check = True
- elif getattr(state, '_needs_buffer_dtype_restore_check', False):
- # Check if buffers are in full precision and we need to cast them
- # back down.
- (
- buffers,
- buffer_dtypes_for_computation,
- ) = _get_buffers_and_dtypes_for_computation(state, module)
- if len(buffers) > 0 and len(buffer_dtypes_for_computation) > 0:
- if any(buffer.dtype != buffer_dtype_for_computation
- for buffer, buffer_dtype_for_computation in zip(buffers, buffer_dtypes_for_computation)):
- # Assume we have to cast everything if there is one mismatch
- _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes_for_computation, state.compute_device)
- # We don't have to check this again until we cast buffers to full precision again.
- state._needs_buffer_dtype_restore_check = False
-
- if state.forward_prefetch:
- handles = []
- for fsdp_state in state._all_fsdp_states:
- if fsdp_state._handle:
- handles.append(fsdp_state._handle)
- for handle in handles:
- handle._needs_pre_forward_unshard = True
- handle._prefetched = False
-
- _wait_for_computation_stream(
- state._device_handle.current_stream(),
- state,
- state._pre_unshard_stream,
- )
- _reset_flat_param_grad_info_if_needed(state._all_handles)
-
- # Prepares the forward inputs by moving them to ``compute_device``
- # TODO: Do not use the side stream for tensor copies for now; investigate
- # the perf with/without it.
- with torch.profiler.record_function('FullyShardedDataParallel._to_kwargs'):
- args_tuple, kwargs_tuple = _to_kwargs(args, kwargs, state.compute_device, False)
- args = args_tuple[0]
- kwargs = kwargs_tuple[0]
-
- return _root_cast_forward_input(state, module, args, kwargs)
-
-
-def forward(self, *args: Any, **kwargs: Any) -> Any:
- """Run the forward pass for the wrapped module, inserting FSDP-specific pre- and post-forward sharding logic."""
- from torch.distributed.fsdp._runtime_utils import (_post_forward, _post_forward_reshard, _pre_forward,
- _pre_forward_unshard)
- from torch.distributed.utils import _p_assert
- handle = self._handle
- with torch.autograd.profiler.record_function('FullyShardedDataParallel.forward'):
- args, kwargs = _root_pre_forward(self, self, args, kwargs)
- unused = None
- args, kwargs = _pre_forward(
+ # Add module annotations for Dynamo support (see function for details)
+ _annotate_modules_for_dynamo(module, self._ignored_modules, use_orig_params)
+
+ # Initializes self.process_group, along with rank and world size. This will
+ # also set another attribute, _inter_node_pg, to control the process group
+ # over which sharding occurs, if sharding_strategy is {HYBRID_SHARD, _HYBRID_SHARD_ZERO2}.
+ # Note that this is done before auto_wrapping, so that child FSDP modules simply pick up
+ # the same process group state as the root FSDP module.
+ self._device_mesh = device_mesh
+ _init_process_group_state_t2p3p0(
self,
- handle,
- _pre_forward_unshard,
- self._fsdp_wrapped_module,
- args,
- kwargs,
+ process_group,
+ sharding_strategy,
+ auto_wrap_policy,
+ device_mesh,
)
- if handle:
- _p_assert(
- handle.flat_param.device == self.compute_device,
- 'Expected `FlatParameter` to be on the compute device '
- f'{self.compute_device} but got {handle.flat_param.device}',
+ if auto_wrap_policy is not None:
+ root_kwargs = {
+ 'process_group': process_group,
+ 'sharding_strategy': sharding_strategy,
+ 'cpu_offload': cpu_offload,
+ 'backward_prefetch': backward_prefetch,
+ 'mixed_precision': mixed_precision,
+ 'param_init_fn': param_init_fn,
+ 'device_id': device_id,
+ 'sync_module_states': sync_module_states,
+ 'forward_prefetch': forward_prefetch,
+ 'limit_all_gathers': limit_all_gathers,
+ 'use_orig_params': use_orig_params,
+ 'ignored_states': self._ignored_params,
+ 'device_mesh': device_mesh,
+ }
+ if sharding_strategy in HYBRID_SHARDING_STRATEGIES and device_mesh is None:
+ # Share root process groups with children to maintain
+ # the invariant that all FSDP modules will have the same
+ # process groups.
+ root_kwargs['process_group'] = (self.process_group, self._inter_node_pg)
+
+ _auto_wrap(
+ module,
+ auto_wrap_policy,
+ self._ignored_modules,
+ self._ignored_params,
+ root_kwargs,
+ FullyShardedDataParallel,
)
- output = self._fsdp_wrapped_module(*args, **kwargs)
- return _post_forward(self, handle, _post_forward_reshard, self, unused, output)
-
-
-@no_type_check
-def _share_state_and_init_handle_attrs_t2p1(
- root_state: '_FSDPState',
- root_module: nn.Module,
-) -> None:
- """Shares state from ``root_state`` to other FSDP states.
- Shares data structure state from the ``root_state`` to all FSDP states in
- ``root_module`` 's module tree, and initializes handle attributes. These are
- done together to require a single loop over the states. This function has
- been modified to assign a different unshard stream to each process group.
- """
- from torch.distributed.fsdp._runtime_utils import (HOMOGENEOUS_ATTR_NAMES, _init_device_mesh,
- _validate_and_get_hybrid_shard_state)
- from torch.distributed.utils import _p_assert
-
- handle = root_state._handle
- if handle:
- handle.init_flat_param_attributes()
- _validate_and_get_hybrid_shard_state(root_module)
- attr_name_to_values: Dict[str, Set[Any]] = {}
- for attr_name in HOMOGENEOUS_ATTR_NAMES:
- attr_name_to_values[attr_name] = set()
- root_state._all_handles = root_state._exec_order_data.all_handles # share reference
- root_state._device_mesh = _init_device_mesh(root_state)
- # Update _has_optim_in_backward for each handle.
- for handle in root_state._all_handles:
- flat_param = handle.flat_param
- if hasattr(flat_param, '_in_backward_optimizers'):
- raise RuntimeError('FSDP optimizer in backward only supported with use_orig_params=True!')
- handle._has_optim_in_backward = flat_param._params is not None and any(
- hasattr(param, '_in_backward_optimizers') for param in flat_param._params)
-
- # Patching so that _FSDPStates with different process groups have separate unshard streams.
- # Keep track of any new unshard streams we may have to add for specific process groups.
- fsdp_pg_unshard_streams = {}
- try:
- unshard_priority = root_state._unshard_stream.priority
- except AttributeError:
- # Use the default priority of 0 if the stream has no assigned priority.
- unshard_priority = 0
- for fsdp_state in root_state._all_fsdp_states:
- for attr_name in HOMOGENEOUS_ATTR_NAMES:
- _p_assert(
- hasattr(fsdp_state, attr_name),
- f'FSDP state missing attribute {attr_name}',
- )
- attr_name_to_values[attr_name].add(getattr(fsdp_state, attr_name))
- if fsdp_state is root_state:
- continue
- # Relax the assert for non-root FSDP instances in case the nested
- # initialized module is wrapped again in FSDP later (e.g. after
- # training to run inference)
- _p_assert(
- fsdp_state._is_root is None or not fsdp_state._is_root,
- "Non-root FSDP instance's `_is_root` should not have been "
- 'set yet or should have been set to `False`',
+ backward_prefetch_limit = 1
+ forward_prefetch_limit = 1
+ _init_core_state(
+ self,
+ sharding_strategy,
+ mixed_precision,
+ cpu_offload,
+ limit_all_gathers,
+ use_orig_params,
+ backward_prefetch_limit,
+ forward_prefetch_limit,
)
- fsdp_state._is_root = False
-
- # Take care of any new unshard streams we have to create for non-default process groups.
- if fsdp_state_has_default_pg(fsdp_state):
- # If using default process group, unshard stream is the same as root fsdp instance.
- fsdp_state._unshard_stream = root_state._unshard_stream
- else:
- # Otherwise, unshard stream is separate.
- state_pg_ranks = fsdp_state_pg_ranks(fsdp_state)
- if state_pg_ranks in fsdp_pg_unshard_streams:
- # We have created the unshard stream for this process group already. Use it.
- fsdp_state._unshard_stream = fsdp_pg_unshard_streams[state_pg_ranks]
- else:
- # We don't have an unshard stream for this process group yet. Make it.
- fsdp_state._unshard_stream = fsdp_state._device_handle.Stream(priority=unshard_priority)
- fsdp_pg_unshard_streams[state_pg_ranks] = fsdp_state._unshard_stream
-
- # All other stream assignments stay common across all of FSDP.
- fsdp_state._post_backward_stream = root_state._post_backward_stream
- fsdp_state._pre_unshard_stream = root_state._pre_unshard_stream
- fsdp_state._all_reduce_stream = root_state._all_reduce_stream
- fsdp_state._default_stream = root_state._default_stream
- fsdp_state._exec_order_data = root_state._exec_order_data
- fsdp_state._free_event_queue = root_state._free_event_queue
- fsdp_state._device_mesh = root_state._device_mesh
- handle = fsdp_state._handle
- if handle:
- handle.init_flat_param_attributes()
- for attr_name, attr_values in attr_name_to_values.items():
- if len(attr_values) != 1:
- raise ValueError(f'Expects one homogeneous value for {attr_name} but got {attr_values}')
-
+ _init_runtime_state(self)
+ _init_prefetching_state(self, backward_prefetch, forward_prefetch)
+ _init_buffer_state(self, module)
+ # extension needs to be set before `_init_param_handle_from_module()`
+ _init_extension(self, device_mesh)
+ _init_param_handle_from_module(
+ self,
+ module,
+ device_id,
+ param_init_fn,
+ sync_module_states,
+ )
+ self._fsdp_wrapped_module = module
+ if not use_orig_params:
+ _check_orig_params_flattened(self, self._ignored_params)
+ _register_flat_param(self, self)
-@no_type_check
-def _share_state_and_init_handle_attrs_t2p2(
- root_state: '_FSDPState',
- root_module: nn.Module,
-) -> None:
- """Shares state from ``root_state`` to other FSDP states.
+ # `_state_dict_type` controls the `state_dict()` behavior, which is
+ # implemented using post-save and pre-load hooks
+ _init_state_dict_state(self)
+ _register_all_state_dict_hooks(self)
- Shares data structure state from the ``root_state`` to all FSDP states in
- ``root_module`` 's module tree, and initializes handle attributes. These are
- done together to require a single loop over the states. This function has
- been modified to assign a different unshard stream to each process group.
- """
- from torch.distributed.fsdp._runtime_utils import HOMOGENEOUS_ATTR_NAMES, _validate_and_get_hybrid_shard_state
- from torch.distributed.utils import _p_assert
-
- handle = root_state._handle
- if handle:
- handle.init_flat_param_attributes()
- _validate_and_get_hybrid_shard_state(root_module)
- attr_name_to_values: Dict[str, Set[Any]] = {}
- for attr_name in HOMOGENEOUS_ATTR_NAMES:
- attr_name_to_values[attr_name] = set()
- root_state._all_handles = root_state._exec_order_data.all_handles # share reference
- # Update _has_optim_in_backward for each handle.
- for handle in root_state._all_handles:
- flat_param = handle.flat_param
- if hasattr(flat_param, '_in_backward_optimizers'):
- raise RuntimeError('FSDP optimizer in backward only supported with use_orig_params=True!')
- handle._has_optim_in_backward = flat_param._params is not None and any(
- hasattr(param, '_in_backward_optimizers') for param in flat_param._params)
- if handle._has_optim_in_backward:
- torch._C._log_api_usage_once('fsdp.optimizer_in_backward')
-
- # Patching so that _FSDPStates with different process groups have separate unshard streams.
- # Keep track of any new unshard streams we may have to add for specific process groups.
- fsdp_pg_unshard_streams = {}
- try:
- unshard_priority = root_state._unshard_stream.priority
- except AttributeError:
- # Use the default priority of 0 if the stream has no assigned priority.
- unshard_priority = 0
- for fsdp_state in root_state._all_fsdp_states:
- for attr_name in HOMOGENEOUS_ATTR_NAMES:
- _p_assert(
- hasattr(fsdp_state, attr_name),
- f'FSDP state missing attribute {attr_name}',
+ from torch.distributed.checkpoint.state_dict import StateDictOptions, _StateDictInfo
+
+ def _verify_options_t2p3p0(
+ model: nn.Module,
+ optims: Tuple[torch.optim.Optimizer, ...],
+ optim_only: bool,
+ *,
+ submodules: Optional[Set[nn.Module]] = None,
+ options: Optional[StateDictOptions] = None,
+ ) -> _StateDictInfo:
+ """Verify the model and options passed by the user and generates _StateDictInfo."""
+ from torch.distributed.checkpoint.state_dict import StateDictOptions, _get_fqns, _StateDictInfo
+ from torch.distributed.fsdp import FullOptimStateDictConfig, FullStateDictConfig
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+ from torch.distributed.fsdp import (OptimStateDictConfig, ShardedOptimStateDictConfig, ShardedStateDictConfig,
+ StateDictConfig, StateDictType)
+
+ if optim_only and not optims:
+ raise RuntimeError('Optimizers are not passed in but optim_only is set to True.')
+
+ options = options or StateDictOptions()
+ assert options is not None # pyright
+
+ fqn_param_mapping: Dict[Union[str, torch.Tensor], Union[Set[str], torch.Tensor]] = {}
+ all_fqns = set()
+ for name, param in model.named_parameters():
+ fqns = _get_fqns(model, name)
+ fqns = {fqn.replace('_checkpoint_wrapped_module.', '') for fqn in fqns}
+ fqn_param_mapping[param] = fqns
+ for fqn in fqns:
+ fqn_param_mapping[fqn] = param
+ all_fqns.add(fqn)
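+
+ # Example of the mapping built above (hypothetical names): a parameter
+ # registered as 'model._checkpoint_wrapped_module.layer.weight' is recorded
+ # under the cleaned FQN 'model.layer.weight', and fqn_param_mapping holds
+ # both directions: param -> {fqn, ...} and fqn -> param.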
+
+ submodule_prefixes = set()
+ if submodules:
+ submodules = set(submodules)
+ for name, module in model.named_modules():
+ if module not in submodules:
+ continue
+ fqns = _get_fqns(model, name)
+ assert len(fqns) == 1, 'Submodule FQN should only have 1 instance'
+ for fqn in fqns:
+ submodule_prefixes.add(f'{fqn}.')
+ fsdp_modules = FSDP.fsdp_modules(model)
+ state_dict_config: StateDictConfig
+ optim_state_dict_config: OptimStateDictConfig
+ fsdp_context: Callable
+ if fsdp_modules:
+ # The FSDP APIs only work if at least one FSDP instance exists.
+ if options.full_state_dict:
+ state_dict_config = FullStateDictConfig(offload_to_cpu=options.cpu_offload, rank0_only=options.cpu_offload)
+ optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=options.cpu_offload,
+ rank0_only=options.cpu_offload)
+ state_dict_type = StateDictType.FULL_STATE_DICT
+ else:
+ state_dict_config = ShardedStateDictConfig(offload_to_cpu=options.cpu_offload,)
+ optim_state_dict_config = ShardedOptimStateDictConfig(offload_to_cpu=options.cpu_offload,)
+ state_dict_type = StateDictType.SHARDED_STATE_DICT
+
+ fsdp_context = functools.partial(
+ FSDP.state_dict_type,
+ module=model,
+ state_dict_type=state_dict_type,
+ state_dict_config=state_dict_config,
+ optim_state_dict_config=optim_state_dict_config,
)
- attr_name_to_values[attr_name].add(getattr(fsdp_state, attr_name))
- if fsdp_state is root_state:
- continue
- # Relax the assert for non-root FSDP instances in case the nested
- # initialized module is wrapped again in FSDP later (e.g. after
- # training to run inference)
- _p_assert(
- fsdp_state._is_root is None or not fsdp_state._is_root,
- "Non-root FSDP instance's `_is_root` should not have been "
- 'set yet or should have been set to `False`',
+ else:
+ fsdp_context = contextlib.nullcontext
+ return _StateDictInfo(
+ **asdict(options),
+ fqn_param_mapping=fqn_param_mapping,
+ all_fqns=all_fqns,
+ submodule_prefixes=submodule_prefixes,
+ fsdp_context=fsdp_context,
+ fsdp_modules=cast(List[nn.Module], fsdp_modules),
+ handle_model=not optim_only,
+ handle_optim=(len(optims) > 0),
)
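+ # Illustrative only (assumed call pattern, not part of this patch): the
+ # helper is typically driven with a StateDictOptions instance, e.g.
+ #   info = _verify_options_t2p3p0(
+ #       model, (optimizer,), optim_only=False,
+ #       options=StateDictOptions(full_state_dict=False, cpu_offload=True),
+ #   )
+ # which selects the SHARDED_STATE_DICT configs whenever FSDP modules exist.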
- fsdp_state._is_root = False
- # Take care of any new unshard streams we have to create for non-default process groups.
- if fsdp_state_has_default_pg(fsdp_state):
- # If using default process group, unshard stream is the same as root fsdp instance.
- fsdp_state._unshard_stream = root_state._unshard_stream
- else:
- # Otherwise, unshard stream is separate.
- state_pg_ranks = fsdp_state_pg_ranks(fsdp_state)
- if state_pg_ranks in fsdp_pg_unshard_streams:
- # We have created the unshard stream for this process group already. Use it.
- fsdp_state._unshard_stream = fsdp_pg_unshard_streams[state_pg_ranks]
- else:
- # We don't have an unshard stream for this process group yet. Make it.
- fsdp_state._unshard_stream = fsdp_state._device_handle.Stream(priority=unshard_priority)
- fsdp_pg_unshard_streams[state_pg_ranks] = fsdp_state._unshard_stream
-
- # All other stream assignments stay common across all of FSDP.
- fsdp_state._post_backward_stream = root_state._post_backward_stream
- fsdp_state._pre_unshard_stream = root_state._pre_unshard_stream
- fsdp_state._all_reduce_stream = root_state._all_reduce_stream
- fsdp_state._default_stream = root_state._default_stream
- fsdp_state._exec_order_data = root_state._exec_order_data
- fsdp_state._free_event_queue = root_state._free_event_queue
- handle = fsdp_state._handle
- if handle:
- handle.init_flat_param_attributes()
- for attr_name, attr_values in attr_name_to_values.items():
- if len(attr_values) != 1:
- raise ValueError(f'Expects one homogeneous value for {attr_name} but got {attr_values}')
+ from torch.distributed.fsdp._optim_utils import FSDPParamInfo
+ from torch.distributed._state_dict_utils import _gather_state_dict
+ def _shard_orig_param_state(
+ fsdp_param_info: FSDPParamInfo,
+ fqn: str,
+ optim_state: Dict[str, Any],
+ ) -> Dict[str, Any]:
+ """Shard function monkeypatch.
+
+ Shard the optimizer state for the original parameter with the name ``fqn``.
+ This API should only be used when ``use_orig_params`` is True.
+ """
+ if not optim_state:
+ return {}
+ fsdp_state = fsdp_param_info.state
+ flat_param = fsdp_param_info.handle.flat_param
+ param_idx = fsdp_param_info.param_indices[fqn]
+ shard_param_info = flat_param._shard_param_infos[param_idx] # type: ignore[attr-defined]
+ optim_state = _gather_state_dict(
+ optim_state,
+ pg=fsdp_state.process_group,
+ device=fsdp_state.compute_device,
+ )
+ if not shard_param_info.in_shard:
+ return {}
+ # Flatten and shard the state.
+ new_optim_state: Dict[str, Any] = {}
+ intra_param_start_idx = shard_param_info.intra_param_start_idx
+ intra_param_end_idx = shard_param_info.intra_param_end_idx
+ for state_name, value in optim_state.items():
+ if (
+ torch.is_tensor(value)
+ and value.dim() > 0
+ and fsdp_state.sharding_strategy != ShardingStrategy.NO_SHARD
+ ):
+ value = value.flatten()[intra_param_start_idx : intra_param_end_idx + 1].clone() # type: ignore[operator]
+ new_optim_state[state_name] = value
+ torch.cuda.synchronize()
+ return new_optim_state
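+ # Sketch of the slicing above with hypothetical numbers: if this rank's shard
+ # covers elements [256, 511] of a flattened 1024-element parameter, then
+ # intra_param_start_idx=256 and intra_param_end_idx=511, so the rank keeps
+ # value.flatten()[256:512] of each tensor optimizer state (e.g. exp_avg).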
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index c8c6d325e0..b7c9bd4d4a 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -28,18 +28,20 @@
import torch.distributed
import torch.nn as nn
import torch.utils.data
-from packaging import version
+from torch._dynamo import OptimizedModule
from torch.cuda.amp.grad_scaler import GradScaler, _refresh_per_optimizer_state
+from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
from torch.nn.parallel import DistributedDataParallel
+from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import DataLoader, DistributedSampler
from torchmetrics import Metric
-from composer.callbacks import CheckpointSaver, OptimizerMonitor
+from composer.callbacks import CheckpointSaver, MemorySnapshot, OOMObserver, OptimizerMonitor
from composer.core import (Algorithm, AlgorithmPass, Batch, Callback, DataSpec, Engine, Evaluator, Event, Precision,
- PyTorchScheduler, State, Time, Timestamp, TimeUnit, TrainerMode, ensure_data_spec,
- ensure_evaluator, ensure_time, get_precision_context, validate_eval_automicrobatching)
+ State, Time, Timestamp, TimeUnit, TrainerMode, ensure_data_spec, ensure_evaluator,
+ ensure_time, get_precision_context, validate_eval_automicrobatching)
from composer.devices import Device, DeviceCPU, DeviceGPU, DeviceMPS, DeviceTPU
-from composer.loggers import (ConsoleLogger, Logger, LoggerDestination, MosaicMLLogger, ProgressBarLogger,
+from composer.loggers import (ConsoleLogger, Logger, LoggerDestination, MLFlowLogger, MosaicMLLogger, ProgressBarLogger,
RemoteUploaderDownloader, WandBLogger)
from composer.loggers.mosaicml_logger import MOSAICML_ACCESS_TOKEN_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR
from composer.models import ComposerModel
@@ -54,8 +56,9 @@
ensure_tuple, export_with_logger, extract_hparams, format_name_with_dist,
get_composer_env_dict, get_device, get_file, is_tpu_installed, map_collection,
maybe_create_object_store_from_uri, maybe_create_remote_uploader_downloader_from_uri,
- model_eval_mode, parse_uri, reproducibility, using_torch_2)
+ model_eval_mode, parse_uri, partial_format, reproducibility)
from composer.utils.misc import is_model_deepspeed
+from composer.utils.object_store.mlflow_object_store import MLFLOW_EXPERIMENT_ID_FORMAT_KEY, MLFLOW_RUN_ID_FORMAT_KEY
if is_tpu_installed():
import torch_xla.core.xla_model as xm
@@ -66,7 +69,7 @@
__all__ = ['Trainer']
# syntax to shorten the Scheduler type annotations
-Scheduler = Union[ComposerScheduler, PyTorchScheduler]
+Scheduler = Union[ComposerScheduler, LRScheduler]
def _raise_missing_argument_exception(arg_name: str):
@@ -90,7 +93,7 @@ def _scale_max_duration_by_ssr(
def _get_default_scheduler_frequency(schedulers: Optional[Union[Scheduler, Sequence[Scheduler]]]):
- has_pytorch_scheduler = any(isinstance(scheduler, PyTorchScheduler) for scheduler in ensure_tuple(schedulers))
+ has_pytorch_scheduler = any(isinstance(scheduler, LRScheduler) for scheduler in ensure_tuple(schedulers))
if has_pytorch_scheduler:
log.info(('Stepping schedulers every epoch, as a PyTorch scheduler was provided. '
'The trainer cannot automatically convert the parameters (e.g. step_size, T_max) of the '
@@ -124,14 +127,19 @@ def _compile_schedulers(
schedulers: Optional[Union[Scheduler, Sequence[Scheduler]]],
state: State,
scale_schedule_ratio: float,
-) -> List[PyTorchScheduler]:
+) -> List[LRScheduler]:
compiled_schedulers = []
for scheduler in ensure_tuple(schedulers):
- if isinstance(scheduler, PyTorchScheduler):
+ if isinstance(scheduler, LRScheduler):
scale_pytorch_scheduler(scheduler, scale_schedule_ratio)
compiled_schedulers.append(scheduler)
- else: # it's a composer scheduler
- compiled_schedulers.append(compile_composer_scheduler(scheduler, state, scale_schedule_ratio))
+ # It's a composer scheduler
+ else:
+ compiled_schedulers.append(compile_composer_scheduler(
+ scheduler,
+ state,
+ scale_schedule_ratio,
+ ))
return compiled_schedulers
@@ -148,8 +156,7 @@ def _set_evaluator_interval_and_subset_num_batches(
if evaluator.eval_interval is None:
evaluator.eval_interval = eval_interval
eval_dataloader = evaluator.dataloader.dataloader
- if isinstance(eval_dataloader, collections.abc.Sized) and (evaluator.subset_num_batches is None or
- evaluator.subset_num_batches == -1):
+ if isinstance(eval_dataloader, collections.abc.Sized) and evaluator.subset_num_batches == -1:
try:
dataloader_len = len(eval_dataloader)
except TypeError:
@@ -451,7 +458,7 @@ class Trainer:
If ``None``, will be set to ``DecoupledSGDW(model.parameters(), lr=0.1)``. (default: ``None``)
.. seealso:: :mod:`composer.optim` for the different optimizers built into Composer.
- schedulers (PyTorchScheduler | ComposerScheduler | Sequence[PyTorchScheduler | ComposerScheduler], optional):
+ schedulers (LRScheduler | ComposerScheduler | Sequence[LRScheduler | ComposerScheduler], optional):
The learning rate schedulers. If ``[]`` or ``None``, the learning rate will be constant.
(default: ``None``).
@@ -695,6 +702,27 @@ class Trainer:
state. This parameter has no effect if ``save_folder`` is ``None``. (default: ``False``)
.. seealso:: :class:`~.CheckpointSaver`
+ save_ignore_keys (List[str] | (Dict) -> None, optional): A list of paths for the ``state_dict`` of the checkpoint,
+ which, when provided, will be removed from the state_dict before a checkpoint is saved. Each path is a list
+ of strings specifying the keys to index into ``state_dict`` joined together with `/` as a separator (as PyTorch
+ uses `.` in parameter names). If a prefix is provided, all children are also ignored (see Example 2).
+ See :mod:`composer.core.state` for the structure of state_dict.
+
+ Example 1: ``save_ignore_keys = ["state/model/layer1.weights", "state/model/layer1.bias"]`` would ignore
+ layer 1 weights and bias.
+
+ Example 2: ``save_ignore_keys = ["state/model/*"]`` would ignore the entire model, which would have the same
+ effect as the previous example if there was only 1 layer.
+
+ Example 3: ``save_ignore_keys = ["state/model/layer*.weights"]`` would ignore all weights in the model.
+
+ Example 4: ``save_ignore_keys = ["state/rank_zero_seed", "rng"]`` would reset all randomness when
+ saving the checkpoint.
+
+ If a callable, it should take one argument which is the state_dict. The callable is free to arbitrarily modify
+ the state_dict before it is saved.
+
+ (default: ``None``)
save_num_checkpoints_to_keep (int, optional): The number of checkpoints to keep locally. The oldest checkpoints
are removed first. Set to ``-1`` to keep all checkpoints locally. (default: ``-1``)
@@ -824,8 +852,8 @@ def __init__(
# Optimizers and Scheduling
optimizers: Optional[torch.optim.Optimizer] = None,
- schedulers: Optional[Union[ComposerScheduler, PyTorchScheduler, Sequence[Union[ComposerScheduler,
- PyTorchScheduler]]]] = None,
+ schedulers: Optional[Union[ComposerScheduler, LRScheduler, Sequence[Union[ComposerScheduler,
+ LRScheduler]]]] = None,
scale_schedule_ratio: float = 1.0,
step_schedulers_every_batch: Optional[bool] = None,
@@ -861,6 +889,7 @@ def __init__(
save_overwrite: bool = False,
save_interval: Union[str, int, Time, Callable[[State, Event], bool]] = '1ep',
save_weights_only: bool = False,
+ save_ignore_keys: Optional[Union[List[str], Callable[[Dict], None]]] = None,
save_num_checkpoints_to_keep: int = -1,
save_metrics: bool = False,
@@ -922,26 +951,22 @@ def __init__(
_validate_precision(precision, device)
# check if provided model is compiled or not
- is_torch_2_0 = using_torch_2()
is_model_compiled = False
- if is_torch_2_0:
- from torch._dynamo import OptimizedModule
- if isinstance(model, OptimizedModule):
- log.warning(f'Provided `model` is already compiled with `torch.compile`. Ignoring ' +
- f'parameter `compile_config` if provided. If you would like `Trainer` ' +
- f'to takes care of model compilation, provide a not-compiled model and ' +
- f'`compile_config` parameter.')
- # The `torch.compile` function returns an object of type `torch._dynamo.OptimizedModule`
- # which wraps the original `nn.Module` object and later patches its forward method to
- # optimized `self.forward` method.
- is_model_compiled = True
- compiled_model = model._orig_mod
- if not isinstance(compiled_model, ComposerModel):
- raise ValueError(f'Provided `model` must be a subclass of ComposerModel. ' +
- f'Instead found as type `{type(compiled_model)}`')
- compiled_model.forward = model.dynamo_ctx(
- compiled_model.forward) # pyright: ignore [reportGeneralTypeIssues]
- model = compiled_model
+ if isinstance(model, OptimizedModule):
+ log.warning(f'Provided `model` is already compiled with `torch.compile`. Ignoring ' +
+ f'parameter `compile_config` if provided. If you would like `Trainer` ' +
+ f'to take care of model compilation, provide an uncompiled model and a ' +
+ f'`compile_config` parameter.')
+ # The `torch.compile` function returns an object of type `torch._dynamo.OptimizedModule`,
+ # which wraps the original `nn.Module` object and later patches its forward method
+ # with the optimized `self.forward` method.
+ is_model_compiled = True
+ compiled_model = model._orig_mod
+ if not isinstance(compiled_model, ComposerModel):
+ raise ValueError(f'Provided `model` must be a subclass of ComposerModel. ' +
+ f'Instead found as type `{type(compiled_model)}`')
+ compiled_model.forward = model.dynamo_ctx(compiled_model.forward)
+ model = compiled_model
# Microbatching
auto_microbatching = _is_auto_microbatching(device_train_microbatch_size, device=device)
@@ -1047,6 +1072,15 @@ def __init__(
loggers.append(remote_ud)
self.state.profiler.bind_to_state(self.state)
+ # MemorySnapshot, OOMObserver
+ for cb in self.state.callbacks:
+ if isinstance(cb, MemorySnapshot) or isinstance(cb, OOMObserver):
+ if cb.remote_file_name:
+ remote_ud = maybe_create_remote_uploader_downloader_from_uri(uri=cb.remote_file_name,
+ loggers=loggers)
+ if remote_ud is not None:
+ loggers.append(remote_ud)
+
if progress_bar and log_to_console:
warnings.warn(
'Setting both `progress_bar` and `log_to_console` to True is not recommended and will'
@@ -1085,6 +1119,11 @@ def __init__(
mosaicml_logger = MosaicMLLogger()
loggers.append(mosaicml_logger)
+ # Remote Uploader Downloader
+ # Keep the ``RemoteUploaderDownloader`` below client-provided loggers so the loggers' init callbacks run before
+ # the ``RemoteUploaderDownloader`` init. This is necessary to use an ``MLFlowObjectStore`` to log objects to a
+ # run managed by an ``MLFlowLogger``, as the ``MLFlowObjectStore`` relies on the ``MLFlowLogger`` to initialize
+ # the active MLFlow run.
if save_folder is not None:
remote_ud = maybe_create_remote_uploader_downloader_from_uri(save_folder, loggers)
if remote_ud is not None:
@@ -1144,6 +1183,7 @@ def __init__(
latest_remote_file_name=latest_remote_file_name,
overwrite=save_overwrite,
weights_only=save_weights_only,
+ ignore_keys=save_ignore_keys,
save_interval=save_interval,
num_checkpoints_to_keep=save_num_checkpoints_to_keep,
)
@@ -1153,14 +1193,41 @@ def __init__(
self.engine = Engine(state=self.state, logger=self.logger, algorithm_passes=algorithm_passes)
# Set the logger
- self.state.model.logger = self.logger
+ self.state.model.logger = self.logger # pyright: ignore[reportGeneralTypeIssues]
# Run Event.INIT
self.engine.run_event(Event.INIT)
+ # If the experiment is being tracked with an `MLFlowLogger`, then MLFlow experiment and run are available
+ # after Event.INIT.
+ if save_folder is not None:
+ mlflow_logger = None
+ for destination in self.logger.destinations:
+ if isinstance(destination, MLFlowLogger):
+ mlflow_logger = destination
+ break
+
+ if mlflow_logger is not None:
+ mlflow_experiment_id = mlflow_logger._experiment_id
+ mlflow_run_id = mlflow_logger._run_id
+
+ # The save folder and related paths/filenames may contain format placeholders for the MLFlow IDs, so
+ # populate them now.
+ mlflow_format_kwargs = {
+ MLFLOW_EXPERIMENT_ID_FORMAT_KEY: mlflow_experiment_id,
+ MLFLOW_RUN_ID_FORMAT_KEY: mlflow_run_id
+ }
+
+ save_folder = partial_format(save_folder, **mlflow_format_kwargs)
+ if latest_remote_file_name is not None:
+ latest_remote_file_name = partial_format(latest_remote_file_name, **mlflow_format_kwargs)
+
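+ # Example (assuming the documented '{mlflow_experiment_id}' and
+ # '{mlflow_run_id}' placeholders): a save_folder such as
+ # 'dbfs:/.../{mlflow_experiment_id}/{mlflow_run_id}/checkpoints' is resolved
+ # to concrete IDs here, once the MLFlowLogger has started its run.
+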
# Log hparams.
if self.auto_log_hparams:
- self.local_hparams = extract_hparams(locals())
+ locs = locals()
+ if 'cb' in locs:
+ del locs['cb']
+ self.local_hparams = extract_hparams(locs)
self.logger.log_hyperparameters(self.local_hparams)
# Log composer version
@@ -1272,10 +1339,6 @@ def __init__(
self.state.scaler = ClosureGradScaler() if self._use_closures() else GradScaler()
if self.state.fsdp_config is not None:
- if version.parse(torch.__version__) < version.parse('1.13.0'):
- raise RuntimeError('To use FSDP with Composer, you must use torch>=1.13.0.')
- from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
-
# This state should never be reached, but we raise a ValueError just in case
if self._use_closures() and self.state.precision == Precision.AMP_FP16:
raise ValueError(f'Using closures and precision {self.state.precision} is not supported'
@@ -1296,7 +1359,8 @@ def __init__(
# FSDP wrap if not using monolith checkpoint on rank 0 only
if self.state.fsdp_config is not None and fsdp_auto_wrap and not self.state.load_fsdp_monolith_rank0_only:
- prepare_fsdp_module(model, optimizers, self.state.fsdp_config, precision, device, auto_microbatching)
+ with reproducibility.seed_context(self.state.rank_zero_seed):
+ prepare_fsdp_module(model, optimizers, self.state.fsdp_config, precision, device, auto_microbatching)
# Configure Deepspeed
if self.state.deepspeed_config is not None:
@@ -1317,9 +1381,11 @@ def __init__(
self.state.deepspeed_config = _parse_deepspeed_config(self.state.deepspeed_config, state=self.state)
optimizer = ensure_tuple(self.state.optimizers)[0]
log.debug('Initializing deepspeed')
- (self.state.model, self.state.optimizers, _, _) = deepspeed.initialize(config=self.state.deepspeed_config,
- model=self.state.model,
- optimizer=optimizer)
+ (self.state.model, self.state.optimizers, _, _) = deepspeed.initialize(
+ config=self.state.deepspeed_config,
+ model=self.state.model,
+ optimizer=optimizer,
+ )
# Since the DeepSpeed ZeRO optimizer does not inherit torch.optim.Optimizer, the schedulers must be
# compiled and bound BEFORE DeepSpeed initialization. However, this is OK, as the DeepSpeed Zero
# optimizer uses the same underlying parameter groups as the original optimizer. See
@@ -1333,6 +1399,8 @@ def __init__(
if 'optimizers' in self.state.serialized_attributes:
self.state.serialized_attributes.remove('optimizers')
+ self.engine.run_event(Event.BEFORE_LOAD)
+
# Load Checkpoint
self._rng_state = None
# If autoresume is enabled, first check for existing checkpoints to load
@@ -1347,8 +1415,6 @@ def __init__(
'latest existing checkpoint in `save_folder`. ')
if save_latest_filename is None:
error_message += 'The `save_latest_filename` must be specified so autoresume knows where to load checkpoints from. '
- if run_name is None:
- error_message += 'The `run_name` must be specified when using autoresume so Event.INIT is run with the correct run name. '
if error_message != '':
raise ValueError(error_message)
assert save_folder is not None
@@ -1443,14 +1509,15 @@ def __init__(
# FSDP wrap if model is not yet wrapped and FSDP is enabled. This can happen if
# load_fsdp_monolith_rank0_only=True but no checkpoint was loaded.
if not self.state.fsdp_enabled and self.state.fsdp_config is not None and self.state.fsdp_auto_wrap and self.state.load_fsdp_monolith_rank0_only:
- prepare_fsdp_module(model, optimizers, self.state.fsdp_config, precision, device, auto_microbatching)
+ with reproducibility.seed_context(self.state.rank_zero_seed):
+ prepare_fsdp_module(model, optimizers, self.state.fsdp_config, precision, device, auto_microbatching)
self.engine.run_event(Event.AFTER_LOAD)
# reseed here. This helps with a couple of issues:
- # 1. rng state may change at Event.INIT/Event.AFTER_LOAD. For example, if an algorithm
- # creates a new module and module parameters are initialized randomly, rng state will
- # change. This reseeding nullifies such effects.
+ # 1. rng state may change at Event.INIT/Event.BEFORE_LOAD/Event.AFTER_LOAD. For example,
+ # if an algorithm creates a new module and module parameters are initialized randomly, rng
+ # state will change. This reseeding nullifies such effects.
# 2. While resuming from a checkpoint, we want to spin dataloader and bring it back to the
# same state as at the time of the checkpoint. Therefore, spinning needs to start from the
# same rng state as in the original run.
@@ -1463,9 +1530,8 @@ def __init__(
# The model would need to be torch.compile()'d after being wrapped in a distributed strategy
# to take advantage of any graph breaks.
- if is_torch_2_0 and not is_model_compiled and compile_config is not None:
- compiled_model = torch.compile( # pyright: ignore [reportGeneralTypeIssues]
- self.state.model, **compile_config)
+ if not is_model_compiled and compile_config is not None:
+ compiled_model = torch.compile(self.state.model, **compile_config)
self.state.model = compiled_model._orig_mod
self.state.model.forward = compiled_model.dynamo_ctx(self.state.model.forward)
is_model_compiled = True
@@ -1473,10 +1539,6 @@ def __init__(
# debugging purpose and for unit test.
if self.auto_log_hparams:
self.local_hparams['is_model_compiled'] = is_model_compiled
- elif not is_torch_2_0 and compile_config is not None:
- raise ValueError(f'`torch.compile` is supported for PyTorch 2.0 or higher.' +
- f'Either update your PyTorch version or disable parameter by providing ' +
- f'`compile_config` to `None`.')
@property
def saved_checkpoints(self) -> List[str]:
@@ -1625,8 +1687,8 @@ def fit(
reset_time: bool = False,
# Schedulers
- schedulers: Optional[Union[ComposerScheduler, PyTorchScheduler, Sequence[Union[ComposerScheduler,
- PyTorchScheduler]]]] = None,
+ schedulers: Optional[Union[ComposerScheduler, LRScheduler, Sequence[Union[ComposerScheduler,
+ LRScheduler]]]] = None,
scale_schedule_ratio: float = 1.0,
step_schedulers_every_batch: Optional[bool] = None,
@@ -1739,7 +1801,7 @@ def fit(
If ``reset_time`` is True, then :attr:`.State.max_duration` will be set to this parameter.
optimizers (torch.optim.Optimizer | Sequence[torch.optim.Optimizer], optional): See :class:`.Trainer`.
- schedulers (PyTorchScheduler | ComposerScheduler | Sequence[PyTorchScheduler | ComposerScheduler], optional): See :class:`.Trainer`.
+ schedulers (LRScheduler | ComposerScheduler | Sequence[LRScheduler | ComposerScheduler], optional): See :class:`.Trainer`.
scale_schedule_ratio (float, optional): See :class:`.Trainer`.
step_schedulers_every_batch (bool, optional): See :class:`.Trainer`.
eval_dataloader (Iterable | DataSpec | Evaluator | Sequence[Evaluator], optional): See :class:`.Trainer`.
@@ -1784,6 +1846,7 @@ def fit(
if self.state.max_duration is None:
_raise_missing_argument_exception('max_duration')
+ assert self.state.max_duration is not None
if self.state.dataloader_len is None and self.state.max_duration.unit == TimeUnit.EPOCH:
raise ValueError(
@@ -1932,6 +1995,7 @@ def _compute_and_log_metrics(self, dataloader_label: str, metrics: Dict[str, Met
for metric_name, metric in metrics.items():
assert isinstance(metric, Metric)
if dataloader_label == 'train':
+ assert self.state.train_metrics is not None
self.state.train_metrics[metric_name] = metric
self.state.train_metric_values[metric_name] = computed_metrics[metric_name]
else:
@@ -2022,6 +2086,7 @@ def _train_loop(self) -> None:
# asserted to be not None when Trainer.fit() is called
raise RuntimeError('max_duration must be specified when initializing the Trainer')
+ log.debug('Starting training loop')
while self.state.timestamp < self.state.max_duration:
if int(self.state.timestamp.batch_in_epoch) == 0:
self.engine.run_event(Event.EPOCH_START)
@@ -2070,7 +2135,7 @@ def _train_loop(self) -> None:
self.state.scaler.update()
# total_loss_dict can be None if gradient scaling failed
- if total_loss_dict is not None:
+ if total_loss_dict is not None: # pyright: ignore[reportUnnecessaryComparison]
map_collection(total_loss_dict, dist.all_reduce)
total_loss_dict = {
k: loss.cpu().item() / dist.get_world_size() for k, loss in total_loss_dict.items()
@@ -2099,7 +2164,7 @@ def _train_loop(self) -> None:
for scheduler in self.state.schedulers:
scheduler.step()
- if self.state.train_metrics is not None:
+ if self.state.train_metrics is not None: # pyright: ignore[reportUnnecessaryComparison]
self._compute_and_log_metrics(
dataloader_label='train',
metrics=self.state.train_metrics,
@@ -2134,7 +2199,7 @@ def _train_loop(self) -> None:
# This happens if the "break" did not trigger above, or if it
# did (e.g. duration specified in samples/batches/tokens), but it is still
# the end of the dataloader (i.e. next(dataloader) would raise StopIteration)
- if self.state.train_metrics is not None:
+ if self.state.train_metrics is not None: # pyright: ignore[reportUnnecessaryComparison]
self.state.train_metrics = self._ensure_metrics_device_and_dtype(self.state.train_metrics)
self._compute_and_log_metrics(
dataloader_label='train',
@@ -2230,7 +2295,7 @@ def _train_batch(self, use_grad_scaling: bool) -> Dict[str, torch.Tensor]:
while True:
# Reset train_metrics on every batch
# Placing reset here ensures that if auto grad accum catches an OOM, incomplete metric state is cleared
- if self.state.train_metrics is not None:
+ if self.state.train_metrics is not None: # pyright: ignore[reportUnnecessaryComparison]
for metric in self.state.train_metrics.values():
metric.reset()
@@ -2454,6 +2519,7 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int,
else:
microbatch_loss = self.state.device.tensor_to_device(torch.zeros(size=(1,)))
for loss in ensure_tuple(self.state.loss):
+ assert isinstance(loss, torch.Tensor)
microbatch_loss.add_(loss.mean())
# Copy the loss if it is a dictionary
@@ -2471,7 +2537,7 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int,
microbatch_loss_dict[k] = loss.detach().clone().mean() * (microbatch_num_samples / current_batch_size)
if use_grad_scaling:
- microbatch_loss = cast(torch.Tensor, self.state.scaler.scale(microbatch_loss))
+ microbatch_loss = cast(torch.Tensor, self.state.scaler.scale(microbatch_loss)) # type: ignore
if self.state.deepspeed_enabled:
self.state.deepspeed_model.backward(microbatch_loss)
@@ -2483,7 +2549,8 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int,
self.engine.run_event(Event.AFTER_BACKWARD)
# Use microbatch outputs to update training metrics
- if self.state.train_metrics is not None and len(self.state.train_metrics) != 0:
+ if (self.state.train_metrics is not None and # pyright: ignore[reportUnnecessaryComparison]
+ len(self.state.train_metrics) != 0):
self.state.train_metrics = self._ensure_metrics_device_and_dtype(self.state.train_metrics)
self._eval_train_metrics(device_batch)
@@ -2592,8 +2659,7 @@ def predict_batch_end(self, state: State, logger: Logger) -> None:
self.state.batch = self.state.device.batch_to_device(self.state.batch)
# Perform any device transforms
- if data_spec.device_transforms is not None:
- self.state.batch = data_spec.device_transforms(self.state.batch)
+ self.state.batch = data_spec.device_transforms(self.state.batch)
# Count the batch size and num tokens before any events run
rank_num_samples = data_spec.get_num_samples_in_batch(self.state.batch)
@@ -2856,8 +2922,7 @@ def _eval_loop(
for self.state.batch in self._iter_dataloader(TrainerMode.EVAL):
self.state.batch = self.state.device.batch_to_device(self.state.batch)
- if data_spec.device_transforms is not None:
- self.state.batch = data_spec.device_transforms(self.state.batch)
+ self.state.batch = data_spec.device_transforms(self.state.batch)
# Count the batch size and num tokens before any events run
rank_num_samples = data_spec.get_num_samples_in_batch(self.state.batch)
@@ -2927,6 +2992,13 @@ def _eval_loop(
outputs[k] = v.cpu()
else:
outputs[k] = v
+ elif isinstance(self.state.outputs, Sequence):
+ outputs = []
+ for v in self.state.outputs:
+ if isinstance(v, torch.Tensor):
+ outputs.append(v.cpu())
+ else:
+ outputs.append(v)
else:
outputs = self.state.outputs.cpu()
else:
@@ -3070,7 +3142,7 @@ def _use_closures(self) -> bool:
if self.state.precision != Precision.AMP_FP16:
return True
- if self.state.optimizers is None:
+ if not hasattr(self.state, 'optimizers'):
raise RuntimeError('state.optimizers must be set before `_use_closures` can be determined')
return all(
diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py
index 30930250d9..67ed33cdd3 100644
--- a/composer/utils/__init__.py
+++ b/composer/utils/__init__.py
@@ -6,7 +6,8 @@
from composer.utils.auto_log_hparams import (convert_flat_dict_to_nested_dict, convert_nested_dict_to_flat_dict,
extract_hparams)
from composer.utils.batch_helpers import batch_get, batch_set
-from composer.utils.checkpoint import PartialFilePath, load_checkpoint, safe_torch_load, save_checkpoint
+from composer.utils.checkpoint import (PartialFilePath, get_save_filename, load_checkpoint, safe_torch_load,
+ save_checkpoint)
from composer.utils.collect_env import (configure_excepthook, disable_env_report, enable_env_report,
get_composer_env_dict, print_env)
from composer.utils.device import get_device, is_hpu_installed, is_tpu_installed
@@ -20,7 +21,7 @@
from composer.utils.inference import ExportFormat, Transform, export_for_inference, export_with_logger, quantize_dynamic
from composer.utils.iter_helpers import IteratorFileStream, ensure_tuple, map_collection
from composer.utils.misc import (create_interval_scheduler, get_free_tcp_port, is_model_deepspeed, is_model_fsdp,
- is_notebook, model_eval_mode, using_torch_2)
+ is_notebook, model_eval_mode, partial_format)
from composer.utils.object_store import (GCSObjectStore, LibcloudObjectStore, MLFlowObjectStore, ObjectStore,
ObjectStoreTransientError, OCIObjectStore, S3ObjectStore, SFTPObjectStore,
UCObjectStore)
@@ -42,11 +43,12 @@
'LibcloudObjectStore',
'S3ObjectStore',
'SFTPObjectStore',
+ 'MLFlowObjectStore',
'OCIObjectStore',
'GCSObjectStore',
'UCObjectStore',
- 'MLFlowObjectStore',
'MissingConditionalImportError',
+ 'get_save_filename',
'import_object',
'is_model_deepspeed',
'is_model_fsdp',
@@ -84,10 +86,10 @@
'extract_hparams',
'convert_nested_dict_to_flat_dict',
'convert_flat_dict_to_nested_dict',
- 'using_torch_2',
'create_interval_scheduler',
'EvalClient',
'LambdaEvalClient',
'LocalEvalClient',
'MosaicMLLambdaEvalClient',
+ 'partial_format',
]
diff --git a/composer/utils/batch_helpers.py b/composer/utils/batch_helpers.py
index c897fccd5c..5778776dd2 100644
--- a/composer/utils/batch_helpers.py
+++ b/composer/utils/batch_helpers.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
"""Helpers to get items and set items in a batch."""
+from __future__ import annotations
from operator import attrgetter, itemgetter
from typing import Any, Callable, Sequence, Union, cast
@@ -9,7 +10,7 @@
__all__ = ['batch_get', 'batch_set']
-def batch_get(batch: Any, key: Union[str, int, Callable, Any]):
+def batch_get(batch: Any, key: Union[str, int, tuple[Callable, Callable], Callable, Any]):
"""Indexes into the batch given the key.
>>> from composer.utils.batch_helpers import batch_get
@@ -27,7 +28,7 @@ def batch_get(batch: Any, key: Union[str, int, Callable, Any]):
Can be any arbitrary type that the user creates, but we assume some sort of
sequence (list, tuple, tensor, array), mapping (dictionary),
or attribute store (object with data members, namedtuple).
- key (str | int | Tuple[Callable, Callable] | Any, optional): A key to index into the batch or a
+ key (str | int | Tuple[Callable, Callable] | Callable | Any, optional): A key to index into the batch or a
user-specified function to do the extracting. A pair of callables is also
supported for cases where a get and set function pair are both passed
(like in Algorithms). The getter is assumed to be the first of the pair.
@@ -58,7 +59,7 @@ def batch_get(batch: Any, key: Union[str, int, Callable, Any]):
return attrgetter(*key)(batch)
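+# Hedged example of the pair-of-callables key (illustrative names): the getter
+# is the first element of the pair, so
+#   get_fn = lambda b: b['inputs']
+#   set_fn = lambda b, v: dict(b, inputs=v)
+#   batch_get(batch, (get_fn, set_fn))  # returns batch['inputs'] via get_fn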
-def batch_set(batch: Any, key: Union[str, int, Callable, Any], value: Any) -> Any:
+def batch_set(batch: Any, key: Union[str, int, tuple[Callable, Callable], Callable, Any], value: Any) -> Any:
"""Indexes into the batch given the key and sets the element at that index to value.
This is not an in-place operation for batches of type tuple as tuples are not mutable.
@@ -83,7 +84,7 @@ def batch_set(batch: Any, key: Union[str, int, Callable, Any], value: Any) -> An
Can be any arbitrary type that the user creates, but we assume some sort of
sequence (list, tuple, tensor, array), mapping (dictionary),
or attribute store (object with data members, namedtuple).
- key (str | int | Tuple[Callable, Callable] | Any, optional): A key to index into the batch or a user-specified function
+ key (str | int | Tuple[Callable, Callable] | Callable | Any, optional): A key to index into the batch or a user-specified function
to do the setting. A pair of callables is also supported for cases where a get
and set function pair are both passed (like in Algorithms). The setter is
assumed to be the second of the pair.
diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py
index f7610b6daf..a50a2db27d 100644
--- a/composer/utils/checkpoint.py
+++ b/composer/utils/checkpoint.py
@@ -16,15 +16,20 @@
import warnings
from importlib import import_module
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
import torch
from packaging import version
+from torch.distributed import checkpoint as dist_cp
+from torch.distributed._tensor import DeviceMesh
+from torch.distributed.checkpoint.metadata import Metadata
+from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
+from torch.distributed.checkpoint.planner import LoadPlan, LoadPlanner
from composer.utils import dist, reproducibility
from composer.utils.file_helpers import (FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, format_name_with_dist,
format_name_with_dist_and_time, get_file, is_tar)
-from composer.utils.misc import is_model_deepspeed, using_torch_2
+from composer.utils.misc import is_model_deepspeed, partial_format
from composer.utils.object_store import ObjectStore
if TYPE_CHECKING:
@@ -33,7 +38,7 @@
log = logging.getLogger(__name__)
-__all__ = ['load_checkpoint', 'save_checkpoint', 'download_checkpoint']
+__all__ = ['get_save_filename', 'load_checkpoint', 'save_checkpoint', 'download_checkpoint']
_COMPOSER_STATES_FILENAME = 'composer_states.pt'
_DEEPSPEED_TAG = 'deepspeed' # always tag with the same, deterministic name. We'll rename the tarball to the appropriate name.
@@ -131,6 +136,178 @@ def _get_write_mode(name: str) -> str:
raise ValueError(f'{name} does not end with a valid tarfile extension.')
+def _get_num_ranks_that_saved_rng(metadata: Metadata):
+ rng_inds = []
+ for field_name, field_value in metadata.planner_data.items():
+ if 'rng' in field_name:
+ _, rng_rank_index, _ = field_value
+ rng_inds.append(rng_rank_index)
+ rng_inds = set(rng_inds)
+ return len(rng_inds)
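+# planner_data maps saved keys to tuples whose second element is the saving
+# rank's index; e.g. (hypothetical) entries 'rng.0' -> (_, 0, _) and
+# 'rng.1' -> (_, 1, _) yield two distinct rank indices, i.e. 2 ranks saved
+# RNG state and the remaining ranks can reseed themselves.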
+
+
+class FileSystemReaderWithValidation(dist_cp.FileSystemReader):
+ """FileSystemReader that validates checkpoint files prior to reading."""
+
+ def __init__(self, path: str):
+ if _get_checkpoint_validation_function() is None:
+ log.info('No checkpoint validation function found when loading sharded checkpoints.')
+ super().__init__(path)
+
+ def read_data(self, plan: LoadPlan, planner: LoadPlanner):
+ """Reads data file.
+
+ Raises:
+ ValueError if the data file is invalid.
+ """
+ validated_checkpoint_paths = set()
+ for read_item in plan.items:
+ data_path = self.path / self.storage_data[read_item.storage_index].relative_path
+ if data_path in validated_checkpoint_paths:
+ continue
+ _ensure_valid_checkpoint(data_path)
+ validated_checkpoint_paths.add(data_path)
+ return super().read_data(plan, planner)
+
+ def read_metadata(self) -> Metadata:
+ """Reads metadata file.
+
+ Raises:
+ ValueError if the metadata file is invalid.
+ """
+ metadata_file_path = self.path / '.metadata'
+ _ensure_valid_checkpoint(metadata_file_path)
+ return super().read_metadata()
+
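+# Minimal usage sketch (an assumption, mirroring the torch.distributed
+# checkpoint API rather than code in this patch):
+#   reader = FileSystemReaderWithValidation('/path/to/sharded_checkpoint')
+#   dist_cp.load_state_dict(state_dict=state_dict, storage_reader=reader)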
+
+# A subclass of FileSystemReaderWithValidation that downloads files from the object store before reading them from the local filesystem.
+class DistCPObjectStoreReader(FileSystemReaderWithValidation):
+
+ def __init__(self, source_path: str, destination_path: str, object_store: Union[ObjectStore, LoggerDestination],
+ device_mesh: Optional[DeviceMesh]):
+ self.source_path = source_path
+ self.destination_path = destination_path
+ self.object_store = object_store
+ self.device_mesh = device_mesh
+
+ # Download metadata file.
+ Path(self.destination_path).mkdir(parents=True, exist_ok=True)
+ metadata_destination = os.path.join(self.destination_path, '.metadata')
+ if dist.get_local_rank() == 0:
+ metadata_path = str(Path(source_path) / Path('.metadata'))
+ if isinstance(object_store, ObjectStore):
+ object_store.download_object(
+ object_name=metadata_path,
+ filename=metadata_destination,
+ )
+ else:
+ object_store.download_file(
+ remote_file_name=metadata_path,
+ destination=metadata_destination,
+ )
+ dist.barrier()
+
+ # FileSystemReader takes in a root directory in its constructor, which is the dir where
+ # the metadata is expected to be stored. It is also the parent directory for any shard file relative paths
+ # specified in the metadata file.
+ super().__init__(destination_path)
+
+ def read_data(self, plan: LoadPlan, planner: LoadPlanner):
+ # Download files if not using HSDP or if on first replica with HSDP enabled
+ first_replica = self.device_mesh is None or self.device_mesh.ndim == 1 or (
+ self.device_mesh.ndim >= 2 and self.device_mesh.get_local_rank(mesh_dim=0) == 0)
+
+ # 1. Collect the relative paths to download for all ranks for deduplication
+ relative_file_paths = set()
+ for plan_item in plan.items:
+ relative_file_paths.add(self.storage_data[plan_item.storage_index].relative_path)
+ all_file_paths = dist.all_gather_object(relative_file_paths)
+
+ # 2. Download to the destination all files this rank needs if on first replica
+ if first_replica:
+ log.debug(f'Rank {dist.get_global_rank()} starting to download files.')
+
+ # Get the lowest rank in the current node
+ local_rank_0 = dist.get_global_rank() - dist.get_local_rank()
+
+ for plan_item in plan.items:
+ relative_file_path = self.storage_data[plan_item.storage_index].relative_path
+ # Check if the file is scheduled to be downloaded by a lower rank on the same node
+ # i.e. if rank 0 and rank 1 on the same node have the same required file,
+ # only rank 0 should download it and not rank 1.
+ is_downloaded = any(
+ relative_file_path in all_file_paths[i] for i in range(local_rank_0, dist.get_global_rank()))
+
+ # Download the shard file to the relative path it's associated with, under
+ # the root directory passed to the FileSystemReader constructor.
+ file_destination = str(Path(self.destination_path) / Path(relative_file_path))
+
+ # The file could have already been downloaded as different plan items can point to same file.
+ if not is_downloaded and not os.path.exists(file_destination):
+ log.debug(f'Downloading {relative_file_path} to {file_destination}.')
+ object_name = str(Path(self.source_path) / Path(relative_file_path))
+ if isinstance(self.object_store, ObjectStore):
+ self.object_store.download_object(
+ object_name=object_name,
+ filename=file_destination,
+ )
+ else:
+ self.object_store.download_file(
+ remote_file_name=object_name,
+ destination=file_destination,
+ )
+ log.debug(f'Finished downloading {relative_file_path} to {file_destination}.')
+
+ # 3. Wait for all ranks to finish.
+ log.debug(f'Rank {dist.get_global_rank()} finished downloading all files.')
+ dist.barrier()
+ log.debug('Done waiting for all ranks to finish downloading files.')
+
+ # 4. Broadcast files to all other replicas if HSDP
+ if self.device_mesh is not None and self.device_mesh.ndim == 2:
+ # Broadcast file to all replicas
+ replicate_process_group = self.device_mesh.get_group(0)
+ shard_size = self.device_mesh.size(1)
+ rank_in_first_replica = dist.get_global_rank() % shard_size
+ sender = dist.get_global_rank() == rank_in_first_replica
+ receiver = dist.get_global_rank() != rank_in_first_replica
+
+ # Send list of files to all ranks
+ file_list = [sorted(os.listdir(self.destination_path))]
+ dist.broadcast_object_list(file_list, src=rank_in_first_replica, group=replicate_process_group)
+ file_list = file_list[0]
+ log.debug(f'List of files to broadcast: {file_list}')
+
+ # Send each file to the appropriate rank
+ for file_name in file_list:
+ if 'metadata' in file_name: # All ranks already have the metadata file
+ continue
+ if dist.get_local_rank() == 0: # Only 1 rank per node needs to transfer file
+ full_path = os.path.join(self.destination_path, file_name)
+ log.debug(f'Transferring {full_path=}')
+ file_object = [None]
+ if sender:
+ with open(full_path, 'rb') as f:
+ file_object = [{'content': f.read()}]
+ dist.broadcast_object_list(file_object,
+ src=dist.get_global_rank() % shard_size,
+ group=replicate_process_group)
+ received_file_object = file_object[0]
+ assert received_file_object is not None
+ if receiver and not os.path.exists(full_path):
+ with open(full_path, 'wb') as f:
+ f.write(received_file_object['content'])
+
+ log.debug(f'Rank {dist.get_global_rank()} finished transferring files to all ranks.')
+ dist.barrier()
+ log.debug(
+ f'Done waiting for all ranks to finish transferring files. Local checkpoint files: {os.listdir(self.destination_path)}'
+ )
+
+ # 5. Piggyback off of the FileSystemReader to read all the files now that they are downloaded.
+ return super().read_data(plan, planner)
+
+
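The download-dedup and HSDP broadcast logic above combine two patterns that are easier to see in isolation. Below is a minimal sketch of the per-file broadcast, assuming torch.distributed is initialized and `replicate_group` connects the corresponding rank of every replica, with `src_rank` the sender in the first replica; this is illustrative, not part of the diff.

```python
from composer.utils import dist

def broadcast_file(full_path: str, src_rank: int, replicate_group) -> None:
    # Only the source rank reads the file; everyone participates in the broadcast.
    payload = [None]
    if dist.get_global_rank() == src_rank:
        with open(full_path, 'rb') as f:
            payload = [f.read()]
    dist.broadcast_object_list(payload, src=src_rank, group=replicate_group)
    if dist.get_global_rank() != src_rank:
        content = payload[0]
        assert content is not None  # filled in by the broadcast
        with open(full_path, 'wb') as f:
            f.write(content)
```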
class PartialFilePath:
def __init__(self, filename: str, folder: Optional[str] = None):
@@ -170,7 +347,16 @@ def is_checkpoint_legacy_sharded(object_store: Optional[ObjectStore], source_pat
try:
with tempfile.TemporaryDirectory() as temp_dir:
metadata_destination = os.path.join(str(temp_dir), '.metadata')
- object_store.download_object(object_name=metadata_path, filename=metadata_destination)
+ if isinstance(object_store, ObjectStore):
+ object_store.download_object(
+ object_name=metadata_path,
+ filename=metadata_destination,
+ )
+ else:
+ object_store.download_file(
+ remote_file_name=metadata_path,
+ destination=metadata_destination,
+ )
return False
except FileNotFoundError:
return True
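The isinstance dispatch above (also used in `read_data`) can be captured in one helper. A sketch, where the else-branch stands for any store-like object exposing `download_file`, exactly as in the two branches of this diff:

```python
def _download(store, remote_name: str, local_path: str) -> None:
    # Plain ObjectStores expose download_object; the other stores handled
    # in this diff expose download_file instead.
    if isinstance(store, ObjectStore):
        store.download_object(object_name=remote_name, filename=local_path)
    else:
        store.download_file(remote_file_name=remote_name, destination=local_path)
```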
@@ -276,6 +462,7 @@ def load_checkpoint(
Optional[list[dict[str, Any]]]: The RNG state dicts, indexed by global rank, if
:attr:`load_weights_only` is not None. Otherwise, None.
"""
+ path = partial_format(path, run_name=state.run_name)
using_legacy_sharded = False
if state.fsdp_elastic_sharded_enabled:
assert object_store is None or isinstance(
@@ -367,12 +554,7 @@ def load_sharded_checkpoint(
ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None,
exclude_algorithms: Optional[list[str]] = None,
algorithm_passes: Optional[list[AlgorithmPass]] = None,
-) -> list[dict]:
-
- if not using_torch_2():
- raise ValueError(
- f'Sharded checkpoint loading requires torch version >= 2.0.0. You have torch version {torch.__version__}')
-
+) -> Union[list[dict], None]:
using_multinode = dist.get_world_size() != dist.get_local_world_size()
if not version.parse(torch.__version__) >= version.parse('2.0.1') and using_multinode:
raise ValueError(
@@ -381,93 +563,6 @@ def load_sharded_checkpoint(
if state.fsdp_config is None:
raise ValueError('Loading a sharded checkpoint requires passing an FSDP config to Trainer.')
- load_planner = state.fsdp_config['load_planner']
- _validate_load_planner(load_planner)
-
- from torch.distributed import checkpoint as dist_cp
- from torch.distributed.checkpoint.metadata import Metadata
- from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
- from torch.distributed.checkpoint.planner import LoadPlan, LoadPlanner
-
- # This function is used so we can figure out which ranks need to load saved rngs and which can just make their own.
- def _get_num_ranks_that_saved_rng(metadata: Metadata):
- rng_inds = []
- for field_name, field_value in metadata.planner_data.items():
- if 'rng' in field_name:
- _, rng_rank_index, _ = field_value
- rng_inds.append(rng_rank_index)
- rng_inds = set(rng_inds)
- return len(rng_inds)
-
- class FileSystemReaderWithValidation(dist_cp.FileSystemReader):
- """FileSystemReader that validates checkpoint files prior to reading."""
-
- def __init__(self, path: str):
- if _get_checkpoint_validation_function() is None:
- log.info('No checkpoint validation function found when loading sharded checkpoints.')
- super().__init__(path)
-
- def read_data(self, plan: LoadPlan, planner: LoadPlanner):
- """Reads data file.
-
- Raises:
- ValueError if the data file is invalid.
- """
- for read_item in plan.items:
- data_path = self.path / self.storage_data[read_item.storage_index].relative_path
- _ensure_valid_checkpoint(data_path)
- return super().read_data(plan, planner)
-
- def read_metadata(self) -> Metadata:
- """Reads metadata file.
-
- Raises:
- ValueError if the metadata file is invalid.
- """
- metadata_file_path = self.path / '.metadata'
- _ensure_valid_checkpoint(metadata_file_path)
- return super().read_metadata()
-
- # A subclass of FileSystemReaderWithValidation that downloads files from the object store before reading them from the local filesystem.
- class DistCPObjectStoreReader(FileSystemReaderWithValidation):
-
- def __init__(self, source_path: str, destination_path: str, object_store):
- self.source_path = source_path
- self.destination_path = destination_path
- self.object_store = object_store
-
- # Download metadata file.
- Path(self.destination_path).mkdir(parents=True, exist_ok=True)
- metadata_destination = os.path.join(self.destination_path, '.metadata')
- if dist.get_local_rank() == 0:
- object_store.download_object(object_name=str(Path(source_path) / Path('.metadata')),
- filename=metadata_destination)
- dist.barrier()
-
- # FileSystemReader takes in a root directory in its constructor, which is the dir where
- # the metadata is expected to be stored. Also, this is parent directory for any shard file relative paths
- # specified in the metadata file.
- super().__init__(destination_path)
-
- def read_data(self, plan: LoadPlan, planner: LoadPlanner):
- # 1. Download to the destination all files that this rank is responsible for.
- for plan_item in plan.items:
- # Each plan item has a storage index which points to the relative path of the shard file at save time.
- relative_file_path = self.storage_data[plan_item.storage_index].relative_path
- # Download the shard file to the relative path it's associated to and save that relative path
- # to the root directory specified to the FileSystem reader constructor.
- file_destination = str(Path(self.destination_path) / Path(relative_file_path))
- # The file could have already been downloaded as diffeent plan items can point to same file.
- if not os.path.exists(file_destination):
- self.object_store.download_object(object_name=str(
- Path(self.source_path) / Path(relative_file_path)),
- filename=file_destination)
-
- # 2. Wait for all ranks to finish.
- dist.barrier()
-
- # 3. Piggyback off of the FileSystemReader to read all the files now that they are downloaded.
- return super().read_data(plan, planner)
# Check to make sure source_path is a directory.
if object_store is None:
@@ -486,39 +581,58 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner):
# Get the tempfile made on local rank 0.
local_rank0_index = dist.get_global_rank() - dist.get_local_rank()
rank0_download_tempdir = str(dist.all_gather_object(temp_download_dir)[local_rank0_index])
- storage_reader = DistCPObjectStoreReader(source_path=source_path,
- destination_path=str(
- Path(rank0_download_tempdir) / Path('checkpoints')),
- object_store=object_store)
+ storage_reader = DistCPObjectStoreReader(
+ source_path=source_path,
+ destination_path=str(Path(rank0_download_tempdir) / Path('checkpoints')),
+ object_store=object_store,
+ device_mesh=state.fsdp_device_mesh,
+ )
else:
storage_reader = FileSystemReaderWithValidation(source_path)
# We need no_grad because we overwrite tensor values with set_() when we do elastic loading and we don't want the set_ op recorded in the computation graph.
with torch.no_grad():
# 1. Load model and metadata first
- model_state_dict = None
if load_weights_only:
- model_state_dict = {'state': {'model': state.get_model_state_dict()}}
+ state_dict: Dict[str, Any] = {'state': {'model': state.get_model_state_dict()}}
else:
cur_state_dict = state.state_dict()
- cur_state_dict.pop('optimizers')
- model_state_dict = {'state': cur_state_dict}
+ # For older versions of torch, we load the optimizer state separately.
+ if version.parse(torch.__version__) < version.parse('2.2.9'):
+ cur_state_dict.pop('optimizers')
+ num_rng_ranks = _get_num_ranks_that_saved_rng(storage_reader.read_metadata())
+ state_dict: Dict[str, Any] = {
+ 'state': cur_state_dict,
+ 'rng': reproducibility.get_rng_state()[:num_rng_ranks],
+ }
if ignore_keys:
# Filter provided list of key paths
if not callable(ignore_keys):
ignore_keys = glob_filter(ignore_keys)
# Call function to modify state_dict
- ignore_keys(model_state_dict)
-
- dist_cp.load_state_dict(
- state_dict=model_state_dict,
- storage_reader=storage_reader,
- planner=load_planner,
- )
+ ignore_keys(state_dict)
+ # Ensure state exists
+ state_dict['state'] = state_dict.get('state', {})
+
+ if version.parse(torch.__version__) > version.parse('2.2.9'):
+ dist_cp.load( # type: ignore
+ state_dict=state_dict,
+ storage_reader=storage_reader,
+ planner=state.fsdp_config['load_planner'],
+ no_dist=(not dist.is_initialized()),
+ )
+ else:
+ dist_cp.load_state_dict(
+ state_dict=state_dict,
+ storage_reader=storage_reader,
+ planner=state.fsdp_config['load_planner'],
+ no_dist=(not dist.is_initialized()),
+ )
+ log.info('Loaded state dict')
state.load_state_dict(
- model_state_dict['state'],
+ state_dict['state'],
logger,
strict=strict_model_weights,
exclude_algorithms=exclude_algorithms,
@@ -526,32 +640,14 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner):
)
# 2. Optionally load optimizer
- if not load_weights_only:
+ # On torch newer than 2.2.9, the optimizer state has already been loaded above.
+ if version.parse(torch.__version__) < version.parse('2.2.9') and not load_weights_only:
optim_state = load_sharded_optimizer_state_dict(model_state_dict=state.state_dict()['model'],
optimizer_key='optimizers',
storage_reader=storage_reader)
- state.load_optim_state(optim_state)
-
- # 3. Optionally load RNG
- rng_state_dicts = reproducibility.get_rng_state()
- if not load_weights_only:
- # If we are resuming on more ranks than were used at save time we only want to load in rngs for those ranks
- num_ranks_that_saved_rng = _get_num_ranks_that_saved_rng(storage_reader.read_metadata())
- rng_state_dicts_load = {}
- rng_state_dicts_load['rng'] = rng_state_dicts[:num_ranks_that_saved_rng] if len(
- rng_state_dicts) > num_ranks_that_saved_rng else rng_state_dicts
- dist_cp.load_state_dict(
- state_dict=rng_state_dicts_load,
- storage_reader=storage_reader,
- planner=load_planner,
- )
- # We also want to append newly generated rng states for the ranks that don't have an rng state to load in
- # if we are resuming on more ranks than were used at save time.
- if len(rng_state_dicts) > num_ranks_that_saved_rng:
- rng_state_dicts_load['rng'].extend(rng_state_dicts[num_ranks_that_saved_rng:])
- rng_state_dicts = rng_state_dicts_load['rng']
+ state._legacy_load_optim_state(optim_state)
- return rng_state_dicts
+ return state_dict.get('rng', None)
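The repeated `version.parse(torch.__version__)` comparisons against '2.2.9' gate the old and new torch.distributed.checkpoint entry points. A hypothetical helper (not in the diff) that names the gate, where the '2.2.9' sentinel stands in for "any torch 2.3+ release":

```python
import torch
from packaging import version

def _uses_new_dcp_api() -> bool:
    # True when dist_cp.load/dist_cp.save should be used instead of the
    # older load_state_dict/save_state_dict entry points.
    return version.parse(torch.__version__) > version.parse('2.2.9')
```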
def _get_local_rank_zero_path(path: Optional[str]) -> str:
@@ -599,7 +695,7 @@ def download_checkpoint(path: str,
checkpoint_is_sharded = fsdp_sharded_state_dict_enabled or deepspeed_sharded_checkpoint
try:
if not checkpoint_is_sharded and dist.get_local_rank() == 0:
- # if the checkpoint is not sharded, then local rank 0 on each node needs to download the
+ # If the checkpoint is not sharded, then local rank 0 on each node needs to download the
# global rank 0 checkpoint
path = _format_path_with_rank_zero(path)
get_file(destination=rank_zero_checkpoint_filepath,
@@ -616,18 +712,18 @@ def download_checkpoint(path: str,
# or could not be downloaded
raise RuntimeError(f'Checkpoint {path} does not exist')
elif checkpoint_is_sharded:
- # if the checkpoint is sharded, then every rank needs to download its own checkpoint
+ # If the checkpoint is sharded, then every rank needs to download its own checkpoint
+ path = _format_path_with_current_rank(path)
try:
get_file(destination=rank_n_checkpoint_filepath,
- path=_format_path_with_current_rank(path),
+ path=path,
object_store=object_store,
progress_bar=progress_bar)
except FileNotFoundError as e:
raise FileNotFoundError(
- (f'Checkpoint {_format_path_with_current_rank(path)} does not exist, '
- f'but is required for sharded checkpointing on rank {dist.get_global_rank()}. '
- 'Please ensure that the checkpoint exists and your load_path was specified as a format string'
- 'with the {rank} argument.')) from e
+ (f'Checkpoint {path} does not exist, but is required for sharded checkpointing '
+ f'on rank {dist.get_global_rank()}. Please ensure that the checkpoint exists '
+ 'and your load_path was specified as a format string with the {rank} argument.')) from e
if extracted_checkpoint_folder is not None:
try:
@@ -677,14 +773,25 @@ def _flatten_keys(obj: Any, paths: list[str], existing_path: str):
def _remove_paths(obj: Union[list, dict[str, Any]], exclude_paths: list[list[str]]):
+ # Build a str(key) -> key map to undo the string cast applied during glob filtering.
+ # Despite the type hints, some state_dict keys are not strings, so we cast them back to their original type.
+ str_key_to_key = {}
+ if isinstance(obj, dict):
+ for key in obj.keys():
+ str_key_to_key[str(key)] = key
+
# First determine the keys which will be recursed on and which will be removed entirely
# Group the `exclude_paths` by the key
keys_to_recurse = {}
keys_to_remove = []
for exclude_path_parts in exclude_paths:
key = exclude_path_parts[0]
+ # Cast list indices to int
if isinstance(obj, list):
key = int(key)
+ # Un-str dict keys if necessary
+ if key in str_key_to_key:
+ key = str_key_to_key[key]
if len(exclude_path_parts) == 1:
keys_to_remove.append(key)
else:
@@ -720,51 +827,17 @@ def filter_func(state_dict: dict) -> None:
f'No parts from loaded checkpoint state_dict were ignored by load_ignore_key {exclude_glob}')
filtered_paths.extend(filtered_paths_from_glob)
filtered_paths = list(set(filtered_paths))
- filtered_paths_str = ', '.join(filtered_paths)
if filtered_paths:
+ filtered_paths_str = ', '.join(filtered_paths)
log.info(f'Ignoring the following paths from the loaded checkpoint state_dict: {filtered_paths_str}')
# Loop through all paths to exclude
- paths_to_remove = [path.split('/') for path in filtered_paths]
+ paths_to_remove = [path.split('/') for path in filtered_paths if len(path) > 0]
_remove_paths(state_dict, paths_to_remove)
return filter_func
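For reference, a short usage sketch of the filter built above. Key paths are '/'-separated and empty paths are now skipped; the globs shown are illustrative, not prescribed by the diff:

```python
# Drop all optimizer state and the saved RNG states before loading.
filter_func = glob_filter(['state/optimizers/*', 'rng'])
filter_func(state_dict)  # modifies state_dict in place
```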
-def _validate_save_planner(save_planner: Optional[Any]) -> None:
- """Checks that ``save_planner`` is an instance of a :class:`~torch.distributed.checkpoint.planner.SavePlanner`.
-
- TODO(GRT-2456): Remove validation once we deprecate torch 1.13 and can use
- type hints.
-
- Raises:
- ValueError: If ``save_planner`` is not a
- :class:`~torch.distributed.checkpoint.planner.SavePlanner`.
- """
- from torch.distributed.checkpoint.planner import SavePlanner
-
- if save_planner is not None and not isinstance(save_planner, SavePlanner):
- raise ValueError((f'save_planner {type(save_planner)} is not a '
- 'torch.distributed.checkpoint.planner.SavePlanner'))
-
-
-def _validate_load_planner(load_planner: Optional[Any]) -> None:
- """Checks that ``load_planner`` is an instance of a :class:`~torch.distributed.checkpoint.planner.LoadPlanner`.
-
- TODO(GRT-2456): Remove validation once we deprecate torch 1.13 and can use
- type hints.
-
- Raises:
- ValueError: If ``load_planner`` is not a
- :class:`~torch.distributed.checkpoint.planner.LoadPlanner`.
- """
- from torch.distributed.checkpoint.planner import LoadPlanner
-
- if load_planner is not None and not isinstance(load_planner, LoadPlanner):
- raise ValueError((f'load_planner {type(load_planner)} is not a '
- 'torch.distributed.checkpoint.planner.LoadPlanner'))
-
-
def safe_torch_load(
composer_states_filepath: Union[Path, str],
map_location: str = 'cpu',
@@ -840,6 +913,8 @@ def _restore_checkpoint(
ignore_keys = glob_filter(ignore_keys)
# Call function to modify state_dict
ignore_keys(state_dict)
+ # Ensure state exists
+ state_dict['state'] = state_dict.get('state', {})
log.debug(f"Loaded checkpoint with keys {state_dict.keys()} and state keys {state_dict['state'].keys()}")
if is_model_deepspeed(state.model):
@@ -876,11 +951,39 @@ def _restore_checkpoint(
return state_dict.get('rng', None)
-def save_checkpoint(
+def get_save_filename(
state: State,
filename: str = 'ep{epoch}-ba{batch}-rank{rank}',
+) -> str:
+ """Gets full filename of save filename.
+
+ Args:
+ state (State): The :class:`~composer.core.State` to load the checkpoint into.
+ filename (filename): The name of the save file.
+
+ Returns:
+ Full filename of save file.
+ """
+ if not state.fsdp_sharded_state_dict_enabled:
+ is_deepspeed = is_model_deepspeed(state.model)
+ return PartialFilePath(filename).format(state, is_deepspeed)
+
+ # Sharded checkpoints get their own little folder.
+ assert state.sharded_ckpt_prefix_dir is not None
+ save_dirpath = Path(Path(filename).parent) / Path(state.sharded_ckpt_prefix_dir)
+ save_dirpath = format_name_with_dist_and_time(str(save_dirpath), state.run_name, state.timestamp)
+ # New name is now Trainer.save_folder / sharded_ckpt_prefix_dir / __{dist.get_global_rank()}_0.distcp
+ # e.g. path/to/my/checkpoints/ep1-ba2/__1_0.distcp
+ ckpt_filename = _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME
+ return str(Path(save_dirpath) / Path(ckpt_filename))
+
+
+def _save_checkpoint(
+ state: State,
+ save_filename: str,
*,
weights_only: bool = False,
+ ignore_keys: Optional[Union[List[str], Callable[[Dict], None]]] = None,
) -> Union[str, None]: # noqa: D103
is_deepspeed = is_model_deepspeed(state.model)
@@ -900,38 +1003,36 @@ def save_checkpoint(
'rng': reproducibility.get_rng_state(),
}
- log.debug('State dict created.')
+ if ignore_keys:
+ # Filter provided list of key paths
+ if not callable(ignore_keys):
+ ignore_keys = glob_filter(ignore_keys)
+ # Call function to modify state_dict
+ ignore_keys(state_dict)
+ # Ensure state exists
+ state_dict['state'] = state_dict.get('state', {})
- # Sharded checkpoints get their own little folder.
if state.fsdp_sharded_state_dict_enabled:
- # To load optimizer states with torch 2.0, the optimizer state must be at the top
+ # To load optimizer states with 2.0 <= torch < 2.2.9, the optimizer state must be at the top
# level of the state dict because the load_sharded_optimizer_state_dict function
# requires a top level state dict key for the optimizer.
# See https://github.com/pytorch/pytorch/blob/v2.0.1/torch/distributed/checkpoint/optimizer.py#L271
# for more info.
- if using_torch_2():
+ if version.parse(torch.__version__) < version.parse('2.2.9'):
if not weights_only:
state_dict['optimizers'] = state_dict['state'].pop('optimizers')
-
- # Specify save directory path and save_f
- assert state.sharded_ckpt_prefix_dir is not None
- save_dirpath = Path(Path(filename).parent) / Path(state.sharded_ckpt_prefix_dir)
- save_dirpath = format_name_with_dist_and_time(str(save_dirpath), state.run_name, state.timestamp)
- # New name is now Trainer.save_folder / sharded_ckpt_prefix_dir / __{dist.get_global_rank()}_0.distcpâ if torch > 2
- # else Trainer.save_folder / sharded_ckpt_prefix_dir / ba{batch}_rank{dist.get_global_rank()}.ptâ
- # e.g. path/to/my/checkpoints/ep1-ba2/__1_0.distcp if torch >2 else its path/to/my/checkpoints/ep1-ba2/b2-rank1.pt
- ckpt_filename = _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME if using_torch_2() else format_name_with_dist_and_time(
- Path(filename).name, state.run_name, state.timestamp)
- save_filename = str(Path(save_dirpath) / Path(ckpt_filename))
- else:
- save_filename = PartialFilePath(filename).format(state, is_deepspeed)
+ log.debug('State dict created.')
dirname = os.path.dirname(save_filename)
if dirname:
os.makedirs(dirname, exist_ok=True)
+ # Only some ranks are meant to save the checkpoint and produce a file
+ expect_file = False
+
# All ranks save for deepspeed
if is_deepspeed:
+ expect_file = True
log.debug('Saving deepspeed checkpoints to %s...', save_filename)
if dist.get_global_rank() == 0:
with open(save_filename, 'wb') as f:
@@ -941,24 +1042,43 @@ def save_checkpoint(
_save_deepspeed_model(state.deepspeed_model, save_filename)
- # Sharded checkpointing for torch >=2.0 uses the torch.distributed.checkpoint module.
+ # Sharded checkpointing
elif state.fsdp_elastic_sharded_enabled:
if state.fsdp_config is None:
raise ValueError('Saving a sharded checkpoint requires passing an FSDP config to Trainer.')
- save_planner = state.fsdp_config['save_planner']
- _validate_save_planner(save_planner)
-
- import torch.distributed.checkpoint as dist_cp
- log.debug('Saving sharded checkpoints to %s...', save_filename)
- dist_cp.save_state_dict(
- state_dict=state_dict,
- storage_writer=dist_cp.FileSystemWriter(dirname),
- planner=save_planner,
- )
+ log.debug(f'Saving sharded checkpoints to {save_filename}...')
+ process_group = None
+ device_mesh = state.fsdp_device_mesh
+ if device_mesh is not None and device_mesh.ndim == 2:
+ # If hybrid sharding, only the ranks in the first replica save
+ expect_file = device_mesh.get_local_rank(mesh_dim=0) == 0
+ if expect_file:
+ process_group = device_mesh.get_group(1) # Shard process_group for first replica
+ log.debug(f'Saving on global_rank={dist.get_global_rank()}, {expect_file=}')
+ else:
+ expect_file = True
+
+ if expect_file:
+ if version.parse(torch.__version__) > version.parse('2.2.9'):
+ dist_cp.save( # type: ignore
+ state_dict=state_dict,
+ storage_writer=dist_cp.FileSystemWriter(dirname),
+ planner=state.fsdp_config['save_planner'],
+ process_group=process_group,
+ )
+ else:
+ dist_cp.save_state_dict(
+ state_dict=state_dict,
+ storage_writer=dist_cp.FileSystemWriter(dirname),
+ planner=state.fsdp_config['save_planner'],
+ process_group=process_group,
+ )
+ log.debug('Finished saving state dict via torch.distributed.checkpoint')
# Only rank 0 saves the state_dict unless you are using sharded checkpointing with torch <2.0
elif dist.get_global_rank() == 0 or state.fsdp_sharded_state_dict_enabled:
+ expect_file = True
log_msg = f'Saving sharded checkpoints to {save_filename}...' if state.fsdp_sharded_state_dict_enabled else f'Saving monolithic checkpoint to {save_filename}'
with open(save_filename, 'wb') as f:
log.debug(log_msg)
@@ -974,7 +1094,7 @@ def save_checkpoint(
dist.barrier() # ensure all ranks saved their files
- if dist.get_global_rank() == 0 or is_deepspeed or state.fsdp_sharded_state_dict_enabled:
+ if expect_file:
assert os.path.exists(save_filename), 'Expected file to have been saved.'
return save_filename
else:
@@ -1014,6 +1134,17 @@ def _save_deepspeed_model(model, filename: str):
tar.add(tmpdir, arcname='')
+def save_checkpoint(
+ state: State,
+ filename: str = 'ep{epoch}-ba{batch}-rank{rank}',
+ *,
+ weights_only: bool = False,
+ ignore_keys: Optional[Union[List[str], Callable[[Dict], None]]] = None,
+) -> Union[str, None]: # noqa: D103
+ save_filename = get_save_filename(state, filename)
+ return _save_checkpoint(state, save_filename, weights_only=weights_only, ignore_keys=ignore_keys)
+
+
save_checkpoint.__doc__ = f"""Checkpoint the training ``state``.
Args:
diff --git a/composer/utils/collect_env.py b/composer/utils/collect_env.py
index 2926c54a6f..02e74af8f9 100644
--- a/composer/utils/collect_env.py
+++ b/composer/utils/collect_env.py
@@ -378,7 +378,6 @@ def print_env(file: Optional[TextIO] = None) -> None:
[pip3] torch-optimizer==0.1.0
[pip3] torchmetrics==0.7.3
[pip3] torchvision==0.10.1+cu111
- [pip3] vit-pytorch==0.27.0
[conda] Could not collect
diff --git a/composer/utils/dist.py b/composer/utils/dist.py
index 1b59bff1d4..65edb5e80c 100644
--- a/composer/utils/dist.py
+++ b/composer/utils/dist.py
@@ -253,15 +253,19 @@ def get_node_rank() -> int:
return _get_distributed_config_var(env_var='NODE_RANK', default=0, human_name='node rank')
-def barrier() -> None:
+def barrier(group=None) -> None:
"""Synchronizes all processes.
This function blocks until all processes reach this function.
.. seealso:: :func:`torch.distributed.barrier`
+
+ Args:
+ group (ProcessGroup, optional): The process group to work on. If ``None``,
+ the default process group will be used. Default is ``None``.
"""
if dist.is_available() and dist.is_initialized():
- dist.barrier()
+ dist.barrier(group=group)
return
world_size = get_world_size()
if world_size == 1:
@@ -276,6 +280,7 @@ def barrier() -> None:
def all_reduce(
tensor: torch.Tensor,
reduce_operation: str = 'SUM',
+ group=None,
) -> None:
"""Reduce a ``tensor`` by applying the ``reduce_operation``.
@@ -289,6 +294,8 @@ def all_reduce(
op (optional): One of the values from
``torch.distributed.ReduceOp``
enum. Specifies an operation used for element-wise reductions.
+ group (ProcessGroup, optional): The process group to work on. If ``None``,
+ the default process group will be used. Default is ``None``.
Args:
tensor (torch.Tensor): Tensor to reduce. The function operates in-place.
reduce_operation (str, optional): The reduction operation (default: ``SUM``).
@@ -307,7 +314,7 @@ def all_reduce(
"""
if dist.is_available() and dist.is_initialized():
reduce_op = getattr(dist.ReduceOp, reduce_operation.upper())
- dist.all_reduce(tensor, op=reduce_op)
+ dist.all_reduce(tensor, op=reduce_op, group=group)
return
world_size = get_world_size()
if world_size == 1:
@@ -319,7 +326,7 @@ def all_reduce(
'`composer.utils.dist.initialize_dist` has been called first.')
-def broadcast(tensor: torch.Tensor, src: int) -> None:
+def broadcast(tensor: torch.Tensor, src: int, group=None) -> None:
"""Broadcasts the tensor to the whole group.
``tensor`` must have the same number of elements in all processes participating in the collective.
@@ -329,9 +336,11 @@ def broadcast(tensor: torch.Tensor, src: int) -> None:
tensor (torch.Tensor): Data to be sent if ``src`` is the rank of current process,
and tensor to be used to save received data otherwise.
src (int): Source rank
+ group (ProcessGroup, optional): The process group to work on. If ``None``,
+ the default process group will be used. Default is ``None``.
"""
if dist.is_available() and dist.is_initialized():
- dist.broadcast(tensor, src)
+ dist.broadcast(tensor, src=src, group=group)
return
world_size = get_world_size()
if world_size == 1:
@@ -343,7 +352,7 @@ def broadcast(tensor: torch.Tensor, src: int) -> None:
'`composer.utils.dist.initialize_dist` has been called first.')
-def broadcast_object_list(object_list: List[Any], src: int = 0) -> None:
+def broadcast_object_list(object_list: List[Any], src: int = 0, group=None) -> None:
"""Broadcasts picklable objects in ``object_list`` to the whole group.
Similar to :func:`broadcast`, but Python objects can be passed in.
@@ -356,12 +365,14 @@ def broadcast_object_list(object_list: List[Any], src: int = 0) -> None:
Each object must be picklable. Only objects on the ``src`` rank will be broadcast,
but each rank must provide lists of equal sizes.
src (int, optional): Source rank (default: ``0``)
+ group (ProcessGroup, optional): The process group to work on. If ``None``,
+ the default process group will be used. Default is ``None``.
Returns:
None: ``object_list`` will be modified in-place and set to values of ``object_list`` from the ``src`` rank.
"""
if dist.is_available() and dist.is_initialized():
- dist.broadcast_object_list(object_list, src)
+ dist.broadcast_object_list(object_list, src=src, group=group)
# torch.distributed will replace the None's in obj_gather_list with the gathered objects on rank 0
# or will just be None on non-rank-0
return
@@ -375,20 +386,22 @@ def broadcast_object_list(object_list: List[Any], src: int = 0) -> None:
'`composer.utils.dist.initialize_dist` has been called first.')
-def all_gather(tensor: torch.Tensor) -> Sequence[torch.Tensor]:
+def all_gather(tensor: torch.Tensor, group=None) -> Sequence[torch.Tensor]:
"""Collects a :class:`~torch.Tensor` from each rank.
.. seealso:: :func:`torch.distributed.all_gather`
Args:
tensor (torch.Tensor): Tensor from each rank to be gathered.
+ group (ProcessGroup, optional): The process group to work on. If ``None``,
+ the default process group will be used. Default is ``None``.
Returns:
Sequence[Tensor]: A sequence of tensors indexed by rank.
"""
if dist.is_available() and dist.is_initialized():
obj_gather_list = [torch.zeros_like(tensor) for _ in range(get_world_size())]
- dist.all_gather(obj_gather_list, tensor)
+ dist.all_gather(obj_gather_list, tensor, group=group)
return obj_gather_list
world_size = get_world_size()
if world_size == 1:
@@ -400,13 +413,15 @@ def all_gather(tensor: torch.Tensor) -> Sequence[torch.Tensor]:
'`composer.utils.dist.initialize_dist` has been called first.')
-def all_gather_object(obj: TObj) -> List[TObj]:
+def all_gather_object(obj: TObj, group=None) -> List[TObj]:
"""Collect a pickleable object from each rank and return a list of these objects indexed by rank.
.. seealso:: :func:`torch.distributed.all_gather_object`
Args:
obj (TObj): Object to be gathered.
+ group (ProcessGroup, optional): The process group to work on. If ``None``,
+ the default process group will be used. Default is ``None``.
Returns:
List[TObj]: A list of objects indexed by rank.
@@ -414,9 +429,9 @@ def all_gather_object(obj: TObj) -> List[TObj]:
if dist.is_available() and dist.is_initialized():
obj_gather_list = [None for _ in range(get_world_size())]
if is_hpu_installed():
- all_gather_object_list_hpu(obj_gather_list, obj)
+ all_gather_object_list_hpu(obj_gather_list, obj, group=group)
else:
- dist.all_gather_object(obj_gather_list, obj)
+ dist.all_gather_object(obj_gather_list, obj, group=group)
# torch.distributed will replace the None's in obj_gather_list with the gathered objects on rank 0
# or will just be None on non-rank-0
return cast(List[TObj], obj_gather_list)
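All of the collectives above now forward an optional `group`. A usage sketch, assuming the default process group is initialized; note that `new_group` is itself collective and must be called by every rank, even ranks not in the new group:

```python
import torch
import torch.distributed as torch_dist
from composer.utils import dist

# Build a subgroup containing only the even global ranks.
even_ranks = list(range(0, dist.get_world_size(), 2))
group = torch_dist.new_group(ranks=even_ranks)

if dist.get_global_rank() in even_ranks:
    t = torch.tensor([float(dist.get_global_rank())])
    dist.all_reduce(t, reduce_operation='SUM', group=group)  # sums over even ranks only
    dist.barrier(group=group)
```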
diff --git a/composer/utils/eval_client/local_eval_client.py b/composer/utils/eval_client/local_eval_client.py
index 357660b284..710a478473 100644
--- a/composer/utils/eval_client/local_eval_client.py
+++ b/composer/utils/eval_client/local_eval_client.py
@@ -38,7 +38,7 @@ def invoke_helper(self, payload: Dict[str, str]) -> bool:
p.start()
p.join(TIMEOUT) # wait for timeout to terminate
p.terminate()
- return bool(ret.value)
+ return bool(ret.value) # pyright: ignore[reportGeneralTypeIssues]
def update_offline_helper(self, code_gen: str, test_input: str, test_output: str, entry_point: str, language: str,
val: multiprocessing.Value): # type: ignore
diff --git a/composer/utils/eval_client/mosaicml_lambda_eval_client.py b/composer/utils/eval_client/mosaicml_lambda_eval_client.py
index fabb6b32be..cc9ea74714 100644
--- a/composer/utils/eval_client/mosaicml_lambda_eval_client.py
+++ b/composer/utils/eval_client/mosaicml_lambda_eval_client.py
@@ -46,7 +46,7 @@ def invoke(self, payload: List[List[List[Dict[str, str]]]]) -> List[List[List[bo
ret_helper = [False] * len(test_cases)
for i in range(self.num_retries):
try:
- ret_helper = mcli.get_code_eval_output(test_cases).data
+ ret_helper = mcli.get_code_eval_output(test_cases).data # pyright: ignore[reportGeneralTypeIssues]
break
except mcli.MAPIException as e:
if e.status >= 500:
diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py
index a3d421259b..7c75b4633e 100644
--- a/composer/utils/file_helpers.py
+++ b/composer/utils/file_helpers.py
@@ -20,7 +20,10 @@
from composer.utils import dist
from composer.utils.iter_helpers import iterate_with_callback
-from composer.utils.object_store import GCSObjectStore, ObjectStore, OCIObjectStore, S3ObjectStore, UCObjectStore
+from composer.utils.misc import partial_format
+from composer.utils.object_store import (GCSObjectStore, LibcloudObjectStore, MLFlowObjectStore, ObjectStore,
+ OCIObjectStore, S3ObjectStore, UCObjectStore)
+from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX
if TYPE_CHECKING:
from composer.core import Timestamp
@@ -29,9 +32,16 @@
log = logging.getLogger(__name__)
__all__ = [
- 'get_file', 'ensure_folder_is_empty', 'ensure_folder_has_no_conflicting_files', 'format_name_with_dist',
- 'format_name_with_dist_and_time', 'is_tar', 'create_symlink_file', 'maybe_create_object_store_from_uri',
- 'maybe_create_remote_uploader_downloader_from_uri', 'parse_uri'
+ 'get_file',
+ 'ensure_folder_is_empty',
+ 'ensure_folder_has_no_conflicting_files',
+ 'format_name_with_dist',
+ 'format_name_with_dist_and_time',
+ 'is_tar',
+ 'create_symlink_file',
+ 'maybe_create_object_store_from_uri',
+ 'maybe_create_remote_uploader_downloader_from_uri',
+ 'parse_uri',
]
@@ -166,7 +176,8 @@ def ensure_folder_has_no_conflicting_files(folder_name: Union[str, pathlib.Path]
def format_name_with_dist(format_str: str, run_name: str, **extra_format_kwargs: object): # noqa: D103
- formatted_str = format_str.format(
+ formatted_str = partial_format(
+ format_str,
run_name=run_name,
**_get_dist_config(strict=False),
**extra_format_kwargs,
@@ -259,7 +270,8 @@ def format_name_with_dist_and_time(
timestamp: Timestamp,
**extra_format_kwargs: object,
): # noqa: D103
- formatted_str = format_str.format(
+ formatted_str = partial_format(
+ format_str,
run_name=run_name,
epoch=int(timestamp.epoch),
batch=int(timestamp.batch),
@@ -314,6 +326,7 @@ def parse_uri(uri: str) -> Tuple[str, str, str]:
Tuple[str, str, str]: A tuple containing the backend (e.g. s3), bucket name, and path.
Backend name will be empty string if the input is a local path
"""
+ uri = uri.replace('AZURE_BLOBS', 'azure') # urlparse does not support _ in scheme
parse_result = urlparse(uri)
backend, net_loc, path = parse_result.scheme, parse_result.netloc, parse_result.path
bucket_name = net_loc if '@' not in net_loc else net_loc.split('@')[0]
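The `AZURE_BLOBS` replace is needed because urlparse only accepts letters, digits, '+', '-', and '.' in a scheme; an underscore makes it treat the entire URI as a path. A quick standard-library check:

```python
from urllib.parse import urlparse

print(urlparse('AZURE_BLOBS://container/object').scheme)  # '' -- '_' is not a valid scheme char
print(urlparse('azure://container/object').scheme)        # 'azure'
```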
@@ -349,10 +362,36 @@ def maybe_create_object_store_from_uri(uri: str) -> Optional[ObjectStore]:
return GCSObjectStore(bucket=bucket_name)
elif backend == 'oci':
return OCIObjectStore(bucket=bucket_name)
+ elif backend == 'azure':
+ return LibcloudObjectStore(
+ provider='AZURE_BLOBS',
+ container=bucket_name,
+ key_environ='AZURE_ACCOUNT_NAME',
+ secret_environ='AZURE_ACCOUNT_ACCESS_KEY',
+ )
elif backend == 'dbfs':
- # validate if the path conforms to the requirements for UC volume paths
- UCObjectStore.validate_path(path)
- return UCObjectStore(path=path)
+ if path.startswith(MLFLOW_DBFS_PATH_PREFIX):
+ store = None
+ if dist.get_global_rank() == 0:
+ store = MLFlowObjectStore(path)
+
+ # The path may have had placeholders, so update it with the experiment/run IDs initialized by the store
+ path = store.get_dbfs_path(path)
+
+ # Broadcast the rank 0 updated path to all ranks for their own object stores
+ path_list = [path]
+ dist.broadcast_object_list(path_list, src=0)
+ path = path_list[0]
+
+ # Create the object store for all other ranks
+ if dist.get_global_rank() != 0:
+ store = MLFlowObjectStore(path)
+
+ return store
+ else:
+ # validate if the path conforms to the requirements for UC volume paths
+ UCObjectStore.validate_path(path)
+ return UCObjectStore(path=path)
else:
raise NotImplementedError(f'There is no implementation for the cloud backend {backend} via URI. Please use '
'one of the supported object stores')
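The MLflow branch above follows a general pattern worth noting: rank 0 resolves any placeholders, then the concrete path is broadcast so every rank constructs an identical store. The pattern in isolation (a sketch; `resolve_placeholders` is a hypothetical stand-in for the rank-0-only work):

```python
from composer.utils import dist

resolved = [path]
if dist.get_global_rank() == 0:
    resolved[0] = resolve_placeholders(path)  # hypothetical rank-0-only resolution
dist.broadcast_object_list(resolved, src=0)
path = resolved[0]  # identical on every rank from here on
```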
@@ -387,14 +426,21 @@ def maybe_create_remote_uploader_downloader_from_uri(
return None
if backend in ['s3', 'oci', 'gs']:
return RemoteUploaderDownloader(bucket_uri=f'{backend}://{bucket_name}')
-
+ elif backend == 'azure':
+ return RemoteUploaderDownloader(
+ bucket_uri=f'libcloud://{bucket_name}',
+ backend_kwargs={
+ 'provider': 'AZURE_BLOBS',
+ 'container': bucket_name,
+ 'key_environ': 'AZURE_ACCOUNT_NAME',
+ 'secret_environ': 'AZURE_ACCOUNT_ACCESS_KEY',
+ },
+ )
+ elif backend == 'dbfs':
+ return RemoteUploaderDownloader(bucket_uri=uri, backend_kwargs={'path': path})
elif backend == 'wandb':
raise NotImplementedError(f'There is no implementation for WandB via URI. Please use '
'WandBLogger with log_artifacts set to True')
- elif backend == 'dbfs':
- # validate if the path conforms to the requirements for UC volume paths
- UCObjectStore.validate_path(path)
- return RemoteUploaderDownloader(bucket_uri=uri, backend_kwargs={'path': path})
else:
raise NotImplementedError(f'There is no implementation for the cloud backend {backend} via URI. Please use '
'one of the supported RemoteUploaderDownloader object stores')
diff --git a/composer/utils/fx_utils.py b/composer/utils/fx_utils.py
index 2b1ff41b3e..9162b84878 100644
--- a/composer/utils/fx_utils.py
+++ b/composer/utils/fx_utils.py
@@ -234,6 +234,7 @@ def apply_stochastic_residual(gm: GraphModule, drop_rate: float = 0.2) -> Tuple[
f'Input to apply_stochastic_residual should be an instance of GraphModule. Received {type(gm)}')
all_tags, count = _tag_residual_nodes(gm)
split_gm = split_by_tags(gm, all_tags)
+ assert isinstance(split_gm, GraphModule)
for node in split_gm.graph.nodes:
if node.op != 'call_module':
continue
diff --git a/composer/utils/iter_helpers.py b/composer/utils/iter_helpers.py
index 1338dbf872..7c256fd21e 100644
--- a/composer/utils/iter_helpers.py
+++ b/composer/utils/iter_helpers.py
@@ -6,11 +6,14 @@
# All methods signatures must be defined in there.
"""Utilities for iterating over collections."""
+from __future__ import annotations
+
import collections.abc
import io
+from typing import Any
-def map_collection(collection, map_fn):
+def map_collection(collection, map_fn) -> Any:
"""Applies ``map_fn`` on each element in ``collection``.
* If ``collection`` is a tuple or list of elements, ``map_fn`` is applied on each element,
@@ -37,7 +40,7 @@ def map_collection(collection, map_fn):
return map_fn(collection)
-def ensure_tuple(x):
+def ensure_tuple(x) -> tuple[Any, ...]:
"""Converts ``x`` into a tuple.
* If ``x`` is ``None``, then ``tuple()`` is returned.
diff --git a/composer/utils/misc.py b/composer/utils/misc.py
index 76573f8901..e5fa5942ae 100644
--- a/composer/utils/misc.py
+++ b/composer/utils/misc.py
@@ -9,7 +9,6 @@
from typing import TYPE_CHECKING, Callable, Optional, Set, Type, Union
import torch
-from packaging import version
from torch.nn.parallel import DistributedDataParallel
if TYPE_CHECKING:
@@ -52,21 +51,21 @@ def create_interval_scheduler(interval: Union[str, int, 'Time'],
if final_events is None:
final_events = {Event.BATCH_CHECKPOINT, Event.EPOCH_CHECKPOINT}
- interval = Time.from_input(interval, TimeUnit.EPOCH)
- if interval.unit == TimeUnit.EPOCH:
+ time_interval: Time = Time.from_input(interval, TimeUnit.EPOCH)
+ if time_interval.unit == TimeUnit.EPOCH:
interval_event = Event.EPOCH_CHECKPOINT if checkpoint_events else Event.EPOCH_END
- elif interval.unit in {TimeUnit.BATCH, TimeUnit.TOKEN, TimeUnit.SAMPLE, TimeUnit.DURATION}:
+ elif time_interval.unit in {TimeUnit.BATCH, TimeUnit.TOKEN, TimeUnit.SAMPLE, TimeUnit.DURATION}:
interval_event = Event.BATCH_CHECKPOINT if checkpoint_events else Event.BATCH_END
else:
raise NotImplementedError(
- f'Unknown interval: {interval.unit}. Must be TimeUnit.EPOCH, TimeUnit.BATCH, TimeUnit.TOKEN, or TimeUnit.SAMPLE.'
+ f'Unknown interval: {time_interval.unit}. Must be TimeUnit.EPOCH, TimeUnit.BATCH, TimeUnit.TOKEN, or TimeUnit.SAMPLE.'
)
last_batch_seen = -1
def check_interval(state: State, event: Event):
# `TimeUnit.Duration` value is a float from `[0.0, 1.0)`
- if not interval.unit == TimeUnit.DURATION and int(interval) <= 0:
+ if not time_interval.unit == TimeUnit.DURATION and int(time_interval) <= 0:
return False
nonlocal last_batch_seen # required to use the last_batch_seen from the outer function scope
@@ -81,25 +80,25 @@ def check_interval(state: State, event: Event):
if include_end_of_training and event in final_events and elapsed_duration >= 1.0 and state.timestamp.batch != last_batch_seen:
return True
- if interval.unit in {TimeUnit.EPOCH, TimeUnit.BATCH, TimeUnit.TOKEN, TimeUnit.SAMPLE}:
- previous_count = state.previous_timestamp.get(interval.unit)
- count = state.timestamp.get(interval.unit)
+ if time_interval.unit in {TimeUnit.EPOCH, TimeUnit.BATCH, TimeUnit.TOKEN, TimeUnit.SAMPLE}:
+ previous_count = state.previous_timestamp.get(time_interval.unit)
+ count = state.timestamp.get(time_interval.unit)
# If the eval_interval is a duration, we will track progress in terms of the unit of max_duration
- elif interval.unit == TimeUnit.DURATION:
+ elif time_interval.unit == TimeUnit.DURATION:
assert state.max_duration is not None
previous_count = state.previous_timestamp.get(state.max_duration.unit)
count = state.timestamp.get(state.max_duration.unit)
else:
raise NotImplementedError(
- f'Unknown interval: {interval.unit}. Must be TimeUnit.EPOCH, TimeUnit.BATCH, TimeUnit.TOKEN, or TimeUnit.SAMPLE.'
+ f'Unknown interval: {time_interval.unit}. Must be TimeUnit.EPOCH, TimeUnit.BATCH, TimeUnit.TOKEN, or TimeUnit.SAMPLE.'
)
- threshold_passed = math.floor(previous_count / interval.value) != math.floor(count / interval.value)
+ threshold_passed = math.floor(previous_count / time_interval.value) != math.floor(count / time_interval.value)
- if interval.unit != TimeUnit.DURATION and event == interval_event and threshold_passed:
+ if time_interval.unit != TimeUnit.DURATION and event == interval_event and threshold_passed:
last_batch_seen = state.timestamp.batch
return True
- elif interval.unit == TimeUnit.DURATION:
+ elif time_interval.unit == TimeUnit.DURATION:
assert state.max_duration is not None, 'max_duration should not be None'
if state.dataloader_len is None:
raise RuntimeError(
@@ -107,22 +106,22 @@ def check_interval(state: State, event: Event):
if event == interval_event:
if state.max_duration.unit == TimeUnit.EPOCH and int(state.timestamp.batch) % math.ceil(
- state.max_duration.value * float(interval) * state.dataloader_len) == 0:
+ state.max_duration.value * float(time_interval) * state.dataloader_len) == 0:
last_batch_seen = state.timestamp.batch
return True
elif state.max_duration.unit == TimeUnit.BATCH and int(state.timestamp.batch) % math.ceil(
- state.max_duration.value * interval.value) == 0:
+ state.max_duration.value * time_interval.value) == 0:
last_batch_seen = state.timestamp.batch
return True
elif state.max_duration.unit == TimeUnit.SAMPLE:
- samples_per_interval = math.ceil(state.max_duration.value * interval)
+ samples_per_interval = math.ceil(state.max_duration.value * time_interval)
threshold_passed = math.floor(previous_count / samples_per_interval) != math.floor(
count / samples_per_interval)
if threshold_passed:
last_batch_seen = state.timestamp.batch
return True
elif state.max_duration.unit == TimeUnit.TOKEN:
- tokens_per_interval = math.ceil(state.max_duration.value * interval)
+ tokens_per_interval = math.ceil(state.max_duration.value * time_interval)
threshold_passed = math.floor(previous_count / tokens_per_interval) != math.floor(
count / tokens_per_interval)
if threshold_passed:
@@ -208,19 +207,21 @@ def model_eval_mode(model: torch.nn.Module):
model.train(mode=is_training)
-def using_torch_2() -> bool:
- """Check the PyTorch version and compared it with version 2.0.0.
+def partial_format(s, *args, **kwargs) -> str:
+ """Format a string with a partial set of arguments.
- Returns:
- bool: Return True if current version is greater than or equal to 2.0.0 else False
- """
- return version.parse(torch.__version__) >= version.parse('2.0.0')
-
-
-def using_torch_2_0_1() -> bool:
- """Check the PyTorch version and compare it with version 2.0.1.
-
- Returns:
- bool: Return True if current version is greater than or equal to 2.0.1 else False
+ Since `str.format()` raises a `KeyError` if a format key is missing from the arguments, this
+ function allows for a partial set of arguments to be provided. Any missing arguments will be
+ left as-is in the string.
"""
- return version.parse(torch.__version__) >= version.parse('2.0.1')
+ max_iters = 10_000 # Just in case we get stuck in a loop somehow.
+ for _ in range(max_iters):
+ try:
+ return s.format(*args, **kwargs)
+ except IndexError: # Missing positional arg
+ args += ('{}',)
+ except KeyError as e: # Missing keyword arg
+ key = e.args[0]
+ kwargs[key] = '{' + key + '}'
+
+ raise RuntimeError(f'Failed to format string {s} after {max_iters} iterations.')
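Example behavior of `partial_format` (a sketch, not part of the diff): missing keys survive as literal placeholders, so the string can be formatted again later with the remaining values.

```python
s = '{run_name}/ep{epoch}-ba{batch}-rank{rank}'
print(partial_format(s, run_name='my-run', rank=0))
# -> 'my-run/ep{epoch}-ba{batch}-rank0'
```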
diff --git a/composer/utils/object_store/gcs_object_store.py b/composer/utils/object_store/gcs_object_store.py
index 1240754968..f3550d1ac9 100644
--- a/composer/utils/object_store/gcs_object_store.py
+++ b/composer/utils/object_store/gcs_object_store.py
@@ -76,7 +76,7 @@ def __init__(
self.client = Client.from_service_account_json(service_account_path)
self.use_gcs_sdk = True
try:
- self.bucket = self.client.get_bucket(self.bucket_name, timeout=60.0)
+ self.bucket = self.client.get_bucket(self.bucket_name, timeout=60)
except Exception as e:
_reraise_gcs_errors(self.get_uri(object_name=''), e)
@@ -127,12 +127,15 @@ def get_object_size(self, object_name: str) -> int:
blob_exists = Blob(bucket=self.bucket, name=key).exists(self.client)
if not blob_exists:
raise FileNotFoundError(f'{object_name} not found in {self.bucket_name}')
+ blob = None
try:
key = self.get_key(object_name)
blob = self.bucket.get_blob(key)
except Exception as e:
_reraise_gcs_errors(self.get_uri(object_name), e)
+ if blob is None or blob.size is None:
+ return -1
return blob.size # size in bytes
def upload_object(self,
@@ -223,6 +226,7 @@ def list_objects(self, prefix: Optional[str] = None) -> List[str]:
prefix = ''
prefix = self.get_key(prefix)
+ objects = []
try:
objects = self.bucket.list_blobs(prefix=prefix)
except Exception as e:
diff --git a/composer/utils/object_store/libcloud_object_store.py b/composer/utils/object_store/libcloud_object_store.py
index 9f9d9a7b91..6dec288502 100644
--- a/composer/utils/object_store/libcloud_object_store.py
+++ b/composer/utils/object_store/libcloud_object_store.py
@@ -157,7 +157,9 @@ def _get_object(self, object_name: str):
self._ensure_transient_errors_are_wrapped(e)
def get_object_size(self, object_name: str) -> int:
- return self._get_object(object_name).size
+ obj = self._get_object(object_name)
+ assert obj is not None
+ return obj.size
def download_object(
self,
@@ -178,6 +180,7 @@ def download_object(
tmp_filepath = str(filename) + f'.{uuid.uuid4()}.tmp'
try:
with open(tmp_filepath, 'wb+') as f:
+ assert obj is not None
stream = self._provider.download_object_as_stream(obj, chunk_size=self.chunk_size)
for chunk in iterate_with_callback(stream, obj.size, callback):
f.write(chunk)
diff --git a/composer/utils/object_store/mlflow_object_store.py b/composer/utils/object_store/mlflow_object_store.py
index 15f50bcdb0..a156007dae 100644
--- a/composer/utils/object_store/mlflow_object_store.py
+++ b/composer/utils/object_store/mlflow_object_store.py
@@ -21,8 +21,11 @@
DEFAULT_MLFLOW_EXPERIMENT_NAME = 'mlflow-object-store'
-PLACEHOLDER_EXPERIMENT_ID = '{mlflow_experiment_id}'
-PLACEHOLDER_RUN_ID = '{mlflow_run_id}'
+MLFLOW_EXPERIMENT_ID_FORMAT_KEY = 'mlflow_experiment_id'
+MLFLOW_RUN_ID_FORMAT_KEY = 'mlflow_run_id'
+
+MLFLOW_EXPERIMENT_ID_PLACEHOLDER = '{' + MLFLOW_EXPERIMENT_ID_FORMAT_KEY + '}'
+MLFLOW_RUN_ID_PLACEHOLDER = '{' + MLFLOW_RUN_ID_FORMAT_KEY + '}'
log = logging.getLogger(__name__)
@@ -112,7 +115,10 @@ def __init__(self, path: str, multipart_upload_chunk_size: int = 100 * 1024 * 10
except ImportError as e:
raise MissingConditionalImportError('databricks', conda_package='databricks-sdk>=0.15.0,<1.0') from e
- tracking_uri = os.getenv(mlflow.environment_variables.MLFLOW_TRACKING_URI.name, MLFLOW_DATABRICKS_TRACKING_URI)
+ tracking_uri = os.getenv(
+ mlflow.environment_variables.MLFLOW_TRACKING_URI.name, # pyright: ignore[reportGeneralTypeIssues]
+ MLFLOW_DATABRICKS_TRACKING_URI,
+ )
if tracking_uri != MLFLOW_DATABRICKS_TRACKING_URI:
raise ValueError(
'MLFlowObjectStore currently only supports Databricks-hosted MLflow tracking. '
@@ -129,12 +135,13 @@ def __init__(self, path: str, multipart_upload_chunk_size: int = 100 * 1024 * 10
'to identify different ways to setup credentials.') from e
self._mlflow_client = MlflowClient(tracking_uri)
- mlflow.environment_variables.MLFLOW_MULTIPART_UPLOAD_CHUNK_SIZE.set(multipart_upload_chunk_size)
+ mlflow.environment_variables.MLFLOW_MULTIPART_UPLOAD_CHUNK_SIZE.set( # pyright: ignore[reportGeneralTypeIssues]
+ multipart_upload_chunk_size,)
experiment_id, run_id, _ = MLFlowObjectStore.parse_dbfs_path(path)
- if experiment_id == PLACEHOLDER_EXPERIMENT_ID:
+ if experiment_id == MLFLOW_EXPERIMENT_ID_PLACEHOLDER:
experiment_id = None
- if run_id == PLACEHOLDER_RUN_ID:
+ if run_id == MLFLOW_RUN_ID_PLACEHOLDER:
run_id = None
# Construct the `experiment_id` and `run_id` depending on whether format placeholders were provided.
@@ -158,8 +165,8 @@ def _init_run_info(self, experiment_id: Optional[str], run_id: Optional[str]) ->
log.debug(f'MLFlowObjectStore using active MLflow run {run_id=}')
else:
# If no active run exists, create a new run for the default experiment.
- experiment_name = os.getenv(mlflow.environment_variables.MLFLOW_EXPERIMENT_NAME.name,
- DEFAULT_MLFLOW_EXPERIMENT_NAME)
+ mlflow_env_var_name = mlflow.environment_variables.MLFLOW_EXPERIMENT_NAME.name # pyright: ignore[reportGeneralTypeIssues]
+ experiment_name = os.getenv(mlflow_env_var_name, DEFAULT_MLFLOW_EXPERIMENT_NAME)
experiment = self._mlflow_client.get_experiment_by_name(experiment_name)
if experiment is not None:
@@ -236,10 +243,10 @@ def get_artifact_path(self, object_name: str) -> str:
"""
if object_name.startswith(MLFLOW_DBFS_PATH_PREFIX):
experiment_id, run_id, object_name = self.parse_dbfs_path(object_name)
- if (experiment_id != self.experiment_id and experiment_id != PLACEHOLDER_EXPERIMENT_ID):
+ if (experiment_id != self.experiment_id and experiment_id != MLFLOW_EXPERIMENT_ID_PLACEHOLDER):
raise ValueError(f'Object {object_name} belongs to experiment ID {experiment_id}, '
f'but MLFlowObjectStore is associated with experiment ID {self.experiment_id}.')
- if (run_id != self.run_id and run_id != PLACEHOLDER_RUN_ID):
+ if (run_id != self.run_id and run_id != MLFLOW_RUN_ID_PLACEHOLDER):
raise ValueError(f'Object {object_name} belongs to run ID {run_id}, '
f'but MLFlowObjectStore is associated with run ID {self.run_id}.')
return object_name
diff --git a/composer/utils/object_store/oci_object_store.py b/composer/utils/object_store/oci_object_store.py
index ce3fd5ea2c..d36b13e03b 100644
--- a/composer/utils/object_store/oci_object_store.py
+++ b/composer/utils/object_store/oci_object_store.py
@@ -32,7 +32,7 @@ def _reraise_oci_errors(uri: str, e: Exception):
raise FileNotFoundError(f'Object {uri} not found. {e.message}') from e # type: ignore
if e.code == 'BucketNotFound': # type: ignore
raise ValueError(f'Bucket specified in {uri} not found. {e.message}') from e # type: ignore
- raise e
+ raise FileNotFoundError(f'Object {uri} not found (no error code provided). {e.message}') from e # type: ignore
# Client errors
if isinstance(e, oci.exceptions.ClientError):
@@ -81,7 +81,7 @@ def __init__(
except Exception as e:
_reraise_oci_errors(self.get_uri(object_name=''), e)
- self.namespace = self.client.get_namespace().data
+ self.namespace = self.client.get_namespace().data # pyright: ignore[reportOptionalMemberAccess]
self.upload_manager = oci.object_storage.UploadManager(self.client)
def get_uri(self, object_name: str) -> str:
@@ -97,10 +97,12 @@ def get_object_size(self, object_name: str) -> int:
except Exception as e:
_reraise_oci_errors(self.get_uri(object_name), e)
- if response.status == 200:
- return int(response.data.headers['Content-Length'])
+ if response.status == 200: # pyright: ignore[reportUnboundVariable, reportOptionalMemberAccess]
+ data = response.data # pyright: ignore[reportUnboundVariable, reportOptionalMemberAccess]
+ return int(data.headers['Content-Length'])
else:
- raise ValueError(f'OCI get_object was not successful with a {response.status} status code.')
+ status = response.status # pyright: ignore[reportUnboundVariable, reportOptionalMemberAccess]
+ raise ValueError(f'OCI get_object was not successful with a {status} status code.')
def upload_object(
self,
@@ -126,7 +128,7 @@ def _download_part(self, object_name, filename, start_byte, end_byte, part_numbe
object_name=object_name,
range=range_header)
with open(tmp_part_path, 'wb') as f:
- f.write(response.data.content)
+ f.write(response.data.content) # pyright: ignore[reportOptionalMemberAccess]
return part_number, tmp_part_path
def download_object(
@@ -146,8 +148,12 @@ def download_object(
os.makedirs(dirname, exist_ok=True)
# Get the size of the object
- head_object_response = self.client.head_object(self.namespace, self.bucket, object_name)
- object_size = head_object_response.headers['content-length']
+ object_size = 0
+ try:
+ head_object_response = self.client.head_object(self.namespace, self.bucket, object_name)
+ object_size = head_object_response.headers['content-length'] # pyright: ignore[reportOptionalMemberAccess]
+ except Exception as e:
+ _reraise_oci_errors(self.get_uri(object_name), e)
# Calculate the part sizes
base_part_size, remainder = divmod(int(object_size), num_parts)
part_sizes = [base_part_size] * num_parts
@@ -156,9 +162,9 @@ def download_object(
part_sizes = [part_size for part_size in part_sizes if part_size > 0]
with TemporaryDirectory(dir=dirname, prefix=f'{str(filename)}') as temp_dir:
+ parts = []
try:
# Download parts in parallel
- parts = []
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
start_byte = 0
@@ -198,10 +204,9 @@ def list_objects(self, prefix: Optional[str] = None) -> List[str]:
response_complete = False
try:
while not response_complete:
- response = self.client.list_objects(namespace_name=self.namespace,
- bucket_name=self.bucket,
- prefix=prefix,
- start=next_start_with).data
+ response = self.client.list_objects(
+ namespace_name=self.namespace, bucket_name=self.bucket, prefix=prefix,
+ start=next_start_with).data # pyright: ignore[reportOptionalMemberAccess]
object_names.extend([obj.name for obj in response.objects])
next_start_with = response.next_start_with
if not next_start_with:
diff --git a/composer/utils/object_store/s3_object_store.py b/composer/utils/object_store/s3_object_store.py
index 854d447665..eeae4d28fa 100644
--- a/composer/utils/object_store/s3_object_store.py
+++ b/composer/utils/object_store/s3_object_store.py
@@ -116,6 +116,7 @@ def get_key(self, object_name: str) -> str:
return f'{self.prefix}{object_name}'
def get_object_size(self, object_name: str) -> int:
+ obj = {'ContentLength': -1}
try:
obj = self.client.get_object(Bucket=self.bucket, Key=self.get_key(object_name))
except Exception as e:
diff --git a/composer/utils/object_store/uc_object_store.py b/composer/utils/object_store/uc_object_store.py
index 23e8440354..4fc901212a 100644
--- a/composer/utils/object_store/uc_object_store.py
+++ b/composer/utils/object_store/uc_object_store.py
@@ -24,8 +24,9 @@
def _wrap_errors(uri: str, e: Exception):
from databricks.sdk.core import DatabricksError
+ from databricks.sdk.errors.mapping import NotFound
if isinstance(e, DatabricksError):
- if e.error_code == _NOT_FOUND_ERROR_CODE: # type: ignore
+ if isinstance(e, NotFound) or e.error_code == _NOT_FOUND_ERROR_CODE: # type: ignore
raise FileNotFoundError(f'Object {uri} not found') from e
raise ObjectStoreTransientError from e
@@ -48,6 +49,7 @@ class UCObjectStore(ObjectStore):
"""
_UC_VOLUME_LIST_API_ENDPOINT = '/api/2.0/fs/list'
+ _UC_VOLUME_FILES_API_ENDPOINT = '/api/2.0/fs/files'
def __init__(self, path: str) -> None:
try:
@@ -206,21 +208,21 @@ def get_object_size(self, object_name: str) -> int:
"""
from databricks.sdk.core import DatabricksError
try:
- file_info = self.client.files.get_status(self._get_object_path(object_name))
- if file_info.is_dir:
- raise IsADirectoryError(f'{object_name} is a UC directory, not a file.')
-
- return file_info.file_size # pyright: ignore
+ # Note: The UC team is working on changes to fix the files.get_status API, but it currently
+ # does not work. Once fixed, we will call the files API endpoint. We currently only use this
+ # function in Composer and LLM-foundry to check the UC object's existence.
+ object_path = self._get_object_path(object_name).lstrip('/')
+ path = os.path.join(self._UC_VOLUME_FILES_API_ENDPOINT, object_path)
+ self.client.api_client.do(method='HEAD', path=path, headers={'Source': 'mosaicml/composer'})
+ return 1000000 # Dummy value, as we don't have a way to get the size of the file
except DatabricksError as e:
+ # If the code reaches here, the file was not found
_wrap_errors(self.get_uri(object_name), e)
+ return -1
def list_objects(self, prefix: Optional[str]) -> List[str]:
"""List all objects in the object store with the given prefix.
- .. note::
-
- This function removes the directories from the returned list.
-
Args:
prefix (str): The prefix to search for.
@@ -232,13 +234,35 @@ def list_objects(self, prefix: Optional[str]) -> List[str]:
from databricks.sdk.core import DatabricksError
try:
- data = json.dumps({'path': self._get_object_path(prefix)})
# NOTE: This API is in preview and should not be directly used outside of this instance
- resp = self.client.api_client.do(method='GET',
- path=self._UC_VOLUME_LIST_API_ENDPOINT,
- data=data,
- headers={'Source': 'mosaicml/composer'})
- assert isinstance(resp, dict)
- return [f['path'] for f in resp.get('files', []) if not f['is_dir']]
+ logging.warning('UCObjectStore.list_objects is experimental.')
+
+ # Iteratively get all UC Volume files with `prefix`.
+ stack = [prefix]
+ all_files = []
+
+ while len(stack) > 0:
+ current_path = stack.pop()
+
+ # Note: Databricks SDK handles HTTP errors and retries.
+ # See https://github.com/databricks/databricks-sdk-py/blob/v0.18.0/databricks/sdk/core.py#L125 and
+ # https://github.com/databricks/databricks-sdk-py/blob/v0.18.0/databricks/sdk/retries.py#L33 .
+ resp = self.client.api_client.do(method='GET',
+ path=self._UC_VOLUME_LIST_API_ENDPOINT,
+ data=json.dumps({'path': self._get_object_path(current_path)}),
+ headers={'Source': 'mosaicml/composer'})
+
+ assert isinstance(resp, dict), 'Response is not a dictionary'
+
+ for f in resp.get('files', []):
+ fpath = f['path']
+ if f['is_dir']:
+ stack.append(fpath)
+ else:
+ all_files.append(fpath)
+
+ return all_files
+
except DatabricksError as e:
_wrap_errors(self.get_uri(prefix), e)
+            return []  # Unreachable: _wrap_errors always raises. Kept to satisfy the type checker.
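
A minimal sketch of the iterative traversal introduced above, decoupled from the Databricks SDK. Here `list_dir` is a hypothetical stand-in for the `/api/2.0/fs/list` call; each entry it returns is a dict with `'path'` and `'is_dir'` keys, matching the response shape the hunk relies on:

```python
from typing import Callable, Dict, List


def list_files(prefix: str, list_dir: Callable[[str], List[Dict]]) -> List[str]:
    """Collect every file path under `prefix`, descending into subdirectories."""
    stack = [prefix]
    all_files: List[str] = []
    while stack:
        current = stack.pop()
        for entry in list_dir(current):
            if entry['is_dir']:
                stack.append(entry['path'])  # revisit the subdirectory later
            else:
                all_files.append(entry['path'])
    return all_files


# Example against an in-memory tree standing in for a UC Volume:
tree = {
    'Volumes/main/schema/vol': [
        {'path': 'Volumes/main/schema/vol/a.txt', 'is_dir': False},
        {'path': 'Volumes/main/schema/vol/ckpts', 'is_dir': True},
    ],
    'Volumes/main/schema/vol/ckpts': [
        {'path': 'Volumes/main/schema/vol/ckpts/ep1.pt', 'is_dir': False},
    ],
}
print(list_files('Volumes/main/schema/vol', lambda p: tree.get(p, [])))
```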
diff --git a/composer/utils/reproducibility.py b/composer/utils/reproducibility.py
index 0895b530d9..0e2ee0cb5f 100644
--- a/composer/utils/reproducibility.py
+++ b/composer/utils/reproducibility.py
@@ -53,6 +53,7 @@
import textwrap
import time
import warnings
+from contextlib import contextmanager
from typing import Any, Dict, List
import numpy as np
@@ -62,6 +63,7 @@
from composer.utils import dist
__all__ = [
+ 'seed_context',
'configure_deterministic_mode',
'get_random_seed',
'seed_all',
@@ -76,6 +78,15 @@
MAX_SEED = 2**32 - 1
+@contextmanager
+def seed_context(seed: int):
+    """Context manager that saves the current RNG state, reseeds, and restores the saved state on exit."""
+ rng_state = get_rng_state()
+ seed_all(seed)
+ yield
+ load_rng_state(rng_state)
+
+
def configure_deterministic_mode():
"""Configure PyTorch deterministic mode.
@@ -218,7 +229,7 @@ def load_rng_state(rng_state_dicts: List[Dict[str, Any]]):
try:
torch.cuda.set_rng_state(rng_state_dict['cuda'])
except RuntimeError as e:
- if 'RNG state is wrong size' in str(e):
+ if 'RNG state is wrong size' in str(e) or 'offset must be a multiple of 4' in str(e):
warnings.warn('The CUDA RNG state could not be loaded from the checkpoint, '
'likely because a different version of torch was used to save the '
'checkpoint. Skipping loading the CUDA RNG state.')
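
The new `seed_context` helper lets callers draw from a temporary seed without disturbing the surrounding RNG stream. A minimal usage sketch, assuming a single-process run and a Composer build that includes this change:

```python
import torch

from composer.utils import reproducibility

reproducibility.seed_all(42)
a = torch.rand(1)  # consumes one draw from the seed-42 stream

reproducibility.seed_all(42)
with reproducibility.seed_context(17):
    b = torch.rand(1)  # drawn from seed 17; the seed-42 state was saved on entry

c = torch.rand(1)  # the saved seed-42 state was restored on exit
assert torch.equal(a, c)  # the outer stream is unaffected by the context
```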
diff --git a/composer/utils/string_enum.py b/composer/utils/string_enum.py
index ba4e534e0c..18a98f9339 100644
--- a/composer/utils/string_enum.py
+++ b/composer/utils/string_enum.py
@@ -64,7 +64,7 @@ class StringEnum(Enum):
warnings.resetwarnings()
"""
- __hash__ = Enum.__hash__
+ __hash__ = Enum.__hash__ # pyright: ignore[reportGeneralTypeIssues]
def __eq__(self, other: object) -> bool:
if isinstance(other, str):
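
For context on the `__hash__` reassignment above: defining `__eq__` in a class body implicitly sets `__hash__` to `None`, so an enum with custom string equality must restore hashability explicitly. A minimal sketch of the same pattern (the `Color` enum is illustrative, not Composer code):

```python
import warnings
from enum import Enum


class Color(Enum):
    RED = 'red'

    # Defining __eq__ would set __hash__ to None; restore Enum's implementation.
    __hash__ = Enum.__hash__

    def __eq__(self, other: object) -> bool:
        if isinstance(other, str):
            warnings.warn('Comparing against a raw string; prefer the enum member.')
            return self.value == other.lower()
        return super().__eq__(other)


assert Color.RED == 'RED'              # string comparison still works (with a warning)
assert {Color.RED: 1}[Color.RED] == 1  # still hashable, so usable as a dict key
```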
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ea72ebc7b4..e5ae9b9468 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -118,6 +118,7 @@ RUN apt-get update && \
tcl \
libjpeg8-dev \
less \
+ libsnappy-dev \
# For AWS EFA:
autoconf \
autotools-dev \
@@ -269,6 +270,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \
rm -rf /tmp/mofed ; \
fi
+
#####################
# Install NVIDIA Apex
#####################
@@ -294,7 +296,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \
RUN if [ -n "$CUDA_VERSION" ] ; then \
pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \
pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \
- pip${PYTHON_VERSION} install --no-cache-dir flash-attn==1.0.9; \
+ MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.5.0; \
fi
###############
@@ -353,7 +355,8 @@ RUN apt-get update && \
RUN pip install --no-cache-dir --upgrade \
certifi${CERTIFI_VERSION} \
ipython${IPYTHON_VERSION} \
- urllib3${URLLIB3_VERSION}
+ urllib3${URLLIB3_VERSION} \
+ python-snappy
##################################################
# Override NVIDIA mistaken env var for 11.8 images
diff --git a/docker/README.md b/docker/README.md
index 81f7f1fa0d..c617567f2f 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the
| Composer Version | CUDA Support | Docker Tag |
|--------------------|----------------|----------------------------------------------------------------|
-| 0.17.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.17.2` |
-| 0.17.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.17.2_cpu` |
+| 0.19.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.19.1` |
+| 0.19.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.19.1_cpu` |
**Note**: For a lightweight installation, we recommend using a [MosaicML PyTorch Image](#pytorch-images) and manually
@@ -25,22 +25,23 @@ installing Composer within the image.
## PyTorch Images
The [`mosaicml/pytorch`](https://hub.docker.com/r/mosaicml/pytorch) images contain PyTorch preinstalled, without Composer.
-The base flavor contains PyTorch pre-installed; the vision flavor also includes OpenCV, MM Segmentation, and FFCV dependencies.
To install composer, once inside the image, run `pip install mosaicml`.
| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags |
|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 2.1.1 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 2.1.1 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04-aws` |
-| Ubuntu 20.04 | Base | 2.1.1 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.1_cpu-python3.10-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws` |
+| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws` |
+| Ubuntu 20.04 | Base | 2.2.0 | cpu | 3.11 | `mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` |
+| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws` |
-| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` |
**Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index 98bac4504b..700bd4c010 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -2,7 +2,75 @@
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
- IMAGE_NAME: torch-2-1-1-cu121
+ IMAGE_NAME: torch-2-2-0-cu121
+ MOFED_VERSION: 5.5-1.0.3.2
+ NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
+ brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
+ brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
+ brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
+ brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
+ brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
+ brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
+ brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
+ brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
+ brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
+ brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
+ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
+ brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
+ brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+ PYTHON_VERSION: '3.11'
+ PYTORCH_NIGHTLY_URL: ''
+ PYTORCH_NIGHTLY_VERSION: ''
+ PYTORCH_VERSION: 2.2.0
+ TAGS:
+ - mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04
+ TARGET: pytorch_stage
+ TORCHVISION_VERSION: 0.17.0
+- AWS_OFI_NCCL_VERSION: v1.7.4-aws
+ BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+ CUDA_VERSION: 12.1.0
+ IMAGE_NAME: torch-2-2-0-cu121-aws
+ MOFED_VERSION: ''
+ NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
+ brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
+ brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
+ brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
+ brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
+ brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
+ brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
+ brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
+ brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
+ brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
+ brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
+ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
+ brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
+ brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+ PYTHON_VERSION: '3.11'
+ PYTORCH_NIGHTLY_URL: ''
+ PYTORCH_NIGHTLY_VERSION: ''
+ PYTORCH_VERSION: 2.2.0
+ TAGS:
+ - mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws
+ TARGET: pytorch_stage
+ TORCHVISION_VERSION: 0.17.0
+- AWS_OFI_NCCL_VERSION: ''
+ BASE_IMAGE: ubuntu:20.04
+ CUDA_VERSION: ''
+ IMAGE_NAME: torch-2-2-0-cpu
+ MOFED_VERSION: ''
+ NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
+ PYTHON_VERSION: '3.11'
+ PYTORCH_NIGHTLY_URL: ''
+ PYTORCH_NIGHTLY_VERSION: ''
+ PYTORCH_VERSION: 2.2.0
+ TAGS:
+ - mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04
+ TARGET: pytorch_stage
+ TORCHVISION_VERSION: 0.17.0
+- AWS_OFI_NCCL_VERSION: ''
+ BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+ CUDA_VERSION: 12.1.0
+ IMAGE_NAME: torch-2-1-2-cu121
MOFED_VERSION: 5.5-1.0.3.2
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -21,16 +89,16 @@
PYTHON_VERSION: '3.10'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 2.1.1
+ PYTORCH_VERSION: 2.1.2
TAGS:
- - mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04
+ - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
- mosaicml/pytorch:latest
TARGET: pytorch_stage
- TORCHVISION_VERSION: 0.16.1
+ TORCHVISION_VERSION: 0.16.2
- AWS_OFI_NCCL_VERSION: v1.7.4-aws
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
- IMAGE_NAME: torch-2-1-1-cu121-aws
+ IMAGE_NAME: torch-2-1-2-cu121-aws
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -49,27 +117,27 @@
PYTHON_VERSION: '3.10'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 2.1.1
+ PYTORCH_VERSION: 2.1.2
TAGS:
- - mosaicml/pytorch:2.1.1_cu121-python3.10-ubuntu20.04-aws
+ - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws
- mosaicml/pytorch:latest-aws
TARGET: pytorch_stage
- TORCHVISION_VERSION: 0.16.1
+ TORCHVISION_VERSION: 0.16.2
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
CUDA_VERSION: ''
- IMAGE_NAME: torch-2-1-1-cpu
+ IMAGE_NAME: torch-2-1-2-cpu
MOFED_VERSION: ''
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.10'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 2.1.1
+ PYTORCH_VERSION: 2.1.2
TAGS:
- - mosaicml/pytorch:2.1.1_cpu-python3.10-ubuntu20.04
+ - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
- mosaicml/pytorch:latest_cpu
TARGET: pytorch_stage
- TORCHVISION_VERSION: 0.16.1
+ TORCHVISION_VERSION: 0.16.2
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 11.8.0
@@ -122,52 +190,64 @@
- mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.15.2
-- AWS_OFI_NCCL_VERSION: ''
- BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04
- CUDA_VERSION: 11.7.1
- IMAGE_NAME: torch-1-13-1-cu117
- MOFED_VERSION: 5.5-1.0.3.2
- NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
- PYTHON_VERSION: '3.10'
- PYTORCH_NIGHTLY_URL: ''
- PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 1.13.1
- TAGS:
- - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
- TARGET: pytorch_stage
- TORCHVISION_VERSION: 0.14.1
- AWS_OFI_NCCL_VERSION: v1.7.4-aws
- BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04
- CUDA_VERSION: 11.7.1
- IMAGE_NAME: torch-1-13-1-cu117-aws
+ BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+ CUDA_VERSION: 12.1.0
+ IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-10-aws
MOFED_VERSION: ''
- NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
+ NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
+ brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
+ brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
+ brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
+ brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
+ brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
+ brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
+ brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
+ brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
+ brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
+ brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
+ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
+ brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
+ brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
PYTHON_VERSION: '3.10'
- PYTORCH_NIGHTLY_URL: ''
- PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 1.13.1
+ PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121
+ PYTORCH_NIGHTLY_VERSION: dev20240110+cu121
+ PYTORCH_VERSION: 2.3.0
TAGS:
- - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws
+ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws
TARGET: pytorch_stage
- TORCHVISION_VERSION: 0.14.1
+ TORCHVISION_VERSION: 0.18.0
- AWS_OFI_NCCL_VERSION: ''
- BASE_IMAGE: ubuntu:20.04
- CUDA_VERSION: ''
- IMAGE_NAME: torch-1-13-1-cpu
- MOFED_VERSION: ''
- NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
+ BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+ CUDA_VERSION: 12.1.0
+ IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-10
+ MOFED_VERSION: 5.5-1.0.3.2
+ NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
+ brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
+ brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
+ brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
+ brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
+ brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
+ brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
+ brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
+ brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
+ brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
+ brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
+ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
+ brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
+ brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
PYTHON_VERSION: '3.10'
- PYTORCH_NIGHTLY_URL: ''
- PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 1.13.1
+ PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121
+ PYTORCH_NIGHTLY_VERSION: dev20240110+cu121
+ PYTORCH_VERSION: 2.3.0
TAGS:
- - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
+ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04
TARGET: pytorch_stage
- TORCHVISION_VERSION: 0.14.1
+ TORCHVISION_VERSION: 0.18.0
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
CUDA_VERSION: 12.1.0
- IMAGE_NAME: torch-nightly-2-2-0-20231213-cu121
+ IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-11
MOFED_VERSION: 5.5-1.0.3.2
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -183,19 +263,19 @@
brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
- PYTHON_VERSION: '3.10'
+ PYTHON_VERSION: '3.11'
PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121
- PYTORCH_NIGHTLY_VERSION: dev20231213+cu121
- PYTORCH_VERSION: 2.2.0
+ PYTORCH_NIGHTLY_VERSION: dev20240110+cu121
+ PYTORCH_VERSION: 2.3.0
TAGS:
- - mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04
+ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04
TARGET: pytorch_stage
TORCHVISION_VERSION: 0.18.0
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
- COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.17.2
+ COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.1
CUDA_VERSION: 12.1.0
- IMAGE_NAME: composer-0-17-2
+ IMAGE_NAME: composer-0-19-1
MOFED_VERSION: 5.5-1.0.3.2
NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -214,25 +294,25 @@
PYTHON_VERSION: '3.10'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 2.1.1
+ PYTORCH_VERSION: 2.1.2
TAGS:
- - mosaicml/composer:0.17.2
+ - mosaicml/composer:0.19.1
- mosaicml/composer:latest
TARGET: composer_stage
- TORCHVISION_VERSION: 0.16.1
+ TORCHVISION_VERSION: 0.16.2
- AWS_OFI_NCCL_VERSION: ''
BASE_IMAGE: ubuntu:20.04
- COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.17.2
+ COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.1
CUDA_VERSION: ''
- IMAGE_NAME: composer-0-17-2-cpu
+ IMAGE_NAME: composer-0-19-1-cpu
MOFED_VERSION: 5.5-1.0.3.2
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
PYTHON_VERSION: '3.10'
PYTORCH_NIGHTLY_URL: ''
PYTORCH_NIGHTLY_VERSION: ''
- PYTORCH_VERSION: 2.1.1
+ PYTORCH_VERSION: 2.1.2
TAGS:
- - mosaicml/composer:0.17.2_cpu
+ - mosaicml/composer:0.19.1_cpu
- mosaicml/composer:latest_cpu
TARGET: composer_stage
- TORCHVISION_VERSION: 0.16.1
+ TORCHVISION_VERSION: 0.16.2
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index a7dca9bc50..333010304b 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -18,17 +18,17 @@
import tabulate
import yaml
-LATEST_PYTHON_VERSION = '3.10'
-PRODUCTION_PYTORCH_VERSION = '2.1.1'
+PRODUCTION_PYTHON_VERSION = '3.10'
+PRODUCTION_PYTORCH_VERSION = '2.1.2'
def _get_torchvision_version(pytorch_version: str):
- if pytorch_version == '2.1.1':
- return '0.16.1'
+ if pytorch_version == '2.2.0':
+ return '0.17.0'
+ if pytorch_version == '2.1.2':
+ return '0.16.2'
if pytorch_version == '2.0.1':
return '0.15.2'
- if pytorch_version == '1.13.1':
- return '0.14.1'
raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
@@ -39,14 +39,15 @@ def _get_base_image(cuda_version: str):
def _get_cuda_version(pytorch_version: str, use_cuda: bool):
+ # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
if not use_cuda:
return ''
- if pytorch_version == '2.1.1':
+ if pytorch_version == '2.2.0':
+ return '12.1.0'
+ if pytorch_version == '2.1.2':
return '12.1.0'
if pytorch_version == '2.0.1':
return '11.8.0'
- if pytorch_version == '1.13.1':
- return '11.7.1'
raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
@@ -81,8 +82,7 @@ def _get_cuda_override(cuda_version: str):
'brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526')
return cuda_121_override_string
-
- if cuda_version == '11.8.0':
+ elif cuda_version == '11.8.0':
cuda_118_override_string = ('cuda>=11.8 brand=tesla,driver>=470,driver<471 '
'brand=tesla,driver>=515,driver<516 brand=unknown,driver>=470,driver<471 '
'brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 '
@@ -92,9 +92,7 @@ def _get_cuda_override(cuda_version: str):
'brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 '
'brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 '
'brand=titanrtx,driver>=515,driver<516')
-
return cuda_118_override_string
-
return ''
@@ -106,7 +104,7 @@ def _get_pytorch_tags(python_version: str, pytorch_version: str, cuda_version: s
cuda_version_tag = _get_cuda_version_tag(cuda_version)
tags = [f'{base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu20.04']
- if python_version == LATEST_PYTHON_VERSION and pytorch_version == PRODUCTION_PYTORCH_VERSION:
+ if python_version == PRODUCTION_PYTHON_VERSION and pytorch_version == PRODUCTION_PYTORCH_VERSION:
if not cuda_version:
tags.append(f'{base_image_name}:latest_cpu')
else:
@@ -165,16 +163,15 @@ def _write_table(table_tag: str, table_contents: str):
def _main():
- python_versions = ['3.10']
- pytorch_versions = ['2.1.1', '2.0.1', '1.13.1']
+ python_pytorch_versions = [('3.11', '2.2.0'), ('3.10', '2.1.2'), ('3.10', '2.0.1')]
cuda_options = [True, False]
stages = ['pytorch_stage']
interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS
pytorch_entries = []
- for product in itertools.product(python_versions, pytorch_versions, cuda_options, stages, interconnects):
- python_version, pytorch_version, use_cuda, stage, interconnect = product
+ for product in itertools.product(python_pytorch_versions, cuda_options, stages, interconnects):
+ (python_version, pytorch_version), use_cuda, stage, interconnect = product
cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda)
@@ -209,9 +206,8 @@ def _main():
_get_cuda_override(cuda_version),
}
- # Only build EFA image on latest python with cuda on pytorch_stage
- if interconnect == 'EFA' and not (python_version == LATEST_PYTHON_VERSION and use_cuda and
- stage == 'pytorch_stage'):
+ # Only build EFA image on cuda and pytorch_stage
+ if interconnect == 'EFA' and not (use_cuda and stage == 'pytorch_stage'):
continue
# Skip the mellanox drivers if not in the cuda images or using EFA
@@ -227,27 +223,63 @@ def _main():
entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws'
pytorch_entries.append(entry)
- nightly_entry = {
+
+ nightly_entry_310_aws = {
+ 'AWS_OFI_NCCL_VERSION': 'v1.7.4-aws',
+ 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04',
+ 'CUDA_VERSION': '12.1.0',
+ 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-10-aws',
+ 'MOFED_VERSION': '',
+ 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'),
+ 'PYTHON_VERSION': '3.10',
+ 'PYTORCH_VERSION': '2.3.0',
+ 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121',
+ 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121',
+ 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws'],
+ 'TARGET': 'pytorch_stage',
+ 'TORCHVISION_VERSION': '0.18.0'
+ }
+ pytorch_entries.append(nightly_entry_310_aws)
+
+ nightly_entry_310 = {
'AWS_OFI_NCCL_VERSION': '',
'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04',
'CUDA_VERSION': '12.1.0',
- 'IMAGE_NAME': 'torch-nightly-2-2-0-20231213-cu121',
+ 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-10',
'MOFED_VERSION': '5.5-1.0.3.2',
'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'),
'PYTHON_VERSION': '3.10',
- 'PYTORCH_VERSION': '2.2.0',
+ 'PYTORCH_VERSION': '2.3.0',
+ 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121',
+ 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121',
+ 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'],
+ 'TARGET': 'pytorch_stage',
+ 'TORCHVISION_VERSION': '0.18.0'
+ }
+ pytorch_entries.append(nightly_entry_310)
+
+ nightly_entry_311 = {
+ 'AWS_OFI_NCCL_VERSION': '',
+ 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04',
+ 'CUDA_VERSION': '12.1.0',
+ 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-11',
+ 'MOFED_VERSION': '5.5-1.0.3.2',
+ 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'),
+ 'PYTHON_VERSION': '3.11',
+ 'PYTORCH_VERSION': '2.3.0',
'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121',
- 'PYTORCH_NIGHTLY_VERSION': 'dev20231213+cu121',
- 'TAGS': ['mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04'],
+ 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121',
+ 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04'],
'TARGET': 'pytorch_stage',
'TORCHVISION_VERSION': '0.18.0'
}
- pytorch_entries.append(nightly_entry)
+ pytorch_entries.append(nightly_entry_311)
+
composer_entries = []
# The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images
- composer_versions = ['0.17.2'] # Only build images for the latest composer version
- composer_python_versions = [LATEST_PYTHON_VERSION] # just build composer against the latest
+ composer_versions = ['0.19.1'] # Only build images for the latest composer version
+    composer_python_versions = [PRODUCTION_PYTHON_VERSION]  # only build composer against the production python version
for product in itertools.product(composer_python_versions, composer_versions, cuda_options):
python_version, composer_version, use_cuda = product
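
The refactor above replaces independent `python_versions` and `pytorch_versions` lists with explicit `(python, pytorch)` pairs, so unsupported combinations are never generated. A small sketch of the difference (the version values are illustrative, not the full matrix):

```python
import itertools

# Before: independent lists, so every python version was crossed with every
# pytorch version, including pairs that were never meant to ship.
old = list(itertools.product(['3.10'], ['2.1.2', '2.0.1'], [True, False]))

# After: only explicitly supported pairs are expanded, so e.g. python 3.11
# is never combined with pytorch 2.0.1.
python_pytorch_versions = [('3.11', '2.2.0'), ('3.10', '2.1.2'), ('3.10', '2.0.1')]
new = [(py, pt, cuda)
       for (py, pt), cuda in itertools.product(python_pytorch_versions, [True, False])]

print(new[0])  # ('3.11', '2.2.0', True)
```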
diff --git a/docs/source/composer_model.rst b/docs/source/composer_model.rst
index 3f4c32dab8..bd80be1d10 100644
--- a/docs/source/composer_model.rst
+++ b/docs/source/composer_model.rst
@@ -75,8 +75,6 @@ We also provide several common classes for various tasks, specifically:
- :class:`.ComposerClassifier` - classification tasks with a cross entropy
loss and accuracy metric.
-- :func:`.composer_timm` - creates classification models from the popular `TIMM`_
- library.
- :class:`.HuggingFaceModel` - :class:`.ComposerModel` wrapper for a 🤗 `Transformers`_ model.
.. note::
@@ -195,18 +193,6 @@ Integrations
------------
-
-TIMM
-~~~~
-
-Integrate with your favorite `TIMM`_ models with our :func:`.composer_timm` function.
-
-.. code:: python
-
- from composer.models import composer_timm
-
- timm_model = composer_timm(model_name='resnet50', pretrained=True)
-
BERT Example with 🤗 Transformers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -265,5 +251,4 @@ and make it compatible with our trainer.
.. |loss| replace:: :meth:`~.ComposerModel.loss`
.. _MMDetection: https://mmdetection.readthedocs.io/en/latest/
.. _Transformers: https://huggingface.co/docs/transformers/index
-.. _TIMM: https://timm.fast.ai/
.. _torchvision: https://pytorch.org/vision/stable/models.html
diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py
index 2b640283b3..91b7c909b8 100644
--- a/docs/source/doctest_fixtures.py
+++ b/docs/source/doctest_fixtures.py
@@ -48,7 +48,6 @@
from composer.core import Timestamp as Timestamp
from composer.core import TimeUnit as TimeUnit
from composer.core import types as types
-from composer.datasets.synthetic import SyntheticBatchPairDataset
from composer.devices import DeviceCPU
from composer.loggers import InMemoryLogger as InMemoryLogger
from composer.loggers import Logger as Logger
@@ -72,6 +71,13 @@
except ImportError:
_COMETML_INSTALLED = False
+try:
+ import neptune
+ _NEPTUNE_INSTALLED = True
+ del neptune # unused
+except ImportError:
+ _NEPTUNE_INSTALLED = False
+
try:
import libcloud
_LIBCLOUD_INSTALLED = True
@@ -87,7 +93,7 @@
sys.path.insert(0, _repo_root)
from tests.common import SimpleModel
-from tests.common.datasets import RandomTextClassificationDataset
+from tests.common.datasets import RandomClassificationDataset, RandomTextClassificationDataset
# Disable mosaicml logger
os.environ['MOSAICML_PLATFORM'] = 'False'
@@ -112,11 +118,10 @@
scheduler = CosineAnnealingLR(optimizer, T_max=1)
-dataset = SyntheticBatchPairDataset(
- total_dataset_size=100,
- data_shape=data_shape,
+dataset = RandomClassificationDataset(
+ shape=data_shape,
+ size=100,
num_classes=num_classes,
- num_unique_samples_to_create=10,
)
train_dataset = dataset
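
The fixtures now build synthetic data from `RandomClassificationDataset` in `tests.common`. For readers without the test suite on hand, a hypothetical stand-in with the same `(shape, size, num_classes)` constructor (a sketch, not the actual test helper):

```python
import torch
from torch.utils.data import DataLoader, Dataset


class RandomClassificationDataset(Dataset):
    """Random (feature, label) pairs; the constructor mirrors the call above."""

    def __init__(self, shape=(8,), size=100, num_classes=2):
        self.x = torch.randn(size, *shape)
        self.y = torch.randint(0, num_classes, (size,))

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]


loader = DataLoader(
    RandomClassificationDataset(shape=(3, 32, 32), size=100, num_classes=10),
    batch_size=16,
)
x, y = next(iter(loader))
print(x.shape, y.shape)  # torch.Size([16, 3, 32, 32]) torch.Size([16])
```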
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index d55745608f..100247983a 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -20,16 +20,14 @@ the following installation targets are available:
and building documentation.
* ``pip install 'mosaicml[deepspeed]'``: Installs Composer with support for :mod:`deepspeed`.
* ``pip install 'mosaicml[nlp]'``: Installs Composer with support for NLP models and algorithms.
-* ``pip install 'mosaicml[unet]'``: Installs Composer with support for :doc:`Unet `.
-* ``pip install 'mosaicml[timm]'``: Installs Composer with support for :mod:`timm`.
* ``pip install 'mosaicml[wandb]'``: Installs Composer with support for :mod:`wandb`.
* ``pip install 'mosaicml[comet_ml]'``: Installs Composer with support for :mod:`comet_ml`.
+* ``pip install 'mosaicml[neptune]'``: Installs Composer with support for :mod:`neptune`.
* ``pip install 'mosaicml[tensorboard]'``: Installs Composer with support for :mod:`tensorboard`.
* ``pip install 'mosaicml[streaming]'``: Installs Composer with support for `streaming `_.
* ``pip install 'mosaicml[mlflow]'``: Installs Composer with support for :mod:`mlflow`.
* ``pip install 'mosaicml[oci]'``: Installs Composer with support for :mod:`oci`.
* ``pip install 'mosaicml[onnx]'``: Installs Composer with support for :mod:`onnx`.
-* ``pip install 'mosaicml[vit]'``: Installs Composer with support for :mod:`vit`.
* ``pip install 'mosaicml[coco]'``: Installs Composer with support for :mod:`coco`.
* ``pip install 'mosaicml[libcloud]'``: Installs Composer with support for :mod:`libcloud`.
* ``pip install 'mosaicml[all]'``: Installs all optional dependencies.
diff --git a/docs/source/getting_started/quick_start.rst b/docs/source/getting_started/quick_start.rst
index c3c7d6f7ed..f7613384ba 100644
--- a/docs/source/getting_started/quick_start.rst
+++ b/docs/source/getting_started/quick_start.rst
@@ -61,7 +61,7 @@ Besides easily running our built-in algorithms, Composer also features:
* An interface to flexibly add algorithms to the training loop
* An engine that manages the ordering of algorithms for composition
* A trainer to handle boilerplate around numerics, distributed training, and other concerns
-* Integration with popular model libraries such as TIMM and HuggingFace Transformers
+* Integration with popular model libraries such as HuggingFace Transformers
Next steps
----------
diff --git a/docs/source/getting_started/welcome_tour.rst b/docs/source/getting_started/welcome_tour.rst
index a46dc85f33..649a9c87b0 100644
--- a/docs/source/getting_started/welcome_tour.rst
+++ b/docs/source/getting_started/welcome_tour.rst
@@ -65,6 +65,7 @@ We could add events to our training loop as follows:
.. code-block:: python
#
+ #
#
#
for epoch in range(NUM_EPOCHS):
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ce95ba6e1b..425dcad93c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -47,7 +47,6 @@ Composer is part of the broader Machine Learning community, and we welcome any c
examples/getting_started.ipynb
examples/functional_api.ipynb
- examples/medical_image_segmentation.ipynb
examples/custom_speedup_methods.ipynb
examples/finetune_huggingface.ipynb
examples/pretrain_finetune_huggingface.ipynb
@@ -136,19 +135,6 @@ Composer is part of the broader Machine Learning community, and we welcome any c
method_cards/swa.md
method_cards/weight_standardization.md
-.. toctree::
- :hidden:
- :maxdepth: 1
- :caption: Model Library
-
- model_cards/BERT.md
- model_cards/cifar_resnet.md
- model_cards/deeplabv3.md
- model_cards/efficientnet.md
- model_cards/GPT2.md
- model_cards/resnet.md
- model_cards/unet.md
-
.. toctree::
:hidden:
:caption: API Reference
diff --git a/docs/source/method_cards/decoupled_weight_decay.md b/docs/source/method_cards/decoupled_weight_decay.md
index 2d9f78f94f..71e0f4312f 100644
--- a/docs/source/method_cards/decoupled_weight_decay.md
+++ b/docs/source/method_cards/decoupled_weight_decay.md
@@ -16,9 +16,7 @@ L2 regularization is typically considered equivalent to weight decay, but this e
-
-```bash
-# Single GPU/CPU depending on torch.cuda.is_available()
-python train_resnet_imagenet1k.py /path/to/imagenet
-
-# Log experiments to Weights and Biases
-python train_resnet_imagenet1k.py /path/to/imagenet --wandb_logger --wandb_entity my_username --wandb_project my_project --wandb_run_name my_run_name
-
-# Single/Multi GPU training (infers the number of GPUs available)
-composer train_resnet_imagenet1k.py /path/to/imagenet
-
-# Manually specify number of GPUs to use:
-composer -n $N_GPUS train_resnet_imagenet1k.py /path/to/imagenet
-
-# Mild ResNet recipe for fastest training to ~76.5% accuracy:
-composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name mild --train_crop_size 176 --eval_crop_size 224 --max_duration 36ep --loss_name binary_cross_entropy
-
-# Medium ResNet recipe highest accuracy with similar training time as baseline:
-composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name medium --train_crop_size 176 --eval_crop_size 224 --max_duration 135ep --loss_name binary_cross_entropy
-
-# Spicy ResNet recipe for our most accurate ResNet over a long training schedule:
-composer train_resnet_imagenet1k.py /path/to/imagenet --recipe_name spicy --train_crop_size 176 --eval_crop_size 224 --max_duration 270ep --loss_name binary_cross_entropy
-```
diff --git a/examples/imagenet/train_resnet_imagenet1k.py b/examples/imagenet/train_resnet_imagenet1k.py
deleted file mode 100644
index d6f1dee008..0000000000
--- a/examples/imagenet/train_resnet_imagenet1k.py
+++ /dev/null
@@ -1,298 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Example script to train a ResNet model on ImageNet."""
-
-import argparse
-import logging
-import os
-
-import torch
-from torch.utils.data import DataLoader
-from torchmetrics import MetricCollection
-from torchmetrics.classification import MulticlassAccuracy
-from torchvision import transforms
-from torchvision.datasets import ImageFolder
-from torchvision.models import resnet
-
-from composer import DataSpec, Time, Trainer
-from composer.algorithms import (EMA, SAM, BlurPool, ChannelsLast, ColOut, LabelSmoothing, MixUp, ProgressiveResizing,
- RandAugment, StochasticDepth)
-from composer.callbacks import CheckpointSaver, LRMonitor, SpeedMonitor
-from composer.datasets.utils import NormalizationFn, pil_image_collate
-from composer.loggers import WandBLogger
-from composer.loss import binary_cross_entropy_with_logits, soft_cross_entropy
-from composer.metrics import CrossEntropy
-from composer.models.tasks import ComposerClassifier
-from composer.optim import CosineAnnealingWithWarmupScheduler, DecoupledSGDW
-from composer.utils import dist
-
-logging.basicConfig()
-logging.getLogger().setLevel(logging.INFO)
-
-parser = argparse.ArgumentParser()
-
-# Dataloader arguments
-parser.add_argument('data_dir', help='Path to the directory containing the ImageNet-1k dataset', type=str)
-parser.add_argument('--train_crop_size', help='Training image crop size', type=int, default=224)
-parser.add_argument('--eval_resize_size', help='Evaluation image resize size', type=int, default=256)
-parser.add_argument('--eval_crop_size', help='Evaluation image crop size', type=int, default=224)
-parser.add_argument('--train_batch_size', help='Train dataloader per-device batch size', type=int, default=2048)
-parser.add_argument('--eval_batch_size', help='Validation dataloader per-device batch size', type=int, default=2048)
-
-# Model arguments
-parser.add_argument('--model_name',
- help='Name of the resnet model to train',
- default='resnet50',
- choices=['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'])
-parser.add_argument('--loss_name',
- help='Name of the loss function to use for training',
- default='cross_entropy',
- choices=['cross_entropy', 'binary_cross_entropy'])
-
-# Optimizer arguments
-parser.add_argument('--learning_rate', help='Optimizer learning rate', type=float, default=2.048)
-parser.add_argument('--momentum', help='Optimizer momentum', type=float, default=0.875)
-parser.add_argument('--weight_decay', help='Optimizer weight decay', type=float, default=5.0e-4)
-
-# LR scheduler arguments
-parser.add_argument('--t_warmup',
- help='Duration of learning rate warmup specified as a Time string',
- type=Time.from_timestring,
- default='8ep')
-parser.add_argument('--t_max',
- help='Duration to cosine decay the learning rate specified as a Time string',
- type=Time.from_timestring,
- default='1dur')
-
-# Save checkpoint arguments
-parser.add_argument('--save_checkpoint_dir',
- help='Directory in which to save model checkpoints',
- type=str,
- default='checkpoints/{run_name}')
-parser.add_argument('--checkpoint_interval', help='Frequency to save checkpoints', type=str, default='1ep')
-
-# Load checkpoint arguments, assumes resuming the previous training run instead of fine-tuning
-parser.add_argument('--load_checkpoint_path', help='Path to the checkpoint to load', type=str)
-
-# Recipes
-parser.add_argument('--recipe_name',
- help='Either "mild", "medium" or "spicy" in order of increasing training time and accuracy',
- type=str,
- choices=['mild', 'medium', 'spicy'])
-
-# Logger parameters: progress bar logging is used by default
-# Only has Weights and Biases option to reduce the number of arguments. Other loggers can be substituted in the script
-parser.add_argument('--wandb_logger', help='Whether or not to log results to Weights and Biases', action='store_true')
-parser.add_argument('--wandb_entity', help='WandB entity name', type=str)
-parser.add_argument('--wandb_project', help='WandB project name', type=str)
-parser.add_argument('--wandb_run_name', help='WandB run name', type=str)
-
-# Trainer arguments
-parser.add_argument('--run_name', help='Name of the training run used for checkpointing and other logging', type=str)
-parser.add_argument('--seed', help='Random seed', type=int, default=17)
-parser.add_argument('--max_duration',
- help='Duration to train specified as a Time string',
- type=Time.from_timestring,
- default='90ep')
-parser.add_argument('--eval_interval',
- help='How frequently to run evaluation on the validation set specified as a Time string',
- type=Time.from_timestring,
- default='1ep')
-
-args = parser.parse_args()
-
-
-def _main():
-
- # Divide batch sizes by number of devices if running multi-gpu training
- if dist.get_world_size():
- args.train_batch_size = args.train_batch_size // dist.get_world_size()
- args.eval_batch_size = args.eval_batch_size // dist.get_world_size()
-
- # Scale by 255 since the collate `pil_image_collate` results in images in range 0-255
- # If using ToTensor() and the default collate, remove the scaling by 255
- IMAGENET_CHANNEL_MEAN = (0.485 * 255, 0.456 * 255, 0.406 * 255)
- IMAGENET_CHANNEL_STD = (0.229 * 255, 0.224 * 255, 0.225 * 255)
-
- # Train dataset
- logging.info('Building train dataloader')
- train_transforms = transforms.Compose([
- transforms.RandomResizedCrop(args.train_crop_size, scale=(0.08, 1.0), ratio=(0.75, 4.0 / 3.0)),
- transforms.RandomHorizontalFlip(),
- ])
- train_dataset = ImageFolder(os.path.join(args.data_dir, 'train'), train_transforms)
- # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup
- train_sampler = dist.get_sampler(train_dataset, drop_last=True, shuffle=True)
- train_dataloader = DataLoader(
- train_dataset,
- batch_size=args.train_batch_size,
- num_workers=8,
- pin_memory=True,
- drop_last=True,
- sampler=train_sampler,
- collate_fn=pil_image_collate,
- persistent_workers=True, # Reduce overhead of creating new workers at the expense of using slightly more RAM
- )
- # DataSpec allows for on-gpu transformations, marginally relieving dataloader bottleneck
- train_dataspec = DataSpec(dataloader=train_dataloader,
- device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, std=IMAGENET_CHANNEL_STD))
- logging.info('Built train dataloader\n')
-
- # Validation dataset
- logging.info('Building evaluation dataloader')
- eval_transforms = transforms.Compose([
- transforms.Resize(args.eval_resize_size),
- transforms.CenterCrop(args.eval_crop_size),
- ])
- eval_dataset = ImageFolder(os.path.join(args.data_dir, 'val'), eval_transforms)
- # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup,
- eval_sampler = dist.get_sampler(eval_dataset, drop_last=False, shuffle=False)
- eval_dataloader = DataLoader(
- eval_dataset,
- batch_size=args.eval_batch_size,
- num_workers=8,
- pin_memory=True,
- drop_last=False,
- sampler=eval_sampler,
- collate_fn=pil_image_collate,
- persistent_workers=True, # Reduce overhead of creating new workers at the expense of using slightly more RAM
- )
- eval_dataspec = DataSpec(dataloader=eval_dataloader,
- device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN, std=IMAGENET_CHANNEL_STD))
- logging.info('Built evaluation dataloader\n')
-
- # Instantiate torchvision ResNet model
- logging.info('Building Composer model')
- model_fn = getattr(resnet, args.model_name)
- model = model_fn(num_classes=1000, groups=1, width_per_group=64)
-
- # Specify model initialization
- def weight_init(w: torch.nn.Module):
- if isinstance(w, torch.nn.Linear) or isinstance(w, torch.nn.Conv2d):
- torch.nn.init.kaiming_normal_(w.weight)
- if isinstance(w, torch.nn.BatchNorm2d):
- w.weight.data = torch.rand(w.weight.data.shape)
- w.bias.data = torch.zeros_like(w.bias.data)
- # When using binary cross entropy, set the classification layer bias to -log(num_classes)
- # to ensure the initial probabilities are approximately 1 / num_classes
- if args.loss_name == 'binary_cross_entropy' and isinstance(w, torch.nn.Linear):
- w.bias.data = torch.ones(w.bias.shape) * -torch.log(torch.tensor(w.bias.shape[0]))
-
- model.apply(weight_init)
-
- # Performance metrics to log other than training loss
- train_metrics = MulticlassAccuracy(num_classes=1000, average='micro')
- val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=1000, average='micro')])
-
- # Cross entropy loss that can handle both index and one-hot targets
-
- if args.loss_name == 'binary_cross_entropy':
- loss_fn = binary_cross_entropy_with_logits
- else:
- loss_fn = soft_cross_entropy
-
- # Wrapper function to convert a classification PyTorch model into a Composer model
- composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn)
- logging.info('Built Composer model\n')
-
- # Optimizer
- logging.info('Building optimizer and learning rate scheduler')
- optimizer = DecoupledSGDW(composer_model.parameters(),
- lr=args.learning_rate,
- momentum=args.momentum,
- weight_decay=args.weight_decay)
-
- # Learning rate scheduler: LR warmup for 8 epochs, then cosine decay for the rest of training
- lr_scheduler = CosineAnnealingWithWarmupScheduler(t_warmup=args.t_warmup, t_max=args.t_max)
- logging.info('Built optimizer and learning rate scheduler\n')
-
- # Callbacks for logging
- logging.info('Building SpeedMonitor, LRMonitor, and CheckpointSaver callbacks')
- speed_monitor = SpeedMonitor(window_size=50) # Measures throughput as samples/sec and tracks total training time
- lr_monitor = LRMonitor() # Logs the learning rate
-
- # Callback for checkpointing
- checkpoint_saver = CheckpointSaver(folder=args.save_checkpoint_dir, save_interval=args.checkpoint_interval)
- logging.info('Built SpeedMonitor, LRMonitor, and CheckpointSaver callbacks\n')
-
- # Recipes for training ResNet architectures on ImageNet in order of increasing training time and accuracy
- # To learn about individual methods, check out "Methods Overview" in our documentation: https://docs.mosaicml.com/
- logging.info('Building algorithm recipes')
- if args.recipe_name == 'mild':
- algorithms = [
- BlurPool(),
- ChannelsLast(),
- EMA(half_life='100ba', update_interval='20ba'),
- ProgressiveResizing(initial_scale=0.5, delay_fraction=0.4, finetune_fraction=0.2),
- LabelSmoothing(smoothing=0.08),
- ]
- elif args.recipe_name == 'medium':
- algorithms = [
- BlurPool(),
- ChannelsLast(),
- EMA(half_life='100ba', update_interval='20ba'),
- ProgressiveResizing(initial_scale=0.5, delay_fraction=0.4, finetune_fraction=0.2),
- LabelSmoothing(smoothing=0.1),
- MixUp(alpha=0.2),
- SAM(rho=0.5, interval=10),
- ]
- elif args.recipe_name == 'spicy':
- algorithms = [
- BlurPool(),
- ChannelsLast(),
- EMA(half_life='100ba', update_interval='20ba'),
- ProgressiveResizing(initial_scale=0.6, delay_fraction=0.2, finetune_fraction=0.2),
- LabelSmoothing(smoothing=0.13),
- MixUp(alpha=0.25),
- SAM(rho=0.5, interval=5),
- ColOut(p_col=0.05, p_row=0.05),
- RandAugment(depth=1, severity=9),
- StochasticDepth(target_layer_name='ResNetBottleneck',
- stochastic_method='sample',
- drop_distribution='linear',
- drop_rate=0.1)
- ]
- else:
- algorithms = None
- logging.info('Built algorithm recipes\n')
-
- logger = None
- if args.wandb_logger:
- if args.wandb_entity is None:
- raise ValueError('Please specify --wandb_entity argument')
- if args.wandb_project is None:
- raise ValueError('Please specify --wandb_project argument')
- if args.wandb_run_name is None:
- raise ValueError('Please specify --wandb_run_name argument')
- logger = WandBLogger(entity=args.wandb_entity, project=args.wandb_project, name=args.wandb_run_name)
-
- # Create the Trainer!
- logging.info('Building Trainer')
- device = 'gpu' if torch.cuda.is_available() else 'cpu'
- precision = 'amp' if device == 'gpu' else 'fp32' # Mixed precision for fast training when using a GPU
- trainer = Trainer(run_name=args.run_name,
- model=composer_model,
- train_dataloader=train_dataspec,
- eval_dataloader=eval_dataspec,
- eval_interval=args.eval_interval,
- optimizers=optimizer,
- schedulers=lr_scheduler,
- algorithms=algorithms,
- loggers=logger,
- max_duration=args.max_duration,
- callbacks=[speed_monitor, lr_monitor, checkpoint_saver],
- load_path=args.load_checkpoint_path,
- device=device,
- precision=precision,
- device_train_microbatch_size='auto',
- seed=args.seed)
- logging.info('Built Trainer\n')
-
- # Start training!
- logging.info('Train!')
- trainer.fit()
-
-
-if __name__ == '__main__':
- _main()
diff --git a/examples/medical_image_segmentation.ipynb b/examples/medical_image_segmentation.ipynb
deleted file mode 100644
index d13f88fbea..0000000000
--- a/examples/medical_image_segmentation.ipynb
+++ /dev/null
@@ -1,725 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "# 🩺 Image Segmentation"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this notebook you will use Composer and PyTorch to segment pneumothorax (air around or outside of the lungs) from chest radiographic images. This dataset was originally released for a [kaggle competition][kaggle] by the [Society for Informatics in Medicine][siim] (SIIM).\n",
- "\n",
- "**Disclaimer: This example represents a minimal working baseline. In order to get competitive results this notebook must run for a long time.**\n",
- "\n",
- "### Recommended Background\n",
- "\n",
- "This tutorial goes through the process of starting a project from scratch with Composer. It assumes you are fairly familiar with how such a process might look if working with PyTorch. In addition, it assumes some familiarity with computer vision models and methods.\n",
- "\n",
- "To better understand the Composer part, make sure you're comfortable with the material in our [Getting Started][getting_started] tutorial.\n",
- "\n",
- "### Tutorial Goals and Concepts Covered\n",
- "\n",
- "The goal of this tutorial is to provide an executable example of a computer vision project in Composer from the ground up.\n",
- "\n",
- "We will cover:\n",
- "\n",
- "- installing relevant packages\n",
- "- downloading the SIIM dataset from kaggle\n",
- "- cleaning and resampling the dataset\n",
- "- splitting data for validation\n",
- "- visualizing model inputs\n",
- "- training a baseline model with Composer\n",
- "- using Composer methods\n",
- "- next steps\n",
- "\n",
- "Let's get started!\n",
- "\n",
- "[kaggle]: https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/overview\n",
- "[siim]: https://siim.org/\n",
- "[getting_started]: https://docs.mosaicml.com/projects/composer/en/stable/examples/getting_started.html"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Setup\n",
- "\n",
- "Let's get started and configure our environment.\n",
- "\n",
- "### Install Dependencies\n",
- "\n",
- "If you haven't already, let's install the following dependencies, which are needed for this example:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%pip install kaggle pydicom git+https://github.com/qubvel/segmentation_models.pytorch opencv-python-headless jupyterlab-widgets\n",
- "\n",
- "%pip install mosaicml\n",
- "# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
- "# %pip install git+https://github.com/mosaicml/composer.git"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Kaggle Authentication\n",
- "\n",
- "To access the data you need a Kaggle Account\n",
- "- accept competition terms https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/data\n",
- "- download `kaggle.json` from https://www.kaggle.com/yourusername/account by clicking \"Create new API token\"\n",
- "- make the `kaggle.json` file available to this notebook using the following code cells."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from ipywidgets import FileUpload\n",
- "from IPython.display import display\n",
- "uploader = FileUpload(accept='.json', multiple=True)\n",
- "display(uploader)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "kaggle_folder = os.path.join(os.path.expanduser(\"~\"), \".kaggle\")\n",
- "os.makedirs(kaggle_folder, exist_ok=True)\n",
- "kaggle_config_file = os.path.join(kaggle_folder, \"kaggle.json\")\n",
- "with open(kaggle_config_file, 'wb+') as output_file: \n",
- " for uploaded_filename in uploader.value:\n",
- " content = uploader.value[uploaded_filename]['content'] \n",
- " output_file.write(content) "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Download and unzip the data \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!kaggle datasets download -d seesee/siim-train-test\n",
- "!unzip -q siim-train-test.zip -d .\n",
- "!ls"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Flatten Image Directories\n",
- "The original dataset is oddly nested. We flatten it out so the images are easier to access in our pytorch dataset.\n",
- "\n",
- "`/siim/dicom-images-train/id/id/id.dcm` to `/siim/dicom-images-train/id.dcm`. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pathlib import Path\n",
- "from tqdm.auto import tqdm\n",
- "\n",
- "train_images = list(Path('siim/dicom-images-train').glob('*/*/*.dcm'))\n",
- "for image in tqdm(train_images):\n",
- " image.replace(f'siim/dicom-images-train/{image.parts[-1]}')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Project setup"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Imports"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import itertools\n",
- "from ipywidgets import interact, fixed, IntSlider\n",
- "\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import torch\n",
- "from torch import nn\n",
- "import matplotlib.pyplot as plt\n",
- "import cv2\n",
- "\n",
- "# model\n",
- "import segmentation_models_pytorch as smp\n",
- "\n",
- "# data\n",
- "from torch.utils.data import DataLoader, Dataset\n",
- "from torchvision.utils import draw_segmentation_masks, make_grid\n",
- "from pydicom.filereader import dcmread\n",
- "from sklearn.model_selection import StratifiedKFold\n",
- "\n",
- "# transforms\n",
- "from albumentations import ShiftScaleRotate, Resize, Compose\n",
- "\n",
- "from torchmetrics import Metric\n",
- "from torchmetrics.collections import MetricCollection\n",
- "\n",
- "# composer\n",
- "from composer import Trainer\n",
- "from composer.models import ComposerModel\n",
- "from composer.optim import DecoupledAdamW\n",
- "from composer.metrics.metrics import Dice"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Utils\n",
- "\n",
- "Here we define some utility functions to help with logging, decoding/encoding targets, and visualization."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "class LossMetric(Metric):\n",
- " \"\"\"Turns any torch.nn Loss Module into distributed torchmetrics Metric.\"\"\"\n",
- "\n",
- " def __init__(self, loss, dist_sync_on_step=False):\n",
- " super().__init__(dist_sync_on_step=dist_sync_on_step)\n",
- " self.loss = loss\n",
- " self.add_state(\"sum_loss\", default=torch.tensor(0.), dist_reduce_fx=\"sum\")\n",
- " self.add_state(\"total_batches\", default=torch.tensor(0), dist_reduce_fx=\"sum\")\n",
- "\n",
- " def update(self, preds, target):\n",
- " \"\"\"Update the state with new predictions and targets.\n",
- " \"\"\"\n",
- " # Loss calculated over samples/batch, accumulate loss over all batches\n",
- " self.sum_loss += self.loss(preds, target)\n",
- " self.total_batches += 1\n",
- "\n",
- " def compute(self):\n",
- " \"\"\"Aggregate state over all processes and compute the metric.\n",
- " \"\"\"\n",
- " # Return average loss over entire validation dataset\n",
- " return self.sum_loss / self.total_batches\n",
- "\n",
- "def rle2mask(rle, height=1024, width=1024, fill_value=1):\n",
- " mask = np.zeros((height, width), np.float32)\n",
- " mask = mask.reshape(-1)\n",
- " rle = np.array([int(s) for s in rle.strip().split(' ')])\n",
- " rle = rle.reshape(-1, 2)\n",
- " start = 0\n",
- " for index, length in rle:\n",
- " start = start+index\n",
- " end = start+length\n",
- " mask[start: end] = fill_value\n",
- " start = end\n",
- " mask = mask.reshape(width, height).T\n",
- " return mask\n",
- "\n",
- "def mask2rle(mask):\n",
- " mask = mask.T.flatten()\n",
- " start = np.where(mask[1:] > mask[:-1])[0]+1\n",
- " end = np.where(mask[:-1] > mask[1:])[0]+1\n",
- " length = end-start\n",
- " rle = []\n",
- " for i in range(len(length)):\n",
- " if i == 0:\n",
- " rle.extend([start[0], length[0]])\n",
- " else:\n",
- " rle.extend([start[i]-end[i-1], length[i]])\n",
- " rle = ' '.join([str(r) for r in rle])\n",
- " return rle"
- ]
- },
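
As a quick sanity check on `rle2mask` and `mask2rle`, a decode/encode round trip should reproduce the original RLE string. A minimal sketch, assuming the utility cell above has been run (the RLE string itself is made up for illustration):

```python
# Round-trip check: decode a run-length string to a mask, then re-encode it.
rle = '10 5 20 3'  # made-up example: a 5-pixel run, then a 3-pixel run starting 20 pixels after the first run ends
mask = rle2mask(rle, height=1024, width=1024)
assert mask.sum() == 5 + 3        # total foreground pixels
assert mask2rle(mask) == rle      # re-encoding recovers the original string
```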
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Preprocessing and Data Science"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### SIIM Dataset\n",
- "\n",
- "The SIIM dataset consists of:\n",
- "- `dicom-images-train` - 12954 labeled images in [DICOM][dicom] format.\n",
- "- `dicom-images-test` - 3205 unlabeled DICOM images for testing\n",
- "\n",
- "- `train-rle.csv` comes with a label file `train-rle.csv` mapping `ImageId` to `EncodedPixels`.\n",
- "\n",
- " - `ImageId`s map to image paths for [DICOM][dicom_format] format images. \n",
- "\n",
- " - `EncodedPixels` are [run length encoded][masks] segmentation masks representing areas where pneumothorax has been labeled by an expert. A label of `\"-1\"` indicates the image was examined and no pneumothorax was found.\n",
- "\n",
- "[dicom]: https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom\n",
- "[dicom_format]: https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom.html#sphx-glr-auto-examples-input-output-plot-read-dicom-py\n",
- "[masks]: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/mask.py"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!ls siim"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "labels_df = pd.read_csv('siim/train-rle.csv')\n",
- "labels_df.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Clean Data\n",
- "Of the ~13,000 images, only 3600 have masks. We will throw out some of the negative samples to better balance our dataset and speed up training."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "labels_df[labels_df[\" EncodedPixels\"] != \"-1\"].shape, labels_df[labels_df[\" EncodedPixels\"] == \"-1\"].shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def balance_labels(labels_df, extra_samples_without_mask=1500, random_state=1337):\n",
- " \"\"\"\n",
- " Drop duplicates and mark samples with masks.\n",
- " Sample 3576+extra_samples_without_mask unmasked samples to balance dataset.\n",
- " \"\"\"\n",
- " df = labels_df.drop_duplicates('ImageId')\n",
- " df_with_mask = df[df[\" EncodedPixels\"] != \"-1\"].copy(deep=True)\n",
- " df_with_mask['has_mask'] = 1\n",
- " df_without_mask = df[df[\" EncodedPixels\"] == \"-1\"].copy(deep=True)\n",
- " df_without_mask['has_mask'] = 0\n",
- " df_without_mask_sampled = df_without_mask.sample(len(df_with_mask)+extra_samples_without_mask, random_state=random_state)\n",
- " df = pd.concat([df_with_mask, df_without_mask_sampled])\n",
- " return df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = balance_labels(labels_df)\n",
- "df.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create Cross Validation Splits\n",
- "Once cleaned and balanced, we're left with only 6838 images. This will leave us with rather small training and validation sets once we split the data. To mitigate the chances of us validating on a poorly sampled (not representative of our unlabeled test data) validation set, we use [StratifiedKFold][kfold] to create 5 different 80%-20%, `train` `eval` splits. \n",
- "\n",
- "**Note**: For datasets of this size, it's good practice to train and evaluate on each split, but due to runtime constraints in this notebook we will only train on the first split which contains 5470 training and 1368 eval samples.\n",
- "\n",
- "[kfold]: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)\n",
- "train_idx, eval_idx = list(kfold.split(df[\"ImageId\"], df[\"has_mask\"]))[0]\n",
- "train_df, eval_df = df.iloc[train_idx], df.iloc[eval_idx]\n",
- "train_df.shape, eval_df.shape"
- ]
- },
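
As the note above says, training on every split is good practice when time allows. Here is a hedged sketch of that loop, reusing `df` and `kfold` from the cells above; `run_fold` is a hypothetical helper standing in for the dataloader/trainer setup shown later:

```python
# Sketch: iterate over all 5 folds instead of only the first.
for fold, (train_idx, eval_idx) in enumerate(kfold.split(df["ImageId"], df["has_mask"])):
    fold_train_df, fold_eval_df = df.iloc[train_idx], df.iloc[eval_idx]
    print(f"fold {fold}: {len(fold_train_df)} train / {len(fold_eval_df)} eval samples")
    # run_fold(fold_train_df, fold_eval_df)  # hypothetical: build dataloaders + Trainer as below
```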
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## PyTorch\n",
- "\n",
- "### PyTorch Dataset\n",
- "`SIIMDataset` is a standard PyTorch dataset that reads images and decodes labels from the siim label csv. DICOM images are loaded as grayscale numpy arrays, converted to rgb, and scaled. Labels are converted from rle strings to binary segmentation masks. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "class SIIMDataset(Dataset):\n",
- " def __init__(self, \n",
- " labels_df,\n",
- " transforms=None,\n",
- " image_dir=Path('siim/dicom-images-train')):\n",
- " self.labels_df = labels_df\n",
- " self.image_dir = image_dir\n",
- " self.transforms = transforms\n",
- "\n",
- " def __getitem__(self, idx):\n",
- " row = self.labels_df.iloc[idx]\n",
- " image_id = row.ImageId\n",
- " image_path = self.image_dir / f'{image_id}.dcm'\n",
- " image = dcmread(image_path).pixel_array # load dicom image\n",
- " image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) # convert rgb so we can keep imagenet first layer weights\n",
- " image = (image / 255.).astype('float32') # scale (0.- 1.)\n",
- "\n",
- " rle = row[' EncodedPixels']\n",
- " if rle != '-1':\n",
- " mask = rle2mask(rle, 1024, 1024).astype('float32')\n",
- " else:\n",
- " mask = np.zeros([1024, 1024]).astype('float32')\n",
- "\n",
- " if self.transforms:\n",
- " augmented = self.transforms(image=image, mask=mask)\n",
- " image = augmented['image']\n",
- " mask = augmented['mask']\n",
- "\n",
- " return (\n",
- " torch.from_numpy(image).permute(2, 0, 1),\n",
- " torch.from_numpy(mask).unsqueeze(0)\n",
- " )\n",
- "\n",
- " def __len__(self):\n",
- " return len(self.labels_df)"
- ]
- },
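
Before wiring the dataset into dataloaders, a quick smoke test (assuming the flattened `siim/dicom-images-train` directory from earlier exists) confirms the expected tensor shapes:

```python
# Smoke test: fetch one raw sample (no transforms) and check shapes.
ds = SIIMDataset(train_df)
image, mask = ds[0]
print(image.shape, mask.shape)  # torch.Size([3, 1024, 1024]) torch.Size([1, 1024, 1024])
```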
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Transforms\n",
- "We use the [albumentations](https://albumentations.ai/docs/getting_started/mask_augmentation/) library to resize and randomly scale/rotate our training images. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "image_size = 512\n",
- "\n",
- "train_transforms = Compose(\n",
- " [\n",
- " Resize(image_size, image_size),\n",
- " ShiftScaleRotate(\n",
- " shift_limit=0,\n",
- " scale_limit=0.1,\n",
- " rotate_limit=10, # rotate\n",
- " p=0.5,\n",
- " border_mode=cv2.BORDER_CONSTANT\n",
- " )\n",
- " ]\n",
- ")\n",
- "\n",
- "eval_transforms = Compose([Resize(image_size, image_size)])\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### DataLoaders"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "train_batch_size = 32\n",
- "val_batch_size = 32\n",
- "\n",
- "train_dataloader = DataLoader(SIIMDataset(train_df, transforms=train_transforms),\n",
- " batch_size=train_batch_size, shuffle=True, num_workers=2)\n",
- "\n",
- "eval_dataloader = DataLoader(SIIMDataset(eval_df, transforms=eval_transforms),\n",
- " batch_size=val_batch_size, shuffle=False, num_workers=2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Visualize batch\n",
- "Areas of pneumothorax are highlighted in red; drag the slider to iterate through batches."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "@interact(data_loader=fixed(train_dataloader), batch=IntSlider(min=0, max=len(train_dataloader)-1, step=1, value=0))\n",
- "def show_batch(data_loader, batch):\n",
- " plt.rcParams['figure.figsize'] = [20, 15]\n",
- "\n",
- " images, masks = list(itertools.islice(data_loader, batch, batch+1))[0]\n",
- " masks_list = []\n",
- " for image, mask in zip(images, masks):\n",
- " masked = draw_segmentation_masks((image * 255).byte(),\n",
- " mask.bool(), alpha=0.5, colors='red')\n",
- " masks_list.append(masked)\n",
- "\n",
- " grid = make_grid(masks_list, nrow=6)\n",
- " plt.imshow(grid.permute(1, 2, 0));"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Composer\n",
- "\n",
- "### Model\n",
- "\n",
- "Here we define a Composer model that wraps the smp [segmentation models pytorch][pytorch_seg] package. This lets us quickly create many different segmentation models made from common pre-trained PyTorch encoders. \n",
- "\n",
- "- We set defaults to create a [Unet][unet] from an ImageNet pre-trained ResNet-34 with 3 input channels for our RGB (converted) inputs and 1 output channel. \n",
- "- We set the default loss to `nn.BCEWithLogitsLoss()` to classify each pixel of the output.\n",
- "\n",
- "[pytorch_seg]: https://github.com/qubvel/segmentation_models.pytorch\n",
- "[unet]: https://arxiv.org/abs/1505.04597"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "class SMPUNet(ComposerModel):\n",
- " def __init__(self,\n",
- " encoder_name='resnet34',\n",
- " encoder_weights='imagenet',\n",
- " in_channels=3, classes=1,\n",
- " loss=nn.BCEWithLogitsLoss()):\n",
- " super().__init__()\n",
- " self.model = smp.Unet(\n",
- " encoder_name=encoder_name,\n",
- " encoder_weights=encoder_weights, # use `imagenet` pre-trained weights for encoder initialization\n",
- " in_channels=in_channels, # model input channels (1 for gray-scale images, 3 for RGB, etc.)\n",
- " classes=classes # model output channels (number of classes in your dataset)\n",
- " ) \n",
- "\n",
- " self.criterion = loss\n",
- " self.train_loss = LossMetric(loss)\n",
- " self.val_loss = LossMetric(loss)\n",
- " self.val_dice = Dice(num_classes=classes)\n",
- "\n",
- " def forward(self, batch):\n",
- " images, targets = batch\n",
- " return self.model(images)\n",
- "\n",
- " def loss(self, outputs, batch):\n",
- " _, targets = batch\n",
- " return self.criterion(outputs, targets)\n",
- "\n",
- " def get_metrics(self, is_train: bool = False):\n",
- " if is_train:\n",
- " return {'BCEWithLogitsLoss': self.train_loss}\n",
- " else:\n",
- " return {'BCEWithLogitsLoss': self.val_loss, 'Dice': self.dice}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = SMPUNet() # define unet model\n",
- "optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Trainer"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "trainer = Trainer(\n",
- " model=model,\n",
- " train_dataloader=train_dataloader,\n",
- " eval_dataloader=eval_dataloader,\n",
- " max_duration='2ep',\n",
- " optimizers=optimizer,\n",
- " device='gpu',\n",
- " precision='amp',\n",
- " seed=1337\n",
- ")\n",
- "trainer.fit()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Algorithms\n",
- "\n",
- "Composer allows us to quickly experiment with algorithms that can speed up or improve the quality of our model. This is how we can add `CutOut` and `LabelSmoothing`\n",
- "\n",
- "Additionally, the Composer trainer has builtin support for automatic mixed precision training and gradient accumulation to help train quickly and simulate larger batch sizes."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from composer.algorithms import CutOut, LabelSmoothing\n",
- "\n",
- "model = SMPUNet() # define unet model\n",
- "optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)\n",
- "\n",
- "algorithms = [CutOut(length=0.5), LabelSmoothing(smoothing=0.1)]\n",
- "\n",
- "trainer = Trainer(\n",
- " model=model,\n",
- " train_dataloader=train_dataloader,\n",
- " eval_dataloader=eval_dataloader,\n",
- " max_duration='2ep',\n",
- " optimizers=optimizer,\n",
- " algorithms=algorithms,\n",
- " device='gpu',\n",
- " precision='amp',\n",
- " seed=1337\n",
- ")\n",
- "trainer.fit()"
- ]
- },
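
The cell above shows the algorithms but not the gradient accumulation mentioned earlier. A minimal sketch of enabling it, assuming this Composer version accepts `device_train_microbatch_size` (treat that argument as an assumption if your version differs):

```python
# Sketch: let Composer pick a microbatch size that fits in GPU memory while the
# optimizer still steps on the full 32-sample batches.
model = SMPUNet()
optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)

trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    max_duration='2ep',
    optimizers=optimizer,
    device='gpu',
    precision='amp',
    device_train_microbatch_size='auto',  # assumption: supported by this Composer version
    seed=1337
)
```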
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "## What next?\n",
- "\n",
- "You've now seen a from-scratch demonstration of using Composer in a computer vision project. But don't stop here! If you're interested, we recommend that you continue to experiment with:\n",
- "\n",
- "- training longer\n",
- "- different loss functions, architectures, transformations, and\n",
- "- different combinations of composer methods!\n",
- "\n",
- "In addition, please continue to explore our tutorials! Here are a couple suggestions:\n",
- "\n",
- "* Continue to explore more advanced applications of Composer like [fine-tuning a transformer for sentiment classification][huggingface_tutorial].\n",
- "\n",
- "* Learn about callbacks and how to apply [early stopping][early_stopping_tutorial].\n",
- "\n",
- "* See how dataloading bottlenecks in computer vision can be addressed using [FFCV][ffcv].\n",
- "\n",
- "[image_segmentation_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/medical_image_segmentation.html\n",
- "[huggingface_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/huggingface_models.html\n",
- "[early_stopping_tutorial]: https://docs.mosaicml.com/projects/composer/en/stable/examples/early_stopping.html\n",
- "[ffcv]: https://docs.mosaicml.com/projects/composer/en/stable/examples/ffcv_dataloaders.html"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Come get involved with MosaicML!\n",
- "\n",
- "We'd love for you to get involved with the MosaicML community in any of these ways:\n",
- "\n",
- "### [Star Composer on GitHub](https://github.com/mosaicml/composer)\n",
- "\n",
- "Help make others aware of our work by [starring Composer on GitHub](https://github.com/mosaicml/composer).\n",
- "\n",
- "### [Join the MosaicML Slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg)\n",
- "\n",
- "Head on over to the [MosaicML slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg) to join other ML efficiency enthusiasts. Come for the paper discussions, stay for the memes!\n",
- "\n",
- "### Contribute to Composer\n",
- "\n",
- "Is there a bug you noticed or a feature you'd like? File an [issue](https://github.com/mosaicml/composer/issues) or make a [pull request](https://github.com/mosaicml/composer/pulls)!"
- ]
- }
- ],
- "metadata": {
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
diff --git a/examples/profiler_demo.py b/examples/profiler_demo.py
index f06fa17f06..d46c89e559 100644
--- a/examples/profiler_demo.py
+++ b/examples/profiler_demo.py
@@ -8,11 +8,13 @@
# [imports-start]
import torch
+import torch.nn as nn
+import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from composer import Trainer
-from composer.models import mnist_model
+from composer.models.tasks import ComposerClassifier
from composer.profiler import JSONTraceHandler, cyclic_schedule
from composer.profiler.profiler import Profiler
@@ -35,10 +37,39 @@
persistent_workers=True,
num_workers=8,
)
+
# [dataloader-end]
+
# Instantiate Model
-model = mnist_model(num_classes=10)
+class Model(nn.Module):
+ """Toy convolutional neural network architecture in pytorch for MNIST."""
+
+ def __init__(self, num_classes: int = 10):
+ super().__init__()
+
+ self.num_classes = num_classes
+
+ self.conv1 = nn.Conv2d(1, 16, (3, 3), padding=0)
+ self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=0)
+ self.bn = nn.BatchNorm2d(32)
+ self.fc1 = nn.Linear(32 * 16, 32)
+ self.fc2 = nn.Linear(32, num_classes)
+
+ def forward(self, x):
+ out = self.conv1(x)
+ out = F.relu(out)
+ out = self.conv2(out)
+ out = self.bn(out)
+ out = F.relu(out)
+ out = F.adaptive_avg_pool2d(out, (4, 4))
+ out = torch.flatten(out, 1, -1)
+ out = self.fc1(out)
+ out = F.relu(out)
+ return self.fc2(out)
+
+
+model = ComposerClassifier(module=Model(num_classes=10))
# [trainer-start]
# Instantiate the trainer
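
For context on the new imports above, here is a hedged sketch of how this demo typically wires the profiler into the trainer; the folder name and schedule values are illustrative assumptions, not the file's actual settings:

```python
# Sketch: record Composer traces as JSON, profiling 4 active batches per cycle.
trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    max_duration='2ep',
    profiler=Profiler(
        trace_handlers=[JSONTraceHandler(folder='composer_traces', overwrite=True)],  # illustrative folder
        schedule=cyclic_schedule(wait=0, warmup=1, active=4, repeat=1),
    ),
)
trainer.fit()
```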
diff --git a/examples/segmentation/README.md b/examples/segmentation/README.md
deleted file mode 100644
index 8eaa391184..0000000000
--- a/examples/segmentation/README.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Semantic Segmentation Example
-
-This example illustrates how to train a semantic segmentation model in composer.
-
-## Installation
-
-First, install [Composer](https://github.com/mosaicml/composer) with `pip install mosaicml`. Additionally, our models are pulled from [MMsegmentation](https://github.com/open-mmlab/mmsegmentation), so follow the [MMCV install instructions](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) (which depend on your CUDA and PyTorch versions), then install MMsegmentation with `pip install mmsegmentation`.
-
-Alternatively, we have publicly available Docker images to reproduce our results. Use `mosaicml/pytorch_vision:1.12.1_cu116-python3.9-ubuntu20.04` for running on GPUs or `mosaicml/pytorch_vision:1.12.1_cpu-python3.9-ubuntu20.04` for running on CPUs.
-
-## DeepLabv3+ on ADE20k
-
-The `train_deeplabv3_ade20k.py` script trains a DeepLabv3+ model with either a ResNet-50 or ResNet-101 backbone on the ADE20k semantic segmentation benchmark. To download ADE20k locally (~1 GB), specify the `--download` option when running the script; the dataset will then be downloaded to the data directory path, i.e. the first argument.
-
-We designed the script to be hackable, so try our recipes on your own models and datasets!
-### Example configurations
-
-
-
-```bash
-# Downloads ADE20k and does single GPU/CPU training depending on torch.cuda.is_available():
-python train_deeplabv3_ade20k.py /path/to/ade20k --download
-
-# Log experiments to Weights and Biases:
-python train_deeplabv3_ade20k.py /path/to/ade20k --wandb_logger --wandb_entity my_username --wandb_project my_project --run_name my_run_name
-
-# Single/Multi GPU training (infers the number of GPUs available):
-composer train_deeplabv3_ade20k.py /path/to/ade20k
-
-# Manually specify number of GPUs to use:
-composer -n $N_GPUS train_deeplabv3_ade20k.py /path/to/ade20k
-
-# Mild DeepLabv3+ recipe for fastest training to 45.6 mIoU:
-composer train_deeplabv3_ade20k.py /path/to/ade20k/ --recipe_name mild --max_duration 25ep
-
-# Medium DeepLabv3+ recipe for highest mIoU (49.15) with similar training time as baseline:
-composer train_deeplabv3_ade20k.py /path/to/ade20k/ --recipe_name medium --max_duration 90ep
-
-# Hot DeepLabv3+ recipe for highest mIoU (49.83) with a long training schedule:
-composer train_deeplabv3_ade20k.py /path/to/ade20k --recipe_name hot --max_duration 256ep
-```
diff --git a/examples/segmentation/train_deeplabv3_ade20k.py b/examples/segmentation/train_deeplabv3_ade20k.py
deleted file mode 100644
index 90d93aa037..0000000000
--- a/examples/segmentation/train_deeplabv3_ade20k.py
+++ /dev/null
@@ -1,367 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Example script to train a DeepLabv3+ model on ADE20k for semantic segmentation."""
-
-import argparse
-import logging
-import os
-
-import torch
-import torchvision
-from torch.utils.data import DataLoader
-from torchmetrics import MetricCollection
-from torchvision import transforms
-from torchvision.transforms.functional import InterpolationMode
-
-from composer import DataSpec, Time, Trainer
-from composer.algorithms import EMA, SAM, ChannelsLast, MixUp
-from composer.callbacks import CheckpointSaver, ImageVisualizer, LRMonitor, SpeedMonitor
-from composer.datasets.ade20k import (ADE20k, PadToSize, PhotometricDistoration, RandomCropPair, RandomHFlipPair,
- RandomResizePair)
-from composer.datasets.utils import NormalizationFn, pil_image_collate
-from composer.loggers import WandBLogger
-from composer.loss import DiceLoss, soft_cross_entropy
-from composer.metrics import CrossEntropy, MIoU
-from composer.models import ComposerClassifier
-from composer.models.deeplabv3.model import deeplabv3
-from composer.optim import CosineAnnealingScheduler, DecoupledSGDW
-from composer.utils import dist
-
-logging.basicConfig()
-logging.getLogger().setLevel(logging.INFO)
-
-parser = argparse.ArgumentParser()
-
-# Dataloader command-line arguments
-parser.add_argument('data_dir', help='Path to the directory containing the ADE20k dataset', type=str)
-parser.add_argument('--download',
- help='Use to download ADE20k from the internet and put it in the `data_dir`',
- action='store_true')
-parser.add_argument('--train_resize_size', help='Training image resize size', type=int, default=512)
-parser.add_argument('--eval_resize_size', help='Evaluation image resize size', type=int, default=512)
-parser.add_argument('--train_batch_size', help='Train dataloader per-device batch size', type=int, default=128)
-parser.add_argument('--eval_batch_size', help='Validation dataloader per-device batch size', type=int, default=128)
-
-# Model command-line arguments
-parser.add_argument('--backbone_arch',
- help='Architecture to use for the backbone.',
- default='resnet101',
- choices=['resnet50', 'resnet101'])
-parser.add_argument('--sync_bn',
- help='Use sync BatchNorm. Recommended if the per device microbatch size is below 16',
- action='store_true')
-parser.add_argument('--cross_entropy_weight', help='Weight to scale the cross entropy loss', type=float, default=0.375)
-parser.add_argument('--dice_weight', help='Weight to scale the dice loss', type=float, default=1.125)
-
-# Optimizer command-line arguments
-parser.add_argument('--learning_rate', help='Optimizer learning rate', type=float, default=0.08)
-parser.add_argument('--momentum', help='Optimizer momentum', type=float, default=0.9)
-parser.add_argument('--weight_decay', help='Optimizer weight decay', type=float, default=5.0e-5)
-
-# Save checkpoint command-line arguments
-parser.add_argument('--save_checkpoint_dir',
- help='Directory in which to save model checkpoints',
- type=str,
- default='checkpoints/{run_name}')
-parser.add_argument('--checkpoint_interval',
- help='Frequency to save checkpoints',
- type=Time.from_timestring,
- default='1ep')
-
-# Load checkpoint command-line arguments, assumes resuming from a previous training run (as opposed to fine-tuning)
-parser.add_argument('--load_checkpoint_path', help='Path to the checkpoint to load', type=str)
-
-# Recipes command-line argument
-parser.add_argument('--recipe_name',
- help='Algorithmic recipes to be applied to the trainer',
- choices=['mild', 'medium', 'hot'])
-
-# Logger command-line arguments
-# Note: Only Weights and Biases to minimize arguments. Other loggers can be used by adjusting the script
-parser.add_argument('--wandb_logger', help='Whether or not to log results to Weights and Biases', action='store_true')
-parser.add_argument('--wandb_entity', help='WandB entity name', type=str)
-parser.add_argument('--wandb_project', help='WandB project name', type=str)
-
-parser.add_argument('--image_viz', help='Whether or not to log images using ImageVisualizer', action='store_true')
-
-# Trainer arguments
-parser.add_argument('--device_train_microbatch_size',
- help='Size of train microbatch size if running on GPU',
- default='auto')
-parser.add_argument('--run_name', help='Name of the training run used for checkpointing and logging', type=str)
-parser.add_argument('--seed', help='Random seed', type=int, default=17)
-parser.add_argument('--max_duration',
- help='Duration to train specified as a Time string',
- type=Time.from_timestring,
- default='128ep')
-
-args = parser.parse_args()
-
-IMAGENET_CHANNEL_MEAN = (int(0.485 * 255), int(0.456 * 255), int(0.406 * 255))
-IMAGENET_CHANNEL_STD = (int(0.229 * 255), int(0.224 * 255), int(0.225 * 255))
-
-ADE20K_URL = 'http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip'
-ADE20K_FILE = 'ADEChallengeData2016.zip'
-
-
-def _main():
- # Divide batch size by number of devices
- if dist.get_world_size() > 1:
- args.train_batch_size = args.train_batch_size // dist.get_world_size()
- args.eval_batch_size = args.eval_batch_size // dist.get_world_size()
-
- # Train dataset code
- logging.info('Building train dataloader')
-
- if args.download:
- torchvision.datasets.utils.download_and_extract_archive(url=ADE20K_URL,
- download_root=args.data_dir,
- filename=ADE20K_FILE,
- remove_finished=True)
- # Adjust the data_dir to include the extracted directory
- args.data_dir = os.path.join(args.data_dir, 'ADEChallengeData2016')
-
- # Training transforms applied to both the image and target
- train_both_transforms = torch.nn.Sequential(
- RandomResizePair(
- min_scale=0.5,
- max_scale=2.0,
- base_size=(args.train_resize_size, args.train_resize_size),
- ),
- RandomCropPair(
- crop_size=(args.train_resize_size, args.train_resize_size),
- class_max_percent=0.75,
- num_retry=10,
- ),
- RandomHFlipPair(),
- )
-
- # Training transforms applied to the image only
- train_image_transforms = torch.nn.Sequential(
- PhotometricDistoration(
- brightness=32. / 255,
- contrast=0.5,
- saturation=0.5,
- hue=18. / 255,
- ),
- PadToSize(
- size=(args.train_resize_size, args.train_resize_size),
- fill=IMAGENET_CHANNEL_MEAN,
- ),
- )
-
- # Training transforms applied to the target only
- train_target_transforms = PadToSize(size=(args.train_resize_size, args.train_resize_size), fill=0)
-
- # Create ADE20k train dataset
- train_dataset = ADE20k(
- datadir=args.data_dir,
- split='training',
- image_transforms=train_image_transforms,
- target_transforms=train_target_transforms,
- both_transforms=train_both_transforms,
- )
-
- # Create ADE20k train dataloader
-
- train_sampler = None
- if dist.get_world_size():
- # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware setup
- train_sampler = dist.get_sampler(train_dataset, drop_last=True, shuffle=True)
-
- train_dataloader = DataLoader(
- train_dataset,
- batch_size=args.train_batch_size,
- num_workers=8,
- pin_memory=True,
- drop_last=True, # Prevents using a smaller batch at the end of an epoch
- sampler=train_sampler,
- collate_fn=pil_image_collate,
- persistent_workers=True,
- )
-
- # DataSpec enables image normalization to be performed on-GPU, marginally relieving dataloader bottleneck
- train_dataspec = DataSpec(dataloader=train_dataloader,
- device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN,
- std=IMAGENET_CHANNEL_STD,
- ignore_background=True))
- logging.info('Built train dataloader\n')
-
- # Validation dataset code
- logging.info('Building evaluation dataloader')
-
- # Validation image and target transformations
- image_transforms = transforms.Resize(size=(args.eval_resize_size, args.eval_resize_size),
- interpolation=InterpolationMode.BILINEAR)
- target_transforms = transforms.Resize(size=(args.eval_resize_size, args.eval_resize_size),
- interpolation=InterpolationMode.NEAREST)
-
- # Create ADE20k validation dataset
- val_dataset = ADE20k(datadir=args.data_dir,
- split='validation',
- both_transforms=None,
- image_transforms=image_transforms,
- target_transforms=target_transforms)
-
-    # Create ADE20k validation dataloader
-
- val_sampler = None
- if dist.get_world_size():
- # Nifty function to instantiate a PyTorch DistributedSampler based on your hardware
- val_sampler = dist.get_sampler(val_dataset, drop_last=False, shuffle=False)
-
- val_dataloader = DataLoader(
- val_dataset,
- batch_size=args.eval_batch_size,
- num_workers=8,
- pin_memory=True,
- drop_last=False,
- sampler=val_sampler,
- collate_fn=pil_image_collate,
- persistent_workers=True,
- )
-
- # DataSpec enables image normalization to be performed on-GPU, marginally relieving dataloader bottleneck
- val_dataspec = DataSpec(dataloader=val_dataloader,
- device_transforms=NormalizationFn(mean=IMAGENET_CHANNEL_MEAN,
- std=IMAGENET_CHANNEL_STD,
- ignore_background=True))
-    logging.info('Built validation dataloader\n')
-
- logging.info('Building Composer DeepLabv3+ model')
-
- # Create a DeepLabv3+ model
- model = deeplabv3(
- num_classes=150,
- backbone_arch=args.backbone_arch,
- backbone_weights='IMAGENET1K_V2',
- sync_bn=args.sync_bn,
- use_plus=True,
- )
-
- # Initialize the classifier head only since the backbone uses pre-trained weights
- def weight_init(module: torch.nn.Module):
- if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
- torch.nn.init.kaiming_normal_(module.weight)
- if isinstance(module, torch.nn.BatchNorm2d):
- torch.nn.init.ones_(module.weight)
- torch.nn.init.zeros_(module.bias)
-
- model.classifier.apply(weight_init) # type: ignore Does not recognize classifier as a torch.nn.Module
-
- # Loss function to use during training
- # This ignores index -1 since the NormalizationFn transformation sets the background class to -1
- dice_loss_fn = DiceLoss(softmax=True, batch=True, ignore_absent_classes=True)
-
- def combo_loss(output, target):
- loss = {}
- loss['cross_entropy'] = soft_cross_entropy(output, target, ignore_index=-1)
- loss['dice'] = dice_loss_fn(output, target)
- loss['total'] = args.cross_entropy_weight * loss['cross_entropy'] + args.dice_weight * loss['dice']
- return loss
-
- # Training and Validation metrics to log throughout training
- train_metrics = MetricCollection([CrossEntropy(ignore_index=-1), MIoU(num_classes=150, ignore_index=-1)])
- val_metrics = MetricCollection([CrossEntropy(ignore_index=-1), MIoU(num_classes=150, ignore_index=-1)])
-
- # Create a ComposerClassifier using the model, loss function, and metrics
- composer_model = ComposerClassifier(module=model,
- train_metrics=train_metrics,
- val_metrics=val_metrics,
- loss_fn=combo_loss)
-
- logging.info('Built Composer DeepLabv3+ model\n')
-
- logging.info('Building optimizer and learning rate scheduler')
- # Optimizer
- optimizer = DecoupledSGDW(composer_model.parameters(),
- lr=args.learning_rate,
- momentum=args.momentum,
- weight_decay=args.weight_decay)
-
- # Only use a LR schedule if no recipe is specified or if the hot recipe was specified
- lr_scheduler = None
- if args.recipe_name is None or args.recipe_name == 'hot':
- lr_scheduler = CosineAnnealingScheduler()
-
- logging.info('Built optimizer and learning rate scheduler')
-
- logging.info('Building callbacks: SpeedMonitor, LRMonitor, and CheckpointSaver')
- speed_monitor = SpeedMonitor(window_size=50) # Measures throughput as samples/sec and tracks total training time
- lr_monitor = LRMonitor() # Logs the learning rate
-
- # Callback for checkpointing
- checkpoint_saver = CheckpointSaver(folder=args.save_checkpoint_dir, save_interval=args.checkpoint_interval)
- logging.info('Built callbacks: SpeedMonitor, LRMonitor, and CheckpointSaver\n')
-
-    # Recipes for training DeepLabv3+ on ADE20k in order of increasing training time and accuracy
- # To learn about individual methods, check out "Methods Overview" in our documentation: https://docs.mosaicml.com/
- logging.info('Building algorithm recipes')
- if args.recipe_name == 'mild':
- algorithms = [
- ChannelsLast(),
- EMA(half_life='1000ba', update_interval='10ba'),
- ]
- elif args.recipe_name == 'medium':
- algorithms = [
- ChannelsLast(),
- EMA(half_life='1000ba', update_interval='10ba'),
- SAM(rho=0.3, interval=2),
- MixUp(alpha=0.2),
- ]
- elif args.recipe_name == 'hot':
- algorithms = [
- ChannelsLast(),
- EMA(half_life='2000ba', update_interval='1ba'),
- SAM(rho=0.3, interval=1),
- MixUp(alpha=0.5),
- ]
- else:
- algorithms = None
- logging.info('Built algorithm recipes\n')
-
-    # Weights and Biases logger, if specified on the command line
- logger = None
- if args.wandb_logger:
- logging.info('Building Weights and Biases logger')
- if args.wandb_entity is None:
- raise ValueError('Please specify --wandb_entity argument')
- if args.wandb_project is None:
- raise ValueError('Please specify --wandb_project argument')
- logger = WandBLogger(entity=args.wandb_entity, project=args.wandb_project)
- logging.info('Built Weights and Biases logger')
-
- callbacks = [speed_monitor, lr_monitor, checkpoint_saver]
- if args.image_viz:
- callbacks.append(ImageVisualizer(mode='segmentation'))
- # Create the Trainer!
- logging.info('Building Trainer')
- device = 'gpu' if torch.cuda.is_available() else 'cpu'
- precision = 'amp' if device == 'gpu' else 'fp32' # Mixed precision for fast training when using a GPU
- device_train_microbatch_size = 'auto' if device == 'gpu' else args.device_train_microbatch_size # If on GPU, use 'auto' gradient accumulation
- trainer = Trainer(run_name=args.run_name,
- model=composer_model,
- train_dataloader=train_dataspec,
- eval_dataloader=val_dataspec,
- eval_interval='1ep',
- optimizers=optimizer,
- schedulers=lr_scheduler,
- algorithms=algorithms,
- loggers=logger,
- max_duration=args.max_duration,
- callbacks=callbacks,
- load_path=args.load_checkpoint_path,
- device=device,
- precision=precision,
- device_train_microbatch_size=device_train_microbatch_size,
- seed=args.seed)
- logging.info('Built Trainer\n')
-
- # Start training!
- logging.info('Train!')
- trainer.fit()
-
-
-if __name__ == '__main__':
- _main()
diff --git a/pyproject.toml b/pyproject.toml
index 342c9b3d7e..1583440640 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,18 +82,15 @@ reportUnusedCoroutine = "error"
# Pytest
[tool.pytest.ini_options]
# By default, do not run gpu, vision, docs, notebook, or daily tests
-addopts = "--codeblocks --strict-markers -m 'not gpu and not vision and not doctest and not daily and not remote'"
+addopts = "--codeblocks --strict-markers -m 'not gpu and not doctest and not daily and not remote'"
markers = [
- # !!!!!!!!!!!IMPORTANT!!!!!!!!!: when updating the markers, also make sure to update meta.yaml
# Tests that require a world_size of two should be annotated with `@pytest.mark.world_size(2)`.
# If not specified, the test will be assumed to have a world-size of one, which is
# equivalent to `@pytest.mark.world_size(1)`
"world_size(val)",
# Tests that require a gpu should be annotated with `@pytest.mark.gpu`
"gpu",
- # Whether the test should run in a container based on the vision dockerimage, which contains ffcv and opencv
- "vision",
# Tests which are run as part of the documentation build
"doctest",
# Should be run during daily regression
@@ -151,6 +148,12 @@ filterwarnings = [
'''ignore:torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead:UserWarning''',
# Ignore torch sharded tensor deprecated warnings
'''ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning''',
+ # Ignore torch pytree deprecated warnings
+ '''ignore:torch.utils._pytree._register_pytree_node is deprecated.*:UserWarning''',
+ # Ignore autograd kernel warning inside DeepSpeed
+ '''ignore:.*an autograd kernel was not registered to the Autograd key.*:UserWarning''',
+ # Ignore save_state_dict / load_state_dict deprecation warnings
+ '''ignore:'.*_state_dict' is deprecated and will be removed in future versions.*:UserWarning'''
]
# Coverage
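
As the marker comments above describe, a distributed test opts in via markers on the test function; a minimal hypothetical example (the test name and body are made up):

```python
import pytest

@pytest.mark.gpu            # runs only in the GPU test job
@pytest.mark.world_size(2)  # requires two ranks
def test_two_rank_behavior():  # hypothetical test
    assert True
```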
diff --git a/scripts/ffcv/create_ffcv_datasets.py b/scripts/ffcv/create_ffcv_datasets.py
deleted file mode 100644
index 190974c762..0000000000
--- a/scripts/ffcv/create_ffcv_datasets.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Helper utilities to create FFCV datasets."""
-
-import logging
-import os
-import sys
-import textwrap
-from argparse import ArgumentParser
-from io import BytesIO
-from typing import Tuple
-
-import numpy as np
-import torch
-from PIL import Image
-from torch.utils.data import Subset
-from torchvision import transforms
-from torchvision.datasets import CIFAR10, ImageFolder
-from tqdm import tqdm
-
-from composer.datasets.ffcv_utils import write_ffcv_dataset
-
-log = logging.getLogger(__name__)
-
-
-def _get_parser():
- parser = ArgumentParser(description='Utility for converting datasets to ffcv format.')
-
- parser.add_argument('--dataset',
- type=str,
- default='cifar10',
- choices=['cifar10', 'imagenet1k'],
- help=textwrap.dedent("""\
- Dataset to use. Default: cifar10"""))
- parser.add_argument('--remote',
- type=str,
- help=textwrap.dedent("""\
-                            Remote directory (S3 or local filesystem) where dataset is stored. Example: s3://my-s3-bucket-name"""
- ))
- parser.add_argument('--local',
- type=str,
- default=None,
- help=textwrap.dedent("""\
- Local filesystem directory where dataset is cached during operation. Default: None"""))
- parser.add_argument('--split',
- type=str,
- default='train',
- choices=['train', 'val'],
- help=textwrap.dedent("""\
- Split to use. Default: train"""))
-
- parser.add_argument('--datadir',
- type=str,
- default=None,
- help=textwrap.dedent("""\
- Location of the dataset. Default: None"""))
-
- parser.add_argument('--download',
- type=bool,
- default=False,
- help=textwrap.dedent("""\
- Download the dataset if possible. Default: False"""))
-
- parser.add_argument('--write_path',
- type=str,
- default=None,
- help=textwrap.dedent("""\
- File path to use for writing the dataset. Default: /tmp/_.ffcv"""))
-
- parser.add_argument('--write_mode',
- type=str,
- default='proportion',
- choices=['raw', 'jpg', 'smart', 'proportion'],
- help=textwrap.dedent("""\
- Write mode to use. raw is uint8 values, jpg is jpeg compressed images, smart is
- compressing based on image size and proportion is according to the given
- compress_probability. Default: proportion"""))
-
-    parser.add_argument('--max_resolution', type=int, default=500, help='Max resolution for images.')
-
- parser.add_argument('--num_workers', type=int, default=64, help='Number of workers to use.')
-
- parser.add_argument('--chunk_size', type=int, default=100, help='Chunk size to use.')
-
- parser.add_argument('--jpeg_quality', type=int, default=90, help='Quality of jpeg.')
-
- parser.add_argument('--subset', type=int, default=-1, help='Only use a subset of dataset.')
-
- parser.add_argument('--compress_probability',
- type=float,
- required=False,
- default=0.50,
- help='Compress the given fraction of images to jpeg while writing the ffcv dataset.')
- return parser
-
-
-def _parse_args():
- parser = _get_parser()
-
- args = parser.parse_args()
-
- if args.datadir is not None:
- log.info(f'Will read from local directory: {args.datadir}.')
- else:
- if args.local is None:
- args.local = f'/tmp/mds-cache/mds-{args.dataset}/'
-
- if args.remote.startswith('s3://'):
- log.info(f'Will read from remote: {args.remote}.')
- else:
- log.info(f'Will read from local: {args.remote}.')
-
- if args.write_path is None:
- args.write_path = f'/tmp/{args.dataset}_{args.split}.ffcv'
-
- if os.path.exists(args.write_path):
- log.error(f'Destination already exists: {args.write_path}')
- sys.exit(-1)
-
- return args
-
-
-def _main():
- args = _parse_args()
-
- if args.dataset == 'cifar10':
- dataset = CIFAR10(root=args.datadir, train=(args.split == 'train'), download=args.download)
- elif args.dataset == 'imagenet1k':
- dataset = ImageFolder(os.path.join(args.datadir, args.split))
- else:
- raise ValueError(f'Unsupported dataset: {args.dataset}. Checkout the list of supported datasets with -h')
-
- if args.subset > 0:
- dataset = Subset(dataset, range(args.subset))
-
- write_ffcv_dataset(dataset=dataset,
- write_path=args.write_path,
- max_resolution=args.max_resolution,
- num_workers=args.num_workers,
- write_mode=args.write_mode,
- compress_probability=args.compress_probability,
- jpeg_quality=args.jpeg_quality,
- chunk_size=args.chunk_size)
-
-
-if __name__ == '__main__':
- sys.exit(_main())
diff --git a/setup.py b/setup.py
index 7322bdc49e..6600f716a7 100644
--- a/setup.py
+++ b/setup.py
@@ -76,10 +76,10 @@ def package_files(prefix: str, directory: str, extension: str):
install_requires = [
'pyyaml>=6.0,<7',
'tqdm>=4.62.3,<5',
- 'torchmetrics>=0.10.0,<1.1',
+ 'torchmetrics>=0.10.0,<1.3.1',
'torch_optimizer>=0.3.0,<0.4',
- 'torchvision>=0.13.1,<0.19',
- 'torch>=1.13.1,<2.2.1',
+ 'torchvision>=0.13.1,<0.20', # TODO: Tighten before release
+ 'torch>=2.0.1,<2.3.1', # TODO: Tighten before release
'requests>=2.26.0,<3',
'numpy>=1.21.5,<1.27.0',
'psutil>=5.8.0,<6',
@@ -88,7 +88,7 @@ def package_files(prefix: str, directory: str, extension: str):
'py-cpuinfo>=8.0.0,<10',
'packaging>=21.3.0,<23',
'importlib-metadata>=5.0.0,<7',
- 'mosaicml-cli>=0.5.25,<0.6',
+ 'mosaicml-cli>=0.5.25,<0.7',
]
extra_deps = {}
@@ -100,14 +100,13 @@ def package_files(prefix: str, directory: str, extension: str):
# Should manually update dependency versions occasionally.
'custom_inherit==2.4.1',
'junitparser==3.1.1',
- 'coverage[toml]==7.3.4',
+ 'coverage[toml]==7.4.1',
'fasteners==0.18', # object store tests require fasteners
'pytest==7.4.4',
- 'toml==0.10.2',
'ipython==8.11.0',
- 'ipykernel==6.28.0',
+ 'ipykernel==6.29.2',
'jupyter==1.0.0',
- 'yamllint==1.33.0',
+ 'yamllint==1.34.0',
'recommonmark==0.7.1',
'sphinx==4.4.0',
'pre-commit>=3.4.0,<4',
@@ -117,6 +116,11 @@ def package_files(prefix: str, directory: str, extension: str):
'sphinx_markdown_tables==0.0.17',
'sphinx-argparse==0.4.0',
'sphinxcontrib.katex==0.9.6',
+ 'sphinxcontrib-applehelp==1.0.0',
+ 'sphinxcontrib-devhelp==1.0.0',
+ 'sphinxcontrib-htmlhelp==2.0.0',
+ 'sphinxcontrib-serializinghtml==1.1.5',
+ 'sphinxcontrib-qthelp==1.0.0',
'sphinxext.opengraph==0.9.1',
'sphinxemoji==0.2.0',
'furo==2022.9.29',
@@ -130,7 +134,7 @@ def package_files(prefix: str, directory: str, extension: str):
'nbsphinx==0.9.1',
'pandoc==2.3',
'pypandoc==1.12',
- 'GitPython==3.1.40',
+ 'GitPython==3.1.41',
'moto[s3]>=4.0.1,<5',
'mock-ssh-server==0.9.1',
'cryptography==41.0.5',
@@ -138,10 +142,6 @@ def package_files(prefix: str, directory: str, extension: str):
'setuptools<=59.5.0',
]
-extra_deps['health_checker'] = {
- 'pynvml>=11.5.0,<12',
-}
-
extra_deps['system_metrics_monitor'] = {
'pynvml>=11.5.0,<12',
}
@@ -163,21 +163,12 @@ def package_files(prefix: str, directory: str, extension: str):
'comet_ml>=3.31.12,<4.0.0',
]
-extra_deps['tensorboard'] = [
- 'tensorboard>=2.9.1,<3.0.0',
-]
-
-extra_deps['unet'] = [
- 'monai>=0.9.1,<1.4',
- 'scikit-learn>=1.0.1,<2',
+extra_deps['neptune'] = [
+ 'neptune>=1.6.2,<2.0.0',
]
-extra_deps['vit'] = [
- 'vit_pytorch==1.6.1',
-]
-
-extra_deps['timm'] = [
- 'timm>=0.5.4,<0.6',
+extra_deps['tensorboard'] = [
+ 'tensorboard>=2.9.1,<3.0.0',
]
extra_deps['coco'] = [
@@ -185,10 +176,14 @@ def package_files(prefix: str, directory: str, extension: str):
]
extra_deps['nlp'] = [
- 'transformers>=4.11,<4.37,!=4.34.0',
+ 'transformers>=4.11,<4.38,!=4.34.0',
'datasets>=2.4,<3',
]
+extra_deps['peft'] = [
+ 'peft>=0.7.0,<0.8',
+]
+
extra_deps['sentencepiece'] = [
'protobuf<3.21',
'sentencepiece==0.1.99',
@@ -229,7 +224,7 @@ def package_files(prefix: str, directory: str, extension: str):
extra_deps['pandas'] = ['pandas>=2.0.0,<3.0']
-extra_deps['databricks'] = ['databricks-sdk>=0.15.0,<1.0']
+extra_deps['databricks'] = ['databricks-sdk==0.18.0']
extra_deps['all'] = {dep for deps in extra_deps.values() for dep in deps}
@@ -258,9 +253,9 @@ def package_files(prefix: str, directory: str, extension: str):
packages=setuptools.find_packages(exclude=['docker*', 'examples*', 'scripts*', 'tests*']),
classifiers=[
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
],
install_requires=install_requires,
entry_points={
@@ -272,7 +267,7 @@ def package_files(prefix: str, directory: str, extension: str):
},
extras_require=extra_deps,
dependency_links=['https://developer.download.nvidia.com/compute/redist'],
- python_requires='>=3.8',
+ python_requires='>=3.9',
ext_package='composer',
cmdclass={'develop': develop})
diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py
index 940ca040f2..91ecf2dac2 100644
--- a/tests/algorithms/algorithm_settings.py
+++ b/tests/algorithms/algorithm_settings.py
@@ -21,12 +21,11 @@
LabelSmoothing, LayerFreezing, LowPrecisionGroupNorm, LowPrecisionLayerNorm, MixUp,
NoOpModel, ProgressiveResizing, RandAugment, SelectiveBackprop, SeqLengthWarmup,
SqueezeExcite, StochasticDepth, WeightStandardization)
-from composer.models import composer_resnet
from composer.models.base import ComposerModel
from composer.utils import dist
from tests.common import get_module_subclasses
from tests.common.datasets import RandomImageDataset, SimpleDataset, dummy_bert_lm_dataloader, dummy_gpt_lm_dataloader
-from tests.common.models import (SimpleConvModel, SimpleModelWithDropout, configure_tiny_bert_hf_model,
+from tests.common.models import (SimpleConvModel, SimpleModelWithDropout, composer_resnet, configure_tiny_bert_hf_model,
configure_tiny_gpt2_hf_model)
simple_bert_settings = {
diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py
index d1fb4e2c40..9f243caeae 100644
--- a/tests/algorithms/test_algorithm_resumption.py
+++ b/tests/algorithms/test_algorithm_resumption.py
@@ -57,7 +57,7 @@ def test_algorithm_resumption(
'save_filename': 'ep{epoch}-rank{rank}',
'save_interval': '1ep',
'train_subset_num_batches': 2,
- 'precision': 'amp_fp16',
+ 'precision': 'amp_bf16',
}
train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True)
# train model once, saving checkpoints every epoch
@@ -117,6 +117,7 @@ def test_algorithm_resumption(
def _assert_checkpoints_equal(file1, file2):
+ # TODO: consider merging with _assert_checkpoints_equivalent
checkpoint1 = torch.load(file1)
checkpoint2 = torch.load(file2)
@@ -136,6 +137,10 @@ def _assert_checkpoints_equal(file1, file2):
del checkpoint1['state']['run_name']
del checkpoint2['state']['run_name']
+ # Remove all saved checkpoints to timestamp (accumulates between runs)
+ del checkpoint1['state']['callbacks']['CheckpointSaver']['all_saved_checkpoints_to_timestamp']
+ del checkpoint2['state']['callbacks']['CheckpointSaver']['all_saved_checkpoints_to_timestamp']
+
# Remove algorithm representations which are memory addresses
for i, algo_info in enumerate(checkpoint1['state']['algorithms']):
if '0x' in algo_info[1]['repr']:
diff --git a/tests/algorithms/test_alibi.py b/tests/algorithms/test_alibi.py
index 81617a5ade..c33bd58bff 100644
--- a/tests/algorithms/test_alibi.py
+++ b/tests/algorithms/test_alibi.py
@@ -93,7 +93,7 @@ def test_registry(caplog):
from composer.algorithms.alibi.attention_surgery_functions import policy_registry
@policy_registry.register(torch.nn.Linear)
- def zero_linear_weights( # pyright: reportUnusedFunction = none
+ def zero_linear_weights( # pyright: ignore[reportUnusedFunction]
module: torch.nn.Module, idx: int, max_sequence_length: int) -> torch.nn.Module:
assert isinstance(module, torch.nn.Linear)
old_weight = getattr(module, 'weight')
diff --git a/tests/algorithms/test_colout.py b/tests/algorithms/test_colout.py
index 9e71d2554c..007bd43fd1 100644
--- a/tests/algorithms/test_colout.py
+++ b/tests/algorithms/test_colout.py
@@ -1,8 +1,8 @@
# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
import functools
-from typing import Tuple
import numpy as np
import pytest
@@ -28,7 +28,8 @@ def verify_shape_image(orig: Image.Image, new: Image.Image, p_row: float, p_col:
assert (H_n, W_n) == (H_t, W_t), f'Image shape mismatch: {(H_n, W_n)} != {(H_t, W_t)}'
-def verify_shape_image_pair(orig_sample: Tuple[Image.Image, Image.Image], new_sample: Tuple[Image.Image, Image.Image],
+def verify_shape_image_pair(orig_sample: tuple[Image.Image, Image.Image],
+ new_sample: tuple[torch.Tensor, torch.Tensor] | tuple[Image.Image, Image.Image],
p_row: float, p_col: float):
"""Verify the shape of a pair of transformed PIL images."""
H_o, W_o = orig_sample[0].height, orig_sample[0].width
@@ -50,8 +51,8 @@ def verify_shape_tensor(orig: torch.Tensor, new: torch.Tensor, p_row: float, p_c
assert new.shape == (C, H_t, W_t), f'Image tensor shape mismatch: {new.shape} != {(C, H_t, W_t)}'
-def verify_shape_tensor_pair(orig_sample: Tuple[torch.Tensor, torch.Tensor],
- new_sample: Tuple[torch.Tensor, torch.Tensor], p_row: float, p_col: float) -> None:
+def verify_shape_tensor_pair(orig_sample: tuple[torch.Tensor, torch.Tensor],
+ new_sample: tuple[torch.Tensor, torch.Tensor], p_row: float, p_col: float) -> None:
"""Verify the shape of a transformed image tensor."""
C, H_o, W_o = orig_sample[0].shape
@@ -72,8 +73,8 @@ def verify_shape_batch(orig: torch.Tensor, new: torch.Tensor, p_row: float, p_co
assert new.shape == (N, C, H_t, W_t), f'Image batch shape mismatch: {new.shape} != {(N, C, H_t, W_t)}'
-def verify_shape_batch_pair(orig_sample: Tuple[torch.Tensor, torch.Tensor],
- new_sample: Tuple[torch.Tensor, torch.Tensor], p_row: float, p_col: float) -> None:
+def verify_shape_batch_pair(orig_sample: tuple[torch.Tensor, torch.Tensor],
+ new_sample: tuple[torch.Tensor, torch.Tensor], p_row: float, p_col: float) -> None:
"""Verify the shape of a transformed batch of images."""
N, C, H_o, W_o = orig_sample[0].shape
@@ -163,7 +164,7 @@ def test_image_pair_drop_size(self, fake_image: Image.Image, p_row: float, p_col
transform = ColOutTransform(p_row, p_col)
orig_sample = (fake_image, fake_image)
new_sample = transform(orig_sample)
- assert isinstance(new_sample, Tuple)
+ assert isinstance(new_sample, tuple)
verify_shape_image_pair(orig_sample, new_sample, p_row, p_col)
@pytest.mark.parametrize('W', [48])
@@ -228,7 +229,7 @@ def test_batch_pair_drop_size(self, fake_image_batch: torch.Tensor, p_row: float
colout = functools.partial(colout_batch, p_row=p_row, p_col=p_col)
sample = (fake_image_batch, fake_image_batch)
new_batch = colout(sample)
- assert isinstance(new_batch, Tuple) and isinstance(new_batch[0], torch.Tensor) and isinstance(
+ assert isinstance(new_batch, tuple) and isinstance(new_batch[0], torch.Tensor) and isinstance(
new_batch[1], torch.Tensor)
verify_shape_batch_pair(sample, new_batch, p_row, p_col)
diff --git a/tests/algorithms/test_gradient_clipping.py b/tests/algorithms/test_gradient_clipping.py
index fe06fa188b..57e71c35f5 100644
--- a/tests/algorithms/test_gradient_clipping.py
+++ b/tests/algorithms/test_gradient_clipping.py
@@ -5,7 +5,6 @@
import pytest
import torch
-from packaging import version
from torch import nn
import composer.algorithms.gradient_clipping.gradient_clipping as gc_module
@@ -13,7 +12,6 @@
from composer.algorithms.gradient_clipping.gradient_clipping import _apply_agc, _get_clipped_gradient_coeff
from composer.core import Engine, State
from composer.core.event import Event
-from composer.utils.misc import using_torch_2
from tests.common import world_size
from tests.common.datasets import dummy_tiny_bert_classification_batch, dummy_transformer_classifier_batch
from tests.common.models import SimpleTransformerClassifier, configure_tiny_bert_config
@@ -29,9 +27,9 @@ def simple_model_with_grads():
# Force wrap every module in FSDP, to allow for testing FSDP
# gradient clipping properly.
for module in model:
- module._fsdp_wrap = True
+ module._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
- model._fsdp_wrap = True
+ model._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
o = model(x)
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(o, y)
@@ -64,7 +62,7 @@ def forward(self, x):
# Force wrap every module in FSDP, to allow for testing FSDP
# gradient clipping properly.
for layer in model.modules():
- layer._fsdp_wrap = True
+ layer._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
o = model(x)
loss_fn = nn.CrossEntropyLoss()
@@ -79,7 +77,7 @@ def simple_transformer_model_with_grads():
# Force wrap every module in FSDP, to allow for testing FSDP
# gradient clipping properly.
for layer in model.modules():
- layer._fsdp_wrap = True
+ layer._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
x = dummy_transformer_classifier_batch(num_classes=3)
o = model(x)
@@ -104,7 +102,7 @@ def hf_model_with_grads():
# Force wrap every module in FSDP, to allow for testing FSDP
# gradient clipping properly.
for layer in model.modules():
- layer._fsdp_wrap = True
+ layer._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
x = dummy_tiny_bert_classification_batch(num_classes=3)
o = model(x).logits
@@ -193,27 +191,17 @@ def test_gradient_clipping_algorithm_with_deepspeed_enabled(
apply_gc_fn.assert_not_called()
-if not using_torch_2():
+def _auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
+ if recurse:
+ return True
- def _auto_wrap_policy(module: torch.nn.Module, recurse: bool, unwrapped_params: int) -> bool: # type: ignore
- if recurse:
- return True
- if hasattr(module, '_fsdp_wrap'):
- return bool(module._fsdp_wrap)
+ # With Torch 2.0, there is a bug that emits a nasty warning if you wrap a module with no parameters
+ if len(list(module.parameters())) == 0:
return False
-else:
- def _auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
- if recurse:
- return True
-
- # With Torch 2.0, there is a bug that emits a nasty warning if you wrap a module with no parameters
- if len(list(module.parameters())) == 0:
- return False
-
- if hasattr(module, '_fsdp_wrap'):
- return bool(module._fsdp_wrap)
- return False
+ if hasattr(module, '_fsdp_wrap'):
+ return bool(module._fsdp_wrap)
+ return False
@pytest.mark.parametrize('model_with_grads', [
@@ -223,8 +211,6 @@ def _auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel:
hf_model_with_grads
])
@pytest.mark.parametrize('clipping_type', ['norm', 'value'])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
@pytest.mark.gpu
@world_size(2)
def test_gradient_clipping_algorithm_with_fsdp_enabled_does_not_error(
@@ -241,13 +227,10 @@ def test_gradient_clipping_algorithm_with_fsdp_enabled_does_not_error(
clipping_threshold = 0.1191
state = dummy_state
- torch_2_kwargs = {}
- if using_torch_2():
- torch_2_kwargs['use_orig_params'] = True
state.model = FullyShardedDataParallel(model,
auto_wrap_policy=_auto_wrap_policy,
device_id=torch.cuda.current_device(),
- **torch_2_kwargs)
+ use_orig_params=True)
state.algorithms = [GradientClipping(clipping_type=clipping_type, clipping_threshold=clipping_threshold)]
logger = Mock()
diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py
index 3844a57084..defaaa4389 100644
--- a/tests/algorithms/test_required_on_load.py
+++ b/tests/algorithms/test_required_on_load.py
@@ -9,12 +9,13 @@
import pytest
import torch
+from packaging import version
from composer import Trainer, algorithms
from composer.callbacks import CheckpointSaver
from composer.core import Algorithm, Event, Time, TimeUnit # type: ignore imports used in `eval(representation)`
-from composer.models import ComposerClassifier, ComposerModel, composer_resnet
-from tests.common import ConvModel, SimpleConvModel
+from composer.models import ComposerClassifier, ComposerModel
+from tests.common import ConvModel, SimpleConvModel, composer_resnet
def initialize_algorithm(algo_cls: Type):
@@ -163,14 +164,20 @@ def test_autoload(algo_name: str, load_weights_only: bool, already_added: bool,
context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*')
# Excluding some algorithms leads to errors when loading
elif exclude:
- if algo_name in ['Factorize', 'SqueezeExcite']:
- context = pytest.raises(
- ValueError,
- match=
- "loaded state dict contains a parameter group that doesn't match the size of optimizer's group",
- )
- elif algo_name == 'Alibi':
- context = pytest.raises(RuntimeError)
+ if version.parse(torch.__version__) > version.parse('2.2.9'):
+ if algo_name in [
+ 'Alibi', 'BlurPool', 'Factorize', 'GatedLinearUnits', 'GhostBatchNorm', 'SqueezeExcite'
+ ]:
+ context = pytest.raises(KeyError) # Optimizer loading is strict
+ else:
+ if algo_name in ['Factorize', 'SqueezeExcite']:
+ context = pytest.raises(
+ ValueError,
+ match=
+ "loaded state dict contains a parameter group that doesn't match the size of optimizer's group",
+ )
+ elif algo_name == 'Alibi':
+ context = pytest.raises(RuntimeError)
with context:
trainer2 = Trainer(
diff --git a/tests/algorithms/test_stochastic_depth.py b/tests/algorithms/test_stochastic_depth.py
index 23c21bd816..2ec267756a 100644
--- a/tests/algorithms/test_stochastic_depth.py
+++ b/tests/algorithms/test_stochastic_depth.py
@@ -14,8 +14,8 @@
from composer.algorithms.stochastic_depth.stochastic_layers import make_resnet_bottleneck_stochastic
from composer.core import Event, State
from composer.core.time import TimeUnit
-from composer.models import composer_resnet
from composer.utils import module_surgery
+from tests.common import composer_resnet
@pytest.fixture()
diff --git a/tests/callbacks/callback_settings.py b/tests/callbacks/callback_settings.py
index ef9fe12187..f6065c1863 100644
--- a/tests/callbacks/callback_settings.py
+++ b/tests/callbacks/callback_settings.py
@@ -3,6 +3,7 @@
import os
from typing import Any, Dict, List, Tuple, Type
+from unittest.mock import MagicMock
import pytest
from torch.utils.data import DataLoader
@@ -11,11 +12,11 @@
import composer.loggers
import composer.profiler
from composer import Callback
-from composer.callbacks import (EarlyStopper, ExportForInferenceCallback, FreeOutputs, Generate, HealthChecker,
- ImageVisualizer, MemoryMonitor, MLPerfCallback, SpeedMonitor, SystemMetricsMonitor,
- ThresholdStopper)
-from composer.loggers import (CometMLLogger, ConsoleLogger, LoggerDestination, MLFlowLogger, ProgressBarLogger,
- RemoteUploaderDownloader, TensorboardLogger, WandBLogger)
+from composer.callbacks import (EarlyStopper, ExportForInferenceCallback, FreeOutputs, Generate, ImageVisualizer,
+ MemoryMonitor, MemorySnapshot, MLPerfCallback, OOMObserver, SpeedMonitor,
+ SystemMetricsMonitor, ThresholdStopper)
+from composer.loggers import (CometMLLogger, ConsoleLogger, LoggerDestination, MLFlowLogger, NeptuneLogger,
+ ProgressBarLogger, RemoteUploaderDownloader, TensorboardLogger, WandBLogger)
from composer.models.base import ComposerModel
from composer.utils import dist
from composer.utils.device import get_device
@@ -76,6 +77,13 @@
except ImportError:
_PYNMVL_INSTALLED = False
+try:
+ import neptune
+ _NEPTUNE_INSTALLED = True
+ del neptune # unused
+except ImportError:
+ _NEPTUNE_INSTALLED = False
+
_callback_kwargs: Dict[Type[Callback], Dict[str, Any],] = {
Generate: {
'prompts': ['a', 'b', 'c'],
@@ -115,6 +123,13 @@
SpeedMonitor: {
'window_size': 1,
},
+ NeptuneLogger: {
+ 'mode': 'debug',
+ },
+ composer.profiler.Profiler: {
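+        # a mocked trace handler lets the Profiler be constructed in these tests without writing real trace files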
+ 'trace_handlers': [MagicMock()],
+ 'schedule': composer.profiler.cyclic_schedule(),
+ }
}
_callback_marks: Dict[Type[Callback], List[pytest.MarkDecorator],] = {
@@ -128,6 +143,14 @@
pytest.mark.filterwarnings(
r'ignore:The memory monitor only works on CUDA devices, but the model is on cpu:UserWarning')
],
+ MemorySnapshot: [
+ pytest.mark.filterwarnings(
+ r'ignore:The memory snapshot only works on CUDA devices, but the model is on cpu:UserWarning')
+ ],
+ OOMObserver: [
+ pytest.mark.filterwarnings(
+ r'ignore:The oom observer only works on CUDA devices, but the model is on cpu:UserWarning')
+ ],
MLPerfCallback: [pytest.mark.skipif(not _MLPERF_INSTALLED, reason='MLPerf is optional')],
WandBLogger: [
pytest.mark.filterwarnings(r'ignore:unclosed file:ResourceWarning'),
@@ -145,7 +168,7 @@
ImageVisualizer: [pytest.mark.skipif(not _WANDB_INSTALLED, reason='Wandb is optional')],
MLFlowLogger: [pytest.mark.skipif(not _MLFLOW_INSTALLED, reason='mlflow is optional'),],
SystemMetricsMonitor: [pytest.mark.skipif(not _PYNMVL_INSTALLED, reason='pynmvl is optional'),],
- HealthChecker: [pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*')],
+ NeptuneLogger: [pytest.mark.skipif(not _NEPTUNE_INSTALLED, reason='neptune is optional'),],
}
diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 695be08c55..f0ddbe43cc 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -43,12 +43,14 @@ class TestCallbacks:
def setup_class(cls):
pytest.importorskip('wandb', reason='WandB is optional.')
+ @pytest.mark.filterwarnings('ignore::UserWarning')
def test_callback_is_constructable(self, cb_cls: Type[Callback]):
cb_kwargs = get_cb_kwargs(cb_cls)
cb = cb_cls(**cb_kwargs)
assert isinstance(cb_cls, type)
assert isinstance(cb, cb_cls)
+ @pytest.mark.filterwarnings('ignore::UserWarning')
def test_multiple_fit_start_and_end(self, cb_cls: Type[Callback], dummy_state: State):
"""Test that callbacks do not crash when Event.FIT_START and Event.FIT_END is called multiple times."""
cb_kwargs = get_cb_kwargs(cb_cls)
@@ -69,6 +71,7 @@ def test_multiple_fit_start_and_end(self, cb_cls: Type[Callback], dummy_state: S
engine.run_event(Event.FIT_START)
engine.run_event(Event.FIT_END)
+ @pytest.mark.filterwarnings('ignore::UserWarning')
def test_idempotent_close(self, cb_cls: Type[Callback], dummy_state: State):
"""Test that callbacks do not crash when .close() and .post_close() are called multiple times."""
cb_kwargs = get_cb_kwargs(cb_cls)
@@ -85,6 +88,7 @@ def test_idempotent_close(self, cb_cls: Type[Callback], dummy_state: State):
engine.close()
engine.close()
+ @pytest.mark.filterwarnings('ignore::UserWarning')
def test_multiple_init_and_close(self, cb_cls: Type[Callback], dummy_state: State):
"""Test that callbacks do not crash when INIT/.close()/.post_close() are called multiple times in that order."""
cb_kwargs = get_cb_kwargs(cb_cls)
@@ -136,6 +140,7 @@ def _get_trainer(self, cb: Callback, device_train_microbatch_size: int):
torch_prof_memory_filename=None),
)
+ @pytest.mark.filterwarnings('ignore::UserWarning')
def test_trains(self, cb_cls: Type[Callback], device_train_microbatch_size: int, _remote: bool):
del _remote # unused. `_remote` must be passed through to parameterize the test markers.
cb_kwargs = get_cb_kwargs(cb_cls)
@@ -143,6 +148,7 @@ def test_trains(self, cb_cls: Type[Callback], device_train_microbatch_size: int,
trainer = self._get_trainer(cb, device_train_microbatch_size)
trainer.fit()
+ @pytest.mark.filterwarnings('ignore::UserWarning')
def test_trains_multiple_calls(self, cb_cls: Type[Callback], device_train_microbatch_size: int, _remote: bool):
"""
Tests that training with multiple fits complete.
diff --git a/tests/callbacks/test_checkpoint_saver.py b/tests/callbacks/test_checkpoint_saver.py
new file mode 100644
index 0000000000..67654b9b17
--- /dev/null
+++ b/tests/callbacks/test_checkpoint_saver.py
@@ -0,0 +1,45 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+from composer.callbacks import CheckpointSaver
+from composer.core import Timestamp
+
+
+def test_stateful_checkpoint_saver():
+ checkpoint_saver = CheckpointSaver()
+ assert not checkpoint_saver.all_saved_checkpoints_to_timestamp
+
+ # empty state dict
+ empty_state_dict = checkpoint_saver.state_dict()
+ assert 'all_saved_checkpoints_to_timestamp' in empty_state_dict
+ assert len(empty_state_dict['all_saved_checkpoints_to_timestamp']) == 0
+
+ # backwards compatibility; empty state dict should not raise
+ checkpoint_saver.load_state_dict({})
+ assert not checkpoint_saver.all_saved_checkpoints_to_timestamp
+
+ # add a checkpoint and confirm it can save and load
+ checkpoint_saver.all_saved_checkpoints_to_timestamp = {
+ 'foobar/example-checkpoint.pt': Timestamp(epoch=1, batch=2),
+ }
+ new_state_dict = checkpoint_saver.state_dict()
+ assert 'all_saved_checkpoints_to_timestamp' in new_state_dict
+ assert len(new_state_dict['all_saved_checkpoints_to_timestamp']) == 1
+ checkpoint, ts = new_state_dict['all_saved_checkpoints_to_timestamp'][0]
+ assert checkpoint == 'foobar/example-checkpoint.pt'
+ assert isinstance(ts, dict)
+ assert ts['epoch'] == 1
+ assert ts['batch'] == 2
+ assert ts['sample'] == 0
+
+ # load works again if we clear the dict
+ checkpoint_saver.all_saved_checkpoints_to_timestamp = {}
+ checkpoint_saver.load_state_dict(new_state_dict)
+ assert checkpoint_saver.all_saved_checkpoints_to_timestamp
+ assert len(checkpoint_saver.all_saved_checkpoints_to_timestamp) == 1
+ assert 'foobar/example-checkpoint.pt' in checkpoint_saver.all_saved_checkpoints_to_timestamp
+ ts = checkpoint_saver.all_saved_checkpoints_to_timestamp['foobar/example-checkpoint.pt']
+ assert isinstance(ts, Timestamp)
+ assert ts.epoch == 1
+ assert ts.batch == 2
+ assert ts.sample == 0
diff --git a/tests/callbacks/test_generate.py b/tests/callbacks/test_generate.py
index a848071dff..c9247ce616 100644
--- a/tests/callbacks/test_generate.py
+++ b/tests/callbacks/test_generate.py
@@ -7,7 +7,6 @@
import pytest
import torch
-from packaging import version
from composer.callbacks import Generate
from composer.core import Event
@@ -24,8 +23,6 @@
class TestGenerate():
def _check_test_params(self, device, world_size, use_fsdp) -> None:
- if use_fsdp and version.parse(torch.__version__) < version.parse('1.13.0'):
- pytest.skip('FSDP requires torch >= 1.13.0')
if device == 'cpu' and use_fsdp:
pytest.skip('FSDP is not supported on CPU.')
if world_size == 1 and use_fsdp:
diff --git a/tests/callbacks/test_health_checker.py b/tests/callbacks/test_health_checker.py
deleted file mode 100644
index 5638699ca9..0000000000
--- a/tests/callbacks/test_health_checker.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import datetime
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from composer import Timestamp
-from composer.callbacks import HealthChecker
-from composer.callbacks.health_checker import GPUUtilization
-from composer.utils import dist
-from tests.common import world_size
-
-pynvml = pytest.importorskip('pynvml')
-pytest.importorskip('slack_sdk')
-
-
-class MockUtil:
-
- def __init__(self, util):
- self.gpu = util
-
-
-@pytest.mark.gpu
-@world_size(1, 2)
-@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*')
-def test_gpu_utilization(world_size):
- assert HealthChecker._is_available()
-
- gpu_utilization_values = [
- MockUtil(100),
- MockUtil(10),
- MockUtil(100),
- MockUtil(100),
- MockUtil(100),
- MockUtil(100),
- ]
-
- with patch.multiple(pynvml,
- nvmlDeviceGetUtilizationRates=MagicMock(side_effect=gpu_utilization_values),
- nvmlDeviceGetCount=MagicMock(return_value=world_size)):
-
- gpu_utilization = GPUUtilization()
- gpu_utilization.sample()
- gpu_utilization.sample()
- gpu_utilization.sample()
- _, alert = gpu_utilization.check()
-
- should_alert = dist.get_local_rank() == 0 and world_size > 1
- assert alert == should_alert
-
-
-@pytest.mark.gpu
-@world_size(1, 2)
-@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*')
-def test_health_checker(world_size):
-
- state = MagicMock()
- state.run_name = 'pytest-mock-run-kwei73'
- logger = MagicMock()
-
- health_checker = HealthChecker(
- sample_freq=1,
- window_size=3,
- wait=0,
- )
-
- gpu_utilization_values = [
- MockUtil(100),
- MockUtil(10),
- MockUtil(100),
- MockUtil(100),
- MockUtil(100),
- MockUtil(100),
- ]
-
- with patch.multiple(pynvml,
- nvmlDeviceGetUtilizationRates=MagicMock(side_effect=gpu_utilization_values),
- nvmlDeviceGetCount=MagicMock(return_value=world_size)):
-
- # collect data and checker
- for seconds in [1, 2, 3]:
- state.timestamp = Timestamp(total_wct=datetime.timedelta(seconds=seconds))
- health_checker.after_train_batch(state, logger)
-
- should_alert = dist.get_local_rank() == 0 and world_size > 1
- assert health_checker.metrics[0].alerted == should_alert
-
-
-@pytest.mark.filterwarnings('ignore:.*HealthChecker is deprecated.*')
-def test_health_checker_sampling():
- timestamp = Timestamp(total_wct=datetime.timedelta(seconds=0))
-
- health_checker = HealthChecker(
- sample_freq=1,
- window_size=5,
- wait=10,
- )
-
- config = [
- (5, False), # before wait
- (11, True),
- (11.5, False), # below sample frequency
- (12, True),
- (20, True),
- (11, False), # no time travel
- ]
-
- for seconds, is_sample in config:
- timestamp = Timestamp(total_wct=datetime.timedelta(seconds=seconds))
- assert health_checker._sample(timestamp) == is_sample
diff --git a/tests/callbacks/test_inference.py b/tests/callbacks/test_inference.py
index 960aec9a04..bef07c081c 100644
--- a/tests/callbacks/test_inference.py
+++ b/tests/callbacks/test_inference.py
@@ -13,9 +13,9 @@
from torch.utils.data import DataLoader
from composer.callbacks import ExportForInferenceCallback, export_for_inference
-from composer.models import composer_resnet
from composer.trainer import Trainer
from tests.common.datasets import RandomImageDataset
+from tests.common.models import composer_resnet
@pytest.mark.parametrize(
diff --git a/tests/callbacks/test_loggers_across_callbacks.py b/tests/callbacks/test_loggers_across_callbacks.py
index 92363e7aa5..1c58babf0b 100644
--- a/tests/callbacks/test_loggers_across_callbacks.py
+++ b/tests/callbacks/test_loggers_across_callbacks.py
@@ -15,6 +15,7 @@
@pytest.mark.parametrize('logger_cls', get_cbs_and_marks(loggers=True))
@pytest.mark.parametrize('callback_cls', get_cbs_and_marks(callbacks=True))
+@pytest.mark.filterwarnings('ignore::UserWarning')
def test_loggers_on_callbacks(logger_cls: Type[LoggerDestination], callback_cls: Type[Callback]):
if logger_cls in [ProgressBarLogger, ConsoleLogger, SlackLogger]:
pytest.skip()
diff --git a/tests/callbacks/test_memory_monitor.py b/tests/callbacks/test_memory_monitor.py
index f40a04eeb3..f2badc638c 100644
--- a/tests/callbacks/test_memory_monitor.py
+++ b/tests/callbacks/test_memory_monitor.py
@@ -7,13 +7,10 @@
from composer.callbacks import MemoryMonitor
from composer.loggers import InMemoryLogger
from composer.trainer import Trainer
-from tests.common import RandomClassificationDataset, SimpleModel, device
+from tests.common import RandomClassificationDataset, SimpleModel
-@device('cpu', 'gpu')
-def test_memory_monitor_warnings_on_cpu_models(device: str):
- # Error if the user sets device=cpu even when cuda is available
- del device # unused. always using cpu
+def test_memory_monitor_warnings_on_cpu_models():
with pytest.warns(UserWarning, match='The memory monitor only works on CUDA devices'):
Trainer(
model=SimpleModel(),
diff --git a/tests/callbacks/test_memory_snapshot.py b/tests/callbacks/test_memory_snapshot.py
new file mode 100644
index 0000000000..0bafbcb1c1
--- /dev/null
+++ b/tests/callbacks/test_memory_snapshot.py
@@ -0,0 +1,62 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+import pathlib
+
+import pytest
+import torch
+from packaging import version
+from torch.utils.data import DataLoader
+
+from composer import State, Trainer
+from composer.callbacks import MemorySnapshot
+from composer.loggers import LoggerDestination
+from composer.trainer import Trainer
+from tests.common import RandomClassificationDataset, SimpleModel
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.1.0'),
+                    reason='Memory snapshot requires PyTorch 2.1 or higher')
+def test_memory_snapshot_warnings_on_cpu_models():
+ with pytest.warns(UserWarning):
+ Trainer(
+ model=SimpleModel(),
+ callbacks=MemorySnapshot(),
+ device='cpu',
+ train_dataloader=DataLoader(RandomClassificationDataset()),
+ max_duration='1ba',
+ )
+
+
+class FileUploaderTracker(LoggerDestination):
+
+ def __init__(self) -> None:
+ self.uploaded_files = []
+
+ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Path, *, overwrite: bool):
+ del state, overwrite # unused
+ self.uploaded_files.append((remote_file_name, file_path))
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('interval', ['1ba'])
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.1.0'),
+                    reason='Memory snapshot requires PyTorch 2.1 or higher')
+def test_memory_snapshot(interval: str):
+ # Construct the callbacks
+ skip_batches = 0
+ memory_snapshot = MemorySnapshot(skip_batches=skip_batches, interval=interval)
+ simple_model = SimpleModel()
+ file_tracker_destination = FileUploaderTracker()
+
+ # Construct the trainer and train
+ trainer = Trainer(
+ model=simple_model,
+ loggers=file_tracker_destination,
+ callbacks=memory_snapshot,
+ train_dataloader=DataLoader(RandomClassificationDataset()),
+ max_duration='2ba',
+ )
+ trainer.fit()
+ assert len(file_tracker_destination.uploaded_files) == 2
+ trainer.close()
diff --git a/tests/callbacks/test_oom_observer.py b/tests/callbacks/test_oom_observer.py
new file mode 100644
index 0000000000..60323b00c0
--- /dev/null
+++ b/tests/callbacks/test_oom_observer.py
@@ -0,0 +1,88 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+import pathlib
+
+import pytest
+import torch
+from packaging import version
+from torch.utils.data import DataLoader
+
+from composer import State, Trainer
+from composer.callbacks import MemorySnapshot, OOMObserver
+from composer.loggers import LoggerDestination
+from composer.trainer import Trainer
+from tests.common import RandomClassificationDataset, SimpleModel
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.1.0'),
+ reason='OOM Observer requires PyTorch 2.1 or higher')
+def test_oom_observer_warnings_on_cpu_models():
+ ob = OOMObserver()
+ with pytest.warns(UserWarning):
+ Trainer(
+ model=SimpleModel(),
+ callbacks=ob,
+ train_dataloader=DataLoader(RandomClassificationDataset()),
+ max_duration='1ba',
+ device='cpu',
+ )
+ assert ob._enabled is False
+
+
+class FileUploaderTracker(LoggerDestination):
+
+ def __init__(self) -> None:
+ self.uploaded_files = []
+
+ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Path, *, overwrite: bool):
+ del state, overwrite # unused
+ self.uploaded_files.append((remote_file_name, file_path))
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.1.0'),
+ reason='OOM Observer requires PyTorch 2.1 or higher')
+def test_oom_observer():
+ # Construct the callbacks
+ oom_observer = OOMObserver()
+ simple_model = SimpleModel()
+ file_tracker_destination = FileUploaderTracker()
+
+ with pytest.raises(torch.cuda.OutOfMemoryError):
+ trainer = Trainer(
+ model=simple_model,
+ loggers=file_tracker_destination,
+ callbacks=oom_observer,
+ train_dataloader=DataLoader(RandomClassificationDataset()),
+ max_duration='2ba',
+ )
+
+ # trigger OOM
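+        # (a 2**40-element float32 tensor is ~4 TiB, far beyond any GPU's memory)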
+ torch.empty(1024 * 1024 * 1024 * 1024, device='cuda')
+
+ trainer.fit()
+
+ assert len(file_tracker_destination.uploaded_files) == 5
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.1.0'),
+ reason='OOM Observer requires PyTorch 2.1 or higher')
+def test_oom_observer_with_memory_snapshot():
+ # Construct the callbacks
+ oom_observer = OOMObserver()
+ memory_snapshot = MemorySnapshot(skip_batches=0, interval='1ba')
+ simple_model = SimpleModel()
+ file_tracker_destination = FileUploaderTracker()
+
+ trainer = Trainer(
+ model=simple_model,
+ loggers=file_tracker_destination,
+ callbacks=[oom_observer, memory_snapshot],
+ train_dataloader=DataLoader(RandomClassificationDataset()),
+ max_duration='2ba',
+ )
+
+ trainer.fit()
+ assert len(file_tracker_destination.uploaded_files) == 2
diff --git a/tests/callbacks/test_optimizer_monitor.py b/tests/callbacks/test_optimizer_monitor.py
index 226a38c119..02ee0586fb 100644
--- a/tests/callbacks/test_optimizer_monitor.py
+++ b/tests/callbacks/test_optimizer_monitor.py
@@ -11,7 +11,7 @@
from composer.models import HuggingFaceModel
from composer.optim import DecoupledAdamW
from composer.trainer import Trainer
-from composer.utils import dist, using_torch_2
+from composer.utils import dist
from tests.common import device, world_size
from tests.common.datasets import RandomClassificationDataset, RandomTextLMDataset
from tests.common.models import SimpleModel
@@ -57,16 +57,13 @@ def test_optimizer_monitor(log_optimizer_metrics: bool, batch_log_interval: int)
reason='requires PyTorch 1.13 or higher')
@pytest.mark.parametrize('use_orig_params', [True, False])
def test_fsdp_optimizer_monitor(device, world_size, use_orig_params):
- if use_orig_params and not using_torch_2():
- pytest.skip('use_orig_params was introduced in pytorch 2.0')
-
# Construct the callback
grad_monitor = OptimizerMonitor(log_optimizer_metrics=True)
in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger
model = SimpleModel(num_classes=100, num_features=100, num_hidden=100)
for module in model.modules():
if len(list(module.parameters())) > 0:
- module._fsdp_wrap = True
+ module._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
dataset = RandomClassificationDataset(num_classes=100, shape=(100, 1, 1))
# Construct the trainer and train
trainer = Trainer(model=model,
@@ -91,12 +88,11 @@ def test_fsdp_optimizer_monitor(device, world_size, use_orig_params):
# Count the logged steps
grad_norm_calls = len(in_memory_logger.data['l2_norm/grad/global'])
layer_norm_calls = [len(calls) for (k, calls) in in_memory_logger.data.items() if 'l2_norm/grad' in k]
- suffix = ('._flat_param' if using_torch_2() else '.flat_param') if not use_orig_params else '.weight'
- infix = '' if using_torch_2() else '._fpw_module'
+ suffix = '._flat_param' if not use_orig_params else '.weight'
test_keys = [
- f'l2_norm/grad/module._fsdp_wrapped_module{infix}.4._fsdp_wrapped_module',
- f'l2_norm/moment/module._fsdp_wrapped_module{infix}.4._fsdp_wrapped_module',
- f'l2_norm/update/module._fsdp_wrapped_module{infix}.4._fsdp_wrapped_module',
+        'l2_norm/grad/module._fsdp_wrapped_module.4._fsdp_wrapped_module',
+        'l2_norm/moment/module._fsdp_wrapped_module.4._fsdp_wrapped_module',
+        'l2_norm/update/module._fsdp_wrapped_module.4._fsdp_wrapped_module',
]
test_keys = [key + suffix for key in test_keys]
for key in test_keys:
@@ -110,12 +106,8 @@ def test_fsdp_optimizer_monitor(device, world_size, use_orig_params):
@device('gpu')
@world_size(1, 2)
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
@pytest.mark.parametrize('use_orig_params', [True, False])
def test_fsdp_optimizer_monitor_transformer(device, world_size, tiny_gpt2_model, tiny_gpt2_tokenizer, use_orig_params):
- if use_orig_params and not using_torch_2():
- pytest.skip('use_orig_params was introduced in pytorch 2.0')
transformers = pytest.importorskip('transformers')
# Construct the callback
grad_monitor = OptimizerMonitor(log_optimizer_metrics=True)
@@ -164,11 +156,9 @@ def test_fsdp_optimizer_monitor_transformer(device, world_size, tiny_gpt2_model,
layer_norm_calls = [len(calls) for (k, calls) in in_memory_logger.data.items() if 'l2_norm/grad' in k]
# an incomplete list of expected keys
if not use_orig_params:
- suffix = '._flat_param' if using_torch_2() else '.flat_param'
- infix = '' if using_torch_2() else '._fpw_module'
test_keys = [
- f'l2_norm/grad/model._fsdp_wrapped_module{infix}.transformer.h.1._fsdp_wrapped_module{suffix}',
- f'l2_norm/update/model._fsdp_wrapped_module{infix}.transformer.h.1._fsdp_wrapped_module{suffix}',
+            'l2_norm/grad/model._fsdp_wrapped_module.transformer.h.1._fsdp_wrapped_module._flat_param',
+            'l2_norm/update/model._fsdp_wrapped_module.transformer.h.1._fsdp_wrapped_module._flat_param',
]
else:
test_keys = [
diff --git a/tests/callbacks/test_speed_monitor.py b/tests/callbacks/test_speed_monitor.py
index f880a7c370..36c30a87f6 100644
--- a/tests/callbacks/test_speed_monitor.py
+++ b/tests/callbacks/test_speed_monitor.py
@@ -34,7 +34,7 @@ def test_speed_monitor(flops_per_batch: bool):
model = SimpleModel()
if flops_per_batch:
- model.flops_per_batch = lambda batch: len(batch) * 100.0
+ model.flops_per_batch = lambda batch: len(batch) * 100.0 # pyright: ignore[reportGeneralTypeIssues]
# Construct the trainer and train
trainer = Trainer(
diff --git a/tests/common/__init__.py b/tests/common/__init__.py
index be2a508860..bcc9903e61 100644
--- a/tests/common/__init__.py
+++ b/tests/common/__init__.py
@@ -12,7 +12,7 @@
from tests.common.markers import device, world_size
from tests.common.models import (ConvModel, EmbeddedWeightTiedModel, EmptyModel, SimpleConvModel, SimpleModel,
SimpleModelWithDropout, SimpleTransformerClassifier, SimpleTransformerMaskedLM,
- SimpleWeightTiedModel, ZeroModel)
+ SimpleWeightTiedModel, ZeroModel, composer_resnet)
from tests.common.state import assert_state_equivalent
@@ -46,4 +46,5 @@ def get_module_subclasses(module: types.ModuleType, cls: Type) -> List[Type]:
'ParityDataset',
'SimpleDataset',
'InfiniteClassificationDataset',
+ 'composer_resnet',
]
diff --git a/tests/common/models.py b/tests/common/models.py
index cac3769b38..d8bf2994d4 100644
--- a/tests/common/models.py
+++ b/tests/common/models.py
@@ -4,15 +4,21 @@
"""Contains commonly used models that are shared across the test suite."""
import copy
from functools import partial
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
import pytest
import torch
from torchmetrics import Metric, MetricCollection
+from torchmetrics.classification import MulticlassAccuracy
+from torchvision.models import resnet
+from composer.loss import loss_registry
from composer.metrics import CrossEntropy, MIoU
from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy
-from composer.models import ComposerClassifier, HuggingFaceModel
+from composer.models import ComposerClassifier, HuggingFaceModel, Initializer
+
+if TYPE_CHECKING:
+ from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
class EmptyModel(ComposerClassifier):
@@ -74,7 +80,7 @@ def __init__(
fc2,
torch.nn.Softmax(dim=-1),
)
- net.param_init_fn = self.param_init_fn
+ net.param_init_fn = self.param_init_fn # pyright: ignore[reportGeneralTypeIssues]
super().__init__(module=net, num_classes=num_classes)
# Important: It is crucial that the FC layers are bound to `self`
@@ -90,7 +96,7 @@ def param_init_fn(self, module):
if isinstance(module, torch.nn.Linear):
init_fn(module.weight)
- if module.bias is not None:
+ if module.bias is not None: # pyright: ignore[reportUnnecessaryComparison]
torch.nn.init.zeros_(module.bias)
@@ -131,7 +137,7 @@ def __init__(self, num_features: int = 1, device: str = 'cpu') -> None:
self.mlp = mlp
self.net = net
- self.net.param_init_fn = self.param_init_fn
+ self.net.param_init_fn = self.param_init_fn # pyright: ignore[reportGeneralTypeIssues]
self.mlp.fc1.weight = self.mlp.fc2.weight
@@ -140,7 +146,7 @@ def param_init_fn(self, module):
if isinstance(module, torch.nn.Linear):
init_fn(module.weight)
- if module.bias is not None:
+ if module.bias is not None: # pyright: ignore[reportUnnecessaryComparison]
torch.nn.init.zeros_(module.bias)
@@ -166,7 +172,7 @@ def __init__(self, num_features: int = 1, device: str = 'cpu') -> None:
super().__init__(module=net, num_classes=num_features)
- self.module.param_init_fn = self.param_init_fn
+ self.module.param_init_fn = self.param_init_fn # pyright: ignore[reportGeneralTypeIssues]
self.net1 = net1
self.net2 = net2
@@ -178,7 +184,7 @@ def param_init_fn(self, module):
if isinstance(module, torch.nn.Linear):
init_fn(module.weight)
- if module.bias is not None:
+ if module.bias is not None: # pyright: ignore[reportUnnecessaryComparison]
torch.nn.init.zeros_(module.bias)
@@ -437,107 +443,224 @@ def forward(self, batch: Tuple[torch.Tensor, Any]) -> torch.Tensor:
return outputs
+def composer_resnet(
+ model_name: str,
+ num_classes: int = 1000,
+ weights: Optional[str] = None,
+ groups: int = 1,
+ width_per_group: int = 64,
+ initializers: Optional[List[Initializer]] = None,
+ loss_name: str = 'soft_cross_entropy',
+) -> ComposerClassifier:
+ """Helper function to create a :class:`.ComposerClassifier` with a torchvision ResNet model.
+    From `Deep Residual Learning for Image Recognition <https://arxiv.org/abs/1512.03385>`_ (He et al., 2015).
+ Args:
+        model_name (str): Name of the ResNet model instance. One of ``"resnet18"``, ``"resnet34"``, ``"resnet50"``,
+            ``"resnet101"``, or ``"resnet152"``.
+ num_classes (int, optional): The number of classes. Needed for classification tasks. Default: ``1000``.
+        weights (str, optional): If provided, the name of the pretrained weights to load, e.g. ``IMAGENET1K_V2``. Default: ``None``.
+ groups (int, optional): Number of filter groups for the 3x3 convolution layer in bottleneck blocks. Default: ``1``.
+ width_per_group (int, optional): Initial width for each convolution group. Width doubles after each stage.
+ Default: ``64``.
+ initializers (List[Initializer], optional): Initializers for the model. ``None`` for no initialization.
+ Default: ``None``.
+        loss_name (str, optional): Loss function to use. E.g. 'soft_cross_entropy' or
+            'binary_cross_entropy_with_logits'. Loss function must be in
+            :mod:`~composer.loss.loss`. Default: ``'soft_cross_entropy'``.
+ Returns:
+ ComposerModel: instance of :class:`.ComposerClassifier` with a torchvision ResNet model.
+ """
+ valid_model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']
+ if model_name not in valid_model_names:
+ raise ValueError(f'model_name must be one of {valid_model_names} instead of {model_name}.')
+
+    if loss_name not in loss_registry:
+ raise ValueError(f'Unrecognized loss function: {loss_name}. Please ensure the '
+ 'specified loss function is present in composer.loss.loss.py')
+
+ if initializers is None:
+ initializers = []
+
+ # Instantiate model
+ model_fn = getattr(resnet, model_name)
+ model = model_fn(weights=weights, num_classes=num_classes, groups=groups, width_per_group=width_per_group)
+
+ # Grab loss function from loss registry
+ loss_fn = loss_registry[loss_name]
+
+ # Create metrics for train and validation
+ train_metrics = MulticlassAccuracy(num_classes=num_classes, average='micro')
+ val_metrics = MetricCollection([CrossEntropy(), MulticlassAccuracy(num_classes=num_classes, average='micro')])
+
+ # Apply Initializers to model
+ for initializer in initializers:
+ initializer = Initializer(initializer)
+ model.apply(initializer.get_initializer())
+
+ composer_model = ComposerClassifier(model, train_metrics=train_metrics, val_metrics=val_metrics, loss_fn=loss_fn)
+ return composer_model
+
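+# Illustrative usage (hypothetical, not exercised verbatim in this diff):
+#   model = composer_resnet('resnet50', num_classes=10)
+# This returns a ComposerClassifier wrapping torchvision's ResNet-50 with
+# MulticlassAccuracy metrics and the soft cross entropy loss attached.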
+
# Note: These methods are an alternative to the tiny_bert fixtures in fixtures.py.
# Fixtures cannot be used natively as parametrized inputs, which we require when
# we wish to run a test across multiple models, one of which is a HuggingFace model.
# As a workaround, we inject objects into the PyTest namespace. Tests should not directly
# use pytest.{var}, but instead should import and use these helper copy methods so the
# objects in the PyTest namespace do not change.
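# For example, a test can parametrize over these helpers directly (illustrative
# sketch, not actual code from this file):
#
#   @pytest.mark.parametrize('config_fn', [configure_tiny_bert_config, configure_tiny_gpt2_config])
#   def test_config(config_fn):
#       config = config_fn()  # fresh deep copy, safe to mutate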
-def configure_tiny_bert_model():
+def configure_tiny_bert_model() -> 'PreTrainedModel':
try:
+ from transformers import PreTrainedModel
+ assert isinstance(pytest.tiny_bert_model, PreTrainedModel)
return copy.deepcopy(pytest.tiny_bert_model)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_bert_tokenizer():
+def configure_tiny_bert_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']:
try:
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+ assert isinstance(pytest.tiny_bert_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast))
return copy.deepcopy(pytest.tiny_bert_tokenizer)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_bert_config():
+def configure_tiny_bert_config() -> 'PretrainedConfig':
try:
+ from transformers import PretrainedConfig
+ assert isinstance(pytest.tiny_bert_config, PretrainedConfig)
return copy.deepcopy(pytest.tiny_bert_config)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_bert_hf_model(use_logits=True):
+def configure_tiny_bert_hf_model(use_logits: bool = True) -> HuggingFaceModel:
return HuggingFaceModel(configure_tiny_bert_model(), configure_tiny_bert_tokenizer(), use_logits)
-def configure_tiny_deberta_model():
+def configure_tiny_deberta_model() -> 'PreTrainedModel':
try:
+ from transformers import PreTrainedModel
+ assert isinstance(pytest.tiny_deberta_model, PreTrainedModel)
return copy.deepcopy(pytest.tiny_deberta_model)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_deberta_tokenizer():
+def configure_tiny_deberta_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']:
try:
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+ assert isinstance(pytest.tiny_deberta_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast))
return copy.deepcopy(pytest.tiny_deberta_tokenizer)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_deberta_config():
+def configure_tiny_deberta_config() -> 'PretrainedConfig':
try:
+ from transformers import PretrainedConfig
+ assert isinstance(pytest.tiny_deberta_config, PretrainedConfig)
return copy.deepcopy(pytest.tiny_deberta_config)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_deberta_hf_model(use_logits=True):
- return HuggingFaceModel(configure_tiny_deberta_model(), configure_tiny_deberta_tokenizer(), use_logits)
+def configure_tiny_deberta_hf_model(use_logits: bool = True) -> HuggingFaceModel:
+ return HuggingFaceModel(
+ configure_tiny_deberta_model(),
+ configure_tiny_deberta_tokenizer(),
+ use_logits,
+ )
-def configure_tiny_gpt2_model():
+def configure_tiny_gpt2_model() -> 'PreTrainedModel':
try:
+ from transformers import PreTrainedModel
+ assert isinstance(pytest.tiny_gpt2_model, PreTrainedModel)
return copy.deepcopy(pytest.tiny_gpt2_model)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_gpt2_tokenizer():
+def configure_tiny_gpt2_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']:
try:
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+ assert isinstance(pytest.tiny_gpt2_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast))
return copy.deepcopy(pytest.tiny_gpt2_tokenizer)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_gpt2_config():
+def configure_tiny_gpt2_config() -> 'PretrainedConfig':
try:
+ from transformers import PretrainedConfig
+ assert isinstance(pytest.tiny_gpt2_config, PretrainedConfig)
return copy.deepcopy(pytest.tiny_gpt2_config)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_gpt2_hf_model(use_logits=True):
+def configure_tiny_gpt2_hf_model(use_logits: bool = True) -> HuggingFaceModel:
return HuggingFaceModel(configure_tiny_gpt2_model(), configure_tiny_gpt2_tokenizer(), use_logits)
-def configure_tiny_t5_model():
+def configure_tiny_t5_model() -> 'PreTrainedModel':
try:
+ from transformers import PreTrainedModel
+ assert isinstance(pytest.tiny_t5_model, PreTrainedModel)
return copy.deepcopy(pytest.tiny_t5_model)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_t5_tokenizer():
+def configure_tiny_t5_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']:
try:
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+ assert isinstance(pytest.tiny_t5_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast))
return copy.deepcopy(pytest.tiny_t5_tokenizer)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_t5_config():
+def configure_tiny_t5_config() -> 'PretrainedConfig':
try:
+ from transformers import PretrainedConfig
+ assert isinstance(pytest.tiny_t5_config, PretrainedConfig)
return copy.deepcopy(pytest.tiny_t5_config)
except AttributeError:
pytest.skip('Composer installed without NLP support')
-def configure_tiny_t5_hf_model(use_logits=True):
+def configure_tiny_t5_hf_model(use_logits: bool = True) -> HuggingFaceModel:
return HuggingFaceModel(configure_tiny_t5_model(), configure_tiny_t5_tokenizer(), use_logits)
+
+
+def configure_tiny_mistral_model() -> 'PreTrainedModel':
+ try:
+ from transformers import PreTrainedModel
+ assert isinstance(pytest.tiny_mistral_model, PreTrainedModel)
+ return copy.deepcopy(pytest.tiny_mistral_model)
+ except AttributeError:
+ pytest.skip('Composer installed without NLP support')
+
+
+def configure_tiny_mistral_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']:
+ try:
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+ assert isinstance(pytest.tiny_mistral_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast))
+ return copy.deepcopy(pytest.tiny_mistral_tokenizer)
+ except AttributeError:
+ pytest.skip('Composer installed without NLP support')
+
+
+def configure_tiny_mistral_config() -> 'PretrainedConfig':
+ try:
+ from transformers import PretrainedConfig
+ assert isinstance(pytest.tiny_mistral_config, PretrainedConfig)
+ return copy.deepcopy(pytest.tiny_mistral_config)
+ except AttributeError:
+ pytest.skip('Composer installed without NLP support')
+
+
+def configure_tiny_mistral_hf_model(use_logits: bool = True) -> HuggingFaceModel:
+ return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits)
diff --git a/tests/conftest.py b/tests/conftest.py
index bcd063d9c7..e327730d42 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -111,21 +111,27 @@ def pytest_configure():
if TRANSFORMERS_INSTALLED:
from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper,
tiny_bert_tokenizer_helper, tiny_gpt2_config_helper,
- tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper,
- tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper,
- tiny_t5_model_helper, tiny_t5_tokenizer_helper)
+ tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper,
+ tiny_llama_tokenizer_helper, tiny_mistral_config_helper,
+ tiny_mistral_model_helper, tiny_mistral_tokenizer_helper,
+ tiny_opt_config_helper, tiny_opt_model_helper, tiny_opt_tokenizer_helper,
+ tiny_t5_config_helper, tiny_t5_model_helper, tiny_t5_tokenizer_helper)
pytest.tiny_bert_config = tiny_bert_config_helper() # type: ignore
pytest.tiny_bert_model = tiny_bert_model_helper(pytest.tiny_bert_config) # type: ignore
pytest.tiny_bert_tokenizer = tiny_bert_tokenizer_helper() # type: ignore
pytest.tiny_gpt2_config = tiny_gpt2_config_helper() # type: ignore
pytest.tiny_gpt2_model = tiny_gpt2_model_helper(pytest.tiny_gpt2_config) # type: ignore
pytest.tiny_gpt2_tokenizer = tiny_gpt2_tokenizer_helper() # type: ignore
+ pytest.tiny_llama_tokenizer = tiny_llama_tokenizer_helper() # type: ignore
pytest.tiny_opt_config = tiny_opt_config_helper() # type: ignore
pytest.tiny_opt_model = tiny_opt_model_helper(pytest.tiny_opt_config) # type: ignore
pytest.tiny_opt_tokenizer = tiny_opt_tokenizer_helper() # type: ignore
pytest.tiny_t5_config = tiny_t5_config_helper() # type: ignore
pytest.tiny_t5_model = tiny_t5_model_helper(pytest.tiny_t5_config) # type: ignore
pytest.tiny_t5_tokenizer = tiny_t5_tokenizer_helper() # type: ignore
+ pytest.tiny_mistral_config = tiny_mistral_config_helper() # type: ignore
+ pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) # type: ignore
+ pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() # type: ignore
def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
diff --git a/tests/datasets/test_add_dataset_transform.py b/tests/datasets/test_add_dataset_transform.py
deleted file mode 100644
index d7a545a33b..0000000000
--- a/tests/datasets/test_add_dataset_transform.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-from torchvision import transforms
-
-from composer.datasets.synthetic import SyntheticPILDataset
-from composer.datasets.utils import add_vision_dataset_transform
-
-image_size = 32
-
-
-def generate_synthetic_dataset(data_transforms):
- return SyntheticPILDataset(total_dataset_size=1000,
- data_shape=[image_size, image_size],
- num_classes=2,
- transform=data_transforms)
-
-
-def generate_default_transforms():
- return transforms.Compose([transforms.RandomCrop(32), transforms.ToTensor(), transforms.RandomRotation(5)])
-
-
-def generate_composition_no_tensor():
- return transforms.Compose(
- [transforms.RandomCrop(32),
- transforms.RandomHorizontalFlip(),
- transforms.RandomRotation(5)])
-
-
-@pytest.mark.parametrize('is_tensor_transform,index', [(False, 1), (True, 2)])
-def test_pre_post_to_tensor_compose(is_tensor_transform, index):
- dataset = generate_synthetic_dataset(generate_default_transforms())
- add_vision_dataset_transform(dataset, transforms.RandomAutocontrast(), is_tensor_transform=is_tensor_transform)
- assert dataset.transform is not None
- assert type(dataset.transform.transforms[index]) == transforms.RandomAutocontrast
-
-
-@pytest.mark.parametrize('is_tensor_transform,index', [(False, 0), (True, 1)])
-def test_pre_post_to_tensor(is_tensor_transform, index):
- dataset = generate_synthetic_dataset(transforms.ToTensor())
- add_vision_dataset_transform(dataset, transforms.RandomAutocontrast(), is_tensor_transform=is_tensor_transform)
- assert dataset.transform is not None
- assert type(dataset.transform.transforms[index]) == transforms.RandomAutocontrast
-
-
-@pytest.mark.parametrize('data_transforms', [(generate_composition_no_tensor()), (transforms.RandomHorizontalFlip())])
-def test_default_to_append(data_transforms):
- dataset = generate_synthetic_dataset(data_transforms)
- add_vision_dataset_transform(dataset, transforms.RandomAutocontrast())
- assert dataset.transform is not None
- assert type(dataset.transform.transforms[-1]) == transforms.RandomAutocontrast
-
-
-def test_add_to_none_transform():
- dataset = generate_synthetic_dataset(None)
- add_vision_dataset_transform(dataset, transforms.RandomAutocontrast())
- assert type(dataset.transform) == transforms.RandomAutocontrast
diff --git a/tests/datasets/test_cifar.py b/tests/datasets/test_cifar.py
deleted file mode 100644
index 6eac6e2ebf..0000000000
--- a/tests/datasets/test_cifar.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from composer.datasets import build_cifar10_dataloader, build_synthetic_cifar10_dataloader
-
-
-@pytest.mark.skip # Download is flaky and test is not critical
-@pytest.mark.parametrize('is_train', [False, True])
-@pytest.mark.parametrize('synthetic', [pytest.param(False, marks=pytest.mark.daily), True])
-def test_cifar10_shape_length(is_train, synthetic):
- batch_size = 1
-
- if synthetic:
- dataspec = build_synthetic_cifar10_dataloader(global_batch_size=batch_size, is_train=is_train)
- else:
- dataspec = build_cifar10_dataloader(datadir='/tmp', global_batch_size=batch_size, is_train=is_train)
-
- samples = list(dataspec.dataloader)
- if is_train:
- assert len(samples) == 50000 // batch_size
- else:
- assert len(samples) == 10000 // batch_size
-
- assert samples[0][0].shape == (1, 3, 32, 32)
diff --git a/tests/datasets/test_dataset_utils.py b/tests/datasets/test_dataset_utils.py
deleted file mode 100644
index 720edce59b..0000000000
--- a/tests/datasets/test_dataset_utils.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import List, Tuple
-
-import numpy as np
-import pytest
-import torch
-from PIL import Image
-
-from composer.datasets.utils import pil_image_collate
-
-
-@pytest.fixture
-def num_samples():
- return 4
-
-
-@pytest.fixture
-def image_size():
- return (16, 16)
-
-
-@pytest.fixture
-def pil_image_list(num_samples: int, image_size: Tuple[int, int]):
- return [Image.new(mode='RGB', size=image_size, color=(i, i, i)) for i in range(num_samples)]
-
-
-@pytest.fixture
-def pil_target_list(num_samples: int, image_size: Tuple[int, int]):
- return [Image.new(mode='L', size=image_size, color=i) for i in range(num_samples)]
-
-
-@pytest.fixture
-def correct_image_tensor(num_samples: int, image_size: Tuple[int, int]):
- return torch.arange(num_samples).expand(3, *image_size, -1).permute(3, 0, 1, 2)
-
-
-@pytest.fixture
-def scalar_target_list(num_samples: int):
- return np.arange(num_samples)
-
-
-def test_scalar_target_collate(pil_image_list: List[Image.Image], scalar_target_list: np.ndarray,
- correct_image_tensor: torch.Tensor):
- batch = [(img, target) for img, target in zip(pil_image_list, scalar_target_list)]
- image_tensor, target_tensor = pil_image_collate(batch=batch)
-
- correct_target_tensor = torch.arange(correct_image_tensor.shape[0])
-
- assert torch.all(image_tensor == correct_image_tensor) and torch.all(target_tensor == correct_target_tensor)
-
-
-def test_image_target_collate(pil_image_list: List[Image.Image], pil_target_list: List[Image.Image],
- correct_image_tensor):
- batch = [(img, target) for img, target in zip(pil_image_list, pil_target_list)]
- image_tensor, target_tensor = pil_image_collate(
- batch=batch) # type: ignore "Image" is incompatible with "ndarray[Unknown, Unknown]"
-
- assert torch.all(image_tensor == correct_image_tensor) and torch.all(target_tensor == correct_image_tensor[:, 0])
diff --git a/tests/datasets/test_ffcv_utils.py b/tests/datasets/test_ffcv_utils.py
deleted file mode 100644
index 3614d73387..0000000000
--- a/tests/datasets/test_ffcv_utils.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import pathlib
-
-import pytest
-
-from composer.datasets.ffcv_utils import write_ffcv_dataset
-from composer.datasets.synthetic import SyntheticDataLabelType, SyntheticPILDataset
-
-
-@pytest.mark.vision
-def test_write_ffcv_dataset(tmp_path: pathlib.Path):
- dataset = SyntheticPILDataset(total_dataset_size=1,
- num_classes=1,
- data_shape=[1, 1, 3],
- label_type=SyntheticDataLabelType.CLASSIFICATION_INT,
- num_unique_samples_to_create=1)
- output_file = str(tmp_path / 'ffcv')
- write_ffcv_dataset(dataset, write_path=output_file, num_workers=1)
- assert os.path.exists(output_file)
diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py
index 2e9a461fcf..063f7215bc 100644
--- a/tests/datasets/test_in_context_learning_datasets.py
+++ b/tests/datasets/test_in_context_learning_datasets.py
@@ -9,15 +9,28 @@
import pytest
import torch
-import transformers
from torch.utils.data import DataLoader
-from transformers import AutoTokenizer
from composer import Evaluator
from composer.core import DataSpec
-from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset,
- _get_fewshot_sample_idxs, _make_padded_input,
- get_icl_task_dataloader)
+
+# isort: off
+from composer.datasets.in_context_learning_evaluation import (
+ InContextLearningCodeEvalDataset,
+ InContextLearningDataset,
+ InContextLearningMultipleChoiceTaskDataset,
+ InContextLearningQATaskDataset,
+ InContextLearningSchemaTaskDataset,
+ _get_continuation_span,
+ _get_fewshot_sample_idxs,
+ _make_padded_input,
+ _tokenizer_needs_prefix_space,
+ _trim_context,
+ get_icl_task_dataloader,
+ strip_data,
+)
+# isort: on
+from composer.datasets.utils import MultiTokenEOSCriteria
from composer.loggers import InMemoryLogger
from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
@@ -27,19 +40,122 @@
from tests.common import device, world_size
+def test_strip_data():
+ data_to_strip = {'strip_data': ' boo! \n', 'has_space': ' wa hoo!', 'end_space': 'yoohoo! '}
+ stripped_data = strip_data(data_to_strip)
+ for k, v in stripped_data.items():
+ assert k in data_to_strip
+ assert not v[0].isspace()
+ assert not v[-1].isspace()
+
+
+@pytest.mark.skip(reason="Currently don't have a tokenizer that satisfies this test")
+def test_tokenizer_needs_prefix_space_when_space_not_needed(tiny_gpt2_tokenizer):
+ assert not _tokenizer_needs_prefix_space(tiny_gpt2_tokenizer)
+
+
+def test_tokenizer_needs_prefix_space_when_space_needed():
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m',
+ use_fast=False) # type: ignore reportUnboundVariable
+ assert _tokenizer_needs_prefix_space(tokenizer)
+
+
+def test_trim_context():
+ context = [0] * 99 + [1] * 2037
+ continuation = [2] * 10
+ max_seq_len = 2048
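+    # 2136 context tokens + 10 continuation tokens exceed max_seq_len, so 98
+    # tokens are dropped from the left, leaving 2048 - 10 = 2038 context tokens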
+ trimmed_context = _trim_context(context, continuation, max_seq_len=max_seq_len)
+ assert len(trimmed_context) == 2038
+ assert trimmed_context[0] == 0
+ assert trimmed_context[1] == 1
+
+
+def test_trim_context_no_continuation():
+ context = [0] * 2048
+ max_seq_len = 2048
+ trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+ assert len(trimmed_context) == 2048
+ context = [0] * 3000 + [1]
+ max_seq_len = 2048
+ trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+ assert len(trimmed_context) == 2048
+ assert trimmed_context[-1] == 1
+
+
+def test_get_continuation_span():
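+    # the continuation span is the set of indices the continuation occupies in
+    # the concatenated (context + continuation) token sequence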
+ context = [0] * 200
+ continuation = [1] * 3
+ cont_span = _get_continuation_span(context, continuation)
+ assert torch.all(torch.eq(cont_span, torch.tensor([200, 201, 202])))
+ continuation = [1]
+ cont_span = _get_continuation_span(context, continuation)
+ assert torch.all(torch.eq(cont_span, torch.tensor([200])))
+
+
+@pytest.mark.parametrize('padding_side', ['left', 'right', 'middle'])
+def test_make_padding(tiny_gpt2_tokenizer, padding_side):
+ context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
+ padding_id = tiny_gpt2_tokenizer.eos_token_id
+
+ error_context = contextlib.nullcontext() if padding_side in {'left', 'right'} else pytest.raises(ValueError)
+
+ with error_context:
+ input_ids = _make_padded_input(context, [], 2048, padding_id, padding_side=padding_side)
+
+ if padding_side == 'left':
+ assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id
+ assert input_ids[48:].tolist() == context
+ elif padding_side == 'right':
+ assert input_ids[-1] == tiny_gpt2_tokenizer.eos_token_id
+ assert input_ids[:-48].tolist() == context
+
+
+def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
+ continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids']
+ context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
+ max_seq_len = 2048
+ trimmed_context = _trim_context(context, continuation, max_seq_len)
+ continuation_spans = _get_continuation_span(trimmed_context, continuation)
+ padded_input = _make_padded_input(trimmed_context,
+ continuation,
+ max_seq_len,
+ tiny_gpt2_tokenizer.pad_token_id,
+ padding_side='right')
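+    # the 2000-token continuation leaves room for only 48 context tokens, so the
+    # continuation spans indices 48..2047 and the full 2048 slots need no padding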
+ assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047
+ assert len(padded_input) == 2048
+ assert tiny_gpt2_tokenizer.pad_token_id not in padded_input
+
+
+def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
+ continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids']
+ context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids']
+ max_seq_len = 2048
+ trimmed_context = _trim_context(context, continuation, max_seq_len)
+ continuation_spans = _get_continuation_span(trimmed_context, continuation)
+ padded_input = _make_padded_input(trimmed_context,
+ continuation,
+ max_seq_len,
+ tiny_gpt2_tokenizer.pad_token_id,
+ padding_side='right')
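+    # 200 context + 200 continuation tokens occupy indices 0..399; the remaining
+    # 1648 positions are filled with the pad token on the right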
+ assert continuation_spans[0] == 200 and continuation_spans[-1] == 399
+ assert len(padded_input) == 2048
+ assert padded_input[-1] == tiny_gpt2_tokenizer.pad_token_id
+
+
def test_fewshot_sample_idxs():
rng = random.Random(1234)
- fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=4, sample_idx=4, rng=rng)
+ fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=4, example_idx=4, rng=rng)
assert fewshot_idxs == {0, 1, 2, 3}
- fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=5, sample_idx=4, rng=rng)
+ fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=5, example_idx=4, rng=rng)
assert fewshot_idxs == {0, 1, 2, 3}
- fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=500, sample_idx=4, rng=rng)
+ fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=500, example_idx=4, rng=rng)
assert fewshot_idxs == {0, 1, 2, 3}
- fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10, num_fewshot=7, sample_idx=4, rng=rng)
+ fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10, num_fewshot=7, example_idx=4, rng=rng)
assert len(fewshot_idxs) == 7 and 4 not in fewshot_idxs
@@ -66,30 +182,667 @@ def test_fewshot_sample_idxs_randomness():
assert rng_1_sample_2 != rng_3_sample_2
-def test_batch_padding_logic(tiny_gpt2_tokenizer):
- continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids']
- context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
- _, continuation_spans = _make_padded_input(context, continuation, 2048, tiny_gpt2_tokenizer.eos_token_id)
- # the context (of len 2000) gets clipped to len 48 so that the whole continuation can fit
- assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
+def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path):
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+ gen_kwargs = {'test_arg1': 1, 'test_arg2': 2}
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell:',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map,
+ generation_kwargs=gen_kwargs)
+ assert dl.base_batch['generation_kwargs'] == {'test_arg1': 1, 'test_arg2': 2}
+
+
+def test_stop_sequences_criteria(tiny_gpt2_tokenizer):
+ pytest.importorskip('transformers')
+ eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
+ seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids']
+ seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
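+    # left-pad the shorter sequence so both rows stack into one rectangular batch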
+ seq1 = [tiny_gpt2_tokenizer.pad_token_id] * (len(seq2) - len(seq1)) + seq1
+ input_ids = torch.LongTensor([seq1, seq2])
+ assert not eos_criteria(input_ids, None) # pyright: ignore[reportGeneralTypeIssues]
+
+ eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
+ seq1 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+ seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+ input_ids = torch.LongTensor([seq1, seq2])
+ assert eos_criteria(input_ids, None) # pyright: ignore[reportGeneralTypeIssues]
+
+
+def test_stop_sequences_criteria_sentencepiece(tiny_llama_tokenizer):
+ pytest.importorskip('datasets')
-@pytest.mark.parametrize('padding_side', ['left', 'right', 'middle'])
-def test_make_padding(tiny_gpt2_tokenizer, padding_side):
- context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
- padding_id = tiny_gpt2_tokenizer.eos_token_id
+ tokenizer = tiny_llama_tokenizer
+ eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
+    seq1 = tokenizer('\n\nDogs')['input_ids']  # check that starting with the stop sequence doesn't break it
+ seq2 = tokenizer('Dogs are furry\n\n')['input_ids']
+ seq1 = [tokenizer.eos_token_id] * (len(seq2) - len(seq1)) + seq1
+ input_ids = torch.LongTensor([seq1, seq2])
+ assert not eos_criteria(input_ids, None) # pyright: ignore[reportGeneralTypeIssues]
- error_context = contextlib.nullcontext() if padding_side in {'left', 'right'} else pytest.raises(ValueError)
+ eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
+ seq1 = tokenizer('Dogs are furry\n\n')['input_ids']
+ seq2 = tokenizer('Dogs are furry\n\n')['input_ids']
+ input_ids = torch.LongTensor([seq1, seq2])
+ assert eos_criteria(input_ids, None) # pyright: ignore[reportGeneralTypeIssues]
- with error_context:
- input_ids, _ = _make_padded_input(context, [], 2048, padding_id, padding_side=padding_side)
- if padding_side == 'left':
- assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id
- assert input_ids[48:].tolist() == context
- elif padding_side == 'right':
- assert input_ids[-1] == tiny_gpt2_tokenizer.eos_token_id
- assert input_ids[:-48].tolist() == context
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path):
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell:',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map)
+ assert 'generation_kwargs' not in dl.base_batch
+
+
+def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ generation_kwargs=None)
+ assert len(dl.base_batch['generation_kwargs']) == 3
+
+
+def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ generation_kwargs={'temperature': 0.9})
+ assert 'generation_kwargs' in dl.base_batch
+ assert dl.base_batch['generation_kwargs']['temperature'] == 0.9
+ assert len(dl.base_batch['generation_kwargs']) == 4
+
+
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_construct_context(tiny_gpt2_tokenizer, tmp_path):
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell: ',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map)
+ constructed_context = dl.construct_context({'context': 'quas quas exort', 'answer': 'ice wall'})
+ assert constructed_context == 'Orbs: quas quas exort\nSpell: '
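+ # add_answer=True appends the gold answer after the continuation delimiter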
+ constructed_context = dl.construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}, add_answer=True)
+ assert constructed_context == 'Orbs: quas quas exort\nSpell: ice wall'
+ constructed_context = dl.construct_context({
+ 'context': 'quas quas exort',
+ 'answer': 'ice wall'
+ },
+ preceding_text='The harsh White Waste beckons!',
+ add_answer=True)
+ assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall'
+
+
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path):
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell:',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map)
+ answer = dl.get_answer_from_example({'context': 'wex exort exort', 'answer': 'alacrity'})
+ assert answer == ' alacrity'
+
+
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_fix_eos_on_preamble(tmp_path):
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m',
+ use_fast=False) # type: ignore reportUnboundVariable
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell:',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map)
+ preamble = 'blah blah blah.'
+ tokenized_preamble = tokenizer.encode(preamble)
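+ # manually append a trailing EOS token; _fix_eos_on_preamble should strip it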
+ tokenized_preamble += [tokenizer.eos_token_id]
+ fixed_preamble = dl._fix_eos_on_preamble(tokenized_preamble)
+ assert tokenized_preamble[:-1] == fixed_preamble
+ assert fixed_preamble[-1] != tokenizer.eos_token_id
+
+
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path):
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell: ',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map,
+ tokenize_labels=True)
+ tokenized_example = dl.tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ',
+ {'answer': ' Meatball'})
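+ # reference GPT-2 token ids for the rendered prompt and context (answer tokens included since tokenize_labels=True)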
+ tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25, 19145, 1894]
+ assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input
+ assert tokenized_example['context'][-1] == tokenizer.eos_token_id
+ assert isinstance(tokenized_example['answer'][0], int)
+ assert len(tokenized_example['context']) == seqlen
+ assert 'continuation_indices' in tokenized_example
+
+
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer, tmp_path):
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ hf_loading_vars = {
+ 'split': 'test',
+ 'name': 'invoker',
+ }
+ hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+ dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset',
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell: ',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map,
+ tokenize_labels=False)
+ tokenized_example = dl.tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ',
+ {'answer': ' Meatball'})
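+ # reference GPT-2 token ids; the answer stays an untokenized string when tokenize_labels=False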
+ tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25]
+ assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input
+ assert tokenized_example['context'][-1] == tokenizer.eos_token_id
+ assert len(tokenized_example['context']) == seqlen
+ assert isinstance(tokenized_example['answer'], str)
+
+
+def test_qa_set_cot_no_cot(tmp_path):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ )
+ assert not dl.has_cot
+
+
+def test_qa_set_cot_has_cot(tmp_path):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/gsm8k_small.jsonl'
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ )
+ assert dl.has_cot
+
+
+def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='',
+ continuation_delimiter='',
+ cot_delimiter='',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ )
+ # empirical number from the small test dataset
+ assert dl.max_answer_length == 7
+
+
+def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tiny_gpt2_tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tiny_gpt2_tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ cot_delimiter=' ### ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ )
+ answer = dl.get_answer_from_example({
+ 'context': 'empty',
+ 'answer': 'this is the correct answer',
+ 'chain_of_thought': "Let's think step by step. "
+ })
+ assert answer == 'this is the correct answer'
+
+
+def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tiny_gpt2_tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tiny_gpt2_tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ cot_delimiter=' ### ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ )
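+ # force the chain-of-thought path; triviaqa_small itself carries no CoT field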
+ dl.has_cot = True
+ answer = dl.get_answer_from_example({
+ 'context': 'empty',
+ 'answer': 'this is the correct answer',
+ 'chain_of_thought': "Let's think step by step. "
+ })
+ assert answer == "Let's think step by step. ### this is the correct answer"
+
+
+def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+ pytest.importorskip('datasets')
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ dl = InContextLearningQATaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tiny_gpt2_tokenizer,
+ max_seq_len=1024,
+ pad_tok_id=tiny_gpt2_tokenizer.eos_token_id,
+ num_fewshot=0,
+ fewshot_random_seed=1234,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=': ',
+ cot_delimiter=' ### ',
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+ )
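+ # force the chain-of-thought path so the CoT fields are exercised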
+ dl.has_cot = True
+ tokenized_example = dl.tokenize_example(
+ 'starting prompt', 'a context', {
+ 'context': 'empty',
+ 'answer': 'this is the correct answer',
+ 'aliases': ['this is the right answer', 'this is the best answer'],
+ 'chain_of_thought': "Let's think step by step. "
+ })
+ assert 'aliases' in tokenized_example
+ assert tokenized_example['aliases'] == ['this is the right answer', 'this is the best answer']
+
+
+def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/human_eval_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000}
+
+ dl = InContextLearningCodeEvalDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Code start:',
+ continuation_delimiter='\nPlease code:',
+ destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+ generation_kwargs=gen_kwargs,
+ generations_per_sample=10,
+ )
+
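+ # every prompt should be adjusted to the same length; 148 is empirical from the small test dataset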
+ assert all(len(data['prompt']) == 148 for data in dl.dataset) # pyright: ignore [reportGeneralTypeIssues]
+
+
+def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/human_eval_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000}
+
+ dl = InContextLearningCodeEvalDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ prelimiter='Code start:',
+ continuation_delimiter='\nPlease code:',
+ destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+ generation_kwargs=gen_kwargs,
+ generations_per_sample=10,
+ )
+ assert dl.base_batch['generation_kwargs']['num_beams'] == 9000
+ assert dl.base_batch['generation_kwargs']['top_p'] == .95
+ assert dl.base_batch['generation_kwargs']['temperature'] == .9
+ assert dl.base_batch['generation_kwargs']['do_sample'] is True
+
+
+def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/mmlu_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ dl = InContextLearningMultipleChoiceTaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ continuation_delimiter=' ### ',
+ destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+ )
+ example = {
+ 'context': "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ",
+ 'choices': ['A', 'B', 'C', 'D'],
+ 'gold': 2
+ }
+ tokenized_example = dl.tokenize_example(prompt_and_fewshot='Answer the following: ',
+ ctxt=example['context'],
+ example=example)
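+ # strip eos padding from each per-choice query before decoding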
+ unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']]
+ untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries]
+ correct_output = [
+ "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: A",
+ "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: B",
+ "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: C",
+ "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: D"
+ ]
+ assert untokenized_inputs == correct_output
+
+
+def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/winograd_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ dl = InContextLearningSchemaTaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=' ### ',
+ destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+ )
+ example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+ constructed_context = dl.construct_context(example)
+ assert constructed_context == 'cont one ### this is a continuation'
+ constructed_context = dl.construct_context(example, preceding_text='text')
+ assert constructed_context == '\ncont one ### this is a continuation'
+
+
+def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/winograd_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ dl = InContextLearningSchemaTaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ continuation_delimiter=' ### ',
+ destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+ )
+ example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+ constructed_contexts = dl._construct_multiple_contexts(example)
+ assert constructed_contexts == ['cont one', 'cont two']
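+ # with preceding text, each option gains the example delimiter prefix and continuation delimiter suffix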
+ constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text')
+ assert constructed_contexts == ['\ncont one ###', '\ncont two ###']
+
+
+def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ dataset_uri = f'{local_data}/winograd_small.jsonl'
+ tokenizer = tiny_gpt2_tokenizer
+ seqlen = 2048
+ num_fewshot = 0
+ prompt_string = ''
+ dl = InContextLearningSchemaTaskDataset(
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ fewshot_random_seed=1,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ continuation_delimiter=' ### ',
+ destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+ )
+ example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is a continuation'}
+ tokenized_example = dl.tokenize_example(prompt_and_fewshot='prompt ',
+ context_options=example['context_options'],
+ example=example)
+ assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer'])
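+ # strip eos padding from each context option before decoding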
+ unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']]
+ untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs]
+ assert untokenized_inputs == [
+ 'prompt context one this is a continuation', 'prompt context two this is a continuation'
+ ]
@pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
@@ -103,9 +856,9 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
batch_size = 8
seqlen = 64
dls = get_icl_task_dataloader('multiple_choice',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=2,
@@ -147,9 +900,9 @@ def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_pa
batch_size = 2
seqlen = 64
dl = get_icl_task_dataloader('language_modeling',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=10,
@@ -188,9 +941,9 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
batch_size = 2
seqlen = 64
dl = get_icl_task_dataloader('language_modeling',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=0,
@@ -226,9 +979,9 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
batch_size = 2
seqlen = 64
dl = get_icl_task_dataloader('schema',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=1,
@@ -261,19 +1014,18 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
-def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
+def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path, tiny_llama_tokenizer):
pytest.importorskip('datasets')
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
-
- tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False)
+ tokenizer = tiny_llama_tokenizer
dataset_uri = f'{local_data}/{dataset_uri}'
batch_size = 2
seqlen = 64
dl = get_icl_task_dataloader('schema',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=1,
@@ -318,9 +1070,9 @@ def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_f
batch_size = 2
seqlen = 512
dl = get_icl_task_dataloader('language_modeling',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -360,9 +1112,9 @@ def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_f
batch_size = 4
seqlen = 64
dl = get_icl_task_dataloader('multiple_choice',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -410,9 +1162,9 @@ def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
batch_size = 4
seqlen = 512
dl = get_icl_task_dataloader('multiple_choice',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -466,13 +1218,13 @@ def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
tokenizer = tiny_opt_tokenizer
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
- gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) # share tmp_path across ranks
dl = get_icl_task_dataloader(
- 'question_answering',
- dataset_uri,
- tokenizer,
- 8,
- max_seq_len=64,
+ icl_task_type='question_answering',
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=8,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=0,
prompt_string='',
@@ -553,9 +1305,9 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews
# empirical number from the small test dataset
maximum_answer_length = 7
dl = get_icl_task_dataloader('question_answering',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -605,9 +1357,9 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
# empirical number from the small test dataset
maximum_answer_length = 132
dl = get_icl_task_dataloader('question_answering',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -658,9 +1410,9 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
batch_size = 2
seqlen = 64
dl = get_icl_task_dataloader('multiple_choice',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=1,
@@ -697,16 +1449,18 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
pytest.importorskip('datasets')
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
- tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
+ 'EleutherAI/gpt-neox-20b') # type: ignore reportUnboundVariable
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'code_evaluation',
- dataset_uri,
- tokenizer,
- 8,
- max_seq_len=64,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=8,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=2,
prompt_string='',
@@ -738,7 +1492,6 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
'labels': str,
'prompts': str,
'tests': str,
- 'canonical_solutions': str,
'entry_points': str,
'test_inputs': list,
'test_outputs': list,
@@ -763,25 +1516,27 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
@pytest.mark.parametrize('num_fewshot', [0, 2])
@pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
@pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample,
+ tiny_llama_tokenizer):
pytest.importorskip('datasets')
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
- tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b')
+ tokenizer = tiny_llama_tokenizer
dataset_uri = f'{local_data}/{dataset_uri}'
batch_size = 4
seqlen = 2048
dl = get_icl_task_dataloader('code_evaluation',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string=prompt_string,
example_delimiter='\n',
+ continuation_delimiter='',
question_prelimiter='Code start: \n',
destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
generations_per_sample=generations_per_sample)
@@ -828,25 +1583,26 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
-def test_code_eval_test_cases(dataset_uri, tmp_path):
+def test_code_eval_test_cases(dataset_uri, tmp_path, tiny_llama_tokenizer):
pytest.importorskip('datasets')
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
- tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b')
+ tokenizer = tiny_llama_tokenizer
dataset_uri = f'{local_data}/{dataset_uri}'
batch_size = 4
seqlen = 512
dl = get_icl_task_dataloader('code_evaluation',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=0,
prompt_string='',
example_delimiter='\n',
+ continuation_delimiter='',
question_prelimiter='Code start: \n',
destination_path=str(tmp_path / f'icl_.jsonl'),
generations_per_sample=1)
@@ -866,9 +1622,8 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left
mod = types.ModuleType('test_module')
- for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['canonical_solutions'],
- batch['test_inputs'], batch['test_outputs'],
- batch['entry_points']):
+ for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'],
+ batch['test_outputs'], batch['entry_points']):
exec(prompt + solution, mod.__dict__)
for test_input, test_output in zip(inputs, outputs):
result = mod.__dict__[entry_point](*eval(test_input))
@@ -876,26 +1631,27 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
-def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
+def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path, tiny_llama_tokenizer):
pytest.importorskip('datasets')
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
- tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b')
+ tokenizer = tiny_llama_tokenizer
dataset_uri = f'{local_data}/{dataset_uri}'
batch_size = 2
seqlen = 64
with pytest.raises(ValueError, match=r'.* pass_at_k .*'):
get_icl_task_dataloader('code_evaluation',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=0,
prompt_string='',
example_delimiter='\n',
+ continuation_delimiter='',
question_prelimiter='Code start: \n',
destination_path=str(tmp_path / f'icl_.jsonl'),
pass_at_k=10,
@@ -911,23 +1667,29 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
- tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b')
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable
dataset_uri = f'{local_data}/{dataset_uri}'
batch_size = 4
seqlen = 2048
dl = get_icl_task_dataloader('code_evaluation',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string=prompt_string,
example_delimiter='\n',
+ continuation_delimiter='',
question_prelimiter='Code start: \n',
destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
- generations_per_sample=generations_per_sample)
+ generations_per_sample=generations_per_sample,
+ generation_kwargs={
+ 'temperature': .9,
+ 'top_k': 40
+ })
assert isinstance(dl, DataSpec)
assert isinstance(dl.dataloader, DataLoader) # pyright
@@ -970,6 +1732,59 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
)
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+ pytest.importorskip('datasets')
+
+ local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+ transformers = pytest.importorskip('transformers')
+ tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable
+ dataset_uri = f'{local_data}/{dataset_uri}'
+ batch_size = 4
+ seqlen = 512
+
+ dl = get_icl_task_dataloader('code_evaluation',
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter='',
+ question_prelimiter='Code start: \n',
+ destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+ generations_per_sample=1,
+ generation_kwargs={
+ 'temperature': .9,
+ 'top_k': 40
+ })
+ assert isinstance(dl, DataSpec)
+ assert isinstance(dl.dataloader, DataLoader) # pyright
+ batch = next(dl.dataloader._get_iterator())
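+ # split the batch into single-example microbatches; generation kwargs should be carried into each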
+ microbatch_size = 1
+ microbatches = dl.split_batch(batch, microbatch_size)
+ assert len(microbatches) == 4
+ for microbatch in microbatches:
+ assert dl.get_num_samples_in_batch(microbatch) == 1
+ assert 'input_ids' in microbatch
+ # TODO: what should this be?
+ # assert tuple(microbatch['input_ids'].shape) == (microbatch_size, seqlen)
+ assert 'attention_mask' in microbatch
+ # assert tuple(microbatch['attention_mask'].shape) == (microbatch_size, seqlen)
+ assert isinstance(microbatch['generation_kwargs'], dict)
+ assert microbatch['generation_kwargs']['temperature'] == .9
+ assert microbatch['generation_kwargs']['top_k'] == 40
+ assert microbatch['generation_kwargs']['pad_token_id'] == 0
+ assert microbatch['generation_kwargs']['num_beams'] == 1
+ assert microbatch['generation_kwargs']['num_return_sequences'] == 1
+ assert microbatch['generation_kwargs']['do_sample'] is True
+ assert microbatch['generation_kwargs']['use_cache'] is True
+ assert microbatch['generation_kwargs']['eos_token_id'] == 0
+
+
@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
@pytest.mark.parametrize('num_fewshot', [0, 5])
@device('gpu')
@@ -979,11 +1794,12 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 2
dl = get_icl_task_dataloader(
'language_modeling',
- dataset_uri,
- tokenizer,
- 2,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=2048,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -995,6 +1811,7 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy'])
+ transformers = pytest.importorskip('transformers')
config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M')
model = transformers.AutoModelForCausalLM.from_config(config)
model = HuggingFaceModel(
@@ -1010,8 +1827,8 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][0][1].item() == 0
-@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
@pytest.mark.parametrize('num_fewshot', [0, 5])
+@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model):
pytest.importorskip('datasets')
@@ -1019,12 +1836,13 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 8
dl = get_icl_task_dataloader(
'schema',
- dataset_uri,
- tokenizer,
- 8,
- max_seq_len=64,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1065,13 +1883,16 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 8
+ max_seq_len = 64
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
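+ # seed because the fewshot selection is currently unseeded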
+ reproducibility.seed_all(1234)
dls = get_icl_task_dataloader('multiple_choice',
- dataset_uri,
- tokenizer,
- 8,
- max_seq_len=64,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=max_seq_len,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1104,29 +1925,35 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f
@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
-@device('gpu')
@pytest.mark.parametrize('num_fewshot', [0, 5])
-def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model):
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
+@device('gpu')
+@world_size(1, 2)
+def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+ tiny_gpt2_model):
pytest.importorskip('datasets')
in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 8
+ tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+ gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
# seed because the fewshot selection is currently unseeded
reproducibility.seed_all(1234)
dl = get_icl_task_dataloader(
'multiple_choice',
- dataset_uri,
- tokenizer,
- 8,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=64,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
example_delimiter='\n',
continuation_delimiter=': ',
- destination_path=str(tmp_path / 'icl.jsonl'),
+ destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
)
evaluator = Evaluator(label='mc', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
@@ -1146,14 +1973,17 @@ def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenize
with open(dataset_uri) as f:
for _ in f:
num_samples += 1
- assert trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total == num_samples
+ total = trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total
+ dist.all_reduce(total) # type: ignore
+ assert total.item() == num_samples # type: ignore
+@pytest.mark.parametrize('num_fewshot', [0, 5])
@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
@device('gpu')
@world_size(1, 2)
-@pytest.mark.parametrize('num_fewshot', [0, 5])
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
dataset_uri, tmp_path):
pytest.importorskip('datasets')
@@ -1162,14 +1992,15 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_opt_tokenizer
+ batch_size = 4
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'question_answering',
- dataset_uri,
- tokenizer,
- 2,
- max_seq_len=64,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1193,11 +2024,12 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer
assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+@pytest.mark.parametrize('num_fewshot', [5])
@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
@device('gpu')
@world_size(1, 2)
-@pytest.mark.parametrize('num_fewshot', [5])
@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
dataset_uri, tmp_path):
pytest.importorskip('datasets')
@@ -1206,14 +2038,15 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_opt_tokenizer
+ batch_size = 4
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'question_answering',
- dataset_uri,
- tokenizer,
- 2,
- max_seq_len=256,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1239,9 +2072,9 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_
@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 5])
@device('gpu')
@world_size(1, 2)
-@pytest.mark.parametrize('num_fewshot', [0, 5])
@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
tmp_path):
@@ -1250,14 +2083,15 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 2
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'question_answering',
- dataset_uri,
- tokenizer,
- 2,
- max_seq_len=64,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1283,10 +2117,10 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g
@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
-@device('gpu')
-@world_size(1, 2)
@pytest.mark.parametrize('num_fewshot', [5])
@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@device('gpu')
+@world_size(1, 2)
def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
tmp_path):
pytest.importorskip('datasets')
@@ -1294,14 +2128,15 @@ def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_ur
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 2
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'question_answering',
- dataset_uri,
- tokenizer,
- 2,
- max_seq_len=256,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=1024,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1340,10 +2175,10 @@ def test_code_eval_requires_valid_envvar(monkeypatch):
@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0])
+@pytest.mark.parametrize('generations_per_sample', range(1, 3))
@device('gpu')
@world_size(1, 2)
-@pytest.mark.parametrize('num_fewshot', [0])
-@pytest.mark.parametrize('generations_per_sample', [1, 2])
@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
dataset_uri, tmp_path, generations_per_sample):
@@ -1353,15 +2188,16 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_opt_tokenizer
+ batch_size = 4
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'code_evaluation',
- dataset_uri,
- tokenizer,
- 2,
- max_seq_len=256,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=150,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1391,10 +2227,10 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token
@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0])
+@pytest.mark.parametrize('generations_per_sample', range(1, 3))
@device('gpu')
@world_size(1, 2)
-@pytest.mark.parametrize('num_fewshot', [0])
-@pytest.mark.parametrize('generations_per_sample', [1, 2])
@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer,
tiny_t5_model, tmp_path, generations_per_sample):
@@ -1404,14 +2240,15 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_t5_tokenizer
+ batch_size = 2
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'code_evaluation',
- dataset_uri,
- tokenizer,
- 2,
- max_seq_len=256,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=175,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
prompt_string='',
@@ -1438,11 +2275,11 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few
@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
-@device('gpu')
-@world_size(1, 2)
@pytest.mark.parametrize('num_fewshot', [0, 2])
@pytest.mark.parametrize('generations_per_sample', [1])
@pytest.mark.filterwarnings(r'ignore: Input length of input_ids is')
+@device('gpu')
+@world_size(1, 2)
@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
tiny_gpt2_model, tmp_path, generations_per_sample):
@@ -1452,13 +2289,14 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot,
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
dataset_uri = f'{local_data}/{dataset_uri}'
tokenizer = tiny_gpt2_tokenizer
+ batch_size = 2
tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
dl = get_icl_task_dataloader(
'code_evaluation',
- dataset_uri,
- tokenizer,
- 2,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=64 * num_fewshot,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=num_fewshot,
@@ -1496,9 +2334,9 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
batch_size = 2
seqlen = 512
dl = get_icl_task_dataloader('language_modeling',
- dataset_uri,
- tokenizer,
- batch_size,
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
max_seq_len=seqlen,
pad_tok_id=tokenizer.eos_token_id,
num_fewshot=1,
@@ -1522,3 +2360,112 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
assert first_batch_without_last_word.count(' UNIQUE ') == 1
assert second_batch_without_last_word.count(' UNIQUE ') == 1
+
+
+@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+@pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', ''])
+@pytest.mark.parametrize('hf_loading_vars', [{
+ 'split': 'test',
+ 'name': 'juggernaut',
+}])
+@pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}])
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+ hf_loading_vars, hf_parsing_map):
+ pytest.importorskip('datasets')
+
+ tokenizer = tiny_gpt2_tokenizer
+ batch_size = 2
+ seqlen = 2048
+ dl = get_icl_task_dataloader('language_modeling',
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=0,
+ prompt_string='',
+ example_delimiter='\n',
+ continuation_delimiter=' ',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map)
+ assert isinstance(dl, DataSpec)
+ assert isinstance(dl.dataloader, DataLoader) # pyright
+ batch = next(dl.dataloader._get_iterator())
+
+ assert 'input_ids' in batch
+ assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+ assert 'attention_mask' in batch
+ assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+ assert 'continuation_indices' in batch
+ assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+ assert 'mode' in batch
+ assert batch['mode'] == 'icl_task'
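+ # continuation_indices give the positions of the target span within input_ids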
+ min_idx = min(batch['continuation_indices'][0]).item()
+ max_idx = max(batch['continuation_indices'][0]).item()
+ assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.'
+
+ decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']]
+ assert decoded_batch[0] == "Looks like it's just you and me."
+ assert decoded_batch[1] == "There's a fine line between bravery and stupidity."
+
+
+@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+@pytest.mark.parametrize('prompt_string', ['What spell does this invoke? ', ''])
+@pytest.mark.parametrize('hf_loading_vars', [{
+ 'split': 'test',
+ 'name': 'invoker',
+}])
+@pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}])
+@pytest.mark.filterwarnings(
+ r'ignore:The repository for mosaicml/test_dataset contains custom code which must.*:FutureWarning')
+def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+ hf_loading_vars, hf_parsing_map):
+ pytest.importorskip('datasets')
+
+ tokenizer = tiny_gpt2_tokenizer
+ batch_size = 2
+ seqlen = 2048
+
+ # empirical number from the small test dataset
+ maximum_answer_length = 4
+
+ dl = get_icl_task_dataloader('question_answering',
+ dataset_uri=dataset_uri,
+ tokenizer=tokenizer,
+ batch_size=batch_size,
+ max_seq_len=seqlen,
+ pad_tok_id=tokenizer.eos_token_id,
+ num_fewshot=num_fewshot,
+ prompt_string=prompt_string,
+ example_delimiter='\n',
+ question_prelimiter='Orbs: ',
+ continuation_delimiter='\nSpell:',
+ destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+ hf_loading_vars=hf_loading_vars,
+ hf_parsing_map=hf_parsing_map)
+ assert isinstance(dl, DataSpec)
+ assert isinstance(dl.dataloader, DataLoader) # pyright
+ batch = next(dl.dataloader._get_iterator())
+
+ assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
+ assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+ assert batch['mode'] == 'generate'
+ # the maximum generation length from the small test data
+ assert batch['generation_length'] == maximum_answer_length
+ assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
+
+ decoded_batch = tokenizer.batch_decode(batch['input_ids'])
+ assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch)
+ assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch)
+
+ if len(prompt_string) > 0:
+ assert all(item.count('What spell does this invoke? ') == 1 for item in decoded_batch)
+ assert all(
+ set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']]))
+ assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:')
+ assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:')
diff --git a/tests/datasets/test_mnist.py b/tests/datasets/test_mnist.py
deleted file mode 100644
index 7342184d03..0000000000
--- a/tests/datasets/test_mnist.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from composer.datasets import build_mnist_dataloader, build_synthetic_mnist_dataloader
-
-
-@pytest.mark.parametrize('is_train', [False, True])
-@pytest.mark.parametrize('synthetic', [pytest.param(False, marks=pytest.mark.daily), True])
-def test_mnist_shape_length(is_train, synthetic):
- batch_size = 1
-
- if synthetic:
- loader = build_synthetic_mnist_dataloader(global_batch_size=batch_size, is_train=is_train)
- else:
- loader = build_mnist_dataloader(datadir='/tmp', global_batch_size=batch_size, is_train=is_train)
-
- samples = list(loader)
- if is_train:
- assert len(samples) == 60000 // batch_size
- else:
- assert len(samples) == 10000 // batch_size
-
- assert samples[0][0].shape == (1, 1, 28, 28)
diff --git a/tests/datasets/test_segmentation_transforms.py b/tests/datasets/test_segmentation_transforms.py
deleted file mode 100644
index 2e4af40126..0000000000
--- a/tests/datasets/test_segmentation_transforms.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import numpy as np
-import pytest
-from PIL import Image
-
-from composer.datasets.ade20k import (PadToSize, PhotometricDistoration, RandomCropPair, RandomHFlipPair,
- RandomResizePair)
-
-
-@pytest.fixture
-def size():
- return 16, 16
-
-
-@pytest.fixture
-def sample_pair(size):
- img = Image.new(mode='RGB', size=size)
- target = Image.new(mode='L', size=size)
- return img, target
-
-
-def test_random_resize(sample_pair, size):
- random_resize_transform = RandomResizePair(min_scale=0.5, max_scale=2.0, base_size=size)
-
- # Test that the resized image remains within bounds for 10 iterations
- for _ in range(10):
- resized_img, resized_target = random_resize_transform(sample_pair)
- assert resized_img.size == resized_target.size
- assert resized_img.size[0] >= size[0] // 2 and resized_img.size[0] <= size[0] * 2
- assert resized_img.size[1] >= size[1] // 2 and resized_img.size[1] <= size[1] * 2
-
-
-@pytest.mark.parametrize('crop_size', [(8, 8), (32, 32)])
-def test_random_crop(sample_pair, crop_size):
- random_crop_transform = RandomCropPair(crop_size)
- image, target = random_crop_transform(sample_pair)
- assert image.size == target.size
- final_size = min(crop_size[0], sample_pair[0].height), min(crop_size[1], sample_pair[0].width)
- assert final_size == image.size
-
-
-def test_random_hflip(sample_pair):
- old_image, old_target = np.array(sample_pair[0]), np.array(sample_pair[1])
-
- # Always flip
- always_hflip_transform = RandomHFlipPair(probability=1.0)
- new_image, new_target = always_hflip_transform(sample_pair)
- new_image, new_target = np.array(new_image), np.array(new_target)
- assert np.allclose(new_image, old_image[:, ::-1]) and np.allclose(new_target, old_target[:, ::-1])
-
- # Never flip
- always_hflip_transform = RandomHFlipPair(probability=0.0)
- new_image, new_target = always_hflip_transform(sample_pair)
- new_image, new_target = np.array(new_image), np.array(new_target)
- assert np.allclose(new_image, old_image) and np.allclose(new_target, old_target)
-
-
-@pytest.mark.parametrize('pad_size', [(32, 32), (8, 8)])
-def test_pad_transform(sample_pair, pad_size):
- image = sample_pair[0]
- pad_transform = PadToSize(size=pad_size, fill=255)
- padded_image = pad_transform(image)
- final_size = max(pad_size[1], image.width), max(pad_size[0], image.height)
- # Check for correct size and number of padding elements
- assert padded_image.size == final_size
-
- # Check appropriate amount of padding is used
- padded_image = np.array(padded_image)
- initial_area = image.width * image.height
- final_area = final_size[0] * final_size[1]
- n_channels = padded_image.shape[2]
- pad_volume = n_channels * (final_area - initial_area)
- assert pad_volume == (padded_image == 255).sum()
-
-
-def test_photometric_distortion(sample_pair):
- old_image = sample_pair[0]
- # Test no transform case
- photometric_transform = PhotometricDistoration(brightness=1.0, contrast=1.0, saturation=1.0, hue=0)
- new_image = photometric_transform(old_image)
- old_image, new_image = np.array(old_image), np.array(new_image)
- assert np.allclose(old_image, new_image)
diff --git a/tests/datasets/test_synthetic_data.py b/tests/datasets/test_synthetic_data.py
deleted file mode 100644
index 6f62aebb9d..0000000000
--- a/tests/datasets/test_synthetic_data.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Optional
-
-import pytest
-import torch
-
-from composer.datasets.synthetic import (SyntheticBatchPairDataset, SyntheticDataLabelType, SyntheticDataType,
- SyntheticPILDataset)
-
-
-@pytest.mark.parametrize('data_type', [
- SyntheticDataType.GAUSSIAN,
- SyntheticDataType.SEPARABLE,
-])
-@pytest.mark.parametrize('label_type', [
- SyntheticDataLabelType.CLASSIFICATION_ONE_HOT,
- SyntheticDataLabelType.CLASSIFICATION_INT,
-])
-def test_synthetic_batch_pair_creation(data_type: SyntheticDataType, label_type: SyntheticDataLabelType):
- if data_type == SyntheticDataType.SEPARABLE:
- if label_type != SyntheticDataLabelType.CLASSIFICATION_INT:
- pytest.skip('Separable data requires classification int labels')
- num_classes = 2
- label_shape = None
- else:
- num_classes = 10
- label_shape = (1, 10, 12)
-
- if data_type == SyntheticDataType.GAUSSIAN and label_type == SyntheticDataLabelType.CLASSIFICATION_INT:
- pytest.xfail('classification_int is not currently supported with gaussian data')
-
- dataset_size = 1000
- data_shape = (3, 32, 32)
- num_samples_to_create = 10
- dataset = SyntheticBatchPairDataset(total_dataset_size=dataset_size,
- data_shape=data_shape,
- num_unique_samples_to_create=num_samples_to_create,
- data_type=data_type,
- label_type=label_type,
- num_classes=num_classes,
- label_shape=label_shape)
- assert len(dataset) == dataset_size
-
- # verify datapoints are correct
- x, y = dataset[0]
- assert x.size() == data_shape
- if label_type == SyntheticDataLabelType.CLASSIFICATION_INT:
- assert isinstance(y.item(), int)
- elif label_type == SyntheticDataLabelType.CLASSIFICATION_ONE_HOT:
- assert y.size() == (num_classes,)
- assert torch.min(y) == 0
- assert torch.max(y) == 1
-
- # check that points were allocated in memory after the first call to __getitem__
- assert dataset.input_data is not None
- assert dataset.input_target is not None
- # check that the correct number of points were allocated in memory
- assert dataset.input_data.size()[0] == num_samples_to_create
- assert dataset.input_target.size()[0] == num_samples_to_create
-
-    # verify that you can fetch points outside the num_samples_to_create range
- # (still within the total dataset size range)
- x, y = dataset[num_samples_to_create + 1]
- assert x is not None
- assert y is not None
-
-
-@pytest.mark.parametrize('label_type', [
- SyntheticDataLabelType.CLASSIFICATION_ONE_HOT,
- SyntheticDataLabelType.CLASSIFICATION_INT,
-])
-@pytest.mark.parametrize('num_classes', [None, 0])
-def test_synthetic_classification_param_validation(label_type: SyntheticDataLabelType, num_classes: Optional[int]):
- with pytest.raises(ValueError):
- SyntheticBatchPairDataset(total_dataset_size=10,
- data_shape=(2, 2),
- label_type=label_type,
- num_classes=num_classes)
-
-
-@pytest.mark.parametrize('data_type', [
- SyntheticDataType.GAUSSIAN,
- SyntheticDataType.SEPARABLE,
-])
-@pytest.mark.parametrize('label_type', [
- SyntheticDataLabelType.CLASSIFICATION_ONE_HOT,
- SyntheticDataLabelType.CLASSIFICATION_INT,
-])
-def test_synthetic_image_data_creation(data_type: SyntheticDataType, label_type: SyntheticDataLabelType):
- if data_type == SyntheticDataType.SEPARABLE:
- if label_type != SyntheticDataLabelType.CLASSIFICATION_INT:
-            pytest.skip('Separable data requires classification int labels')
- num_classes = 2
- label_shape = None
- else:
- num_classes = 10
- label_shape = (1, 10, 12)
-
- if data_type == SyntheticDataType.GAUSSIAN and label_type == SyntheticDataLabelType.CLASSIFICATION_INT:
- pytest.xfail('classification_int is not currently supported with gaussian data')
-
- dataset_size = 1000
- data_shape = (32, 32)
- num_samples_to_create = 100
- dataset = SyntheticPILDataset(total_dataset_size=dataset_size,
- data_shape=data_shape,
- num_unique_samples_to_create=num_samples_to_create,
- data_type=data_type,
- label_type=label_type,
- num_classes=num_classes,
- label_shape=label_shape)
- assert len(dataset) == dataset_size
-
- # verify datapoints are correct
- x, y = dataset[0]
- assert x.size == data_shape
- if label_type == SyntheticDataLabelType.CLASSIFICATION_INT:
- assert isinstance(y.item(), int)
- elif label_type == SyntheticDataLabelType.CLASSIFICATION_ONE_HOT:
- assert y.size() == (num_classes,)
- assert torch.min(y) == 0
- assert torch.max(y) == 1
-
- # check that points were allocated in memory after the first call to __getitem__
- assert dataset._dataset.input_data is not None
- assert dataset._dataset.input_target is not None
- # check that the correct number of points were allocated in memory
- assert dataset._dataset.input_data.shape[0] == num_samples_to_create
- assert dataset._dataset.input_target.shape[0] == num_samples_to_create
-
-    # verify that you can fetch points outside the num_samples_to_create range
- # (still within the total dataset size range)
- x, y = dataset[num_samples_to_create + 1]
- assert x is not None
- assert y is not None
diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py
index cfd8674338..5ab9b472b0 100644
--- a/tests/fixtures/fixtures.py
+++ b/tests/fixtures/fixtures.py
@@ -244,11 +244,23 @@ def tiny_gpt2_tokenizer_helper():
return hf_tokenizer
+def tiny_llama_tokenizer_helper():
+ transformers = pytest.importorskip('transformers')
+
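+    # use_fast=False forces the slow, sentencepiece-backed tokenizer implementation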
+ hf_tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False)
+ return hf_tokenizer
+
+
@pytest.fixture(scope='session')
def _session_tiny_gpt2_tokenizer(): # type: ignore
return tiny_gpt2_tokenizer_helper()
+@pytest.fixture(scope='session')
+def _session_tiny_llama_tokenizer(): # type: ignore
+ return tiny_llama_tokenizer_helper()
+
+
def tiny_opt_model_helper(config):
transformers = pytest.importorskip('transformers')
@@ -320,6 +332,47 @@ def _session_tiny_t5_model(_session_tiny_t5_config): # type: ignore
return tiny_t5_model_helper(_session_tiny_t5_config)
+def tiny_mistral_config_helper():
+ transformers = pytest.importorskip('transformers')
+
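+    # Scaled-down architecture overrides keep the session-scoped fixture cheap to build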
+ tiny_overrides = {
+ 'hidden_size': 128,
+ 'intermediate_size': 256,
+ 'num_attention_heads': 8,
+ 'num_hidden_layers': 2,
+ 'num_kv_heads': 4
+ }
+ return transformers.AutoConfig.from_pretrained('mistralai/Mistral-7B-v0.1', **tiny_overrides)
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_mistral_config(): # type: ignore
+ return tiny_mistral_config_helper()
+
+
+def tiny_mistral_tokenizer_helper():
+ transformers = pytest.importorskip('transformers')
+
+ hf_tokenizer = transformers.AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1', model_max_length=512)
+ return hf_tokenizer
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_mistral_tokenizer(): # type: ignore
+ return tiny_mistral_tokenizer_helper()
+
+
+def tiny_mistral_model_helper(config):
+ transformers = pytest.importorskip('transformers')
+
+ return transformers.AutoModelForCausalLM.from_config(config)
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_mistral_model(_session_tiny_mistral_config): # type: ignore
+ return tiny_mistral_model_helper(_session_tiny_mistral_config)
+
+
@pytest.fixture
def tiny_bert_model(_session_tiny_bert_model):
return copy.deepcopy(_session_tiny_bert_model)
@@ -360,6 +413,11 @@ def tiny_gpt2_tokenizer(_session_tiny_gpt2_tokenizer):
return copy.deepcopy(_session_tiny_gpt2_tokenizer)
+@pytest.fixture
+def tiny_llama_tokenizer(_session_tiny_llama_tokenizer):
+ return copy.deepcopy(_session_tiny_llama_tokenizer)
+
+
@pytest.fixture
def tiny_gpt2_model(_session_tiny_gpt2_model):
return copy.deepcopy(_session_tiny_gpt2_model)
@@ -393,3 +451,18 @@ def tiny_t5_tokenizer(_session_tiny_t5_tokenizer):
@pytest.fixture
def tiny_t5_model(_session_tiny_t5_model):
return copy.deepcopy(_session_tiny_t5_model)
+
+
+@pytest.fixture
+def tiny_mistral_config(_session_tiny_mistral_config):
+ return copy.deepcopy(_session_tiny_mistral_config)
+
+
+@pytest.fixture
+def tiny_mistral_tokenizer(_session_tiny_mistral_tokenizer):
+ return copy.deepcopy(_session_tiny_mistral_tokenizer)
+
+
+@pytest.fixture
+def tiny_mistral_model(_session_tiny_mistral_model):
+ return copy.deepcopy(_session_tiny_mistral_model)
diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py
index 5ff0a2fa3c..d5de5b8171 100644
--- a/tests/loggers/test_mlflow_logger.py
+++ b/tests/loggers/test_mlflow_logger.py
@@ -29,7 +29,9 @@ def _get_latest_mlflow_run(experiment_name, tracking_uri=None):
# NB: Convert tracking URI to string because MlflowClient doesn't support non-string
# (e.g. PosixPath) tracking URI representations
client = MlflowClient(str(tracking_uri))
- experiment_id = (client.get_experiment_by_name(experiment_name).experiment_id)
+ experiment = client.get_experiment_by_name(experiment_name)
+ assert experiment is not None
+ experiment_id = experiment.experiment_id
first_run_or_empty = client.search_runs(
experiment_ids=[experiment_id],
max_results=1,
@@ -164,6 +166,26 @@ def test_mlflow_experiment_init_experiment_name(monkeypatch):
id_logger.post_close()
+def test_mlflow_experiment_init_existing_composer_run(monkeypatch):
+ """ Test that an existing MLFlow run is used if one already exists in the experiment for the Composer run.
+ """
+ mlflow = pytest.importorskip('mlflow')
+
+ monkeypatch.setattr(mlflow, 'set_tracking_uri', MagicMock())
+ monkeypatch.setattr(mlflow, 'start_run', MagicMock())
+
+ mock_state = MagicMock()
+ mock_state.run_name = 'dummy-run-name'
+
+ existing_id = 'dummy-id'
+ mock_search_runs = MagicMock(return_value=[MagicMock(info=MagicMock(run_id=existing_id))])
+ monkeypatch.setattr(mlflow, 'search_runs', mock_search_runs)
+
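+    # init() should find the existing run via search_runs and resume it
+    # instead of starting a new one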
+ test_logger = MLFlowLogger()
+ test_logger.init(state=mock_state, logger=MagicMock())
+ assert test_logger._run_id == existing_id
+
+
def test_mlflow_experiment_set_up(tmp_path):
""" Test that MLFlow experiment is set up correctly within mlflow
"""
@@ -189,6 +211,7 @@ def test_mlflow_experiment_set_up(tmp_path):
)
run_id = run.info.run_id
experiment_id = run.info.experiment_id
+ tags = run.data.tags
# Check uri set correctly.
assert mlflow_uri.exists()
@@ -207,6 +230,9 @@ def test_mlflow_experiment_set_up(tmp_path):
actual_run_name = run_cfg['run_name']
assert actual_run_name == expected_run_name
+ # Check run tagged with Composer run name.
+ assert tags['composer_run_name'] == mock_state.run_name
+
# Check run ended.
test_mlflow_logger.post_close()
assert mlflow.active_run() is None
@@ -336,6 +362,48 @@ def test_mlflow_save_model(tmp_path, tiny_gpt2_model, tiny_gpt2_tokenizer):
check_hf_tokenizer_equivalence(loaded_model['tokenizer'], tiny_gpt2_tokenizer)
+@pytest.mark.filterwarnings('ignore:.*Setuptools is replacing distutils.*:UserWarning')
+@pytest.mark.filterwarnings("ignore:.*The 'transformers' MLflow Models integration.*:FutureWarning")
+def test_mlflow_save_peft_model(tmp_path, tiny_mistral_model, tiny_mistral_tokenizer):
+ mlflow = pytest.importorskip('mlflow')
+ peft = pytest.importorskip('peft')
+
+    # Reload just so the model has the updated base model name
+ tiny_mistral_model.save_pretrained(tmp_path / Path('tiny_mistral_save_pt'))
+ tiny_mistral_model = tiny_mistral_model.from_pretrained(tmp_path / Path('tiny_mistral_save_pt'))
+
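+    # Minimal LoRA config; get_peft_model wraps the base model with adapter layers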
+ peft_config = {'peft_type': 'LORA'}
+ peft_model = peft.get_peft_model(tiny_mistral_model, peft.get_peft_config(peft_config))
+
+ mlflow_uri = tmp_path / Path('my-test-mlflow-uri')
+ mlflow_exp_name = 'test-log-model-exp-name'
+ test_mlflow_logger = MLFlowLogger(
+ tracking_uri=mlflow_uri,
+ experiment_name=mlflow_exp_name,
+ )
+
+ mock_state = MagicMock()
+ mock_state.run_name = 'dummy-run-name' # this run name should be unused.
+ mock_logger = MagicMock()
+
+ peft_model.save_pretrained(tmp_path / Path('peft_model_save_pt'))
+ tiny_mistral_tokenizer.save_pretrained(tmp_path / Path('peft_model_save_pt'))
+
+ local_mlflow_save_path = str(tmp_path / Path('my_model_local'))
+ test_mlflow_logger.init(state=mock_state, logger=mock_logger)
+ test_mlflow_logger.save_model(
+ flavor='peft',
+ path=local_mlflow_save_path,
+ save_pretrained_dir=str(tmp_path / Path('peft_model_save_pt')),
+ )
+ test_mlflow_logger.post_close()
+
+ loaded_model = mlflow.pyfunc.load_model(local_mlflow_save_path).unwrap_python_model()
+
+ check_hf_model_equivalence(loaded_model.model, tiny_mistral_model)
+ check_hf_tokenizer_equivalence(loaded_model.tokenizer, tiny_mistral_tokenizer)
+
+
@pytest.mark.filterwarnings('ignore:.*Setuptools is replacing distutils.*:UserWarning')
@pytest.mark.filterwarnings("ignore:.*The 'transformers' MLflow Models integration.*:FutureWarning")
def test_mlflow_register_model(tmp_path, monkeypatch):
@@ -364,11 +432,54 @@ def test_mlflow_register_model(tmp_path, monkeypatch):
name='my_model',
)
- assert mlflow.register_model.called_with(model_uri=local_mlflow_save_path,
- name='my_catalog.my_schema.my_model',
- await_registration_for=300,
- tags=None,
- registry_uri='databricks-uc')
+ mlflow.register_model.assert_called_with(
+ model_uri=local_mlflow_save_path,
+ name='my_catalog.my_schema.my_model',
+ await_registration_for=300,
+ tags=None,
+ )
+ assert mlflow.get_registry_uri() == 'databricks-uc'
+
+ test_mlflow_logger.post_close()
+
+
+@pytest.mark.filterwarnings('ignore:.*Setuptools is replacing distutils.*:UserWarning')
+@pytest.mark.filterwarnings("ignore:.*The 'transformers' MLflow Models integration.*:FutureWarning")
+def test_mlflow_register_model_with_run_id(tmp_path, monkeypatch):
+ mlflow = pytest.importorskip('mlflow')
+
+ mlflow_uri = tmp_path / Path('my-test-mlflow-uri')
+ mlflow_exp_name = 'test-log-model-exp-name'
+ test_mlflow_logger = MLFlowLogger(
+ tracking_uri=mlflow_uri,
+ experiment_name=mlflow_exp_name,
+ model_registry_prefix='my_catalog.my_schema',
+ model_registry_uri='databricks-uc',
+ )
+
+ monkeypatch.setattr(test_mlflow_logger._mlflow_client, 'create_model_version', MagicMock())
+ monkeypatch.setattr(test_mlflow_logger._mlflow_client, 'create_registered_model',
+ MagicMock(return_value=type('MockResponse', (), {'name': 'my_catalog.my_schema.my_model'})))
+
+ mock_state = MagicMock()
+ mock_state.run_name = 'dummy-run-name' # this run name should be unused.
+ mock_logger = MagicMock()
+
+ local_mlflow_save_path = str(tmp_path / Path('my_model_local'))
+ test_mlflow_logger.init(state=mock_state, logger=mock_logger)
+
+ test_mlflow_logger.register_model_with_run_id(
+ model_uri=local_mlflow_save_path,
+ name='my_model',
+ )
+
+ test_mlflow_logger._mlflow_client.create_model_version.assert_called_with(
+ name='my_catalog.my_schema.my_model',
+ source=local_mlflow_save_path,
+ run_id=test_mlflow_logger._run_id,
+ await_creation_for=300,
+ tags=None,
+ )
assert mlflow.get_registry_uri() == 'databricks-uc'
test_mlflow_logger.post_close()
@@ -488,7 +599,8 @@ def test_mlflow_logging_works(tmp_path, device):
actual_params_list = [param_filepath.stem for param_filepath in param_path.iterdir()]
expected_params_list = [
- 'num_cpus_per_node', 'node_name', 'num_nodes', 'rank_zero_seed', 'composer_version', 'composer_commit_hash'
+ 'num_cpus_per_node', 'node_name', 'num_nodes', 'rank_zero_seed', 'composer_version', 'composer_commit_hash',
+ 'mlflow_experiment_id', 'mlflow_run_id'
]
assert set(expected_params_list) == set(actual_params_list)
@@ -549,3 +661,96 @@ def before_forward(self, state: State, logger: Logger):
run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id)
im_dir = run_file_path / Path('artifacts')
assert len(os.listdir(im_dir)) == expected_num_ims
+
+
+@device('cpu')
+def test_mlflow_ignore_metrics(tmp_path, device):
+ mlflow_uri = tmp_path / Path('my-test-mlflow-uri')
+ experiment_name = 'mlflow_logging_test'
+ test_mlflow_logger = MLFlowLogger(
+ tracking_uri=mlflow_uri,
+ experiment_name=experiment_name,
+ log_system_metrics=False,
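+        # Glob-style patterns: everything under metrics/eval/ plus train CrossEntropy should be dropped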
+ ignore_metrics=['metrics/eval/*', 'nothing/should/match', 'metrics/train/CrossEntropy'],
+ )
+
+ dataset_size = 64
+ batch_size = 4
+ num_batches = 4
+ eval_interval = '1ba'
+
+ trainer = Trainer(model=SimpleConvModel(),
+ loggers=test_mlflow_logger,
+ train_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+ eval_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+ max_duration=f'{num_batches}ba',
+ eval_interval=eval_interval,
+ device=device)
+ trainer.fit()
+ # Allow async logging to finish.
+ time.sleep(3)
+ test_mlflow_logger.post_close()
+
+ run = _get_latest_mlflow_run(
+ experiment_name=experiment_name,
+ tracking_uri=mlflow_uri,
+ )
+ run_id = run.info.run_id
+ experiment_id = run.info.experiment_id
+
+ run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id)
+
+ # Test metrics logged.
+ for metric_name in [
+ 'metrics/train/MulticlassAccuracy',
+ 'loss/train/total',
+ ]:
+ metric_file = run_file_path / Path('metrics') / Path(metric_name)
+ with open(metric_file) as f:
+ csv_reader = csv.reader(f, delimiter=' ')
+ lines = list(csv_reader)
+
+ assert len(lines) == num_batches
+
+ # Test metrics are not logged.
+ for metric_name in ['metrics/eval/MulticlassAccuracy', 'metrics/eval/CrossEntropy', 'metrics/train/CrossEntropy']:
+ metric_file = run_file_path / Path('metrics') / Path(metric_name)
+ assert not os.path.exists(metric_file)
+
+ # Test system metrics are not logged.
+ metric_file = run_file_path / Path('metrics') / Path('system/cpu_utilization_percentage')
+ assert not os.path.exists(metric_file)
+
+
+def test_mlflow_ignore_hyperparameters(tmp_path):
+ mlflow_uri = tmp_path / Path('my-test-mlflow-uri')
+ experiment_name = 'mlflow_logging_test'
+ test_mlflow_logger = MLFlowLogger(tracking_uri=mlflow_uri,
+ experiment_name=experiment_name,
+ log_system_metrics=False,
+ ignore_hyperparameters=['num*', 'mlflow_run_id', 'nothing'])
+
+    Trainer(model=SimpleConvModel(), loggers=test_mlflow_logger, max_duration='4ba')
+ # Allow async logging to finish.
+ time.sleep(3)
+ test_mlflow_logger.post_close()
+
+ run = _get_latest_mlflow_run(
+ experiment_name=experiment_name,
+ tracking_uri=mlflow_uri,
+ )
+ run_file_path = mlflow_uri / Path(run.info.experiment_id) / Path(run.info.run_id)
+
+ # Test params logged.
+ param_path = run_file_path / Path('params')
+ actual_params_list = [param_filepath.stem for param_filepath in param_path.iterdir()]
+
+ # should not see num_cpus_per_node, num_nodes, mlflow_run_id
+ expected_params_list = [
+ 'node_name',
+ 'rank_zero_seed',
+ 'composer_version',
+ 'composer_commit_hash',
+ 'mlflow_experiment_id',
+ ]
+ assert set(expected_params_list) == set(actual_params_list)
diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py
index 106acfc6fc..0834e3dbf0 100644
--- a/tests/loggers/test_mosaicml_logger.py
+++ b/tests/loggers/test_mosaicml_logger.py
@@ -59,10 +59,11 @@ def test_format_data_to_json_serializable():
'key3': 3.14,
'key4': True,
'key5': torch.tensor([1, 2, 3]),
- 'key6': {
+ 'key6': torch.tensor([42]),
+ 'key7': {
'inner_key': 'inner_value'
},
- 'key7': [1, 2, 3],
+ 'key8': [1, 2, 3],
}
formatted_data = format_data_to_json_serializable(data)
@@ -72,10 +73,11 @@ def test_format_data_to_json_serializable():
'key3': 3.14,
'key4': True,
'key5': 'Tensor of shape torch.Size([3])',
- 'key6': {
+ 'key6': 42,
+ 'key7': {
'inner_key': 'inner_value'
},
- 'key7': [1, 2, 3],
+ 'key8': [1, 2, 3],
}
assert formatted_data == expected_formatted_data
@@ -83,6 +85,7 @@ def test_format_data_to_json_serializable():
@pytest.mark.parametrize('callback_cls', get_cbs_and_marks(callbacks=True))
@world_size(1, 2)
+@pytest.mark.filterwarnings('ignore::UserWarning')
def test_logged_data_is_json_serializable(monkeypatch, callback_cls: Type[Callback], world_size):
"""Test that all logged data is json serializable, which is a requirement to use MAPI."""
diff --git a/tests/loggers/test_neptune_logger.py b/tests/loggers/test_neptune_logger.py
new file mode 100644
index 0000000000..4463595c0f
--- /dev/null
+++ b/tests/loggers/test_neptune_logger.py
@@ -0,0 +1,149 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+import os
+import uuid
+from pathlib import Path
+from typing import Sequence
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+from torch.utils.data import DataLoader
+
+from composer import Trainer
+from composer._version import __version__
+from composer.loggers import NeptuneLogger
+from composer.utils import dist
+from tests.common import RandomImageDataset, SimpleConvModel
+from tests.common.markers import device
+
+
+@pytest.fixture
+def test_neptune_logger() -> NeptuneLogger:
+ neptune_project = 'test_project'
+ neptune_api_token = 'test_token'
+
+ neptune_logger = NeptuneLogger(
+ project=neptune_project,
+ api_token=neptune_api_token,
+ rank_zero_only=False,
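+        # 'debug' mode keeps the run local, so the test never contacts a Neptune backend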
+ mode='debug',
+ upload_artifacts=True,
+ )
+
+ return neptune_logger
+
+
+def test_neptune_init(test_neptune_logger):
+ mock_state = MagicMock()
+ mock_state.run_name = 'dummy-run-name' # should appear in sys/tags
+
+ test_neptune_logger.init(state=mock_state, logger=MagicMock())
+
+ assert test_neptune_logger.neptune_run is not None
+
+ test_neptune_logger.neptune_run.sync()
+ assert test_neptune_logger.neptune_run[NeptuneLogger.integration_version_key].fetch() == __version__
+ assert test_neptune_logger.neptune_run['sys/name'].fetch() == 'dummy-run-name'
+ assert test_neptune_logger.base_handler['rank'].fetch() == 0
+
+
+@device('cpu')
+def test_neptune_logging(device, test_neptune_logger):
+
+ dataset_size = 64
+ batch_size = 4
+ num_batches = 4
+ eval_interval = '1ba'
+
+ trainer = Trainer(model=SimpleConvModel(),
+ loggers=test_neptune_logger,
+ train_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+ eval_dataloader=DataLoader(RandomImageDataset(size=dataset_size), batch_size),
+ max_duration=f'{num_batches}ba',
+ eval_interval=eval_interval,
+ device=device)
+ trainer.fit()
+
+ assert test_neptune_logger.neptune_run is not None
+ assert test_neptune_logger.base_handler is not None
+
+ for metric_name in [
+ 'metrics/train/MulticlassAccuracy', 'metrics/eval/MulticlassAccuracy', 'metrics/eval/CrossEntropy',
+ 'loss/train/total'
+ ]:
+ path = f'{test_neptune_logger._base_namespace}/{test_neptune_logger.metric_namespace}/{metric_name}'
+ assert test_neptune_logger.neptune_run.exists(path)
+
+ for hyperparam_name in ['node_name', 'num_cpus_per_node', 'num_nodes', 'rank_zero_seed']:
+ path = f'{test_neptune_logger._base_namespace}/{test_neptune_logger.hyperparam_namespace}/{hyperparam_name}'
+ assert test_neptune_logger.neptune_run.exists(path)
+
+ assert test_neptune_logger.base_handler['hyperparameters/num_nodes'].fetch() == 1
+
+
+@pytest.mark.gpu
+@pytest.mark.world_size(1, 2)
+def test_upload_and_download_file(test_neptune_logger, tmp_path, dummy_state):
+ neptune_artifact_name = 'test-neptune-artifact-' + str(uuid.uuid4())
+ tmp_paths = dist.all_gather_object(os.path.abspath(tmp_path))
+ save_folder = Path(tmp_paths[0])
+ file_content = 'hello from Neptune!'
+
+ dummy_neptune_artifact_path = save_folder / 'neptune_artifact.txt'
+ if dist.get_global_rank() == 0:
+ with open(dummy_neptune_artifact_path, 'w+') as f:
+ f.write(file_content)
+
+ test_neptune_logger.upload_file(state=dummy_state,
+ file_path=dummy_neptune_artifact_path,
+ remote_file_name=neptune_artifact_name)
+
+ dist.barrier()
+
+ assert test_neptune_logger.neptune_run.exists(f'{test_neptune_logger._base_namespace}/{neptune_artifact_name}')
+
+ dst_path = save_folder / 'neptune_artifact'
+
+ test_neptune_logger.download_file(
+ remote_file_name=neptune_artifact_name,
+ destination=str(dst_path),
+ )
+
+ assert dst_path.exists()
+
+ with open(str(dst_path), 'r') as fp:
+ assert fp.read() == file_content
+
+
+def test_neptune_log_image(test_neptune_logger):
+ pytest.importorskip('neptune', reason='neptune is optional')
+
+ with patch('neptune.attributes.FileSeries.extend', MagicMock()) as mock_extend:
+ image_variants = [
+ (torch.rand(4, 4), False), # 2D image
+ (torch.rand(2, 3, 4, 4), False), # multiple images, not channels last
+ (torch.rand(2, 3, 4, 4, dtype=torch.float64), False), # same as above but with float64
+ (torch.rand(3, 4, 4), False), # with channels, not channels last
+ ([torch.rand(4, 4, 3)], True), # with channels, channels last
+ (torch.rand(2, 4, 4, 3), True), # multiple images, channels last
+ ([torch.rand(4, 4, 3), torch.rand(4, 4, 3)], True) # multiple images in list
+ ]
+
+ expected_num_images_total = 0
+ for (images, channels_last) in image_variants:
+ if isinstance(images, Sequence):
+ expected_num_images = len(images)
+ np_images = [image.to(torch.float32).numpy() for image in images]
+
+ else:
+ expected_num_images = 1 if images.ndim < 4 else images.shape[0]
+ np_images = images.to(torch.float32).numpy()
+ test_neptune_logger.log_images(images=images, channels_last=channels_last)
+ test_neptune_logger.log_images(images=np_images, channels_last=channels_last)
+
+ expected_num_images *= 2 # One set of torch tensors, one set of numpy arrays
+ expected_num_images_total += expected_num_images
+
+ test_neptune_logger.post_close()
+ assert mock_extend.call_count == 2 * len(image_variants) # One set of torch tensors, one set of numpy arrays
diff --git a/tests/loggers/test_wandb_logger.py b/tests/loggers/test_wandb_logger.py
index 1ccfc5e53a..c9cfe0fc6c 100644
--- a/tests/loggers/test_wandb_logger.py
+++ b/tests/loggers/test_wandb_logger.py
@@ -247,6 +247,7 @@ def test_wandb_log_metrics(test_wandb_logger):
@pytest.mark.parametrize('callback_cls', get_cbs_and_marks(callbacks=True))
+@pytest.mark.filterwarnings('ignore::UserWarning')
def test_logged_data_is_json_serializable(callback_cls: Type[Callback]):
"""Test that all logged data is json serializable, which is a requirement to use wandb."""
pytest.importorskip('wandb', reason='wandb is optional')
diff --git a/tests/metrics/metric_setter_callback.py b/tests/metrics/metric_setter_callback.py
index 6b90c26bfe..63ec8db305 100644
--- a/tests/metrics/metric_setter_callback.py
+++ b/tests/metrics/metric_setter_callback.py
@@ -60,6 +60,7 @@ def _update_metrics(self, state: State):
# assert for pyright error: "module_to_device" is not a known member of "None"
assert self.device is not None
self.device.module_to_device(raw_metric)
+ assert state.train_metrics is not None
if self.dataloader_label == 'train':
state.train_metrics[self.monitor] = raw_metric
else:
diff --git a/tests/metrics/test_current_metrics.py b/tests/metrics/test_current_metrics.py
index d5315e3993..0f75349f9c 100644
--- a/tests/metrics/test_current_metrics.py
+++ b/tests/metrics/test_current_metrics.py
@@ -29,12 +29,14 @@ def batch_end(self, state: State, logger: Logger) -> None:
# The metric should be computed and updated on state every batch.
del logger # unused
# assuming that at least one sample was correctly classified
+ assert state.train_metrics is not None
assert state.train_metrics['MulticlassAccuracy'].compute() != 0.0
self._train_batch_end_train_accuracy = state.train_metrics['MulticlassAccuracy']
def epoch_end(self, state: State, logger: Logger) -> None:
# The metric at epoch end should be the same as on batch end.
del logger # unused
+ assert state.train_metrics is not None
assert state.train_metrics['MulticlassAccuracy'].compute() == self._train_batch_end_train_accuracy
def eval_end(self, state: State, logger: Logger) -> None:
@@ -85,6 +87,7 @@ def test_current_metrics(eval_interval: str,):
return
# Validate the metrics
+ assert trainer.state.train_metrics is not None
assert trainer.state.train_metrics['MulticlassAccuracy'].compute() != 0.0
if compute_val_metrics:
diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py
index a37e53ca8f..e31cd4d410 100644
--- a/tests/metrics/test_nlp_metrics.py
+++ b/tests/metrics/test_nlp_metrics.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import math
+from typing import Optional
import pytest
import torch
@@ -10,8 +11,9 @@
from composer.metrics.nlp import (BinaryF1Score, InContextLearningCodeEvalAccuracy,
InContextLearningExpectedCalibrationError, InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
- InContextLearningMCExpectedCalibrationError, InContextLearningMultipleChoiceAccuracy,
- InContextLearningQAAccuracy, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy)
+ InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
+ InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy,
+ LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy)
@pytest.mark.parametrize('ignore_index', [-100])
@@ -53,7 +55,7 @@ def test_masked_accuracy(ignore_index, num_classes):
@pytest.mark.parametrize('sequence_length', [128])
@pytest.mark.parametrize('num_classes', [2, 10])
@pytest.mark.parametrize('minibatch_size', [56, 256, 768])
-def test_cross_entropy(batch_size: float, ignore_index: int, sequence_length: int, num_classes: int,
+def test_cross_entropy(batch_size: float, ignore_index: Optional[int], sequence_length: int, num_classes: int,
minibatch_size: int):
"""Sanity check to make sure that batched CrossEntropyLoss matches the expected performance.
@@ -71,15 +73,15 @@ def test_cross_entropy(batch_size: float, ignore_index: int, sequence_length: in
generated_preds = torch.randn((batch_size, sequence_length, num_classes))
generated_true = torch.randint(low=0, high=num_classes, size=(batch_size, sequence_length))
+ assert ignore_index is not None
torchmetrics_xent = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index)
ce_with_keys_metric = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index)
- if ignore_index is not None:
- labels_mask = torch.rand((batch_size, sequence_length))
- labels_mask[labels_mask > 0.8] = 1
- labels_mask[labels_mask <= 0.8] = 0
- labels_mask = labels_mask.bool()
- generated_true[labels_mask] = ignore_index
+ labels_mask = torch.rand((batch_size, sequence_length))
+ labels_mask[labels_mask > 0.8] = 1
+ labels_mask[labels_mask <= 0.8] = 0
+ labels_mask = labels_mask.bool()
+ generated_true[labels_mask] = ignore_index
num_batches = math.ceil(batch_size / minibatch_size)
for batch_idx in range(num_batches):
@@ -171,6 +173,53 @@ def test_language_perplexity():
assert torch.equal(torch.exp(ce), perplexity)
+def test_in_context_learning_rename_args_no_op():
+ batch = {'input': [1, 2, 3]}
+ outputs = torch.Tensor([12, 13, 14])
+ labels = torch.Tensor([0, 1, 0])
+ batch, outputs, labels = InContextLearningMetric.rename_args(batch=batch, outputs=outputs, labels=labels)
+ assert batch == {'input': [1, 2, 3]}
+ assert torch.all(torch.eq(outputs, torch.tensor([12, 13, 14])))
+ assert torch.all(torch.eq(labels, torch.tensor([0, 1, 0])))
+
+
+def test_in_context_learning_rename_args_output_and_output_logits():
+ batch = {'input': [1, 2, 3]}
+ outputs = torch.Tensor([12, 13, 14])
+ output_logits = torch.Tensor([.1, .2, .3])
+ labels = torch.Tensor([0, 1, 0])
+ with pytest.raises(ValueError):
+ _, _, _ = InContextLearningMetric.rename_args(batch=batch,
+ outputs=outputs,
+ labels=labels,
+ output_logits=output_logits)
+
+
+def test_in_context_learning_rename_args_rename_output_logits():
+ batch = {'input': [1, 2, 3]}
+ output_logits = torch.Tensor([.1, .2, .3])
+ labels = torch.Tensor([0, 1, 0])
+ batch, outputs, labels = InContextLearningMetric.rename_args(batch=batch,
+ labels=labels,
+ output_logits=output_logits)
+ assert batch == {'input': [1, 2, 3]}
+ assert torch.all(torch.eq(outputs, torch.Tensor([.1, .2, .3]))) # pyright: ignore [reportGeneralTypeIssues]
+ assert torch.all(torch.eq(labels, torch.tensor([0, 1, 0])))
+
+
+def test_in_context_learning_rename_args_fail_on_no_label():
+ batch = {'input': [1, 2, 3]}
+ output_logits = torch.Tensor([.1, .2, .3])
+ with pytest.raises(ValueError):
+ _, _, _ = InContextLearningMetric.rename_args(batch=batch, output_logits=output_logits)
+
+
+def test_in_context_learning_rename_args_fail_on_no_output():
+ batch = {'input': [1, 2, 3]}
+ with pytest.raises(ValueError):
+ _, _, _ = InContextLearningMetric.rename_args(batch=batch)
+
+
def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
contexts = ['The dog is', 'I love to eat', 'I hate', 'The weather is']
continuations = [' furry', ' pie', ' long lines', ' snowy']
@@ -237,12 +286,12 @@ def test_in_context_learning_qa_accuracy():
def test_in_context_learning_qa_cot_accuracy():
outputs = [
- 'chain of thought ### Correct but then some more text', 'Incorrect',
- 'chain of thought ### the CORREct with weird casing and spacing',
+ 'chain of thought ### Correct but then some more text\n\nanother chain of thought ### Incorrect answer this time',
+ 'Incorrect', 'chain of thought ### the CORREct with weird casing and spacing',
'incorrect chain of thought delimiter ## Correct but wrong delimiter'
]
labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct'], ['correct']]
- batch = {'cot_delimiter': ' ### ', 'labels': labels}
+ batch = {'cot_delimiter': ' ### ', 'labels': labels, 'do_normalization': True, 'stopping_criteria': '\n\n'}
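+    # stopping_criteria truncates each output at the first '\n\n', so only the
+    # first chain of thought in an output is scored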
metric = InContextLearningQAAccuracy()
metric.update(outputs, labels, batch)
diff --git a/tests/models/test_bert.py b/tests/models/test_bert.py
deleted file mode 100644
index 82caa80f45..0000000000
--- a/tests/models/test_bert.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-from torch.utils.data import DataLoader
-
-from composer.models.bert import create_bert_classification, create_bert_mlm
-from composer.trainer import Trainer
-from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset
-
-
-def test_bert_mlm_hf_factory(tiny_bert_config, tiny_bert_tokenizer, monkeypatch):
- transformers = pytest.importorskip('transformers')
- monkeypatch.setattr('transformers.AutoConfig.from_pretrained', lambda x: tiny_bert_config)
- bert_composer_model = create_bert_mlm(use_pretrained=False,
- pretrained_model_name='dummy',
- model_config=None,
- tokenizer_name=None,
- gradient_checkpointing=False)
-
- train_dataset = RandomTextLMDataset(size=8,
- vocab_size=tiny_bert_tokenizer.vocab_size,
- sequence_length=8,
- use_keys=True)
- collator = transformers.DataCollatorForLanguageModeling(tokenizer=tiny_bert_tokenizer,
- mlm=True,
- mlm_probability=0.15)
- train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collator)
-
- trainer = Trainer(model=bert_composer_model, train_dataloader=train_dataloader, max_duration='1ep')
- trainer.fit()
-
- assert trainer.state.train_metrics['LanguageCrossEntropy'].compute() > 0.0
-
-
-def test_bert_classification_hf_factory(tiny_bert_config, tiny_bert_tokenizer, monkeypatch):
- pytest.importorskip('transformers')
-
- def config_patch(x, num_labels):
- tiny_bert_config.num_labels = num_labels
- return tiny_bert_config
-
- monkeypatch.setattr('transformers.AutoConfig.from_pretrained', config_patch)
- bert_composer_model = create_bert_classification(use_pretrained=False,
- pretrained_model_name='dummy',
- model_config=None,
- tokenizer_name=None,
- gradient_checkpointing=False,
- num_labels=3)
-
- train_dataset = RandomTextClassificationDataset(size=8,
- vocab_size=tiny_bert_tokenizer.vocab_size,
- sequence_length=8,
- num_classes=3,
- use_keys=True)
- train_dataloader = DataLoader(train_dataset, batch_size=4)
-
- trainer = Trainer(model=bert_composer_model, train_dataloader=train_dataloader, max_duration='1ep')
- trainer.fit()
-
- assert trainer.state.train_metrics['MulticlassAccuracy'].compute() > 0.0
diff --git a/tests/models/test_efficientnet.py b/tests/models/test_efficientnet.py
deleted file mode 100644
index a11dccc87b..0000000000
--- a/tests/models/test_efficientnet.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-import torch
-
-from composer.models.efficientnetb0.efficientnets import EfficientNet
-
-
-@pytest.mark.gpu
-def test_efficientb0_activate_shape():
- # Running this test on cuda as convolutions are slow on CPU
- random_input = torch.rand(2, 3, 224, 224).cuda()
-
- model = EfficientNet.get_model_from_name(
- 'efficientnet-b0',
- num_classes=1000,
- drop_connect_rate=0.2,
- ).cuda()
- # Test Stem
- out = model.conv_stem(random_input)
- out = model.bn1(out)
- out = model.act1(out)
- assert out.shape == (2, 32, 112, 112)
-
- # Test each block, shapes found at Table 1 of EfficientNet paper
- block_act_shape = [
- (2, 16, 112, 112),
- (2, 24, 56, 56),
- (2, 24, 56, 56),
- (2, 40, 28, 28),
- (2, 40, 28, 28),
- (2, 80, 14, 14),
- (2, 80, 14, 14),
- (2, 80, 14, 14),
- (2, 112, 14, 14),
- (2, 112, 14, 14),
- (2, 112, 14, 14),
- (2, 192, 7, 7),
- (2, 192, 7, 7),
- (2, 192, 7, 7),
- (2, 192, 7, 7),
- (2, 320, 7, 7),
- ]
- for i, block in enumerate(model.blocks):
- out = block(out)
- assert out.shape == block_act_shape[i]
-
- out = model.conv_head(out)
- assert out.shape == (2, 1280, 7, 7)
diff --git a/tests/models/test_gpt2.py b/tests/models/test_gpt2.py
deleted file mode 100644
index 7bbb878e5e..0000000000
--- a/tests/models/test_gpt2.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-from torch.utils.data import DataLoader
-
-from composer.models.gpt2 import create_gpt2
-from composer.trainer import Trainer
-from tests.common.datasets import RandomTextLMDataset
-
-
-def test_gpt2_hf_factory(tiny_gpt2_config, tiny_gpt2_tokenizer, monkeypatch):
- transformers = pytest.importorskip('transformers')
- monkeypatch.setattr('transformers.AutoConfig.from_pretrained', lambda x: tiny_gpt2_config)
- gpt2_composer_model = create_gpt2(use_pretrained=False,
- pretrained_model_name='dummy',
- model_config=None,
- tokenizer_name=None,
- gradient_checkpointing=False)
-
- train_dataset = RandomTextLMDataset(size=8,
- vocab_size=tiny_gpt2_tokenizer.vocab_size,
- sequence_length=8,
- use_keys=True)
- collator = transformers.DataCollatorForLanguageModeling(tokenizer=tiny_gpt2_tokenizer, mlm=False)
- train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collator)
-
- trainer = Trainer(model=gpt2_composer_model, train_dataloader=train_dataloader, max_duration='1ep')
- trainer.fit()
-
- assert trainer.state.train_metrics['LanguagePerplexity'].compute() > 0.0
diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py
index 0f6076116f..e677941e9e 100644
--- a/tests/models/test_hf_model.py
+++ b/tests/models/test_hf_model.py
@@ -6,7 +6,7 @@
import tempfile
from contextlib import nullcontext
from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
from unittest.mock import patch
from urllib.parse import urlparse
@@ -26,9 +26,48 @@
from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset, RandomTextRegressionDataset
from tests.common.markers import device, world_size
from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model,
- configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer)
+ configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model,
+ configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer)
from tests.loggers.test_remote_uploader_downloader import DummyObjectStore
+if TYPE_CHECKING:
+ from peft import PeftConfig
+
+
+def _gpt2_peft_config():
+ pytest.importorskip('peft')
+ from peft import get_peft_config
+
+ peft_config = get_peft_config({
+ 'peft_type': 'LORA',
+ 'task_type': 'CAUSAL_LM',
+ 'target_modules': ['c_attn'],
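+        # GPT-2's c_attn is a transformers Conv1D (weights stored transposed),
+        # so LoRA needs fan_in_fan_out=True to apply the adapter correctly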
+ 'fan_in_fan_out': True,
+ })
+ return peft_config
+
+
+@pytest.fixture
+def gpt2_peft_config():
+ return _gpt2_peft_config()
+
+
+def _mistral_peft_config():
+ pytest.importorskip('peft')
+ from peft import get_peft_config
+
+ peft_config = get_peft_config({
+ 'peft_type': 'LORA',
+ 'task_type': 'CAUSAL_LM',
+ 'target_modules': ['up_proj'],
+ })
+ return peft_config
+
+
+@pytest.fixture
+def mistral_peft_config():
+ return _mistral_peft_config()
+
def test_hf_tokenizer_save(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer):
transformers = pytest.importorskip('transformers')
@@ -98,6 +137,7 @@ def test_hf_train_eval_predict(num_classes: int, tiny_bert_config):
trainer.eval()
# Check that there is some train/eval accuracy
+ assert trainer.state.train_metrics is not None
assert trainer.state.train_metrics['MulticlassAccuracy'].compute() != 0.0
assert trainer.state.eval_metrics['eval']['MulticlassAccuracy'].compute() != 0.0
@@ -153,6 +193,7 @@ def test_hf_train_eval_predict_regression(tiny_deberta_config):
trainer.eval()
# Check that there is some train/eval accuracy
+ assert trainer.state.train_metrics is not None
assert trainer.state.train_metrics['PearsonCorrCoef'].compute() != 0.0
assert trainer.state.eval_metrics['eval']['PearsonCorrCoef'].compute() != 0.0
@@ -431,14 +472,33 @@ def get_lm_trainer(hf_model,
device_train_microbatch_size: Optional[int] = None,
batch_size: int = 4,
sequence_length: int = 4,
- size: int = 4):
+ size: int = 4,
+ peft_config: Optional['PeftConfig'] = None,
+ should_save_peft_only: bool = False):
transformers = pytest.importorskip('transformers')
metrics: List[Metric] = [LanguageCrossEntropy(ignore_index=-100)]
if not is_conditional_generation:
metrics.append(MaskedAccuracy(ignore_index=-100))
- model = HuggingFaceModel(hf_model, tokenizer=hf_tokenizer, metrics=metrics, use_logits=True)
+ model = HuggingFaceModel(
+ hf_model,
+ tokenizer=hf_tokenizer,
+ metrics=metrics,
+ use_logits=True,
+ peft_config=peft_config,
+ should_save_peft_only=should_save_peft_only,
+ )
+
+    # On torch 2.0, FSDP-wrapped modules cannot have both frozen and unfrozen params.
+ # On 2.1+, if you have use_orig_params=True, they can. So we need a special case for the tests here.
+ if version.parse(torch.__version__) < version.parse('2.1.0') and peft_config is not None:
+ for name, module in model.named_modules():
+ if 'lora' in name.lower() and 'default' in name.lower():
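+                # Wrapping each LoRA module separately keeps its trainable params
+                # out of the flat parameter that holds the frozen base weights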
+ has_parameters = any(True for _ in module.parameters())
+ has_buffers = any(True for _ in module.buffers())
+ if has_parameters or has_buffers:
+ module._fsdp_wrap = True # type: ignore
vocab_size = hf_model.config.vocab_size
sequence_length = 4
@@ -475,8 +535,13 @@ def get_lm_trainer(hf_model,
collate_fn=collator,
sampler=dist.get_sampler(train_dataset))
+ from composer.optim import DecoupledAdamW
+
+ optimizer = DecoupledAdamW(model.parameters(), lr=1e-3)
+
in_memory_logger = InMemoryLogger()
trainer = Trainer(model=model,
+ optimizers=optimizer,
train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader,
max_duration='1ep',
@@ -865,8 +930,6 @@ def test_encoder_decoder(tiny_t5_model, tiny_t5_tokenizer):
@pytest.mark.gpu
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
@pytest.mark.filterwarnings('ignore::UserWarning')
def test_hf_fsdp(tiny_bert_config, tiny_bert_tokenizer):
transformers = pytest.importorskip('transformers')
@@ -909,6 +972,7 @@ def test_separate_eval_metrics(tiny_bert_model, tiny_bert_tokenizer):
@pytest.mark.parametrize('checkpoint_upload_folder', [None, 's3://checkpoints-bucket/'])
@pytest.mark.parametrize('local_save_filename', [None, 'local-checkpoint.pt'])
+@pytest.mark.filterwarnings('ignore:TypedStorage is deprecated.*:UserWarning')
def test_write_hf_from_composer(checkpoint_upload_folder, local_save_filename, tiny_bert_model, tiny_bert_tokenizer,
tmp_path):
transformers = pytest.importorskip('transformers')
@@ -943,6 +1007,7 @@ def test_write_hf_from_composer(checkpoint_upload_folder, local_save_filename, t
check_hf_model_equivalence(tiny_bert_model, loaded_hf_model)
+@pytest.mark.filterwarnings('ignore:TypedStorage is deprecated.*:UserWarning')
def test_write_hf_from_composer_direct(tiny_bert_tokenizer, tmp_path):
# tests that the logic to write out a huggingface checkpoint from a composer checkpoint
# still works when the huggingface model is instantiated directly rather than using from_pretrained
@@ -1028,9 +1093,6 @@ def test_embedding_resizing(tiny_bert_model, tiny_bert_tokenizer, embedding_resi
@pytest.mark.parametrize('hf_model,hf_tokenizer', [(configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer),
(configure_tiny_t5_model, configure_tiny_t5_tokenizer)])
def test_generate(device, world_size, hf_model, hf_tokenizer, use_fsdp):
- if use_fsdp and version.parse(torch.__version__) < version.parse('1.13.0'):
- pytest.skip('FSDP requires torch >= 1.13.0')
-
transformers = pytest.importorskip('transformers')
if device == 'cpu' and use_fsdp:
pytest.skip('FSDP is not supported on CPU.')
@@ -1074,12 +1136,10 @@ def test_generate(device, world_size, hf_model, hf_tokenizer, use_fsdp):
generation1 = model.generate(**input_dict, max_new_tokens=5, pad_token_id=hf_tokenizer.pad_token_id)
generation2 = model.generate(**input_dict, max_new_tokens=3, pad_token_id=hf_tokenizer.pad_token_id)
- assert generation1.shape == (2,
- (input_dict['input_ids'].shape[1] if not hf_model.config.is_encoder_decoder else 1) +
- 5)
- assert generation2.shape == (2,
- (input_dict['input_ids'].shape[1] if not hf_model.config.is_encoder_decoder else 1) +
- 3)
+ generation1_dim2 = (input_dict['input_ids'].shape[1] if not hf_model.config.is_encoder_decoder else 1) + 5
+ assert generation1.shape == (2, generation1_dim2) # pyright: ignore[reportGeneralTypeIssues]
+ generation2_dim2 = (input_dict['input_ids'].shape[1] if not hf_model.config.is_encoder_decoder else 1) + 3
+ assert generation2.shape == (2, generation2_dim2) # pyright: ignore[reportGeneralTypeIssues]
decoded_generation1 = hf_tokenizer.batch_decode(generation1, skip_special_tokens=True)
decoded_generation2 = hf_tokenizer.batch_decode(generation2, skip_special_tokens=True)
@@ -1095,8 +1155,6 @@ def test_generate(device, world_size, hf_model, hf_tokenizer, use_fsdp):
@pytest.mark.parametrize('hf_model,hf_tokenizer', [(configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer),
(configure_tiny_t5_model, configure_tiny_t5_tokenizer)])
def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_fsdp):
- if use_fsdp and version.parse(torch.__version__) < version.parse('1.13.0'):
- pytest.skip('FSDP requires torch >= 1.13.0')
transformers = pytest.importorskip('transformers')
if device == 'cpu' and use_fsdp:
pytest.skip('FSDP is not supported on CPU.')
@@ -1148,3 +1206,229 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f
assert len(generation1) == len(generation2) == 2
assert all(isinstance(decoded_generation, str) for decoded_generation in generation1)
assert all(isinstance(decoded_generation, str) for decoded_generation in generation2)
+
+
+@pytest.mark.parametrize('peft_type', ['LORA', 'loRa'])
+@pytest.mark.parametrize('task_type', ['CAUSAL_LM', 'causal_lm'])
+def test_peft_init(peft_type: str, task_type: str, tiny_gpt2_model, gpt2_peft_config):
+ pytest.importorskip('peft')
+ from peft import PeftModelForCausalLM
+
+ peft_config = copy.deepcopy(gpt2_peft_config)
+ peft_config.peft_type = peft_type
+ peft_config.task_type = task_type
+
+ original_model = copy.deepcopy(tiny_gpt2_model)
+
+ hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=peft_config)
+ assert isinstance(hf_model.model, PeftModelForCausalLM)
+ assert hf_model.model.peft_config['default'].peft_type == 'LORA'
+ assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM'
+ assert hf_model.model.config == original_model.config
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+def test_peft_init_errors(tiny_gpt2_model, gpt2_peft_config):
+ pytest.importorskip('peft')
+ peft_config = copy.deepcopy(gpt2_peft_config)
+ peft_config.peft_type = 'NOT_LORA'
+
+ with pytest.raises(ValueError):
+ _ = HuggingFaceModel(tiny_gpt2_model, peft_config=peft_config)
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config):
+ pytest.importorskip('peft')
+
+ with patch('composer.models.huggingface.peft_installed', False):
+ with pytest.raises(ImportError):
+ from composer.models import HuggingFaceModel
+ _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config)
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+@pytest.mark.parametrize('should_save_peft_only', [True, False])
+def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_save_peft_only):
+ pytest.importorskip('peft')
+
+ trainer = get_lm_trainer(
+ tiny_gpt2_model,
+ tiny_gpt2_tokenizer,
+ str(tmp_path),
+ peft_config=gpt2_peft_config,
+ device_train_microbatch_size=1,
+ mlm=False,
+ should_save_peft_only=should_save_peft_only,
+ )
+ trainer.fit()
+
+ load_trainer = get_lm_trainer(
+ tiny_gpt2_model,
+ tiny_gpt2_tokenizer,
+ str(tmp_path),
+ peft_config=gpt2_peft_config,
+ device_train_microbatch_size=1,
+ mlm=False,
+ load_path=str(tmp_path / 'hf-checkpoint.pt'),
+ should_save_peft_only=should_save_peft_only,
+ )
+
+ for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()):
+ torch.testing.assert_close(p1, p2)
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+@pytest.mark.parametrize('model,tokenizer,peft_config', [
+ (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()),
+ (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()),
+])
+def test_peft_generate(model, tokenizer, peft_config):
+ pytest.importorskip('peft')
+
+ model = model()
+ tokenizer = tokenizer()
+
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+ hf_model = HuggingFaceModel(model, tokenizer=tokenizer, peft_config=peft_config)
+
+ input_dict = tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True)
+ hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id)
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config):
+ pytest.importorskip('peft')
+
+ from peft import get_peft_config
+
+ hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config)
+ metadata = hf_model.get_metadata()
+ loaded_peft_config = get_peft_config(metadata['model']['peft_config']['content'])
+
+ assert loaded_peft_config == gpt2_peft_config
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+@pytest.mark.parametrize('should_save_peft_only', [True, False])
+def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path,
+ should_save_peft_only):
+ peft = pytest.importorskip('peft')
+ transformers = pytest.importorskip('transformers')
+
+ # Simulate a local model instead of a hub model
+ tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load')
+ tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load')
+
+ trainer = get_lm_trainer(
+ tiny_gpt2_model,
+ tiny_gpt2_tokenizer,
+ str(tmp_path),
+ peft_config=gpt2_peft_config,
+ device_train_microbatch_size=1,
+ mlm=False,
+ should_save_peft_only=should_save_peft_only,
+ )
+ trainer.fit()
+
+ from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint
+ write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'),
+ tmp_path / 'hf-save-pretrained')
+
+ # Test we can load back in using transformers interface
+ loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained'))
+ for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()):
+ torch.testing.assert_close(p1, p2)
+
+ # Test we can load back in using peft interface
+ loaded_peft_model = peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained'))
+ for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()):
+ torch.testing.assert_close(p1, p2)
+
+
+@pytest.mark.gpu
+@world_size(2)
+@pytest.mark.parametrize('should_save_peft_only', [True, False])
+def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size,
+ should_save_peft_only):
+ pytest.importorskip('peft')
+
+ fsdp_config = {
+ 'sharding_strategy': 'FULL_SHARD',
+ 'cpu_offload': False,
+ 'mixed_precision': 'PURE',
+ 'backward_prefetch': 'BACKWARD_PRE',
+ 'activation_checkpointing': False,
+ 'activation_cpu_offload': False,
+ 'verbose': False
+ }
+
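+    # Stash a pristine copy: building the first trainer injects LoRA layers
+    # into tiny_gpt2_model, and the reload trainer needs an unmodified base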
+ stashed_model = copy.deepcopy(tiny_gpt2_model)
+
+ trainer = get_lm_trainer(
+ tiny_gpt2_model,
+ tiny_gpt2_tokenizer,
+ str(tmp_path / 'trainer1'),
+ peft_config=gpt2_peft_config,
+ device_train_microbatch_size=1,
+ mlm=False,
+ fsdp_config=fsdp_config,
+ should_save_peft_only=should_save_peft_only,
+ )
+
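+    # With the peft config applied, only the LoRA adapter parameters should be trainable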
+ for n, p in trainer.state.model.model.named_parameters():
+ if 'lora' in n:
+ assert p.requires_grad
+ else:
+ assert not p.requires_grad
+
+ trainer.fit()
+ trainer.close()
+
+ load_trainer = get_lm_trainer(
+ stashed_model,
+ tiny_gpt2_tokenizer,
+ str(tmp_path / 'trainer2'),
+ peft_config=gpt2_peft_config,
+ device_train_microbatch_size=1,
+ mlm=False,
+ load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'),
+ fsdp_config=fsdp_config,
+ should_save_peft_only=should_save_peft_only,
+ )
+
+ for n, p in load_trainer.state.model.model.named_parameters():
+ if 'lora' in n:
+ assert p.requires_grad
+ else:
+ assert not p.requires_grad
+
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
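+    # Summon full params so the sharded weights of both trainers can be compared elementwise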
+ with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model):
+ for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()):
+ torch.testing.assert_close(p1, p2)
+
+ if dist.get_global_rank() == 0:
+ loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'))
+
+        # Check whether only the LoRA parameters were saved, depending on should_save_peft_only
+ if should_save_peft_only:
+ assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys())
+ else:
+ assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys())
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0'), reason='requires PyTorch 2+')
+def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path):
+ pytest.importorskip('peft')
+
+ hf_model = HuggingFaceModel(tiny_gpt2_model,
+ tokenizer=tiny_gpt2_tokenizer,
+ peft_config=gpt2_peft_config,
+ should_save_peft_only=True)
+ state_dict = hf_model.state_dict()
+
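+    # With should_save_peft_only=True, only the LoRA adapter tensors remain in the state dict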
+ assert len(state_dict.keys()) == 4
diff --git a/tests/models/test_mmdet_model.py b/tests/models/test_mmdet_model.py
deleted file mode 100644
index fafeeb1ac5..0000000000
--- a/tests/models/test_mmdet_model.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import numpy as np
-import pytest
-import torch
-
-
-@pytest.fixture
-def mmdet_detection_batch():
- batch_size = 2
- num_labels_per_image = 20
- image_size = 224
- return {
- 'img_metas': [{
- 'filename': '../../data/coco/train2017/fake_img.jpg',
- 'ori_filename': 'fake_image.jpg',
- 'img_shape': (image_size, image_size, 3),
- 'ori_shape': (image_size, image_size, 3),
- 'pad_shape': (image_size, image_size, 3),
- 'scale_factor': np.array([1., 1., 1., 1.], dtype=np.float32)
- }] * batch_size,
- 'img':
- torch.zeros(batch_size, 3, image_size, image_size, dtype=torch.float32),
- 'gt_bboxes': [torch.zeros(num_labels_per_image, 4, dtype=torch.float32)] * batch_size,
- 'gt_labels': [torch.zeros(num_labels_per_image, dtype=torch.int64)] * batch_size
- }
-
-
-@pytest.fixture
-def mmdet_detection_eval_batch():
- # Eval settings for mmdetection datasets have an extra list around inputs.
- batch_size = 2
- num_labels_per_image = 20
- image_size = 224
- return {
- 'img_metas': [[{
- 'filename': '../../data/coco/train2017/fake_img.jpg',
- 'ori_filename': 'fake_image.jpg',
- 'img_shape': (image_size, image_size, 3),
- 'ori_shape': (image_size, image_size, 3),
- 'pad_shape': (image_size, image_size, 3),
- 'scale_factor': np.array([1., 1., 1., 1.], dtype=np.float32),
- }] * batch_size],
- 'img': [torch.zeros(batch_size, 3, image_size, image_size, dtype=torch.float32)],
- 'gt_bboxes': [[torch.zeros(num_labels_per_image, 4, dtype=torch.float32)] * batch_size],
- 'gt_labels': [[torch.zeros(num_labels_per_image, dtype=torch.int64)] * batch_size]
- }
-
-
-@pytest.fixture
-def yolox_config():
- # from https://github.com/open-mmlab/mmdetection/blob/master/configs/yolox/yolox_s_8x8_300e_coco.py
- return dict(
- type='YOLOX',
- input_size=(640, 640),
- random_size_range=(15, 25),
- random_size_interval=10,
- backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5),
- neck=dict(type='YOLOXPAFPN', in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1),
- bbox_head=dict(type='YOLOXHead', num_classes=80, in_channels=128, feat_channels=128),
- train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
- # In order to align the source code, the threshold of the val phase is
- # 0.01, and the threshold of the test phase is 0.001.
- test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
-
-
-@pytest.fixture
-def faster_rcnn_config():
- # modified from https://github.com/open-mmlab/mmdetection/blob/master/configs/_base_/models/faster_rcnn_r50_fpn.py
- return dict(
- type='FasterRCNN',
- backbone=dict(type='ResNet',
- depth=50,
- num_stages=4,
- out_indices=(0, 1, 2, 3),
- frozen_stages=1,
- norm_cfg=dict(type='BN', requires_grad=True),
- norm_eval=True,
- style='pytorch'),
- neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5),
- rpn_head=dict(type='RPNHead',
- in_channels=256,
- feat_channels=256,
- anchor_generator=dict(type='AnchorGenerator',
- scales=[8],
- ratios=[0.5, 1.0, 2.0],
- strides=[4, 8, 16, 32, 64]),
- bbox_coder=dict(type='DeltaXYWHBBoxCoder',
- target_means=[.0, .0, .0, .0],
- target_stds=[1.0, 1.0, 1.0, 1.0]),
- loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
- loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
- roi_head=dict(type='StandardRoIHead',
- bbox_roi_extractor=dict(type='SingleRoIExtractor',
- roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
- out_channels=256,
- featmap_strides=[4, 8, 16, 32]),
- bbox_head=dict(type='Shared2FCBBoxHead',
- in_channels=256,
- fc_out_channels=1024,
- roi_feat_size=7,
- num_classes=80,
- bbox_coder=dict(type='DeltaXYWHBBoxCoder',
- target_means=[0., 0., 0., 0.],
- target_stds=[0.1, 0.1, 0.2, 0.2]),
- reg_class_agnostic=False,
- loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
- loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
- # model training and testing settings
- train_cfg=dict(rpn=dict(assigner=dict(type='MaxIoUAssigner',
- pos_iou_thr=0.7,
- neg_iou_thr=0.3,
- min_pos_iou=0.3,
- match_low_quality=True,
- ignore_iof_thr=-1),
- sampler=dict(type='RandomSampler',
- num=256,
- pos_fraction=0.5,
- neg_pos_ub=-1,
- add_gt_as_proposals=False),
- allowed_border=-1,
- pos_weight=-1,
- debug=False),
- rpn_proposal=dict(nms_pre=2000,
- max_per_img=1000,
- nms=dict(type='nms', iou_threshold=0.7),
- min_bbox_size=0),
- rcnn=dict(assigner=dict(type='MaxIoUAssigner',
- pos_iou_thr=0.5,
- neg_iou_thr=0.5,
- min_pos_iou=0.5,
- match_low_quality=False,
- ignore_iof_thr=-1),
- sampler=dict(type='RandomSampler',
- num=512,
- pos_fraction=0.25,
- neg_pos_ub=-1,
- add_gt_as_proposals=True),
- pos_weight=-1,
- debug=False)),
- test_cfg=dict(
- rpn=dict(nms_pre=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0),
- rcnn=dict(score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100)
- # soft-nms is also supported for rcnn testing
- # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
- ))
-
-
-def test_mmdet_model_forward_yolox(mmdet_detection_batch, yolox_config):
- pytest.importorskip('mmdet')
-
- from mmcv import ConfigDict
- from mmdet.models import build_detector
-
- from composer.models import MMDetModel
-
- config = ConfigDict(yolox_config)
- # non pretrained model to avoid a slow test that downloads the weights.
- model = build_detector(config)
- model.init_weights()
- model = MMDetModel(model=model)
- out = model(mmdet_detection_batch)
- assert list(out.keys()) == ['loss_cls', 'loss_bbox', 'loss_obj']
-
-
-def test_mmdet_model_eval_forward_yolox(mmdet_detection_eval_batch, yolox_config):
- pytest.importorskip('mmdet')
-
- from mmcv import ConfigDict
- from mmdet.models import build_detector
-
- from composer.models import MMDetModel
-
- config = ConfigDict(yolox_config)
- # non pretrained model to avoid a slow test that downloads the weights.
- model = build_detector(config)
- model.init_weights()
- model = MMDetModel(model=model)
- out = model.eval_forward(mmdet_detection_eval_batch)
- assert len(out) == mmdet_detection_eval_batch['img'][0].shape[0] # batch size
- assert list(out[0].keys()) == ['labels', 'boxes', 'scores']
-
-
-def test_mmdet_model_forward_faster_rcnn(mmdet_detection_batch, faster_rcnn_config):
- pytest.importorskip('mmdet')
-
- from mmcv import ConfigDict
- from mmdet.models import build_detector
-
- from composer.models import MMDetModel
-
- config = ConfigDict(faster_rcnn_config)
-
- # non pretrained model to avoid a slow test that downloads the weights.
- model = build_detector(config)
- model.init_weights()
- model = MMDetModel(model=model)
- out = model(mmdet_detection_batch)
- assert list(out.keys()) == ['loss_rpn_cls', 'loss_rpn_bbox', 'loss_cls', 'acc', 'loss_bbox']
diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py
index 2ae9383d79..f13be17486 100644
--- a/tests/profiler/test_profiler.py
+++ b/tests/profiler/test_profiler.py
@@ -9,8 +9,10 @@
import pytest
import torch
from packaging import version
+from torch.profiler.profiler import ProfilerAction as TorchProfilerAction
-from composer.core import State
+from composer.core import Engine, Event, State, Timestamp
+from composer.loggers import Logger
from composer.profiler import Profiler, ProfilerAction, SystemProfiler, TorchProfiler, cyclic_schedule
from composer.profiler.utils import export_memory_timeline_html
@@ -170,3 +172,39 @@ def test_memory_timeline(tmp_path: pathlib.Path) -> None:
assert fig is not None, 'export_memory_timeline_html should return a figure when return_fig=True'
_, end = fig.gca().get_ylim()
assert round(end, 2) == 0.06
+
+
+def test_skip_first_after_resumption(minimal_state: State) -> None:
+ skip_first = 1
+ wait = 2
+ warmup = 3
+ active = 4
+ repeat = 1
+ schedule = cyclic_schedule(skip_first=skip_first, wait=wait, warmup=warmup, active=active, repeat=repeat)
+ mock_trace_handler = MagicMock()
+ profiler = Profiler(
+ trace_handlers=[mock_trace_handler],
+ schedule=schedule,
+ )
+ profiler.bind_to_state(minimal_state)
+ minimal_state.profiler = profiler
+
+ assert len(profiler._callbacks) >= 1
+ assert isinstance(profiler._callbacks[-1], TorchProfiler)
+ torch_profiler = profiler._callbacks[-1]
+
+ # Create torch.profiler.profile
+ logger = Logger(minimal_state)
+ engine = Engine(state=minimal_state, logger=logger)
+ engine.run_event(Event.INIT)
+ assert torch_profiler.profiler is not None
+
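+    # Batch 7 falls in the active phase of the first cycle (skip_first=1 + wait=2 + warmup=3 = 6), so the profiler records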
+ minimal_state.timestamp = Timestamp(batch_in_epoch=7)
+ assert torch_profiler.profiler.schedule(0) == TorchProfilerAction.RECORD
+
+ # Load checkpoint at batch 4
+ minimal_state.timestamp = Timestamp(batch_in_epoch=4)
+ engine.run_event(Event.BEFORE_LOAD)
+ engine.run_event(Event.AFTER_LOAD)
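+    # After resumption the skip_first offset shifts by the resumed batch, so batch 7 now lands in the warmup phase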
+ minimal_state.timestamp = Timestamp(batch_in_epoch=7)
+ assert torch_profiler.profiler.schedule(0) == TorchProfilerAction.WARMUP
diff --git a/tests/test_events.py b/tests/test_events.py
index 63bff245ba..c81feea0b0 100644
--- a/tests/test_events.py
+++ b/tests/test_events.py
@@ -5,7 +5,6 @@
import pytest
import torch
-from packaging import version
from torch.utils.data import DataLoader
from composer import Trainer
@@ -89,8 +88,6 @@ def get_trainer(self, precision='fp32', **kwargs):
id='gpu-fsdp',
marks=[
pytest.mark.gpu,
- pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher'),
pytest.mark.filterwarnings('ignore::UserWarning'),
]),
])
@@ -153,6 +150,7 @@ def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, nu
expected_num_calls = {
Event.INIT: 1,
+ Event.BEFORE_LOAD: 1,
Event.AFTER_LOAD: 1,
Event.EPOCH_START: num_epochs,
Event.BATCH_START: total_steps,
diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py
index 3d31d36406..11ad2240d3 100644
--- a/tests/test_notebooks.py
+++ b/tests/test_notebooks.py
@@ -6,6 +6,7 @@
import os
from urllib.parse import urlparse
+import importlib_metadata
import pytest
import testbook
from testbook.client import TestbookNotebookClient
@@ -21,6 +22,16 @@
for nb in glob.glob(os.path.join(nb_root, '*.ipynb')) \
]
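+# Determine whether the library is installed under the 'mosaicml' or 'composer' package name,
+# so the notebooks' pip install commands can be rewritten accordingly.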
+try:
+ importlib_metadata.files('mosaicml')
+ package_name = 'mosaicml'
+except importlib_metadata.PackageNotFoundError:
+ try:
+ importlib_metadata.files('composer')
+ package_name = 'composer'
+ except importlib_metadata.PackageNotFoundError:
+ raise RuntimeError('Could not find the package under mosaicml or composer.')
+
def patch_notebooks():
import itertools
@@ -80,11 +91,15 @@ def modify_cell_source(tb: TestbookNotebookClient, notebook_name: str, cell_sour
cell_source = cell_source.replace('batch_size = 1024', 'batch_size = 64')
cell_source = cell_source.replace('download=True', 'download=False')
if notebook_name == 'auto_microbatching':
+ cell_source = cell_source.replace('batch_size = 2048', 'batch_size = 1024')
cell_source = cell_source.replace('download=True', 'download=False')
if notebook_name == 'migrate_from_ptl':
cell_source = cell_source.replace('batch_size=256', 'batch_size=64')
cell_source = cell_source.replace('download=True', 'download=False')
+ cell_source = cell_source.replace("pip install 'mosaicml", f"pip install '{package_name}")
+ cell_source = cell_source.replace('pip install mosaicml', f'pip install {package_name}')
+
return cell_source
@@ -122,7 +137,7 @@ def test_notebook(notebook: str, device: str, s3_bucket: str):
obj = urlparse('s3://mosaicml-internal-integration-testing/read_only/CIFAR-10/')
s3 = boto3.resource('s3')
- bucket = s3.Bucket(obj.netloc)
+ bucket = s3.Bucket(obj.netloc) # pyright: ignore[reportGeneralTypeIssues]
files = bucket.objects.filter(Prefix=obj.path.lstrip('/'))
for file in files:
target = os.path.join(os.getcwd(), 'data', os.path.relpath(file.key, obj.path.lstrip('/')))
diff --git a/tests/test_precision.py b/tests/test_precision.py
index 46571529c6..2b85d3d7d2 100644
--- a/tests/test_precision.py
+++ b/tests/test_precision.py
@@ -9,8 +9,7 @@
from composer import Trainer
from composer.core import Precision, get_precision_context
-from composer.models import composer_resnet_cifar
-from tests.common import RandomImageDataset
+from tests.common import RandomImageDataset, composer_resnet
try:
import transformer_engine.pytorch as te
@@ -22,7 +21,7 @@
def get_trainer(precision: Precision, precision_config: Optional[Dict[str, Any]] = None) -> Trainer:
return Trainer(
- model=composer_resnet_cifar('resnet_9'),
+ model=composer_resnet('resnet18'),
train_dataloader=DataLoader(
dataset=RandomImageDataset(size=1024),
batch_size=512,
@@ -78,7 +77,7 @@ def predict_and_measure_memory(precision) -> int:
def test_train_precision_memory(precision: Precision):
memory_fp32 = fit_and_measure_memory(Precision.FP32)
memory_half = fit_and_measure_memory(precision)
- assert memory_half < 0.7 * memory_fp32
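+    # Threshold loosened after the test model switched from resnet_9 (CIFAR) to resnet18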
+ assert memory_half < 0.85 * memory_fp32
@pytest.mark.gpu
diff --git a/tests/test_simple_nlp.py b/tests/test_simple_nlp.py
index b200e7cfa5..6b53b16125 100644
--- a/tests/test_simple_nlp.py
+++ b/tests/test_simple_nlp.py
@@ -47,6 +47,7 @@ def test_simple_nlp_classification():
trainer.eval()
# Check that there is some train/eval accuracy
+ assert trainer.state.train_metrics is not None
assert trainer.state.train_metrics['MulticlassAccuracy'].compute() != 0.0
assert trainer.state.eval_metrics['eval']['MulticlassAccuracy'].compute() != 0.0
@@ -100,6 +101,7 @@ def test_simple_nlp_mlm(tiny_bert_tokenizer, tiny_bert_model):
trainer.eval()
# Check that there is some train/eval cross entropy
+ assert trainer.state.train_metrics is not None
assert trainer.state.train_metrics['LanguageCrossEntropy'].compute() != 0.0
assert trainer.state.eval_metrics['eval']['LanguageCrossEntropy'].compute() != 0.0
diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py
index 883fb04fb5..77b580b476 100644
--- a/tests/trainer/test_checkpoint.py
+++ b/tests/trainer/test_checkpoint.py
@@ -65,6 +65,7 @@ def _load_checkpoint(filename: Union[str, pathlib.Path]):
def _assert_checkpoints_equivalent(file1, file2, atol=0.0, rtol=0.0):
+ # TODO: consider merging with _assert_checkpoints_equal
checkpoint_1 = _load_checkpoint(file1)
checkpoint_2 = _load_checkpoint(file2)
@@ -85,6 +86,10 @@ def _assert_checkpoints_equivalent(file1, file2, atol=0.0, rtol=0.0):
if 'DummyStatefulCallback' in ckpt['state']['callbacks']:
del ckpt['state']['callbacks']['DummyStatefulCallback']
+    # Remove 'all_saved_checkpoints_to_timestamp' from the CheckpointSaver state, since it accumulates across runs
+ del checkpoint_1['state']['callbacks']['CheckpointSaver']['all_saved_checkpoints_to_timestamp']
+ del checkpoint_2['state']['callbacks']['CheckpointSaver']['all_saved_checkpoints_to_timestamp']
+
deep_compare(checkpoint_1, checkpoint_2, atol=atol, rtol=rtol)
# deepspeed checkpoints do not have model or optimizer
@@ -280,6 +285,7 @@ def test_checkpoint_saver_properly_constructed(self, save_folder: str, expected_
'weights_only': False,
'save_interval': '1ep',
'num_checkpoints_to_keep': -1,
+ 'ignore_keys': None,
}
expected_folder = expected_path.rstrip('/') if expected_path != '' else '.'
mock_checkpoint_saver.assert_called_once_with(folder=expected_folder, **rest_of_checkpoint_saver_kwargs)
@@ -689,7 +695,11 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool):
last_checkpoint = os.path.join('first', 'ep2.pt')
if missing_key or unexpected_key:
- error_context = pytest.raises(RuntimeError, match='Failed to load checkpoint due to')
+ message = r'Error\(s\) in loading state_dict'
+ if version.parse(torch.__version__) < version.parse('2.2.9'):
+                # Composer implements its own strict checkpoint loading for older torch versions
+ message = 'Failed to load checkpoint due to'
+ error_context = pytest.raises(RuntimeError, match=message)
else:
error_context = contextlib.nullcontext()
@@ -741,6 +751,7 @@ def test_load_weights(self, device, load_weights_only, save_metrics):
assert metrics_equal
@pytest.mark.parametrize('load_ignore_keys,weights_equal,callbacks_equal,rng_equal', [
+ ['*', False, False, False],
['state/model/*', False, True, True],
['state/callbacks/*', True, False, True],
['rng', True, True, False],
@@ -780,6 +791,44 @@ def test_load_ignore_keys(self, load_ignore_keys, weights_equal, callbacks_equal
assert trainer_1_rng_state is not None
deep_compare(trainer_1_rng_state, trainer_2._rng_state)
+ @pytest.mark.parametrize('save_ignore_keys,weights_equal,callbacks_equal,rng_equal', [
+ ['*', False, False, False],
+ ['state/model/*', False, True, True],
+ ['state/callbacks/*', True, False, True],
+ ['rng', True, True, False],
+ ])
+ @pytest.mark.filterwarnings('ignore:.* is not in the state_dict.*:UserWarning')
+ def test_save_ignore_keys(self, save_ignore_keys, weights_equal, callbacks_equal, rng_equal):
+
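+        # Train once while dropping any keys matching save_ignore_keys from the saved checkpoint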
+ trainer_1 = self.get_trainer(save_folder='first', save_ignore_keys=[save_ignore_keys])
+ trainer_1.fit()
+ trainer_1_rng_state = reproducibility.get_rng_state()
+ trainer_1.close()
+
+ last_checkpoint = os.path.join('first', 'ep2.pt')
+ trainer_2 = self.get_trainer(load_path=last_checkpoint)
+
+ # Check weights loaded properly
+ with contextlib.nullcontext() if weights_equal else pytest.raises(AssertionError):
+ self._assert_weights_equivalent(
+ trainer_1.state.model,
+ trainer_2.state.model,
+ )
+
+ # Check callbacks state
+ stateful_callbacks_equal = self._stateful_callbacks_equal(
+ trainer_1.state.callbacks,
+ trainer_2.state.callbacks,
+ )
+ if callbacks_equal:
+ assert stateful_callbacks_equal
+ else:
+ assert not stateful_callbacks_equal
+
+ if rng_equal:
+ assert trainer_1_rng_state is not None
+ deep_compare(trainer_1_rng_state, trainer_2._rng_state)
+
@pytest.mark.remote
@device('cpu')
@pytest.mark.parametrize('load_weights_only', [True, False])
@@ -790,8 +839,6 @@ def test_load_ignore_keys(self, load_ignore_keys, weights_equal, callbacks_equal
],
)
@pytest.mark.filterwarnings('ignore:.*The checkpoint included CUDA RNG state.*')
- @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
def test_load_remote_checkpoint(self, device, tmp_path: pathlib.Path, load_weights_only, remote_checkpoint_uri,
remote_checkpoint_name, continue_training_dur, final_checkpoint_name, s3_bucket,
s3_read_only_prefix):
@@ -972,8 +1019,10 @@ def test_autoload_algorithm_old_checkpoint(self):
old_init, old_repr = NoOpModel.__init__, NoOpModel.__repr__
NoOpModel.__init__ = lambda self, x: None # type: ignore
NoOpModel.__repr__ = lambda self: 'NoOpModel(3)'
- with pytest.warns(UserWarning, match='required_on_load algorithm.*'), pytest.raises(
- ValueError, match='loaded state dict contains a parameter group.*'):
+ error_context = pytest.raises(KeyError, match='module.0.weight')
+ if version.parse(torch.__version__) < version.parse('2.2.9'):
+ error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*')
+ with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context:
trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt'),)
trainer_3.fit(duration='1ba')
# Restore algorithm
@@ -1247,6 +1296,36 @@ def test_spin_dataloaders(
save_folder / 'second' / 'latest-rank{rank}.pt',
)
+ def test_format_load_path(self, tmp_path: pathlib.Path):
+ run_name = 'a-quick-rabbit'
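+        # save_folder (and thus the resume path) embeds a '{run_name}' placeholder the trainer must format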
+ save_folder = os.path.join(tmp_path, '{run_name}')
+ trainer = self.get_trainer(
+ run_name=run_name,
+ save_folder=os.path.join(save_folder, 'first'),
+ save_filename='ep{epoch}-rank{rank}.pt',
+ save_interval='1ep',
+ )
+
+ trainer.fit()
+ trainer.close()
+
+ resume_file = os.path.join(save_folder, 'first', 'ep1-rank0.pt')
+ trainer = self.get_trainer(
+ run_name=run_name,
+ save_folder=os.path.join(save_folder, 'second'),
+ save_filename='ep{epoch}-rank{rank}.pt',
+ save_interval='1ep',
+ load_path=resume_file, # <-- resume training from file
+ )
+ trainer.fit()
+ trainer.close()
+
+ save_folder = save_folder.replace('{run_name}', run_name)
+ _assert_checkpoints_equivalent(
+ os.path.join(save_folder, 'first', 'latest-rank{rank}.pt'),
+ os.path.join(save_folder, 'second', 'latest-rank{rank}.pt'),
+ )
+
def _assert_expected_num_checkpoints(
self,
save_folder: str,
@@ -1306,6 +1385,7 @@ def test_rotate_checkpoints(
dataset=train_dataset,
sampler=dist.get_sampler(train_dataset),
),
+ precision='fp32',
save_folder=str(save_folder),
save_filename='checkpoint_{rank}_{batch}.pt',
save_interval='1ba',
diff --git a/tests/trainer/test_ddp.py b/tests/trainer/test_ddp.py
index f34ba3862d..d9733c4285 100644
--- a/tests/trainer/test_ddp.py
+++ b/tests/trainer/test_ddp.py
@@ -7,17 +7,15 @@
import pytest
import torch
import torch.distributed
-from packaging import version
from torch.utils.data import DataLoader
import composer.core.types as types
from composer import Callback, Event
from composer.core import State
-from composer.datasets.synthetic import SyntheticBatchPairDataset
from composer.loggers import Logger
from composer.trainer.trainer import Trainer
from composer.utils import dist
-from tests.common import SimpleModel
+from tests.common import RandomClassificationDataset, SimpleModel
def get_file_path(*, is_train: bool, tmp_path: pathlib.Path) -> str:
@@ -41,8 +39,8 @@ class TrackedDataset(types.Dataset):
atomic file writes, it is slow and should not be used in any performance measurements.
"""
- def __init__(self, is_train: bool, synthetic_dataset: SyntheticBatchPairDataset, tmp_path: pathlib.Path):
- self.dataset = synthetic_dataset
+ def __init__(self, is_train: bool, dataset, tmp_path: pathlib.Path):
+ self.dataset = dataset
self.is_train = is_train
self.tmp_path = tmp_path
self.counter = 0
@@ -88,16 +86,11 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
pytest.param('gpu', False, False, id='gpu', marks=pytest.mark.gpu),
# TODO: Remove filterwarnings after FSDP removes deprecated code
pytest.param('gpu', True, False, id='deepspeed', marks=pytest.mark.gpu),
- pytest.param('gpu',
- False,
- True,
- id='fsdp',
- marks=[
- pytest.mark.gpu,
- pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher'),
- pytest.mark.filterwarnings('ignore::UserWarning'),
- ]),
+ pytest.param(
+ 'gpu', False, True, id='fsdp', marks=[
+ pytest.mark.gpu,
+ pytest.mark.filterwarnings('ignore::UserWarning'),
+ ]),
])
@pytest.mark.parametrize('world_size', [
pytest.param(1),
@@ -116,19 +109,11 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path
and 2) each ddp process is indeed getting different data.
"""
- model = SimpleModel(num_classes=100)
-
train_batch_size = 10
train_subset_num_batches = 3
- synthetic_dataset = SyntheticBatchPairDataset(
- num_unique_samples_to_create=train_batch_size * train_subset_num_batches,
- total_dataset_size=10_000,
- data_shape=(model.num_features, 5, 5),
- num_classes=model.num_classes,
- )
train_dataset = TrackedDataset(
- synthetic_dataset=synthetic_dataset,
+ dataset=RandomClassificationDataset(size=train_batch_size * train_subset_num_batches,),
is_train=True,
tmp_path=tmp_path,
)
@@ -150,14 +135,8 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path
eval_batch_size = 10
eval_subset_num_batches = 3
- eval_dataset = SyntheticBatchPairDataset(
- num_unique_samples_to_create=eval_batch_size * eval_subset_num_batches,
- total_dataset_size=10_000,
- data_shape=(model.num_features, 5, 5),
- num_classes=model.num_classes,
- )
eval_dataset = TrackedDataset(
- synthetic_dataset=eval_dataset,
+ dataset=RandomClassificationDataset(size=eval_batch_size * eval_subset_num_batches,),
is_train=False,
tmp_path=tmp_path,
)
@@ -185,17 +164,19 @@ def test_ddp(device: str, world_size: int, deepspeed: bool, fsdp: bool, tmp_path
}
max_epochs = 2
- trainer = Trainer(model=model,
- train_dataloader=train_dataloader,
- eval_dataloader=eval_dataloader,
- device=device,
- max_duration=f'{max_epochs}ep',
- eval_interval='1ep',
- eval_subset_num_batches=eval_subset_num_batches,
- train_subset_num_batches=train_subset_num_batches,
- deepspeed_config={} if deepspeed else None,
- fsdp_config=fsdp_config,
- callbacks=[CheckBatch0(tmp_path)])
+ trainer = Trainer(
+ model=SimpleModel(num_classes=100),
+ train_dataloader=train_dataloader,
+ eval_dataloader=eval_dataloader,
+ device=device,
+ max_duration=f'{max_epochs}ep',
+ eval_interval='1ep',
+ eval_subset_num_batches=eval_subset_num_batches,
+ train_subset_num_batches=train_subset_num_batches,
+ deepspeed_config={} if deepspeed else None,
+ fsdp_config=fsdp_config,
+ callbacks=[CheckBatch0(tmp_path)],
+ )
trainer.fit()
diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py
index 95aaf31e97..c6f5258c49 100644
--- a/tests/trainer/test_fsdp.py
+++ b/tests/trainer/test_fsdp.py
@@ -26,8 +26,6 @@
@world_size(2)
@pytest.mark.gpu
@pytest.mark.filterwarnings('ignore:The passed in model appears to have tied weights.*:UserWarning')
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='FSDP requires PyTorch 1.13 or higher')
def test_fsdp_device_initialization(model: ComposerClassifier, mixed_precision: str, reentrant: bool, world_size: int,
device: str):
"""test FSDP device initialization for a simple model with weight tying and a model where two modules
@@ -99,11 +97,11 @@ def test_fsdp_inits_params_once(model: ComposerClassifier, device: str, world_si
def dummy_param_init_fn(module: torch.nn.Module):
if isinstance(module, torch.nn.Linear):
torch.nn.init.ones_(module.weight)
- if module.bias is not None:
+ if module.bias is not None: # pyright: ignore[reportUnnecessaryComparison]
torch.nn.init.constant_(module.bias, 2)
# Override the param_init_fn to be deterministic so we can test the init
- model.module.param_init_fn = dummy_param_init_fn
+ model.module.param_init_fn = dummy_param_init_fn # pyright: ignore[reportGeneralTypeIssues]
# Apply the initial initialization, because it will only be called later for parameters on meta device
model.apply(model.module.param_init_fn)
# Now wrap the param_init_fn with a MagicMock so we can count calls
@@ -136,7 +134,7 @@ def dummy_param_init_fn(module: torch.nn.Module):
for module in model.modules():
if isinstance(module, torch.nn.Linear):
assert torch.all(module.weight == 1)
- if module.bias is not None:
+ if module.bias is not None: # pyright: ignore[reportUnnecessaryComparison]
assert torch.all(module.bias == 2)
@@ -144,8 +142,6 @@ def dummy_param_init_fn(module: torch.nn.Module):
@pytest.mark.parametrize('mixed_precision', _MIXED_PRECISION_TYPES)
@pytest.mark.gpu
@world_size(2)
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='FSDP requires PyTorch 1.13 or higher')
def test_fsdp_meta_initialization_none(model: ComposerClassifier, mixed_precision: 'str', world_size: int):
"""
This test is intended to test FSDP for meta initialization when there are attributes
@@ -173,12 +169,10 @@ def test_fsdp_meta_initialization_none(model: ComposerClassifier, mixed_precisio
@pytest.mark.parametrize('backward_prefetch_limit', [1, 2])
@pytest.mark.gpu
@world_size(2)
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='FSDP requires PyTorch 1.13 or higher')
def test_fsdp_prefetch_limit(forward_prefetch_limit: int, backward_prefetch_limit: int, world_size: int):
model = SimpleModel()
- model.fc1._fsdp_wrap = True
- model.fc2._fsdp_wrap = True
+ model.fc1._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
+ model.fc2._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
dataset = RandomClassificationDataset(size=10)
dataloader = DataLoader(dataset, sampler=dist.get_sampler(dataset))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
@@ -199,14 +193,12 @@ def test_fsdp_prefetch_limit(forward_prefetch_limit: int, backward_prefetch_limi
@pytest.mark.gpu
@world_size(2)
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='FSDP requires PyTorch 1.13 or higher')
@pytest.mark.filterwarnings('ignore:Instantiating FSDP with custom process groups.*:UserWarning')
@pytest.mark.filterwarnings('ignore:Composer is instantiating custom process groups.*:UserWarning')
def test_fsdp_process_group(world_size: int):
model = SimpleModel()
- model.fc1._fsdp_wrap = True
- model.fc2._fsdp_wrap = True
+ model.fc1._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
+ model.fc2._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
dataset = RandomClassificationDataset(size=10)
dataloader = DataLoader(dataset, sampler=dist.get_sampler(dataset))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
@@ -258,7 +250,7 @@ def test_fsdp_act_ckpt_offload(
'activation_cpu_offload': activation_cpu_offload,
}
- model.fc1._activation_checkpointing = True
+ model.fc1._activation_checkpointing = True # pyright: ignore[reportGeneralTypeIssues]
trainer = Trainer(
model=model,
diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py
index d9b7c5b5ee..bda2a36187 100644
--- a/tests/trainer/test_fsdp_checkpoint.py
+++ b/tests/trainer/test_fsdp_checkpoint.py
@@ -11,7 +11,7 @@
import uuid
from contextlib import nullcontext as does_not_raise
from functools import partial
-from typing import Any, Callable, Optional, Sequence
+from typing import Any, Callable, Optional, Sequence, Union
from unittest.mock import patch
import numpy as np
@@ -28,10 +28,9 @@
from composer.models import ComposerClassifier
from composer.optim import DecoupledAdamW
from composer.trainer import Trainer
-from composer.utils import dist
+from composer.utils import dist, parse_uri
from composer.utils.checkpoint import is_checkpoint_legacy_sharded
from composer.utils.file_helpers import get_file
-from composer.utils.misc import using_torch_2
from composer.utils.object_store import S3ObjectStore
from composer.utils.reproducibility import get_rng_state
from tests.common import RandomClassificationDataset, deep_compare
@@ -58,9 +57,9 @@ def __init__(
for module in net:
if isinstance(module, torch.nn.Linear):
- module._fsdp_wrap = True
+ module._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues]
- net.param_init_fn = self.param_init_fn
+ net.param_init_fn = self.param_init_fn # pyright: ignore[reportGeneralTypeIssues]
super().__init__(
module=net,
num_classes=num_classes,
@@ -73,7 +72,7 @@ def param_init_fn(self, module):
if isinstance(module, torch.nn.Linear):
init_fn(module.weight)
- if module.bias is not None:
+ if module.bias is not None: # pyright: ignore[reportUnnecessaryComparison]
torch.nn.init.zeros_(module.bias)
@@ -238,7 +237,8 @@ def _compare_rng_states_between_trainers(rng_state1, rng_state2):
if 'cuda' in rank_state1_keys:
cuda_state1 = rank_state1['cuda']
cuda_state2 = rank_state2['cuda']
- torch.equal(cuda_state1, cuda_state2), f'Cuda rng state not the same between state_dicts for rank {rank}'
+ states_equal = torch.equal(cuda_state1, cuda_state2)
+ assert states_equal, f'Cuda rng state not the same between state_dicts for rank {rank}'
def _compare_metrics_between_state_dicts(state_dict1: dict[str, Any], state_dict2: dict[str, Any]):
@@ -274,8 +274,6 @@ def _compare_timestamps_between_state_dicts(state_dict1, state_dict2):
@pytest.mark.parametrize('autoresume', [True, False])
@pytest.mark.parametrize('precision', ['amp_bf16', 'amp_fp16'])
@pytest.mark.parametrize('load_fsdp_monolith_rank0_only', [True, False])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
def test_fsdp_full_state_dict_load(
world_size,
tmp_path: pathlib.Path,
@@ -341,8 +339,6 @@ def test_fsdp_full_state_dict_load(
@pytest.mark.gpu
@world_size(2)
@pytest.mark.parametrize('sync_module_states', [True, False])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
def test_fsdp_mixed_with_sync(
world_size,
tmp_path: pathlib.Path,
@@ -365,7 +361,7 @@ def test_fsdp_mixed_with_sync(
@world_size(2)
@pytest.mark.parametrize('precision', ['amp_bf16', 'amp_fp16'])
@pytest.mark.parametrize('sharding_strategy', ['FULL_SHARD', 'SHARD_GRAD_OP'])
-@pytest.mark.parametrize('state_dict_type', ['full', 'sharded', 'local'])
+@pytest.mark.parametrize('state_dict_type', ['full', 'sharded'])
@pytest.mark.parametrize('composer_version', [
pytest.param(
'0.13.5',
@@ -400,12 +396,11 @@ def test_fsdp_mixed_with_sync(
'0.17.0',
marks=pytest.mark.filterwarnings((r'ignore:MosaicMLLogger is not in the state_dict. Its '
r'state will not be restored.:UserWarning')),
- )
+ ),
+ '0.18.1',
])
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
@pytest.mark.filterwarnings(r'ignore:.*The CUDA RNG state could not be loaded.*:UserWarning')
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
def test_fsdp_load_old_checkpoint(
world_size,
tmp_path: pathlib.Path,
@@ -416,15 +411,8 @@ def test_fsdp_load_old_checkpoint(
s3_read_only_prefix: str,
composer_version: str,
):
-
- if (version.parse(torch.__version__) >= version.parse('1.13.0') and
- composer_version not in ['0.13.5', '0.14.0', '0.14.1']):
- pytest.skip(('Composer 0.15.1 and above checkpoints were saved with '
- 'torch 2 and as a result are not compatible with torch 1.13.'))
- if (version.parse(torch.__version__) >= version.parse('2.0.0') and state_dict_type == 'local'):
- pytest.xfail(('Loading a torch 1.13 checkpoint with torch 2.0 for '
- 'state_dict_type local is not backwards compatible. See '
- 'https://github.com/pytorch/pytorch/issues/102667 for more info'))
+    if (composer_version == '0.18.1' and state_dict_type == 'full' and precision == 'amp_bf16' and
+            sharding_strategy == 'FULL_SHARD'):
+ pytest.skip('TODO: This checkpoint is missing')
if composer_version in ['0.13.5', '0.14.0', '0.14.1', '0.15.1']:
rank = 0 if state_dict_type == 'full' else '{rank}'
@@ -436,7 +424,6 @@ def test_fsdp_load_old_checkpoint(
load_path_dir = (load_path_dir + 'ep0-ba2/')
load_path = load_path_dir + f'ba2_rank{rank}.pt'
-
assert is_checkpoint_legacy_sharded(
object_store=S3ObjectStore(bucket=f'{s3_bucket}'),
source_path=load_path.lstrip(f's3://{s3_bucket}/'),
@@ -445,6 +432,10 @@ def test_fsdp_load_old_checkpoint(
load_path = (f's3://{s3_bucket}/{s3_read_only_prefix}/backwards_compatibility/'
f'{composer_version}/{sharding_strategy.lower()}_{state_dict_type}_'
f'{precision}/')
+ if state_dict_type == 'full':
+ load_path += 'ba2_rank0.pt'
+ else:
+ load_path += 'ep0-ba2/'
if composer_version == '0.15.1':
num_classes = 8 # This parameter setting is very important. Don't change or the test will fail.
@@ -475,14 +466,81 @@ def test_fsdp_load_old_checkpoint(
)
state_dict2 = trainer.state.state_dict()
- if ((dist.get_global_rank() == 0 and state_dict_type == 'full') or state_dict_type in ['sharded', 'local']):
- filled_load_path = load_path.format(rank=dist.get_global_rank())
- destination = str(tmp_path / pathlib.Path(filled_load_path).name)
- get_file(filled_load_path, destination=destination)
- with open(destination, 'rb') as f:
- state_dict1 = torch.load(f)['state']
- _compare_model_params_between_state_dicts(state_dict1, state_dict2)
+ if (dist.get_global_rank() == 0 and state_dict_type == 'full') or state_dict_type == 'sharded':
+ # After composer version 0.16.0, sharded checkpoints are of type folder/__{local_rank}__{global_rank}.distcp
+ # They cannot be loaded with `get_file` as we need the whole folder to load the checkpoint.
+ # Thus, we use the DistCPObjectStoreReader to load the state_dict.
+ if state_dict_type == 'sharded' and version.parse(composer_version) >= version.parse('0.16.0'):
+ trainer2 = get_trainer(
+ num_features=32, # This parameter setting is very important. Don't change or the test will fail.
+ num_classes=8, # This parameter setting is very important. Don't change or the test will fail.
+ precision=precision,
+ max_duration='10ba', # Change this so we have slightly different model runtime settings.
+ train_metrics=train_metrics,
+ val_metrics=val_metrics,
+ fsdp_config=fsdp_config,
+ )
+ from torch.distributed import checkpoint as dist_cp
+
+ from composer.utils.checkpoint import DistCPObjectStoreReader
+
+ _, _, parsed_load_path = parse_uri(load_path)
+ gathered_tmp_path = str(dist.all_gather_object(tmp_path)[0])
+ destination = str(pathlib.Path(gathered_tmp_path) / parsed_load_path)
+ state_dict: dict[str, Any] = {
+ 'state': trainer2.state.state_dict(),
+ 'rng': get_rng_state(),
+ }
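+            # Torch versions below 2.2.9 cannot load the sharded optimizer state in one pass,
+            # so it is dropped here and restored separately below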
+ if version.parse(torch.__version__) < version.parse('2.2.9'):
+ state_dict['state'].pop('optimizers')
+
+ object_store = S3ObjectStore(bucket=f'{s3_bucket}')
+ storage_reader = DistCPObjectStoreReader(source_path=parsed_load_path,
+ destination_path=destination,
+ object_store=object_store,
+ device_mesh=None)
+
+ process_group = None
+ dist_cp.load_state_dict(
+ state_dict=state_dict,
+ storage_reader=storage_reader,
+ planner=None,
+ process_group=process_group,
+ )
+ if version.parse(torch.__version__) < version.parse('2.2.9'):
+ from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+ model_state_dict = state_dict['state']['model']
+ model = trainer2.state.model
+ optim = trainer2.state.optimizers[0]
+ optim_name = type(optim).__qualname__
+ optim_state_dict = load_sharded_optimizer_state_dict(model_state_dict=model_state_dict,
+ optimizer_key='optimizers',
+ storage_reader=storage_reader)
+ with fsdp_state_dict_type_context(module=model, state_dict_type=state_dict_type):
+ optim_state_dict = FSDP.optim_state_dict_to_load(
+ optim_state_dict=optim_state_dict['optimizers'][optim_name], model=model, optim=optim)
+
+ trainer2.state.optimizers[0].load_state_dict(optim_state_dict)
+
+ with fsdp_state_dict_type_context(module=model, state_dict_type=state_dict_type):
+ flattened_optim_state_dict = FSDP.optim_state_dict(model, optim) # type: ignore
+
+ state_dict['state']['optimizers'] = {
+ optim_name: flattened_optim_state_dict,
+ }
+
+ state_dict1 = state_dict['state']
+ else:
+ filled_load_path = load_path.format(rank=dist.get_global_rank())
+ destination = str(tmp_path / pathlib.Path(filled_load_path).name)
+
+ get_file(filled_load_path, destination=destination)
+ with open(destination, 'rb') as f:
+ state_dict1 = torch.load(f)['state']
+
+ _compare_model_params_between_state_dicts(state_dict1, state_dict2)
_compare_optims_between_state_dicts(state_dict1, state_dict2)
# Continue to fit to make sure we can continue training.
@@ -494,8 +552,6 @@ def test_fsdp_load_old_checkpoint(
@world_size(2)
@pytest.mark.parametrize('optimizer', ['adam', 'adamw'])
@pytest.mark.parametrize('precision', ['amp_bf16', 'amp_fp16'])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
def test_fsdp_full_state_dict_load_with_ema(
world_size,
tmp_path: pathlib.Path,
@@ -551,8 +607,6 @@ def test_fsdp_full_state_dict_load_with_ema(
@world_size(2)
@pytest.mark.parametrize('is_valid_checkpoint', [True, False])
@pytest.mark.parametrize('state_dict_type', ['sharded', 'full'])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
@pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning')
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
@pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning')
@@ -560,11 +614,7 @@ def test_checkpoint_loading_with_validation(world_size, tmp_path, is_valid_check
# Set the error expectations.
expectation = does_not_raise()
if not is_valid_checkpoint:
- if using_torch_2() and state_dict_type == 'sharded':
- from torch.distributed.checkpoint import CheckpointException
- expectation = pytest.raises(CheckpointException)
- else:
- expectation = pytest.raises(ValueError)
+ expectation = pytest.raises(ValueError)
def mock_get_checkpoint_validation_function():
return lambda _: is_valid_checkpoint
@@ -581,10 +631,7 @@ def mock_get_checkpoint_validation_function():
# Determine the checkpoint path for loading.
checkpoint_relpath = 'ba1-rank0.pt'
if state_dict_type == 'sharded':
- if using_torch_2():
- checkpoint_relpath = 'ba1'
- else:
- checkpoint_relpath = 'ba1/ba1-rank{rank}.pt'
+ checkpoint_relpath = 'ba1'
# Load checkpoints with checkpoint validation.
with expectation:
@@ -599,25 +646,26 @@ def mock_get_checkpoint_validation_function():
@pytest.mark.gpu
@world_size(2)
-@pytest.mark.parametrize('weights_only', [False, True])
-@pytest.mark.parametrize('optimizer', ['adam', 'adamw'])
-@pytest.mark.parametrize('state_dict_type', ['sharded', 'local'])
-@pytest.mark.parametrize('precision', ['amp_bf16', 'amp_fp16'])
@pytest.mark.parametrize('use_remote', [pytest.param(True, marks=pytest.mark.remote), False])
-@pytest.mark.parametrize('autoresume', [True, False])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
+@pytest.mark.parametrize('weights_only,optimizer,precision,autoresume,load_ignore_keys', [
+ [False, 'adamw', 'amp_bf16', False, None],
+ [True, 'adamw', 'amp_bf16', False, None],
+ [False, 'adam', 'amp_bf16', False, None],
+ [False, 'adamw', 'amp_fp16', False, None],
+ [False, 'adamw', 'amp_bf16', True, None],
+ [False, 'adamw', 'amp_bf16', False, ['rng']],
+])
@pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning')
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
@pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning')
def test_fsdp_partitioned_state_dict_load(
world_size,
tmp_path: pathlib.Path,
- state_dict_type: str,
autoresume: bool,
precision: str,
optimizer: str,
weights_only: bool,
+ load_ignore_keys: Union[list[str], None],
use_remote,
s3_bucket,
s3_ephemeral_prefix,
@@ -625,10 +673,7 @@ def test_fsdp_partitioned_state_dict_load(
):
if weights_only and autoresume:
pytest.xfail('Weights only with autoresume is not supported')
- if state_dict_type == 'local' and using_torch_2():
- pytest.xfail(('Loading a state_dict_type="local" checkpoint with strict=True '
- 'errors out. See https://github.com/pytorch/pytorch/issues/102667 '
- 'for more info'))
+ load_ignore_keys = [] if load_ignore_keys is None else load_ignore_keys
if autoresume:
local_run_name = f'my-cool-autoresume-run-{uuid.uuid1()}'
@@ -644,7 +689,7 @@ def test_fsdp_partitioned_state_dict_load(
save_filename = 'ba{batch}-rank{rank}.pt'
- fsdp_config = FSDPConfig(state_dict_type=state_dict_type)
+ fsdp_config = FSDPConfig(state_dict_type='sharded')
trainer1 = get_trainer(
save_folder=str(save_folder),
@@ -671,19 +716,10 @@ def test_fsdp_partitioned_state_dict_load(
object_store = None
load_path = str(save_folder.format(run_name=run_name) / pathlib.Path('ba2'))
- if not using_torch_2():
- load_filename = f"{save_filename.format(batch=2, rank='{rank}')}"
- assert load_filename == 'ba2-rank{rank}.pt'
- load_path += '/' + load_filename
- assert is_checkpoint_legacy_sharded(
- object_store=object_store,
- source_path=load_path.replace(f's3://{s3_bucket}/', ''),
- )
- else:
- assert not is_checkpoint_legacy_sharded(
- object_store=object_store,
- source_path=load_path.replace(f's3://{s3_bucket}/', ''),
- )
+ assert not is_checkpoint_legacy_sharded(
+ object_store=object_store,
+ source_path=load_path.replace(f's3://{s3_bucket}/', ''),
+ )
if autoresume:
load_path = None
@@ -699,6 +735,7 @@ def test_fsdp_partitioned_state_dict_load(
optimizer=optimizer,
load_weights_only=weights_only,
fsdp_config=fsdp_config,
+ load_ignore_keys=load_ignore_keys,
)
state_dict_from_trainer2 = trainer2.state.state_dict()
rng2 = trainer2._rng_state
@@ -708,7 +745,10 @@ def test_fsdp_partitioned_state_dict_load(
state_dict_from_trainer2,
)
if not weights_only:
- _compare_rng_states_between_trainers(rng1, rng2)
+ if any('rng' in x for x in load_ignore_keys):
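+                # Ignoring 'rng' at load time means the second trainer restores no RNG state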
+ assert rng1 is not None and rng2 is None
+ else:
+ _compare_rng_states_between_trainers(rng1, rng2)
_compare_optims_between_state_dicts(
state_dict_from_trainer1_ba2,
state_dict_from_trainer2,
@@ -729,20 +769,16 @@ def test_fsdp_partitioned_state_dict_load(
@pytest.mark.gpu
@pytest.mark.remote
@world_size(2)
-@pytest.mark.parametrize('state_dict_type', ['sharded'])
@pytest.mark.parametrize('precision', ['amp_bf16', 'amp_fp16'])
-@pytest.mark.parametrize('autoresume', [False, True]) # True commented out for now
+@pytest.mark.parametrize('autoresume', [False, True])
@pytest.mark.parametrize('num_shards', [2, 4, 7])
@pytest.mark.parametrize('sharding_strategy', ['FULL_SHARD', 'SHARD_GRAD_OP'])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0.1'),
- reason='requires PyTorch 2.0.1 or higher')
@pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning')
@pytest.mark.filterwarnings(r'ignore:MosaicMLLogger is not in the state_dict.:UserWarning')
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
def test_elastic_resumption(
world_size,
tmp_path: pathlib.Path,
- state_dict_type: str,
autoresume: bool,
precision: str,
sharding_strategy,
@@ -750,17 +786,13 @@ def test_elastic_resumption(
s3_read_only_prefix,
num_shards: int,
):
- if state_dict_type == 'local' and using_torch_2():
- pytest.xfail(('Loading a state_dict_type="local" checkpoint with '
- 'strict=True errors out. See https://github.com/pytorch/pytorch/issues/102667 '
- 'for more info'))
if autoresume:
run_name = 'my-autoresume-run'
else:
run_name = None
base_path = (f's3://{s3_bucket}/{s3_read_only_prefix}/elastic_test/'
- f'{sharding_strategy.lower()}_{state_dict_type}_{precision}_'
+ f'{sharding_strategy.lower()}_sharded_{precision}_'
f'{num_shards}/')
mono_load_path = os.path.join(base_path, 'mono.pt')
@@ -797,7 +829,7 @@ def test_elastic_resumption(
run_name=run_name,
max_duration='4ba',
load_weights_only=False,
- fsdp_config=FSDPConfig(state_dict_type=state_dict_type),
+ fsdp_config=FSDPConfig(state_dict_type='sharded'),
)
def get_mono_state_dict_from_sharded_one(trainer):
@@ -843,87 +875,20 @@ def compare_state_dicts():
@pytest.mark.gpu
@world_size(2)
-@pytest.mark.parametrize('state_dict_type', ['local', 'sharded'])
-@pytest.mark.parametrize('autoresume', [True])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
-@pytest.mark.skipif(version.parse(torch.__version__) > version.parse('1.13.0'),
- reason='All Pytorch 2.0 checkpoints have just 1 symlink')
-def test_mismatch_timestamp_error(
- world_size,
- tmp_path: pathlib.Path,
- state_dict_type: str,
- autoresume: bool,
-):
- run_name = 'my-run-ar' if autoresume else 'my-run'
- tmp_paths = dist.all_gather_object(os.path.abspath(tmp_path))
- save_folder = str(tmp_paths[0] / pathlib.Path(run_name))
- save_filename = 'ba{batch}-rank{rank}.pt'
- trainer1 = get_trainer(
- save_folder=save_folder,
- save_filename=save_filename,
- run_name=run_name,
- autoresume=autoresume,
- max_duration='2ba',
- save_interval='1ba',
- fsdp_config=FSDPConfig(state_dict_type=state_dict_type),
- )
- trainer1.fit()
- trainer1.close()
- latest_symlink = str(pathlib.Path(save_folder) / pathlib.Path(f'latest-rank{dist.get_global_rank()}.pt'))
- latest_checkpoint_path = pathlib.Path(save_folder) / pathlib.Path('ba2') / (pathlib.Path(
- save_filename.format(batch=2, rank=dist.get_global_rank())) if not using_torch_2() else pathlib.Path(''))
- assert os.path.join(save_folder, os.readlink(latest_symlink)) == str(latest_checkpoint_path)
- oldest_checkpoint_relative_path = str(
- pathlib.Path('ba1') / (pathlib.Path(save_filename.format(batch=1, rank=dist.get_global_rank()))
- if not using_torch_2() else pathlib.Path('')))
-
- # Corrupt latest checkpoint symlink for rank1 by changing it from batch 2 checkpoint to the batch 1 one
- # and removing batch 2 checkpoint.
- if dist.get_global_rank() == 0:
- os.remove(latest_symlink)
- os.symlink(src=oldest_checkpoint_relative_path, dst=latest_symlink)
- assert os.readlink(latest_symlink) == oldest_checkpoint_relative_path
-
- dist.barrier()
- expected_error = pytest.raises(RuntimeError, match='Timestamp mismatch error:*')
-
- with expected_error:
- get_trainer(
- save_folder=save_folder,
- save_filename=save_filename,
- autoresume=autoresume,
- run_name=run_name,
- fsdp_config=FSDPConfig(state_dict_type=state_dict_type),
- )
-
-
-@pytest.mark.gpu
-@world_size(2)
-@pytest.mark.parametrize('state_dict_type', ['sharded', 'local'])
@pytest.mark.parametrize('num_ckpts_to_keep', [-1, 1, 2, 3])
-@pytest.mark.parametrize('batches_to_train', [3])
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
@pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning')
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
@pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning')
def test_cleanup_sharded_checkpoints(
world_size,
tmp_path: pathlib.Path,
- state_dict_type: str,
num_ckpts_to_keep: int,
- batches_to_train: int,
s3_bucket,
s3_ephemeral_prefix,
request,
):
- if state_dict_type == 'local' and using_torch_2():
- pytest.xfail(('Loading a state_dict_type="local" checkpoint with strict=True '
- 'errors out. See https://github.com/pytorch/pytorch/issues/102667 '
- 'for more info'))
-
run_name = None
+ batches_to_train = 3
tmp_paths = dist.all_gather_object(os.path.abspath(tmp_path))
save_folder = os.path.join(tmp_paths[0], 'checkpoints', '{run_name}')
@@ -936,7 +901,7 @@ def test_cleanup_sharded_checkpoints(
max_duration=f'{batches_to_train}ba',
save_interval='1ba',
save_num_checkpoints_to_keep=num_ckpts_to_keep,
- fsdp_config=FSDPConfig(state_dict_type=state_dict_type),
+ fsdp_config=FSDPConfig(state_dict_type='sharded'),
)
run_name = trainer1.state.run_name
trainer1.fit()
@@ -950,9 +915,7 @@ def test_cleanup_sharded_checkpoints(
assert num_checkpoint_dirs == num_ckpts_to_keep
for ckpt_dir in dir_contents:
full_path_ckpt_dir = os.path.join(shards_dir, ckpt_dir)
- elastic_file_list = {'.metadata', *[f'__{rank}_0.distcp' for rank in range(dist.get_world_size())]}
- non_elastic_file_list = {save_filename.format(rank=rank) for rank in range(dist.get_world_size())}
- file_list = elastic_file_list if using_torch_2() else non_elastic_file_list
+ file_list = {'.metadata', *[f'__{rank}_0.distcp' for rank in range(dist.get_world_size())]}
assert set(os.listdir(full_path_ckpt_dir)) == file_list
diff --git a/tests/trainer/test_fsdp_param_groups.py b/tests/trainer/test_fsdp_param_groups.py
index a144db51a4..30e29b4de5 100644
--- a/tests/trainer/test_fsdp_param_groups.py
+++ b/tests/trainer/test_fsdp_param_groups.py
@@ -19,26 +19,19 @@
@pytest.mark.filterwarnings('ignore::UserWarning')
@device('gpu')
@world_size(2)
-@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2'),
- reason='FSDP use_orig_params requires torch 2.0 or higher')
def test_fsdp_param_groups_without_orig_params(mixed_precision: str, device: str, reentrant: bool, world_size: int):
- """
-
- Ensure that FSDP with 'use_orig_params=False' raises an exception when passing in an optimizer
- with multiple param groups
-
- """
+ # Ensure that FSDP with 'use_orig_params=False' raises an exception when passing in an optimizer
+ # with multiple param groups
num_classes = 10
model = SimpleModel(num_features=1, num_classes=num_classes)
dataset = RandomClassificationDataset(shape=(num_classes,), size=2, num_classes=num_classes)
dataloader = DataLoader(dataset, sampler=dist.get_sampler(dataset))
- # create a different parameter per group
+    # Create a separate param group for each parameter, each with a different lr
param_groups = [{'params': param, 'lr': (0.1 + 0.1 * i)} for i, param in enumerate(model.parameters())]
optimizer = torch.optim.SGD(param_groups, lr=0)
- expected_error = 'Multiple optimizer groups with FSDP are only supported on torch 2.0 \
- with use_orig_params=True.'
+ expected_error = 'Multiple optimizer groups with FSDP are only supported with use_orig_params=True.'
with pytest.raises(RuntimeError, match=expected_error):
_ = Trainer(model=model,
diff --git a/tests/trainer/test_scale_schedule.py b/tests/trainer/test_scale_schedule.py
index 2ae600f70c..ec90890e07 100644
--- a/tests/trainer/test_scale_schedule.py
+++ b/tests/trainer/test_scale_schedule.py
@@ -7,11 +7,11 @@
import pytest
import torch
from torch.optim import Optimizer
-from torch.optim.lr_scheduler import ExponentialLR
+from torch.optim.lr_scheduler import ExponentialLR, LRScheduler
from torch.utils.data import DataLoader
from composer import Trainer
-from composer.core import Callback, PyTorchScheduler, State, TimeUnit
+from composer.core import Callback, State, TimeUnit
from composer.loggers.logger import Logger
from composer.optim import MultiStepScheduler
from composer.trainer._scale_schedule import scale_pytorch_scheduler
@@ -33,7 +33,7 @@ def flatten(lst: list):
class TestScaleSchedule():
@staticmethod
- def _test(targets: List[float], scheduler: PyTorchScheduler, epochs: int, optimizer: Optimizer, ssr: float):
+ def _test(targets: List[float], scheduler: LRScheduler, epochs: int, optimizer: Optimizer, ssr: float):
scale_pytorch_scheduler(scheduler, ssr)
for epoch in range(epochs):
for param_group in optimizer.param_groups:
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 6408c008b6..97ca2005ee 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -14,7 +14,6 @@
import pytest
import torch
-from packaging import version
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader
@@ -122,8 +121,6 @@ def test_no_param_model(self, call_fit: bool, call_eval: bool):
if call_eval:
trainer.eval(subset_num_batches=1)
- @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0.0'),
- reason='requires PyTorch 2.0 or higher')
@pytest.mark.parametrize('compile_config', [(None, False), ({}, True), ({'mode': 'reduce-overhead'}, True)])
def test_torch_compile(self, model: ComposerModel, compile_config: Any):
train_dataset = RandomClassificationDataset()
@@ -137,8 +134,6 @@ def test_torch_compile(self, model: ComposerModel, compile_config: Any):
compile_config=compile_config[0])
assert trainer.local_hparams['is_model_compiled'] is compile_config[1]
- @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0.0'),
- reason='requires PyTorch 2.0 or higher')
def test_already_compiled_warning(self, caplog, model: ComposerModel):
with caplog.at_level(logging.WARNING):
train_dataset = RandomClassificationDataset()
@@ -153,20 +148,6 @@ def test_already_compiled_warning(self, caplog, model: ComposerModel):
compile_config=None)
assert '`model` is already compiled with `torch.compile`' in caplog.text
- @pytest.mark.skipif(version.parse(torch.__version__) >= version.parse('2.0.0'),
- reason='requires PyTorch 1.13 or lower')
- def test_compile_unsupported_torch_version_exception(self, caplog, model: ComposerModel):
- with pytest.raises(ValueError, match='`torch.compile` is supported for PyTorch 2.0 or higher.'):
- train_dataset = RandomClassificationDataset()
- optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
- max_duration = '2ba'
- _ = Trainer(model=model,
- max_duration=max_duration,
- train_dataloader=DataLoader(train_dataset, sampler=dist.get_sampler(train_dataset)),
- optimizers=optimizer,
- auto_log_hparams=True,
- compile_config={})
-
def test_eval_metrics(self):
model = SimpleModel()
train_dataloader = DataLoader(RandomClassificationDataset(size=1), batch_size=1)
@@ -340,9 +321,7 @@ def test_max_duration_tokens(self, tiny_bert_tokenizer, batch_size: int, sequenc
@pytest.mark.parametrize('train_subset_num_batches', [-1, 1])
def test_infinite_train_loader(self, model: ComposerModel, max_duration: Union[int, str],
train_subset_num_batches: int):
- should_raise = (isinstance(max_duration, int) or
- max_duration.endswith('ep')) and (train_subset_num_batches is None or
- train_subset_num_batches == -1)
+ should_raise = (isinstance(max_duration, int) or max_duration.endswith('ep')) and train_subset_num_batches == -1
context = pytest.raises(
ValueError,
match='max_duration cannot be specified in epochs') if should_raise else contextlib.nullcontext()
@@ -366,7 +345,7 @@ def test_reset_time(
train_dataloader: DataLoader,
model: ComposerModel,
max_duration: Time[int],
- new_duration: Time,
+ new_duration: Optional[Time],
reset_time: bool,
):
# Train once
@@ -629,8 +608,6 @@ def test_deepspeed(
trainer.fit()
@pytest.mark.gpu
- @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
- reason='requires PyTorch 1.13 or higher')
@pytest.mark.parametrize('precision', [Precision.FP32, Precision.AMP_BF16, Precision.AMP_FP16])
@pytest.mark.filterwarnings('ignore::UserWarning')
def test_fsdp(
@@ -674,8 +651,6 @@ def test_fsdp(
trainer.fit()
@pytest.mark.gpu
- @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0.0'),
- reason='requires PyTorch 2.0 or higher')
@pytest.mark.parametrize('precision', [Precision.AMP_BF16, Precision.AMP_FP16])
@pytest.mark.parametrize('compile_config', [None, {}])
@pytest.mark.filterwarnings('ignore::UserWarning')
@@ -1114,8 +1089,6 @@ def test_training_duration_unit(
assert event_counter_callback.event_to_num_calls[Event.EPOCH_END] == 2
assert event_counter_callback.event_to_num_calls[Event.EPOCH_CHECKPOINT] == 2
- @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.0.0'),
- reason='requires PyTorch 2.0 or higher')
@pytest.mark.parametrize('is_model_compiled', [True, False])
def test_compile_uncompile_model_weights_trainer_fit(
self,
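The `compile_config` parametrizations above follow the convention that `None` means no compilation while a dict is forwarded to `torch.compile`. A minimal sketch of that convention (plain PyTorch, torch >= 2.0; the branching is illustrative, not the trainer's code):

```python
import torch

model = torch.nn.Linear(4, 2)
compile_config = {'mode': 'reduce-overhead'}  # or None, or {} for defaults

if compile_config is None:
    compiled = model  # compilation disabled
else:
    compiled = torch.compile(model, **compile_config)
```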
diff --git a/tests/trainer/test_trainer_eval.py b/tests/trainer/test_trainer_eval.py
index cb7f561ca3..83e526a8e0 100644
--- a/tests/trainer/test_trainer_eval.py
+++ b/tests/trainer/test_trainer_eval.py
@@ -592,7 +592,7 @@ def __len__(self) -> Optional[int]:
return None
-@pytest.mark.parametrize('eval_subset_num_batches,success', [[None, False], [-1, False], [1, True]])
+@pytest.mark.parametrize('eval_subset_num_batches,success', [[-1, False], [1, True]])
def test_infinite_eval_dataloader(eval_subset_num_batches, success):
"""Test the `eval_subset_num_batches` is required with infinite dataloader."""
# Construct the trainer
diff --git a/tests/utils/eval_client/test_local_eval_client.py b/tests/utils/eval_client/test_local_eval_client.py
index 8a598608d0..b114096ad3 100644
--- a/tests/utils/eval_client/test_local_eval_client.py
+++ b/tests/utils/eval_client/test_local_eval_client.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
-from composer.utils import LocalEvalClient
+from composer.utils import LocalEvalClient, dist
from tests.common.markers import world_size
@@ -29,10 +29,11 @@
)
@world_size(1, 2)
def test_local_invoke(code: str, result: str, language: str, world_size: int, tmp_path: str):
- """Test invocation function for LocalEvalClient with code that succeeds, fails compilation, times out, and is incorrect in C, C++, Python, JS.
+ """Test invocation function for LocalEvalClient.
+
+ Code can succeed, fail compilation, time out, or be incorrect in C, C++, Python, JS.
"""
- import os
- os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
+    dist.barrier()  # Sync all ranks before running the test, since invoke() itself does no distributed communication
eval_client = LocalEvalClient()
input = '(1,)' if language == 'python' else '1'
assert eval_client.invoke([[[{
diff --git a/tests/utils/object_store/test_azure_object_store.py b/tests/utils/object_store/test_azure_object_store.py
new file mode 100644
index 0000000000..949e2149ff
--- /dev/null
+++ b/tests/utils/object_store/test_azure_object_store.py
@@ -0,0 +1,33 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from torch.utils.data import DataLoader
+
+from composer.trainer import Trainer
+from tests.common import RandomClassificationDataset, SimpleModel
+
+
+@pytest.mark.remote
+def test_azure_object_store_integration():
+ model = SimpleModel()
+ train_dataloader = DataLoader(dataset=RandomClassificationDataset())
+ trainer_save = Trainer(
+ model=model,
+ train_dataloader=train_dataloader,
+ save_folder='azure://mosaicml-composer-tests/checkpoints/{run_name}',
+ save_filename='test-model.pt',
+ max_duration='1ba',
+ )
+ run_name = trainer_save.state.run_name
+ trainer_save.fit()
+ trainer_save.close()
+
+ trainer_load = Trainer(
+ model=model,
+ train_dataloader=train_dataloader,
+ load_path=f'azure://mosaicml-composer-tests/checkpoints/{run_name}/test-model.pt',
+ max_duration='2ba',
+ )
+ trainer_load.fit()
+ trainer_load.close()
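The `azure://bucket/path` strings above follow the same `backend://bucket/key` layout as the other object-store URIs in these tests. A hedged sketch of how such a URI splits apart (urllib-based; illustrative, not Composer's parsing helper):

```python
from urllib.parse import urlparse

uri = 'azure://mosaicml-composer-tests/checkpoints/my-run/test-model.pt'
parsed = urlparse(uri)
backend, bucket, key = parsed.scheme, parsed.netloc, parsed.path.lstrip('/')
assert (backend, bucket) == ('azure', 'mosaicml-composer-tests')
assert key == 'checkpoints/my-run/test-model.pt'
```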
diff --git a/tests/utils/object_store/test_integration_gs_object_store.py b/tests/utils/object_store/test_integration_gs_object_store.py
deleted file mode 100644
index 1a08bb73ce..0000000000
--- a/tests/utils/object_store/test_integration_gs_object_store.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-import time
-from pathlib import Path
-
-import pytest
-
-from composer.utils import GCSObjectStore
-
-__DUMMY_OBJ__ = '/tmp/dummy.ckpt'
-__NUM_BYTES__ = 1000
-bucket_name = 'mosaicml-composer-tests'
-
-
-@pytest.mark.remote
-@pytest.fixture
-def gs_object_store():
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- remote_dir = 'gs://mosaicml-composer-tests/streaming/'
- yield GCSObjectStore(remote_dir)
-
-
-@pytest.mark.remote
-def test_bucket_not_found():
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- with pytest.raises(FileNotFoundError):
- _ = GCSObjectStore('gs://not_a_bucket/streaming')
-
-
-@pytest.mark.remote
-def test_get_uri(gs_object_store):
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- object_name = 'test-object'
- expected_uri = 'gs://mosaicml-composer-tests/streaming/test-object'
- assert (gs_object_store.get_uri(object_name) == expected_uri)
-
-
-@pytest.mark.remote
-def test_get_key(gs_object_store):
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- object_name = 'test-object'
- expected_key = 'streaming/test-object'
- assert (gs_object_store.get_key(object_name) == expected_key)
-
-
-@pytest.mark.remote
-@pytest.mark.parametrize('result', ['success', 'not found'])
-def test_get_object_size(gs_object_store, result: str):
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- fn = Path(__DUMMY_OBJ__)
- with open(fn, 'wb') as fp:
- fp.write(bytes('0' * __NUM_BYTES__, 'utf-8'))
- gs_object_store.upload_object(fn)
-
- if result == 'success':
- assert (gs_object_store.get_object_size(__DUMMY_OBJ__) == __NUM_BYTES__)
- else: # not found
- with pytest.raises(FileNotFoundError):
- gs_object_store.get_object_size(__DUMMY_OBJ__ + f'time.ctime()')
-
-
-@pytest.mark.remote
-def test_upload_object(gs_object_store):
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- from google.cloud.storage import Blob
- destination_blob_name = '/tmp/dummy.ckpt2'
- key = gs_object_store.get_key(destination_blob_name)
- stats = Blob(bucket=gs_object_store.bucket, name=key).exists(gs_object_store.client)
- if not stats:
- gs_object_store.upload_object(__DUMMY_OBJ__, destination_blob_name)
-
-
-@pytest.mark.remote
-def test_list_objects(gs_object_store):
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- from google.cloud.storage import Blob
- destination_blob_name = '/tmp/dummy.ckpt2'
- key = gs_object_store.get_key(destination_blob_name)
- stats = Blob(bucket=gs_object_store.bucket, name=key).exists(gs_object_store.client)
- if not stats:
- gs_object_store.upload_object(__DUMMY_OBJ__, destination_blob_name)
- objects = gs_object_store.list_objects()
- assert (key in objects)
-
-
-@pytest.mark.remote
-@pytest.mark.parametrize('result', ['success', 'file_exists', 'obj_not_found'])
-def test_download_object(gs_object_store, tmp_path, result: str):
- pytest.skip('Run this test suite only after GCS service account is configured on CI node.')
- fn = Path(__DUMMY_OBJ__)
- with open(fn, 'wb') as fp:
- fp.write(bytes('0' * __NUM_BYTES__, 'utf-8'))
- gs_object_store.upload_object(fn)
-
- object_name = __DUMMY_OBJ__
- filename = './dummy.ckpt.download'
-
- if result == 'success':
- gs_object_store.download_object(object_name, filename, overwrite=True)
-
- elif result == 'file_exists':
- with pytest.raises(FileExistsError):
- gs_object_store.download_object(object_name, __DUMMY_OBJ__)
- else: # obj_not_found
- with pytest.raises(FileNotFoundError):
- gs_object_store.download_object(object_name + f'{time.ctime()}', filename, overwrite=True)
diff --git a/tests/utils/object_store/test_mlflow_object_store.py b/tests/utils/object_store/test_mlflow_object_store.py
index d46fc493a4..ecbedd2e50 100644
--- a/tests/utils/object_store/test_mlflow_object_store.py
+++ b/tests/utils/object_store/test_mlflow_object_store.py
@@ -8,7 +8,7 @@
import pytest
from composer.utils import MLFlowObjectStore
-from composer.utils.object_store.mlflow_object_store import PLACEHOLDER_EXPERIMENT_ID, PLACEHOLDER_RUN_ID
+from composer.utils.object_store.mlflow_object_store import MLFLOW_EXPERIMENT_ID_PLACEHOLDER, MLFLOW_RUN_ID_PLACEHOLDER
TEST_PATH_FORMAT = 'databricks/mlflow-tracking/{experiment_id}/{run_id}/artifacts/'
EXPERIMENT_ID = '123'
@@ -66,7 +66,7 @@ def test_init_with_experiment_and_no_run(monkeypatch):
mock_mlflow_client.return_value.create_run.return_value = MagicMock(
info=MagicMock(run_id=RUN_ID, run_name='test-run'))
- store = MLFlowObjectStore(TEST_PATH_FORMAT.format(experiment_id=EXPERIMENT_ID, run_id=PLACEHOLDER_RUN_ID))
+ store = MLFlowObjectStore(TEST_PATH_FORMAT.format(experiment_id=EXPERIMENT_ID, run_id=MLFLOW_RUN_ID_PLACEHOLDER))
assert store.experiment_id == EXPERIMENT_ID
assert store.run_id == RUN_ID
@@ -76,7 +76,7 @@ def test_init_with_run_and_no_experiment(monkeypatch):
monkeypatch.setattr(dbx_sdk, 'WorkspaceClient', MagicMock())
with pytest.raises(ValueError):
- MLFlowObjectStore(TEST_PATH_FORMAT.format(experiment_id=PLACEHOLDER_EXPERIMENT_ID, run_id=RUN_ID))
+ MLFlowObjectStore(TEST_PATH_FORMAT.format(experiment_id=MLFLOW_EXPERIMENT_ID_PLACEHOLDER, run_id=RUN_ID))
def test_init_with_active_run(monkeypatch):
@@ -91,7 +91,7 @@ def test_init_with_active_run(monkeypatch):
mock_active_run.return_value = MagicMock(info=MagicMock(experiment_id=EXPERIMENT_ID, run_id=RUN_ID))
store = MLFlowObjectStore(
- TEST_PATH_FORMAT.format(experiment_id=PLACEHOLDER_EXPERIMENT_ID, run_id=PLACEHOLDER_RUN_ID))
+ TEST_PATH_FORMAT.format(experiment_id=MLFLOW_EXPERIMENT_ID_PLACEHOLDER, run_id=MLFLOW_RUN_ID_PLACEHOLDER))
assert store.experiment_id == EXPERIMENT_ID
assert store.run_id == RUN_ID
@@ -109,7 +109,7 @@ def test_init_with_existing_experiment_and_no_run(monkeypatch):
info=MagicMock(run_id=RUN_ID, run_name='test-run'))
store = MLFlowObjectStore(
- TEST_PATH_FORMAT.format(experiment_id=PLACEHOLDER_EXPERIMENT_ID, run_id=PLACEHOLDER_RUN_ID))
+ TEST_PATH_FORMAT.format(experiment_id=MLFLOW_EXPERIMENT_ID_PLACEHOLDER, run_id=MLFLOW_RUN_ID_PLACEHOLDER))
assert store.experiment_id == EXPERIMENT_ID
assert store.run_id == RUN_ID
@@ -128,7 +128,7 @@ def test_init_with_no_experiment_and_no_run(monkeypatch):
info=MagicMock(run_id=RUN_ID, run_name='test-run'))
store = MLFlowObjectStore(
- TEST_PATH_FORMAT.format(experiment_id=PLACEHOLDER_EXPERIMENT_ID, run_id=PLACEHOLDER_RUN_ID))
+ TEST_PATH_FORMAT.format(experiment_id=MLFLOW_EXPERIMENT_ID_PLACEHOLDER, run_id=MLFLOW_RUN_ID_PLACEHOLDER))
assert store.experiment_id == EXPERIMENT_ID
assert store.run_id == RUN_ID
@@ -190,16 +190,19 @@ def test_get_artifact_path(mlflow_object_store):
assert mlflow_object_store.get_artifact_path(DEFAULT_PATH + ARTIFACT_PATH) == ARTIFACT_PATH
# Absolute DBFS path with placeholders
- path = TEST_PATH_FORMAT.format(experiment_id=PLACEHOLDER_EXPERIMENT_ID, run_id=PLACEHOLDER_RUN_ID) + ARTIFACT_PATH
+ path = TEST_PATH_FORMAT.format(experiment_id=MLFLOW_EXPERIMENT_ID_PLACEHOLDER,
+ run_id=MLFLOW_RUN_ID_PLACEHOLDER) + ARTIFACT_PATH
assert mlflow_object_store.get_artifact_path(path) == ARTIFACT_PATH
# Raises ValueError for different experiment ID
- path = TEST_PATH_FORMAT.format(experiment_id='different-experiment', run_id=PLACEHOLDER_RUN_ID) + ARTIFACT_PATH
+ path = TEST_PATH_FORMAT.format(experiment_id='different-experiment',
+ run_id=MLFLOW_RUN_ID_PLACEHOLDER) + ARTIFACT_PATH
with pytest.raises(ValueError):
mlflow_object_store.get_artifact_path(path)
# Raises ValueError for different run ID
- path = TEST_PATH_FORMAT.format(experiment_id=PLACEHOLDER_EXPERIMENT_ID, run_id='different-run') + ARTIFACT_PATH
+ path = TEST_PATH_FORMAT.format(experiment_id=MLFLOW_EXPERIMENT_ID_PLACEHOLDER,
+ run_id='different-run') + ARTIFACT_PATH
with pytest.raises(ValueError):
mlflow_object_store.get_artifact_path(path)
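The renamed placeholder constants drop into `TEST_PATH_FORMAT` exactly like concrete IDs do, which is what lets `get_artifact_path` strip a placeholder prefix. A sketch with assumed stand-in values (the real constants live in `composer.utils.object_store.mlflow_object_store`):

```python
TEST_PATH_FORMAT = 'databricks/mlflow-tracking/{experiment_id}/{run_id}/artifacts/'

# Assumed stand-ins for MLFLOW_EXPERIMENT_ID_PLACEHOLDER / MLFLOW_RUN_ID_PLACEHOLDER.
EXPERIMENT_PLACEHOLDER, RUN_PLACEHOLDER = '<experiment-id>', '<run-id>'

path = TEST_PATH_FORMAT.format(experiment_id=EXPERIMENT_PLACEHOLDER, run_id=RUN_PLACEHOLDER)
assert path == 'databricks/mlflow-tracking/<experiment-id>/<run-id>/artifacts/'
```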
diff --git a/tests/utils/object_store/test_oci_object_store.py b/tests/utils/object_store/test_oci_object_store.py
index 49676cd2e8..728462f3b3 100644
--- a/tests/utils/object_store/test_oci_object_store.py
+++ b/tests/utils/object_store/test_oci_object_store.py
@@ -54,7 +54,7 @@ def test_upload_object(test_oci_obj_store, monkeypatch, tmp_path, mock_bucket_na
bucket_name=mock_bucket_name,
object_name=mock_object_name,
file_path=file_to_upload)
- else: # result = bucket_not_found
+ elif result == 'bucket_not_found':
bucket_not_found_msg = f'Either the bucket named f{mock_bucket_name} does not exist in the namespace*'
mock_upload_file_with_exception = Mock(side_effect=oci.exceptions.ServiceError(
status=404, code='BucketNotFound', headers={'opc-request-id': 'foo'}, message=bucket_not_found_msg))
@@ -68,7 +68,7 @@ def test_upload_object(test_oci_obj_store, monkeypatch, tmp_path, mock_bucket_na
oci_os.upload_object(mock_object_name, filename=file_to_upload)
-@pytest.mark.parametrize('result', ['success', 'file_exists', 'obj_not_found', 'bucket_not_found'])
+@pytest.mark.parametrize('result', ['success', 'file_exists', 'obj_not_found', 'bucket_not_found', 'no_code'])
def test_download_object(test_oci_obj_store, monkeypatch, tmp_path, mock_bucket_name, result: str):
oci = pytest.importorskip('oci')
oci_os = test_oci_obj_store
@@ -112,7 +112,7 @@ def test_download_object(test_oci_obj_store, monkeypatch, tmp_path, mock_bucket_
FileNotFoundError,
match=f'Object oci://{mock_bucket_name}/{mock_object_name} not found. {obj_not_found_msg}'):
oci_os.download_object(mock_object_name, filename=file_to_download_to)
- else: #result == 'bucket_not_found':
+ elif result == 'bucket_not_found':
file_to_download_to = str(tmp_path / Path('my_bucket_not_found_file.bin'))
bucket_not_found_msg = f'Either the bucket named f{mock_bucket_name} does not exist in the namespace*'
mock_get_object_fn_with_exception = Mock(side_effect=oci.exceptions.ServiceError(
@@ -125,6 +125,19 @@ def test_download_object(test_oci_obj_store, monkeypatch, tmp_path, mock_bucket_
f'Bucket specified in oci://{mock_bucket_name}/{mock_object_name} not found. {bucket_not_found_msg}'
):
oci_os.download_object(mock_object_name, filename=file_to_download_to)
+ elif result == 'no_code':
+ file_to_download_to = str(tmp_path / Path('my_bucket_not_found_file.bin'))
+ bucket_not_found_msg = f'Either the bucket named f{mock_bucket_name} does not exist in the namespace*'
+ mock_get_object_fn_with_exception = Mock(side_effect=oci.exceptions.ServiceError(
+ status=404, code=None, headers={'opc-request-id': 'foo'}, message=bucket_not_found_msg))
+ with monkeypatch.context() as m:
+ m.setattr(oci_os.client, 'get_object', mock_get_object_fn_with_exception)
+ with pytest.raises(
+ FileNotFoundError,
+ match=
+ f'Object oci://{mock_bucket_name}/{mock_object_name} not found with no error code. {bucket_not_found_msg}'
+ ):
+ oci_os.download_object(mock_object_name, filename=file_to_download_to)
@pytest.mark.parametrize('result', ['success', 'bucket_not_found'])
@@ -171,7 +184,7 @@ def __init__(self, name: str, size: int):
oci_os.list_objects(prefix=prefix)
-@pytest.mark.parametrize('result', ['success', 'obj_not_found', 'bucket_not_found'])
+@pytest.mark.parametrize('result', ['success', 'obj_not_found', 'bucket_not_found', 'no_code'])
def test_get_object_size(test_oci_obj_store, mock_bucket_name, monkeypatch, result: str):
oci = pytest.importorskip('oci')
oci_os = test_oci_obj_store
@@ -186,7 +199,6 @@ def test_get_object_size(test_oci_obj_store, mock_bucket_name, monkeypatch, resu
with monkeypatch.context() as m:
m.setattr(oci_os.client, 'get_object', mock_get_object_fn)
assert oci_os.get_object_size(mock_object_name) == mock_object_size
-
elif result == 'obj_not_found':
obj_not_found_msg = f"The object '{mock_object_name}' was not found in the bucket f'{mock_bucket_name}'"
mock_get_object_fn_with_exception = Mock(side_effect=oci.exceptions.ServiceError(
@@ -197,8 +209,7 @@ def test_get_object_size(test_oci_obj_store, mock_bucket_name, monkeypatch, resu
FileNotFoundError,
match=f'Object oci://{mock_bucket_name}/{mock_object_name} not found. {obj_not_found_msg}'):
oci_os.get_object_size(mock_object_name)
-
- else: #result == 'bucket_not_found':
+ elif result == 'bucket_not_found':
bucket_not_found_msg = f'Either the bucket named f{mock_bucket_name} does not exist in the namespace*'
mock_get_object_fn_with_exception = Mock(side_effect=oci.exceptions.ServiceError(
status=404, code='BucketNotFound', headers={'opc-request-id': 'foo'}, message=bucket_not_found_msg))
@@ -210,3 +221,15 @@ def test_get_object_size(test_oci_obj_store, mock_bucket_name, monkeypatch, resu
f'Bucket specified in oci://{mock_bucket_name}/{mock_object_name} not found. {bucket_not_found_msg}'
):
oci_os.get_object_size(mock_object_name)
+    elif result == 'no_code':
+        bucket_not_found_msg = f'Either the bucket named f{mock_bucket_name} does not exist in the namespace*'
+        mock_get_object_fn_with_exception = Mock(side_effect=oci.exceptions.ServiceError(
+            status=404, code=None, headers={'opc-request-id': 'foo'}, message=bucket_not_found_msg))
+        with monkeypatch.context() as m:
+            m.setattr(oci_os.client, 'get_object', mock_get_object_fn_with_exception)
+            with pytest.raises(
+                    FileNotFoundError,
+                    match=
+                    f'Object oci://{mock_bucket_name}/{mock_object_name} not found with no error code. {bucket_not_found_msg}'
+            ):
+                oci_os.get_object_size(mock_object_name)
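The three 404 parametrizations (`obj_not_found`, `bucket_not_found`, `no_code`) exercise a dispatch on `ServiceError.code`. A hedged sketch of the logic they imply (class and helper names are illustrative stand-ins, not Composer's internals):

```python
class ServiceError(Exception):
    """Minimal stand-in for oci.exceptions.ServiceError."""

    def __init__(self, status: int, code, message: str):
        self.status, self.code, self.message = status, code, message


def reraise_404(uri: str, e: ServiceError):
    if e.status == 404:
        if e.code == 'ObjectNotFound':
            raise FileNotFoundError(f'Object {uri} not found. {e.message}') from e
        if e.code == 'BucketNotFound':
            raise ValueError(f'Bucket specified in {uri} not found. {e.message}') from e
        # code=None (or anything unrecognized) falls through to a generic not-found.
        raise FileNotFoundError(f'Object {uri} not found with no error code. {e.message}') from e
    raise e
```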
diff --git a/tests/utils/object_store/test_s3_object_store.py b/tests/utils/object_store/test_s3_object_store.py
index eb6b8a0c72..2d7033c5be 100644
--- a/tests/utils/object_store/test_s3_object_store.py
+++ b/tests/utils/object_store/test_s3_object_store.py
@@ -41,7 +41,7 @@ def test_s3_upload_object_arguments(tmp_path: pathlib.Path, s3_bucket: str):
remote_obj_name = 'remote.txt'
object_store = S3ObjectStore(bucket=s3_bucket)
- object_store.client.upload_file = MagicMock()
+ object_store.client.upload_file = MagicMock() # pyright: ignore[reportGeneralTypeIssues]
with mock.patch.dict('os.environ'):
os.environ.pop('S3_CANNED_ACL', None)
diff --git a/tests/utils/object_store/test_uc_object_store.py b/tests/utils/object_store/test_uc_object_store.py
index 1f84143186..60845e43eb 100644
--- a/tests/utils/object_store/test_uc_object_store.py
+++ b/tests/utils/object_store/test_uc_object_store.py
@@ -78,19 +78,26 @@ def test_uc_object_store_invalid_prefix(monkeypatch):
@pytest.mark.parametrize('result', ['success', 'not_found'])
def test_get_object_size(ws_client, uc_object_store, result: str):
if result == 'success':
- db_files = pytest.importorskip('databricks.sdk.service.files')
- ws_client.files.get_status.return_value = db_files.FileInfo(file_size=100)
- assert uc_object_store.get_object_size('train.txt') == 100
+ ws_client.api_client.do.return_value = {}
+ assert uc_object_store.get_object_size('train.txt') == 1000000
elif result == 'not_found':
db_core = pytest.importorskip('databricks.sdk.core', reason='requires databricks')
- ws_client.files.get_status.side_effect = db_core.DatabricksError('The file being accessed is not found',
- error_code='NOT_FOUND')
+ ws_client.api_client.do.side_effect = db_core.DatabricksError('The file being accessed is not found',
+ error_code='NOT_FOUND')
with pytest.raises(FileNotFoundError):
uc_object_store.get_object_size('train.txt')
else:
raise NotImplementedError(f'Test for result={result} is not implemented.')
+def test_get_object_size_full_path(ws_client, uc_object_store):
+ ws_client.api_client.do.return_value = {}
+ assert uc_object_store.get_object_size('Volumes/catalog/schema/volume/train.txt') == 1000000
+ ws_client.api_client.do.assert_called_with(method='HEAD',
+                                               path='/api/2.0/fs/files/Volumes/catalog/schema/volume/train.txt',
+ headers={'Source': 'mosaicml/composer'})
+
+
def test_get_uri(uc_object_store):
assert uc_object_store.get_uri('train.txt') == 'dbfs:/Volumes/catalog/schema/volume/train.txt'
assert uc_object_store.get_uri('Volumes/catalog/schema/volume/checkpoint/model.bin'
@@ -160,6 +167,49 @@ def generate_dummy_file(_):
raise NotImplementedError(f'Test for result={result} is not implemented.')
+def test_list_objects_nested_folders(ws_client, uc_object_store):
+ expected_files = [
+ '/Volumes/catalog/volume/schema/path/to/folder/file1.txt',
+ '/Volumes/catalog/volume/schema/path/to/folder/file2.txt',
+ '/Volumes/catalog/volume/schema/path/to/folder/subdir/file1.txt',
+ '/Volumes/catalog/volume/schema/path/to/folder/subdir/file2.txt',
+ ]
+ uc_list_api_responses = [{
+ 'files': [{
+ 'path': '/Volumes/catalog/volume/schema/path/to/folder/file1.txt',
+ 'is_dir': False
+ }, {
+ 'path': '/Volumes/catalog/volume/schema/path/to/folder/file2.txt',
+ 'is_dir': False
+ }, {
+ 'path': '/Volumes/catalog/volume/schema/path/to/folder/subdir',
+ 'is_dir': True
+ }]
+ }, {
+ 'files': [{
+ 'path': '/Volumes/catalog/volume/schema/path/to/folder/subdir/file1.txt',
+ 'is_dir': False
+ }, {
+ 'path': '/Volumes/catalog/volume/schema/path/to/folder/subdir/file2.txt',
+ 'is_dir': False
+ }]
+ }]
+
+ prefix = 'Volumes/catalog/schema/volume/path/to/folder'
+
+ ws_client.api_client.do = MagicMock(side_effect=[uc_list_api_responses[0], uc_list_api_responses[1]])
+ actual_files = uc_object_store.list_objects(prefix=prefix)
+
+ assert actual_files == expected_files
+
+ ws_client.api_client.do.assert_called_with(method='GET',
+ path=uc_object_store._UC_VOLUME_LIST_API_ENDPOINT,
+ data='{"path": "/Volumes/catalog/volume/schema/path/to/folder/subdir"}',
+ headers={'Source': 'mosaicml/composer'})
+
+ assert ws_client.api_client.do.call_count == 2
+
+
@pytest.mark.parametrize('result', ['success', 'prefix_none', 'not_found', 'error'])
def test_list_objects(ws_client, uc_object_store, result):
expected_files = [
@@ -173,9 +223,6 @@ def test_list_objects(ws_client, uc_object_store, result):
}, {
'path': '/Volumes/catalog/volume/schema/path/to/folder/file2.txt',
'is_dir': False
- }, {
- 'path': '/Volumes/catalog/volume/schema/path/to/folder/samples/',
- 'is_dir': True
}]
}
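The nested-folder mock above encodes one level of recursion: a directory entry in the first response triggers a second list call, hence `call_count == 2`. A minimal sketch of that depth-first walk (`list_page` stands in for the UC files-list endpoint):

```python
from typing import Callable, Dict, List


def list_recursive(path: str, list_page: Callable[[str], Dict]) -> List[str]:
    files: List[str] = []
    for entry in list_page(path).get('files', []):
        if entry['is_dir']:
            files.extend(list_recursive(entry['path'], list_page))  # descend into subdir
        else:
            files.append(entry['path'])
    return files
```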
diff --git a/tests/utils/test_autolog_hparams.py b/tests/utils/test_autolog_hparams.py
index 7804d7bd80..4452f7bb65 100644
--- a/tests/utils/test_autolog_hparams.py
+++ b/tests/utils/test_autolog_hparams.py
@@ -10,7 +10,7 @@
from composer.loggers import InMemoryLogger
from composer.trainer import Trainer
from composer.utils import (StringEnum, convert_flat_dict_to_nested_dict, convert_nested_dict_to_flat_dict,
- extract_hparams, using_torch_2)
+ extract_hparams)
from tests.common.datasets import RandomClassificationDataset
from tests.common.models import SimpleModel
@@ -146,7 +146,6 @@ def test_extract_hparams_trainer():
# Compile
'compile_config': None,
'is_model_compiled': False,
- 'is_torch_2_0': using_torch_2(),
# Load Checkpoint
'load_path': None,
@@ -164,6 +163,7 @@ def test_extract_hparams_trainer():
'save_overwrite': False,
'save_interval': '1ep',
'save_weights_only': False,
+ 'save_ignore_keys': None,
'save_num_checkpoints_to_keep': -1,
'save_metrics': False,
diff --git a/tests/utils/test_file_helpers.py b/tests/utils/test_file_helpers.py
index 2e757afbe4..7c4e470547 100644
--- a/tests/utils/test_file_helpers.py
+++ b/tests/utils/test_file_helpers.py
@@ -213,17 +213,6 @@ def test_safe_format_name_with_dist(monkeypatch: pytest.MonkeyPatch, world_size)
assert format_name_with_dist(format_str, 'awesome_run') == expected_str
-@world_size(2)
-def test_unsafe_format_name_with_dist(monkeypatch: pytest.MonkeyPatch, world_size):
- """Node rank is deleted, but also in the format string, so expect error."""
- vars = ['run_name', 'node_rank']
- format_str = ','.join(f'{x}={{{x}}}' for x in vars)
-
- monkeypatch.delenv('NODE_RANK')
- with pytest.raises(KeyError):
- assert format_name_with_dist(format_str, 'awesome_run') == 'run_name=awesome_run,node_rank=3'
-
-
def test_format_name_with_dist_and_time():
vars = [
'run_name',
@@ -341,7 +330,7 @@ def test_maybe_create_remote_uploader_downloader_from_uri(monkeypatch):
mock_remote_ud = MagicMock()
m.setattr(loggers, 'RemoteUploaderDownloader', mock_remote_ud)
maybe_create_remote_uploader_downloader_from_uri('gs://my-nifty-gs-bucket/path/to/checkpoints.pt', loggers=[])
- mock_remote_ud.assert_called_once_with(bucket_uri='gs://my-nifty-gs-bucket'),
+ mock_remote_ud.assert_called_once_with(bucket_uri='gs://my-nifty-gs-bucket')
with pytest.raises(NotImplementedError):
maybe_create_remote_uploader_downloader_from_uri('wandb://my-cool/checkpoint/for/my/model.pt', loggers=[])
@@ -357,7 +346,9 @@ def test_maybe_create_remote_uploader_downloader_from_uri(monkeypatch):
backend_kwargs={'path': 'Volumes/checkpoint/for/my/model.pt'})
with pytest.raises(ValueError):
- maybe_create_remote_uploader_downloader_from_uri('dbfs:/checkpoint/for/my/model.pt', loggers=[])
+ rud = maybe_create_remote_uploader_downloader_from_uri('dbfs:/checkpoint/for/my/model.pt', loggers=[])
+ assert rud is not None
+ _ = rud.remote_backend
def test_ensure_folder_is_empty(tmp_path: pathlib.Path):
diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py
index e43aa184b7..789ad3c136 100644
--- a/tests/utils/test_inference.py
+++ b/tests/utils/test_inference.py
@@ -20,7 +20,6 @@
from composer.functional import apply_gated_linear_units
from composer.loggers import InMemoryLogger, Logger
from composer.loggers.logger_destination import LoggerDestination
-from composer.models import composer_resnet
from composer.trainer.dist_strategy import prepare_ddp_module
from composer.trainer.trainer import Trainer
from composer.utils import dist, export_with_logger, inference
@@ -28,7 +27,7 @@
from tests.common import SimpleTransformerClassifier, device
from tests.common.datasets import (RandomImageDataset, dummy_text_classification_dataloader, dummy_tiny_bert_lm_batch,
dummy_transformer_classifier_batch)
-from tests.common.models import configure_tiny_bert_hf_model
+from tests.common.models import composer_resnet, configure_tiny_bert_hf_model
class MockFileUploader(LoggerDestination):
@@ -212,11 +211,9 @@ def test_export_for_inference_onnx(model_cls, sample_input, onnx_opset_version,
if onnx_opset_version == None and version.parse(torch.__version__) < version.parse('1.13'):
pytest.skip("Don't test prior PyTorch version's default Opset version.")
- from composer.utils.misc import using_torch_2
- if using_torch_2():
- pytest.xfail(
- 'torch.onnx.errors.UnsupportedOperatorError: Exporting the operator "aten::unflatten" to ONNX opset version 14 is not supported.'
- )
+ pytest.xfail(
+ 'torch.onnx.errors.UnsupportedOperatorError: Exporting the operator "aten::unflatten" to ONNX opset version 14 is not supported.'
+ )
import onnx
import onnx.checker
@@ -328,11 +325,9 @@ def test_export_for_inference_onnx_ddp(model_cls, sample_input, onnx_opset_versi
pytest.importorskip('onnx')
pytest.importorskip('onnxruntime')
- from composer.utils.misc import using_torch_2
- if using_torch_2():
- pytest.xfail(
- 'torch.onnx.errors.UnsupportedOperatorError: Exporting the operator "aten::unflatten" to ONNX opset version 14 is not supported.'
- )
+ pytest.xfail(
+ 'torch.onnx.errors.UnsupportedOperatorError: Exporting the operator "aten::unflatten" to ONNX opset version 14 is not supported.'
+ )
if onnx_opset_version == None and version.parse(torch.__version__) < version.parse('1.13'):
pytest.skip("Don't test prior PyTorch version's default Opset version.")
diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py
new file mode 100644
index 0000000000..333262795d
--- /dev/null
+++ b/tests/utils/test_misc.py
@@ -0,0 +1,22 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+from composer.utils.misc import partial_format
+
+
+def test_partial_format():
+ # No args provided
+ assert partial_format('{foo} {bar} {}') == '{foo} {bar} {}'
+
+ # Keyword args
+ assert partial_format('{foo} {bar}', foo='Hello') == 'Hello {bar}'
+ assert partial_format('{foo} {bar}', foo='Hello', bar='World') == 'Hello World'
+
+ # Positional args
+ assert partial_format('{} {}', 'Hello') == 'Hello {}'
+ assert partial_format('{} {}', 'Hello', 'World') == 'Hello World'
+
+ # Positional and keyword args
+ assert partial_format('{foo} {}', 'World') == '{foo} World'
+ assert partial_format('{foo} {}', foo='Hello') == 'Hello {}'
+ assert partial_format('{foo} {}', 'World', foo='Hello') == 'Hello World'
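For context, a hedged sketch of a `partial_format` that satisfies every assertion above (retry-based; Composer's actual implementation may differ): substitute what was supplied, and re-insert literal `{...}` placeholders for anything missing so a later `format` call can finish the job.

```python
def partial_format(s: str, *args, **kwargs) -> str:
    """Format ``s``, leaving any unmatched fields as literal placeholders."""
    args = list(args)
    while True:
        try:
            return s.format(*args, **kwargs)
        except IndexError:  # ran out of positional args; keep a literal '{}'
            args.append('{}')
        except KeyError as e:  # unknown keyword field; keep '{name}' literally
            key = e.args[0]
            kwargs[key] = '{' + key + '}'
```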