
Merge branch 'main' into remote-hf-generate
samhavens committed Dec 11, 2023
2 parents ada5f21 + 34ec2f7 commit 854b938
Showing 25 changed files with 439 additions and 94 deletions.
12 changes: 7 additions & 5 deletions .github/mcp/mcp_pytest.py
@@ -6,8 +6,8 @@
import argparse
import time

from mcli.sdk import (RunConfig, RunStatus, create_run, follow_run_logs,
wait_for_run_status)
from mcli import (RunConfig, RunStatus, create_run, follow_run_logs,
wait_for_run_status)

if __name__ == '__main__':

@@ -107,9 +107,11 @@

config = RunConfig(
name=name,
cluster=args.cluster,
gpu_type=args.gpu_type,
gpu_num=args.gpu_num,
compute={
'cluster': args.cluster,
'gpu_type': args.gpu_type,
'gpus': args.gpu_num
},
image=args.image,
integrations=[git_integration],
command=command,
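For reference, a minimal sketch of the SDK pattern this hunk migrates to: importing from `mcli` rather than `mcli.sdk`, and nesting the scheduling fields under a `compute` dict instead of passing them as top-level keyword arguments. All literal values are placeholders, and the wait call at the end mirrors the script's imports but its exact usage here is an assumption.

```python
# Sketch of the updated pattern adopted in this hunk (values are placeholders).
from mcli import RunConfig, RunStatus, create_run, wait_for_run_status

config = RunConfig(
    name='mcp-pytest-example',                 # placeholder run name
    compute={
        'cluster': 'my-cluster',               # formerly the top-level cluster= kwarg
        'gpu_type': 'a100_80gb',               # formerly gpu_type=
        'gpus': 8,                             # formerly gpu_num=
    },
    image='mosaicml/llm-foundry:2.1.0_cu121_flash2-latest',
    command='pytest -m "not gpu" tests/',      # placeholder command
)

run = create_run(config)
run = wait_for_run_status(run, RunStatus.RUNNING)  # assumed call pattern
```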
6 changes: 0 additions & 6 deletions .github/workflows/docker.yaml
@@ -17,12 +17,6 @@ jobs:
strategy:
matrix:
include:
- name: '1.13.1_cu117'
base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.0.1_cu118'
base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu]'
4 changes: 0 additions & 4 deletions .github/workflows/pr-cpu.yaml
@@ -19,10 +19,6 @@ jobs:
strategy:
matrix:
include:
- name: 'cpu-1.13.1'
container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
markers: 'not gpu'
pytest_command: 'coverage run -m pytest'
- name: 'cpu-2.1.0'
container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
markers: 'not gpu'
5 changes: 0 additions & 5 deletions .github/workflows/pr-gpu.yaml
@@ -19,11 +19,6 @@ jobs:
strategy:
matrix:
include:
- name: 'gpu-1.13.1'
container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
deps_group: 'all'
- name: 'gpu-2.1.0'
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
markers: 'gpu'
15 changes: 2 additions & 13 deletions README.md
@@ -85,21 +85,14 @@ Something missing? Contribute with a PR!


# Hardware and Software Requirements
This codebase has been tested with PyTorch 1.13.1 and PyTorch 2.0.1 on systems with NVIDIA A100s and H100s.
This codebase has been tested with PyTorch 2.1 with NVIDIA A100s and H100s.
This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems.
If you have success/failure using LLM Foundry on other systems, please let us know in a Github issue and we will update the support matrix!

| Device | Torch Version | Cuda Version | Status |
| -------------- | ------------- | ------------ | ---------------------------- |
| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |

## MosaicML Docker Images
We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
@@ -113,11 +106,7 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117

| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
| ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 (Infiniband) | No |
| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 (Infiniband) | No |
| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 (Infiniband) | No |
| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 (Infiniband) | Yes |
| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 (Infiniband) | Yes |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2) |
| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1) |
2 changes: 1 addition & 1 deletion llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -59,7 +59,7 @@ class EvalGauntlet(Callback):
logged under in the logger after eval
categories (dict): This contains the list of categories, as well as the subtasks within them, the
random baseline accuracy of each subtask, and the number of fewshot examples
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet_v0.2.yaml` to see the structure.
weighting (Weighting): The weighting scheme used to balance different tasks within each category.
Either assign them all equal weight, assign them weight proportional
to the dataset size, or assign them weight proportional to the log2 of the dataset size.
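The docstring points at `llmfoundry/scripts/eval/yamls/eval_gauntlet_v0.2.yaml` for the exact schema. As a rough orientation only, the `categories` argument is a nested structure along these lines; the field names below are illustrative assumptions and should be checked against that YAML rather than taken as the real schema.

```python
# Illustrative shape only -- the authoritative schema lives in
# scripts/eval/yamls/eval_gauntlet_v0.2.yaml; field names here are assumptions.
categories = [
    {
        'name': 'world_knowledge',        # category aggregated and logged after eval
        'benchmarks': [
            {
                'name': 'jeopardy',       # subtask within the category
                'num_fewshot': 10,        # fewshot examples used for the task
                'random_baseline': 0.0,   # random-guess accuracy for the subtask
            },
        ],
    },
]
```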
40 changes: 30 additions & 10 deletions llmfoundry/data/packing.py
@@ -1,14 +1,18 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import logging
import os
import tempfile
from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple

import numpy as np
import torch
from composer.utils import using_torch_2
from omegaconf import DictConfig
from transformers import PreTrainedTokenizerBase

log = logging.getLogger(__name__)


class BinPackCollator:
"""Utility collator for packing to reduce padding."""
@@ -290,8 +294,13 @@ def auto_packing_ratio(dataloader_cfg: DictConfig,
# Set the seed so that auto packing is deterministic.
reproducibility.seed_all(0)

max_seq_len = dataloader_cfg.dataset.max_seq_len
# If max_seq_len is very small, skip profiling and select packing ratio of 1.
if max_seq_len <= 100:
return 1

min_ratio = 1
max_ratio = dataloader_cfg.dataset.max_seq_len / 100
max_ratio = max_seq_len / 100
profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio,
max_ratio, num_packing_ratios,
device_batch_size)
@@ -300,7 +309,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig,
# profiling_results are sorted from smallest to largest packing_ratio.
packing_ratio = 1
for packing_ratio_candidate, _, waste in profiling_results:
if waste > 0:
if waste is None or waste > 0:
break
packing_ratio = packing_ratio_candidate

@@ -319,9 +328,10 @@ def auto_packing_ratio(dataloader_cfg: DictConfig,


def profile_packing(
dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
min_ratio: float, max_ratio: float, num_packing_ratios: int,
device_batch_size: int) -> Iterable[Tuple[float, float, float]]:
dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
min_ratio: float, max_ratio: float, num_packing_ratios: int,
device_batch_size: int
) -> Iterable[Tuple[float, Optional[float], Optional[float]]]:
"""Generator function that profiles example packing across packing ratios.
Args:
@@ -348,9 +358,13 @@ def profile_packing(
dataloader_cfg.dataset.packing_ratio = None
dataloader_cfg.drop_last = False
dataloader_cfg.num_workers = 0
dataloader_cfg.prefetch_factor = None if using_torch_2() else 2
dataloader_cfg.prefetch_factor = None
dataloader_cfg.persistent_workers = False

# If streaming dataset, use a temporary local folder for profiling
if dataloader_cfg.dataset.get('remote') is not None:
dataloader_cfg.dataset.local = tempfile.TemporaryDirectory().name

# Determine the packing_ratio values we'll try
packing_ratios, raw_batch_sizes = [], []
for packing_ratio in np.linspace(min_ratio,
@@ -383,7 +397,7 @@ def split_big_batch(raw_batch_size: int) -> List:
batches[idx].update({key: split})
return batches

def profile(raw_batch_size: int) -> Tuple[float, float]:
def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]:
packer = BinPackCollator(
collator=lambda x: x,
target_batch_size=device_batch_size,
@@ -396,9 +410,15 @@ def profile(raw_batch_size: int) -> Tuple[float, float]:
for batch in split_big_batch(raw_batch_size):
if batch['input_ids'].shape[0] < device_batch_size:
continue
_ = packer.pack(batch)
packer.pack(batch)

if packer.n_packed_examples == 0:
log.debug(
'No examples packed during profiling. Dataset is smaller than device batch size.'
)
return None, None

# Return the padding / waste stats over that bunch of data
# Return the padding and waste stats over that bunch of data
padding_percent = 100 * (1 - packer.efficiency)
waste_percent = 100 * packer.waste
return padding_percent, waste_percent
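To make the new guard concrete, here is a small self-contained sketch (not part of the change) of how the selection loop in `auto_packing_ratio` behaves when `profile` returns `(None, None)` because nothing could be packed: the search stops at the last ratio that packed with zero waste. The sample numbers are invented.

```python
# Illustrative only: mirrors the selection loop shown in auto_packing_ratio.
# (packing_ratio, padding_percent, waste_percent) tuples; values are made up.
profiling_results = [
    (1.0, 35.0, 0.0),
    (2.0, 12.0, 0.0),
    (3.0, 4.0, None),   # profile() returned (None, None): nothing was packed
    (4.0, 2.0, 1.5),
]

packing_ratio = 1
for packing_ratio_candidate, _, waste in profiling_results:
    if waste is None or waste > 0:
        break
    packing_ratio = packing_ratio_candidate

print(packing_ratio)  # 2.0 -- the largest ratio with zero waste before the guard trips
```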
9 changes: 8 additions & 1 deletion llmfoundry/tokenizers/tiktoken.py
@@ -61,6 +61,7 @@ def __init__(self,
eos_token: Optional[str] = '<|endoftext|>',
bos_token: Optional[str] = '<|endoftext|>',
pad_token: Optional[str] = None,
errors: str = 'replace',
**kwargs: Any):
"""Constructor creates a tiktoken tokenizer to use as the underlying.
@@ -78,6 +79,9 @@ def __init__(self,
eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
pad_token (Optional[str], optional): The pad token. Defaults to None.
errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
Defaults to `"replace"`.
"""
try:
import tiktoken
@@ -126,6 +130,7 @@ def pickle_Encoding(enc: Encoding):

self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
self.errors = errors

self.decoder: Dict[int, str] = {}
for i in range(self.encoding.n_vocab):
@@ -155,6 +160,7 @@ def pickle_Encoding(enc: Encoding):
eos_token=eos_token,
bos_token=bos_token,
pad_token=pad_token,
errors=errors,
**kwargs)

@property
@@ -252,7 +258,8 @@ def _convert_id_to_token(self, index: int) -> Optional[str]:
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (string) in a single string."""
text = ''.join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8')
text = bytearray([self.byte_decoder[c] for c in text
]).decode('utf-8', errors=self.errors)
return text

def build_inputs_with_special_tokens(
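For context on what the new `errors` argument actually controls, here is a small standalone illustration of the standard-library behavior that `convert_tokens_to_string` now exposes; it uses plain `bytes.decode` and is not part of the change itself.

```python
# Standalone illustration of bytes.decode error handling (not from the diff).
# b'\xe2\x82' is a truncated UTF-8 sequence (the first two bytes of '€').
broken = bytes([0xE2, 0x82])

# errors='replace' (the wrapper's new default) substitutes U+FFFD instead of failing.
print(broken.decode('utf-8', errors='replace'))

# The strict default raises, which is what the old hard-coded decode('utf-8') did.
try:
    broken.decode('utf-8')
except UnicodeDecodeError as exc:
    print(f'strict decode raised: {exc}')
```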
4 changes: 2 additions & 2 deletions mcli/mcli-1b-eval.yaml
@@ -55,5 +55,5 @@ parameters:
forward_prefetch: True
limit_all_gathers: True

icl_tasks: 'eval/yamls/tasks.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
6 changes: 3 additions & 3 deletions mcli/mcli-hf-eval.yaml
@@ -16,7 +16,7 @@ gpu_num: 8
# gpu_type:
# cluster: # replace with your cluster here!

image: mosaicml/llm-foundry:2.0.1_cu118-latest
image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
@@ -50,5 +50,5 @@ parameters:
limit_all_gathers: True


icl_tasks: 'eval/yamls/tasks.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
2 changes: 1 addition & 1 deletion mcli/mcli-openai-eval.yaml
@@ -16,7 +16,7 @@ run_name: openai-eval
# gpu_type: #
cluster: # replace with your cluster here!

image: mosaicml/llm-foundry:2.0.1_cu118-latest
image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest

# The below is injected as a YAML file: /mnt/config/parameters.yaml
parameters:
6 changes: 3 additions & 3 deletions scripts/eval/README.md
@@ -27,7 +27,7 @@ composer eval/eval.py eval/yamls/hf_eval.yaml \
model_name_or_path=mosaicml/mpt-7b
```

You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet.yaml`.
You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet_v0.2.yaml`.


### Evaluation during training
@@ -38,7 +38,7 @@ To run evaluation during training, download this repo, follow the instructions i
cd llm-foundry/scripts/train
composer train.py yamls/pretrain/mpt-125m_eval.yaml train_loader.dataset.split=train_small eval_loader.dataset.split=val_small
```
You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet.yaml`. You can also choose to either run the full evaluation or run on a subset number of batches per benchmark by setting `icl_subset_num_batches`.
You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet_v0.2.yaml`. You can also choose to either run the full evaluation or run on a subset number of batches per benchmark by setting `icl_subset_num_batches`.

----
## In-depth walkthrough
@@ -131,7 +131,7 @@ An example is given below:
```
icl_tasks: eval/yamls/tasks.yaml # or use tasks_light.yaml
icl_subset_num_batches: 100 # -1, or omit this key entirely, to evaluate on all batches
eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
icl_seq_len: 1024
```


