From 55da325899170c20dacc9034cac6429225175db1 Mon Sep 17 00:00:00 2001
From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com>
Date: Wed, 21 Jun 2023 09:25:43 -0700
Subject: [PATCH 1/9] updt composer to 0.15.0 (#347)

* updt composer

* updt test

* Update test_tasks.yaml

* updt with jeremies updt

* updt eval tasks yamls

---------

Co-authored-by: Jeremy D <115047575+bmosaicml@users.noreply.github.com>
---
 scripts/eval/yamls/tasks.yaml       | 12 ++++++------
 scripts/eval/yamls/tasks_light.yaml |  8 ++++----
 setup.py                            |  4 ++--
 tests/test_tasks.yaml               |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml
index 0ffdb4dbd6..5be9fe269c 100644
--- a/scripts/eval/yamls/tasks.yaml
+++ b/scripts/eval/yamls/tasks.yaml
@@ -4,7 +4,7 @@ icl_tasks:
   dataset_uri: eval/local_data/jeopardy_all.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: language_modeling
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
   has_categories: true
 -
   label: lambada_openai
@@ -16,7 +16,7 @@ icl_tasks:
   dataset_uri: eval/local_data/piqa.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: hellaswag
   dataset_uri: eval/local_data/hellaswag.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
 -
   label: arc_easy
@@ -27,13 +27,13 @@ icl_tasks:
   dataset_uri: eval/local_data/arc_easy.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: arc_challenge
   dataset_uri: eval/local_data/arc_challenge.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: copa
   dataset_uri: eval/local_data/copa.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
@@ -44,13 +44,13 @@ icl_tasks:
   dataset_uri: eval/local_data/boolq.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: mmlu
   dataset_uri: eval/local_data/mmlu.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
   has_categories: true
 -
   label: winograd
diff --git a/scripts/eval/yamls/tasks_light.yaml b/scripts/eval/yamls/tasks_light.yaml
index 66621e1be6..54580727a1 100644
--- a/scripts/eval/yamls/tasks_light.yaml
+++ b/scripts/eval/yamls/tasks_light.yaml
@@ -9,7 +9,7 @@ icl_tasks:
   dataset_uri: eval/local_data/piqa.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: hellaswag
   dataset_uri: eval/local_data/hellaswag.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
@@ -20,13 +20,13 @@ icl_tasks:
   dataset_uri: eval/local_data/arc_easy.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: arc_challenge
   dataset_uri: eval/local_data/arc_challenge.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
 -
   label: copa
   dataset_uri: eval/local_data/copa.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
@@ -37,4 +37,4 @@ icl_tasks:
   dataset_uri: eval/local_data/boolq.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1, 5, 10]
   icl_task_type: multiple_choice
-  continuation_delimiter: 'Answer: ' # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
diff --git a/setup.py b/setup.py
index e8f670f762..3302030fa6 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@
 ]

 install_requires = [
-    'composer[libcloud,nlp,wandb]>=0.14.1,<0.15',
+    'composer[libcloud,nlp,wandb]>=0.15.0,<0.16',
     'accelerate>=0.19,<0.20',  # for HF inference `device_map`
     'mosaicml-streaming>=0.5.1,<0.6',
     'torch>=1.13.1,<=2.0.1',
@@ -77,7 +77,7 @@
 ]

 extra_deps['tensorboard'] = [
-    'composer[tensorboard]>=0.14.1,<0.15',
+    'composer[tensorboard]>=0.15.0,<0.16',
 ]

 extra_deps['gpu'] = [
diff --git a/tests/test_tasks.yaml b/tests/test_tasks.yaml
index dae4b470bd..4298b3939c 100644
--- a/tests/test_tasks.yaml
+++ b/tests/test_tasks.yaml
@@ -4,7 +4,7 @@ icl_tasks:
   dataset_uri: scripts/eval/local_data/jeopardy_all.jsonl # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1]
   icl_task_type: language_modeling
-  continuation_delimiter: "Answer: " # this separates questions from answers
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
   has_categories: true
 -
   label: copa
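The continuation_delimiter change in the patch above is behavioral, not cosmetic: the ICL evaluation harness joins each example's context and its continuation with this string, so moving the newline into the delimiter places "Answer:" at the start of its own line instead of appending the answer to the end of the question text. A minimal sketch of that assembly, assuming a hypothetical build_prompt helper (not llm-foundry's actual API) and toy data:

    # Sketch of how a continuation delimiter shapes a few-shot prompt.
    # `build_prompt` is a hypothetical helper for illustration only.
    def build_prompt(examples, query, continuation_delimiter):
        parts = [f'{ctx}{continuation_delimiter}{cont}' for ctx, cont in examples]
        parts.append(f'{query}{continuation_delimiter}')  # model completes from here
        return '\n\n'.join(parts)

    few_shot = [('Q: In what year did WWII end?', '1945')]
    print(build_prompt(few_shot, 'Q: In what year was the moon landing?', '\nAnswer: '))

With "\nAnswer: " each question is followed by a newline and then the answer label; with the old 'Answer: ' the label ran on at the end of the question line.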
From da7fa6b907462a03af051ee6f1fe54d9fbfcf0f3 Mon Sep 17 00:00:00 2001
From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com>
Date: Wed, 21 Jun 2023 09:43:19 -0700
Subject: [PATCH 2/9] updt yml (#349)

---
 mcli/mcli-1b-eval.yaml           | 2 +-
 mcli/mcli-1b-max-seq-len-8k.yaml | 2 +-
 mcli/mcli-1b.yaml                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml
index cc5dd1cbb1..d5df9c902b 100644
--- a/mcli/mcli-1b-eval.yaml
+++ b/mcli/mcli-1b-eval.yaml
@@ -10,7 +10,7 @@ command: |
   cd llm-foundry/llmfoundry/icl_eval
   composer eval.py /mnt/config/parameters.yaml
 image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-name: mosaic-gpt-1b-eval
+name: mpt-1b-eval

 compute:
   gpus: 8  # Number of GPUs to use
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index d6b1ea2a9c..177aaee7aa 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -18,7 +18,7 @@ command: |
   --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py /mnt/config/parameters.yaml
 image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-name: mosaic-gpt-1b-ctx-8k-gpus-8
+name: mpt-1b-ctx-8k-gpus-8

 compute:
   gpus: 8  # Number of GPUs to use
diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml
index ccca8f26d7..1d7f4b0d13 100644
--- a/mcli/mcli-1b.yaml
+++ b/mcli/mcli-1b.yaml
@@ -22,7 +22,7 @@ command: |
     max_duration=100ba \
     eval_interval=0
 image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-name: mosaic-gpt-1b-gpus-8
+name: mpt-1b-gpus-8

 compute:
   gpus: 8  # Number of GPUs to use
From 619400a22a0c2b89270a16b51e0fdc10acee257d Mon Sep 17 00:00:00 2001
From: Evan Racah
Date: Wed, 21 Jun 2023 12:45:29 -0700
Subject: [PATCH 3/9] Fix bug with saving optimizer states with
 MonolithicCheckpointSaver Callback (#310)

* Fix bug with saving optimizer states with mono ckpt saver

* lint

---------

Co-authored-by: Abhi Venigalla <77638579+abhi-mosaic@users.noreply.github.com>
Co-authored-by: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com>
Co-authored-by: root
---
 .../callbacks/monolithic_ckpt_callback.py | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/llmfoundry/callbacks/monolithic_ckpt_callback.py b/llmfoundry/callbacks/monolithic_ckpt_callback.py
index 4e6bc4bf50..afca099832 100644
--- a/llmfoundry/callbacks/monolithic_ckpt_callback.py
+++ b/llmfoundry/callbacks/monolithic_ckpt_callback.py
@@ -8,7 +8,8 @@
 import torch
 from composer.core import Callback, State
-from composer.core.state import fsdp_state_dict_type_context
+from composer.core.state import (fsdp_get_optim_state_dict,
+                                 fsdp_state_dict_type_context)
 from composer.loggers import Logger
 from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader
 from composer.utils import (dist, format_name_with_dist_and_time, parse_uri,
@@ -79,13 +80,27 @@ def _save_checkpoint(self, state: State, logger: Logger):
             'state': state.state_dict(),
             'rng': reproducibility.get_rng_state()
         }
-        if not self.keep_optimizers:
-            state_dict['state'].pop('optimizers')
+        # Remove sharded model and optimizer state dicts
+        state_dict['state'].pop('optimizers')
+        state_dict['state'].pop('model')
+
+        # Add in unsharded model params.
         with fsdp_state_dict_type_context(state.model,
                                           state_dict_type='full'):
             state_dict['state']['model'] = state.model.state_dict()
-            if dist.get_global_rank() == 0:
-                torch.save(state_dict, save_path)
+
+        # Add in unsharded optimizer state dict.
+        if self.keep_optimizers:
+            optimizer = state.optimizers[0]
+            state_dict['state']['optimizers'] = {
+                type(optimizer).__qualname__:
+                    fsdp_get_optim_state_dict(state.model,
+                                              optimizer,
+                                              state_dict_type='full')
+            }
+        if dist.get_global_rank() == 0:
+            torch.save(state_dict, save_path)
+
         if self.upload_to_object_store and self.remote_ud is not None and dist.get_global_rank(
         ) == 0:
             remote_file_name = str(Path(save_dir) / Path(filename))
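The net effect of the fix above is a checkpoint in which both the model and optimizer entries hold full, unsharded state, with the optimizer state keyed by the optimizer class's __qualname__, and a single torch.save issued from global rank 0 outside the FSDP context. A runnable analogue of the resulting layout, with no FSDP or Composer involved (the toy model, optimizer, and file name are illustrative, and the 'rng' entry is omitted):

    import torch

    # Stand-ins for what the callback reads off Composer's `state`.
    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

    # Same nesting the patched callback writes: full model state plus
    # optimizer state keyed by the optimizer's __qualname__ ('AdamW').
    state_dict = {
        'state': {
            'model': model.state_dict(),
            'optimizers': {
                type(optimizer).__qualname__: optimizer.state_dict(),
            },
        },
    }
    torch.save(state_dict, 'mono_checkpoint.pt')  # rank 0 only under FSDP

    loaded = torch.load('mono_checkpoint.pt')
    assert 'AdamW' in loaded['state']['optimizers']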
From d848d3daf3be1dc55fbfaf5b092e560ba00ae3db Mon Sep 17 00:00:00 2001
From: bandish-shah <86627118+bandish-shah@users.noreply.github.com>
Date: Wed, 21 Jun 2023 14:00:42 -0700
Subject: [PATCH 4/9] Add step to free up some disk space on the worker (#350)

---
 .github/workflows/docker.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 67ccd5ddd1..28084b7fb4 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,6 +17,14 @@ jobs:
             base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04

     steps:
+      - name: Maximize Build Space on Worker
+        uses: easimon/maximize-build-space@v4
+        with:
+          overprovision-lvm: true
+          remove-dotnet: true
+          remove-android: true
+          remove-haskell: true
+
       - name: Checkout
         uses: actions/checkout@v3

From 2167c0e6fd0a7b837c230f26e8be974bb2951be6 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Wed, 21 Jun 2023 17:44:54 -0700
Subject: [PATCH 5/9] Filter out sequences where prompt is longer than max
 length, rather than dropping them on the fly later (#348)

* attempt fix for hf side

* fix

* fix cpu count

---
 llmfoundry/data/finetuning/dataloader.py |  5 ++++-
 llmfoundry/data/finetuning/tasks.py      | 20 ++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 9b27f4f0d0..0db8d98b49 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -145,7 +145,10 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer: Tokenizer,
         )

     else:
-        dataset = dataset_constructor.build_from_hf(cfg.dataset, tokenizer)
+        dataset = dataset_constructor.build_from_hf(
+            cfg.dataset,
+            max_seq_len=cfg.dataset.max_seq_len,
+            tokenizer=tokenizer)

     collate_fn, dataloader_batch_size = _build_collate_fn(
         cfg.dataset, tokenizer, device_batch_size)
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index 56be18532d..2ff151de0c 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -33,6 +33,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:

 import importlib
 import os
+import warnings
 from typing import Any, Callable, Dict, Optional, Union

 import datasets as hf_datasets
@@ -220,11 +221,15 @@ def get_preprocessing_fn_from_str(self,

         return preprocessing_fn

-    def build_from_hf(self, cfg: DictConfig, tokenizer: Tokenizer):
+    def build_from_hf(self, cfg: DictConfig, max_seq_len: int,
+                      tokenizer: Tokenizer):
         """Load a HuggingFace Datasets, preprocess, and tokenize.

+        Note: This function will drop examples where the prompt is longer than the max_seq_len
+
         Args:
             cfg (DictConfig): The dataset configuration.
+            max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
             tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.

         Returns:
@@ -248,9 +253,20 @@ def dataset_mapper(example: Dict):
             dataset_mapper,
             batched=False,
             remove_columns=columns_to_remove,
+            num_proc=max(os.cpu_count() - 2, 1),
         )
+        prompt_length_filtered_dataset = tokenized_dataset.filter(
+            lambda example: len(example['input_ids']) < max_seq_len,
+            num_proc=max(os.cpu_count() - 2, 1))
+
+        examples_removed = len(tokenized_dataset) - len(
+            prompt_length_filtered_dataset)
+        if examples_removed > 0:
+            warnings.warn(
+                f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}.'
+            )

-        return tokenized_dataset
+        return prompt_length_filtered_dataset

     def build_from_streaming(self, *args: Any, **kwargs: Any):
         return StreamingFinetuningDataset(*args, **kwargs)
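Although the next commit reverts this change, the mechanism is worth spelling out: the filter runs once over the tokenized dataset at preprocessing time, in parallel worker processes, rather than discarding over-length prompts batch by batch during training. A standalone sketch of the same idea against Hugging Face datasets directly, with toy data and an artificially small threshold:

    from datasets import Dataset

    max_seq_len = 4
    toy = Dataset.from_dict({'input_ids': [[1, 2], [9, 8, 7, 6, 5]]})

    # Keep only examples whose tokenized prompt fits under max_seq_len.
    filtered = toy.filter(lambda ex: len(ex['input_ids']) < max_seq_len)

    examples_removed = len(toy) - len(filtered)
    print(f'Dropped {examples_removed} of {len(toy)} examples at or over '
          f'{max_seq_len} tokens.')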
From 2f1bf410e5780fa4274dd1cbb66f44870ed3bc69 Mon Sep 17 00:00:00 2001
From: Cody Blakeney
Date: Wed, 21 Jun 2023 23:56:00 -0500
Subject: [PATCH 6/9] Revert "Filter out sequences where prompt is longer than
 max length, rather than dropping them on the fly later (#348)" (#354)

This reverts commit 2167c0e6fd0a7b837c230f26e8be974bb2951be6.

---
 llmfoundry/data/finetuning/dataloader.py |  5 +----
 llmfoundry/data/finetuning/tasks.py      | 20 ++------------------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 0db8d98b49..9b27f4f0d0 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -145,10 +145,7 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer: Tokenizer,
         )

     else:
-        dataset = dataset_constructor.build_from_hf(
-            cfg.dataset,
-            max_seq_len=cfg.dataset.max_seq_len,
-            tokenizer=tokenizer)
+        dataset = dataset_constructor.build_from_hf(cfg.dataset, tokenizer)

     collate_fn, dataloader_batch_size = _build_collate_fn(
         cfg.dataset, tokenizer, device_batch_size)
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index 2ff151de0c..56be18532d 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -33,7 +33,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:

 import importlib
 import os
-import warnings
 from typing import Any, Callable, Dict, Optional, Union

 import datasets as hf_datasets
@@ -221,15 +220,11 @@ def get_preprocessing_fn_from_str(self,

         return preprocessing_fn

-    def build_from_hf(self, cfg: DictConfig, max_seq_len: int,
-                      tokenizer: Tokenizer):
+    def build_from_hf(self, cfg: DictConfig, tokenizer: Tokenizer):
         """Load a HuggingFace Datasets, preprocess, and tokenize.

-        Note: This function will drop examples where the prompt is longer than the max_seq_len
-
         Args:
             cfg (DictConfig): The dataset configuration.
-            max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
             tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.

         Returns:
@@ -253,20 +248,9 @@ def dataset_mapper(example: Dict):
             dataset_mapper,
             batched=False,
             remove_columns=columns_to_remove,
-            num_proc=max(os.cpu_count() - 2, 1),
         )
-        prompt_length_filtered_dataset = tokenized_dataset.filter(
-            lambda example: len(example['input_ids']) < max_seq_len,
-            num_proc=max(os.cpu_count() - 2, 1))
-
-        examples_removed = len(tokenized_dataset) - len(
-            prompt_length_filtered_dataset)
-        if examples_removed > 0:
-            warnings.warn(
-                f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}.'
-            )
-        return prompt_length_filtered_dataset
+        return tokenized_dataset

     def build_from_streaming(self, *args: Any, **kwargs: Any):
         return StreamingFinetuningDataset(*args, **kwargs)

From af209b380cd1cd288b5c963cec04d1e20c3439c9 Mon Sep 17 00:00:00 2001
From: Sam Havens
Date: Wed, 21 Jun 2023 22:26:46 -0700
Subject: [PATCH 7/9] Remote JSONL IFT data (#275)

* support remote jsonl files for IFT datasets

* improve docstring

* add support for other extensions

* don't duplicate validation check

* build dataset before tmpdir deletes

* parse uri

* only rank 0 download

* only download rank 0

* better error

* break earlier

* log more

* more reasonable destination str

* use data files format

* name points to a preprocessing function I guess

* debugging

* always something with HF

* json vs jsonl [no-ci]

* if hf wants it local, make it local [no-ci]

* back to tempfile [no-ci]

* debug

* debug hfds [no-ci]

* ... [no-ci]

* don't rename file

* use tempfile again

* updt

---------

Co-authored-by: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com>
Co-authored-by: root
---
 llmfoundry/data/finetuning/dataloader.py | 54 ++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 9b27f4f0d0..4fcb7c4f25 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -2,10 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+import os
+import tempfile
 from typing import Union

 import torch
-from composer.utils import dist
+from composer.utils import dist, get_file, parse_uri
 from omegaconf import DictConfig
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -38,7 +40,9 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer: Tokenizer,
     ---
     *** HuggingFace dataset config fields ***
     cfg.dataset.hf_name (str, optional): The name of the HuggingFace dataset
-        to use.
+        to use. Can also be a remote http(s) directory or object store bucket
+        containing the file {split}.jsonl in the format (prompt, response),
+        in which case the builder will create a HuggingFace dataset.
     cfg.dataset.hf_kwargs (DictConfig, optional): Additional kwargs to pass
         to `datasets.load_dataset`, which can be used to load a dataset
         from local files.
@@ -145,7 +149,51 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer: Tokenizer,
         )

     else:
-        dataset = dataset_constructor.build_from_hf(cfg.dataset, tokenizer)
+        backend, _, _ = parse_uri(cfg.dataset.hf_name)
+        if backend not in ['', None]:
+            if cfg.dataset.get('split') is None:
+                raise ValueError(
+                    'When using a HuggingFace dataset from a URL, you must set the ' + \
+                    '`split` key in the dataset config.'
+                )
+            supported_extensions = ['jsonl', 'csv', 'parquet']
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                for extension in supported_extensions:
+                    name = f'{cfg.dataset.hf_name.strip("/")}/{cfg.dataset.split}.{extension}'
+                    destination = str(
+                        os.path.abspath(
+                            f'{tmp_dir}/{cfg.dataset.split}.{extension}'))
+                    try:
+                        with dist.run_local_rank_zero_first():
+                            get_file(name, destination, overwrite=True)
+                    except FileNotFoundError as e:
+                        if extension == supported_extensions[-1]:
+                            raise FileNotFoundError(
+                                f'Could not find a {cfg.dataset.split} file with any of ' + \
+                                f'the supported extensions: {supported_extensions}\n' + \
+                                f'at {cfg.dataset.hf_name}/{cfg.dataset.split}'
+                            ) from e
+                        else:
+                            print(
+                                f'Could not find {name}, looking for another extension'
+                            )
+                        continue
+                    # 'json' causes special behavior in the dataset constructor
+                    cfg.dataset.hf_name = extension if extension != 'jsonl' else 'json'
+                    kwargs = cfg.dataset.get('hf_kwargs', {})
+                    kwargs['data_files'] = destination
+                    cfg.dataset['hf_kwargs'] = kwargs
+                    print(cfg.dataset)
+                    dataset = dataset_constructor.build_from_hf(
+                        cfg.dataset,
+                        tokenizer=tokenizer,
+                    )
+                    break
+        else:
+            dataset = dataset_constructor.build_from_hf(
+                cfg.dataset,
+                tokenizer=tokenizer,
+            )

     collate_fn, dataloader_batch_size = _build_collate_fn(
         cfg.dataset, tokenizer, device_batch_size)
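In use, the new code path is driven entirely by the dataset config: when hf_name parses to a remote backend, split becomes mandatory, and the loader looks for {split}.jsonl, .csv, or .parquet in turn, downloading the first match into a temporary directory and loading it through data_files. A sketch of a config fragment that would exercise this path (the bucket URI is a placeholder, and the other dataloader fields a real config needs are omitted):

    from omegaconf import OmegaConf

    # Hypothetical dataset config; 's3://my-bucket/my-ift-data' is a placeholder
    # for a remote directory containing train.jsonl with prompt/response pairs.
    cfg = OmegaConf.create({
        'dataset': {
            'hf_name': 's3://my-bucket/my-ift-data',
            'split': 'train',  # required whenever hf_name is remote
            'max_seq_len': 2048,
        },
    })

    # Passed to build_finetuning_dataloader, this would fetch train.jsonl and
    # hand it to the dataset constructor as hf_name='json' with data_files set.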
From 7731106750ba3bef965ae3c5a32e8566aea5493c Mon Sep 17 00:00:00 2001
From: Abhi Venigalla <77638579+abhi-mosaic@users.noreply.github.com>
Date: Thu, 22 Jun 2023 09:04:25 -0600
Subject: [PATCH 8/9] Add MPT-30B to README (#356)

---
 README.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index aba62029cb..2acdcf882c 100644
--- a/README.md
+++ b/README.md
@@ -41,15 +41,18 @@ You'll find in this repo:

 # MPT

-MPT-7B is a GPT-style model, and the first in the MosaicML Foundation Series of models. Trained on 1T tokens of a MosaicML-curated dataset, MPT-7B is open-source, commercially usable, and equivalent to LLaMa 7B on evaluation metrics. The MPT architecture contains all the latest techniques on LLM modeling -- Flash Attention for efficiency, Alibi for context length extrapolation, and stability improvements to mitigate loss spikes. The base model and several variants, including a 64K context length fine-tuned model (!!) are all available:
+Mosaic Pretrained Transformers (MPT) are GPT-style models with some special features -- Flash Attention for efficiency, ALiBi for context length extrapolation, and stability improvements to mitigate loss spikes. As part of MosaicML's Foundation series, we have open-sourced several MPT models:

 | Model              | Context Length | Download                                           | Demo                                                              | Commercial use? |
 |--------------------|----------------|----------------------------------------------------|------------------------------------------------------------------|-----------------|
+| MPT-30B            | 8192           | https://huggingface.co/mosaicml/mpt-30b            |                                                                   | Yes             |
+| MPT-30B-Instruct   | 8192           | https://huggingface.co/mosaicml/mpt-30b-instruct   |                                                                   | Yes             |
+| MPT-30B-Chat       | 8192           | https://huggingface.co/mosaicml/mpt-30b-chat       | [Demo](https://huggingface.co/spaces/mosaicml/mpt-30b-chat)       | No              |
 | MPT-7B             | 2048           | https://huggingface.co/mosaicml/mpt-7b             |                                                                   | Yes             |
-| MPT-7B-Instruct    | 2048           | https://huggingface.co/mosaicml/mpt-7b-instruct    | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-instruct)    | Yes             |
+| MPT-7B-Instruct    | 2048           | https://huggingface.co/mosaicml/mpt-7b-instruct    |                                                                   | Yes             |
 | MPT-7B-Chat        | 2048           | https://huggingface.co/mosaicml/mpt-7b-chat        | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-chat)        | No              |
-| MPT-7B-StoryWriter | 65536          | https://huggingface.co/mosaicml/mpt-7b-storywriter | [Demo](https://huggingface.co/spaces/mosaicml/mpt-7b-storywriter) | Yes             |
+| MPT-7B-StoryWriter | 65536          | https://huggingface.co/mosaicml/mpt-7b-storywriter |                                                                   | Yes             |

 To try out these models locally, [follow the instructions](https://github.com/mosaicml/llm-foundry/tree/main/scripts/inference#interactive-generation-with-modelgenerate) in `scripts/inference/README.md` to prompt HF models using our [hf_generate.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_generate.py) or [hf_chat.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/inference/hf_chat.py) scripts.
@@ -71,6 +74,7 @@ Tutorial videos from the community:
 Something missing? Contribute with a PR!

 # Latest News
+* [Blog: MPT-30B: Raising the bar for open-source foundation models](https://www.mosaicml.com/blog/mpt-30b)
 * [Blog: Introducing MPT-7B](https://www.mosaicml.com/blog/mpt-7b)
 * [Blog: Benchmarking LLMs on H100](https://www.mosaicml.com/blog/coreweave-nvidia-h100-part-1)
 * [Blog: Blazingly Fast LLM Evaluation](https://www.mosaicml.com/blog/llm-evaluation-for-icl)
From 38361a6f56bf05a1d90d03b2d0bbf0a1cc28e7bc Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Thu, 22 Jun 2023 09:34:36 -0700
Subject: [PATCH 9/9] Codeql on PRs (#352)

* Codeql on PRs

* allow empty

---
 .github/workflows/codeql-analysis.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 07197a82e1..7fb270db97 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -14,6 +14,9 @@ name: 'CodeQL'
 on:
   push:
     branches: [main]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [main]
   schedule:
     - cron: '0 9 * * 1'  # Every Monday at 09:00 (9:00 AM)