From 495218376830947dce625efabc4d86be7c47cc45 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:58:33 -0700 Subject: [PATCH 1/3] Clean up the publicly exported API (#1128) --- llmfoundry/__init__.py | 52 ++++++++----------- llmfoundry/callbacks/async_eval_callback.py | 2 + .../callbacks/curriculum_learning_callback.py | 2 + llmfoundry/callbacks/fdiff_callback.py | 2 + llmfoundry/callbacks/hf_checkpointer.py | 2 + .../log_mbmoe_tok_per_expert_callback.py | 4 +- .../callbacks/monolithic_ckpt_callback.py | 2 + llmfoundry/callbacks/resumption_callbacks.py | 4 ++ llmfoundry/callbacks/scheduled_gc_callback.py | 2 + llmfoundry/data/__init__.py | 15 +++++- llmfoundry/data/data.py | 5 ++ llmfoundry/data/dataloader.py | 4 ++ llmfoundry/data/finetuning/__init__.py | 13 ++++- llmfoundry/data/finetuning/collator.py | 4 ++ llmfoundry/data/finetuning/dataloader.py | 4 ++ llmfoundry/data/finetuning/tasks.py | 7 ++- llmfoundry/data/packing.py | 6 +++ llmfoundry/data/text_data.py | 7 +++ llmfoundry/eval/__init__.py | 29 +++++++++++ llmfoundry/eval/datasets/__init__.py | 17 +++--- .../in_context_learning_evaluation.py | 37 ++++++------- llmfoundry/eval/datasets/utils.py | 12 ++++- llmfoundry/eval/metrics/__init__.py | 6 +-- llmfoundry/eval/metrics/nlp.py | 6 +-- llmfoundry/metrics/__init__.py | 3 -- llmfoundry/models/hf/__init__.py | 2 + llmfoundry/models/hf/hf_fsdp.py | 8 ++- llmfoundry/models/hf/model_wrapper.py | 2 + .../models/inference_api_wrapper/__init__.py | 6 ++- .../models/inference_api_wrapper/fmapi.py | 1 + .../models/inference_api_wrapper/interface.py | 2 + .../inference_api_wrapper/openai_causal_lm.py | 1 + llmfoundry/models/layers/__init__.py | 33 +++++++++--- llmfoundry/models/layers/attention.py | 12 +++++ llmfoundry/models/layers/blocks.py | 5 ++ llmfoundry/models/layers/custom_embedding.py | 2 + llmfoundry/models/layers/dmoe.py | 8 +++ llmfoundry/models/layers/ffn.py | 11 ++++ llmfoundry/models/layers/layer_builders.py | 7 +++ llmfoundry/models/layers/norm.py | 8 +++ llmfoundry/models/utils/__init__.py | 6 +++ llmfoundry/models/utils/act_ckpt.py | 8 ++- llmfoundry/models/utils/config_moe_args.py | 8 ++- llmfoundry/models/utils/meta_init_context.py | 5 ++ llmfoundry/models/utils/mpt_param_count.py | 5 ++ llmfoundry/models/utils/param_init_fns.py | 4 ++ llmfoundry/optim/__init__.py | 1 + llmfoundry/optim/adaptive_lion.py | 5 ++ llmfoundry/optim/lion.py | 4 ++ llmfoundry/optim/outlier_detection.py | 2 + llmfoundry/tokenizers/tiktoken.py | 4 ++ llmfoundry/utils/__init__.py | 44 +++++++++++----- llmfoundry/utils/data_prep_utils.py | 3 +- llmfoundry/utils/exceptions.py | 24 +++++++++ llmfoundry/utils/model_download_utils.py | 2 +- llmfoundry/utils/mosaicml_logger_utils.py | 7 +++ llmfoundry/utils/registry_utils.py | 8 ++- llmfoundry/utils/warnings.py | 3 ++ mcli/README.md | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 2 +- .../data_prep/convert_finetuning_dataset.py | 2 +- scripts/data_prep/convert_text_to_mds.py | 16 +++--- scripts/eval/README.md | 2 +- scripts/inference/benchmarking/README.md | 4 +- scripts/inference/convert_hf_to_onnx.py | 2 +- scripts/inference/endpoint_generate.py | 2 +- scripts/inference/run_mpt_with_ft.py | 2 +- scripts/train/README.md | 4 +- scripts/train/benchmarking/README.md | 6 +-- scripts/train/benchmarking/collect_results.py | 4 +- .../train/benchmarking/submit_benchmarks.py | 2 +- .../yamls/finetune/1b_local_data_sft.yaml | 2 +- scripts/train/yamls/pretrain/mpt-1b.yaml | 2 +- tests/data/test_dataloader.py 
| 3 +- 74 files changed, 425 insertions(+), 130 deletions(-) diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index 29b6552f01..012147ec20 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -19,47 +19,39 @@ hf_dynamic_modules_logger.addFilter(new_files_warning_filter) -from llmfoundry import algorithms, callbacks, loggers, optim, registry, utils -from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset, - Seq2SeqFinetuningCollator, - build_finetuning_dataloader) -from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5 -from llmfoundry.models.layers.attention import ( - MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias, - flash_attn_fn, scaled_multihead_dot_product_attention) -from llmfoundry.models.layers.blocks import MPTBlock -from llmfoundry.models.layers.ffn import MPTMLP +from llmfoundry import (algorithms, callbacks, cli, data, eval, interfaces, + loggers, metrics, models, optim, tokenizers, utils) +from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset +from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric +from llmfoundry.models.hf import ComposerHFCausalLM from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig, MPTForCausalLM, MPTModel, MPTPreTrainedModel) -from llmfoundry.tokenizers import TiktokenTokenizerWrapper +from llmfoundry.optim import DecoupledLionW __all__ = [ - 'build_finetuning_dataloader', - 'Seq2SeqFinetuningCollator', - 'MPTBlock', - 'MPTMLP', + 'StreamingFinetuningDataset', + 'StreamingTextDataset', + 'InContextLearningDataset', + 'InContextLearningMetric', + 'ComposerHFCausalLM', 'MPTConfig', 'MPTPreTrainedModel', 'MPTModel', 'MPTForCausalLM', 'ComposerMPTCausalLM', - 'ComposerHFCausalLM', - 'ComposerHFT5', - 'scaled_multihead_dot_product_attention', - 'flash_attn_fn', - 'MultiheadAttention', - 'NoConcatDataset', - 'ConcatTokensDataset', - 'attn_bias_shape', - 'build_attn_bias', - 'build_alibi_bias', - 'optim', - 'utils', - 'loggers', + 'DecoupledLionW', 'algorithms', 'callbacks', - 'TiktokenTokenizerWrapper', - 'registry', + 'cli', + 'data', + 'eval', + 'interfaces', + 'loggers', + 'metrics', + 'models', + 'optim', + 'tokenizers', + 'utils', ] __version__ = '0.8.0.dev0' diff --git a/llmfoundry/callbacks/async_eval_callback.py b/llmfoundry/callbacks/async_eval_callback.py index a976c08060..c261a2086b 100644 --- a/llmfoundry/callbacks/async_eval_callback.py +++ b/llmfoundry/callbacks/async_eval_callback.py @@ -27,6 +27,8 @@ log = logging.getLogger(__name__) +__all__ = ['AsyncEval'] + REQUIRED_PARAMS_FOR_EVAL = { 'device_eval_batch_size', 'icl_tasks', # only required for eval, may not be specified in pure training diff --git a/llmfoundry/callbacks/curriculum_learning_callback.py b/llmfoundry/callbacks/curriculum_learning_callback.py index 37faa14fdd..f00d68f760 100644 --- a/llmfoundry/callbacks/curriculum_learning_callback.py +++ b/llmfoundry/callbacks/curriculum_learning_callback.py @@ -20,6 +20,8 @@ log = logging.getLogger(__name__) +__all__ = ['CurriculumLearning'] + @experimental_class('CurriculumLearning callback') class CurriculumLearning(CallbackWithConfig): diff --git a/llmfoundry/callbacks/fdiff_callback.py b/llmfoundry/callbacks/fdiff_callback.py index 1237f32e22..2afcc94452 100644 --- a/llmfoundry/callbacks/fdiff_callback.py +++ b/llmfoundry/callbacks/fdiff_callback.py @@ -8,6 +8,8 @@ from composer.core import Callback, State from composer.loggers import Logger +__all__ = ['FDiffMetrics'] + class FDiffMetrics(Callback): """Rate of 
change of metrics. diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index f899206add..55dcb8b833 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -35,6 +35,8 @@ log = logging.getLogger(__name__) +__all__ = ['HuggingFaceCheckpointer'] + _LICENSE_FILE_PATTERN = re.compile(r'license(\.[a-z]+|$)', re.IGNORECASE) diff --git a/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py b/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py index fc906e0d87..89ee37cf0c 100644 --- a/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py +++ b/llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py @@ -9,6 +9,8 @@ from composer.loggers import Logger from composer.utils import dist +__all__ = ['MegaBlocksMoE_TokPerExpert'] + class MegaBlocksMoE_TokPerExpert(Callback): """Log tokens per expert for MegaBlocks MoE. @@ -44,7 +46,7 @@ class MegaBlocksMoE_TokPerExpert(Callback): Args: log_interval (int, optional): The interval on which to log (Default: 10). - log_every_layer (bool, optional): Enable logging ever layer's statisictics (True) or log + log_every_layer (bool, optional): Enable logging ever layer's statistics (True) or log only aggregate statistics (Default: False). all_reduce_stats (bool, optional): Enable aggregating statistics across gpus (True) or log statistics for GPU 0 (Default: False). diff --git a/llmfoundry/callbacks/monolithic_ckpt_callback.py b/llmfoundry/callbacks/monolithic_ckpt_callback.py index aaa68763f5..395a13111c 100644 --- a/llmfoundry/callbacks/monolithic_ckpt_callback.py +++ b/llmfoundry/callbacks/monolithic_ckpt_callback.py @@ -15,6 +15,8 @@ from composer.utils import (dist, format_name_with_dist_and_time, parse_uri, reproducibility) +__all__ = ['MonolithicCheckpointSaver'] + class MonolithicCheckpointSaver(Callback): """Save a monolithic checkpoint every N batches. diff --git a/llmfoundry/callbacks/resumption_callbacks.py b/llmfoundry/callbacks/resumption_callbacks.py index 751accc922..f910114a88 100644 --- a/llmfoundry/callbacks/resumption_callbacks.py +++ b/llmfoundry/callbacks/resumption_callbacks.py @@ -7,6 +7,8 @@ from composer.core import Callback, State from composer.loggers import Logger +from llmfoundry.utils.warnings import experimental_class + __all__ = [ 'GlobalLRScaling', 'LayerFreezing', @@ -15,6 +17,7 @@ log = logging.getLogger(__name__) +@experimental_class('GlobalLRScaling') class GlobalLRScaling(Callback): """GlobalLRScaling. @@ -52,6 +55,7 @@ def fit_start(self, state: State, logger: Logger) -> None: ] +@experimental_class('LayerFreezing') class LayerFreezing(Callback): """LayerFreezing. 
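The two callbacks above are now wrapped with `experimental_class`, imported from `llmfoundry.utils.warnings`. A minimal sketch of that decorator pattern follows; it is an illustration only, assuming the decorator simply emits an `ExperimentalWarning` when the decorated class is instantiated, and is not the actual implementation in `llmfoundry/utils/warnings.py`.

```python
import warnings
from typing import Any, Callable, Type, TypeVar

T = TypeVar('T')


class ExperimentalWarning(Warning):
    """Warning category for experimental features (assumed shape)."""


def experimental_class(feature_name: str) -> Callable[[Type[T]], Type[T]]:
    """Class decorator that warns whenever the decorated class is instantiated."""

    def decorator(cls: Type[T]) -> Type[T]:
        original_init = cls.__init__

        def warned_init(self: Any, *args: Any, **kwargs: Any) -> None:
            warnings.warn(
                f'{feature_name} is experimental and may change or be removed.',
                ExperimentalWarning,
            )
            original_init(self, *args, **kwargs)

        cls.__init__ = warned_init  # type: ignore[method-assign]
        return cls

    return decorator
```

Under this sketch, `@experimental_class('GlobalLRScaling')` leaves the callback's behavior unchanged apart from the warning emitted at construction.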
diff --git a/llmfoundry/callbacks/scheduled_gc_callback.py b/llmfoundry/callbacks/scheduled_gc_callback.py index 216fa2adb4..b210c99b16 100644 --- a/llmfoundry/callbacks/scheduled_gc_callback.py +++ b/llmfoundry/callbacks/scheduled_gc_callback.py @@ -8,6 +8,8 @@ from composer.core import Callback, State from composer.loggers import Logger +__all__ = ['ScheduledGarbageCollector'] + def gc_cuda(): """Garbage collect Torch (CUDA) memory.""" diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index 45d1f8237f..027ea7b07a 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -4,9 +4,14 @@ from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset from llmfoundry.data.dataloader import build_dataloader from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator, + StreamingFinetuningDataset, build_finetuning_dataloader) -from llmfoundry.data.text_data import (StreamingTextDataset, - build_text_dataloader) +from llmfoundry.data.packing import (BinPackCollator, auto_packing_ratio, + profile_packing) +from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper, + StreamingTextDataset, + build_text_dataloader, + get_tokens_per_batch_func) from llmfoundry.registry import dataloaders dataloaders.register('text', func=build_text_dataloader) @@ -15,9 +20,15 @@ __all__ = [ 'Seq2SeqFinetuningCollator', 'build_finetuning_dataloader', + 'StreamingFinetuningDataset', 'StreamingTextDataset', 'build_text_dataloader', 'NoConcatDataset', 'ConcatTokensDataset', 'build_dataloader', + 'BinPackCollator', + 'auto_packing_ratio', + 'profile_packing', + 'ConcatenatedSequenceCollatorWrapper', + 'get_tokens_per_batch_func', ] diff --git a/llmfoundry/data/data.py b/llmfoundry/data/data.py index 92e4538d73..c7b018c5fb 100644 --- a/llmfoundry/data/data.py +++ b/llmfoundry/data/data.py @@ -11,6 +11,11 @@ from torch.utils.data import IterableDataset from transformers import PreTrainedTokenizerBase +__all__ = [ + 'ConcatTokensDataset', + 'NoConcatDataset', +] + class NoConcatDataset(IterableDataset): """An IterableDataset that returns text samples for MDSWriter. 
diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index a98526001a..61471420f8 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -10,6 +10,10 @@ from llmfoundry import registry from llmfoundry.utils.registry_utils import construct_from_registry +__all__ = [ + 'build_dataloader', +] + def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int) -> DataSpec: diff --git a/llmfoundry/data/finetuning/__init__.py b/llmfoundry/data/finetuning/__init__.py index 9d10a17cfa..3b5c277199 100644 --- a/llmfoundry/data/finetuning/__init__.py +++ b/llmfoundry/data/finetuning/__init__.py @@ -3,5 +3,16 @@ from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader +from llmfoundry.data.finetuning.tasks import (StreamingFinetuningDataset, + dataset_constructor, + is_valid_ift_example, + tokenize_formatted_example) -__all__ = ['Seq2SeqFinetuningCollator', 'build_finetuning_dataloader'] +__all__ = [ + 'Seq2SeqFinetuningCollator', + 'build_finetuning_dataloader', + 'dataset_constructor', + 'tokenize_formatted_example', + 'is_valid_ift_example', + 'StreamingFinetuningDataset', +] diff --git a/llmfoundry/data/finetuning/collator.py b/llmfoundry/data/finetuning/collator.py index 6e3babd657..7d592483f1 100644 --- a/llmfoundry/data/finetuning/collator.py +++ b/llmfoundry/data/finetuning/collator.py @@ -10,6 +10,10 @@ log = logging.getLogger(__name__) +__all__ = [ + 'Seq2SeqFinetuningCollator', +] + # HuggingFace hardcodes the ignore index to -100 _HF_IGNORE_INDEX = -100 diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 1d8711d280..e72ee29719 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -23,6 +23,10 @@ log = logging.getLogger(__name__) +__all__ = [ + 'build_finetuning_dataloader', +] + # HuggingFace hardcodes the ignore index to -100 _HF_IGNORE_INDEX = -100 diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 25258a3f2b..e6a3afb188 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -72,7 +72,12 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: log = logging.getLogger(__name__) -__all__ = ['dataset_constructor'] +__all__ = [ + 'dataset_constructor', + 'tokenize_formatted_example', + 'is_valid_ift_example', + 'StreamingFinetuningDataset', +] _ALLOWED_RESPONSE_KEYS = {'response', 'completion'} _ALLOWED_PROMPT_KEYS = {'prompt'} diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 9696f967ca..3d525def47 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -13,6 +13,12 @@ log = logging.getLogger(__name__) +__all__ = [ + 'BinPackCollator', + 'auto_packing_ratio', + 'profile_packing', +] + class BinPackCollator: """Utility collator for packing to reduce padding.""" diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index fc31b890b0..a59098323b 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -22,6 +22,13 @@ log = logging.getLogger(__name__) +__all__ = [ + 'StreamingTextDataset', + 'build_text_dataloader', + 'ConcatenatedSequenceCollatorWrapper', + 'get_tokens_per_batch_func', +] + class StreamingTextDataset(StreamingDataset): """Generic text dataset using MosaicML's StreamingDataset. 
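With the data-module `__all__` lists above in place, the intended public import surface for data utilities looks like the following; this snippet only exercises names that the hunks above export and does not construct any objects.

```python
# Names re-exported at the package top level (see the llmfoundry/__init__.py hunk).
from llmfoundry import StreamingFinetuningDataset, StreamingTextDataset

# Names exported from the data subpackage (see llmfoundry/data/__init__.py).
from llmfoundry.data import (
    BinPackCollator,
    ConcatenatedSequenceCollatorWrapper,
    ConcatTokensDataset,
    NoConcatDataset,
    Seq2SeqFinetuningCollator,
    auto_packing_ratio,
    build_dataloader,
    build_finetuning_dataloader,
    build_text_dataloader,
    get_tokens_per_batch_func,
    profile_packing,
)
```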
diff --git a/llmfoundry/eval/__init__.py b/llmfoundry/eval/__init__.py index 80950cb7b4..f2425296b6 100644 --- a/llmfoundry/eval/__init__.py +++ b/llmfoundry/eval/__init__.py @@ -1,2 +1,31 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.eval.datasets.in_context_learning_evaluation import ( + InContextLearningCodeEvalDataset, InContextLearningDataset, + InContextLearningGenerationTaskWithAnswersDataset, + InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset, + InContextLearningSchemaTaskDataset, get_icl_task_dataloader) +from llmfoundry.eval.metrics.nlp import ( + InContextLearningCodeEvalAccuracy, + InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy, + InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, InContextLearningMetric, + InContextLearningMultipleChoiceAccuracy) + +__all__ = [ + 'InContextLearningDataset', + 'InContextLearningLMTaskDataset', + 'InContextLearningMultipleChoiceTaskDataset', + 'InContextLearningSchemaTaskDataset', + 'InContextLearningCodeEvalDataset', + 'InContextLearningGenerationTaskWithAnswersDataset', + 'get_icl_task_dataloader', + 'InContextLearningMetric', + 'InContextLearningLMAccuracy', + 'InContextLearningMultipleChoiceAccuracy', + 'InContextLearningGenerationExactMatchAccuracy', + 'InContextLearningCodeEvalAccuracy', + 'InContextLearningLMExpectedCalibrationError', + 'InContextLearningMCExpectedCalibrationError', +] diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py index e6a8b5222d..0be9882b0c 100644 --- a/llmfoundry/eval/datasets/__init__.py +++ b/llmfoundry/eval/datasets/__init__.py @@ -8,11 +8,13 @@ InContextLearningGenerationTaskWithAnswersDataset, InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset, InContextLearningSchemaTaskDataset, get_icl_task_dataloader) -from llmfoundry.eval.datasets.utils import (get_continuation_span, - get_fewshot_sample_idxs, - make_padded_input, strip_data, - tokenizer_needs_prefix_space, - trim_context) + +# isort: off +from llmfoundry.eval.datasets.utils import ( + MultiTokenEOSCriteria, convert_tokens_to_tensors, get_continuation_span, + get_fewshot_sample_idxs, make_padded_input, stop_sequences_criteria, + strip_data, tokenizer_needs_prefix_space, trim_context) +# isort: on __all__ = [ 'InContextLearningDataset', @@ -22,10 +24,13 @@ 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader', + 'MultiTokenEOSCriteria', 'strip_data', 'tokenizer_needs_prefix_space', 'trim_context', 'get_continuation_span', - 'get_fewshot_sample_idxs', 'make_padded_input', + 'convert_tokens_to_tensors', + 'get_fewshot_sample_idxs', + 'stop_sequences_criteria', ] diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index b7e3dce630..447855f953 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -35,6 +35,7 @@ _MAX_ANSWER_BUFFER_LENGTH = 10 __all__ = [ + 'InContextLearningDataset', 'InContextLearningLMTaskDataset', 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningSchemaTaskDataset', @@ -49,7 +50,7 @@ class InContextLearningDataset(Dataset): evaluations. The dataset format is expected to be a local jsonl file, a cloud link to a jsonl file, or a Hugging Face dataset link. 
'context' refers - to the input a model will recieve before generating an output. For example, + to the input a model will receive before generating an output. For example, the question in question answering tasks, the preceding text in a language modeling task, or the document and question regarding the document in a document understanding task. 'example' refers to a loaded dictionary, @@ -99,7 +100,7 @@ class InContextLearningDataset(Dataset): base_mapping (Dict): A mapping of batch keys to dataset columns, used to create batches. See above for more details. hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset. + Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. generation_kwargs (Dict): A dictionary containing keyword arguments to be passed along to the model's generate function. static_keys (List): A list of the key values which will be broadcast across a batch (e.g. it is the same for each batch element). list_keys (List): A list of the batch keys whose values are lists which will be split using list methods during calls to split_batch. @@ -260,7 +261,7 @@ def _generate_few_shot_prompt( """Formats the fewshot prompt for test example `example_idx`. Randomly selects `num_fewshot` samples from the dataset (excluding the example at `example_idx`) and constructs - contextes with answers appended. + contexts with answers appended. Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string. @@ -360,7 +361,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctxt (str): The specific example's derrived context + ctxt (str): The specific example's derived context example (Dict): The example as a dictionary. Used for additional processing in inherited classes. Returns: @@ -642,7 +643,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derrived context + ctx (str): The specific example's derived context example (Dict): The example as a dictionary. 
Returns: @@ -670,10 +671,10 @@ def _get_max_answer_length(self, dataset: Iterable[dict]) -> int: ) else: response = answer - tokenized_repsonse = self.tokenizer(response)['input_ids'] - assert isinstance(tokenized_repsonse, list) + tokenized_response = self.tokenizer(response)['input_ids'] + assert isinstance(tokenized_response, list) max_answer_length = max(max_answer_length, - len(tokenized_repsonse)) + len(tokenized_response)) max_answer_length = max_answer_length + ( _MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length @@ -816,7 +817,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derrived context + ctx (str): The specific example's derived context example (Dict): The example as a dictionary. Returns: @@ -1043,7 +1044,7 @@ def _construct_multiple_contexts(self, preceding_text: str = '') -> List[str]: """Takes a example and constructs all contexts. - Optionally, appends this to preceeding text (such as a prompt or fewshot examples). + Optionally, appends this to preceding text (such as a prompt or fewshot examples). Args: example (Dict): The example from which to construct the context @@ -1078,7 +1079,7 @@ def _prep_example( with prompt and fewshot examples. - Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and + Each task consists of multiple contexts and a single, correct continuation. Will prepend fewshot examples and prompt if present. Args: @@ -1105,7 +1106,7 @@ def tokenize_example(self, prompt_and_fewshot: str, Args: prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derrived context + ctx (str): The specific example's derived context example (Dict): The example as a dictionary. Returns: @@ -1182,8 +1183,8 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - test_outputs: List of test outputs - languages: List of languages - pass_at_k: Passed value for pass_at_k - - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten - by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + - generation_kwargs: Dictionary of kwargs needed for generation. Includes the following, which will be individually overwritten + by keys in generation_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details): - pad_token_id: ID for padding token, derived automatically @@ -1405,14 +1406,14 @@ def build_icl_dataloader( do_normalization: bool = True) -> DataSpec: """Factory method that builds the specific dataset for the specified. - icl_task_type. See documentation for `get_icl_task_dataloader` for arugment + icl_task_type. See documentation for `get_icl_task_dataloader` for argument documentation. When writing a dataset for a new task, here you will need to: 1. add the dataset to the factory and choose an appropriate string 2. set the batch size for that task (see InContextLearningMultipleChoiceTaskDataset for why this might be different) - 3. set the `split_batch` funciton if necessary + 3. 
set the `split_batch` function if necessary """ if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset( @@ -1712,9 +1713,9 @@ def get_icl_task_dataloader( has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset. + Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. Overwrites any previously specified generation - keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + keyword args in this function (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details) early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time. Used in generation tasks with CoT diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py index 7ea7f9fae2..6433e7cb56 100644 --- a/llmfoundry/eval/datasets/utils.py +++ b/llmfoundry/eval/datasets/utils.py @@ -13,6 +13,14 @@ __all__ = [ 'MultiTokenEOSCriteria', + 'strip_data', + 'tokenizer_needs_prefix_space', + 'trim_context', + 'get_continuation_span', + 'make_padded_input', + 'convert_tokens_to_tensors', + 'get_fewshot_sample_idxs', + 'stop_sequences_criteria', ] log = logging.getLogger(__name__) @@ -61,7 +69,7 @@ def trim_context(context_enc: List, continuation_enc: List, Args: context_enc (list): List of tokens in the context - continuation_enc (lsit): List of tokens in the continuation + continuation_enc (list): List of tokens in the continuation max_seq_len (int): Maximum length the model can ingest Returns: @@ -229,7 +237,7 @@ def __init__( self.stop_sequence_ids = tokenizer.encode(stop_sequence, add_special_tokens=False) - # sentence piece tokenizers add a superflous underline token before string-initial \n + # sentence piece tokenizers add a superfluous underline token before string-initial \n # that throws off our calculation of the stop sequence length # so we remove any token ids that produce empty strings self.stop_sequence_ids = [ diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py index 6e70e2ece3..079439da59 100644 --- a/llmfoundry/eval/metrics/__init__.py +++ b/llmfoundry/eval/metrics/__init__.py @@ -11,11 +11,11 @@ InContextLearningMultipleChoiceAccuracy) __all__ = [ + 'InContextLearningMetric', 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningGenerationExactMatchAccuracy', - 'InContextLearningMCExpectedCalibrationError', - 'InContextLearningLMExpectedCalibrationError', - 'InContextLearningMetric', 'InContextLearningCodeEvalAccuracy', + 'InContextLearningLMExpectedCalibrationError', 
+ 'InContextLearningMCExpectedCalibrationError', ] diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py index 55922e28d2..f5a50721e3 100644 --- a/llmfoundry/eval/metrics/nlp.py +++ b/llmfoundry/eval/metrics/nlp.py @@ -662,7 +662,7 @@ class InContextLearningMCExpectedCalibrationError( def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor): outputs = torch.softmax(outputs, dim=2) - probabilites = [] + probabilities = [] for batch_idx, cont_idx in enumerate(batch['continuation_indices']): cont_tok_logits = outputs[batch_idx].index_select(dim=0, index=cont_idx - @@ -671,11 +671,11 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor): index=cont_idx - 1) probability = cont_tok_logits.index_select( dim=1, index=cont_tok_targ).diagonal().mean() - probabilites.append(probability) + probabilities.append(probability) for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']): - subset = probabilites[start:end] + subset = probabilities[start:end] idx_max = subset.index(max(subset)) confidence = torch.tensor(subset).max() / torch.tensor(subset).sum() diff --git a/llmfoundry/metrics/__init__.py b/llmfoundry/metrics/__init__.py index e8310687a1..8ca2db5bd2 100644 --- a/llmfoundry/metrics/__init__.py +++ b/llmfoundry/metrics/__init__.py @@ -58,9 +58,6 @@ 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningGenerationExactMatchAccuracy', 'InContextLearningCodeEvalAccuracy', - 'LanguageCrossEntropy', - 'LanguagePerplexity', - 'MaskedAccuracy', 'DEFAULT_CAUSAL_LM_TRAIN_METRICS', 'DEFAULT_CAUSAL_LM_EVAL_METRICS', 'DEFAULT_ENC_DEC_METRICS', diff --git a/llmfoundry/models/hf/__init__.py b/llmfoundry/models/hf/__init__.py index 3c35080d6e..2ed7b2d6e1 100644 --- a/llmfoundry/models/hf/__init__.py +++ b/llmfoundry/models/hf/__init__.py @@ -6,6 +6,7 @@ prepare_hf_enc_dec_model_for_fsdp, prepare_hf_model_for_fsdp) from llmfoundry.models.hf.hf_t5 import ComposerHFT5 +from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP __all__ = [ 'ComposerHFCausalLM', @@ -13,4 +14,5 @@ 'prepare_hf_causal_lm_model_for_fsdp', 'prepare_hf_enc_dec_model_for_fsdp', 'prepare_hf_model_for_fsdp', + 'HuggingFaceModelWithFSDP', ] diff --git a/llmfoundry/models/hf/hf_fsdp.py b/llmfoundry/models/hf/hf_fsdp.py index acdc927835..87bffc3af8 100644 --- a/llmfoundry/models/hf/hf_fsdp.py +++ b/llmfoundry/models/hf/hf_fsdp.py @@ -14,6 +14,12 @@ if TYPE_CHECKING: from peft import PeftModel +__all__ = [ + 'prepare_hf_model_for_fsdp', + 'prepare_hf_causal_lm_model_for_fsdp', + 'prepare_hf_enc_dec_model_for_fsdp', +] + # helper functions def rhasattr(obj: Any, attr: str) -> bool: @@ -250,7 +256,7 @@ def prepare_hf_enc_dec_model_for_fsdp(model: PreTrainedModel, if encoder_block_type == decoder_block_type: return - # need to wrap encoder blocks separately for ProhpetNet and Marian + # need to wrap encoder blocks separately for ProphetNet and Marian model.fsdp_wrap_fn = lambda module: isinstance(module, encoder_block_type) model.activation_checkpointing_fn = lambda module: isinstance( module, encoder_block_type) diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py index 2ba88d390c..4b7be9ee08 100644 --- a/llmfoundry/models/hf/model_wrapper.py +++ b/llmfoundry/models/hf/model_wrapper.py @@ -19,6 +19,8 @@ if TYPE_CHECKING: from peft import PeftConfig +__all__ = ['HuggingFaceModelWithFSDP'] + # HuggingFace hardcodes the ignore index to -100 _HF_IGNORE_INDEX = -100 diff --git 
a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py index 9bb2ece2b2..905abf2fa1 100644 --- a/llmfoundry/models/inference_api_wrapper/__init__.py +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -2,16 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.models.inference_api_wrapper.fmapi import ( - FMAPICasualLMEvalWrapper, FMAPIChatAPIEvalWrapper) + FMAPICasualLMEvalWrapper, FMAPIChatAPIEvalWrapper, FMAPIEvalInterface) from llmfoundry.models.inference_api_wrapper.interface import \ InferenceAPIEvalWrapper from llmfoundry.models.inference_api_wrapper.openai_causal_lm import ( - OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper) + OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper, OpenAIEvalInterface) __all__ = [ 'OpenAICausalLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', + 'OpenAIEvalInterface', 'InferenceAPIEvalWrapper', 'FMAPICasualLMEvalWrapper', 'FMAPIChatAPIEvalWrapper', + 'FMAPIEvalInterface', ] diff --git a/llmfoundry/models/inference_api_wrapper/fmapi.py b/llmfoundry/models/inference_api_wrapper/fmapi.py index 58ea302ace..d0c987304a 100644 --- a/llmfoundry/models/inference_api_wrapper/fmapi.py +++ b/llmfoundry/models/inference_api_wrapper/fmapi.py @@ -15,6 +15,7 @@ __all__ = [ 'FMAPICasualLMEvalWrapper', 'FMAPIChatAPIEvalWrapper', + 'FMAPIEvalInterface', ] log = logging.getLogger(__name__) diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 91f6fb2600..a939d03d68 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -13,6 +13,8 @@ from llmfoundry.eval.metrics import InContextLearningMetric from llmfoundry.metrics import DEFAULT_CAUSAL_LM_EVAL_METRICS +__all__ = ['InferenceAPIEvalWrapper'] + class InferenceAPIEvalWrapper(ComposerModel): diff --git a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py index bacf71b8e2..9f2cf3315c 100644 --- a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +++ b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py @@ -23,6 +23,7 @@ __all__ = [ 'OpenAICausalLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', + 'OpenAIEvalInterface', ] if TYPE_CHECKING: diff --git a/llmfoundry/models/layers/__init__.py b/llmfoundry/models/layers/__init__.py index dca55098c4..e31029024c 100644 --- a/llmfoundry/models/layers/__init__.py +++ b/llmfoundry/models/layers/__init__.py @@ -3,13 +3,18 @@ from llmfoundry.models.layers.attention import ( GroupedQueryAttention, MultiheadAttention, MultiQueryAttention, - attn_bias_shape, build_alibi_bias, build_attn_bias, flash_attn_fn, - scaled_multihead_dot_product_attention) -from llmfoundry.models.layers.blocks import MPTBlock + attn_bias_shape, build_alibi_bias, build_attn_bias, check_alibi_support, + flash_attn_fn, scaled_multihead_dot_product_attention) +from llmfoundry.models.layers.blocks import FusedNormAttentionNorm, MPTBlock from llmfoundry.models.layers.custom_embedding import SharedEmbedding +from llmfoundry.models.layers.dmoe import DroplessMLP, LearnedRouter, dMoE from llmfoundry.models.layers.fc import * -from llmfoundry.models.layers.ffn import MPTMLP -from llmfoundry.models.layers.norm import LPLayerNorm +from llmfoundry.models.layers.ffn import MPTGLU, MPTMLP +from llmfoundry.models.layers.layer_builders import (build_attention_layer, + build_fc, build_ffn, + build_norm) +from llmfoundry.models.layers.norm 
import (LPLayerNorm, LPRMSNorm, RMSNorm, + TritonRMSNorm, rms_norm) __all__ = [ 'scaled_multihead_dot_product_attention', @@ -20,8 +25,22 @@ 'attn_bias_shape', 'build_attn_bias', 'build_alibi_bias', - 'MPTMLP', + 'check_alibi_support', 'MPTBlock', - 'LPLayerNorm', + 'FusedNormAttentionNorm', 'SharedEmbedding', + 'dMoE', + 'LearnedRouter', + 'DroplessMLP', + 'MPTMLP', + 'MPTGLU', + 'build_attention_layer', + 'build_ffn', + 'build_fc', + 'build_norm', + 'LPLayerNorm', + 'LPRMSNorm', + 'RMSNorm', + 'TritonRMSNorm', + 'rms_norm', ] diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 6614d5d161..d4a34eecaa 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -18,6 +18,18 @@ attention_implementations) from llmfoundry.models.layers.layer_builders import build_fc, build_norm +__all__ = [ + 'scaled_multihead_dot_product_attention', + 'flash_attn_fn', + 'MultiheadAttention', + 'MultiQueryAttention', + 'GroupedQueryAttention', + 'attn_bias_shape', + 'build_attn_bias', + 'build_alibi_bias', + 'check_alibi_support', +] + def is_flash_v2_installed(v2_version: str = '2.0.0'): assert version.parse(v2_version) >= version.parse('2.0.0') diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index 40f349368f..d56c4753af 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -17,6 +17,11 @@ except: unpad_input, pad_input = None, None +__all__ = [ + 'MPTBlock', + 'FusedNormAttentionNorm', +] + attn_config_defaults: Dict = { 'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, diff --git a/llmfoundry/models/layers/custom_embedding.py b/llmfoundry/models/layers/custom_embedding.py index 20a2be3a55..fba823a4f7 100644 --- a/llmfoundry/models/layers/custom_embedding.py +++ b/llmfoundry/models/layers/custom_embedding.py @@ -5,6 +5,8 @@ import torch.nn.functional as F from torch import Tensor +__all__ = ['SharedEmbedding'] + class SharedEmbedding(nn.Embedding): diff --git a/llmfoundry/models/layers/dmoe.py b/llmfoundry/models/layers/dmoe.py index 19cd67b8aa..f2b255294c 100644 --- a/llmfoundry/models/layers/dmoe.py +++ b/llmfoundry/models/layers/dmoe.py @@ -5,6 +5,14 @@ import torch +__all__ = [ + 'dMoE', + 'LearnedRouter', + 'MLP', + 'GLU', + 'DroplessMLP', +] + # Add option to route tokens uniformly across experts. We use # a custom autograd op router backwards is still run for benchmarking. 
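The `norms.register(name='layernorm', func=torch.nn.LayerNorm)` line in the norm.py hunk below shows the registry mechanism that the newly exported `build_norm`, `build_fc`, `build_ffn`, and `build_attention_layer` builders resolve against. A hedged sketch of registering a custom norm the same way follows; `NoOpNorm` is a made-up class for illustration, and how the builders forward constructor arguments is an assumption.

```python
import torch

from llmfoundry.layers_registry import norms


class NoOpNorm(torch.nn.Identity):
    """Example-only 'norm' that returns its input unchanged."""

    def __init__(self, normalized_shape: int, **kwargs: object):
        # Accept the shape/device kwargs a builder may pass, then ignore them.
        super().__init__()


# Mirrors the built-in registration shown in llmfoundry/models/layers/norm.py.
norms.register(name='noopnorm', func=NoOpNorm)
```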
diff --git a/llmfoundry/models/layers/ffn.py b/llmfoundry/models/layers/ffn.py index fb663b4c3c..c64e87cb9a 100644 --- a/llmfoundry/models/layers/ffn.py +++ b/llmfoundry/models/layers/ffn.py @@ -32,6 +32,17 @@ log = logging.getLogger(__name__) +__all__ = [ + 'MPTMLP', + 'MPTGLU', + 'build_mptglu', + 'build_mptmlp', + 'build_te_ln_mlp', + 'build_torch_dmoe', + 'build_mb_moe', + 'build_mb_dmoe', +] + _FFN_ACT_FN_DEFAULT = { 'name': 'gelu', 'approximate': 'none', diff --git a/llmfoundry/models/layers/layer_builders.py b/llmfoundry/models/layers/layer_builders.py index 425fcaf862..ceb41d8d41 100644 --- a/llmfoundry/models/layers/layer_builders.py +++ b/llmfoundry/models/layers/layer_builders.py @@ -10,6 +10,13 @@ norms) from llmfoundry.utils.registry_utils import construct_from_registry +__all__ = [ + 'build_attention_layer', + 'build_ffn', + 'build_fc', + 'build_norm', +] + def build_norm( name: str, diff --git a/llmfoundry/models/layers/norm.py b/llmfoundry/models/layers/norm.py index 92d295c71c..23b92015e7 100644 --- a/llmfoundry/models/layers/norm.py +++ b/llmfoundry/models/layers/norm.py @@ -7,6 +7,14 @@ from llmfoundry.layers_registry import norms +__all__ = [ + 'LPLayerNorm', + 'LPRMSNorm', + 'RMSNorm', + 'TritonRMSNorm', + 'rms_norm', +] + norms.register(name='layernorm', func=torch.nn.LayerNorm) diff --git a/llmfoundry/models/utils/__init__.py b/llmfoundry/models/utils/__init__.py index ca5fa4b935..45a5f757f6 100644 --- a/llmfoundry/models/utils/__init__.py +++ b/llmfoundry/models/utils/__init__.py @@ -1,6 +1,9 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from llmfoundry.models.utils.act_ckpt import (build_act_ckpt_mod_to_blocks, + check_mapping_blocks_overlap, + pass_on_block_idx) from llmfoundry.models.utils.config_moe_args import config_moe_args from llmfoundry.models.utils.meta_init_context import (init_empty_weights, init_on_device) @@ -15,4 +18,7 @@ 'config_moe_args', 'mpt_get_active_params', 'mpt_get_total_params', + 'build_act_ckpt_mod_to_blocks', + 'pass_on_block_idx', + 'check_mapping_blocks_overlap', ] diff --git a/llmfoundry/models/utils/act_ckpt.py b/llmfoundry/models/utils/act_ckpt.py index e6cd8bdc58..ef9a851a09 100644 --- a/llmfoundry/models/utils/act_ckpt.py +++ b/llmfoundry/models/utils/act_ckpt.py @@ -10,6 +10,12 @@ norms) from llmfoundry.models.layers.blocks import FusedNormAttentionNorm, MPTBlock +__all__ = [ + 'build_act_ckpt_mod_to_blocks', + 'pass_on_block_idx', + 'check_mapping_blocks_overlap', +] + def pass_on_block_idx(parent: torch.nn.Module): if not hasattr(parent, 'block_idx') or not hasattr(parent, 'max_block_idx'): @@ -104,7 +110,7 @@ def get_target_block_list(target_blocks: Any, max_block_idx: int) -> list: candidate_block_ids.extend(to_add) else: raise ValueError( - f'target_blocks must be either a single intege, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}' + f'target_blocks must be either a single integer, or a list of integers, or a comma separated string made of "first-n", "last-m", "middle-k", "range-i-j", or a list of mixed integers and before-mentioned strings, but got {type(target_blocks)}' ) candidate_block_ids = list(set(candidate_block_ids)) diff --git a/llmfoundry/models/utils/config_moe_args.py b/llmfoundry/models/utils/config_moe_args.py index 4de9a47bbc..859dd3c52b 100644 --- a/llmfoundry/models/utils/config_moe_args.py +++ 
b/llmfoundry/models/utils/config_moe_args.py @@ -12,6 +12,10 @@ from llmfoundry.layers_registry import ffns_with_megablocks from llmfoundry.models.layers.ffn import resolve_ffn_hidden_size +__all__ = [ + 'config_moe_args', +] + def create_process_group_ranks(ranks: tuple[int]): """Creates a new distributed group. @@ -70,7 +74,7 @@ def config_megablocks_moe_args( groups can be initialized and shared across all blocks in the network. Args: - ffn_config (dict): FFN configuation before the MegaBlocks MoE is configured. + ffn_config (dict): FFN configuration before the MegaBlocks MoE is configured. d_model (int): Hidden size of the network. expansion_ratio (Union[int, float]): Expansion ratio in FFN. n_layers (int): Number of blocks used in the network. @@ -170,7 +174,7 @@ def config_moe_args( """Configures `ffn_config` for MoE. Args: - ffn_config (dict): FFN configuation before the MoE is configured. + ffn_config (dict): FFN configuration before the MoE is configured. d_model (int): Hidden size of the network. expansion_ratio (int, float): Expansion ratio in FFN. n_layers (int): Number of blocks used in the network. diff --git a/llmfoundry/models/utils/meta_init_context.py b/llmfoundry/models/utils/meta_init_context.py index d72a289a73..66f06db581 100644 --- a/llmfoundry/models/utils/meta_init_context.py +++ b/llmfoundry/models/utils/meta_init_context.py @@ -23,6 +23,11 @@ import torch.nn as nn from torch.distributed._tensor import DTensor +__all__ = [ + 'init_empty_weights', + 'init_on_device', +] + @contextmanager def init_empty_weights(include_buffers: bool = False): diff --git a/llmfoundry/models/utils/mpt_param_count.py b/llmfoundry/models/utils/mpt_param_count.py index cb1a5c0935..ca487ecca0 100644 --- a/llmfoundry/models/utils/mpt_param_count.py +++ b/llmfoundry/models/utils/mpt_param_count.py @@ -18,6 +18,11 @@ from llmfoundry.layers_registry import ffns_with_megablocks +__all__ = [ + 'mpt_get_active_params', + 'mpt_get_total_params', +] + def module_n_params(module: nn.Module) -> int: """Gets the number of parameters in this module excluding child modules. diff --git a/llmfoundry/models/utils/param_init_fns.py b/llmfoundry/models/utils/param_init_fns.py index e64cde5e96..06bdd84438 100644 --- a/llmfoundry/models/utils/param_init_fns.py +++ b/llmfoundry/models/utils/param_init_fns.py @@ -26,6 +26,10 @@ except: megablocks = None +__all__ = [ + 'generic_param_init_fn_', +] + def torch_default_param_init_fn_( module: nn.Module, diff --git a/llmfoundry/optim/__init__.py b/llmfoundry/optim/__init__.py index 527969bd63..26389665b5 100644 --- a/llmfoundry/optim/__init__.py +++ b/llmfoundry/optim/__init__.py @@ -26,4 +26,5 @@ 'DecoupledLionW', 'DecoupledClipLion', 'DecoupledAdaLRLion', + 'InverseSquareRootWithWarmupScheduler', ] diff --git a/llmfoundry/optim/adaptive_lion.py b/llmfoundry/optim/adaptive_lion.py index 0ce76e905e..9b2dac9d80 100644 --- a/llmfoundry/optim/adaptive_lion.py +++ b/llmfoundry/optim/adaptive_lion.py @@ -13,6 +13,11 @@ log = logging.getLogger(__name__) +__all__ = [ + 'DecoupledAdaLRLion', + 'DecoupledClipLion', +] + class DecoupledAdaLRLion(Optimizer): """DecoupledAdaLRLion. 
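`DecoupledLionW` is promoted to a top-level export in the `llmfoundry/__init__.py` hunk earlier in this patch, alongside the optimizer `__all__` additions here and in the next hunk. A quick usage sketch follows; the hyperparameter values, and the exact keyword names `lr`, `betas`, and `weight_decay`, are assumptions for illustration rather than values taken from this diff.

```python
import torch

from llmfoundry import DecoupledLionW  # re-exported at the package top level

model = torch.nn.Linear(16, 16)
optimizer = DecoupledLionW(
    model.parameters(),
    lr=1e-4,            # assumed keyword names and values, for illustration only
    betas=(0.9, 0.95),
    weight_decay=0.0,
)

loss = model(torch.randn(4, 16)).sum()
loss.backward()
optimizer.step()
```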
diff --git a/llmfoundry/optim/lion.py b/llmfoundry/optim/lion.py index 0caa7d2877..b04211649c 100644 --- a/llmfoundry/optim/lion.py +++ b/llmfoundry/optim/lion.py @@ -11,6 +11,10 @@ log = logging.getLogger(__name__) +__all__ = [ + 'DecoupledLionW', +] + class DecoupledLionW(Optimizer): metric_functions = { diff --git a/llmfoundry/optim/outlier_detection.py b/llmfoundry/optim/outlier_detection.py index 9df4381ba4..e430f4ccb5 100644 --- a/llmfoundry/optim/outlier_detection.py +++ b/llmfoundry/optim/outlier_detection.py @@ -4,6 +4,8 @@ import collections from typing import Optional +__all__ = ['OutlierDetector'] + class OutlierDetector: """OutlierDetector. diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 298e1bc984..0ecaa45b5f 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -6,6 +6,10 @@ from transformers import PreTrainedTokenizer +__all__ = [ + 'TiktokenTokenizerWrapper', +] + DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.""" diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index e8c90a6007..2c3d7c9bc3 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.utils.builders import (build_algorithm, build_callback, - build_logger, build_optimizer, + build_composer_model, build_evaluators, + build_icl_data_and_gauntlet, + build_icl_evaluators, build_logger, + build_metric, build_optimizer, build_scheduler, build_tokenizer) from llmfoundry.utils.checkpoint_conversion_helpers import ( convert_and_save_ft_weights, get_hf_tokenizer_from_composer_state_dict, @@ -12,7 +15,7 @@ process_init_device, update_batch_size_info) from llmfoundry.utils.data_prep_utils import (DownloadingIterable, - merge_shard_groups, with_id) + merge_shard_groups) from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility from llmfoundry.utils.logging_utils import SpecificWarningFilter @@ -30,40 +33,53 @@ from llmfoundry.utils.prompt_files import load_prompts, load_prompts_from_file from llmfoundry.utils.registry_utils import (TypedRegistry, construct_from_registry, - create_registry) -from llmfoundry.utils.warnings import VersionedDeprecationWarning + create_registry, import_file, + save_registry) +from llmfoundry.utils.warnings import (ExperimentalWarning, + VersionedDeprecationWarning, + experimental_class, + experimental_function) __all__ = [ 'build_algorithm', 'build_callback', + 'build_evaluators', + 'build_icl_data_and_gauntlet', + 'build_icl_evaluators', 'build_logger', 'build_optimizer', 'build_scheduler', 'build_tokenizer', - 'convert_and_save_ft_weights', + 'build_composer_model', + 'build_metric', 'get_hf_tokenizer_from_composer_state_dict', 'load_tokenizer', - 'calculate_batch_size_info', - 'log_config', + 'convert_and_save_ft_weights', 'pop_config', + 'calculate_batch_size_info', 'update_batch_size_info', 'process_init_device', + 'log_config', 'DownloadingIterable', 'merge_shard_groups', - 'with_id', 'edit_files_for_hf_compatibility', 'SpecificWarningFilter', 'download_from_http_fileserver', 'download_from_hf_hub', 'download_from_oras', + 'maybe_create_mosaicml_logger', + 'find_mosaicml_logger', + 'log_eval_analytics', + 'log_train_analytics', 'load_prompts', 'load_prompts_from_file', - 'VersionedDeprecationWarning', + 'TypedRegistry', 'create_registry', 'construct_from_registry', - 'TypedRegistry', - 
'find_mosaicml_logger', - 'log_eval_analytics', - 'log_train_analytics', - 'maybe_create_mosaicml_logger', + 'import_file', + 'save_registry', + 'VersionedDeprecationWarning', + 'ExperimentalWarning', + 'experimental_function', + 'experimental_class', ] diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py index d50977c097..058e73b393 100644 --- a/llmfoundry/utils/data_prep_utils.py +++ b/llmfoundry/utils/data_prep_utils.py @@ -9,7 +9,6 @@ from composer.utils import ObjectStore __all__ = [ - 'with_id', 'merge_shard_groups', 'DownloadingIterable', ] @@ -94,7 +93,7 @@ def __init__( Args: object_names (List[str]): Names of objects to download output_folder (str): Local folder to write downloaded files to - object_store (Optiona[ObjectStore]): Object store to download from + object_store (Optional[ObjectStore]): Object store to download from """ self.object_names = object_names self.object_store = object_store diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 5b6ac621ed..7a6be2be29 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -5,6 +5,30 @@ from collections.abc import Mapping from typing import Any, Dict, List +__all__ = [ + 'MissingHuggingFaceURLSplitError', + 'NotEnoughDatasetSamplesError', + 'UnknownExampleTypeError', + 'TooManyKeysInExampleError', + 'NotEnoughChatDataError', + 'ConsecutiveRepeatedChatRolesError', + 'InvalidLastChatMessageRoleError', + 'IncorrectMessageKeyQuantityError', + 'InvalidRoleError', + 'InvalidContentTypeError', + 'InvalidPromptTypeError', + 'InvalidResponseTypeError', + 'InvalidPromptResponseKeysError', + 'InvalidFileExtensionError', + 'UnableToProcessPromptResponseError', + 'ClusterDoesNotExistError', + 'FailedToCreateSQLConnectionError', + 'FailedToConnectToDatabricksError', + 'InputFolderMissingDataError', + 'OutputFolderNotEmptyError', + 'MisconfiguredHfDatasetError', +] + # Finetuning dataloader exceptions class MissingHuggingFaceURLSplitError(ValueError): diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index a88e02a33a..3707da3883 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -249,7 +249,7 @@ def download_from_oras(model: str, credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three files: `username`, `password`, and `registry`, each of which contains the corresponding credential. save_dir (str): Path to the directory where files will be downloaded. - tokenizer_only (bool): If true, only download the tokenzier files. + tokenizer_only (bool): If true, only download the tokenizer files. concurrency (int): The number of concurrent downloads to run. 
""" if shutil.which(ORAS_CLI) is None: diff --git a/llmfoundry/utils/mosaicml_logger_utils.py b/llmfoundry/utils/mosaicml_logger_utils.py index e54f11ce32..b4a40821ed 100644 --- a/llmfoundry/utils/mosaicml_logger_utils.py +++ b/llmfoundry/utils/mosaicml_logger_utils.py @@ -10,6 +10,13 @@ MOSAICML_PLATFORM_ENV_VAR) from omegaconf import DictConfig, ListConfig +__all__ = [ + 'maybe_create_mosaicml_logger', + 'find_mosaicml_logger', + 'log_eval_analytics', + 'log_train_analytics', +] + _MODEL_KEYS_TO_LOG = [ 'pretrained_model_name_or_path', 'pretrained', diff --git a/llmfoundry/utils/registry_utils.py b/llmfoundry/utils/registry_utils.py index 0eeefbae74..1604a8a91f 100644 --- a/llmfoundry/utils/registry_utils.py +++ b/llmfoundry/utils/registry_utils.py @@ -13,7 +13,13 @@ import catalogue -__all__ = ['TypedRegistry', 'create_registry', 'construct_from_registry'] +__all__ = [ + 'TypedRegistry', + 'create_registry', + 'construct_from_registry', + 'import_file', + 'save_registry', +] T = TypeVar('T') TypeBoundT = TypeVar('TypeBoundT', bound=Type) diff --git a/llmfoundry/utils/warnings.py b/llmfoundry/utils/warnings.py index 6c9106b2e7..fb0046f938 100644 --- a/llmfoundry/utils/warnings.py +++ b/llmfoundry/utils/warnings.py @@ -6,6 +6,9 @@ __all__ = [ 'VersionedDeprecationWarning', + 'ExperimentalWarning', + 'experimental_function', + 'experimental_class', ] diff --git a/mcli/README.md b/mcli/README.md index ced3c42adc..59fb723f57 100644 --- a/mcli/README.md +++ b/mcli/README.md @@ -23,4 +23,4 @@ All the details of multi-gpu and multi-node orchestration are handled automatica ## Using the MosaicML Python SDK to launch runs You can also use the [Python SDK](https://mcli.docs.mosaicml.com/en/stable/python/hello_world.html) to launch MosaicML platform jobs. -This can be used to programatically sweep hyperparameters or orchestrate training runs within a larger pipeline. +This can be used to programmatically sweep hyperparameters or orchestrate training runs within a larger pipeline. 
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 33a891c058..3c7c62f7d4 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -43,7 +43,7 @@ parameters: name: mpt_causal_lm init_device: meta d_model: 2048 - n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention + n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention n_layers: 24 expansion_ratio: 4 max_seq_len: ${max_seq_len} diff --git a/scripts/data_prep/convert_finetuning_dataset.py b/scripts/data_prep/convert_finetuning_dataset.py index e78e76a912..fb6bde4115 100644 --- a/scripts/data_prep/convert_finetuning_dataset.py +++ b/scripts/data_prep/convert_finetuning_dataset.py @@ -59,7 +59,7 @@ def parse_args() -> Namespace: '--skip-preprocessing', action='store_true', help= - 'Whether to skip preprocesing (e.g., if the dataset is already formatted correctly)' + 'Whether to skip preprocessing (e.g., if the dataset is already formatted correctly)' ) parser.add_argument( '--out_root', diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index be986fc24d..636e85abed 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -191,8 +191,8 @@ def get_task_args( input_folder (str): Folder of text files to process n_groups (int): Number of groups to split the object names into tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples bos_text (str): Text to prepend to each example to separate concatenated samples no_wrap: (bool): Whether to let text examples wrap across multiple training examples compression (str): The compression algorithm to use for MDS writing @@ -219,7 +219,7 @@ def get_task_args( def download_and_convert_starargs(args: Tuple): """Helper function to call download_and_convert with star args. - This helps us use download_and_convert with mutiprocessing. + This helps us use download_and_convert with multiprocessing. """ return download_and_convert(*args) @@ -236,15 +236,15 @@ def download_and_convert( compression: str, trust_remote_code: bool, ): - """Downloads and converts text fies to MDS format. + """Downloads and converts text files to MDS format. 
Args: file_names (List[str]): Files to process output_folder (str): Folder to write MDS shards to input_folder (str): Folder of text files to process tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples bos_text (str): Text to prepend to each example to separate concatenated samples no_wrap: (bool): Whether to let text examples wrap across multiple training examples compression (str): The compression algorithm to use for MDS writing @@ -375,8 +375,8 @@ def convert_text_to_mds( tokenizer_name (str): Name of tokenizer to use output_folder (str): Folder to write MDS shards to input_folder (str): Folder of text files to process - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples + concat_tokens (int): Concatenate up to this many tokens + eos_text (str): Text to append to each example to separate concatenated samples bos_text (str): Text to prepend to each example to separate concatenated samples no_wrap: (bool): Whether to let text examples wrap across multiple training examples compression (str): The compression algorithm to use for MDS writing diff --git a/scripts/eval/README.md b/scripts/eval/README.md index ba2e9e2c79..488dd50a3a 100644 --- a/scripts/eval/README.md +++ b/scripts/eval/README.md @@ -49,7 +49,7 @@ In order to do ICL evaluation you must specify a set of benchmarks you'd like to #### ICL task YAML format -Your YAML must have a config section entitled `icl_tasks` specifying the benchmarks to evaluate againts, this can either be a list of dictionaries of the form +Your YAML must have a config section entitled `icl_tasks` specifying the benchmarks to evaluate against, this can either be a list of dictionaries of the form ```jsx icl_tasks: diff --git a/scripts/inference/benchmarking/README.md b/scripts/inference/benchmarking/README.md index 837154a977..b3f01256ac 100644 --- a/scripts/inference/benchmarking/README.md +++ b/scripts/inference/benchmarking/README.md @@ -28,7 +28,7 @@ LLM inference consists of two stages: _prefill_ and _decode_. It's important to During _prefill_, the model processes the input tokens/prompt/context. This is done in a single forward pass, making this stage fast, with excellent use of GPU hardware (ie. high Model Flop Utilization aka [MFU](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train/benchmarking#mfu)). Typically, if people talk about LLM inference being slow, this is _not_ the stage that they are referring to. -During _decode_, the model generates output tokens one at a time, i.e. autoregressively. This requires making N forward passes of the model for N tokens. This stage is slow and inefficient, because it requires moving gigabytes of model weights and pre-filled values for every single forward pass. Here, latency scales (mostly) linearly with the number of output tokens. Why mostly linear? When generating long sequences, the quadratic memory and compute complexity of the attention operation become more prominant. +During _decode_, the model generates output tokens one at a time, i.e. autoregressively. This requires making N forward passes of the model for N tokens. 
This stage is slow and inefficient, because it requires moving gigabytes of model weights and pre-filled values for every single forward pass. Here, latency scales (mostly) linearly with the number of output tokens. Why mostly linear? When generating long sequences, the quadratic memory and compute complexity of the attention operation become more prominent. ##### KV cache @@ -132,5 +132,5 @@ The benchmark script supports calling models directly from huggingface (using `h The analysis is done on a single A100 80GB GPU, with input length 512, and output length 64, while varying the batch size. As in previous sections, the batch sizes swept are 1, 2, 4, 8, 16, 32, 64, unless the GPU ran out of memory, in which case that point is not shown. As seen here, both MPT-7B and MPT-30B are among the fastest for inference in the open-source community, with MPT-30B being faster than the respective LLAMA-30B model. -Among the 7B models, Falcon-7B tends to have higher througput at higher latencies than MPT-7B, though MPT-7B has higher throughput at lower latencies. +Among the 7B models, Falcon-7B tends to have higher throughput at higher latencies than MPT-7B, though MPT-7B has higher throughput at lower latencies. Previously, we found that Falcon-7b was significantly slower than both MPT-7B and LLAMA-7B. This slow speed was due to the KV-cache not being used properly during generation, however this appears to be [fixed](https://huggingface.co/tiiuae/falcon-7b/tree/main) as of July 13, 2022. diff --git a/scripts/inference/convert_hf_to_onnx.py b/scripts/inference/convert_hf_to_onnx.py index 1ba1123c86..9d1841b12f 100644 --- a/scripts/inference/convert_hf_to_onnx.py +++ b/scripts/inference/convert_hf_to_onnx.py @@ -160,7 +160,7 @@ def export_to_onnx( atol=1e-2, msg=f'output mismatch between the orig and onnx exported model', ) - print('exported model ouptut matches with unexported model!!') + print('exported model output matches with unexported model!!') if save_object_store is not None: print('Uploading files to object storage...') diff --git a/scripts/inference/endpoint_generate.py b/scripts/inference/endpoint_generate.py index e78fecf59b..e6f9ae1448 100644 --- a/scripts/inference/endpoint_generate.py +++ b/scripts/inference/endpoint_generate.py @@ -42,7 +42,7 @@ def parse_args() -> Namespace: '-i', '--inputs', nargs='+', - help=f'List of strings, local datafiles (starting with {utils.PROMPTFILE_PREFIX}),' +\ + help=f'List of strings, local data files (starting with {utils.PROMPTFILE_PREFIX}),' +\ ' and/or remote object stores' ) parser.add_argument( diff --git a/scripts/inference/run_mpt_with_ft.py b/scripts/inference/run_mpt_with_ft.py index 10ccf6b78b..61d9f68d2c 100644 --- a/scripts/inference/run_mpt_with_ft.py +++ b/scripts/inference/run_mpt_with_ft.py @@ -197,7 +197,7 @@ def main(): type=int, default=0, choices=[0, 1, 2], - help='Whether to compute the cumulative log probsbility of sentences.' + + help='Whether to compute the cumulative log probability of sentences.' 
+ ' 0: do not return the cumulative log probs' + ' 1: return the cumulative log probs of generated sequences' + ' 2: return the cumulative log probs of sequences') diff --git a/scripts/train/README.md b/scripts/train/README.md index 36974ec943..6730cb793b 100644 --- a/scripts/train/README.md +++ b/scripts/train/README.md @@ -276,7 +276,7 @@ If the dataset requires a [custom preprocessing function](#custom-data-preproces train_loader: name: finetuning dataset: - hf_name: mosaiml/doge-facts + hf_name: mosaicml/doge-facts preprocessing_fn: my_data.formatting:dogefacts_prep_fn split: train ... @@ -402,7 +402,7 @@ so you should be able to run the exact same YAML on 8 or 16 or 256 GPUs and get This is nice because it means you can write device-count-agnostic training configs, and not worry about OOM-ing or accidentally changing the optimization math. -In previous blogposts ([1](https://www.mosaicml.com/blog/farewell-oom), [2](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy)) +In previous blog posts ([1](https://www.mosaicml.com/blog/farewell-oom), [2](https://www.mosaicml.com/blog/billion-parameter-gpt-training-made-easy)) we also demonstrated auto microbatching, which takes things a step further by letting Composer determine the `device_train_microbatch_size` on its own. This makes our configs not only device-count-agnostic, but hardware-agnostic too! You can try out this feature by setting `device_train_microbatch_size: auto`, but bear in mind that FSDP support is still in alpha mode diff --git a/scripts/train/benchmarking/README.md b/scripts/train/benchmarking/README.md index 5414cdc7bf..f5da10ec6a 100644 --- a/scripts/train/benchmarking/README.md +++ b/scripts/train/benchmarking/README.md @@ -20,7 +20,7 @@ python submit_benchmarks.py --cluster [your_mosaicml_cluster] ARGS --RUN can be used to sweep a larger set of configurations. For example usage of `submit_benchmarks.py` see `sweep.sh` which lists all benchmarks in the tables. > **Note** -> The `collect_results.py` will by default find all runs with `tput` in the run name. To customize this project tag, use `--project` in both the submissing and collection scripts. +> The `collect_results.py` will by default find all runs with `tput` in the run name. To customize this project tag, use `--project` in both the submission and collection scripts. ## MFU and HFU @@ -55,7 +55,7 @@ hfu* = 4 * flops_per_seq * seq_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS) hfu = (4 * flops_per_seq + 4 * attn_flops_per_seq) * seq_per_sec / (gpu_num * GPU_AVAILABLE_FLOPS) ``` -Note that these are approximations. Actual HFU would be higher since it includes the floating point operations for normalization, activation, and residual lyaers, as well as **all** recomputation. For example, our models use Flash Attention, which requires including an extra recompute factor for its recomputation in the forward pass. Therefore, the attention multipler would be 5 instead of 4. +Note that these are approximations. Actual HFU would be higher since it includes the floating point operations for normalization, activation, and residual layers, as well as **all** recomputation. For example, our models use Flash Attention, which requires including an extra recompute factor for its recomputation in the forward pass. Therefore, the attention multiplier would be 5 instead of 4. ## Results @@ -65,7 +65,7 @@ python submit_benchmarks.py -m 13b.yaml 30b.yaml -t fp16 -b 21 21 -s 11 14 --RUN ``` This will run 8 configs for 12 steps to get throughput numbers. 
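To make the HFU expression quoted in the MFU/HFU section above concrete, here is a minimal sketch (not code from `collect_results.py`; the helper name and the default peak-FLOPs value are assumptions for illustration) that plugs measured throughput into the same formula:

```python
# Hypothetical helper implementing the HFU formula quoted above; not repository code.
def approx_hfu(
    flops_per_seq: float,       # non-attention FLOPs of one forward pass over a sequence
    attn_flops_per_seq: float,  # attention FLOPs of one forward pass over a sequence
    seq_per_sec: float,         # measured training throughput, in sequences per second
    gpu_num: int,
    gpu_available_flops: float = 312e12,  # assumed peak rate, e.g. ~312 TFLOP/s
) -> float:
    # The 4x multipliers follow the README formula above; per the note there, counting
    # Flash Attention's extra forward recompute would make the attention multiplier 5.
    achieved_flops_per_sec = (4 * flops_per_seq + 4 * attn_flops_per_seq) * seq_per_sec
    return achieved_flops_per_sec / (gpu_num * gpu_available_flops)
```

Plugging a run's measured `seq_per_sec` into this expression reproduces the approximation the README describes.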
`python collect_results.py` can then be used to parse all output training logs and create the tables below. -Our microbatching engine enables microbatch sizes that do not divde Global Batchsize while being mathematically faithful to the global batch size. For example, a total batch size of 48, and a micro batch of 11, means we will accumulate gradients across microbatches of 11, 11, 11, 11, 4. +Our microbatching engine enables microbatch sizes that do not divide global batch size while being mathematically faithful to the global batch size. For example, a total batch size of 48, and a micro batch of 11, means we will accumulate gradients across microbatches of 11, 11, 11, 11, 4. [comment]: # (TODO: Update tables with torch 2.0 after next Composer release) diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py index d3691e951c..151286dbc6 100644 --- a/scripts/train/benchmarking/collect_results.py +++ b/scripts/train/benchmarking/collect_results.py @@ -150,8 +150,8 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: d_model = run.submitted_config.parameters['model']['d_model'] n_layers = run.submitted_config.parameters['model']['n_layers'] - # mfu is approximated using thoughtput and param count - # the number of paramters is approximately the number of multiply-accumulates (MAC) in the network + # mfu is approximated using throughput and param count + # the number of parameters is approximately the number of multiply-accumulates (MAC) in the network # each MAC has 2 FLOPs - we multiply by 2 ie 2 * n_param # there are 3 passes of a NN (fwd, bwd, delta) - we multiply by 3 ie 2 * 3 * n_param # this gets us FLOPs / token diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py index aff570e3d4..5e83ae41b7 100644 --- a/scripts/train/benchmarking/submit_benchmarks.py +++ b/scripts/train/benchmarking/submit_benchmarks.py @@ -205,7 +205,7 @@ def get_global_train_batch_sizes(max_seq_len: int, if batch_sizes is None: batch_sizes = [] if pows: - # global batch size in tokens (defualt: .5M thru 8M) + # global batch size in tokens (default: .5M thru 8M) global_train_token_counts = [2**n for n in range(pows[0], pows[1] + 1)] batch_sizes += [t // max_seq_len for t in global_train_token_counts ] # global batch size in samples diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index d7b9db10d4..46141ce5ab 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -16,7 +16,7 @@ model: name: mpt_causal_lm init_device: meta d_model: 2048 - n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention + n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention n_layers: 24 expansion_ratio: 4 max_seq_len: ${max_seq_len} diff --git a/scripts/train/yamls/pretrain/mpt-1b.yaml b/scripts/train/yamls/pretrain/mpt-1b.yaml index 3744a455a8..effa60c59e 100644 --- a/scripts/train/yamls/pretrain/mpt-1b.yaml +++ b/scripts/train/yamls/pretrain/mpt-1b.yaml @@ -11,7 +11,7 @@ model: name: mpt_causal_lm init_device: meta d_model: 2048 - n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention + n_heads: 16 # Modified 24->16 so that d_head == 128 to satisfy FlashAttention n_layers: 24 expansion_ratio: 4 max_seq_len: ${max_seq_len} diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index b0808cd8e8..3eb5e3773d 
100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -21,8 +21,7 @@ from omegaconf import OmegaConf as om from streaming import MDSWriter -from llmfoundry import build_finetuning_dataloader -from llmfoundry.data import build_dataloader +from llmfoundry.data import build_dataloader, build_finetuning_dataloader from llmfoundry.data.finetuning.collator import (_HF_IGNORE_INDEX, validate_target_settings) from llmfoundry.data.finetuning.tasks import (DOWNLOADED_FT_DATASETS_DIRPATH, From c53622e3d6f891d983ed39ab580245e23b7238d9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 22 Apr 2024 18:55:49 -0700 Subject: [PATCH 2/3] Fix HF checkpointer + mlflow bugs (#1125) --- llmfoundry/callbacks/hf_checkpointer.py | 30 ++++++++++++++++--------- setup.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 55dcb8b833..62d54d1e6a 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -14,6 +14,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +import numpy as np import torch import torch.nn as nn from composer.core import Callback, Event, State, Time, TimeUnit @@ -160,8 +161,6 @@ def __init__( if mlflow_logging_config is None: mlflow_logging_config = {} if self.mlflow_registered_model_name is not None: - import numpy as np - # Both the metadata and the task are needed in order for mlflow # and databricks optimized model serving to work passed_metadata = mlflow_logging_config.get('metadata', {}) @@ -171,18 +170,17 @@ def __init__( default_input_example = { 'prompt': np.array(['What is Machine Learning?']) } - is_chat = mlflow_logging_config['task'].endswith( - 'chat') or mlflow_logging_config['metadata'].get( - 'task', '').endswith('chat') + is_chat = mlflow_logging_config['task'].endswith('chat') or ( + mlflow_logging_config['metadata'] is not None and + mlflow_logging_config['metadata'].get('task', + '').endswith('chat')) if is_chat: default_input_example = { - 'messages': - np.array([{ - 'role': 'user', - 'content': 'What is Machine Learning?' - }]) + 'messages': [{ + 'role': 'user', + 'content': 'What is Machine Learning?' + }] } - mlflow_logging_config.setdefault('example_no_conversion', True) mlflow_logging_config.setdefault('input_example', default_input_example) @@ -260,6 +258,16 @@ def _is_last_batch(self, state: State): return True assert state.max_duration is not None # for pyright + + epoch_complete = state.dataloader_len == state.timestamp.batch_in_epoch + second_to_last_epoch = state.max_duration.unit == TimeUnit.EPOCH and ( + state.timestamp.epoch == state.max_duration.value - 1) + # If the save interval is specified as exactly the same number of batches as the total duration, + # but the max duration is specified in epochs, we need a special case to identify we are on the last batch + # and should write the mlflow checkpoint. This should occur on the last batch of the final epoch. 
+ if self.save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete: + return True + # If the save interval is specified as 1dur, and the max duration is in epoch units # we need a special case to identify we are on the last batch and should write the mlflow checkpoint if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH: diff --git a/setup.py b/setup.py index 4bdb15aa18..eb6c88af9e 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,oci,gcs]>=0.21.3,<0.22', - 'mlflow>=2.10,<2.12', + 'mlflow>=2.12.1,<2.13', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.40,<4.41', 'mosaicml-streaming>=0.7.5,<0.8', From 0d62e6118cb20f35e433dc134a01371ed07f34a8 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Tue, 23 Apr 2024 05:16:45 +0300 Subject: [PATCH 3/3] Update JSONL sources in eval README (#1110) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/eval/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/eval/README.md b/scripts/eval/README.md index 488dd50a3a..3a748066ec 100644 --- a/scripts/eval/README.md +++ b/scripts/eval/README.md @@ -193,7 +193,7 @@ Question: What star sign is Jamie Lee Curtis? Answer: The model would then be expected to generate a series of tokens beginning with either of the aliases: `Scorpio/Skorpio`. -Below is a complete YAML section that works with the TriviaQA dataset in [`scripts/eval/local_data/triviaqa.jsonl`](https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/triviaqa.jsonl): +Below is a complete YAML section that works with the TriviaQA dataset in [`scripts/eval/local_data/world_knowledge/triviaqa.jsonl`](https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/world_knowledge/triviaqa.jsonl): > label: triviaqa @@ -237,7 +237,7 @@ He took another step, but he was still in the The model would then be expected output “ glen”. -Below is a YAML section that works with the Lambada OpenAI dataset in [`scripts/eval/local_data/lambada_openai.jsonl`](https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/lambada_openai.jsonl): +Below is a YAML section that works with the Lambada OpenAI dataset in [`scripts/eval/local_data/language_understanding/lambada_openai.jsonl`](https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/language_understanding/lambada_openai.jsonl): > label: lambada_openai @@ -281,7 +281,7 @@ The MC task expects a **prompt string**, a **continuation delimiter** to separat The model would be deemed correct if it assigns the lowest per token perplexity to the sequence " lifts his body above the height of a pole." 
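As a rough illustration of the selection rule described above (a sketch only, not llm-foundry's ICL metric implementation; the function name and example log-probs are hypothetical), the continuation with the lowest mean per-token negative log-likelihood, i.e. the lowest per-token perplexity, is the one the model is scored as having chosen:

```python
from typing import List

def pick_choice(per_token_logprobs: List[List[float]]) -> int:
    """Return the index of the continuation with the lowest mean per-token NLL.

    per_token_logprobs[i] holds the model's log-probabilities for each token of
    choice i's continuation, conditioned on the shared prompt.
    """
    mean_nll = [-sum(lps) / len(lps) for lps in per_token_logprobs]
    return min(range(len(mean_nll)), key=mean_nll.__getitem__)

# Hypothetical per-token log-probs for two continuations; the second is more likely.
print(pick_choice([[-2.1, -3.0, -2.4], [-1.2, -0.9, -1.5]]))  # -> 1
```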
-Below is a YAML section that works with the HellaSwag dataset in [`scripts/eval/local_data/hellaswag.jsonl`](https://raw.githubusercontent.com/mosaicml/llm-foundry/main/scripts/eval/local_data/hellaswag.jsonl): +Below is a YAML section that works with the HellaSwag dataset in [`scripts/eval/local_data/language_understanding/hellaswag.jsonl`](https://raw.githubusercontent.com/mosaicml/llm-foundry/main/scripts/eval/local_data/language_understanding/hellaswag.jsonl): > label: hellaswag @@ -327,7 +327,7 @@ The Schema task expects a **prompt string**, a **continuation delimiter** to sep The model would be assigned correct if per token perplexity of the sequence " was so upset" is lower in the second version than it is in the first version. -Below is a YAML section that works with the Winograd dataset in [`scripts/eval/local_data/winograd_wsc.jsonl`](https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/winograd_wsc.jsonl): +Below is a YAML section that works with the Winograd dataset in [`scripts/eval/local_data/language_understanding/winograd_wsc.jsonl`](https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/language_understanding/winograd_wsc.jsonl): > label: winograd
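To round out the Schema task description above, its correctness rule can be sketched as follows (a hypothetical illustration, not the repository's implementation; the helper names and log-prob values are made up): the same continuation is scored under both context versions, and the example counts as correct when its per-token perplexity is lower under the correct version.

```python
import math
from typing import Sequence

def per_token_perplexity(continuation_logprobs: Sequence[float]) -> float:
    """Per-token perplexity of a continuation, from its token log-probabilities."""
    return math.exp(-sum(continuation_logprobs) / len(continuation_logprobs))

def schema_example_correct(logprobs_under_correct_version: Sequence[float],
                           logprobs_under_other_version: Sequence[float]) -> bool:
    # Correct when the same continuation (e.g. " was so upset") has lower
    # per-token perplexity under the correct context version than the other one.
    return (per_token_perplexity(logprobs_under_correct_version) <
            per_token_perplexity(logprobs_under_other_version))

# Hypothetical log-probs of the continuation tokens under the two context versions.
print(schema_example_correct([-0.8, -1.1, -0.6], [-1.9, -2.2, -1.4]))  # -> True
```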