From 186dd19888a8c8874584f9e78619f3fb0348309f Mon Sep 17 00:00:00 2001
From: Charles Tang
Date: Thu, 31 Aug 2023 17:12:25 -0700
Subject: [PATCH] Refactor build_tokenizer to use kwargs syntax and specify
 name (#532)

Refactor build_tokenizer in train/train.py and eval/eval.py to use
**kwargs syntax, so that configs are plain dictionaries, classes/functions
state their arguments explicitly in their signatures, and the build_*
helpers use **kwargs to initialize classes. A usage sketch of the new call
signature is included after the diff.
---
 llmfoundry/data/denoising.py             | 11 ++----
 llmfoundry/data/finetuning/dataloader.py |  7 ++--
 llmfoundry/data/packing.py               |  9 ++++-
 llmfoundry/data/text_data.py             |  7 ++--
 llmfoundry/utils/builders.py             | 13 +++----
 llmfoundry/utils/config_utils.py         |  2 +-
 scripts/eval/eval.py                     |  7 +++-
 scripts/train/train.py                   |  9 ++++-
 tests/test_dataloader.py                 | 32 +++++-----------
 tests/test_hf_config.py                  |  7 +++-
 tests/test_hf_mpt_gen.py                 |  9 ++++-
 tests/test_model.py                      | 49 ++++++++++++++++++++----
 12 files changed, 101 insertions(+), 61 deletions(-)

diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py
index 443777668c..e4b018ab05 100644
--- a/llmfoundry/data/denoising.py
+++ b/llmfoundry/data/denoising.py
@@ -864,13 +864,10 @@ def _format_tokens_for_decoder_only(
     cfg = om.create(cfg)
     device_batch_size = 2
 
-    tokenizer_cfg = {
-        'name': 'EleutherAI/gpt-neox-20b' if decoder_only else 't5-base',
-        'kwargs': {}
-    }
-    tokenizer_cfg['kwargs'] = {'model_max_length': cfg.dataset.max_seq_len}
-    tokenizer_cfg = om.create(tokenizer_cfg)
-    tokenizer = build_tokenizer(tokenizer_cfg)
+    tokenizer_name = 'EleutherAI/gpt-neox-20b' if decoder_only else 't5-base'
+    tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len}
+    tokenizer = build_tokenizer(tokenizer_name=tokenizer_name,
+                                tokenizer_kwargs=tokenizer_kwargs)
 
     loader = build_text_denoising_dataloader(cfg, tokenizer, device_batch_size)
     assert isinstance(loader.dataset, StreamingTextDataset)
diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 004870d7b4..765457df56 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -413,10 +413,9 @@ def _build_collate_fn(dataset_cfg: DictConfig,
         'timeout': 0
     })
 
-    tokenizer_cfg = {'name': 'EleutherAI/gpt-neox-20b', 'kwargs': {}}
-    tokenizer_cfg['kwargs'] = {'model_max_length': cfg.dataset.max_seq_len}
-    tokenizer_cfg = om.create(tokenizer_cfg)
-    tokenizer = build_tokenizer(tokenizer_cfg)
+    tokenizer_name = 'EleutherAI/gpt-neox-20b'
+    tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len}
+    tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
 
     device_batch_size = 2
     dataloader = build_finetuning_dataloader(cfg, tokenizer, device_batch_size)
diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py
index 1bd03b42b4..a00e694228 100644
--- a/llmfoundry/data/packing.py
+++ b/llmfoundry/data/packing.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-from typing import Callable, Dict, List, Literal, Optional, Tuple
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
 
 import numpy as np
 import torch
@@ -360,7 +360,12 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
     # build tokenizer
     if 'tokenizer' not in cfg:
         raise ValueError('config must define tokenizer')
-    tokenizer = build_tokenizer(cfg.tokenizer)
+    tokenizer_cfg: Dict[str,
+                        Any] = om.to_container(cfg.tokenizer,
+                                               resolve=True)  # type: ignore
+    tokenizer_name = tokenizer_cfg['name']
+    tokenizer_kwargs = 
tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) # Turn off packing for the dataloader (we want raw, pre-packed examples) dataloader_cfg.dataset.packing_ratio = None diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 9ed550df41..0edcf4884a 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -327,10 +327,9 @@ def build_text_dataloader( cfg = om.create(cfg) device_batch_size = 2 - tokenizer_cfg = {'name': args.tokenizer, 'kwargs': {}} - tokenizer_cfg['kwargs'] = {'model_max_length': args.max_seq_len} - tokenizer_cfg = om.create(tokenizer_cfg) - tokenizer = build_tokenizer(tokenizer_cfg) + tokenizer_name = args.tokenizer + tokenizer_kwargs = {'model_max_length': args.max_seq_len} + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) loader = build_text_dataloader(cfg, tokenizer, device_batch_size) tokenizer = loader.dataset.tokenizer # type: ignore diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 7492295509..e4117e8a7d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -34,7 +34,7 @@ def build_icl_data_and_gauntlet( icl_tasks_config: Union[str, ListConfig], eval_gauntlet_config: Optional[Union[str, DictConfig]], - tokenizer: AutoTokenizer, + tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, icl_subset_num_batches: Optional[int] = None @@ -153,15 +153,12 @@ def build_scheduler(name: str, scheduler_config: Dict[str, Any]): raise ValueError(f'Not sure how to build scheduler: {name}') -def build_tokenizer(om_tokenizer_config: DictConfig) -> PreTrainedTokenizerBase: +def build_tokenizer( + tokenizer_name: str, + tokenizer_kwargs: Dict[str, Any]) -> PreTrainedTokenizerBase: os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' os.environ['TOKENIZERS_PARALLELISM'] = 'false' - resolved_om_tokenizer_config = om.to_container(om_tokenizer_config, - resolve=True) - tokenizer_kwargs = resolved_om_tokenizer_config.get( # type: ignore - 'kwargs', {}) - tokenizer_name = resolved_om_tokenizer_config['name'] # type: ignore tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **tokenizer_kwargs) @@ -178,7 +175,7 @@ def build_tokenizer(om_tokenizer_config: DictConfig) -> PreTrainedTokenizerBase: def build_icl_evaluators( icl_tasks: Union[str, ListConfig], - tokenizer: AutoTokenizer, + tokenizer: PreTrainedTokenizerBase, default_max_seq_len: int, default_batch_size: int, destination_dir: Optional[str] = None, diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 7e8156e34c..455433cb04 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -29,7 +29,7 @@ def pop_config(cfg: DictConfig, if not isinstance(value, DictConfig) and not isinstance( value, ListConfig): raise ValueError( - f'The key: {key} has a value: {value} that cannot be \ + f'The key {key} has a value of type {type(value)} that cannot be \ converted to a dict or list. Please check your yaml.' 
) return om.to_container(value) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index b98099c284..ef567345f9 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -100,7 +100,12 @@ def evaluate_model(model_cfg: DictConfig, dist_timeout: Union[float, int], icl_subset_num_batches: Optional[int]): print(f'Evaluating model: {model_cfg.model_name}', flush=True) # Build tokenizer and model - tokenizer = build_tokenizer(model_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(model_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) evaluators, logger_keys, eval_gauntlet_callback = build_icl_data_and_gauntlet( icl_tasks, eval_gauntlet_config, tokenizer, device_eval_batch_size, diff --git a/scripts/train/train.py b/scripts/train/train.py index a76f42e0c6..58fa67afe1 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -217,7 +217,10 @@ def main(cfg: DictConfig) -> Trainer: # Mandatory model training configs model_config: DictConfig = pop_config(cfg, 'model', must_exist=True) - tokenizer_config: DictConfig = pop_config(cfg, 'tokenizer', must_exist=True) + tokenizer_config: Dict[str, Any] = pop_config(cfg, + 'tokenizer', + must_exist=True, + convert=True) optimizer_config: Dict[str, Any] = pop_config(cfg, 'optimizer', must_exist=True, @@ -416,7 +419,9 @@ def main(cfg: DictConfig) -> Trainer: logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) # Build tokenizer - tokenizer = build_tokenizer(tokenizer_config) + tokenizer_name = tokenizer_config['name'] + tokenizer_kwargs = tokenizer_config.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) # Scheduler scheduler_name: str = scheduler_config.pop('name') diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 3cd930b85e..457265806f 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -105,10 +105,9 @@ def test_correct_padding(tokenizer_name: str, }) tokenizer = build_tokenizer( - om.create({ - 'name': tokenizer_name, - 'kwargs': {} - })) + tokenizer_name=tokenizer_name, + tokenizer_kwargs={}, + ) # Dataloaders eval_loader = build_text_dataloader( @@ -202,12 +201,8 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool, expected_keys += ['sequence_id'] tokenizer = build_tokenizer( - om.create({ - 'name': tokenizer_name, - 'kwargs': { - 'model_max_length': max_seq_len - } - })) + tokenizer_name=tokenizer_name, + tokenizer_kwargs={'model_max_length': max_seq_len}) loader = build_text_denoising_dataloader(cfg, tokenizer, device_batch_size) @@ -258,12 +253,8 @@ def test_finetuning_dataloader(decoder_only_format: bool, cfg = om.create(cfg) tokenizer = build_tokenizer( - om.create({ - 'name': tokenizer_name, - 'kwargs': { - 'model_max_length': max_seq_len - } - })) + tokenizer_name=tokenizer_name, + tokenizer_kwargs={'model_max_length': max_seq_len}) device_batch_size = 2 @@ -332,12 +323,9 @@ def test_finetuning_dataloader_small_data(dataset_size: int, cfg = om.create(cfg) tokenizer = build_tokenizer( - om.create({ - 'name': tokenizer_name, - 'kwargs': { - 'model_max_length': max_seq_len - } - })) + tokenizer_name=tokenizer_name, + tokenizer_kwargs={'model_max_length': max_seq_len}, + ) expected_keys = ['input_ids', 'attention_mask', 'labels'] expected_keys += ['bidirectional_mask'] diff --git a/tests/test_hf_config.py b/tests/test_hf_config.py index 
e5399e716c..8a1fb49398 100644 --- a/tests/test_hf_config.py +++ b/tests/test_hf_config.py @@ -71,7 +71,12 @@ def test_hf_config_override( test_cfg.precision = 'fp16' test_cfg.model.attn_config = {'attn_impl': 'torch', 'alibi': True} - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model, tokenizer) diff --git a/tests/test_hf_mpt_gen.py b/tests/test_hf_mpt_gen.py index 969f42eca3..68cef14c43 100644 --- a/tests/test_hf_mpt_gen.py +++ b/tests/test_hf_mpt_gen.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict + import pytest from composer.core.precision import get_precision_context from composer.utils import get_device, reproducibility @@ -44,7 +46,12 @@ def test_init_hfhub_mpt(device: str, attn_impl: str): }) # build tokenizer - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) # build model model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model, diff --git a/tests/test_model.py b/tests/test_model.py index f66b5382c3..732681a38b 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -74,7 +74,11 @@ def get_objs(conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml'): test_cfg.device_eval_batch_size = 2 test_cfg.device_train_microbatch_size = 2 - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(test_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model, tokenizer) @@ -221,7 +225,11 @@ def test_full_forward_and_backward_gpt2_small(prefixlm: bool, else: neo_cfg.model.name = 'hf_causal_lm' - tokenizer = build_tokenizer(neo_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(neo_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(neo_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) model = COMPOSER_MODEL_REGISTRY[neo_cfg.model.name](neo_cfg.model, tokenizer).to(device) @@ -264,7 +272,11 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): t5_cfg.device = device t5_cfg.max_seq_len = 16 - tokenizer = build_tokenizer(t5_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(t5_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(t5_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) model = COMPOSER_MODEL_REGISTRY[t5_cfg.model.name](t5_cfg.model, tokenizer).to(device) @@ -316,7 +328,11 @@ def test_determinism(attn_impl: str, precision: torch.dtype): test_cfg.model.init_device = 'cuda:0' test_cfg.device = 'cuda:0' - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(test_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) model_1 = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model, 
tokenizer) @@ -381,7 +397,11 @@ def test_loss_fn(): reproducibility.seed_all(test_cfg.get('global_seed', 42)) - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(test_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) model_1 = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model, tokenizer) @@ -440,7 +460,11 @@ def test_opt_wrapping(prefixlm: bool): } config = DictConfig(conf) - tokenizer = build_tokenizer(config.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(config.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(config.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) if prefixlm: model = ComposerHFPrefixLM(config.model, tokenizer) @@ -1388,7 +1412,12 @@ def test_hf_init(tmp_path: pathlib.Path, model = AutoModelForCausalLM.from_pretrained(save_path, trust_remote_code=True) - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(test_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) + optimizer = DecoupledAdamW(model.parameters(), lr=1e-5, betas=tuple([0.9, 0.99])) @@ -1435,7 +1464,11 @@ def test_head_dim_8_triton_mqa_attn(batch_size: int = 2): ) test_cfg.device = torch.cuda.current_device() - tokenizer = build_tokenizer(test_cfg.tokenizer) + tokenizer_cfg: Dict[str, + Any] = om.to_container(test_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer = build_tokenizer(test_cfg.tokenizer.name, + tokenizer_cfg.get('kwargs', {})) mpt = MPTForCausalLM(hf_config)
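
Usage sketch (not part of the patch): a minimal example of the new build_tokenizer
calling convention, mirroring the pattern this diff applies in scripts/train/train.py
and scripts/eval/eval.py. The tokenizer section below is written inline for
illustration (in practice it comes from the training/eval YAML), and the
model_max_length value is a placeholder, not taken from this PR's configs.

    from typing import Any, Dict

    from omegaconf import OmegaConf as om

    from llmfoundry.utils.builders import build_tokenizer

    # Inline stand-in for the `tokenizer` section of a YAML config;
    # model_max_length here is illustrative only.
    cfg = om.create({
        'tokenizer': {
            'name': 'EleutherAI/gpt-neox-20b',
            'kwargs': {
                'model_max_length': 2048
            },
        }
    })

    # New convention: resolve the DictConfig to a plain dict, then pass the
    # tokenizer name and kwargs to build_tokenizer explicitly.
    tokenizer_cfg: Dict[str, Any] = om.to_container(
        cfg.tokenizer, resolve=True)  # type: ignore
    tokenizer_name = tokenizer_cfg['name']
    tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
    tokenizer = build_tokenizer(tokenizer_name=tokenizer_name,
                                tokenizer_kwargs=tokenizer_kwargs)

build_tokenizer forwards tokenizer_kwargs directly to AutoTokenizer.from_pretrained,
so anything previously nested under tokenizer.kwargs in a YAML keeps the same meaning
under the new signature.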