Skip to content

Commit

Permalink
Support for using tiktoken tokenizers (#610)
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg committed Sep 21, 2023
1 parent 512d4ea commit a8e8783
Show file tree
Hide file tree
Showing 16 changed files with 650 additions and 104 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ repos:
hooks:
- id: yapf
name: yapf
exclude: tests/horrible_strings.py
description: A formatter for Python files.
entry: yapf
args: [-i, -vv, -p] # inplace
Expand Down Expand Up @@ -50,6 +51,7 @@ repos:
- id: debug-statements
- id: destroyed-symlinks
- id: double-quote-string-fixer
exclude: tests/horrible_strings.py
- id: end-of-file-fixer
- id: fix-byte-order-marker
- id: mixed-line-ending
Expand Down Expand Up @@ -93,6 +95,7 @@ repos:
hooks:
- id: pyright
name: pyright
exclude: tests/horrible_strings.py
entry: pyright
language: node
types: [python]
Expand All @@ -104,6 +107,7 @@ repos:
hooks:
- id: trufflehog
name: secret scan
exclude: tests/horrible_strings.py
entry: trufflehog filesystem ./
args:
- --only-verified
Expand Down
2 changes: 2 additions & 0 deletions llmfoundry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
MPTForCausalLM, MPTModel,
MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper

except ImportError as e:
try:
Expand Down Expand Up @@ -64,6 +65,7 @@
'build_alibi_bias',
'optim',
'utils',
'TiktokenTokenizerWrapper',
]

__version__ = '0.2.0'
3 changes: 3 additions & 0 deletions llmfoundry/callbacks/monolithic_ckpt_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def _save_checkpoint(self, state: State, logger: Logger) -> None:
) if self.upload_to_object_store else contextlib.nullcontext(
enter_result=save_dir)
with dir_context_mgr as temp_save_dir:
# pyright doesn't know about enter_result
assert isinstance(temp_save_dir, str)

save_path = str(Path(temp_save_dir) / Path(filename))
dirname = os.path.dirname(save_path)
if dirname:
Expand Down
3 changes: 1 addition & 2 deletions llmfoundry/models/inference_api_wrapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
from llmfoundry.models.inference_api_wrapper.interface import \
InferenceAPIEvalWrapper
from llmfoundry.models.inference_api_wrapper.openai_causal_lm import (
OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper, OpenAITokenizerWrapper)
OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper)

__all__ = [
'OpenAICausalLMEvalWrapper',
'OpenAIChatAPIEvalWrapper',
'OpenAITokenizerWrapper',
'InferenceAPIEvalWrapper',
]
93 changes: 6 additions & 87 deletions llmfoundry/models/inference_api_wrapper/openai_causal_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,102 +11,21 @@
import torch
from composer.core.types import Batch
from composer.utils.import_helpers import MissingConditionalImportError
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
from transformers import AutoTokenizer

log = logging.getLogger(__name__)

from llmfoundry.models.inference_api_wrapper.interface import \
InferenceAPIEvalWrapper

__all__ = [
'OpenAICausalLMEvalWrapper', 'OpenAIChatAPIEvalWrapper',
'OpenAITokenizerWrapper'
'OpenAICausalLMEvalWrapper',
'OpenAIChatAPIEvalWrapper',
]

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

MAX_RETRIES = 10


class OpenAITokenizerWrapper(AutoTokenizer):
# this API is experimental and for evaluation only. It is subject to change as we add support for training
def __init__(self, name: str) -> None:
try:
import tiktoken
except ImportError as e:
raise MissingConditionalImportError(
extra_deps_group='openai',
conda_package='tiktoken',
conda_channel='conda-forge') from e
self.tokenizer = tiktoken.encoding_for_model(name)

def __call__(self, x: str, add_special_tokens: bool = False):
if add_special_tokens:
raise ValueError(
'OpenAITokenizerWrapper only supports add_special_tokens=False')
return self.encode(x)

def encode(self,
x: Union[str, List[str]],
add_special_tokens: bool = False):
if add_special_tokens:
raise ValueError(
'OpenAITokenizerWrapper only supports add_special_tokens=False')
if isinstance(x, str):
return {
'input_ids':
self.tokenizer.encode(x, allowed_special={'<|endoftext|>'})
}
elif isinstance(x,
list): # pyright: ignore [reportUnnecessaryIsInstance]
return {
'input_ids':
self.tokenizer.encode_batch(
x, allowed_special={'<|endoftext|>'})
}
else:
raise ValueError(
f'`encode` argument must be str or List[str], got: {type(x)}')

def decode(
self,
x: Union[List[int], List[List[int]]],
):
if len(x) > 0 and isinstance(x[0], list):
return self.tokenizer.decode_batch(
x) # pyright: ignore [reportGeneralTypeIssues]
else:
assert isinstance(x, list)
return self.tokenizer.decode(
x) # pyright: ignore [reportGeneralTypeIssues]

@property
def pad_token_id(self):
return self.tokenizer.eot_token

@property
def eos_token_id(self):
return self.tokenizer.eot_token

@property
def vocab_size(self):
return self.tokenizer.n_vocab

def construct_logit_tensor(self, logprobs: Dict[str, float]):
"""Construct tensor of shape (vocab_size,) mapping words to logprobs.
Args:
logprobs (Dict[str, float]): Dictionary mapping tokens to log probabilities assigned to them by the model.
"""
tensor = torch.tensor([min(logprobs.values()) - 1] * (self.vocab_size))
for k in logprobs:
encoding = self.encode(k)['input_ids']
idx = encoding[0]
tensor[idx] = logprobs[k]
return tensor


class OpenAIEvalInterface(InferenceAPIEvalWrapper):

def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None:
Expand Down Expand Up @@ -185,7 +104,7 @@ def retokenize(self, tokens: List[int], cont_idxs: List[int]):
re-tokenize with the space removed.
"""
original_len = len(tokens)
retokenized_continuation = self.tokenizer.encode(
retokenized_continuation = self.tokenizer(
self.tokenizer.decode(tokens[cont_idxs[0]:cont_idxs[-1] +
1]).strip())['input_ids']

Expand Down Expand Up @@ -275,8 +194,8 @@ def process_result(self, completion: Optional[dict]):
assert isinstance(completion, dict)
if len(completion['choices']) > 0:
tensors = []
for t in self.tokenizer.encode(completion['choices'][0]['message']
['content'])['input_ids']:
for t in self.tokenizer(completion['choices'][0]['message']
['content'])['input_ids']:
tensors.append(
self.tokenizer.construct_logit_tensor(
{self.tokenizer.decode([t]): 0.0}))
Expand Down
8 changes: 8 additions & 0 deletions llmfoundry/tokenizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

__all__ = [
'TiktokenTokenizerWrapper',
]
Loading

0 comments on commit a8e8783

Please sign in to comment.