diff --git a/src/helm/benchmark/tokenizer_config_registry.py b/src/helm/benchmark/tokenizer_config_registry.py
index ab87c601c9..d2d242d150 100644
--- a/src/helm/benchmark/tokenizer_config_registry.py
+++ b/src/helm/benchmark/tokenizer_config_registry.py
@@ -26,7 +26,11 @@ class TokenizerConfig:
     tokenizer_spec: TokenizerSpec
     """Specification for instantiating the client for this tokenizer."""
 
-    # TODO: Add `end_of_text_token`` and `prefix_token``
+    end_of_text_token: Optional[str] = None
+    """The end of text token."""
+
+    prefix_token: Optional[str] = None
+    """The prefix token."""
 
 
 @dataclass(frozen=True)
diff --git a/src/helm/common/object_spec.py b/src/helm/common/object_spec.py
index 8fab448960..5669daeb33 100644
--- a/src/helm/common/object_spec.py
+++ b/src/helm/common/object_spec.py
@@ -1,6 +1,6 @@
 import importlib
 import dataclasses
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import inspect
 from typing import Any, Callable, Dict, Optional, Tuple, Hashable, Type, TypeVar
 
@@ -13,7 +13,7 @@ class ObjectSpec:
     class_name: str
 
     # Arguments used to construct the scenario
-    args: Dict[str, Any]
+    args: Dict[str, Any] = field(default_factory=dict)
 
     def __hash__(self):
         def get_arg_value(key: str) -> Any:
diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml
index 0cf211eaad..48863c1d47 100644
--- a/src/helm/config/tokenizer_configs.yaml
+++ b/src/helm/config/tokenizer_configs.yaml
@@ -1,62 +1,208 @@
 tokenizer_configs:
-  # ========== AI21 Labs ========== #
+
+  # ========== Ai21 ========== #
   - name: ai21/j1
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer"
-      args: {}
-  # =============================== #
-
-  # ========== Aleph Alpha ========== #
+    end_of_text_token: " "
+    prefix_token: ""
+  # ========================== #
+
+  # ========== AlephAlpha ========== #
   - name: AlephAlpha/luminous-base
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
-      args: {}
+    end_of_text_token: ""
+    prefix_token: ""
   - name: AlephAlpha/luminous-extended
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
-      args: {}
+    end_of_text_token: ""
+    prefix_token: ""
   - name: AlephAlpha/luminous-supreme
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
-      args: {}
+    end_of_text_token: ""
+    prefix_token: ""
   - name: AlephAlpha/luminous-world
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
-      args: {}
-  # ================================= #
-
-  # =========== Anthropic =========== #
+    end_of_text_token: ""
+    prefix_token: ""
+  # ================================ #
+
+  # ========== Anthropic ========== #
   - name: anthropic/claude
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
-      args: {}
-  # ================================= #
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  # =============================== #
+
+  # ========== Bigcode ========== #
+  - name: bigcode/santacoder
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  - name: bigcode/starcoder
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  # ============================= #
 
-  # =========== BigScience =========== #
+  # ========== Bigscience ========== #
   - name: bigscience/bloom
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
-      args: {}
+    end_of_text_token: ""
+    prefix_token: ""
   - name: bigscience/T0pp
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
-      args: {}
-  # ================================== #
+    end_of_text_token: ""
+    prefix_token: ""
+  # ================================ #
 
-  # =========== BigCode =========== #
-  - name: bigcode/santacoder
+  # ========== Cohere ========== #
+  - name: cohere/cohere
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
+    end_of_text_token: ""
+    prefix_token: ":"
+  # ============================ #
+
+  # ========== EleutherAI ========== #
+  - name: EleutherAI/gpt-j-6B
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
-      args: {}
-  - name: bigcode/starcoder
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  - name: EleutherAI/gpt-neox-20b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  # ================================ #
+
+  # ========== Facebook ========== #
+  - name: facebook/opt-66b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # ============================== #
+
+  # ========== Google ========== #
+  - name: google/t5-11b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  - name: google/flan-t5-xxl
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  - name: google/ul2
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # ============================ #
+
+  # ========== Hf-internal-testing ========== #
+  - name: hf-internal-testing/llama-tokenizer
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # ========================================= #
+
+  # ========== HuggingFaceM4 ========== #
+  - name: HuggingFaceM4/idefics-9b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  - name: HuggingFaceM4/idefics-9b-instruct
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  - name: HuggingFaceM4/idefics-80b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  - name: HuggingFaceM4/idefics-80b-instruct
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # =================================== #
+
+  # ========== Huggingface ========== #
+  - name: huggingface/gpt2
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  # ================================= #
+
+  # ========== Meta-llama ========== #
+  - name: meta-llama/Llama-2-7b-hf
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # ================================ #
+
+  # ========== Mistralai ========== #
+  - name: mistralai/Mistral-7B-v0.1
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
-      args: {}
+    end_of_text_token: ""
+    prefix_token: ""
   # =============================== #
 
-  # =========== Cohere =========== #
-  - name: cohere/cohere
+  # ========== Neurips ========== #
+  - name: neurips/local
     tokenizer_spec:
-      class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
-      args: {}
-  # ============================== #
\ No newline at end of file
+      class_name: "helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  # ============================= #
+
+  # ========== Openai ========== #
+  - name: openai/cl100k_base
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+  # ============================ #
+
+  # ========== Tiiuae ========== #
+  - name: tiiuae/falcon-7b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: null
+  # ============================ #
+
+  # ========== TsinghuaKEG ========== #
+  - name: TsinghuaKEG/ice
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.ice_tokenizer.ICETokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # ================================= #
+
+  # ========== Yandex ========== #
+  - name: Yandex/yalm
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+  # ============================ #
\ No newline at end of file
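Note: for reference, here is a minimal sketch (not part of the patch) of what the two Python changes combine to allow: a TokenizerSpec can now be constructed without an explicit args={}, and a TokenizerConfig carries the two new token fields. It assumes TokenizerSpec is defined alongside TokenizerConfig in tokenizer_config_registry.py, and that TokenizerConfig also declares the `name` field that each YAML entry sets (neither appears in the hunks above).

# Sketch only: mirrors the "huggingface/gpt2" entry from tokenizer_configs.yaml.
# Assumes TokenizerSpec and a `name: str` field live in tokenizer_config_registry.
from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec

gpt2_config = TokenizerConfig(
    name="huggingface/gpt2",
    tokenizer_spec=TokenizerSpec(
        # No `args` needed: ObjectSpec.args now defaults to an empty dict.
        class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
    ),
    end_of_text_token="<|endoftext|>",
    prefix_token="<|endoftext|>",
)
assert gpt2_config.tokenizer_spec.args == {}

The field(default_factory=dict) change is what makes the omitted args blocks in the YAML legal: dataclasses reject a mutable literal default such as `args: Dict[str, Any] = {}` at class-definition time, so a factory is the standard way to give each instance its own fresh empty dict.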