diff --git a/src/helm/benchmark/tokenizer_config_registry.py b/src/helm/benchmark/tokenizer_config_registry.py
index ab87c601c9..d2d242d150 100644
--- a/src/helm/benchmark/tokenizer_config_registry.py
+++ b/src/helm/benchmark/tokenizer_config_registry.py
@@ -26,7 +26,11 @@ class TokenizerConfig:
tokenizer_spec: TokenizerSpec
"""Specification for instantiating the client for this tokenizer."""
- # TODO: Add `end_of_text_token`` and `prefix_token``
+ end_of_text_token: Optional[str] = None
+ """The end of text token."""
+
+ prefix_token: Optional[str] = None
+ """The prefix token."""
@dataclass(frozen=True)
diff --git a/src/helm/common/object_spec.py b/src/helm/common/object_spec.py
index 8fab448960..5669daeb33 100644
--- a/src/helm/common/object_spec.py
+++ b/src/helm/common/object_spec.py
@@ -1,6 +1,6 @@
import importlib
import dataclasses
-from dataclasses import dataclass
+from dataclasses import dataclass, field
import inspect
from typing import Any, Callable, Dict, Optional, Tuple, Hashable, Type, TypeVar
@@ -13,7 +13,7 @@ class ObjectSpec:
class_name: str
# Arguments used to construct the scenario
- args: Dict[str, Any]
+ args: Dict[str, Any] = field(default_factory=dict)
def __hash__(self):
def get_arg_value(key: str) -> Any:
diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml
index 0cf211eaad..48863c1d47 100644
--- a/src/helm/config/tokenizer_configs.yaml
+++ b/src/helm/config/tokenizer_configs.yaml
@@ -1,62 +1,208 @@
tokenizer_configs:
- # ========== AI21 Labs ========== #
+
+  # ========== AI21 ========== #
- name: ai21/j1
tokenizer_spec:
class_name: "helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer"
- args: {}
- # =============================== #
-
- # ========== Aleph Alpha ========== #
+ end_of_text_token: " "
+ prefix_token: ""
+ # ========================== #
+
+ # ========== AlephAlpha ========== #
- name: AlephAlpha/luminous-base
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
- args: {}
+ end_of_text_token: ""
+ prefix_token: ""
- name: AlephAlpha/luminous-extended
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
- args: {}
+ end_of_text_token: ""
+ prefix_token: ""
- name: AlephAlpha/luminous-supreme
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
- args: {}
+ end_of_text_token: ""
+ prefix_token: ""
- name: AlephAlpha/luminous-world
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
- args: {}
- # ================================= #
-
- # =========== Anthropic =========== #
+ end_of_text_token: ""
+ prefix_token: ""
+ # ================================ #
+
+ # ========== Anthropic ========== #
- name: anthropic/claude
tokenizer_spec:
class_name: "helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
- args: {}
- # ================================= #
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ # =============================== #
+
+  # ========== BigCode ========== #
+ - name: bigcode/santacoder
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ - name: bigcode/starcoder
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ # ============================= #
- # =========== BigScience =========== #
+  # ========== BigScience ========== #
- name: bigscience/bloom
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
- args: {}
+ end_of_text_token: ""
+ prefix_token: ""
- name: bigscience/T0pp
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
- args: {}
- # ================================== #
+ end_of_text_token: ""
+ prefix_token: ""
+ # ================================ #
- # =========== BigCode =========== #
- - name: bigcode/santacoder
+ # ========== Cohere ========== #
+ - name: cohere/cohere
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
+ end_of_text_token: ""
+ prefix_token: ":"
+ # ============================ #
+
+ # ========== EleutherAI ========== #
+ - name: EleutherAI/gpt-j-6B
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
- args: {}
- - name: bigcode/starcoder
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ - name: EleutherAI/gpt-neox-20b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ # ================================ #
+
+ # ========== Facebook ========== #
+ - name: facebook/opt-66b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # ============================== #
+
+ # ========== Google ========== #
+ - name: google/t5-11b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: google/flan-t5-xxl
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: google/ul2
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # ============================ #
+
+ # ========== Hf-internal-testing ========== #
+ - name: hf-internal-testing/llama-tokenizer
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # ========================================= #
+
+ # ========== HuggingFaceM4 ========== #
+ - name: HuggingFaceM4/idefics-9b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: HuggingFaceM4/idefics-9b-instruct
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: HuggingFaceM4/idefics-80b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: HuggingFaceM4/idefics-80b-instruct
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # =================================== #
+
+  # ========== HuggingFace ========== #
+ - name: huggingface/gpt2
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ # ================================= #
+
+ # ========== Meta-llama ========== #
+ - name: meta-llama/Llama-2-7b-hf
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # ================================ #
+
+ # ========== Mistralai ========== #
+ - name: mistralai/Mistral-7B-v0.1
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
- args: {}
+ end_of_text_token: ""
+ prefix_token: ""
# =============================== #
- # =========== Cohere =========== #
- - name: cohere/cohere
+  # ========== NeurIPS ========== #
+ - name: neurips/local
tokenizer_spec:
- class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
- args: {}
- # ============================== #
\ No newline at end of file
+ class_name: "helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ # ============================= #
+
+  # ========== OpenAI ========== #
+ - name: openai/cl100k_base
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ # ============================ #
+
+ # ========== Tiiuae ========== #
+ - name: tiiuae/falcon-7b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+    prefix_token: null  # NOTE(review): only entry using null instead of a string ("" elsewhere) — confirm None vs "" is intended for falcon-7b
+ # ============================ #
+
+ # ========== TsinghuaKEG ========== #
+ - name: TsinghuaKEG/ice
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.ice_tokenizer.ICETokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # ================================= #
+
+ # ========== Yandex ========== #
+ - name: Yandex/yalm
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ # ============================ #
\ No newline at end of file