
Commit

Added all tokenizers
JosselinSomervilleRoberts committed Nov 3, 2023
1 parent: f27ac03 · commit: 3b875cb
Showing 3 changed files with 181 additions and 31 deletions.
6 changes: 5 additions & 1 deletion src/helm/benchmark/tokenizer_config_registry.py
@@ -26,7 +26,11 @@ class TokenizerConfig:
tokenizer_spec: TokenizerSpec
"""Specification for instantiating the client for this tokenizer."""

# TODO: Add `end_of_text_token`` and `prefix_token``
end_of_text_token: Optional[str] = None
"""The end of text token."""

prefix_token: Optional[str] = None
"""The prefix token."""


@dataclass(frozen=True)
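For context on the registry change above, here is a minimal, self-contained sketch (not part of the commit) of how the two new optional fields sit on the config dataclass. The field names mirror the diff, but the classes below are simplified stand-ins for illustration; the real TokenizerConfig and TokenizerSpec in helm.benchmark.tokenizer_config_registry carry more machinery.

from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass(frozen=True)
class TokenizerSpec:
    # Simplified stand-in: which class implements the tokenizer, plus its constructor args.
    class_name: str
    args: Dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
class TokenizerConfig:
    # Simplified stand-in mirroring the fields touched by this commit.
    name: str
    tokenizer_spec: TokenizerSpec
    # Both new fields are Optional with a None default, so configs that do not
    # declare them still load unchanged.
    end_of_text_token: Optional[str] = None
    prefix_token: Optional[str] = None


# Example mirroring the huggingface/gpt2 entry in tokenizer_configs.yaml:
gpt2 = TokenizerConfig(
    name="huggingface/gpt2",
    tokenizer_spec=TokenizerSpec(
        class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
    ),
    end_of_text_token="<|endoftext|>",
    prefix_token="<|endoftext|>",
)
print(gpt2.prefix_token)  # <|endoftext|>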
4 changes: 2 additions & 2 deletions src/helm/common/object_spec.py
@@ -1,6 +1,6 @@
import importlib
import dataclasses
from dataclasses import dataclass
from dataclasses import dataclass, field
import inspect
from typing import Any, Callable, Dict, Optional, Tuple, Hashable, Type, TypeVar

@@ -13,7 +13,7 @@ class ObjectSpec:
class_name: str

# Arguments used to construct the scenario
args: Dict[str, Any]
args: Dict[str, Any] = field(default_factory=dict)

def __hash__(self):
def get_arg_value(key: str) -> Any:
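The object_spec.py change gives `args` an empty-dict default via `field(default_factory=dict)` (a plain `= {}` default is rejected by dataclasses for mutable types). The practical effect, visible throughout the YAML diff below, is that specs no longer need an explicit `args: {}`. A small sketch of the behavior, using a simplified stand-in rather than the real helm.common.object_spec.ObjectSpec:

from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class ObjectSpec:
    # Simplified stand-in showing only the change in this diff.
    class_name: str
    # default_factory=dict: each instance gets its own empty dict when args is omitted.
    args: Dict[str, Any] = field(default_factory=dict)


# Previously every caller had to pass args explicitly (hence the `args: {}`
# lines in the old YAML); now it can be left out:
spec = ObjectSpec(class_name="helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer")
print(spec.args)  # {}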
202 changes: 174 additions & 28 deletions src/helm/config/tokenizer_configs.yaml
@@ -1,62 +1,208 @@
tokenizer_configs:
# ========== AI21 Labs ========== #

# ========== Ai21 ========== #
- name: ai21/j1
tokenizer_spec:
class_name: "helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer"
args: {}
# =============================== #

# ========== Aleph Alpha ========== #
end_of_text_token: " "
prefix_token: ""
# ========================== #

# ========== AlephAlpha ========== #
- name: AlephAlpha/luminous-base
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
args: {}
end_of_text_token: ""
prefix_token: ""
- name: AlephAlpha/luminous-extended
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
args: {}
end_of_text_token: ""
prefix_token: ""
- name: AlephAlpha/luminous-supreme
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
args: {}
end_of_text_token: ""
prefix_token: ""
- name: AlephAlpha/luminous-world
tokenizer_spec:
class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
args: {}
# ================================= #

# =========== Anthropic =========== #
end_of_text_token: ""
prefix_token: ""
# ================================ #

# ========== Anthropic ========== #
- name: anthropic/claude
tokenizer_spec:
class_name: "helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
args: {}
# ================================= #
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
# =============================== #

# ========== Bigcode ========== #
- name: bigcode/santacoder
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
- name: bigcode/starcoder
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
# ============================= #

# =========== BigScience =========== #
# ========== Bigscience ========== #
- name: bigscience/bloom
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
args: {}
end_of_text_token: "</s>"
prefix_token: "</s>"
- name: bigscience/T0pp
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
args: {}
# ================================== #
end_of_text_token: "</s>"
prefix_token: ""
# ================================ #

# =========== BigCode =========== #
- name: bigcode/santacoder
# ========== Cohere ========== #
- name: cohere/cohere
tokenizer_spec:
class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
end_of_text_token: ""
prefix_token: ":"
# ============================ #

# ========== EleutherAI ========== #
- name: EleutherAI/gpt-j-6B
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
args: {}
- name: bigcode/starcoder
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
- name: EleutherAI/gpt-neox-20b
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
# ================================ #

# ========== Facebook ========== #
- name: facebook/opt-66b
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "</s>"
# ============================== #

# ========== Google ========== #
- name: google/t5-11b
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: ""
- name: google/flan-t5-xxl
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: ""
- name: google/ul2
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: ""
# ============================ #

# ========== Hf-internal-testing ========== #
- name: hf-internal-testing/llama-tokenizer
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "<s>"
# ========================================= #

# ========== HuggingFaceM4 ========== #
- name: HuggingFaceM4/idefics-9b
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "<s>"
- name: HuggingFaceM4/idefics-9b-instruct
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "<s>"
- name: HuggingFaceM4/idefics-80b
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "<s>"
- name: HuggingFaceM4/idefics-80b-instruct
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "<s>"
# =================================== #

# ========== Huggingface ========== #
- name: huggingface/gpt2
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
# ================================= #

# ========== Meta-llama ========== #
- name: meta-llama/Llama-2-7b-hf
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "</s>"
prefix_token: "<s>"
# ================================ #

# ========== Mistralai ========== #
- name: mistralai/Mistral-7B-v0.1
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
args: {}
end_of_text_token: "</s>"
prefix_token: "<s>"
# =============================== #

# =========== Cohere =========== #
- name: cohere/cohere
# ========== Neurips ========== #
- name: neurips/local
tokenizer_spec:
class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
args: {}
# ============================== #
class_name: "helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
# ============================= #

# ========== Openai ========== #
- name: openai/cl100k_base
tokenizer_spec:
class_name: "helm.proxy.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: "<|endoftext|>"
# ============================ #

# ========== Tiiuae ========== #
- name: tiiuae/falcon-7b
tokenizer_spec:
class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
end_of_text_token: "<|endoftext|>"
prefix_token: null
# ============================ #

# ========== TsinghuaKEG ========== #
- name: TsinghuaKEG/ice
tokenizer_spec:
class_name: "helm.proxy.tokenizers.ice_tokenizer.ICETokenizer"
end_of_text_token: "</s>"
prefix_token: ""
# ================================= #

# ========== Yandex ========== #
- name: Yandex/yalm
tokenizer_spec:
class_name: "helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer"
end_of_text_token: "</s>"
prefix_token: "</s>"
# ============================ #
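Finally, a quick way to sanity-check the entries added to this YAML file, independent of HELM's own registry code (a sketch using PyYAML, not the project's loader; the file path and key names are taken from the diff above):

import yaml  # pip install pyyaml

with open("src/helm/config/tokenizer_configs.yaml") as f:
    configs = yaml.safe_load(f)["tokenizer_configs"]

# Index the list of entries by their "name" key.
by_name = {entry["name"]: entry for entry in configs}

gpt2 = by_name["huggingface/gpt2"]
print(gpt2["tokenizer_spec"]["class_name"])             # ...HuggingFaceTokenizer
print(gpt2["end_of_text_token"], gpt2["prefix_token"])  # <|endoftext|> <|endoftext|>

# Some entries leave prefix_token null (e.g. tiiuae/falcon-7b),
# so use .get() when a field is optional.
print(by_name["tiiuae/falcon-7b"].get("prefix_token"))  # None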
