Skip to content

Commit

Permalink
Allow HuggingFaceClient and HuggingFaceTokenizer to be used with any Hugging Face Hub model name
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai committed Aug 16, 2023
1 parent 6d18584 commit 9880044
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 34 deletions.
24 changes: 11 additions & 13 deletions src/helm/proxy/clients/huggingface_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@
from threading import Lock


# Map of HELM model name to Hugging Face Hub model name where they differ.
# Models NOT listed here are assumed to use the same name on the Hub
# (see the fallback branch in HuggingFaceClient.get_model_server_instance).
_KNOWN_MODEL_ALIASES: Dict[str, str] = {
    "huggingface/gpt2": "gpt2",
    "huggingface/starcoder": "bigcode/starcoder",
}


class HuggingFaceServer:
def __init__(self, model_config: HuggingFaceModelConfig):
if torch.cuda.is_available():
Expand Down Expand Up @@ -154,23 +161,14 @@ def __init__(self, cache_config: CacheConfig):
self.cache = Cache(cache_config)
self.model_server_instances: Dict[str, HuggingFaceServer] = {}

def get_model_server_instance(self, model: str) -> HuggingFaceServer:
    """Return a (singleton) HuggingFaceServer for the given HELM model name.

    Resolution order:
      1. A registered model config from get_huggingface_model_config()
         (e.g. models enabled via --enable-huggingface-models).
      2. A known alias in _KNOWN_MODEL_ALIASES, mapped to the differing
         Hugging Face Hub model name.
      3. Otherwise the HELM model name itself is assumed to also be a
         valid Hugging Face Hub model name.

    :param model: HELM model name, e.g. "huggingface/gpt2".
    :return: the shared HuggingFaceServer instance for that model.
    """
    model_config = get_huggingface_model_config(model)
    if not model_config:
        # No registered config: map through the alias table, falling back
        # to using the HELM name directly as the Hub name.
        hub_model_name = _KNOWN_MODEL_ALIASES.get(model, model)
        model_config = HuggingFaceHubModelConfig.from_string(hub_model_name)
    return _get_singleton_server(model_config)

def make_request(self, request: Request) -> RequestResult:
Expand Down
23 changes: 2 additions & 21 deletions src/helm/proxy/clients/huggingface_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,7 @@
)


# Tokenizer names where the HELM tokenizer name and the Hugging Face tokenizer name
# are identical.
# NOTE(review): entries that differ between HELM and the Hub live in
# _KNOWN_TOKENIZER_ALIASES instead.
_KNOWN_TOKENIZER_NAMES: Set[str] = {
    "EleutherAI/gpt-j-6B",  # Not a typo: Named "gpt-j-6B" instead of "gpt-j-6b" in Hugging Face
    "EleutherAI/gpt-neox-20b",
    "bigscience/bloom",
    "bigscience/T0pp",
    "facebook/opt-66b",
    "google/ul2",
    "google/flan-t5-xxl",
    "meta-llama/Llama-2-7b-hf",
    "bigcode/santacoder",
    "bigcode/starcoder",
    "hf-internal-testing/llama-tokenizer",
}


# Map of HELM tokenizer name to Hugging Face tokenizer name for tokenizers where they differ.
# Map of HELM tokenizer name to Hugging Face Hub tokenizer name where they differ.
_KNOWN_TOKENIZER_ALIASES: Dict[str, str] = {
"huggingface/gpt2": "gpt2",
"google/t5-11b": "t5-11b",
Expand Down Expand Up @@ -90,12 +73,10 @@ def load_tokenizer(hf_tokenizer_name: str, revision: Optional[str] = None):
revision = model_config.revision
else:
raise ValueError(f"Unrecognized Hugging Face model config: {type(model_config)})")
elif tokenizer_name in _KNOWN_TOKENIZER_NAMES:
hf_tokenizer_name = tokenizer_name
elif tokenizer_name in _KNOWN_TOKENIZER_ALIASES:
hf_tokenizer_name = _KNOWN_TOKENIZER_ALIASES[tokenizer_name]
else:
raise ValueError(f"Unsupported HuggingFace tokenizer: {tokenizer_name}")
hf_tokenizer_name = tokenizer_name

# Keep the tokenizer in memory, so we don't recreate it for future requests
HuggingFaceTokenizers.tokenizers[tokenizer_name] = load_tokenizer(hf_tokenizer_name, revision)
Expand Down

0 comments on commit 9880044

Please sign in to comment.