-
Notifications
You must be signed in to change notification settings - Fork 245
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Configurable models for NeurIPS Efficiency Challenge (#1861)
- Loading branch information
Showing
11 changed files
with
294 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import os | ||
from typing import Dict, Optional, List | ||
from dataclasses import dataclass | ||
|
||
import cattrs | ||
import yaml | ||
|
||
from helm.common.hierarchical_logger import hlog | ||
from helm.common.object_spec import ObjectSpec | ||
|
||
|
||
# Name of the YAML file (relative to a base path) that holds tokenizer configs.
# NOTE: the original constant name contains a typo ("TOKENIEZR"). It is kept
# unchanged for backward compatibility; prefer the correctly-spelled alias.
TOKENIEZR_CONFIGS_FILE = "tokenizer_configs.yaml"
TOKENIZER_CONFIGS_FILE = TOKENIEZR_CONFIGS_FILE
|
||
|
||
class TokenizerSpec(ObjectSpec):
    """Specification for constructing a tokenizer.

    Adds no fields of its own; it exists as a distinct subtype of
    `ObjectSpec` so tokenizer specs can be typed separately.
    """

    pass
|
||
|
||
@dataclass(frozen=True)
class TokenizerConfig:
    """Configuration for a tokenizer.

    Associates a tokenizer name with the spec used to instantiate it.
    Frozen so instances can be safely shared (stored as values in the
    module-level registry).
    """

    name: str
    """Name of the tokenizer."""

    tokenizer_spec: TokenizerSpec
    """Specification for instantiating the client for this tokenizer."""

    # TODO: Add `end_of_text_token`` and `prefix_token``
|
||
|
||
@dataclass(frozen=True)
class TokenizerConfigs:
    """Top-level container mirroring the tokenizer configs YAML file layout."""

    # All tokenizer configurations listed in the file.
    tokenizer_configs: List[TokenizerConfig]
|
||
|
||
# Module-level registry of tokenizer configs, keyed by tokenizer name.
# Populated by register_tokenizer_configs_from_path() and read by
# get_tokenizer_config().
_name_to_tokenizer_config: Dict[str, TokenizerConfig] = {}
|
||
|
||
def register_tokenizer_configs_from_path(path: str) -> None:
    """Read tokenizer configs from a YAML file and merge them into the registry.

    The file must deserialize (via cattrs) into a `TokenizerConfigs` object.
    Entries are keyed by tokenizer name; an entry whose name is already
    registered is overwritten.

    Args:
        path: Path to the tokenizer configs YAML file.
    """
    # No `global` statement needed: the registry dict is mutated in place,
    # never rebound, so the original `global` declaration was a no-op.
    hlog(f"Reading tokenizer configs from {path}...")
    with open(path, "r") as f:
        raw = yaml.safe_load(f)
    tokenizer_configs: TokenizerConfigs = cattrs.structure(raw, TokenizerConfigs)
    _name_to_tokenizer_config.update(
        {tokenizer_config.name: tokenizer_config for tokenizer_config in tokenizer_configs.tokenizer_configs}
    )
|
||
|
||
def maybe_register_tokenizer_configs_from_base_path(base_path: str) -> None:
    """Register tokenizer configs if the configs file exists under `base_path`.

    Silently does nothing when the file is absent.
    """
    candidate = os.path.join(base_path, TOKENIEZR_CONFIGS_FILE)
    if not os.path.exists(candidate):
        return
    register_tokenizer_configs_from_path(candidate)
|
||
|
||
def get_tokenizer_config(name: str) -> Optional[TokenizerConfig]:
    """Look up a registered tokenizer config by name; None if not registered."""
    try:
        return _name_to_tokenizer_config[name]
    except KeyError:
        return None
39 changes: 39 additions & 0 deletions
39
src/helm/benchmark/window_services/default_window_service.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from typing import Optional | ||
from .local_window_service import LocalWindowService | ||
from .tokenizer_service import TokenizerService | ||
|
||
|
||
class DefaultWindowService(LocalWindowService):
    """Window service whose limits are supplied entirely via constructor args.

    Used for configurable models where the tokenizer name and context-window
    sizes come from configuration rather than a model-specific subclass.
    """

    def __init__(
        self,
        service: TokenizerService,
        tokenizer_name: str,
        max_sequence_length: int,
        max_request_length: Optional[int] = None,
    ):
        """
        Args:
            service: Tokenizer service used by the base `LocalWindowService`.
            tokenizer_name: Name of the tokenizer to use.
            max_sequence_length: Maximum sequence length, in tokens.
            max_request_length: Optional maximum request length, in tokens;
                defaults to `max_sequence_length` when not provided.
        """
        super().__init__(service)
        self._tokenizer_name = tokenizer_name
        self._max_sequence_length = max_sequence_length
        self._max_request_length = max_request_length

    @property
    def max_sequence_length(self) -> int:
        """Maximum sequence length, in tokens."""
        return self._max_sequence_length

    @property
    def max_request_length(self) -> int:
        """Maximum request length; falls back to `max_sequence_length`."""
        # Use an explicit None check (not `or`) so that a legitimately-falsy
        # configured value such as 0 is not silently replaced.
        if self._max_request_length is not None:
            return self._max_request_length
        return self._max_sequence_length

    @property
    def end_of_text_token(self) -> str:
        # TODO: Support this
        return ""

    @property
    def tokenizer_name(self) -> str:
        """Name of the tokenizer this window service uses."""
        return self._tokenizer_name

    @property
    def prefix_token(self) -> str:
        # TODO: Support this
        return ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.