Merge remote-tracking branch 'upstream/main' into DecodingTrust
danielz02 committed Nov 29, 2023
2 parents 71979f3 + c545c49 commit c4ead28
Showing 33 changed files with 150 additions and 197 deletions.
22 changes: 11 additions & 11 deletions docs/get_helm_rank.md
@@ -21,18 +21,18 @@ now that the files are in your results directory, all HELM models will be shown

## Run Efficient-HELM

-According to [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf) a paper from IBM, which systematically analysed benchmark design choices using the HELM benchmark as an example, one can run the HELM benchmark with a fraction of the examples and still get a reliable estimation of a full run (Perlitz et al., 2023).
+According to [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf) a paper from IBM Research, which systematically analysed benchmark design choices using the HELM benchmark as an example, one can run the HELM benchmark with a fraction of the examples and still get a reliable estimation of a full run (Perlitz et al., 2023).

-Specifically, the authors calculated the CI $95\%$ of Rank Location from the real ranks as a function of the number of examples used per scenario and came up with the following tradeoffs[^1]:
+Specifically, the authors calculated the CI 95% of Rank Location from the real ranks as a function of the number of examples used per scenario and came up with the following tradeoffs[^1]:

-| Examples Per Scenario | CI $95\%$ of Rank Location | Compute saved |
-| :-------------------: | :------------------------: | :-----------: |
-| $10$ | $\pm5$ | $\times400$ |
-| $20$ | $\pm4$ | $\times200$ |
-| $50$ | $\pm3$ | $\times80$ |
-| $200$ | $\pm2$ | $\times20$ |
-| $1000$ | $\pm1$ | $\times4$ |
-| All | $\pm1$ | $\times1$ |
+| Examples Per Scenario | CI 95% of Rank Location | Compute saved |
+| :-------------------: | :---------------------: | :-----------: |
+| 10 | ±5 | X400 |
+| 20 | ±4 | X200 |
+| 50 | ±3 | X80 |
+| 200 | ±2 | X20 |
+| 1000 | ±1 | X4 |
+| All | ±1 | X1 |


Choose your point on the tradeoff: how accurate do you need your rank to be, and how long are you willing to wait? Once you have chosen, download the config and define your model (see the run sketch after this file's diff).
@@ -81,4 +81,4 @@ helm-server

```Perlitz, Y., Bandel, E., Gera, A., Arviv, O., Ein-Dor, L., Shnarch, E., Slonim, N., Shmueli-Scheuer, M. and Choshen, L., 2023. Efficient Benchmarking (of Language Models). arXiv preprint arXiv:2308.11696.```

-[^1]: Note that the quantities below are the CI $95\%$ of the rank location and are thus very conservative estimates. In our experiments, we did not experience deviations above $\pm2$ for any of the options above.
+[^1]: Note that the quantities below are the CI 95% of the rank location and are thus very conservative estimates. In our experiments, we did not experience deviations above ±2 for any of the options above.
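
As a rough sketch of what such a reduced run could look like (hedged: `--conf-paths`, `--suite`, and `--max-eval-instances` are standard `helm-run` flags, but the config and suite names below are placeholders, not part of this commit):

```bash
# Hypothetical Efficient-HELM run: cap evaluation at 50 examples per
# scenario (the ±3 / X80 row in the table above). "run_specs.conf"
# stands in for the config downloaded per the instructions above.
helm-run \
  --conf-paths run_specs.conf \
  --suite efficient-helm \
  --max-eval-instances 50

# Summarize the results and browse them locally.
helm-summarize --suite efficient-helm
helm-server
```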
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -35,6 +35,7 @@ nav:
- 'User Guide':
- 'installation.md'
- 'quick_start.md'
+- 'get_helm_rank.md'
- 'tutorial.md'
- 'benchmark.md'
- 'huggingface_models.md'
4 changes: 2 additions & 2 deletions src/helm-frontend/public/config.js
@@ -1,4 +1,4 @@
window.BENCHMARK_OUTPUT_BASE_URL =
"https://storage.googleapis.com/crfm-helm-public/";
window.SUITE = "v0.2.4";
window.RELEASE = "v0.3.0";
window.SUITE = null;
window.RELEASE = "v0.4.0";
2 changes: 1 addition & 1 deletion src/helm-frontend/src/components/Footer.tsx
@@ -5,7 +5,7 @@ export default function Footer() {
const version = getBenchmarkRelease();
return (
<div className="bottom-0 right-0 p-4 bg-white-800 text-black text-right">
-<p>Version {version}</p>
+<p>Release: {version}</p>
</div>
);
}
15 changes: 12 additions & 3 deletions src/helm-frontend/src/components/GroupsTables.tsx
@@ -8,17 +8,19 @@ interface Props {
activeGroup: number;
ignoreHref?: boolean;
sortable?: boolean;
+sortFirstMetric?: boolean;
}

export default function GroupsTables({
groupsTables,
activeGroup,
ignoreHref = false,
sortable = true,
+sortFirstMetric = true,
}: Props) {
-const [activeSortColumn, setActiveSortColumn] = useState<
-number | undefined
->();
+const [activeSortColumn, setActiveSortColumn] = useState<number | undefined>(
+sortFirstMetric ? 1 : undefined,
+);
const [activeGroupsTable, setActiveGroupsTable] = useState<GroupsTable>({
...groupsTables[activeGroup],
});
@@ -66,6 +68,13 @@ export default function GroupsTables({
});
};

+useEffect(() => {
+if (sortFirstMetric && activeSortColumn) {
+handleSort(activeSortColumn);
+}
+// eslint-disable-next-line react-hooks/exhaustive-deps
+}, [sortFirstMetric, activeSortColumn]);

return (
<div className="overflow-x-auto">
<table className="table">
7 changes: 4 additions & 3 deletions src/helm-frontend/src/components/NavDropdown.tsx
@@ -62,13 +62,14 @@ function NavDropdown() {
className="block px-4 py-2 text-md text-gray-700 hover:bg-gray-100 hover:text-gray-900"
role="menuitem"
>
-<Link to="https://crfm.stanford.edu/heim/latest/?">
+<a href="https://crfm.stanford.edu/heim/latest/?">
<div className="flex items-center">
<span>
-<strong>HEIM: </strong>Holistic evaluation of image models
+<strong>HEIM: </strong>Holistic evaluation of text-to-image
+models
</span>
</div>
-</Link>
+</a>
</div>
</div>
</div>
1 change: 1 addition & 0 deletions src/helm/benchmark/metrics/summarization_metrics.py
@@ -5,6 +5,7 @@
import pickle

import spacy
+import spacy.cli
from typing import List, Dict, Optional
from collections import defaultdict

5 changes: 2 additions & 3 deletions src/helm/proxy/clients/ai21_client.py
@@ -10,7 +10,6 @@
Sequence,
Token,
)
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence, cleanup_str
from .ai21_utils import AI21RequestError, handle_failed_request

@@ -24,8 +23,8 @@ class AI21Client(CachingClient):
COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/{model}/complete"
EXPERIMENTAL_COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/experimental/{model}/complete"

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig, url: Optional[str] = None):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig, url: Optional[str] = None):
+super().__init__(cache_config=cache_config)
self.api_key = api_key
self.url = url

5 changes: 2 additions & 3 deletions src/helm/proxy/clients/aleph_alpha_client.py
@@ -4,15 +4,14 @@

from helm.common.cache import CacheConfig
from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence


class AlephAlphaClient(CachingClient):
COMPLETION_ENDPOINT: str = "complete"

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig):
+super().__init__(cache_config=cache_config)
self.api_key: str = api_key

def _send_request(self, endpoint: str, raw_request: Dict[str, Any]) -> Dict[str, Any]:
5 changes: 3 additions & 2 deletions src/helm/proxy/clients/anthropic_client.py
@@ -57,7 +57,8 @@ class AnthropicClient(CachingClient):
PROMPT_ANSWER_START: str = "The answer is "

def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig, api_key: Optional[str] = None):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+super().__init__(cache_config=cache_config)
+self.tokenizer = tokenizer
self.api_key: Optional[str] = api_key
self._client = anthropic.Client(api_key) if api_key else None

@@ -241,7 +242,7 @@ def is_valid_logprobs_response(raw_response: str) -> bool:

def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
hlog("This client is deprecated. Please use AnthropicClient instead.")
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+super().__init__(cache_config=cache_config)
self.api_key = api_key

def make_request(self, request: Request) -> RequestResult:
33 changes: 5 additions & 28 deletions src/helm/proxy/clients/auto_client.py
@@ -1,6 +1,6 @@
import os
from dataclasses import replace
-from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional
+from typing import Any, Dict, Mapping, Optional

from retrying import Attempt, RetryError

@@ -11,22 +11,12 @@
from helm.common.hierarchical_logger import hlog
from helm.common.object_spec import create_object, inject_object_spec_args
from helm.common.request import Request, RequestResult
-from helm.common.tokenization_request import (
-DecodeRequest,
-DecodeRequestResult,
-TokenizationRequest,
-TokenizationRequestResult,
-)
from helm.proxy.clients.client import Client
from helm.proxy.critique.critique_client import CritiqueClient
+from helm.proxy.clients.huggingface_client import HuggingFaceClient
from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
from helm.proxy.retry import NonRetriableException, retry_request
from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer
-from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer


-if TYPE_CHECKING:
-import helm.proxy.clients.huggingface_client


class AuthenticationError(NonRetriableException):
@@ -43,7 +33,7 @@ def __init__(self, credentials: Mapping[str, Any], cache_path: str, mongo_uri: s
self.mongo_uri = mongo_uri
self.clients: Dict[str, Client] = {}
# self._huggingface_client is lazily instantiated by get_huggingface_client()
-self._huggingface_client: Optional["helm.proxy.clients.huggingface_client.HuggingFaceClient"] = None
+self._huggingface_client: Optional[HuggingFaceClient] = None
# self._critique_client is lazily instantiated by get_critique_client()
self._critique_client: Optional[CritiqueClient] = None
hlog(f"AutoClient: cache_path = {cache_path}")
@@ -125,16 +115,6 @@ def make_request_with_retry(client: Client, request: Request) -> RequestResult:
# Notify our user that we failed to make the request even after retrying.
return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")

-# TODO: remove this method after a few weeks (2023-11-09)
-def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
-raise NotImplementedError(
-"AutoClient.tokenize() is not supported anymore." "Use AutoTokenizer.tokenize() instead."
-)
-
-# TODO: remove this method after a few weeks (2023-11-09)
-def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-raise NotImplementedError("AutoClient.decode() is not supported anymore." "Use AutoTokenizer.decode() instead.")

def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
"""Get the toxicity classifier client. We currently only support Perspective API."""
from helm.proxy.clients.perspective_api_client import PerspectiveAPIClient
@@ -195,14 +175,11 @@ def get_critique_client(self) -> CritiqueClient:
)
return self._critique_client

-def get_huggingface_client(self) -> "helm.proxy.clients.huggingface_client.HuggingFaceClient":
+def get_huggingface_client(self) -> HuggingFaceClient:
"""Get the Hugging Face client."""
-from helm.proxy.clients.huggingface_client import HuggingFaceClient

if self._huggingface_client:
-assert isinstance(self._huggingface_client, HuggingFaceClient)
return self._huggingface_client
cache_config = build_cache_config(self.cache_path, self.mongo_uri, "huggingface")
-tokenizer = HuggingFaceTokenizer(cache_config)
-self._huggingface_client = HuggingFaceClient(tokenizer=tokenizer, cache_config=cache_config)
+self._huggingface_client = HuggingFaceClient(cache_config=cache_config)
return self._huggingface_client
48 changes: 1 addition & 47 deletions src/helm/proxy/clients/client.py
@@ -5,45 +5,10 @@
from helm.common.hierarchical_logger import hlog
from helm.common.media_object import MultimediaObject, TEXT_TYPE
from helm.common.request import Request, RequestResult, Sequence, Token
-from helm.common.tokenization_request import (
-TokenizationRequest,
-TokenizationRequestResult,
-DecodeRequest,
-DecodeRequestResult,
-)
from helm.common.cache import Cache, CacheConfig
-from helm.proxy.tokenizers.tokenizer import Tokenizer


class Client(ABC):
-# TODO: This method should be removed.
-# This only kept for the AutoClient. Eventually, we should introduce an
-# AutoTokenizer or TokenizerFactory class.
-@abstractmethod
-def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
-"""Tokenizes `request.text` using `request.tokenizer`.
-This simply calls the `tokenize` method of the tokenizer.
-Some exceptions can be made (but should be avoided).
-This is the case for the auto client, which needs to handle
-tokenization for multiple tokenizers.
-"""
-pass
-
-# TODO: This method should be removed.
-# This only kept for the AutoClient. Eventually, we should introduce an
-# AutoTokenizer or TokenizerFactory class.
-@abstractmethod
-def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-"""Decodes `request.tokens` using `request.tokenizer`.
-This simply calls the `decode` method of the tokenizer.
-Some exceptions can be made (but should be avoided).
-This is the case for the auto client, which needs to handle
-tokenization for multiple tokenizers.
-"""
-pass

@abstractmethod
def make_request(self, request: Request) -> RequestResult:
"""Makes a request to the model.
@@ -54,7 +19,7 @@ def make_request(self, request: Request) -> RequestResult:


class CachingClient(Client):
-def __init__(self, cache_config: CacheConfig, tokenizer: Tokenizer) -> None:
+def __init__(self, cache_config: CacheConfig) -> None:
"""Initializes the client.
For most clients, both the cache config and tokenizer are required.
@@ -63,7 +28,6 @@ def __init__(self, cache_config: CacheConfig, tokenizer: Tokenizer) -> None:
the request is made.
"""
self.cache = Cache(cache_config) if cache_config is not None else None
-self.tokenizer = tokenizer

@staticmethod
def make_cache_key(raw_request: Dict, request: Request) -> Dict:
@@ -78,16 +42,6 @@ def make_cache_key(raw_request: Dict, request: Request) -> Dict:
cache_key = raw_request
return cache_key

-def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
-# Deprecated - use `self.tokenizer.tokenize` instead. Warn the user.
-hlog("WARNING: CachingClient.tokenize is deprecated, use self.tokenizer.tokenize instead")
-return self.tokenizer.tokenize(request)
-
-def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-# Deprecated - use `self.tokenizer.decode` instead. Warn the user.
-hlog("WARNING: CachingClient.decode is deprecated, use self.tokenizer.decode instead")
-return self.tokenizer.decode(request)


def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool = True) -> Sequence:
"""
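
The `client.py` and `auto_client.py` hunks above complete the move of tokenization out of the request clients: `Client.tokenize`/`Client.decode` are deleted and `CachingClient` no longer owns a tokenizer. A minimal sketch of tokenization after this split, assuming the names visible in this diff (the `BlackHoleCacheConfig` import and the result's `tokens` field are assumptions, not verified against this revision):

```python
# Hedged sketch: tokenize through a Tokenizer object rather than a Client.
# HuggingFaceTokenizer and TokenizationRequest appear in this diff; the
# cache-config class and result fields below are assumed.
from helm.common.cache import BlackHoleCacheConfig
from helm.common.tokenization_request import TokenizationRequest
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer = HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig())
result = tokenizer.tokenize(
    TokenizationRequest(tokenizer="huggingface/gpt2", text="Hello world")
)
print(result.tokens)  # TokenizationRequestResult.tokens (assumed field name)
```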
5 changes: 2 additions & 3 deletions src/helm/proxy/clients/cohere_client.py
@@ -12,7 +12,6 @@
Token,
)
from helm.benchmark.model_deployment_registry import get_model_deployments_by_host_organization
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence
from .cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION

@@ -21,8 +20,8 @@ class CohereClient(CachingClient):
ORGANIZATION: str = "cohere"
GENERATE_ENDPOINT: str = "generate"

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig):
+super().__init__(cache_config=cache_config)
self.api_key: str = api_key

def make_request(self, request: Request) -> RequestResult:
5 changes: 2 additions & 3 deletions src/helm/proxy/clients/google_client.py
@@ -2,7 +2,6 @@

from helm.common.cache import CacheConfig
from helm.common.request import Request, RequestResult, Sequence, Token
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence


@@ -28,8 +27,8 @@ def convert_to_raw_request(request: Request) -> Dict:
"top_p": request.top_p,
}

-def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, cache_config: CacheConfig):
+super().__init__(cache_config=cache_config)

def make_request(self, request: Request) -> RequestResult:
raw_request = GoogleClient.convert_to_raw_request(request)
5 changes: 2 additions & 3 deletions src/helm/proxy/clients/goose_ai_client.py
@@ -11,7 +11,6 @@
Sequence,
Token,
)
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence
from .openai_client import ORIGINAL_COMPLETION_ATTRIBUTES

@@ -23,8 +22,8 @@ class GooseAIClient(CachingClient):
- Supported models: https://goose.ai/docs/models
"""

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig, org_id: Optional[str] = None):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig, org_id: Optional[str] = None):
+super().__init__(cache_config=cache_config)
self.org_id: Optional[str] = org_id
self.api_key: str = api_key
self.api_base: str = "https://api.goose.ai/v1"
(The remaining changed files are not shown.)
