Merge remote-tracking branch 'upstream/main' into DecodingTrust
danielz02 committed Nov 29, 2023
2 parents 71979f3 + c545c49 commit c4ead28
Showing 33 changed files with 150 additions and 197 deletions.
22 changes: 11 additions & 11 deletions docs/get_helm_rank.md
@@ -21,18 +21,18 @@ now that the files are in your results directory, all HELM models will be shown

## Run Efficient-HELM

-According to [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf) a paper from IBM, which systematically analysed benchmark design choices using the HELM benchmark as an example, one can run the HELM benchmark with a fraction of the examples and still get a reliable estimation of a full run (Perlitz et al., 2023).
+According to [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf) a paper from IBM Research, which systematically analysed benchmark design choices using the HELM benchmark as an example, one can run the HELM benchmark with a fraction of the examples and still get a reliable estimation of a full run (Perlitz et al., 2023).

-Specifically, the authors calculated the CI $95\%$ of Rank Location from the real ranks as a function of the number of examples used per scenario and came up with the following tradeoffs[^1]:
+Specifically, the authors calculated the CI 95% of Rank Location from the real ranks as a function of the number of examples used per scenario and came up with the following tradeoffs[^1]:

-| Examples Per Scenario | CI $95\%$ of Rank Location | Compute saved |
-| :-------------------: | :------------------------: | :-----------: |
-| $10$ | $\pm5$ | $\times400$ |
-| $20$ | $\pm4$ | $\times200$ |
-| $50$ | $\pm3$ | $\times80$ |
-| $200$ | $\pm2$ | $\times20$ |
-| $1000$ | $\pm1$ | $\times4$ |
-| All | $\pm1$ | $\times1$ |
+| Examples Per Scenario | CI 95% of Rank Location | Compute saved |
+| :-------------------: | :---------------------: | :-----------: |
+| 10 | ±5 | X400 |
+| 20 | ±4 | X200 |
+| 50 | ±3 | X80 |
+| 200 | ±2 | X20 |
+| 1000 | ±1 | X4 |
+| All | ±1 | X1 |


Choose your point on the tradeoff: how accurate do you need your rank to be, and how long are you willing to wait? Once you have chosen, download the config and define your model (see the run sketch after this file's diff).
@@ -81,4 +81,4 @@ helm-server

```Perlitz, Y., Bandel, E., Gera, A., Arviv, O., Ein-Dor, L., Shnarch, E., Slonim, N., Shmueli-Scheuer, M. and Choshen, L., 2023. Efficient Benchmarking (of Language Models). arXiv preprint arXiv:2308.11696.```

-[^1]: Note that the quantities below are the CI $95\%$ of the rank location and are thus very conservative estimates. In our experiments, we did not experience deviations above $\pm2$ for any of the options above.
+[^1]: Note that the quantities below are the CI 95% of the rank location and are thus very conservative estimates. In our experiments, we did not experience deviations above ±2 for any of the options above.
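
As a rough sketch of what such a reduced run could look like (hedged: `--conf-paths`, `--suite`, and `--max-eval-instances` are standard `helm-run` flags, but the config and suite names below are placeholders, not part of this commit):

```bash
# Hypothetical Efficient-HELM run: cap evaluation at 50 examples per
# scenario (the ±3 / X80 row in the table above). "run_specs.conf"
# stands in for the config downloaded per the instructions above.
helm-run \
  --conf-paths run_specs.conf \
  --suite efficient-helm \
  --max-eval-instances 50

# Summarize the results and browse them locally.
helm-summarize --suite efficient-helm
helm-server
```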
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -35,6 +35,7 @@ nav:
- 'User Guide':
- 'installation.md'
- 'quick_start.md'
+- 'get_helm_rank.md'
- 'tutorial.md'
- 'benchmark.md'
- 'huggingface_models.md'
4 changes: 2 additions & 2 deletions src/helm-frontend/public/config.js
@@ -1,4 +1,4 @@
window.BENCHMARK_OUTPUT_BASE_URL =
"https://storage.googleapis.com/crfm-helm-public/";
window.SUITE = "v0.2.4";
window.RELEASE = "v0.3.0";
window.SUITE = null;
window.RELEASE = "v0.4.0";
2 changes: 1 addition & 1 deletion src/helm-frontend/src/components/Footer.tsx
@@ -5,7 +5,7 @@ export default function Footer() {
const version = getBenchmarkRelease();
return (
<div className="bottom-0 right-0 p-4 bg-white-800 text-black text-right">
-<p>Version {version}</p>
+<p>Release: {version}</p>
</div>
);
}
15 changes: 12 additions & 3 deletions src/helm-frontend/src/components/GroupsTables.tsx
@@ -8,17 +8,19 @@ interface Props {
activeGroup: number;
ignoreHref?: boolean;
sortable?: boolean;
+sortFirstMetric?: boolean;
}

export default function GroupsTables({
groupsTables,
activeGroup,
ignoreHref = false,
sortable = true,
+sortFirstMetric = true,
}: Props) {
-const [activeSortColumn, setActiveSortColumn] = useState<
-number | undefined
->();
+const [activeSortColumn, setActiveSortColumn] = useState<number | undefined>(
+sortFirstMetric ? 1 : undefined,
+);
const [activeGroupsTable, setActiveGroupsTable] = useState<GroupsTable>({
...groupsTables[activeGroup],
});
@@ -66,6 +68,13 @@ export default function GroupsTables({
});
};

+useEffect(() => {
+if (sortFirstMetric && activeSortColumn) {
+handleSort(activeSortColumn);
+}
+// eslint-disable-next-line react-hooks/exhaustive-deps
+}, [sortFirstMetric, activeSortColumn]);

return (
<div className="overflow-x-auto">
<table className="table">
7 changes: 4 additions & 3 deletions src/helm-frontend/src/components/NavDropdown.tsx
@@ -62,13 +62,14 @@ function NavDropdown() {
className="block px-4 py-2 text-md text-gray-700 hover:bg-gray-100 hover:text-gray-900"
role="menuitem"
>
-<Link to="https://crfm.stanford.edu/heim/latest/?">
+<a href="https://crfm.stanford.edu/heim/latest/?">
<div className="flex items-center">
<span>
-<strong>HEIM: </strong>Holistic evaluation of image models
+<strong>HEIM: </strong>Holistic evaluation of text-to-image
+models
</span>
</div>
-</Link>
+</a>
</div>
</div>
</div>
1 change: 1 addition & 0 deletions src/helm/benchmark/metrics/summarization_metrics.py
@@ -5,6 +5,7 @@
import pickle

import spacy
+import spacy.cli
from typing import List, Dict, Optional
from collections import defaultdict

5 changes: 2 additions & 3 deletions src/helm/proxy/clients/ai21_client.py
@@ -10,7 +10,6 @@
Sequence,
Token,
)
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence, cleanup_str
from .ai21_utils import AI21RequestError, handle_failed_request

@@ -24,8 +23,8 @@ class AI21Client(CachingClient):
COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/{model}/complete"
EXPERIMENTAL_COMPLETION_URL_TEMPLATE: str = "https://api.ai21.com/studio/v1/experimental/{model}/complete"

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig, url: Optional[str] = None):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig, url: Optional[str] = None):
+super().__init__(cache_config=cache_config)
self.api_key = api_key
self.url = url

5 changes: 2 additions & 3 deletions src/helm/proxy/clients/aleph_alpha_client.py
@@ -4,15 +4,14 @@

from helm.common.cache import CacheConfig
from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence


class AlephAlphaClient(CachingClient):
COMPLETION_ENDPOINT: str = "complete"

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig):
+super().__init__(cache_config=cache_config)
self.api_key: str = api_key

def _send_request(self, endpoint: str, raw_request: Dict[str, Any]) -> Dict[str, Any]:
5 changes: 3 additions & 2 deletions src/helm/proxy/clients/anthropic_client.py
@@ -57,7 +57,8 @@ class AnthropicClient(CachingClient):
PROMPT_ANSWER_START: str = "The answer is "

def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig, api_key: Optional[str] = None):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+super().__init__(cache_config=cache_config)
+self.tokenizer = tokenizer
self.api_key: Optional[str] = api_key
self._client = anthropic.Client(api_key) if api_key else None

@@ -241,7 +242,7 @@ def is_valid_logprobs_response(raw_response: str) -> bool:

def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
hlog("This client is deprecated. Please use AnthropicClient instead.")
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+super().__init__(cache_config=cache_config)
self.api_key = api_key

def make_request(self, request: Request) -> RequestResult:
33 changes: 5 additions & 28 deletions src/helm/proxy/clients/auto_client.py
@@ -1,6 +1,6 @@
import os
from dataclasses import replace
-from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional
+from typing import Any, Dict, Mapping, Optional

from retrying import Attempt, RetryError

@@ -11,22 +11,12 @@
from helm.common.hierarchical_logger import hlog
from helm.common.object_spec import create_object, inject_object_spec_args
from helm.common.request import Request, RequestResult
-from helm.common.tokenization_request import (
-DecodeRequest,
-DecodeRequestResult,
-TokenizationRequest,
-TokenizationRequestResult,
-)
from helm.proxy.clients.client import Client
from helm.proxy.critique.critique_client import CritiqueClient
+from helm.proxy.clients.huggingface_client import HuggingFaceClient
from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
from helm.proxy.retry import NonRetriableException, retry_request
from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer
-from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer


-if TYPE_CHECKING:
-import helm.proxy.clients.huggingface_client


class AuthenticationError(NonRetriableException):
@@ -43,7 +33,7 @@ def __init__(self, credentials: Mapping[str, Any], cache_path: str, mongo_uri: s
self.mongo_uri = mongo_uri
self.clients: Dict[str, Client] = {}
# self._huggingface_client is lazily instantiated by get_huggingface_client()
-self._huggingface_client: Optional["helm.proxy.clients.huggingface_client.HuggingFaceClient"] = None
+self._huggingface_client: Optional[HuggingFaceClient] = None
# self._critique_client is lazily instantiated by get_critique_client()
self._critique_client: Optional[CritiqueClient] = None
hlog(f"AutoClient: cache_path = {cache_path}")
@@ -125,16 +115,6 @@ def make_request_with_retry(client: Client, request: Request) -> RequestResult:
# Notify our user that we failed to make the request even after retrying.
return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")

-# TODO: remove this method after a few weeks (2023-11-09)
-def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
-raise NotImplementedError(
-"AutoClient.tokenize() is not supported anymore." "Use AutoTokenizer.tokenize() instead."
-)
-
-# TODO: remove this method after a few weeks (2023-11-09)
-def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-raise NotImplementedError("AutoClient.decode() is not supported anymore." "Use AutoTokenizer.decode() instead.")

def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
"""Get the toxicity classifier client. We currently only support Perspective API."""
from helm.proxy.clients.perspective_api_client import PerspectiveAPIClient
@@ -195,14 +175,11 @@ def get_critique_client(self) -> CritiqueClient:
)
return self._critique_client

-def get_huggingface_client(self) -> "helm.proxy.clients.huggingface_client.HuggingFaceClient":
+def get_huggingface_client(self) -> HuggingFaceClient:
"""Get the Hugging Face client."""
-from helm.proxy.clients.huggingface_client import HuggingFaceClient

if self._huggingface_client:
-assert isinstance(self._huggingface_client, HuggingFaceClient)
return self._huggingface_client
cache_config = build_cache_config(self.cache_path, self.mongo_uri, "huggingface")
-tokenizer = HuggingFaceTokenizer(cache_config)
-self._huggingface_client = HuggingFaceClient(tokenizer=tokenizer, cache_config=cache_config)
+self._huggingface_client = HuggingFaceClient(cache_config=cache_config)
return self._huggingface_client
48 changes: 1 addition & 47 deletions src/helm/proxy/clients/client.py
@@ -5,45 +5,10 @@
from helm.common.hierarchical_logger import hlog
from helm.common.media_object import MultimediaObject, TEXT_TYPE
from helm.common.request import Request, RequestResult, Sequence, Token
-from helm.common.tokenization_request import (
-TokenizationRequest,
-TokenizationRequestResult,
-DecodeRequest,
-DecodeRequestResult,
-)
from helm.common.cache import Cache, CacheConfig
-from helm.proxy.tokenizers.tokenizer import Tokenizer


class Client(ABC):
-# TODO: This method should be removed.
-# This only kept for the AutoClient. Eventually, we should introduce an
-# AutoTokenizer or TokenizerFactory class.
-@abstractmethod
-def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
-"""Tokenizes `request.text` using `request.tokenizer`.
-This simply calls the `tokenize` method of the tokenizer.
-Some exceptions can be made (but should be avoided).
-This is the case for the auto client, which needs to handle
-tokenization for multiple tokenizers.
-"""
-pass
-
-# TODO: This method should be removed.
-# This only kept for the AutoClient. Eventually, we should introduce an
-# AutoTokenizer or TokenizerFactory class.
-@abstractmethod
-def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-"""Decodes `request.tokens` using `request.tokenizer`.
-This simply calls the `decode` method of the tokenizer.
-Some exceptions can be made (but should be avoided).
-This is the case for the auto client, which needs to handle
-tokenization for multiple tokenizers.
-"""
-pass

@abstractmethod
def make_request(self, request: Request) -> RequestResult:
"""Makes a request to the model.
@@ -54,7 +19,7 @@ def make_request(self, request: Request) -> RequestResult:


class CachingClient(Client):
-def __init__(self, cache_config: CacheConfig, tokenizer: Tokenizer) -> None:
+def __init__(self, cache_config: CacheConfig) -> None:
"""Initializes the client.
For most clients, both the cache config and tokenizer are required.
@@ -63,7 +28,6 @@ def __init__(self, cache_config: CacheConfig, tokenizer: Tokenizer) -> None:
the request is made.
"""
self.cache = Cache(cache_config) if cache_config is not None else None
-self.tokenizer = tokenizer

@staticmethod
def make_cache_key(raw_request: Dict, request: Request) -> Dict:
@@ -78,16 +42,6 @@ def make_cache_key(raw_request: Dict, request: Request) -> Dict:
cache_key = raw_request
return cache_key

-def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
-# Deprecated - use `self.tokenizer.tokenize` instead. Warn the user.
-hlog("WARNING: CachingClient.tokenize is deprecated, use self.tokenizer.tokenize instead")
-return self.tokenizer.tokenize(request)
-
-def decode(self, request: DecodeRequest) -> DecodeRequestResult:
-# Deprecated - use `self.tokenizer.decode` instead. Warn the user.
-hlog("WARNING: CachingClient.decode is deprecated, use self.tokenizer.decode instead")
-return self.tokenizer.decode(request)


def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool = True) -> Sequence:
"""
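
The `client.py` and `auto_client.py` hunks above complete the move of tokenization out of the request clients: `Client.tokenize`/`Client.decode` are deleted and `CachingClient` no longer owns a tokenizer. A minimal sketch of tokenization after this split, assuming the names visible in this diff (the `BlackHoleCacheConfig` import and the result's `tokens` field are assumptions, not verified against this revision):

```python
# Hedged sketch: tokenize through a Tokenizer object rather than a Client.
# HuggingFaceTokenizer and TokenizationRequest appear in this diff; the
# cache-config class and result fields below are assumed.
from helm.common.cache import BlackHoleCacheConfig
from helm.common.tokenization_request import TokenizationRequest
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer = HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig())
result = tokenizer.tokenize(
    TokenizationRequest(tokenizer="huggingface/gpt2", text="Hello world")
)
print(result.tokens)  # TokenizationRequestResult.tokens (assumed field name)
```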
5 changes: 2 additions & 3 deletions src/helm/proxy/clients/cohere_client.py
@@ -12,7 +12,6 @@
Token,
)
from helm.benchmark.model_deployment_registry import get_model_deployments_by_host_organization
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence
from .cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION

@@ -21,8 +20,8 @@ class CohereClient(CachingClient):
ORGANIZATION: str = "cohere"
GENERATE_ENDPOINT: str = "generate"

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig):
+super().__init__(cache_config=cache_config)
self.api_key: str = api_key

def make_request(self, request: Request) -> RequestResult:
5 changes: 2 additions & 3 deletions src/helm/proxy/clients/google_client.py
@@ -2,7 +2,6 @@

from helm.common.cache import CacheConfig
from helm.common.request import Request, RequestResult, Sequence, Token
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence


@@ -28,8 +27,8 @@ def convert_to_raw_request(request: Request) -> Dict:
"top_p": request.top_p,
}

-def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, cache_config: CacheConfig):
+super().__init__(cache_config=cache_config)

def make_request(self, request: Request) -> RequestResult:
raw_request = GoogleClient.convert_to_raw_request(request)
5 changes: 2 additions & 3 deletions src/helm/proxy/clients/goose_ai_client.py
@@ -11,7 +11,6 @@
Sequence,
Token,
)
-from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence
from .openai_client import ORIGINAL_COMPLETION_ATTRIBUTES

@@ -23,8 +22,8 @@ class GooseAIClient(CachingClient):
- Supported models: https://goose.ai/docs/models
"""

-def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig, org_id: Optional[str] = None):
-super().__init__(cache_config=cache_config, tokenizer=tokenizer)
+def __init__(self, api_key: str, cache_config: CacheConfig, org_id: Optional[str] = None):
+super().__init__(cache_config=cache_config)
self.org_id: Optional[str] = org_id
self.api_key: str = api_key
self.api_base: str = "https://api.goose.ai/v1"
(The remaining changed files are not shown.)
