diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 684eaafd5d..06ed1d8581 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -29,10 +29,12 @@ jobs:
           key: pip-${{ hashFiles('requirements.txt') }}-${{ matrix.python-version }}
           restore-keys: |
             pip-
-      - run: pip install -e .
-      - run: helm-run -h
-      - run: helm-summarize -h
-      - run: echo "Finished installation."
+      - run: python3 -m pip install --upgrade build
+      - run: python3 -m build
+      - run: python3 -m pip install dist/crfm_helm-*.whl
+      - run: helm-run --run-specs simple1:model=simple/model1 --max-eval-instances 10 --suite test
+      - run: helm-summarize --suite test
+      - run: helm-server --help
 
   test:
     name: Tests
diff --git a/.gitignore b/.gitignore
index 3f3d0b4f16..cb3d5237f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,5 @@ notes.otl
 
 # Miscellaneous
 .nfs*
+
+node_modules
diff --git a/MANIFEST.in b/MANIFEST.in
index 72af21c530..ba863069aa 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 recursive-include src/helm/proxy/clients/ *.sp
 recursive-include src/helm/benchmark/ *.json
 recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
+recursive-include src/helm/config/ *.yaml
diff --git a/docs/mkdocs_macros.py b/docs/mkdocs_macros.py
index ba78537f86..914113eb22 100644
--- a/docs/mkdocs_macros.py
+++ b/docs/mkdocs_macros.py
@@ -2,7 +2,7 @@
 from dataclasses import dataclass, field
 from typing import Optional, List
 
-from helm.benchmark.presentation.schema import read_schema, ModelField
+from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME, ModelField
 from helm.benchmark.run_expander import RUN_EXPANDERS
 from helm.proxy.models import ALL_MODELS, Model
 
@@ -27,7 +27,8 @@ def from_model_field_and_model_object(model_field: ModelField, model_object: Opt
 def define_env(env):
     @env.macro
     def models_by_organization():
-        schema = read_schema()
+        # TODO: make this customizable
+        schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
         result = defaultdict(list)
 
         # Create dict name -> madel_object (ALL_MODELS)
diff --git a/docs/tutorial.md b/docs/tutorial.md
index cc103e11f6..14ac4bdd37 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -63,7 +63,7 @@ This reads the pre-existing files in `benchmark_output/runs/v1/` that were writt
 - `groups.json` contains a serialized list of `Table`, each containing information about groups in a group category.
 - `groups_metadata.json` contains a list of all the groups along with a human-readable description and a taxonomy.
 
-Additionally, for each group and group-relavent metric, it will output a pair of files: `benchmark_output/runs/v1/groups/latex/<group_name>_<metric_name>.tex` and `benchmark_output/runs/v1/groups/latex/<group_name>_<metric_name>.json`. These files contain the statistics for that metric from each run within the group.
+Additionally, for each group and group-relevant metric, it will output a pair of files: `benchmark_output/runs/v1/groups/latex/<group_name>_<metric_name>.tex` and `benchmark_output/runs/v1/groups/json/<group_name>_<metric_name>.json`. These files contain the statistics for that metric from each run within the group.
 
 <!--
 # TODO(#1441): Enable plots
diff --git a/requirements.txt b/requirements.txt
index 85be7d0491..e5dcc3fde9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -50,6 +50,7 @@ google-api-core==2.10.1
 google-api-python-client==2.64.0
 google-auth==2.12.0
 google-auth-httplib2==0.1.0
+google-cloud-aiplatform==1.36.4
 googleapis-common-protos==1.56.4
 greenlet==1.1.3
 gunicorn==20.1.0
@@ -67,6 +68,7 @@ jmespath==1.0.1
 joblib==1.2.0
 kiwisolver==1.4.4
 langcodes==3.3.0
+langdetect==1.0.9
 llvmlite==0.39.1
 lxml==4.9.1
 Mako==1.2.3
diff --git a/scripts/cache/copy_cache.py b/scripts/cache/copy_cache.py
index 5eb045ee0f..388527e60b 100644
--- a/scripts/cache/copy_cache.py
+++ b/scripts/cache/copy_cache.py
@@ -14,7 +14,7 @@
 import os
 
 from sqlitedict import SqliteDict
-from helm.common.cache import _MongoKeyValueStore
+from helm.common.mongo_key_value_store import MongoKeyValueStore
 from helm.common.hierarchical_logger import hlog, htrack
 from typing import Optional
 
@@ -60,7 +60,7 @@ def copy_cache(
     hlog(f"Opening Sqlite cache {cache_path}")
     with SqliteDict(cache_path) as source_cache:
         hlog(f"Copying to MongoDB {mongo_host}")
-        with _MongoKeyValueStore(mongo_host, collection_name=organization) as target_cache:
+        with MongoKeyValueStore(mongo_host, collection_name=organization) as target_cache:
             for key, value in source_cache.items():
                 if not dry_run and (not range_start or num_items >= range_start):
                     try:
diff --git a/scripts/cache/fix_anthropic_cache.py b/scripts/cache/fix_anthropic_cache.py
index 79c7546d4f..565e41cf14 100644
--- a/scripts/cache/fix_anthropic_cache.py
+++ b/scripts/cache/fix_anthropic_cache.py
@@ -8,7 +8,6 @@
 from helm.common.hierarchical_logger import hlog, htrack
 from helm.proxy.clients.anthropic_client import AnthropicLegacyClient
 from helm.proxy.retry import get_retry_decorator
-from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 
 
 """
@@ -48,9 +47,7 @@ def add_logprobs(mongo_uri: str, credentials_path: str, dry_run: bool):
         api_key: str = credentials["anthropicApiKey"]
 
     cache_config = MongoCacheConfig(mongo_uri, collection_name="anthropic")
-    client = AnthropicLegacyClient(
-        api_key=api_key, tokenizer=HuggingFaceTokenizer(cache_config), cache_config=cache_config
-    )
+    client = AnthropicLegacyClient(api_key=api_key, cache_config=cache_config)
 
     with create_key_value_store(cache_config) as cache:
         for i, (request, response) in enumerate(cache.get_all()):
diff --git a/scripts/compute_request_limits.py b/scripts/compute_request_limits.py
index 55cd813d70..f081ba0a36 100644
--- a/scripts/compute_request_limits.py
+++ b/scripts/compute_request_limits.py
@@ -124,7 +124,8 @@ def figure_out_max_prompt_length(
 
 
 def figure_out_max_prompt_length_plus_tokens(
-    client: Any,  # Client,
+    client: AutoClient,
+    auto_tokenizer: AutoTokenizer,
     model_deployment_name: str,
     model_name: str,
     tokenizer_name: str,
@@ -132,7 +133,7 @@ def figure_out_max_prompt_length_plus_tokens(
     prefix: str = "",
     suffix: str = "",
 ) -> int:
-    tokenizer = client._get_tokenizer(tokenizer_name)
+    tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name)
     lower_bound = 1
     upper_bound = 2 * max_prompt_length + 1
 
@@ -387,6 +388,7 @@ def main():
     print("========== Figure out max_prompt_length_plus_tokens ==========")
     max_prompt_length_plus_tokens: int = figure_out_max_prompt_length_plus_tokens(
         client,
+        auto_tokenizer,
         args.model_deployment_name,
         args.model_name,
         args.tokenizer_name,
diff --git a/scripts/offline_eval/export_requests.py b/scripts/offline_eval/export_requests.py
index 7e74548530..cca192bbcc 100644
--- a/scripts/offline_eval/export_requests.py
+++ b/scripts/offline_eval/export_requests.py
@@ -12,8 +12,8 @@
     MongoCacheConfig,
     SqliteCacheConfig,
     create_key_value_store,
-    request_to_key,
 )
+from helm.common.key_value_store import request_to_key
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.proxy.clients.google_client import GoogleClient
 from helm.proxy.clients.together_client import TogetherClient
diff --git a/scripts/offline_eval/import_results.py b/scripts/offline_eval/import_results.py
index 4d7aebb7bd..7eab0cfab4 100644
--- a/scripts/offline_eval/import_results.py
+++ b/scripts/offline_eval/import_results.py
@@ -9,8 +9,8 @@
     MongoCacheConfig,
     SqliteCacheConfig,
     create_key_value_store,
-    request_to_key,
 )
+from helm.common.key_value_store import request_to_key
 from helm.common.hierarchical_logger import hlog, htrack
 from .export_requests import SUPPORTED_ORGS
 
diff --git a/setup.cfg b/setup.cfg
index 764a864116..4818e61bd7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,6 @@ install_requires=
     # Keep sqlitedict version at 1.7.0.
     sqlitedict~=1.7.0
     bottle~=0.12.23
-    # TODO: Remove these from common
-    pymongo~=4.2.0
 
     # Basic Scenarios
     datasets~=2.5.2
@@ -103,6 +101,9 @@ images =
     accelerate~=0.23.0  # For the newer versions of Transformers
     pillow~=9.4.0
 
+mongo =
+    pymongo~=4.2.0
+
 # Model extras
 aleph-alpha =
     aleph-alpha-client~=2.14.0
@@ -116,6 +117,9 @@ openai =
     openai~=0.27.8
     tiktoken~=0.3.3
 
+google =
+    google-cloud-aiplatform~=1.36.4
+
 tsinghua =
     icetk~=0.0.4
 
@@ -125,6 +129,7 @@ yandex =
 models =
     crfm-helm[aleph-alpha]
     crfm-helm[anthropic]
+    crfm-helm[google]
     crfm-helm[openai]
     crfm-helm[tsinghua]
     crfm-helm[yandex]
@@ -141,6 +146,7 @@ all =
     crfm-helm[cleva]
     crfm-helm[images]
     crfm-helm[models]
+    crfm-helm[mongo]
 
 # Development only
 # Do not include in all
diff --git a/src/helm-frontend/README.md b/src/helm-frontend/README.md
index d1cc19541a..3a859ec35a 100644
--- a/src/helm-frontend/README.md
+++ b/src/helm-frontend/README.md
@@ -7,7 +7,7 @@ This app makes use of [React](https://react.dev/) + [TypeScript](https://www.typ
 ### Installation
 
 ```bash
-npm Install
+yarn install
 ```
 
 ### Develop
@@ -15,7 +15,7 @@ npm Install
 This will open a development server
 
 ```bash
-npm run dev
+yarn dev
 ```
 
 You will also want to start `helm-server` locally as well. In the `src/helm` directory run the following
@@ -27,13 +27,13 @@ helm-server
 ### Testing
 
 ```
-npm run test
+yarn test
 ```
 
 ### Build
 
 ```bash
-npm run build
+yarn build
 ```
 
 ### Deployment
@@ -45,7 +45,7 @@ You can rename the build directory to the desired release name and upload it to
 ### Linting
 
 ```bash
-npm run lint
+yarn lint
 ```
 
 ### Formatting
@@ -53,25 +53,5 @@ npm run lint
 If you don't have prettier configured in your IDE or Node environment, you will have to run the following before commiting, in order to pass tests.
 
 ```bash
-npm run format
-```
-
-### Environment Variables
-
-Requires the following environment variables for development and deployment. In development these can be placed in a `.env.local` file with the following:
-
-```
-# The default location of local `helm-server`
-VITE_HELM_BENCHMARKS_ENDPOINT="http://localhost:8000/"
-# The suites available based on local runs
-VITE_HELM_BENCHMARKS_SUITE="v1"
-```
-
-This can instead be pointed to the public HELM data to avoid needing to run `helm-server` locally.
-
-```
-# Example
-VITE_HELM_BENCHMARKS_ENDPOINT="https://storage.googleapis.com/crfm-helm-public/"
-# Change to current version
-VITE_HELM_BENCHMARKS_SUITE="v0.2.3"
+yarn format
 ```
diff --git a/src/helm-frontend/public/config.js b/src/helm-frontend/public/config.js
index 13e9f15fe9..34b37a2ccc 100644
--- a/src/helm-frontend/public/config.js
+++ b/src/helm-frontend/public/config.js
@@ -1,4 +1,4 @@
 window.BENCHMARK_OUTPUT_BASE_URL =
-	"https://storage.googleapis.com/crfm-helm-public/";
+  "https://storage.googleapis.com/crfm-helm-public/lite/";
 window.SUITE = null;
-window.RELEASE = "v0.4.0";
+window.RELEASE = "v1.0.0";
diff --git a/src/helm-frontend/src/App.tsx b/src/helm-frontend/src/App.tsx
index 3808668c15..1e8d9e5d56 100644
--- a/src/helm-frontend/src/App.tsx
+++ b/src/helm-frontend/src/App.tsx
@@ -7,8 +7,8 @@ import Groups from "@/routes/Groups";
 import Group from "@/routes/Group";
 import Runs from "@/routes/Runs";
 import Run from "@/routes/Run";
-import Landing from "@/routes/Landing";
 import Leaderboard from "@/routes/Leaderboard";
+import Landing from "@/routes/Landing";
 
 export default function App() {
   return (
@@ -16,8 +16,8 @@ export default function App() {
       <Routes>
         <Route path={`/`} element={<Layout />}>
           <Route index element={<Landing />} />
-          <Route path="models" element={<Models />} />
           <Route path="leaderboard" element={<Leaderboard />} />
+          <Route path="models" element={<Models />} />
           <Route path="scenarios" element={<Scenarios />} />
           <Route path="groups" element={<Groups />} />
           <Route path="groups/:groupName" element={<Group />} />
diff --git a/src/helm-frontend/src/assets/logos/together.png b/src/helm-frontend/src/assets/logos/together.png
index 76ffc350f9..28644a53b3 100644
Binary files a/src/helm-frontend/src/assets/logos/together.png and b/src/helm-frontend/src/assets/logos/together.png differ
diff --git a/src/helm-frontend/src/components/Footer.tsx b/src/helm-frontend/src/components/Footer.tsx
index 0a678de8fc..7fd2d14425 100644
--- a/src/helm-frontend/src/components/Footer.tsx
+++ b/src/helm-frontend/src/components/Footer.tsx
@@ -5,7 +5,7 @@ export default function Footer() {
   const version = getBenchmarkRelease();
   return (
     <div className="bottom-0 right-0 p-4 bg-white-800 text-black text-right">
-      <p>Release: {version}</p>
+      <p>Release {version}</p>
     </div>
   );
 }
diff --git a/src/helm-frontend/src/components/Hero.tsx b/src/helm-frontend/src/components/Hero.tsx
index 886b33196f..4afbd08e3a 100644
--- a/src/helm-frontend/src/components/Hero.tsx
+++ b/src/helm-frontend/src/components/Hero.tsx
@@ -1,43 +1,50 @@
 import helmHero from "@/assets/helmhero.png";
 import { Link } from "react-router-dom";
+import MiniLeaderboard from "./MiniLeaderboard";
 
 export default function Hero() {
   return (
-    <div className="flex px-6 py-14">
-      {/* Left side content */}
-      <div className="flex-1 p-4 flex flex-col justify-center">
-        {" "}
-        {/* Added flex and justify-center */}
-        <div className="flex justify-start">
-          <h1 className="text-5xl mb-4 mx-4 mt-2">
-            <strong>
-              A holistic framework for evaluating foundation models.
-            </strong>
-          </h1>
-        </div>
-        <div className="flex justify-start mt-6 ml-4">
-          <Link to="leaderboard">
-            <button className="px-6 btn btn-grey rounded-md">
-              <body>Leaderboard</body>
-            </button>
-          </Link>
-          <Link to="https://github.com/stanford-crfm/helm" className="ml-4">
-            {" "}
-            {/* Added margin-left for spacing */}
-            <button className="px-6 btn btn-grey rounded-md">Github</button>
-          </Link>
-        </div>
+    <div className="flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0">
+      {/* Text section */}
+      <div className="flex flex-col text-center mb-10 justify-start">
+        <h1 className="text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2">
+          <strong>
+            A holistic framework for evaluating foundation models.
+          </strong>
+        </h1>
       </div>
 
-      {/* Right side image */}
-      <div className="w-1/3 mx-4">
-        {" "}
-        {/* Added mx-4 for horizontal margin */}
-        <img
-          src={helmHero}
-          alt="HELM Hero"
-          className="object-cover w-full h-full"
-        />
+      {/* Container for Image and Leaderboard */}
+      <div
+        className="flex flex-col md:flex-col lg:flex-row lg:justify-center"
+        style={{ height: "525px", transform: "scale(0.9)" }} // Reduced height by 10%
+      >
+        {/* Image section */}
+        <div className="w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10">
+          <img
+            src={helmHero}
+            alt="HELM Hero"
+            className="object-cover h-full" // Stretched to full height
+            style={{ maxWidth: "100%" }}
+          />
+        </div>
+
+        {/* Leaderboard section */}
+        <div className="w-full lg:w-1/2 flex justify-center h-full py-10">
+          <div
+            className="py-2 pb-6 rounded-3xl bg-gray-100 h-full" // Stretched to full height
+            style={{ maxWidth: "100%" }}
+          >
+            <MiniLeaderboard></MiniLeaderboard>
+            <div className="flex justify-end">
+              <Link to="leaderboard">
+                <button className="px-4 mx-3 mt-1 btn bg-white rounded-md">
+                  <span>See More</span>
+                </button>
+              </Link>
+            </div>
+          </div>
+        </div>
       </div>
     </div>
   );
diff --git a/src/helm-frontend/src/components/LeaderboardTables.tsx b/src/helm-frontend/src/components/LeaderboardTables.tsx
index 3580fda1e4..7c30e13e94 100644
--- a/src/helm-frontend/src/components/LeaderboardTables.tsx
+++ b/src/helm-frontend/src/components/LeaderboardTables.tsx
@@ -1,3 +1,4 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
 import { useEffect, useState } from "react";
 import { ChevronUpDownIcon } from "@heroicons/react/24/solid";
 import type GroupsTable from "@/types/GroupsTable";
@@ -9,6 +10,10 @@ interface Props {
   ignoreHref?: boolean;
   sortable?: boolean;
   sortFirstMetric?: boolean;
+  filtered?: boolean;
+  modelsToFilter?: string[];
+  numModelsToAutoFilter?: number;
+  filteredCols?: any[];
 }
 
 export default function LeaderboardTables({
@@ -17,6 +22,10 @@ export default function LeaderboardTables({
   ignoreHref = false,
   sortable = true,
   sortFirstMetric = true,
+  filtered = false,
+  filteredCols = [],
+  modelsToFilter = [],
+  numModelsToAutoFilter = 0, // if non-zero, sets how many models to filter down to (ranked by first column)
 }: Props) {
   const [activeSortColumn, setActiveSortColumn] = useState<number | undefined>(
     sortFirstMetric ? 1 : undefined,
@@ -25,10 +34,38 @@ export default function LeaderboardTables({
     ...groupsTables[activeGroup],
   });
   const [sortDirection, setSortDirection] = useState<number>(1);
+  const [filteredModels, setFilteredModels] =
+    useState<string[]>(modelsToFilter);
+
+  interface HeaderValueObject {
+    value: string;
+  }
+
+  const getHeaderValue = (headerValueObject: HeaderValueObject): string => {
+    if (headerValueObject.value === "Model/adapter") {
+      return "Model";
+    } else {
+      return headerValueObject.value;
+    }
+  };
 
   useEffect(() => {
     setActiveGroupsTable({ ...groupsTables[activeGroup] });
-  }, [activeGroup, groupsTables]);
+    // upon receiving and setting data for current table, use sort to figure out n top models
+    if (numModelsToAutoFilter) {
+      const activeRows = groupsTables[0].rows;
+      const sortedRows = activeRows.sort((a, b) => {
+        // assumes we sort by column 1, which represents Mean Win Rate in the Core Scenarios table
+        // this assumption works as numModelsToAutoFilter is only used in mini leaderboards
+        // which always have one main scenario we sort by
+        return Number(b[1].value) - Number(a[1].value);
+      });
+      // Keep only the top numModelsToAutoFilter rows
+      const topNumRows = sortedRows.slice(0, numModelsToAutoFilter);
+      const topNumRowNames = topNumRows.map((row) => String(row[0].value));
+      setFilteredModels(topNumRowNames);
+    }
+  }, [activeGroup, groupsTables, numModelsToAutoFilter]);
 
   const handleSort = (columnIndex: number) => {
     let sort = sortDirection;
@@ -75,54 +112,134 @@ export default function LeaderboardTables({
   }, [sortFirstMetric, activeSortColumn]);
 
   return (
-    <div className="rounded-lg overflow-hidden shadow-md bg-white p-4">
-      <div className="overflow-x-auto">
-        <table className="table w-full px-4">
-          <thead>
-            <tr>
-              {activeGroupsTable.header.map((headerValue, idx) => (
-                <th
-                  key={`${activeGroup}-${idx}`}
-                  className={`${
-                    idx === activeSortColumn ? "bg-gray-100" : ""
-                  } whitespace-nowrap px-4`}
-                >
-                  <div className="flex gap-2 items-center">
-                    <span>{headerValue.value}</span>
-                    {sortable ? (
-                      <button className="link" onClick={() => handleSort(idx)}>
-                        <ChevronUpDownIcon className="w-6 h-6" />
-                      </button>
-                    ) : null}
-                  </div>
-                </th>
-              ))}
-            </tr>
-          </thead>
-          <tbody>
-            {activeGroupsTable.rows.map((row, idx) => (
-              <tr
-                key={`${activeGroup}-${idx}`}
-                className={`${idx % 2 === 0 ? "bg-gray-50" : ""}`}
-              >
-                {" "}
-                {/* Added alternating row highlighting */}
-                {row.map((rowValue, cellIdx) => (
-                  <td
-                    key={`${activeGroup}-${cellIdx}`}
-                    className={`${cellIdx === 0 ? "text-lg" : ""}`}
+    <>
+      {filtered ? (
+        <div
+          className="rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0"
+          style={{ overflow: "auto" }}
+        >
+          <div className="overflow-x-auto">
+            <table className="table w-full">
+              <thead>
+                <tr>
+                  {activeGroupsTable.header
+                    .filter(
+                      (_, cellIdx) =>
+                        filteredCols.length === 0 ||
+                        filteredCols.includes(cellIdx),
+                    )
+                    .map((headerValue, idx) => (
+                      <th
+                        key={`${activeGroup}-${idx}`}
+                        className={`${
+                          idx === activeSortColumn ? "bg-gray-100" : ""
+                        } whitespace-nowrap px-4`}
+                      >
+                        <div className="flex gap-2 items-center">
+                          <span>{getHeaderValue(headerValue)}</span>
+                          {sortable ? (
+                            <button
+                              className="link"
+                              onClick={() => handleSort(idx)}
+                            >
+                              <ChevronUpDownIcon className="w-6 h-6" />
+                            </button>
+                          ) : null}
+                        </div>
+                      </th>
+                    ))}
+                </tr>
+              </thead>
+              <tbody>
+                {activeGroupsTable.rows
+                  .filter((row) =>
+                    filteredModels.includes(String(row[0].value)),
+                  )
+                  .map((row, idx) => (
+                    <tr
+                      key={`${activeGroup}-${idx}`}
+                      className={`${idx % 2 === 0 ? "bg-gray-50" : ""}`}
+                    >
+                      {" "}
+                      {/* Added alternating row highlighting */}
+                      {row
+                        // Filtering columns if filteredCols is provided
+                        .filter(
+                          (_, cellIdx) =>
+                            filteredCols.length === 0 ||
+                            filteredCols.includes(cellIdx),
+                        )
+                        .map((rowValue, cellIdx) => (
+                          <td
+                            key={`${activeGroup}-${cellIdx}`}
+                            className={`${cellIdx === 0 ? "text-lg" : ""}`}
+                          >
+                            <RowValue
+                              ignoreHref={ignoreHref && cellIdx === 0}
+                              value={rowValue}
+                            />
+                          </td>
+                        ))}
+                    </tr>
+                  ))}
+              </tbody>
+            </table>
+          </div>
+        </div>
+      ) : (
+        <div className="rounded-lg overflow-hidden shadow-md bg-white p-4">
+          <div className="overflow-x-auto">
+            <table className="table w-full px-4">
+              <thead>
+                <tr>
+                  {activeGroupsTable.header.map((headerValue, idx) => (
+                    <th
+                      key={`${activeGroup}-${idx}`}
+                      className={`${
+                        idx === activeSortColumn ? "bg-gray-100" : ""
+                      } whitespace-nowrap px-4`}
+                    >
+                      <div className="flex gap-2 items-center">
+                        <span>{getHeaderValue(headerValue)}</span>
+                        {sortable ? (
+                          <button
+                            className="link"
+                            onClick={() => handleSort(idx)}
+                          >
+                            <ChevronUpDownIcon className="w-6 h-6" />
+                          </button>
+                        ) : null}
+                      </div>
+                    </th>
+                  ))}
+                </tr>
+              </thead>
+              <tbody>
+                {activeGroupsTable.rows.map((row, idx) => (
+                  <tr
+                    key={`${activeGroup}-${idx}`}
+                    className={`${idx % 2 === 0 ? "bg-gray-50" : ""}`}
                   >
-                    <RowValue
-                      ignoreHref={ignoreHref && cellIdx === 0}
-                      value={rowValue}
-                    />
-                  </td>
+                    {" "}
+                    {/* Added alternating row highlighting */}
+                    {row.map((rowValue, cellIdx) => (
+                      <td
+                        key={`${activeGroup}-${cellIdx}`}
+                        className={`${cellIdx === 0 ? "text-lg" : ""}`}
+                      >
+                        <RowValue
+                          ignoreHref={ignoreHref && cellIdx === 0}
+                          value={rowValue}
+                        />
+                      </td>
+                    ))}
+                  </tr>
                 ))}
-              </tr>
-            ))}
-          </tbody>
-        </table>
-      </div>
-    </div>
+              </tbody>
+            </table>
+          </div>
+        </div>
+      )}
+    </>
   );
 }
diff --git a/src/helm-frontend/src/components/MetricsList.tsx b/src/helm-frontend/src/components/MetricsList.tsx
index 0fdc996a55..84ee034845 100644
--- a/src/helm-frontend/src/components/MetricsList.tsx
+++ b/src/helm-frontend/src/components/MetricsList.tsx
@@ -1,6 +1,5 @@
 import type Metric from "@/types/Metric";
 import type MetricGroup from "@/types/MetricGroup";
-import { Link as ReactRouterLink } from "react-router-dom";
 
 interface Props {
   metrics: Metric[];
@@ -8,34 +7,44 @@ interface Props {
 }
 
 export default function MetricList({ metrics, metricGroups }: Props) {
+  const metricNameToMetric = new Map<string, Metric>();
+  metrics.forEach((metric) => metricNameToMetric.set(metric.name, metric));
+
+  // Only count metrics that have a group and are displayed
+  // i.e. don't count "orphaned" metrics
+  // Also, don't double-count metrics that appear in multiple groups
+  const groupedMetricNames = new Set<string>();
+
+  const metricGroupsWithMetrics: [MetricGroup, Metric[]][] = [];
+  metricGroups.forEach((metricGroup) => {
+    const metricGroupMetrics: Metric[] = [];
+    metricGroup.metrics.forEach((metricField) => {
+      const maybeMetric = metricNameToMetric.get(metricField.name);
+      if (maybeMetric) {
+        metricGroupMetrics.push(maybeMetric);
+        groupedMetricNames.add(maybeMetric.name);
+      }
+    });
+    if (metricGroupMetrics.length > 0) {
+      metricGroupsWithMetrics.push([metricGroup, metricGroupMetrics]);
+    }
+  });
+
   return (
     <section>
-      <h3 className="text-3xl">{metrics.length} metrics</h3>
+      <h3 className="text-3xl">{groupedMetricNames.size} metrics</h3>
       <ul>
-        {metricGroups.map((metricGroup, idx) => (
-          <li className="my-3" key={idx}>
-            {metrics.filter((metric) =>
-              metricGroup.metrics.some((m) => m.name === metric.name),
-            ).length > 0 ? (
-              <ReactRouterLink
-                className="text-black"
-                to={"groups/" + metricGroup.name}
-              >
-                <h4>{metricGroup.display_name}</h4>
-              </ReactRouterLink>
-            ) : null}
+        {metricGroupsWithMetrics.map(([metricGroup, metrics]) => (
+          <li className="my-3" key={metricGroup.name}>
+            <h4>{metricGroup.display_name}</h4>
             <ul className="list-disc list-inside">
-              {metrics
-                .filter((metric) =>
-                  metricGroup.metrics.some((m) => m.name === metric.name),
-                )
-                .map((metric, idx) => {
-                  return (
-                    <li key={idx} className="ml-4">
-                      {metric.display_name}
-                    </li>
-                  );
-                })}
+              {metrics.map((metric) => {
+                return (
+                  <li key={metric.name} className="ml-4">
+                    {metric.display_name}
+                  </li>
+                );
+              })}
             </ul>
           </li>
         ))}
diff --git a/src/helm-frontend/src/components/MiniLeaderboard.tsx b/src/helm-frontend/src/components/MiniLeaderboard.tsx
new file mode 100644
index 0000000000..d209bc39ae
--- /dev/null
+++ b/src/helm-frontend/src/components/MiniLeaderboard.tsx
@@ -0,0 +1,92 @@
+import { useEffect, useState } from "react";
+import PageTitle from "@/components/PageTitle";
+import LeaderboardTables from "@/components/LeaderboardTables";
+import type GroupsTable from "@/types/GroupsTable";
+import type GroupMetadata from "@/types/GroupMetadata";
+import getGroupsTablesByName from "@/services/getGroupTablesByName";
+import getGroupsMetadata from "@/services/getGroupsMetadata";
+import Loading from "@/components/Loading";
+import getGroupsTables from "@/services/getGroupsTables";
+
+interface GroupDisplayData {
+  title: string;
+  name: string;
+}
+
+export default function MiniLeaderboard() {
+  const defaultGroup = { title: "Core Scenarios", name: "core_scenarios" };
+  const selectedGroupDisplayData = defaultGroup;
+  const [allGroupData, setAllGroupData] = useState<GroupDisplayData[]>([]);
+  const [groupsTables, setGroupsTables] = useState<GroupsTable[]>([]);
+  const [groupMetadata, setGroupMetadata] = useState<
+    GroupMetadata | undefined
+  >();
+  const [isLoading, setIsLoading] = useState<boolean>(true);
+  const activeGroup = 0;
+  console.log(allGroupData);
+
+  useEffect(() => {
+    const controller = new AbortController();
+    async function fetchData() {
+      if (selectedGroupDisplayData.name === undefined) {
+        return;
+      }
+      const groups = await getGroupsTables(controller.signal);
+      const result: GroupDisplayData[] = [];
+      groups.forEach((group) => {
+        group.rows.forEach((row) => {
+          result.push({
+            title: String(row[0].value),
+            name: row[0].href.replace("?group=", ""),
+          });
+        });
+      });
+      setAllGroupData(result);
+
+      const [group, metadata] = await Promise.all([
+        getGroupsTablesByName(selectedGroupDisplayData.name, controller.signal),
+        getGroupsMetadata(controller.signal),
+      ]);
+      setGroupsTables(group);
+      setGroupMetadata(metadata[selectedGroupDisplayData.name]);
+      setIsLoading(false);
+    }
+
+    void fetchData();
+    return () => controller.abort();
+  }, [selectedGroupDisplayData.name]);
+
+  if (isLoading || groupMetadata === undefined) {
+    return <Loading />;
+  }
+
+  if (groupsTables.length === 0) {
+    return (
+      <>
+        <PageTitle
+          title={groupMetadata.display_name}
+          subtitle={groupMetadata.description}
+          markdown={true}
+          className="mr-8"
+        />
+        <div className="divider"></div>
+        <p className="text-center mt-8">Group currently has no results.</p>
+      </>
+    );
+  }
+
+  return (
+    <>
+      <>
+        <LeaderboardTables
+          groupsTables={groupsTables}
+          activeGroup={activeGroup}
+          ignoreHref={true}
+          filtered
+          numModelsToAutoFilter={6}
+          filteredCols={[0, 1]}
+        />
+      </>
+    </>
+  );
+}
diff --git a/src/helm-frontend/src/components/NavBar/NavBar.test.tsx b/src/helm-frontend/src/components/NavBar/NavBar.test.tsx
index cbd2f650cc..1c41a0c88f 100644
--- a/src/helm-frontend/src/components/NavBar/NavBar.test.tsx
+++ b/src/helm-frontend/src/components/NavBar/NavBar.test.tsx
@@ -11,6 +11,6 @@ test("displays nav bar", () => {
   );
 
   expect(screen.getByRole("navigation")).toHaveTextContent(
-    "LeaderboardModelsScenariosExplore PredictionsLeaderboardModelsScenariosExplore Predictions",
+    "LeaderboardModelsScenariosPredictionsGitHub LeaderboardModelsScenariosPredictionsGitHub Release: undefined",
   );
 });
diff --git a/src/helm-frontend/src/components/NavBar/NavBar.tsx b/src/helm-frontend/src/components/NavBar/NavBar.tsx
index 354aa71455..0d441bbecf 100644
--- a/src/helm-frontend/src/components/NavBar/NavBar.tsx
+++ b/src/helm-frontend/src/components/NavBar/NavBar.tsx
@@ -3,10 +3,21 @@ import { Bars3Icon } from "@heroicons/react/24/outline";
 import crfmLogo from "@/assets/crfm-logo.png";
 //import helmLogo from "@/assets/helm-logo-simple.png";
 import NavDropdown from "@/components/NavDropdown";
+import ReleaseDropdown from "../ReleaseDropdown";
 
 export default function NavBar() {
+  const navbarStyle: React.CSSProperties = {
+    position: "sticky",
+    top: "0",
+    zIndex: "1000",
+    backgroundColor: "rgba(255, 255, 255, 0.9)",
+    // Add any additional inline styles for the sticky navbar here
+  };
   return (
-    <nav className="navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px">
+    <nav
+      style={navbarStyle}
+      className="navbar sticky-nav h-24 px-8 md:px-12 bg-base-100 max-w[1500]px"
+    >
       <div>
         <div className="dropdown md:hidden mr-4">
           <label
@@ -30,9 +41,12 @@ export default function NavBar() {
             </li>
             <li>
               <Link to="runs" className="whitespace-nowrap">
-                Explore Predictions
+                Predictions
               </Link>
             </li>
+            <li>
+              <Link to="https://github.com/stanford-crfm/helm">GitHub</Link>
+            </li>
           </ul>
         </div>
       </div>
@@ -54,7 +68,13 @@ export default function NavBar() {
             <Link to="scenarios">Scenarios</Link>
           </li>
           <li>
-            <Link to="runs">Explore Predictions</Link>
+            <Link to="runs">Predictions</Link>
+          </li>
+          <li>
+            <Link to="https://github.com/stanford-crfm/helm">GitHub</Link>
+          </li>
+          <li className="hidden lg:flex">
+            <ReleaseDropdown />
           </li>
         </ul>
       </div>
diff --git a/src/helm-frontend/src/components/NavDropdown.tsx b/src/helm-frontend/src/components/NavDropdown.tsx
index 403aa9f1a1..5a1ae09fde 100644
--- a/src/helm-frontend/src/components/NavDropdown.tsx
+++ b/src/helm-frontend/src/components/NavDropdown.tsx
@@ -8,11 +8,22 @@ function NavDropdown() {
     <div className="relative inline-block text-left p-2">
       <div className="inline-flex items-center p-2">
         <Link to="/">
-          <img
-            src="https://crfm.stanford.edu/helm/v0.3.0/images/helm-logo-simple.png"
-            alt="Image 1"
-            className="w-full h-10 object-cover"
-          />
+          <div className="flex items-center">
+            <img
+              src="https://crfm.stanford.edu/helm/v0.3.0/images/helm-logo-simple.png"
+              alt="Image 1"
+              className="w-full h-10 object-cover"
+            />
+            {/* Manually set whether Classic or Not via config, otherwise don't show this */}
+            {window.HELM_TYPE ? (
+              <div className="hidden lg:flex pl-3">
+                {" "}
+                <strong>{window.HELM_TYPE}</strong>{" "}
+              </div>
+            ) : (
+              <> </>
+            )}
+          </div>
         </Link>
 
         {/* Chevron Button */}
@@ -22,7 +33,7 @@ function NavDropdown() {
         >
           <svg
             xmlns="http://www.w3.org/2000/svg"
-            className="h-6 w-6 ml-2"
+            className="h-4 w-4 ml-2"
             fill="none"
             viewBox="0 0 24 24"
             stroke="currentColor"
@@ -38,7 +49,7 @@ function NavDropdown() {
       </div>
 
       {dropdownOpen && (
-        <div className="absolute mt-2 w-96 translate-x-4 rounded-md shadow-lg bg-white ring-1 ring-black ring-opacity-5">
+        <div className="absolute mt-2 w-max translate-x-4 rounded-md shadow-lg bg-white ring-1 ring-black ring-opacity-5">
           <div
             className="py-1"
             role="menu"
@@ -49,20 +60,36 @@ function NavDropdown() {
               className="block px-4 py-2 text-md text-gray-700 hover:bg-gray-100 hover:text-gray-900"
               role="menuitem"
             >
-              <Link to="https://crfm.stanford.edu/helm/latest/">
+              <a href="https://crfm.stanford.edu/helm/classic/latest/">
                 <div className="flex items-center">
                   <span>
-                    <strong>HELM: </strong>Holistic evaluation of language
-                    models
+                    <strong>HELM Classic: </strong>Thorough language model
+                    evaluations based on the scenarios from the original HELM
+                    paper
                   </span>
                 </div>
-              </Link>
+              </a>
+            </div>
+
+            <div
+              className="block px-4 py-2 text-md text-gray-700 hover:bg-gray-100 hover:text-gray-900"
+              role="menuitem"
+            >
+              <a href="https://crfm.stanford.edu/helm/lite/latest/">
+                <div className="flex items-center">
+                  <span>
+                    <strong>HELM Lite: </strong>Lightweight, broad evaluation of
+                    the capabilities of language models using in-context
+                    learning
+                  </span>
+                </div>
+              </a>
             </div>
             <div
               className="block px-4 py-2 text-md text-gray-700 hover:bg-gray-100 hover:text-gray-900"
               role="menuitem"
             >
-              <a href="https://crfm.stanford.edu/heim/latest/?">
+              <a href="https://crfm.stanford.edu/heim/latest/">
                 <div className="flex items-center">
                   <span>
                     <strong>HEIM: </strong>Holistic evaluation of text-to-image
diff --git a/src/helm-frontend/src/components/ReleaseDropdown.tsx b/src/helm-frontend/src/components/ReleaseDropdown.tsx
new file mode 100644
index 0000000000..dda60035a8
--- /dev/null
+++ b/src/helm-frontend/src/components/ReleaseDropdown.tsx
@@ -0,0 +1,70 @@
+import { useState } from "react";
+import getBenchmarkRelease from "@/utils/getBenchmarkRelease";
+
+/**
+ * Navigation-bar control that displays the value returned by
+ * getBenchmarkRelease() and, when toggled open, lists links to the
+ * hard-coded set of published releases.
+ */
+function ReleaseDropdown() {
+  const [dropdownOpen, setDropdownOpen] = useState(false);
+  const release = getBenchmarkRelease();
+
+  const accessibleReleases = ["v0.4.0", "v0.3.0", "v0.2.2"]; // this could also read from a config file in the future
+
+  return (
+    <div>
+      <div className="inline-flex items-center">
+        {/* Chevron Button */}
+        <button
+          onClick={() => setDropdownOpen(!dropdownOpen)}
+          className="inline-flex items-center justify-center focus:outline-none focus-visible:ring-2 focus-visible:ring-white focus-visible:ring-opacity-75"
+        >
+          <div> Release: {release} </div>
+          <svg
+            xmlns="http://www.w3.org/2000/svg"
+            className="h-4 w-4 ml-2"
+            fill="none"
+            viewBox="0 0 24 24"
+            stroke="currentColor"
+          >
+            <path
+              strokeLinecap="round"
+              strokeLinejoin="round"
+              strokeWidth={2}
+              d="M19 9l-7 7-7-7"
+            />
+          </svg>
+        </button>
+      </div>
+
+      {dropdownOpen && (
+        <div className="absolute mt-2 w-max translate-x-4 rounded-md shadow-lg bg-white ring-1 ring-black ring-opacity-5">
+          <div
+            className="py-1"
+            role="menu"
+            aria-orientation="vertical"
+            aria-labelledby="options-menu"
+          >
+            {/* Release strings are unique, so each doubles as its React key. */}
+            {accessibleReleases.map((currRelease) => (
+              <div
+                key={currRelease}
+                className="block px-4 py-2 text-md text-gray-700 hover:bg-gray-100 hover:text-gray-900"
+                role="menuitem"
+              >
+                <a href={"https://crfm.stanford.edu/helm/" + currRelease}>
+                  <div className="flex items-center">
+                    <span>{currRelease}</span>
+                  </div>
+                </a>
+              </div>
+            ))}
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}
+
+export default ReleaseDropdown;
diff --git a/src/helm-frontend/src/components/ScenariosList.tsx b/src/helm-frontend/src/components/ScenariosList.tsx
index 20afd79379..67081107eb 100644
--- a/src/helm-frontend/src/components/ScenariosList.tsx
+++ b/src/helm-frontend/src/components/ScenariosList.tsx
@@ -6,77 +6,80 @@ interface Props {
 }
 
 export default function ScenariosList({ runGroups }: Props) {
-  const { topLevelGroups, subGroups } = runGroups.reduce(
-    (acc, cur) => {
-      if (cur.category !== undefined) {
-        acc.topLevelGroups.push(cur);
-      } else {
-        acc.subGroups.push(cur);
-      }
-
-      return acc;
-    },
-    { topLevelGroups: [], subGroups: [] } as {
-      topLevelGroups: RunGroup[];
-      subGroups: RunGroup[];
-    },
+  // A run group is a scenario if it has metric groups but no subgroups.
+  const scenariosByName = new Map<string, RunGroup>(
+    runGroups
+      .filter(
+        (runGroup) =>
+          runGroup.metric_groups !== undefined &&
+          (runGroup.subgroups === undefined || runGroup.subgroups.length === 0),
+      )
+      .map((runGroup) => [runGroup.name, runGroup]),
   );
 
+  // Only count scenarios that have a category and are displayed
+  // i.e. don't count "orphaned" scenarios
+  // Also, don't double-count scenarios that appear in multiple categories
+  const categorizedScenarioNames = new Set<string>();
+
+  const categoriesWithScenarios: [RunGroup, RunGroup[]][] = [];
+  runGroups.forEach((runGroup) => {
+    const subgroups: string[] = runGroup.subgroups ? runGroup.subgroups : [];
+    const groupScenarios: RunGroup[] = [];
+    subgroups.forEach((subgroup) => {
+      const maybeScenario = scenariosByName.get(subgroup);
+      if (maybeScenario) {
+        groupScenarios.push(maybeScenario);
+        categorizedScenarioNames.add(maybeScenario.name);
+      }
+    });
+    if (groupScenarios.length > 0) {
+      categoriesWithScenarios.push([runGroup, groupScenarios]);
+    }
+  });
+
   return (
     <section>
-      <h3 className="text-3xl">{runGroups.length} scenarios</h3>
+      <h3 className="text-3xl">{categorizedScenarioNames.size} scenarios</h3>
       <ul>
-        {topLevelGroups
-          .filter((topLevelGroup) =>
-            subGroups.some((subGroup) =>
-              (topLevelGroup.subgroups || []).includes(subGroup.name),
-            ),
-          )
-          .map((topLevelGroup, idx) => (
-            <li key={idx} className="my-3">
-              <ReactRouterLink
-                className="text-black"
-                to={"groups/" + topLevelGroup.name}
-              >
-                <h2>{topLevelGroup.display_name}</h2>
-              </ReactRouterLink>
-              <ul className="list-disc list-inside">
-                {subGroups
-                  .filter((subGroup) =>
-                    (topLevelGroup.subgroups || []).includes(subGroup.name),
-                  )
-                  .map((subGroup, idx) =>
-                    subGroup.todo || subGroup.name.includes("CLEVA") ? (
-                      <li
-                        key={idx}
-                        className={`${
-                          subGroup.todo ? "ml-4 text-slate-300" : "ml-4"
-                        }`}
-                      >
-                        {subGroup.display_name}
-                      </li>
-                    ) : (
-                      <ReactRouterLink
-                        className="text-black"
-                        to={"groups/" + subGroup.name}
-                      >
-                        <li
-                          key={idx}
-                          className={`${
-                            subGroup.todo ||
-                            subGroup.display_name.includes("CLEVA")
-                              ? "ml-4 text-slate-300"
-                              : "ml-4"
-                          }`}
-                        >
-                          {subGroup.display_name}
-                        </li>
-                      </ReactRouterLink>
-                    ),
-                  )}
-              </ul>
-            </li>
-          ))}
+        {categoriesWithScenarios.map(([category, scenarios]) => (
+          <li key={category.name} className="my-3">
+            <ReactRouterLink
+              className="text-black"
+              to={"groups/" + category.name}
+            >
+              <h2>{category.display_name}</h2>
+            </ReactRouterLink>
+            <ul className="list-disc list-inside">
+              {scenarios.map((scenario) =>
+                scenario.todo ? (
+                  <li
+                    key={scenario.name}
+                    className={`${
+                      scenario.todo ? "ml-4 text-slate-300" : "ml-4"
+                    }`}
+                  >
+                    {scenario.display_name}
+                  </li>
+                ) : (
+                  <ReactRouterLink
+                    key={scenario.name}
+                    className="text-black"
+                    to={"groups/" + scenario.name}
+                  >
+                    <li
+                      className={`${
+                        scenario.todo ? "ml-4 text-slate-300" : "ml-4"
+                      }`}
+                    >
+                      {scenario.display_name}
+                    </li>
+                  </ReactRouterLink>
+                ),
+              )}
+            </ul>
+          </li>
+        ))}
       </ul>
     </section>
   );
diff --git a/src/helm-frontend/src/layouts/Main.tsx b/src/helm-frontend/src/layouts/Main.tsx
index cc356549cb..0e18442f35 100644
--- a/src/helm-frontend/src/layouts/Main.tsx
+++ b/src/helm-frontend/src/layouts/Main.tsx
@@ -1,6 +1,5 @@
 import { Outlet } from "react-router-dom";
 import Nav from "@/components/NavBar/NavBar";
-import Footer from "@/components/Footer";
 
 export default function Main() {
   return (
@@ -11,7 +10,6 @@ export default function Main() {
           <Outlet />
         </div>
       </main>
-      <Footer />
     </>
   );
 }
diff --git a/src/helm-frontend/src/routes/Landing.tsx b/src/helm-frontend/src/routes/Landing.tsx
index bb4a13023a..d7e505abcf 100644
--- a/src/helm-frontend/src/routes/Landing.tsx
+++ b/src/helm-frontend/src/routes/Landing.tsx
@@ -2,7 +2,6 @@ import { useEffect, useState } from "react";
 import getSchema from "@/services/getSchema";
 import type Schema from "@/types/Schema";
 import ModelsList from "@/components/ModelsList";
-import MetricsList from "@/components/MetricsList";
 import ScenariosList from "@/components/ScenariosList";
 import Hero from "@/components/Hero";
 
@@ -60,39 +59,37 @@ export default function LegacyLanding() {
   return (
     <>
       <Hero />
+      <div className="mx-auto text-lg px-16">
+        <div className="container mb-12 mx-auto text-lg px-16">
+          <div className="flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32">
+            {" "}
+            <h1 className="text-4xl  mx-4 mt-40">
+              <strong>Our Partners</strong>
+            </h1>
+          </div>
 
-      <div className="container mb-12 mx-auto text-lg px-16">
-        <div className="flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32">
-          {" "}
-          <h1 className="text-4xl  mx-4 mt-40">
-            <strong>Our Partners</strong>
-          </h1>
+          <ol className="my-8 flex flex-col gap-32">
+            <li>
+              <div className="flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto">
+                {logos.map((logo, idx) => (
+                  <div className="w-24 h-24 flex items-center m-6" key={idx}>
+                    <img
+                      src={logo}
+                      alt="Logo"
+                      className="mx-auto block"
+                      sizes="100vw"
+                    />
+                  </div>
+                ))}
+              </div>
+            </li>
+          </ol>
         </div>
-        <ol className="my-8 flex flex-col gap-32">
-          <li>
-            <div className="flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto">
-              {logos.map((logo, idx) => (
-                <div className="w-24 h-24 flex items-center m-6" key={idx}>
-                  <img
-                    src={logo}
-                    alt="Logo"
-                    className="mx-auto block"
-                    sizes="100vw"
-                  />
-                </div>
-              ))}
-            </div>
-          </li>
-        </ol>
-      </div>
-      <div className="container mx-auto">
-        <div className="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-8">
-          <ModelsList models={schema.models} />
-          <ScenariosList runGroups={schema.run_groups} />
-          <MetricsList
-            metrics={schema.metrics}
-            metricGroups={schema.metric_groups}
-          />
+        <div className="container mx-auto">
+          <div className="grid grid-cols-1 sm:grid-cols-2 gap-8">
+            <ModelsList models={schema.models} />
+            <ScenariosList runGroups={schema.run_groups} />
+          </div>
         </div>
       </div>
     </>
diff --git a/src/helm-frontend/src/routes/Leaderboard.tsx b/src/helm-frontend/src/routes/Leaderboard.tsx
index bc5b8b2aa9..71d4905ea8 100644
--- a/src/helm-frontend/src/routes/Leaderboard.tsx
+++ b/src/helm-frontend/src/routes/Leaderboard.tsx
@@ -9,7 +9,6 @@ import getGroupsTablesByName from "@/services/getGroupTablesByName";
 import getGroupsMetadata from "@/services/getGroupsMetadata";
 import Loading from "@/components/Loading";
 import getGroupsTables from "@/services/getGroupsTables";
-import Alert from "@/components/Alert";
 
 interface GroupDisplayData {
   title: string;
@@ -18,7 +17,6 @@ interface GroupDisplayData {
 
 export default function Leaderboard() {
   const defaultGroup = { title: "Core Scenarios", name: "core_scenarios" };
-  //const defaultGroup.name = "core_scenarios";
   const [allGroupData, setAllGroupData] = useState<GroupDisplayData[]>([]);
   const [selectedGroupDisplayData, setSelectedGroupDisplayData] =
     useState(defaultGroup);
@@ -28,7 +26,6 @@ export default function Leaderboard() {
   >();
   const [isLoading, setIsLoading] = useState<boolean>(true);
   const [activeGroup, setActiveGroup] = useState<number>(0);
-  console.log(allGroupData);
 
   function findMatchingGroup(
     allGroupData: GroupDisplayData[],
@@ -99,58 +96,59 @@ export default function Leaderboard() {
 
   return (
     <>
-      <Alert />
-      <div className="flex flex-row justify-between">
-        <PageTitle
-          title={"Leaderboard"}
-          subtitle={
-            "The leaderboard shows how the various models (with particular adaptation procedures) perform across different groups of scenarios and different metrics."
-          }
-          markdown={true}
-          className="mr-8 mb-16"
-        />
-        <div className="w-64 py-10 ">
-          <label
-            htmlFor="group"
-            className="block text-sm font-medium text-gray-700"
-          >
-            Select a group:
-          </label>
-          <select
-            id="group"
-            name="group"
-            value={selectedGroupDisplayData.title}
-            onChange={(e) => updateLeaderboard(allGroupData, e.target.value)}
-            className="mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md"
-          >
-            {allGroupData.map((group, index) => (
-              <option key={index} value={group.title}>
-                {group.title}
-              </option>
-            ))}
-          </select>
+      <>
+        <div className="flex flex-row justify-between">
+          <PageTitle
+            title={"HELM Leaderboard"}
+            subtitle={
+              "HELM is a framework for evaluating foundation models. Our leaderboard shows how the various models (with particular adaptation procedures) perform across different groups of scenarios and different metrics."
+            }
+            markdown={true}
+            className="mr-8 mb-16"
+          />
+          <div className="w-64 py-10 ">
+            <label
+              htmlFor="group"
+              className="block text-sm font-medium text-gray-700"
+            >
+              Select a group:
+            </label>
+            <select
+              id="group"
+              name="group"
+              value={selectedGroupDisplayData.title}
+              onChange={(e) => updateLeaderboard(allGroupData, e.target.value)}
+              className="mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md"
+            >
+              {allGroupData.map((group, index) => (
+                <option key={index} value={group.title}>
+                  {group.title}
+                </option>
+              ))}
+            </select>
+          </div>
         </div>
-      </div>
-      <div className="overflow-x-auto">
-        {groupsTables.length > 1 ? (
-          <Tabs>
-            {groupsTables.map((groupsTable, idx) => (
-              <Tab
-                key={idx}
-                active={idx === activeGroup}
-                onClick={() => setActiveGroup(idx)}
-              >
-                {groupsTable.title}
-              </Tab>
-            ))}
-          </Tabs>
-        ) : null}
-      </div>
-      <LeaderboardTables
-        groupsTables={groupsTables}
-        activeGroup={activeGroup}
-        ignoreHref={true}
-      />
+        <div className="overflow-x-auto">
+          {groupsTables.length > 1 ? (
+            <Tabs>
+              {groupsTables.map((groupsTable, idx) => (
+                <Tab
+                  key={idx}
+                  active={idx === activeGroup}
+                  onClick={() => setActiveGroup(idx)}
+                >
+                  {groupsTable.title}
+                </Tab>
+              ))}
+            </Tabs>
+          ) : null}
+        </div>
+        <LeaderboardTables
+          groupsTables={groupsTables}
+          activeGroup={activeGroup}
+          ignoreHref={true}
+        />
+      </>
     </>
   );
 }
diff --git a/src/helm-frontend/src/routes/Run.tsx b/src/helm-frontend/src/routes/Run.tsx
index 1d179f1915..bde49440ef 100644
--- a/src/helm-frontend/src/routes/Run.tsx
+++ b/src/helm-frontend/src/routes/Run.tsx
@@ -51,6 +51,7 @@ export default function Run() {
   const [totalMetricsPages, setTotalMetricsPages] = useState<number>(1);
   const [model, setModel] = useState<Model | undefined>();
   const [scenario, setScenario] = useState<Scenario | undefined>();
+  const [searchTerm, setSearchTerm] = useState("");
 
   useEffect(() => {
     const controller = new AbortController();
@@ -261,6 +262,15 @@ export default function Run() {
         </>
       ) : (
         <div>
+          {/* Search bar */}
+          <div className="flex justify-start my-4">
+            <input
+              type="text"
+              className="input input-bordered w-full max-w-xs"
+              placeholder="Search for a metric"
+              onChange={(e) => setSearchTerm(e.target.value)}
+            />
+          </div>
           <div className="overflow-x-auto">
             <table className="table">
               <thead>
@@ -271,26 +281,34 @@ export default function Run() {
                 </tr>
               </thead>
               <tbody>
-                {pagedMetrics.map((stat) => (
-                  <tr>
-                    {Object.entries(stat).map(([key, value]) => {
-                      if (key === "name") {
-                        return (
-                          <td key={key}>
-                            <StatNameDisplay stat={stat} />
-                            <div className="text-sm text-gray-500">
-                              {
-                                /* eslint-disable-next-line @typescript-eslint/no-unsafe-member-access */
-                                value.name
-                              }
-                            </div>
-                          </td>
-                        );
-                      }
-                      return <td>{value}</td>;
-                    })}
-                  </tr>
-                ))}
+                {pagedMetrics
+                  .filter(
+                    (stat) =>
+                      !searchTerm ||
+                      stat.name.name
+                        .toLowerCase()
+                        .includes(searchTerm.toLowerCase()),
+                  )
+                  .map((stat, idx) => (
+                    <tr key={idx}>
+                      {Object.entries(stat).map(([key, value]) => {
+                        if (key === "name") {
+                          return (
+                            <td key={key}>
+                              <StatNameDisplay stat={stat} />
+                              <div className="text-sm text-gray-500">
+                                {
+                                  /* eslint-disable-next-line @typescript-eslint/no-unsafe-member-access */
+                                  value.name
+                                }
+                              </div>
+                            </td>
+                          );
+                        }
+                        return <td key={key}>{value}</td>;
+                      })}
+                    </tr>
+                  ))}
               </tbody>
             </table>
           </div>
diff --git a/src/helm-frontend/src/services/getSchema.ts b/src/helm-frontend/src/services/getSchema.ts
index b6b056c2ba..cd9c87756b 100644
--- a/src/helm-frontend/src/services/getSchema.ts
+++ b/src/helm-frontend/src/services/getSchema.ts
@@ -1,21 +1,15 @@
-import { parse } from "yaml";
-
 import type Schema from "@/types/Schema";
-import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
-import getBenchmarkRelease from "@/utils/getBenchmarkRelease";
+import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
+import getVersionBaseUrl from "@/utils/getVersionBaseUrl";
 
 export default async function getSchema(signal: AbortSignal): Promise<Schema> {
   try {
     const resp = await fetch(
-      `https://crfm.stanford.edu/helm/${
-        getBenchmarkRelease() || getBenchmarkSuite()
-      }/schema.yaml`,
+      getBenchmarkEndpoint(`${getVersionBaseUrl()}/schema.json`),
       { signal },
     );
-    const data = await resp.text();
-    const schema = parse(data) as Schema;
 
-    return schema;
+    return (await resp.json()) as Schema;
   } catch (error) {
     console.log(error);
     return {
diff --git a/src/helm-frontend/src/types/global.d.ts b/src/helm-frontend/src/types/global.d.ts
index 697db24750..d460dbcd2c 100644
--- a/src/helm-frontend/src/types/global.d.ts
+++ b/src/helm-frontend/src/types/global.d.ts
@@ -2,4 +2,5 @@ interface Window {
   RELEASE: string;
   SUITE: string;
   BENCHMARK_OUTPUT_BASE_URL: string;
+  HELM_TYPE: string;
 }
diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index 82648c7be6..fc4cf3da31 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -26,6 +26,9 @@ class AdapterSpec:
     # For example, it is recommended to prefix all prompts with [NLG] for UL2.
     global_prefix: str = ""
 
+    # Append this string to the end of every prompt.
+    global_suffix: str = ""
+
     # Prompt starts with instructions
     instructions: str = ""
 
diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py
index 3a8d514532..be3f71ca3c 100644
--- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py
@@ -214,6 +214,7 @@ def construct_prompt(
         # Prompt
         prompt = Prompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions_block=instructions_block,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,
diff --git a/src/helm/benchmark/adaptation/prompt.py b/src/helm/benchmark/adaptation/prompt.py
index 856bb8bda8..f6429b27dc 100644
--- a/src/helm/benchmark/adaptation/prompt.py
+++ b/src/helm/benchmark/adaptation/prompt.py
@@ -12,6 +12,9 @@ class Prompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str
 
+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
+
     # Instance prefix, carried over from `AdapterSpec`
     instance_prefix: str
 
@@ -47,7 +50,10 @@ def text(self) -> str:
 
         # Note: this could be implemented via substitutions.
         if self.global_prefix:
-            non_truncated_text = f"{self.global_prefix} {non_truncated_text}"
+            non_truncated_text = f"{self.global_prefix}{non_truncated_text}"
+
+        if self.global_suffix:
+            non_truncated_text = f"{non_truncated_text}{self.global_suffix}"
 
         # Perform substitutions (e.g., add "<br>" before "\n")
         for subst in self.substitutions:
diff --git a/src/helm/benchmark/config_registry.py b/src/helm/benchmark/config_registry.py
index 0fab062949..98b67f35e0 100644
--- a/src/helm/benchmark/config_registry.py
+++ b/src/helm/benchmark/config_registry.py
@@ -1,14 +1,41 @@
-from helm.benchmark.model_deployment_registry import register_deployments_if_not_already_registered
-from helm.benchmark.model_metadata_registry import register_metadatas_if_not_already_registered
-from helm.benchmark.tokenizer_config_registry import register_tokenizers_if_not_already_registered
+import os
+import importlib_resources as resources
 
-HELM_REGISTERED: bool = False
+from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
+from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
+from helm.benchmark.tokenizer_config_registry import register_tokenizer_configs_from_path
 
 
-def register_helm_configurations():
-    global HELM_REGISTERED
-    if not HELM_REGISTERED:
-        register_metadatas_if_not_already_registered()
-        register_tokenizers_if_not_already_registered()
-        register_deployments_if_not_already_registered()
-        HELM_REGISTERED = True
+MODEL_METADATA_FILE: str = "model_metadata.yaml"
+TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
+MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
+
+CONFIG_PACKAGE = "helm.config"
+
+
+def register_configs_from_directory(dir_path: str) -> None:
+    """Register whichever HELM config files exist directly in `dir_path`.
+
+    Checks for MODEL_METADATA_FILE, TOKENIZER_CONFIGS_FILE and
+    MODEL_DEPLOYMENTS_FILE, in that order, and registers each one that is a
+    regular file; any that are missing are silently skipped.
+    """
+    model_metadata_path = os.path.join(dir_path, MODEL_METADATA_FILE)
+    if os.path.isfile(model_metadata_path):
+        register_model_metadata_from_path(model_metadata_path)
+
+    tokenizer_configs_path = os.path.join(dir_path, TOKENIZER_CONFIGS_FILE)
+    if os.path.isfile(tokenizer_configs_path):
+        register_tokenizer_configs_from_path(tokenizer_configs_path)
+
+    model_deployments_path = os.path.join(dir_path, MODEL_DEPLOYMENTS_FILE)
+    if os.path.isfile(model_deployments_path):
+        register_model_deployments_from_path(model_deployments_path)
+
+
+def register_builtin_configs_from_helm_package() -> None:
+    """Register the config files found in the `helm.config` package directory."""
+    # NOTE(review): str() of resources.files() is presumably a real directory
+    # only when the package is installed unpacked on disk -- confirm.
+    package_path = str(resources.files(CONFIG_PACKAGE))
+    register_configs_from_directory(package_path)
diff --git a/src/helm/benchmark/huggingface_registration.py b/src/helm/benchmark/huggingface_registration.py
index bc833ddea2..a48e2de9b7 100644
--- a/src/helm/benchmark/huggingface_registration.py
+++ b/src/helm/benchmark/huggingface_registration.py
@@ -1,6 +1,5 @@
 import os
 from typing import Optional
-from datetime import date
 
 from helm.benchmark.model_deployment_registry import (
     ClientSpec,
@@ -10,10 +9,8 @@
 )
 from helm.benchmark.model_metadata_registry import (
     get_model_metadata,
-    ModelMetadata,
+    get_unknown_model_metadata,
     register_model_metadata,
-    TEXT_MODEL_TAG,
-    FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
 )
 from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec, register_tokenizer_config
 from helm.common.hierarchical_logger import hlog
@@ -47,17 +44,7 @@ def register_huggingface_model(
     try:
         _ = get_model_metadata(model_name=helm_model_name)
     except ValueError:
-        register_model_metadata(
-            ModelMetadata(
-                name=helm_model_name,
-                creator_organization_name="Unknown",
-                display_name=helm_model_name,
-                description=helm_model_name,
-                access="open",
-                release_date=date.today(),
-                tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
-            )
-        )
+        register_model_metadata(get_unknown_model_metadata(helm_model_name))
         hlog(f"Registered default metadata for model {helm_model_name}")
 
     register_model_deployment(model_deployment)
diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py
index aa0eb8ae94..eb5bd6dcb0 100644
--- a/src/helm/benchmark/metrics/basic_metrics.py
+++ b/src/helm/benchmark/metrics/basic_metrics.py
@@ -5,10 +5,10 @@
 from functools import partial
 
 import json
-import re
 import string
 import nltk
 import numpy as np
+import re
 import scipy
 import calibration as cal
 import importlib_resources as resources
@@ -202,6 +202,25 @@ def exact_match_indicator(gold: str, pred: str, indicator: str = " ") -> float:
     return exact_match(gold, pred)
 
 
+def final_number_exact_match(gold: str, pred: str) -> float:
+    """
+    Returns 1 iff the final number in gold and pred match, else 0.
+    Similar to exact_match_indicator.
+    A number is an optional "-", digits that may contain "," separators
+    (dropped before comparing), and an optional "." followed by digits.
+    Example:
+    - gold = "The answer is 15.", pred = "The answer is 15 eggs."
+    - Returns 1
+    """
+
+    def get_final_number(x: str) -> str:
+        # Must start with a digit (a lone "," is not a number); the "." is escaped so "1 2" is two numbers.
+        matches = re.findall(r"-?\d[\d,]*(?:\.\d+)?", x)
+        return matches[-1].replace(",", "") if matches else ""
+
+    return exact_match(get_final_number(gold), get_final_number(pred))
+
+
 def get_num_bytes(tokens: List[Token]) -> int:
     """
     Compute the byte length of the input tokens. For a UTF-8 string token, we use byte() to convert
@@ -497,6 +516,7 @@ def compute_metrics_helper(
             "prefix_exact_match": prefix_exact_match,
             "quasi_prefix_exact_match": quasi_prefix_exact_match,
             "exact_match_indicator": exact_match_indicator,
+            "final_number_exact_match": final_number_exact_match,
             "exact_set_match": exact_set_match,
             "iou_set_match": iou_set_match,
             "f1_set_match": f1_set_match,
diff --git a/src/helm/benchmark/metrics/machine_translation_metrics.py b/src/helm/benchmark/metrics/machine_translation_metrics.py
deleted file mode 100644
index 7ebb735a6e..0000000000
--- a/src/helm/benchmark/metrics/machine_translation_metrics.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from typing import List
-
-from helm.benchmark.adaptation.request_state import RequestState
-from helm.common.optional_dependencies import handle_module_not_found_error
-from .metric import Metric
-from .metric_name import MetricName
-from .statistic import Stat
-
-try:
-    from sacrebleu.metrics import BLEU
-    from langdetect import detect
-except ModuleNotFoundError as e:
-    handle_module_not_found_error(e)
-
-
-class MachineTranslationMetric(Metric):
-    """
-    Compute the BLEU score for Machine Translation scenarios. The implementation is based on sacrebleu.
-    """
-
-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-        """
-        Compute the corpus-level metric based on all reqeust_states.
-        """
-
-        bleu = BLEU()
-
-        refs: List[List[str]] = [[]]
-        sys: List = []
-        for request_state in request_states:
-            # Assume there is one referece per instance. TODO: Support multiple references after adding more scenarios.
-            num_references: int = len(request_state.instance.references)
-            if num_references != 1:
-                raise ValueError(f"This instance has {num_references} references, but we currently only support one.")
-            # Usually there is only one completion for each instance.
-            assert request_state.result is not None
-            if len(request_state.result.completions) != 1:
-                raise ValueError("Each request result should have only exactly one completion.")
-            sys.append(request_state.result.completions[0].text)
-            refs[0].append(request_state.instance.references[0].output.text)
-        bleu_score = bleu.corpus_score(sys, refs).score
-        return [Stat(MetricName("bleu")).add(bleu_score)]
-
-
-class CLEVAMachineTranslationMetric(Metric):
-    """
-    Compute the BLEU score for Machine Translation scenarios of CLEVA benchmark.
-    Based on sacrebleu, this implementation distinguishes target language and allows variable number of references.
-    If there are more than one hypothesis, only the first one is adopted in the calculation.
-    """
-
-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-        """
-        Compute the corpus-level metric based on all reqeust_states.
-        """
-
-        def detect_language(request_states: List[RequestState]) -> str:
-            """
-            Determine the target language by detecting the language of references.
-            Currently, it only distinguishes if the target language is Chinese.
-            """
-
-            corpus: str = "".join(
-                [request_state.instance.references[0].output.text for request_state in request_states[:10]]
-            )
-            if detect(corpus) in ["zh-cn", "zh-tw"]:
-                return "zh"
-            else:
-                return "13a"  # Default tokenizer for sacrebleu.BLEU
-
-        bleu = BLEU(tokenize=detect_language(request_states))
-
-        max_num_references: int = max([len(request_state.instance.references) for request_state in request_states])
-        refs: List[List[str]] = [
-            [
-                request_state.instance.references[i].output.text if i < len(request_state.instance.references) else ""
-                for request_state in request_states
-            ]
-            for i in range(max_num_references)
-        ]
-
-        sys: List = []
-        for request_state in request_states:
-            assert request_state.result is not None
-            sys.append(request_state.result.completions[0].text)
-
-        bleu_score = bleu.corpus_score(sys, refs).score
-
-        return [Stat(MetricName("cleva_machine_translation_bleu")).add(bleu_score)]
diff --git a/src/helm/benchmark/metrics/test_basic_metrics.py b/src/helm/benchmark/metrics/test_basic_metrics.py
new file mode 100644
index 0000000000..1aa9b26a98
--- /dev/null
+++ b/src/helm/benchmark/metrics/test_basic_metrics.py
@@ -0,0 +1,26 @@
+from .basic_metrics import exact_match, exact_match_indicator, final_number_exact_match
+
+
+def test_exact_match():
+    assert exact_match("33", "33") == 1
+    assert exact_match("33", "33 ") == 1
+    assert exact_match("33", "34") == 0
+
+
+def test_exact_match_indicator():
+    assert exact_match_indicator("33", "33") == 1
+    assert exact_match_indicator("33", "stuff 33") == 1
+    assert exact_match_indicator("stuff 33", "33") == 1
+    assert exact_match_indicator("33", "33 stuff") == 0
+
+
+def test_final_number_exact_match():
+    assert final_number_exact_match("33", "33") == 1
+    assert final_number_exact_match("33", "33 eggs.") == 1
+    assert final_number_exact_match("33", "-33") == 0
+    assert final_number_exact_match("-33", "-33") == 1
+    assert final_number_exact_match("The answer is 33", "\\boxed{33}") == 1
+    assert final_number_exact_match("The answer is 33", "\\boxed{33} and 34") == 0
+    assert final_number_exact_match("34.2", "2") == 0
+    assert final_number_exact_match("342", "342.") == 1
+    assert final_number_exact_match("3,420", "3420") == 1
diff --git a/src/helm/benchmark/model_deployment_registry.py b/src/helm/benchmark/model_deployment_registry.py
index c3f9d36147..55abb39356 100644
--- a/src/helm/benchmark/model_deployment_registry.py
+++ b/src/helm/benchmark/model_deployment_registry.py
@@ -1,18 +1,17 @@
-import os
 from typing import Dict, Optional, List
 from dataclasses import dataclass
-import importlib_resources as resources
 
 import cattrs
 import yaml
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.object_spec import ObjectSpec
-from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, CONFIG_PACKAGE
-
-
-MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
-DEPLOYMENTS_REGISTERED: bool = False
+from helm.benchmark.model_metadata_registry import (
+    ModelMetadata,
+    get_model_metadata,
+    get_unknown_model_metadata,
+    register_model_metadata,
+)
 
 
 class ClientSpec(ObjectSpec):
@@ -95,23 +94,26 @@ class ModelDeployments:
 }
 
 
-# ===================== REGISTRATION FUNCTIONS ==================== #
 def register_model_deployment(model_deployment: ModelDeployment) -> None:
-    # hlog(f"Registered model deployment {model_deployment.name}")
     DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_deployment.name] = model_deployment
     ALL_MODEL_DEPLOYMENTS.append(model_deployment)
 
     model_name: str = model_deployment.model_name or model_deployment.name
 
+    model_metadata: ModelMetadata
     try:
-        model_metadata: ModelMetadata = get_model_metadata(model_name)
-        deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
-        if model_deployment.name not in deployment_names:
-            if model_metadata.deployment_names is None:
-                model_metadata.deployment_names = []
-            model_metadata.deployment_names.append(model_deployment.name)
+        model_metadata = get_model_metadata(model_name)
     except ValueError:
-        raise ValueError(f"Model deployment {model_deployment.name} has no corresponding model metadata")
+        hlog(
+            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
+        )
+        model_metadata = get_unknown_model_metadata(model_name)
+        register_model_metadata(model_metadata)
+    deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
+    if model_deployment.name not in deployment_names:
+        if model_metadata.deployment_names is None:
+            model_metadata.deployment_names = []
+        model_metadata.deployment_names.append(model_deployment.name)
 
 
 def register_model_deployments_from_path(path: str) -> None:
@@ -123,15 +125,7 @@ def register_model_deployments_from_path(path: str) -> None:
         register_model_deployment(model_deployment)
 
 
-def maybe_register_model_deployments_from_base_path(path: str) -> None:
-    """Register model deployments from yaml file if the path exists."""
-    if os.path.exists(path):
-        register_model_deployments_from_path(path)
-
-
-# ===================== UTIL FUNCTIONS ==================== #
 def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeployment:
-    register_deployments_if_not_already_registered()
     if name not in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
@@ -140,46 +134,17 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
     return deployment
 
 
-def get_model_deployments_by_host_organization(host_organization: str) -> List[str]:
-    """
-    Gets models by host organization.
-    Example:   together => [" together/bloom", "together/t0pp", ...]
-    """
-    register_deployments_if_not_already_registered()
-    return [
-        deployment.name for deployment in ALL_MODEL_DEPLOYMENTS if deployment.host_organization == host_organization
-    ]
-
-
 def get_model_deployment_host_organization(name: str) -> str:
-    """
-    Extracts the host organization from the model deployment name.
-    Example: "huggingface/t5-11b" => "huggingface"
-    """
+    """Return the host organization name based on the model deployment name.
+
+    Example: "huggingface/t5-11b" -> "huggingface"."""
     deployment: ModelDeployment = get_model_deployment(name)
     return deployment.host_organization
 
 
-def get_metadata_for_deployment(deployment_name: str) -> ModelMetadata:
-    """
-    Given a deployment name, returns the corresponding model metadata.
-    """
-    deployment: ModelDeployment = get_model_deployment(deployment_name)
-    return get_model_metadata(deployment.model_name or deployment.name)
-
-
 def get_model_names_with_tokenizer(tokenizer_name: str) -> List[str]:
-    """Get all the name of the models with tokenizer `tokenizer_name`."""
-    register_deployments_if_not_already_registered()
+    """Return the names of all models with the given tokenizer."""
     deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.tokenizer_name == tokenizer_name
     ]
     return [deployment.model_name or deployment.name for deployment in deployments]
-
-
-def register_deployments_if_not_already_registered() -> None:
-    global DEPLOYMENTS_REGISTERED
-    if not DEPLOYMENTS_REGISTERED:
-        path: str = resources.files(CONFIG_PACKAGE).joinpath(MODEL_DEPLOYMENTS_FILE)
-        maybe_register_model_deployments_from_base_path(path)
-        DEPLOYMENTS_REGISTERED = True
diff --git a/src/helm/benchmark/model_metadata_registry.py b/src/helm/benchmark/model_metadata_registry.py
index 335c75c5b4..de1963dc11 100644
--- a/src/helm/benchmark/model_metadata_registry.py
+++ b/src/helm/benchmark/model_metadata_registry.py
@@ -1,8 +1,6 @@
-import os
 from typing import Dict, Optional, List
 from dataclasses import dataclass, field
 from datetime import date
-import importlib_resources as resources
 
 import dacite
 import yaml
@@ -22,12 +20,14 @@
 CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 
 # OpenAI Chat format
-OPENAI_CHATGPT_MODEL_TAG: str = "openai_chatgpt"
+OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
 
+GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
+
 # Models which emit garbage tokens when temperature=0.
 BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG"
 
@@ -50,11 +50,6 @@
 VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
 
 
-CONFIG_PACKAGE = "helm.config"
-MODEL_METADATA_FILE: str = "model_metadata.yaml"
-METADATAS_REGISTERED: bool = False
-
-
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 @dataclass(frozen=False)
@@ -80,7 +75,7 @@ class ModelMetadata:
     - "limited": accessible with an API key.
     If there are multiple deployments, this should be the most permissive access across all deployments."""
 
-    release_date: date
+    release_date: Optional[date]
     """Release date of the model."""
 
     tags: List[str] = field(default_factory=list)
@@ -123,7 +118,6 @@ class ModelMetadataList:
 MODEL_NAME_TO_MODEL_METADATA: Dict[str, ModelMetadata] = {model.name: model for model in ALL_MODELS_METADATA}
 
 
-# ===================== REGISTRATION FUNCTIONS ==================== #
 def register_model_metadata_from_path(path: str) -> None:
     """Register model configurations from the given path."""
     with open(path, "r") as f:
@@ -137,72 +131,51 @@ def register_model_metadata_from_path(path: str) -> None:
 
 def register_model_metadata(model_metadata: ModelMetadata) -> None:
     """Register a single model configuration."""
-    # hlog(f"Registered model metadata {model_metadata.name}")
     ALL_MODELS_METADATA.append(model_metadata)
     MODEL_NAME_TO_MODEL_METADATA[model_metadata.name] = model_metadata
 
 
-def maybe_register_model_metadata_from_base_path(path: str) -> None:
-    """Register model metadata from yaml file if the path exists."""
-    if os.path.exists(path):
-        register_model_metadata_from_path(path)
-
-
-# ===================== UTIL FUNCTIONS ==================== #
 def get_model_metadata(model_name: str) -> ModelMetadata:
-    """Get the `Model` given the name."""
-    register_metadatas_if_not_already_registered()
+    """Return the `ModelMetadata` for the model name."""
     if model_name not in MODEL_NAME_TO_MODEL_METADATA:
         raise ValueError(f"No model with name: {model_name}")
 
     return MODEL_NAME_TO_MODEL_METADATA[model_name]
 
 
-def get_model_creator_organization(model_name: str) -> str:
-    """Get the model's group given the name."""
-    model: ModelMetadata = get_model_metadata(model_name)
-    return model.creator_organization
-
-
 def get_all_models() -> List[str]:
-    """Get all model names."""
-    register_metadatas_if_not_already_registered()
+    """Return all model names."""
     return list(MODEL_NAME_TO_MODEL_METADATA.keys())
 
 
-def get_models_by_creator_organization(organization: str) -> List[str]:
-    """
-    Gets models by creator organization.
-    Example:   ai21   =>   ai21/j1-jumbo, ai21/j1-grande, ai21-large.
-    """
-    register_metadatas_if_not_already_registered()
-    return [model.name for model in ALL_MODELS_METADATA if model.creator_organization == organization]
-
-
 def get_model_names_with_tag(tag: str) -> List[str]:
-    """Get all the name of the models with tag `tag`."""
-    register_metadatas_if_not_already_registered()
+    """Return all model names of models with the given tag."""
     return [model.name for model in ALL_MODELS_METADATA if tag in model.tags]
 
 
 def get_all_text_models() -> List[str]:
-    """Get all text model names."""
+    """Return all model names of text models."""
     return get_model_names_with_tag(TEXT_MODEL_TAG)
 
 
 def get_all_code_models() -> List[str]:
-    """Get all code model names."""
+    """Return all model names of code models."""
     return get_model_names_with_tag(CODE_MODEL_TAG)
 
 
 def get_all_instruction_following_models() -> List[str]:
-    """Get all instruction-following model names."""
+    """Return all model names of instruction following models."""
     return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
 
 
-def register_metadatas_if_not_already_registered() -> None:
-    global METADATAS_REGISTERED
-    if not METADATAS_REGISTERED:
-        path: str = resources.files(CONFIG_PACKAGE).joinpath(MODEL_METADATA_FILE)
-        maybe_register_model_metadata_from_base_path(path)
-        METADATAS_REGISTERED = True
+def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
+    """Return placeholder ModelMetadata for an unknown model."""
+    return ModelMetadata(
+        name=helm_model_name,
+        creator_organization_name="Unknown",
+        display_name=helm_model_name,
+        description=helm_model_name,
+        access="open",
+        release_date=date.today(),
+        tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
+    )
diff --git a/src/helm/benchmark/presentation/create_plots.py b/src/helm/benchmark/presentation/create_plots.py
index 09f7ce6a24..395b28fcc8 100644
--- a/src/helm/benchmark/presentation/create_plots.py
+++ b/src/helm/benchmark/presentation/create_plots.py
@@ -12,7 +12,7 @@
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.presentation.schema import read_schema
+from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME
 from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
 
 try:
@@ -133,7 +133,7 @@ def __init__(self, base_path: str, save_path: str, plot_format: str):
         self.plot_format = plot_format
         self._tables_cache: Dict[str, Dict[str, Table]] = {}
 
-        schema = read_schema()
+        schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
         self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
 
     def get_group_tables(self, group_name: str) -> Dict[str, Table]:
diff --git a/src/helm/benchmark/presentation/run_specs_dec2023.conf b/src/helm/benchmark/presentation/run_specs_dec2023.conf
new file mode 100644
index 0000000000..858555f5dc
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_dec2023.conf
@@ -0,0 +1,133 @@
+# HELM scenarios.
+
+entries: [
+  # NarrativeQA
+  {description: "narrative_qa:model=text", priority: 1}
+
+  # NaturalQuestions
+  {description: "natural_qa:model=text,mode=openbook_longans", priority: 1}
+  {description: "natural_qa:model=text,mode=closedbook", priority: 1}
+
+  # OpenbookQA
+  {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
+
+  # MMLU
+  {description: "mmlu:model=text,subject=abstract_algebra", priority: 2}
+  {description: "mmlu:model=text,subject=anatomy", priority: 3}
+  {description: "mmlu:model=text,subject=college_chemistry", priority: 2}
+  {description: "mmlu:model=text,subject=computer_security", priority: 2}
+  {description: "mmlu:model=text,subject=econometrics", priority: 2}
+  {description: "mmlu:model=text,subject=global_facts", priority: 3}
+  {description: "mmlu:model=text,subject=jurisprudence", priority: 3}
+  {description: "mmlu:model=text,subject=philosophy", priority: 3}
+  {description: "mmlu:model=text,subject=professional_medicine", priority: 3}
+  {description: "mmlu:model=text,subject=us_foreign_policy", priority: 2}
+  {description: "mmlu:model=text,subject=astronomy", priority: 4}
+  {description: "mmlu:model=text,subject=business_ethics", priority: 4}
+  {description: "mmlu:model=text,subject=clinical_knowledge", priority: 4}
+  {description: "mmlu:model=text,subject=college_biology", priority: 4}
+  {description: "mmlu:model=text,subject=college_computer_science", priority: 4}
+  {description: "mmlu:model=text,subject=college_mathematics", priority: 4}
+  {description: "mmlu:model=text,subject=college_medicine", priority: 4}
+  {description: "mmlu:model=text,subject=college_physics", priority: 4}
+  {description: "mmlu:model=text,subject=conceptual_physics", priority: 4}
+  {description: "mmlu:model=text,subject=electrical_engineering", priority: 4}
+  {description: "mmlu:model=text,subject=elementary_mathematics", priority: 4}
+  {description: "mmlu:model=text,subject=formal_logic", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_biology", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_chemistry", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_computer_science", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_european_history", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_geography", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_government_and_politics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_macroeconomics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_mathematics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_microeconomics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_physics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_psychology", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_statistics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_us_history", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_world_history", priority: 4}
+  {description: "mmlu:model=text,subject=human_aging", priority: 4}
+  {description: "mmlu:model=text,subject=human_sexuality", priority: 4}
+  {description: "mmlu:model=text,subject=international_law", priority: 4}
+  {description: "mmlu:model=text,subject=logical_fallacies", priority: 4}
+  {description: "mmlu:model=text,subject=machine_learning", priority: 4}
+  {description: "mmlu:model=text,subject=management", priority: 4}
+  {description: "mmlu:model=text,subject=marketing", priority: 4}
+  {description: "mmlu:model=text,subject=medical_genetics", priority: 4}
+  {description: "mmlu:model=text,subject=miscellaneous", priority: 4}
+  {description: "mmlu:model=text,subject=moral_disputes", priority: 4}
+  {description: "mmlu:model=text,subject=moral_scenarios", priority: 4}
+  {description: "mmlu:model=text,subject=nutrition", priority: 4}
+  {description: "mmlu:model=text,subject=prehistory", priority: 4}
+  {description: "mmlu:model=text,subject=professional_accounting", priority: 4}
+  {description: "mmlu:model=text,subject=professional_law", priority: 4}
+  {description: "mmlu:model=text,subject=professional_psychology", priority: 4}
+  {description: "mmlu:model=text,subject=public_relations", priority: 4}
+  {description: "mmlu:model=text,subject=security_studies", priority: 4}
+  {description: "mmlu:model=text,subject=sociology", priority: 4}
+  {description: "mmlu:model=text,subject=virology", priority: 4}
+  {description: "mmlu:model=text,subject=world_religions", priority: 4}
+
+  # MATH
+  {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True", priority: 2}
+  {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True", priority: 2}
+
+  {description: "math:model=text_code,subject=number_theory,level=2,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=2,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=algebra,level=2,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=prealgebra,level=2,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=geometry,level=2,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=counting_and_probability,level=2,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=precalculus,level=2,use_chain_of_thought=True", priority: 4}
+
+  {description: "math:model=text_code,subject=number_theory,level=3,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=3,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=algebra,level=3,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=prealgebra,level=3,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=geometry,level=3,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=counting_and_probability,level=3,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=precalculus,level=3,use_chain_of_thought=True", priority: 3}
+
+  {description: "math:model=text_code,subject=number_theory,level=4,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=4,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=algebra,level=4,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=prealgebra,level=4,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=geometry,level=4,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=counting_and_probability,level=4,use_chain_of_thought=True", priority: 4}
+  {description: "math:model=text_code,subject=precalculus,level=4,use_chain_of_thought=True", priority: 4}
+
+  {description: "math:model=text_code,subject=number_theory,level=5,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=5,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=algebra,level=5,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=prealgebra,level=5,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=geometry,level=5,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=counting_and_probability,level=5,use_chain_of_thought=True", priority: 3}
+  {description: "math:model=text_code,subject=precalculus,level=5,use_chain_of_thought=True", priority: 3}
+
+  # GSM
+  {description: "gsm:model=text_code", priority: 2}
+
+  # LegalBench
+  {description: "legalbench:model=text_code,subset=abercrombie", priority: 2}
+  {description: "legalbench:model=text_code,subset=corporate_lobbying", priority: 2}
+  {description: "legalbench:model=text_code,subset=international_citizenship_questions", priority: 2}
+  {description: "legalbench:model=text_code,subset=function_of_decision_section", priority: 2}
+  {description: "legalbench:model=text_code,subset=proa", priority: 2}
+
+  # MedQA
+  {description: "med_qa:model=text_code", priority: 2}
+
+  # WMT14
+  {description: "wmt_14:language_pair=cs-en,model=text", priority: 2}
+  {description: "wmt_14:language_pair=de-en,model=text", priority: 2}
+  {description: "wmt_14:language_pair=fr-en,model=text", priority: 2}
+  {description: "wmt_14:language_pair=hi-en,model=text", priority: 2}
+  {description: "wmt_14:language_pair=ru-en,model=text", priority: 2}
+]
diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index 3a0b7877b8..1b03014814 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -11,8 +11,11 @@
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
 
 
+# TODO: change to `helm.benchmark.config`
 SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
-SCHEMA_YAML_FILENAME: str = "schema.yaml"
+
+# TODO: add heim, vhelm, etc.
+SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"
 
 
 @dataclass(frozen=True)
@@ -245,9 +248,10 @@ def __post_init__(self):
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
 
 
-def read_schema() -> Schema:
-    hlog(f"Reading schema from {SCHEMA_YAML_FILENAME}...")
-    schema_path = resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_YAML_FILENAME)
+def read_schema(filename: str) -> Schema:
+    # TODO: merge in model metadata from `model_metadata.yaml`
+    schema_path = resources.files(SCHEMA_YAML_PACKAGE).joinpath(filename)
+    hlog(f"Reading schema file {schema_path}...")
     with schema_path.open("r") as f:
         raw = yaml.safe_load(f)
     return dacite.from_dict(Schema, raw)
diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 4d87333801..d52c39020d 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -21,11 +21,14 @@
 from typing import List, Optional, Dict, Any, Tuple, Set
 
 from tqdm import tqdm
+from helm.benchmark.model_deployment_registry import get_model_deployment
 
+from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
 from helm.common.general import (
     write,
     ensure_directory_exists,
     asdict_without_nones,
+    serialize_dates,
     parallel_map,
     singleton,
     unique_simplification,
@@ -45,7 +48,7 @@
     MetricNameMatcher,
     RunGroup,
     read_schema,
-    SCHEMA_YAML_FILENAME,
+    SCHEMA_CLASSIC_YAML_FILENAME,
     BY_GROUP,
     THIS_GROUP_ONLY,
     NO_GROUPS,
@@ -57,10 +60,9 @@
     CONTAMINATION_STYLES,
     CONTAMINATION_LEVEL_STRONG,
 )
-from helm.benchmark.config_registry import register_helm_configurations
+from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
-from helm.benchmark.model_deployment_registry import get_metadata_for_deployment
-from helm.benchmark.model_metadata_registry import ModelMetadata
+from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
 
 
 OVERLAP_N_COUNT = 13
@@ -142,6 +144,38 @@ def get_scenario_name(group: RunGroup, scenario_spec: ScenarioSpec):
     return group.name + "_" + dict_to_str(scenario_spec.args).replace(" ", "").replace("/", "_")
 
 
+def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetadata:
+    """Return the ModelMetadata for the model in the given AdapterSpec."""
+    # Get model metadata based on `model` in `adapter_spec`
+    try:
+        return get_model_metadata(adapter_spec.model)
+    except ValueError:
+        pass
+
+    # Get model metadata based on `model_deployment` in `adapter_spec`
+    try:
+        model_deployment = get_model_deployment(adapter_spec.model_deployment)
+        if model_deployment.model_name:
+            return get_model_metadata(model_deployment.model_name)
+    except ValueError:
+        pass
+
+    # Some models were renamed such that the old model name now survives only as the model deployment name.
+    # For instance, the model formerly called "huggingface/gpt2" is now called "openai/gpt2", but its
+    # model deployment is still called "huggingface/gpt2".
+    # Handle these cases by looking up `adapter_spec.model` as a model deployment name.
+    # TODO: Delete this block eventually.
+    try:
+        model_deployment = get_model_deployment(adapter_spec.model)
+        if model_deployment.model_name:
+            return get_model_metadata(model_deployment.model_name)
+    except ValueError:
+        pass
+
+    # Return placeholder metadata for an unknown model.
+    return get_unknown_model_metadata(adapter_spec.model)
+
+
 def get_coarse_adapter_spec(
     adapter_spec: AdapterSpec, scenario_spec: Optional[ScenarioSpec] = None, adapter_keys_shown: List[str] = []
 ) -> AdapterSpec:
@@ -275,9 +309,11 @@ def __init__(
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
+        schema_file: str,
         output_path: str,
         verbose: bool,
         num_threads: int,
+        allow_unknown_models: bool,
     ):
         """
         A note on the relation between `release`, `suites`, and `suite`:
@@ -293,6 +329,7 @@ def __init__(
         self.suites: List[str]
         self.run_suite_paths: List[str]
         self.suite: Optional[str] = None
+        self.schema_file = schema_file
         self.release: Optional[str] = None
         if suite:
             self.suite = suite
@@ -306,10 +343,11 @@ def __init__(
             self.run_suite_paths = [os.path.join(output_path, "runs", suite) for suite in suites]
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
+        self.allow_unknown_models: bool = allow_unknown_models
 
         ensure_directory_exists(self.run_release_path)
 
-        self.schema = read_schema()
+        self.schema = read_schema(schema_file)
         self.contamination = read_contamination()
         validate_contamination(self.contamination, self.schema)
 
@@ -339,7 +377,7 @@ def filter_runs_by_visibility(self, runs: List[Run], group: RunGroup) -> List[Ru
                 if run_group_name not in self.schema.name_to_run_group:
                     hlog(
                         f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
-                        f"but undefined in {SCHEMA_YAML_FILENAME}, skipping"
+                        f"but undefined in {self.schema_file}, skipping"
                     )
                     continue
                 run_group = self.schema.name_to_run_group[run_group_name]
@@ -360,7 +398,13 @@ def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
-        run_dir_names = sorted([p for p in os.listdir(run_suite_path) if p != "eval_cache" and p != "groups"])
+        run_dir_names = sorted(
+            [
+                p
+                for p in os.listdir(run_suite_path)
+                if p != "eval_cache" and p != "groups" and os.path.isdir(os.path.join(run_suite_path, p))
+            ]
+        )
         for run_dir_name in tqdm(run_dir_names, disable=None):
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
@@ -389,6 +433,13 @@ def group_runs(self):
                 self.group_adapter_to_runs[group_name][adapter_spec].append(run)
                 self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
 
+    def write_schema(self):
+        """Write the schema file to benchmark_output so the frontend knows about it."""
+        write(
+            os.path.join(self.run_release_path, "schema.json"),
+            json.dumps(asdict_without_nones(self.schema), indent=2, default=serialize_dates),
+        )
+
     def read_runs(self):
         self.runs: List[Run] = []
         self.runs_to_run_suites: Dict[str, str] = {}
@@ -543,7 +594,7 @@ def check_metrics_defined(self):
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
                 hlog(
-                    f"WARNING: metric name {metric_name} undefined in {SCHEMA_YAML_FILENAME} "
+                    f"WARNING: metric name {metric_name} undefined in {self.schema_file} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -645,7 +696,8 @@ def get_cell(stats: List[Stat], compute_mean: bool = False, compute_sum: bool =
             header = [
                 HeaderCell("Group"),
                 HeaderCell("Description"),
-                # Synchronize these names with `schema.yaml`
+                # Synchronize these names with the appropriate schema file
+                # TODO: different schema files might have different fields (for multimodal)
                 HeaderCell("Adaptation method", description="Adaptation strategy (e.g., generation)"),
                 HeaderCell("# instances", description="Number of instances evaluated on"),
                 HeaderCell("# references", description="Number of references provided per instance"),
@@ -816,7 +868,7 @@ def create_group_table(
                     matcher = replace(matcher, sub_split=sub_split)
                 header_field = self.schema.name_to_metric.get(matcher.name)
                 if header_field is None:
-                    hlog(f"WARNING: metric name {matcher.name} undefined in {SCHEMA_YAML_FILENAME}, skipping")
+                    hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_file}, skipping")
                     continue
                 metadata = {
                     "metric": header_field.get_short_display_name(),
@@ -889,10 +941,8 @@ def _adapter_spec_sort_key(spec):
         # Populate the contents of the table
         rows = []
         for adapter_spec, info in zip(adapter_specs, infos):
-            deployment: str = (
-                adapter_spec.model_deployment if len(adapter_spec.model_deployment) > 0 else adapter_spec.model
-            )
-            model_metadata: ModelMetadata = get_metadata_for_deployment(deployment)
+            model_metadata = get_model_metadata_for_adapter_spec(adapter_spec)
+
             model_name: str = model_metadata.name
 
             runs = adapter_to_runs[adapter_spec]
@@ -1250,10 +1300,12 @@ def symlink_latest(self) -> None:
         if os.path.islink(symlink_path):
             # Remove the previous symlink if it exists.
             os.unlink(symlink_path)
-        os.symlink(os.path.abspath(self.run_release_path), symlink_path)
+        os.symlink(os.path.basename(self.run_release_path), symlink_path)
 
     def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
-        """Run the entire summarization pipeline pipeline."""
+        """Run the entire summarization pipeline."""
+        self.write_schema()
+
         self.read_runs()
         self.group_runs()
         self.check_metrics_defined()
@@ -1278,12 +1330,18 @@ def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
         self.symlink_latest()
 
 
-@htrack(None)
+@htrack("summarize")
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
     )
+    parser.add_argument(
+        "--schema-file",
+        type=str,
+        help="File name of the schema to read (e.g., schema_classic.yaml).",
+        default=SCHEMA_CLASSIC_YAML_FILENAME,
+    )
     parser.add_argument(
         "--suite",
         type=str,
@@ -1314,6 +1372,18 @@ def main():
         help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
         default=1000,
     )
+    parser.add_argument(
+        "--local-path",
+        type=str,
+        help="If running locally, the path for `ServerService`.",
+        default="prod_env",
+    )
+    parser.add_argument(
+        "--allow-unknown-models",
+        type=bool,
+        help="Whether to allow unknown models in the metadata file",
+        default=True,
+    )
     args = parser.parse_args()
 
     release: Optional[str] = None
@@ -1337,16 +1407,19 @@ def main():
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    register_helm_configurations()
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
 
     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
     summarizer = Summarizer(
         release=release,
         suites=suites,
         suite=suite,
+        schema_file=args.schema_file,
         output_path=args.output_path,
         verbose=args.debug,
         num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
     )
     summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
     hlog("Done.")
diff --git a/src/helm/benchmark/presentation/test_contamination.py b/src/helm/benchmark/presentation/test_contamination.py
index 44530fd6f4..f0ac123a48 100644
--- a/src/helm/benchmark/presentation/test_contamination.py
+++ b/src/helm/benchmark/presentation/test_contamination.py
@@ -1,9 +1,9 @@
-from helm.benchmark.presentation.schema import read_schema
+from helm.benchmark.presentation.schema import read_schema, SCHEMA_CLASSIC_YAML_FILENAME
 from helm.benchmark.presentation.contamination import read_contamination, validate_contamination
 
 
 def test_contamination_schema():
-    schema = read_schema()
+    schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
     contamination = read_contamination()
     validate_contamination(contamination, schema)
 
diff --git a/src/helm/benchmark/presentation/test_summarize.py b/src/helm/benchmark/presentation/test_summarize.py
index 6323315fd7..35dc7a36d3 100644
--- a/src/helm/benchmark/presentation/test_summarize.py
+++ b/src/helm/benchmark/presentation/test_summarize.py
@@ -2,6 +2,7 @@
 import tempfile
 
 from helm.benchmark.presentation.summarize import Summarizer
+from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
 from helm.common.general import ensure_directory_exists
 
 
@@ -12,9 +13,11 @@ def test_summarize_suite():
             release=None,
             suites=None,
             suite="test_suite",
+            schema_file=SCHEMA_CLASSIC_YAML_FILENAME,
             output_path=output_path,
             verbose=False,
             num_threads=4,
+            allow_unknown_models=True,
         )
         summarizer.run_pipeline(skip_completed=True, num_instances=1000)
         assert os.path.isfile(os.path.join(output_path, "runs", "test_suite", "groups.json"))
@@ -28,9 +31,11 @@ def test_summarize_release():
             release="test_release",
             suites=["test_suite_1", "test_suite_2"],
             suite=None,
+            schema_file=SCHEMA_CLASSIC_YAML_FILENAME,
             output_path=output_path,
             verbose=False,
             num_threads=4,
+            allow_unknown_models=True,
         )
         summarizer.run_pipeline(skip_completed=True, num_instances=1000)
         assert os.path.isfile(os.path.join(output_path, "releases", "test_release", "groups.json"))
diff --git a/src/helm/benchmark/run.py b/src/helm/benchmark/run.py
index cdf280d436..9222e8079b 100644
--- a/src/helm/benchmark/run.py
+++ b/src/helm/benchmark/run.py
@@ -7,18 +7,20 @@
 )
 
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
+from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
 
-from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
-from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
-from helm.benchmark.config_registry import register_helm_configurations
+from helm.benchmark.config_registry import (
+    register_configs_from_directory,
+    register_builtin_configs_from_helm_package,
+)
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark import vlm_run_specs  # noqa
 from .executor import ExecutionSpec
-from .runner import Runner, RunSpec, LATEST_SYMLINK
+from .runner import Runner, RunSpec, LATEST_SYMLINK, set_benchmark_output_path
 from .run_specs import construct_run_specs
 
 
@@ -39,7 +41,7 @@ def run_entries_to_run_specs(
 
         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
             # Filter by models
-            if models_to_run and run_spec.adapter_spec.model_deployment not in models_to_run:
+            if models_to_run and run_spec.adapter_spec.model not in models_to_run:
                 continue
 
             # Filter by groups
@@ -245,30 +247,17 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
-    parser.add_argument(
-        "--model-metadata-paths",
-        nargs="+",
-        help="Experimental: Where to read model metadata from",
-        default=[],
-    )
-    parser.add_argument(
-        "--model-deployment-paths",
-        nargs="+",
-        help="Experimental: Where to read model deployments from",
-        default=[],
-    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
 
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
     for huggingface_model_name in args.enable_huggingface_models:
         register_huggingface_hub_model_from_flag_value(huggingface_model_name)
     for huggingface_model_path in args.enable_local_huggingface_models:
         register_huggingface_local_model_from_flag_value(huggingface_model_path)
-    for model_metadata_path in args.model_metadata_paths:
-        register_model_metadata_from_path(model_metadata_path)
-    for model_deployment_paths in args.model_deployment_paths:
-        register_model_deployments_from_path(model_deployment_paths)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
@@ -278,7 +267,10 @@ def main():
             [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
         )
 
-    register_helm_configurations()
+    # Must set benchmark output path before getting RunSpecs,
+    # because run spec functions can use the benchmark output directory for caching.
+    ensure_directory_exists(args.output_path)
+    set_benchmark_output_path(args.output_path)
 
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index d7ada38f0a..5be237d742 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -14,6 +14,8 @@
     ABLATION_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
 )
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.common.general import handle_module_not_found_error
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from .runner import RunSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
@@ -255,6 +257,119 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         ]
 
 
+# Instruction-following models like GPT-4, Claude, PaLM 2 don't do in-context
+# learning naturally like base models, and they prefer to respond in a wordy
+# way as an assistant.  Therefore, for these models, we must provide explicit
+# instructions to follow the format of the in-context examples.
+IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX = (
+    "Here are some input-output examples. "
+    + "Read the examples carefully to figure out the mapping. "
+    + "The output of the last example is not given, "
+    + "and your job is to figure out what it is."
+)
+
+IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
+    "Please provide the output to this last example. " + "It is critical to follow the format of the preceding outputs!"
+)
+
+
+class AnthropicRunExpander(RunExpander):
+    """
+    Custom prompt for Anthropic models.
+    These models need more explicit instructions about following the format.
+    """
+
+    name = "anthropic"
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        try:
+            import anthropic
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["anthropic"])
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=anthropic.HUMAN_PROMPT + " " + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + anthropic.AI_PROMPT
+                    + " "
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class OpenAIRunExpander(RunExpander):
+    """
+    Custom prompt for OpenAI models.
+    These models need more explicit instructions about following the format.
+    """
+
+    # TODO: Refactor out common logic between this and GoogleRunExpander.
+
+    name = "openai"
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class GoogleRunExpander(RunExpander):
+    """
+    Custom prompt for Google models.
+    These models need more explicit instructions about following the format.
+    """
+
+    # TODO: Refactor out common logic between this and OpenAIRunExpander.
+
+    name = "google"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
 class FormatPromptRunExpander(RunExpander):
     """Adds a prefix and suffix to the prompt."""
 
@@ -271,7 +386,7 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
                 name=run_spec.name,
                 adapter_spec=replace(
                     run_spec.adapter_spec,
-                    global_prefix=self.prefix,
+                    input_prefix=self.prefix,
                     output_prefix=self.suffix,
                 ),
             ),
diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py
index 69bdd636b1..5c31eff639 100644
--- a/src/helm/benchmark/run_specs.py
+++ b/src/helm/benchmark/run_specs.py
@@ -1,10 +1,16 @@
 import dataclasses
-import importlib
 import itertools
 from functools import partial
 from typing import Any, Callable, List, Dict, Optional, Set, TypeVar
 
 from helm.benchmark.model_deployment_registry import ALL_MODEL_DEPLOYMENTS, DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT
+from helm.benchmark.scenarios.commonsense_scenario import (
+    CommonSenseQAScenario,
+    HellaSwagScenario,
+    OpenBookQA,
+    PiqaScenario,
+    SiqaScenario,
+)
 from helm.common.hierarchical_logger import hlog, htrack
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.adaptation.adapters.adapter_factory import (
@@ -17,31 +23,27 @@
 )
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.common.optional_dependencies import handle_module_not_found_error
 from .metrics.metric import MetricSpec
 from .run_expander import (
     RUN_EXPANDERS,
-    RunExpander,
     GlobalPrefixRunExpander,
+    AnthropicRunExpander,
+    OpenAIRunExpander,
+    GoogleRunExpander,
     StopRunExpander,
     ChatMLRunExpander,
-    AddToStopRunExpander,
-    IncreaseMaxTokensRunExpander,
-    FormatPromptRunExpander,
     IncreaseTemperatureRunExpander,
 )
-from .runner import RunSpec
+from .runner import RunSpec, get_benchmark_output_path
 from .scenarios.lex_glue_scenario import (
     get_lex_glue_max_train_instances,
     get_lex_glue_instructions,
     get_lex_glue_max_tokens,
     get_lex_glue_task_type,
 )
-from .scenarios.scenario import ScenarioSpec
-from .scenarios.big_bench_scenario import BIGBenchScenario
+from .scenarios.scenario import ScenarioSpec, get_scenario_cache_path
 from .scenarios.msmarco_scenario import MSMARCOScenario
 from .scenarios.copyright_scenario import datatag2hash_code
-from .scenarios.raft_scenario import get_raft_instructions
 from .scenarios.lextreme_scenario import (
     get_lextreme_instructions,
     get_lextreme_max_train_instances,
@@ -58,6 +60,7 @@
     get_model_metadata,
     ANTHROPIC_CLAUDE_1_MODEL_TAG,
     ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    GOOGLE_PALM_2_MODEL_TAG,
     NO_NEWLINES_TAG,
     NLG_PREFIX_TAG,
     CHATML_MODEL_TAG,
@@ -66,6 +69,8 @@
 )
 from helm.common.general import singleton
 
+INCLUDE_GENERATIVE_HARMS_METRICS = False
+
 
 ############################################################
 # Prototypical adapter specs
@@ -423,10 +428,10 @@ def get_machine_translation_adapter_spec(
     """
     return AdapterSpec(
         method=ADAPT_GENERATION,
-        instructions=f"Translate {source_language} to {target_language}:",
-        input_prefix="",
-        input_suffix=" = ",
-        output_prefix="",
+        instructions=f"Translate the following sentences from {source_language} to {target_language}.",
+        input_prefix=f"{source_language}: ",
+        input_suffix="\n",
+        output_prefix=f"{target_language}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
         num_outputs=1,
@@ -577,6 +582,9 @@ def get_bias_metric_specs() -> List[MetricSpec]:
 
 
 def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
+    # Classic HELM included bias/toxicity metrics; they are now skipped by default to streamline runs.
+    if not INCLUDE_GENERATIVE_HARMS_METRICS:
+        return []
     return (
         get_bias_metric_specs()
         + get_toxicity_metric_specs()
@@ -668,12 +676,6 @@ def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
 
 
-def get_machine_translation_metric_specs() -> List[MetricSpec]:
-    return [
-        MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric", args={})
-    ] + get_basic_metric_specs([])
-
-
 def get_cleva_machine_translation_metric_specs() -> List[MetricSpec]:
     return [
         MetricSpec(
@@ -1030,10 +1032,23 @@ def get_wikifact_spec(k: str, subject: str) -> RunSpec:
 
 @run_spec_function("commonsense")
 def get_commonsense_spec(dataset: str, method: str) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseScenario",
-        args={"dataset": dataset},
-    )
+    # TODO Split these into their own run_spec_function.
+    if dataset == HellaSwagScenario.name:
+        scenario_spec = ScenarioSpec(
+            class_name="helm.benchmark.scenarios.commonsense_scenario.HellaSwagScenario", args={}
+        )
+    elif dataset == OpenBookQA.name:
+        scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.OpenBookQA", args={})
+    elif dataset == CommonSenseQAScenario.name:
+        scenario_spec = ScenarioSpec(
+            class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseQAScenario", args={}
+        )
+    elif dataset == SiqaScenario.name:
+        scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.SiqaScenario", args={})
+    elif dataset == PiqaScenario.name:
+        scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.PiqaScenario", args={})
+    else:
+        raise ValueError(f"Unknown dataset: {dataset}")
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=method,
@@ -1188,19 +1203,23 @@ def get_gsm_spec() -> RunSpec:
         name="gsm",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(),
+        metric_specs=get_basic_metric_specs(["exact_match_indicator", "final_number_exact_match"])
+        + get_generative_harms_metric_specs(),
         groups=["gsm"],
     )
 
 
 @run_spec_function("raft")
 def get_raft_spec(subset: str) -> RunSpec:
+    from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions
+
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset}
     )
 
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name)
     adapter_spec = get_generation_adapter_spec(
-        instructions=get_raft_instructions(subset),
+        instructions=get_raft_instructions(subset, scenario_cache_path),
         input_noun=None,
         output_noun="Label",
         max_tokens=30,  # at most ~50 characters per label
@@ -1820,6 +1839,35 @@ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec:
     )
 
 
+@run_spec_function("legalbench")
+def get_legalbench_spec(subset: str) -> RunSpec:
+    from helm.benchmark.scenarios.legalbench_scenario import (
+        LegalBenchScenario,
+        get_legalbench_instructions,
+        get_legalbench_output_nouns,
+    )
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legalbench_scenario.LegalBenchScenario", args={"subset": subset}
+    )
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), LegalBenchScenario.name)
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_legalbench_instructions(subset, scenario_cache_path),
+        input_noun=None,
+        output_noun=get_legalbench_output_nouns(subset, scenario_cache_path),
+        max_tokens=30,  # at most ~50 characters per label
+        max_train_instances=5,  # Use 5 for all subsets
+    )
+
+    return RunSpec(
+        name=f"legalbench:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["legalbench"],
+    )
+
+
 @run_spec_function("legal_support")
 def get_legal_support_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -1885,6 +1933,8 @@ def get_entity_data_imputation_spec(dataset: str) -> RunSpec:
 @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec")
 @run_spec_function("big_bench")
 def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
+    from helm.benchmark.scenarios.big_bench_scenario import BIGBenchScenario
+
     def get_adaptation_method(big_bench_metrics: List[str]) -> str:
         """
         From BIG-bench, "there are three types of BIG-bench JSON tasks - generative and scoring
@@ -1929,9 +1979,8 @@ def get_metric_specs(big_bench_metrics: List[str]) -> List[MetricSpec]:
     )
 
     # Get BIG-bench task definition.
-    # TODO: get `output_path` here without hardcoding
-    output_path: str = "benchmark_output/scenarios/big_bench"
-    big_bench_task: Dict = BIGBenchScenario.download_and_get_task(output_path, task, subtask)
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name)
+    big_bench_task: Dict = BIGBenchScenario.download_and_get_task(scenario_cache_path, task, subtask)
 
     # The JSON schema for BIG-bench can be found here:
     # https://github.com/google/BIG-bench/blob/main/docs/doc.md#json-schema.
@@ -1964,9 +2013,8 @@ def get_metric_specs(big_bench_metrics: List[str]) -> List[MetricSpec]:
         name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        # TODO add generative harms when applicable
         metric_specs=get_metric_specs(big_bench_task["metrics"]),
-        groups=["BIG-bench"],
+        groups=[f"big_bench_{task}"],
     )
 
 
@@ -2080,7 +2128,7 @@ def get_med_qa_spec() -> RunSpec:
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="Give a letter answer among A, B, C or D.",
+        instructions="The following are multiple choice questions (with answers) about medicine.",
         input_noun="Question",
         output_noun="Answer",
     )
@@ -2090,7 +2138,7 @@ def get_med_qa_spec() -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["MedQA"],
+        groups=["med_qa"],
     )
 
 
@@ -2291,7 +2339,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
         name=f"wmt_14:language_pair={language_pair}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_machine_translation_metric_specs(),
+        metric_specs=get_open_ended_generation_metric_specs(),
         groups=["wmt_14"],
     )
 
@@ -2468,12 +2516,15 @@ def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec:
 
 @run_spec_function("cleva")
 def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, prompt_id: int = 0) -> RunSpec:
-    from .scenarios.cleva_scenario import CLEVAScenario  # noqa
+    from helm.benchmark.scenarios.cleva_scenario import CLEVAScenario  # noqa
 
-    CLEVAScenario.download_dataset(task, version)
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), CLEVAScenario.name)
+    CLEVAScenario.download_dataset(task, version, scenario_cache_path)
 
-    _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
-    inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)
+    _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id, scenario_cache_path)
+    inference_parameters = CLEVAScenario.load_inference_parameters(
+        task, subtask, version, prompt_id, scenario_cache_path
+    )
 
     class_name_prefix = "".join([word.capitalize() for word in task.split("_")])
     scenario_spec = ScenarioSpec(
@@ -2996,47 +3047,21 @@ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
             global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
             run_spec = singleton(global_prefix_expander.expand(run_spec))
 
-        # When running ChatGPT on non-language modelling tasks, increase max_tokens by 1
-        # to add room for the special message role token.
-        if OPENAI_CHATGPT_MODEL_TAG in model.tags and run_spec.adapter_spec.max_tokens:
-            increase_max_tokens_expander = IncreaseMaxTokensRunExpander(value=1)
-            run_spec = singleton(increase_max_tokens_expander.expand(run_spec))
-
         if CHATML_MODEL_TAG in model.tags:
             chatml_expander = ChatMLRunExpander()
             run_spec = singleton(chatml_expander.expand(run_spec))
 
-        # Special handling for Anthropic Claude
+        # Anthropic prompts
         if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
-            try:
-                import anthropic
-                from helm.proxy.clients.anthropic_client import AnthropicClient
-            except ModuleNotFoundError as e:
-                handle_module_not_found_error(e, ["anthropic"])
-            claude_run_expanders: List[RunExpander] = []
-            claude_run_expanders.append(AddToStopRunExpander(anthropic.HUMAN_PROMPT))
-            if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags:
-                claude_run_expanders.append(IncreaseMaxTokensRunExpander(value=AnthropicClient.ADDITIONAL_TOKENS))
-            # Get scenario tags
-            components = run_spec.scenario_spec.class_name.split(".")
-            class_name = components[-1]
-            module_name = ".".join(components[:-1])
-            cls = getattr(importlib.import_module(module_name), class_name)
-            scenario_tags: List[str] = cls.tags
-            # If the scenario is instruction, do not use PROMPT_ANSWER_START
-            if "instructions" in scenario_tags:
-                claude_run_expanders.append(
-                    FormatPromptRunExpander(prefix=anthropic.HUMAN_PROMPT, suffix=f"{anthropic.AI_PROMPT}")
-                )
-            else:
-                claude_run_expanders.append(
-                    FormatPromptRunExpander(
-                        prefix=anthropic.HUMAN_PROMPT,
-                        suffix=f"{anthropic.AI_PROMPT} {AnthropicClient.PROMPT_ANSWER_START}",
-                    )
-                )
-            for claude_run_expander in claude_run_expanders:
-                run_spec = singleton(claude_run_expander.expand(run_spec))
+            run_spec = singleton(AnthropicRunExpander().expand(run_spec))
+
+        # OpenAI prompts
+        if OPENAI_CHATGPT_MODEL_TAG in model.tags:
+            run_spec = singleton(OpenAIRunExpander().expand(run_spec))
+
+        # Google prompts
+        if GOOGLE_PALM_2_MODEL_TAG in model.tags:
+            run_spec = singleton(GoogleRunExpander().expand(run_spec))
 
         # For multiple choice
         if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
diff --git a/src/helm/benchmark/runner.py b/src/helm/benchmark/runner.py
index 21c1a62c6f..8c836fdae9 100644
--- a/src/helm/benchmark/runner.py
+++ b/src/helm/benchmark/runner.py
@@ -23,6 +23,7 @@
     ScenarioSpec,
     create_scenario,
     Instance,
+    get_scenario_cache_path,
     with_instance_ids,
 )
 from .adaptation.adapters.adapter import Adapter
@@ -39,6 +40,22 @@
 
 
 LATEST_SYMLINK: str = "latest"
+_BENCHMARK_OUTPUT_PATH: str = "benchmark_output"
+
+
+def get_benchmark_output_path() -> str:
+    """Get the benchmark output path.
+
+    Many run spec functions need to know the benchmark output path,
+    but there is no way to pass it via the run spec function,
+    so instead the run spec function should read this global variable."""
+    return _BENCHMARK_OUTPUT_PATH
+
+
+def set_benchmark_output_path(benchmark_output_path: str) -> None:
+    """Set the benchmark output path."""
+    global _BENCHMARK_OUTPUT_PATH
+    _BENCHMARK_OUTPUT_PATH = benchmark_output_path
 
 
 class RunnerError(Exception):
@@ -172,9 +189,8 @@ def __init__(
         self.exit_on_error: bool = exit_on_error
 
         ensure_directory_exists(output_path)
-        # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
-        self.scenarios_path: str = os.path.join(output_path, "scenarios")
-        ensure_directory_exists(self.scenarios_path)
+        self.output_path = output_path
+
         # Decide where to save input instances
         self.instances_path: str = os.path.join(output_path, "scenario_instances")
         ensure_directory_exists(self.instances_path)
@@ -234,10 +250,6 @@ def run_one(self, run_spec: RunSpec):
         # Load the scenario
         scenario: Scenario = create_scenario(run_spec.scenario_spec)
 
-        # This `output_path` will be used when `Adapter` calls `Scenario.get_instances`.
-        scenario_output_path = os.path.join(self.scenarios_path, scenario.name)
-        ensure_directory_exists(scenario_output_path)
-
         # This 'output_path' will be used when the model's input instances are saved.
         args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
         scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
@@ -254,6 +266,7 @@ def run_one(self, run_spec: RunSpec):
                 instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
             else:
                 # Create the instances of the scenario
+                scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
                 with htrack_block("scenario.get_instances"):
                     instances = scenario.get_instances(scenario_output_path)
         if self.cache_instances and not os.path.exists(input_instances_file_path):
diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py
index c63a1e523a..0831d6cd2d 100644
--- a/src/helm/benchmark/scenarios/cleva_scenario.py
+++ b/src/helm/benchmark/scenarios/cleva_scenario.py
@@ -10,6 +10,7 @@
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_GENERATION,
 )
+from helm.benchmark.runner import get_benchmark_output_path
 from helm.common.general import (
     assert_is_str,
     assert_is_str_list,
@@ -17,12 +18,21 @@
     ensure_directory_exists,
 )
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    get_scenario_cache_path,
+)
 from .code_scenario import CodeReference, CodeInstance
 
 
 CLEVA_DATA_URL = "http://39.108.215.175/data"
-CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva"
 
 
 @dataclass(frozen=True)
@@ -386,7 +396,10 @@ def __init__(
         self.subtask = subtask
         self.version = version
         self.converter = Converter()
-        self.prompt_template, _ = CLEVAScenario.get_prompt_setting(self.task, subtask, version, prompt_id)
+        scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), CLEVAScenario.name)
+        self.prompt_template, _ = CLEVAScenario.get_prompt_setting(
+            self.task, subtask, version, prompt_id, scenario_cache_path
+        )
 
     @property
     @abstractmethod
@@ -394,14 +407,14 @@ def task(self) -> str:
         pass
 
     @classmethod
-    def download_dataset(cls, task: str, version: str):
+    def download_dataset(cls, task: str, version: str, cache_dir: str):
         source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip"
-        target_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version)
+        target_dir: str = os.path.join(cache_dir, "data", version)
         ensure_directory_exists(target_dir)
         ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, task), unpack=True)
 
-    def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
-        data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task)
+    def load_dataset(self, cache_dir: str) -> Dict[str, List[Dict[str, Any]]]:
+        data_dir: str = os.path.join(cache_dir, "data", self.version, self.task)
         if self.subtask:
             data_dir = os.path.join(data_dir, self.subtask)
 
@@ -418,8 +431,8 @@ def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
         return dataset
 
     @staticmethod
-    def load_prompt_templates(task: str, subtask: Optional[str], version: str) -> List[Dict[str, Any]]:
-        prompt_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version, task)
+    def load_prompt_templates(task: str, subtask: Optional[str], version: str, cache_dir: str) -> List[Dict[str, Any]]:
+        prompt_dir: str = os.path.join(cache_dir, "data", version, task)
         if subtask:
             prompt_dir = os.path.join(prompt_dir, subtask)
         file_path = os.path.join(prompt_dir, "prompts.json")
@@ -432,7 +445,7 @@ def load_prompt_templates(task: str, subtask: Optional[str], version: str) -> Li
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        dataset = self.load_dataset()
+        dataset = self.load_dataset(output_path)
 
         # Read all the instances
         instances: List[Instance] = []
@@ -449,9 +462,9 @@ def process_instance(self, row: Dict[str, Any], split: str) -> Instance:
 
     @classmethod
     def get_prompt_setting(
-        cls, task: str, subtask: Optional[str], version: str, prompt_id: int
+        cls, task: str, subtask: Optional[str], version: str, prompt_id: int, output_path: str
     ) -> Tuple[Dict[str, Any], PromptSetting]:
-        prompt_templates = cls.load_prompt_templates(task, subtask, version)
+        prompt_templates = cls.load_prompt_templates(task, subtask, version, output_path)
         if prompt_id >= len(prompt_templates):
             raise ValueError(
                 f"You want to use prompt template with prompt_id {prompt_id}, but there is only"
@@ -503,10 +516,10 @@ def get_prompt_setting(
 
     @classmethod
     def load_inference_parameters(
-        cls, task: str, subtask: Optional[str], version: str, prompt_id: int
+        cls, task: str, subtask: Optional[str], version: str, prompt_id: int, cache_dir: str
     ) -> Dict[str, Any]:
         # We use a dict instead of dataclass to store hyperparameters such that we can set different default values
-        params_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version, task)
+        params_dir: str = os.path.join(cache_dir, "data", version, task)
         if subtask:
             params_dir = os.path.join(params_dir, subtask)
         file_path = os.path.join(params_dir, "infer_params.json")
@@ -916,7 +929,7 @@ def task(self) -> str:
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        dataset = self.load_dataset()
+        dataset = self.load_dataset(output_path)
 
         # Read all the instances
         instances: List[Instance] = []
diff --git a/src/helm/benchmark/scenarios/commonsense_scenario.py b/src/helm/benchmark/scenarios/commonsense_scenario.py
index 51836bdd3b..2d88e56c46 100644
--- a/src/helm/benchmark/scenarios/commonsense_scenario.py
+++ b/src/helm/benchmark/scenarios/commonsense_scenario.py
@@ -17,37 +17,52 @@
 )
 
 
-class CommonSenseScenario(Scenario):
-    """
-    Unified interface for all CommonSense scenarios.
-
-    - The "HellaSwag" benchmark from this paper:
-      https://arxiv.org/pdf/1905.07830.pdf
-
-    - The "OpenBookQA" benchmark from this paper:
-      https://aclanthology.org/D18-1260.pdf
-
-    - The "CommonSenseQA" benchmark from this paper:
-      https://arxiv.org/pdf/1811.00937.pdf
-
-    - The "PIQA" benchmark from this paper:
-      https://arxiv.org/pdf/1911.11641.pdf
-
-    - The "SIQA" benchmark from this paper:
-      https://arxiv.org/pdf/1904.09728.pdf
-    """
-
-    name = "commonsense"
-    description = "Unified interface for all CommonSense scenarios."
+_SPLIT_TRANSLATION = {
+    "train": TRAIN_SPLIT,
+    "val": VALID_SPLIT,
+    "test": TEST_SPLIT,
+}
+
+
+def _make_instance(question: str, answers: List[str], correct_answer: str, split: str):
+    references = []
+    for answer in answers:
+        references.append(Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []))
+    return Instance(
+        input=Input(text=question),
+        references=references,
+        split=_SPLIT_TRANSLATION[split],
+    )
+
+
+class HellaSwagScenario(Scenario):
+    name = "hellaswag"
+    description = "Benchmark from https://arxiv.org/pdf/1905.07830.pdf."
     tags = ["knowledge", "multiple_choice"]
 
-    def __init__(self, dataset):
-        super().__init__()
-        self.dataset = dataset
-        assert self.dataset in ["hellaswag", "openbookqa", "commonsenseqa", "piqa", "siqa"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        instances = []
+        base_url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_{}.jsonl"
+        # Ignore HellaSwag test set because no label information
+        for split in ["train", "val"]:
+            file_path = os.path.join(data_path, f"hellaswag_{split}.jsonl")
+            ensure_file_downloaded(
+                source_url=base_url.format(split),
+                target_path=file_path,
+            )
+            hlog(f"Reading {file_path}")
+            with open(file_path) as f:
+                for line in f:
+                    item = json.loads(line)
+                    instances.append(self.json_to_instance(item, split))
+        return instances
 
     @staticmethod
-    def process_hellaswag_item(item):
+    def json_to_instance(item, split) -> Instance:
         ctx_b_fixed = item["ctx_b"][0].upper() + item["ctx_b"][1:] if len(item["ctx_b"]) != 0 else ""
 
         question = f"{item['activity_label']}: {item['ctx_a']} {ctx_b_fixed}"
@@ -55,10 +70,38 @@ def process_hellaswag_item(item):
         correct_answer = answers[item["label"]]
 
         assert len(answers) == 4
-        return question, answers, correct_answer
+        return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
+
+
+class OpenBookQA(Scenario):
+    name = "openbookqa"
+    description = "Benchmark from https://aclanthology.org/D18-1260.pdf."
+    tags = ["knowledge", "multiple_choice"]
+
+    def get_instances(self, output_path: str):
+        # Download the raw data
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        ensure_file_downloaded(
+            source_url="https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip",
+            target_path=os.path.join(data_path, "OpenBookQA-V1-Sep2018"),
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances = []
+        for split in ["train", "test"]:
+            file_path = os.path.join(data_path, "OpenBookQA-V1-Sep2018", "Data", "Main", f"{split}.jsonl")
+            hlog(f"Reading {file_path}")
+            with open(file_path) as f:
+                for line in f:
+                    item = json.loads(line)
+                    instances.append(self.json_to_instance(item, split))
+        return instances
 
     @staticmethod
-    def process_openbookqa_item(item):
+    def json_to_instance(item, split) -> Instance:
         letter2idx = {"A": 0, "B": 1, "C": 2, "D": 3}
 
         question = item["question"]["stem"]
@@ -68,10 +111,38 @@ def process_openbookqa_item(item):
 
         assert len(answers) == 4
         assert item["question"]["choices"][correct_choice]["label"] == item["answerKey"]
-        return question, answers, correct_answer
+        return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
+
+
+class CommonSenseQAScenario(Scenario):
+    name = "commonsenseqa"
+    description = "Benchmark from https://arxiv.org/pdf/1811.00937.pdf."
+    tags = ["knowledge", "multiple_choice"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        instances = []
+        base_url = "https://s3.amazonaws.com/commensenseqa/{}_rand_split.jsonl"
+        # Ignore CommonSenseQA test set because no label information
+        split_mapping = {"train": "train", "val": "dev"}
+        for split in ["train", "val"]:
+            file_path = os.path.join(data_path, f"commonsenseqa_{split}.jsonl")
+            ensure_file_downloaded(
+                source_url=base_url.format(split_mapping[split]),
+                target_path=file_path,
+            )
+            hlog(f"Reading {file_path}")
+            with open(file_path) as f:
+                for line in f:
+                    item = json.loads(line)
+                    instances.append(self.json_to_instance(item, split))
+        return instances
 
     @staticmethod
-    def process_commonsenseqa_item(item):
+    def json_to_instance(item, split) -> Instance:
         # Note: question concept field is not used: item["question"]["question_concept"]
         letter2idx = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}
         question = item["question"]["stem"]
@@ -81,180 +152,89 @@ def process_commonsenseqa_item(item):
 
         assert len(answers) == 5
         assert item["question"]["choices"][correct_choice]["label"] == item["answerKey"]
-        return question, answers, correct_answer
-
-    @staticmethod
-    def process_piqa_item(item):
-        question = item["goal"]
-        answers = [item["sol1"], item["sol2"]]
-        correct_choice = item["label"]
-        correct_answer = answers[correct_choice]
-
-        assert len(item) == 4
-        assert correct_choice in [0, 1]
-        return question, answers, correct_answer
+        return _make_instance(question, answers, correct_answer, split)
 
-    @staticmethod
-    def process_siqa_item(item):
-        question = f"{item['context']} {item['question']}"
-        answers = [item["answerA"], item["answerB"], item["answerC"]]
-        correct_choice = item["label"] - 1
-        correct_answer = answers[correct_choice]
 
-        assert len(item) == 6
-        assert correct_choice in [0, 1, 2]
-        return question, answers, correct_answer
+class PiqaScenario(Scenario):
+    name = "piqa"
+    description = "Benchmark from https://arxiv.org/pdf/1911.11641.pdf."
+    tags = ["knowledge", "multiple_choice"]
 
-    def download_dataset(self, output_path: str):
+    def get_instances(self, output_path: str):
         # Download the raw data
-        data_path = os.path.join(output_path, "data", self.dataset)
+        data_path = os.path.join(output_path, "data")
         ensure_directory_exists(data_path)
 
-        if self.dataset == "hellaswag":
-            url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_{}.jsonl"
-            for split in ["train", "val", "test"]:
-                ensure_file_downloaded(
-                    source_url=url.format(split),
-                    target_path=os.path.join(data_path, f"hellaswag_{split}.jsonl"),
-                )
-        elif self.dataset == "openbookqa":
+        url = "https://yonatanbisk.com/piqa/data/{}"
+        # TODO The source actually uses TRAIN_SPLIT and VALID_SPLIT, so consider skipping "val".
+        split_mapping = {"train": "train", "val": "valid"}
+        instances = []
+        # Ignore PIQA test set because no label information
+        for split in ["train", "val"]:
             ensure_file_downloaded(
-                source_url="https://ai2-public-datasets.s3.amazonaws.com/open-book-qa/OpenBookQA-V1-Sep2018.zip",
-                target_path=os.path.join(data_path, "OpenBookQA-V1-Sep2018"),
-                unpack=True,
-                unpack_type="unzip",
+                source_url=url.format(f"{split_mapping[split]}.jsonl"),
+                target_path=os.path.join(data_path, f"piqa_{split}.jsonl"),
             )
-        elif self.dataset == "commonsenseqa":
-            url = "https://s3.amazonaws.com/commensenseqa/{}_rand_split.jsonl"
-            split_mapping = {"train": "train", "val": "dev"}
-            for split in ["train", "val"]:
-                ensure_file_downloaded(
-                    source_url=url.format(split_mapping[split]),
-                    target_path=os.path.join(data_path, f"commonsenseqa_{split}.jsonl"),
-                )
-        elif self.dataset == "piqa":
-            url = "https://yonatanbisk.com/piqa/data/{}"
-            split_mapping = {"train": "train", "val": "valid"}
-            for split in ["train", "val"]:
-                ensure_file_downloaded(
-                    source_url=url.format(f"{split_mapping[split]}.jsonl"),
-                    target_path=os.path.join(data_path, f"piqa_{split}_raw.jsonl"),
-                )
-                ensure_file_downloaded(
-                    source_url=url.format(f"{split_mapping[split]}-labels.lst"),
-                    target_path=os.path.join(data_path, f"piqa_{split}_labels.lst"),
-                )
-                data = [json.loads(line) for line in open(os.path.join(data_path, f"piqa_{split}_raw.jsonl"))]
-                labels = [int(line.strip()) for line in open(os.path.join(data_path, f"piqa_{split}_labels.lst"))]
-                assert len(data) == len(labels)
-                for item, label in zip(data, labels):
-                    item["label"] = label
-                with open(os.path.join(data_path, f"piqa_{split}.jsonl"), "w") as f:
-                    for item in data:
-                        f.write(json.dumps(item) + "\n")
-        elif self.dataset == "siqa":
             ensure_file_downloaded(
-                source_url="https://storage.googleapis.com/ai2-mosaic/public/socialiqa/socialiqa-train-dev.zip",
-                target_path=os.path.join(data_path, "socialiqa-train-dev"),
-                unpack=True,
-                unpack_type="unzip",
+                source_url=url.format(f"{split_mapping[split]}-labels.lst"),
+                target_path=os.path.join(data_path, f"piqa_{split}_labels.lst"),
             )
-            split_mapping = {"train": "train", "val": "dev"}
-            for split in ["train", "val"]:
-                data = [
-                    json.loads(line)
-                    for line in open(
-                        os.path.join(
-                            data_path, "socialiqa-train-dev", "socialiqa-train-dev", f"{split_mapping[split]}.jsonl"
-                        )
-                    )
-                ]
-                labels = [
-                    int(line.strip())
-                    for line in open(
-                        os.path.join(
-                            data_path,
-                            "socialiqa-train-dev",
-                            "socialiqa-train-dev",
-                            f"{split_mapping[split]}-labels.lst",
-                        )
-                    )
-                ]
-                assert len(data) == len(labels)
-                for item, label in zip(data, labels):
-                    item["label"] = label
-                with open(os.path.join(data_path, f"siqa_{split}.jsonl"), "w") as f:
-                    for item in data:
-                        f.write(json.dumps(item) + "\n")
-        else:
-            raise ValueError(f"Unknown dataset: {self.dataset}")
-
-    def load_dataset(self, output_path: str) -> List[List[str]]:
-        data_path = os.path.join(output_path, "data", self.dataset)
-
-        if self.dataset == "hellaswag":
-            split_to_file = {
-                split: os.path.join(data_path, f"hellaswag_{split}.jsonl") for split in ["train", "val"]
-            }  # Ignore HellaSwag test set because no label information
-            item_process_func = self.process_hellaswag_item
-        elif self.dataset == "openbookqa":
-            split_to_file = {
-                split: os.path.join(data_path, "OpenBookQA-V1-Sep2018", "Data", "Main", f"{split}.jsonl")
-                for split in ["train", "test"]
-            }
-            item_process_func = self.process_openbookqa_item
-        elif self.dataset == "commonsenseqa":
-            split_to_file = {
-                split: os.path.join(data_path, f"commonsenseqa_{split}.jsonl") for split in ["train", "val"]
-            }  # Ignore CommonSenseQA test set because no label information
-            item_process_func = self.process_commonsenseqa_item
-        elif self.dataset == "piqa":
-            split_to_file = {
-                split: os.path.join(data_path, f"piqa_{split}.jsonl") for split in ["train", "val"]
-            }  # Ignore PIQA test set because no label information
-            item_process_func = self.process_piqa_item
-        elif self.dataset == "siqa":
-            split_to_file = {
-                split: os.path.join(data_path, f"siqa_{split}.jsonl") for split in ["train", "val"]
-            }  # SIQA has no available test set
-            item_process_func = self.process_siqa_item
-        else:
-            raise ValueError(f"Unknown dataset: {self.dataset}")
-
-        data = []
-        for split in split_to_file:
-            file_path = split_to_file[split]
-            if not os.path.exists(file_path):
-                raise FileNotFoundError(f"File not found: {file_path}")
+            data = [json.loads(line) for line in open(os.path.join(data_path, f"piqa_{split}.jsonl"))]
+            labels = [int(line.strip()) for line in open(os.path.join(data_path, f"piqa_{split}_labels.lst"))]
+            assert len(data) == len(labels)
+            for item, label in zip(data, labels):
+                instances.append(self.json_to_instance(item, label, split))
+        return instances
 
-            hlog(f"Reading {file_path}")
-            with open(file_path) as f:
-                for line in f:
-                    item = json.loads(line)
-                    question, answers, correct_answer = item_process_func(item)
-                    data.append([question, answers, correct_answer, split])
-        return data
+    @staticmethod
+    def json_to_instance(item, label: int, split: str):
+        question = item["goal"]
+        answers = [item["sol1"], item["sol2"]]
+        correct_choice = label
+        correct_answer = answers[correct_choice]
 
-    def get_instances(self, output_path: str) -> List[Instance]:
-        self.download_dataset(output_path)
-        data = self.load_dataset(output_path)
+        assert len(item) == 3
+        assert correct_choice in [0, 1]
+        return _make_instance(question, answers, correct_answer, split)
 
-        splits = {
-            "train": TRAIN_SPLIT,
-            "val": VALID_SPLIT,
-            "test": TEST_SPLIT,
-        }
 
-        instances: List[Instance] = []
+class SiqaScenario(Scenario):
+    name = "siqa"
+    description = "Benchmark from https://arxiv.org/pdf/1904.09728.pdf."
+    tags = ["knowledge", "multiple_choice"]
 
-        def answer_to_reference(answer: str) -> Reference:
-            return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
 
-        for question_id, (question, answers, correct_answer, split) in enumerate(data):
-            instance = Instance(
-                input=Input(text=question),
-                references=list(map(answer_to_reference, answers)),
-                split=splits[split],
-            )
-            instances.append(instance)
+        ensure_file_downloaded(
+            source_url="https://storage.googleapis.com/ai2-mosaic/public/socialiqa/socialiqa-train-dev.zip",
+            target_path=os.path.join(data_path, "socialiqa-train-dev"),
+            unpack=True,
+            unpack_type="unzip",
+        )
+        # TODO The source doesn't follow the standard naming for 'val', so maybe can skip _SPLIT_TRANSLATION.
+        split_mapping = {"train": "train", "val": "dev"}
+        instances = []
+        # SIQA has no available test set
+        for split in ["train", "val"]:
+            base_path = os.path.join(data_path, "socialiqa-train-dev", "socialiqa-train-dev", f"{split_mapping[split]}")
+            data = [json.loads(line) for line in open(base_path + ".jsonl")]
+            labels = [int(line.strip()) for line in open(base_path + "-labels.lst")]
+            assert len(data) == len(labels)
+
+            for item, label in zip(data, labels):
+                instances.append(self.json_to_instance(item, label, split))
         return instances
+
+    @staticmethod
+    def json_to_instance(item, label, split) -> Instance:
+        question = f"{item['context']} {item['question']}"
+        answers = [item["answerA"], item["answerB"], item["answerC"]]
+        correct_choice = label - 1
+        correct_answer = answers[correct_choice]
+
+        assert len(item) == 5
+        assert correct_choice in [0, 1, 2]
+        return _make_instance(question, answers, correct_answer, split)
diff --git a/src/helm/benchmark/scenarios/legalbench_scenario.py b/src/helm/benchmark/scenarios/legalbench_scenario.py
new file mode 100644
index 0000000000..70dd088bfe
--- /dev/null
+++ b/src/helm/benchmark/scenarios/legalbench_scenario.py
@@ -0,0 +1,123 @@
+import random
+import os
+import json
+import datasets
+from pathlib import Path
+from typing import List, Dict
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output
+
+PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
+
+SUBSETS = [
+    "abercrombie",
+    "corporate_lobbying",
+    "international_citizenship_questions",
+    "function_of_decision_section",
+    "proa",
+]
+
+
+def get_legalbench_prompt_settings(subset: str, cache_dir: str):
+    """
+    Loads the prompt construction settings and returns the entries for the given subset.
+    """
+    assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+
+    prompt_construction_settings_path = os.path.join(cache_dir, "prompt_construction_settings.json")
+    ensure_directory_exists(cache_dir)
+    ensure_file_downloaded(
+        source_url=PROMPT_SETTINGS_URL,
+        target_path=prompt_construction_settings_path,
+    )
+    with open(prompt_construction_settings_path, "r") as f:
+        field_ordering, instructions, label_keys, output_nouns, _ = map(json.loads, f.read().strip().split("\n"))
+    return (
+        field_ordering[subset],
+        instructions[subset],
+        label_keys[subset],
+        output_nouns[subset],
+    )
+
+
+def get_legalbench_instructions(subset: str, cache_dir: str):
+    return get_legalbench_prompt_settings(subset, cache_dir)[1]
+
+
+def get_legalbench_output_nouns(subset: str, cache_dir: str):
+    return get_legalbench_prompt_settings(subset, cache_dir)[3]
+
+
+class LegalBenchScenario(Scenario):
+    """
+    LegalBench is a benchmark containing different legal reasoning tasks. We use a subset of the tasks, selected
+    to represent different legal reasoning patterns.
+
+    LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models
+    https://arxiv.org/abs/2308.11462
+
+    Official website for LegalBench:
+    http://hazyresearch.stanford.edu/legalbench/
+
+    Dataset summary:
+    https://huggingface.co/datasets/nguha/legalbench
+
+    Prompts are adapted from:
+    https://github.com/HazyResearch/legalbench/
+
+    Subsets:
+
+    - abercrombie
+    - corporate_lobbying
+    - international_citizenship_questions
+    - function_of_decision_section
+    - proa
+    """
+
+    name = "legalbench"
+    description = "LegalBench"
+    tags = ["text_classification", "robustness"]
+
+    def __init__(self, subset: str, random_seed=42):
+        super().__init__()
+        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+        self.subset = subset
+        self.random_seed = random_seed
+
+    def load_prompt_construction_settings(self, output_path: str):
+        # Load from prompt construction settings
+        cache_dir = str(Path(output_path) / "data")
+        return get_legalbench_prompt_settings(self.subset, cache_dir)
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        fields, _, label_key, _ = self.load_prompt_construction_settings(output_path)
+        cache_dir = str(Path(output_path) / "data")
+
+        # Download data from Huggingface. LegalBench provides splits for samples to
+        # be used for prompt construction and for testing.
+        train_dataset = datasets.load_dataset("nguha/legalbench", self.subset, cache_dir=cache_dir, split="train")
+        test_dataset = datasets.load_dataset("nguha/legalbench", self.subset, cache_dir=cache_dir, split="test")
+        assert isinstance(train_dataset, datasets.Dataset)
+        assert isinstance(test_dataset, datasets.Dataset)
+
+        dataset_splits: Dict[str, datasets.Dataset] = {
+            TRAIN_SPLIT: train_dataset,
+            TEST_SPLIT: test_dataset,
+        }
+
+        # Read all instances
+        random.seed(self.random_seed)
+        instances: List[Instance] = []
+        for split, subset in dataset_splits.items():
+            for x in subset:
+                assert fields is not None, "Field ordering not loaded"
+                prompt: str = "\n".join([f"{field[0]}: {x[field[1]]}" for field in fields])
+                instance = Instance(
+                    input=Input(text=prompt),
+                    references=[Reference(Output(text=x[label_key]), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+                instances.append(instance)
+
+        return instances
diff --git a/src/helm/benchmark/scenarios/lsat_qa_scenario.py b/src/helm/benchmark/scenarios/lsat_qa_scenario.py
index 99da80aa21..744599aad3 100644
--- a/src/helm/benchmark/scenarios/lsat_qa_scenario.py
+++ b/src/helm/benchmark/scenarios/lsat_qa_scenario.py
@@ -101,9 +101,11 @@ def __init__(self, task):
 
     def get_question_types(self, tags: List[str]) -> List[str]:
         question_type: str = tags[2].replace("grouping (distribution)", "distribution grouping") or "miscellaneous"
+        types = [question_type.replace(" ", "_").replace("/", "_")]
         main_type = self.subtype2type.get(question_type)
-        assert main_type
-        return [question_type.replace(" ", "_").replace("/", "_"), main_type]
+        if main_type is not None:
+            types.append(main_type)
+        return types
 
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path = os.path.join(output_path, "data")
diff --git a/src/helm/benchmark/scenarios/math_scenario.py b/src/helm/benchmark/scenarios/math_scenario.py
index 30cdef391c..edd543f48e 100644
--- a/src/helm/benchmark/scenarios/math_scenario.py
+++ b/src/helm/benchmark/scenarios/math_scenario.py
@@ -354,7 +354,7 @@ def __init__(
 
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = {}
-        data = typing.cast(DatasetDict, load_dataset("competition_math", ignore_verifications=True))
+        data = typing.cast(DatasetDict, load_dataset("competition_math")).sort("problem").shuffle(seed=42)
 
         def group_by_key(dataset_list, key):
             dataset_per_key = collections.defaultdict(list)
diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py
index 1a22d21b14..5814513799 100644
--- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py
+++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py
@@ -107,18 +107,18 @@ def __init__(self, survey_type: str, context: str):
         self.survey_type: str = survey_type
         self.context: str = context
 
-    def download_data(self):
+    def download_data(self, output_path: str):
 
-        output_path: str = os.path.join(output_path, "data")
-        if not os.path.exists(output_path):
-            os.makedirs(output_path)
+        data_dir: str = os.path.join(output_path, "data")
+        if not os.path.exists(data_dir):
+            os.makedirs(data_dir)
 
         DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
         DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
         DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
 
         for filename in DOWNLOAD_FILENAMES:
-            data_path: str = os.path.join(output_path, filename)
+            data_path: str = os.path.join(data_dir, filename)
 
             source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
             ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
@@ -129,7 +129,7 @@ def read_survey_questions(self, csv_path):
         return df
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        self.download_data()
+        self.download_data(output_path)
 
         # Read all the instances
         instances: List[Instance] = []
diff --git a/src/helm/benchmark/scenarios/raft_scenario.py b/src/helm/benchmark/scenarios/raft_scenario.py
index a08402a636..ca1a79829b 100644
--- a/src/helm/benchmark/scenarios/raft_scenario.py
+++ b/src/helm/benchmark/scenarios/raft_scenario.py
@@ -1,7 +1,6 @@
 import random
 import os
 import json
-import tempfile
 import datasets
 from pathlib import Path
 from typing import List, Dict
@@ -26,12 +25,9 @@
 ]
 
 
-def get_raft_prompt_settings(subset: str, cache_dir=None):
+def get_raft_prompt_settings(subset: str, cache_dir: str):
     assert subset in SUBSETS, "Unknown subset: {}".format(subset)
 
-    if cache_dir is None:
-        cache_dir = tempfile.gettempdir()
-
     prompt_construction_settings_path = os.path.join(cache_dir, "prompt_construction_settings.json")
     ensure_directory_exists(cache_dir)
     ensure_file_downloaded(
@@ -44,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir=None):
     return field_ordering[subset], instructions[subset]
 
 
-def get_raft_instructions(subset: str, cache_dir=None):
+def get_raft_instructions(subset: str, cache_dir: str):
     return get_raft_prompt_settings(subset, cache_dir)[1]
 
 
diff --git a/src/helm/benchmark/scenarios/scenario.py b/src/helm/benchmark/scenarios/scenario.py
index 4d36bb1ab4..e84e08a908 100644
--- a/src/helm/benchmark/scenarios/scenario.py
+++ b/src/helm/benchmark/scenarios/scenario.py
@@ -1,12 +1,13 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, replace
 from typing import List, Optional, Tuple
+import os
 from pathlib import PurePath
 import inspect
 
 from helm.common.media_object import MultimediaObject
 from helm.common.object_spec import ObjectSpec, create_object
-from helm.common.general import format_text, format_split, format_tags, indent_lines
+from helm.common.general import ensure_directory_exists, format_text, format_split, format_tags, indent_lines
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
 
 """ Data splits """
@@ -249,3 +250,10 @@ class ScenarioSpec(ObjectSpec):
 def create_scenario(scenario_spec: ScenarioSpec) -> Scenario:
     """Construct the scenario and set some fields."""
     return create_object(scenario_spec)
+
+
+def get_scenario_cache_path(benchmark_output_path: str, scenario_name: str):
+    """Return a directory under benchmark_output_path in which Scenario can cache temporary data."""
+    scenarios_path: str = os.path.join(benchmark_output_path, "scenarios", scenario_name)
+    ensure_directory_exists(scenarios_path)
+    return scenarios_path
diff --git a/src/helm/benchmark/scenarios/test_math_scenario.py b/src/helm/benchmark/scenarios/test_math_scenario.py
new file mode 100644
index 0000000000..06fd7be87e
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_math_scenario.py
@@ -0,0 +1,16 @@
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.math_scenario import MATHScenario
+from helm.benchmark.scenarios.scenario import Input, Output, Reference
+
+
+def test_math_scenario_get_instances():
+    math_scenario = MATHScenario(subject="number_theory", level="1")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = math_scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 77
+    assert actual_instances[0].input == Input(text="What is the remainder when (99)(101) is divided by 9?")
+    assert actual_instances[0].references == [
+        Reference(output=Output(text="0", multimedia_content=None), tags=["correct"])
+    ]
+    assert actual_instances[0].split == "train"
diff --git a/src/helm/benchmark/scenarios/the_pile_scenario.py b/src/helm/benchmark/scenarios/the_pile_scenario.py
index 6a50c15202..2a2c1ec9b1 100644
--- a/src/helm/benchmark/scenarios/the_pile_scenario.py
+++ b/src/helm/benchmark/scenarios/the_pile_scenario.py
@@ -50,14 +50,13 @@ def __init__(self, subset: str):
         self.subset = subset
 
     @htrack(None)
-    def load_and_cache_all_subsets(self, output_path):
-        data_path = os.path.join(output_path, "data")
+    def load_and_cache_all_subsets(self, data_jsonl, output_path):
         subsets: Dict[str, List] = {subset: [] for subset in self.pile_subsets}
 
         # Load all data into memory
         with htrack_block("Loading"):
-            hlog(f"Loading all data from {data_path}")
-            with open(data_path) as f:
+            hlog(f"Loading all data from {data_jsonl}")
+            with open(data_jsonl) as f:
                 data = [json.loads(line) for line in f]
 
         # Classify the documents by subset
@@ -76,10 +75,10 @@ def load_and_cache_all_subsets(self, output_path):
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        data_path = os.path.join(output_path, "data")
+        data_jsonl = os.path.join(output_path, "data")
         ensure_file_downloaded(
             source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
-            target_path=data_path,
+            target_path=data_jsonl,
             unpack=True,
         )
 
@@ -87,7 +86,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
 
         # If the target subset does not exist, load and cache all subsets to the directory
         if not os.path.exists(subset_path):
-            self.load_and_cache_all_subsets(output_path)
+            self.load_and_cache_all_subsets(data_jsonl, output_path)
 
         # Read all the instances
         instances = []
diff --git a/src/helm/benchmark/scenarios/wmt_14_scenario.py b/src/helm/benchmark/scenarios/wmt_14_scenario.py
index 58e174bd36..9a00547a98 100644
--- a/src/helm/benchmark/scenarios/wmt_14_scenario.py
+++ b/src/helm/benchmark/scenarios/wmt_14_scenario.py
@@ -1,6 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import htrack_block
 from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
 
 
@@ -59,26 +59,26 @@ def _deduplicate(self, dataset: List):
         return deduplicated_dataset
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        hlog("Loading the HuggingFace dataset. The first time could take several minutes.")
-        subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
-        hf_dataset: Any = load_dataset("wmt14", subset_name)
-        splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
+            subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
+            hf_dataset: Any = load_dataset("wmt14", subset_name)
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
 
         instances: List[Instance] = []
-        hlog("Generating instances")
-        # Some training sets are too large, so we will only take a random subset of it.
-        hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
-        hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
-        for example in hf_dataset["train"]["translation"]:
-            source_sentence: str = example[self.source_language]
-            target_sentence: str = example[self.target_language]
-            instances.append(
-                Instance(
-                    input=Input(text=source_sentence),
-                    references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
-                    split="train",
+        with htrack_block("Generating instances"):
+            # Some training sets are too large, so we will only take a random subset of it.
+            hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
+            hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
+            for example in hf_dataset["train"]["translation"]:
+                source_sentence: str = example[self.source_language]
+                target_sentence: str = example[self.target_language]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split="train",
+                    )
                 )
-            )
 
         # No special handling needed for validation or test.
         for split_name in ["validation", "test"]:
diff --git a/src/helm/benchmark/server.py b/src/helm/benchmark/server.py
index f0bef444b9..9d8bf90879 100644
--- a/src/helm/benchmark/server.py
+++ b/src/helm/benchmark/server.py
@@ -5,10 +5,15 @@
 
 import argparse
 import importlib_resources as resources
+import json
 from os import path
 import urllib
 
-from bottle import Bottle, static_file
+from bottle import Bottle, static_file, HTTPResponse
+import yaml
+
+from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
+from helm.common.general import serialize_dates
 
 
 app = Bottle()
@@ -28,6 +33,35 @@ def serve_config():
         )
 
 
+# Shim for running helm-server for old suites from old version of helm-summarize
+# that do not contain schema.json.
+#
+# The HELM web frontend expects to find a schema.json at /benchmark_output/runs/<version>/schema.json
+# which is produced by the new version of helm-summarize but not the old version.
+# When serving a suite produced by the old version of helm-summarize, the schema.json will be missing.
+# This shim supports those suites by serving a schema.json that is dynamically computed from schema_classic.yaml.
+#
+# We will remove this in a few months after most users have moved to the new version of helm-summarize.
+#
+# TODO(2024-03-01): Remove this.
+@app.get("/benchmark_output/<runs_or_releases:re:runs|releases>/<version>/schema.json")
+def server_schema(runs_or_releases, version):
+    relative_schema_path = path.join(runs_or_releases, version, "schema.json")
+    absolute_schema_path = path.join(app.config["helm.outputpath"], relative_schema_path)
+    if path.isfile(absolute_schema_path):
+        response = static_file(relative_schema_path, root=app.config["helm.outputpath"])
+    else:
+        # Suite does not contain schema.json
+        # Fall back to schema_classic.yaml from the static directory
+        classic_schema_path = path.join(app.config["helm.staticpath"], SCHEMA_CLASSIC_YAML_FILENAME)
+        with open(classic_schema_path, "r") as f:
+            response = HTTPResponse(json.dumps(yaml.safe_load(f), indent=2, default=serialize_dates))
+    response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
+    response.set_header("Expires", "0")
+    response.content_type = "application/json"
+    return response
+
+
 @app.get("/benchmark_output/<filename:path>")
 def serve_benchmark_output(filename):
     response = static_file(filename, root=app.config["helm.outputpath"])
diff --git a/src/helm/benchmark/slurm_jobs.py b/src/helm/benchmark/slurm_jobs.py
index 8820ca9058..4bc30b9cbd 100644
--- a/src/helm/benchmark/slurm_jobs.py
+++ b/src/helm/benchmark/slurm_jobs.py
@@ -2,6 +2,8 @@
 import subprocess
 from typing import Mapping, Set, Union
 
+from retrying import retry
+
 from helm.common.optional_dependencies import handle_module_not_found_error
 
 try:
@@ -67,6 +69,11 @@ def submit_slurm_job(command: str, slurm_args: Mapping[str, Union[str, int]]) ->
     return slurm.sbatch(command)
 
 
+@retry(
+    wait_incrementing_start=5 * 1000,  # 5 seconds
+    wait_incrementing_increment=5 * 1000,  # 5 seconds
+    stop_max_attempt_number=5,
+)
 def get_slurm_job_state(job_id: int) -> str:
     """Get the state of a Slurm job."""
     try:
@@ -80,6 +87,11 @@ def get_slurm_job_state(job_id: int) -> str:
     return search_result.group(1)
 
 
+@retry(
+    wait_incrementing_start=5 * 1000,  # 5 seconds
+    wait_incrementing_increment=5 * 1000,  # 5 seconds
+    stop_max_attempt_number=5,
+)
 def cancel_slurm_job(job_id: int) -> None:
     """Cancel a Slurm job."""
     try:
diff --git a/src/helm/benchmark/slurm_runner.py b/src/helm/benchmark/slurm_runner.py
index 09cde008b5..9a1293c25a 100644
--- a/src/helm/benchmark/slurm_runner.py
+++ b/src/helm/benchmark/slurm_runner.py
@@ -150,6 +150,9 @@ def run_all(self, run_specs: List[RunSpec]):
         # Info for all worker Slurm jobs
         run_name_to_slurm_job_info: Dict[str, _SlurmJobInfo] = {}
 
+        # Location to persist the info for all worker Slurm jobs
+        worker_slurm_jobs_path = os.path.join(self.slurm_base_dir, "worker_slurm_jobs.json")
+
         # Callback for cleaning up worker Slurm jobs
         def cancel_all_jobs():
             """Cancels all submitted worker Slurm jobs that are in a non-terminal state."""
@@ -159,6 +162,11 @@ def cancel_all_jobs():
                     if slurm_job_info.state not in TERMINAL_SLURM_JOB_STATES:
                         hlog(f"Cancelling worker Slurm job run {run_name} with Slurm job ID {slurm_job_info.id}")
                         cancel_slurm_job(slurm_job_info.id)
+                        slurm_job_info.state = SlurmJobState.CANCELLED
+            run_name_to_slurm_job_info_json = to_json(run_name_to_slurm_job_info)
+            hlog(f"Worker Slurm jobs: {run_name_to_slurm_job_info_json}")
+            hlog(f"Writing worker Slurm job states to {worker_slurm_jobs_path}")
+            write(file_path=worker_slurm_jobs_path, content=run_name_to_slurm_job_info_json)
 
         try:
             # Monitor submitted Slurm jobs for RunSpecs until an exit condition is triggered.
@@ -190,7 +198,6 @@ def cancel_all_jobs():
                     for slurm_job_info in run_name_to_slurm_job_info.values():
                         if slurm_job_info.state not in TERMINAL_SLURM_JOB_STATES:
                             slurm_job_info.state = get_slurm_job_state(slurm_job_info.id)
-                    worker_slurm_jobs_path = os.path.join(self.slurm_base_dir, "worker_slurm_jobs.json")
                     run_name_to_slurm_job_info_json = to_json(run_name_to_slurm_job_info)
                     hlog(f"Worker Slurm jobs: {run_name_to_slurm_job_info_json}")
                     hlog(f"Writing worker Slurm job states to {worker_slurm_jobs_path}")
diff --git a/src/helm/benchmark/static/benchmarking.js b/src/helm/benchmark/static/benchmarking.js
index 4f5382a94c..e353681e69 100644
--- a/src/helm/benchmark/static/benchmarking.js
+++ b/src/helm/benchmark/static/benchmarking.js
@@ -92,7 +92,7 @@ $(function () {
     $table.append($header);
 
     schema.run_groups.forEach((group) => {
-      if (group.category) {
+      if (group.category && group.category !== "Scenarios") {
         return;
       }
       const href = groupUrl(group.name);
@@ -1595,8 +1595,7 @@ $(function () {
     window.SUITE = urlParams.suite;
   }
 
-  const schemaPromise = $.get("schema.yaml", {}, (response) => {
-    const raw = jsyaml.load(response);
+  const schemaPromise = $.getJSON(schemaJsonUrl(), {}, (raw) => {
     console.log("schema", raw);
     schema = new Schema(raw);
   });
diff --git a/src/helm/benchmark/static/contamination.yaml b/src/helm/benchmark/static/contamination.yaml
index c51bb9d4e2..4cd6d83e22 100644
--- a/src/helm/benchmark/static/contamination.yaml
+++ b/src/helm/benchmark/static/contamination.yaml
@@ -44,6 +44,7 @@ points:
     - anthropic/stanford-online-all-v4-s3
     - anthropic/claude-v1.3
     - anthropic/claude-instant-v1
+    - anthropic/claude-instant-1.2
     groups:
     - the_pile
     level: strong
@@ -83,7 +84,6 @@ points:
     - openai/code-davinci-002
     - openai/code-davinci-001
     - openai/code-cushman-001
-    - openai/chat-gpt
     groups:
     - natural_qa_closedbook
     - natural_qa_openbook_longans
diff --git a/src/helm/benchmark/static/images/organizations/together.png b/src/helm/benchmark/static/images/organizations/together.png
index 76ffc350f9..28644a53b3 100644
Binary files a/src/helm/benchmark/static/images/organizations/together.png and b/src/helm/benchmark/static/images/organizations/together.png differ
diff --git a/src/helm/benchmark/static/json-urls.js b/src/helm/benchmark/static/json-urls.js
index db9632671c..92ddf4c355 100644
--- a/src/helm/benchmark/static/json-urls.js
+++ b/src/helm/benchmark/static/json-urls.js
@@ -8,6 +8,10 @@ function versionBaseUrl() {
   }
 }
 
+function schemaJsonUrl() {
+  return `${versionBaseUrl()}/schema.json`;
+}
+
 function summaryJsonUrl() {
   return `${versionBaseUrl()}/summary.json`;
 }
diff --git a/src/helm/benchmark/static/schema.yaml b/src/helm/benchmark/static/schema_classic.yaml
similarity index 93%
rename from src/helm/benchmark/static/schema.yaml
rename to src/helm/benchmark/static/schema_classic.yaml
index 0edf2eff69..bacf13789d 100644
--- a/src/helm/benchmark/static/schema.yaml
+++ b/src/helm/benchmark/static/schema_classic.yaml
@@ -101,6 +101,12 @@ models:
     creator_organization: Anthropic
     access: limited
     release_date: 2023-07-11
+  - name: anthropic/claude-2.1
+    display_name: Anthropic Claude 2.1
+    description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
+    creator_organization: Anthropic
+    access: limited
+    release_date: 2023-11-21
   - name: anthropic/claude-v1.3
     display_name: Anthropic Claude v1.3
     description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
@@ -113,6 +119,12 @@ models:
     creator_organization: Anthropic
     access: limited
     release_date: 2023-03-17
+  - name: anthropic/claude-instant-1.2
+    display_name: Anthropic Claude Instant 1.2
+    description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
+    creator_organization: Anthropic
+    access: limited
+    release_date: 2023-08-09
 
   # Berkeley
   - name: together/koala-13b
@@ -186,6 +198,36 @@ models:
     access: open
     num_parameters: 1500000000
 
+  # HuggingFaceM4
+  - name: HuggingFaceM4/idefics-9b
+    display_name: IDEFICS (9B)
+    description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+    creator_organization: HuggingFace
+    access: open
+    num_parameters: 9000000000
+    release_date: 2023-08-22
+  - name: HuggingFaceM4/idefics-9b-instruct
+    display_name: IDEFICS instruct (9B)
+    description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+    creator_organization: HuggingFace
+    access: open
+    num_parameters: 9000000000
+    release_date: 2023-08-22
+  - name: HuggingFaceM4/idefics-80b
+    display_name: IDEFICS (80B)
+    description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+    creator_organization: HuggingFace
+    access: open
+    num_parameters: 80000000000
+    release_date: 2023-08-22
+  - name: HuggingFaceM4/idefics-80b-instruct
+    display_name: IDEFICS instruct (80B)
+    description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+    creator_organization: HuggingFace
+    access: open
+    num_parameters: 80000000000
+    release_date: 2023-08-22
+
   # Cerebras Systems
   - name: together/cerebras-gpt-6.7b
     display_name: Cerebras GPT (6.7B)
@@ -261,6 +303,18 @@ models:
     access: limited
     num_parameters: 52400000000
     release_date: 2022-11-08
+  - name: cohere/command
+    display_name: Cohere Command
+    description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
+    creator_organization: Cohere
+    access: limited
+    release_date: 2023-09-29
+  - name: cohere/command-light
+    display_name: Cohere Command Light
+    description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
+    creator_organization: Cohere
+    access: limited
+    release_date: 2023-09-29
 
   # Databricks
   - name: databricks/dolly-v2-3b
@@ -356,7 +410,6 @@ models:
     access: open
     num_parameters: 11000000000
     release_date: 2019-10-23
-
   - name: together/ul2
     display_name: UL2 (20B)
     description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
@@ -364,19 +417,48 @@ models:
     access: open
     num_parameters: 20000000000
     release_date: 2022-05-10
-
   - name: together/flan-t5-xxl
     display_name: Flan-T5 (11B)
     description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
     creator_organization: Google
     access: open
-
   - name: google/palm
     display_name: PaLM (540B)
     description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
     creator_organization: Google
     access: closed
     todo: true
+  ## PaLM 2
+  - name: google/text-bison@001
+    display_name: PaLM-2 (Bison)
+    description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+  - name: google/text-bison-32k
+    display_name: PaLM-2 (Bison)
+    description: The best value PaLM model with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+  - name: google/text-unicorn@001
+    display_name: PaLM-2 (Unicorn)
+    description: The largest model in PaLM family. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-11-30 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+  - name: google/code-bison@001
+    display_name: Codey PaLM-2 (Bison)
+    description: A model fine-tuned to generate code based on a natural language description of the desired code. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
+  - name: google/code-bison-32k
+    display_name: Codey PaLM-2 (Bison)
+    description: Codey with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
 
   # HazyResearch
   - name: together/h3-2.7b
@@ -560,6 +642,22 @@ models:
     num_parameters: 13000000000
     release_date: 2023-06-22
 
+  # 01.AI
+  - name: 01-ai/yi-6b
+    display_name: Yi (6B)
+    description: The Yi models are large language models trained from scratch by developers at 01.AI.
+    creator_organization: 01.AI
+    access: open
+    num_parameters: 6000000000
+    release_date: 2023-11-02
+  - name: 01-ai/yi-34b
+    display_name: Yi (34B)
+    description: The Yi models are large language models trained from scratch by developers at 01.AI.
+    creator_organization: 01.AI
+    access: open
+    num_parameters: 34000000000
+    release_date: 2023-11-02
+
   # Mistral AI
   - name: mistralai/mistral-7b-v0.1
     display_name: Mistral v0.1 (7B)
@@ -715,6 +813,12 @@ models:
     creator_organization: OpenAI
     access: limited
     release_date: 2023-06-13
+  - name: openai/gpt-4-1106-preview
+    display_name: gpt-4-1106-preview
+    description: GPT-4 Turbo (preview) is a large multimodal model that is optimized for chat but works well for traditional completions tasks. The model is cheaper and faster than the original GPT-4 model. Preview snapshot from November 6, 2023.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-11-06
 
   # Together
   - name: together/Together-gpt-JT-6B-v1
@@ -885,7 +989,6 @@ models:
     access: limited
     num_parameters: 5000000000
     release_date: 2022-10-13
-    todo: true
   - name: writer/palmyra-large
     display_name: Palmyra Large (20B)
     description: Palmyra Large (20B)
@@ -893,7 +996,6 @@ models:
     access: limited
     num_parameters: 20000000000
     release_date: 2022-12-23
-    todo: true
   - name: writer/palmyra-instruct-30
     display_name: InstructPalmyra (30B)
     description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
@@ -901,7 +1003,6 @@ models:
     access: limited
     num_parameters: 30000000000
     release_date: 2023-02-16
-    todo: true
   - name: writer/palmyra-e
     display_name: Palmyra E (30B)
     description: Palmyra E (30B)
@@ -909,7 +1010,6 @@ models:
     access: limited
     num_parameters: 30000000000
     release_date: 2023-03-03
-    todo: true
   - name: writer/silk-road
     display_name: Silk Road (35B)
     description: Silk Road (35B)
@@ -917,7 +1017,6 @@ models:
     access: limited
     num_parameters: 35000000000
     release_date: 2023-04-13
-    todo: true
   - name: writer/palmyra-x
     display_name: Palmyra X (43B)
     description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
@@ -925,7 +1024,27 @@ models:
     access: limited
     num_parameters: 43000000000
     release_date: 2023-06-11
-    todo: true
+  - name: writer/palmyra-x-v2
+    display_name: Palmyra X V2 (33B)
+    description: Palmyra-X V2 (33B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. The pre-training data types (more than 2 trillion tokens) are diverse and cover a wide range of areas. FlashAttention-2 is used.
+    creator_organization: Writer
+    access: limited
+    num_parameters: 33000000000
+    release_date: 2023-12-01
+  - name: writer/palmyra-x-v3
+    display_name: Palmyra X V3 (72B)
+    description: Palmyra-X V3 (72B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. It is trained via unsupervised learning and DPO and uses multiquery attention.
+    creator_organization: Writer
+    access: limited
+    num_parameters: 72000000000
+    release_date: 2023-12-01
+  - name: writer/palmyra-x-32k
+    display_name: Palmyra X-32K (33B)
+    description: Palmyra-X-32K (33B parameters) is a Transformer-based model, which is trained on large-scale pre-training data. The pre-training data types are diverse and cover a wide range of areas. These data types are used in conjunction with the alignment mechanism to extend the context window.
+    creator_organization: Writer
+    access: limited
+    num_parameters: 33000000000
+    release_date: 2023-12-01
 
   # Yandex
   - name: together/yalm
@@ -1207,10 +1326,15 @@ metrics:
     lower_is_better: false
   - name: math_equiv_chain_of_thought
     display_name: Equivalent (chain of thought)
-    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thoughts prompting.
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
     lower_is_better: false
   - name: exact_match_indicator
-    display_name: Exact match (up to specified indicator)
+    display_name: Exact match (final)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+    lower_is_better: false
+  - name: final_number_exact_match
+    display_name: Exact match (final number)
     short_display_name: EM
     description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
     lower_is_better: false
@@ -1602,7 +1726,7 @@ metric_groups:
         split: ${main_split}
 
   - name: calibration_detailed
-    display_name: Calibration
+    display_name: Calibration (Detailed)
     description: Measures how calibrated the model is (how meaningful its uncertainty estimates are).
     metrics:
       - name: max_prob
@@ -1633,7 +1757,7 @@ metric_groups:
 
   # TODO: Add other robustness perturbations
   - name: robustness_detailed
-    display_name: Robustness
+    display_name: Robustness (Detailed)
     description: Measures how robust the model is to invariances.
     metrics:
       - name: ${main_name}
@@ -1652,7 +1776,7 @@ metric_groups:
 
   # TODO: Add other fairness perturbations
   - name: fairness_detailed
-    display_name: Fairness
+    display_name: Fairness (Detailed)
     description: Measures how fair the model is.
     metrics:
       - name: ${main_name}
@@ -1690,7 +1814,7 @@ metric_groups:
       split: ${main_split}
 
   - name: efficiency_detailed
-    display_name: Efficiency
+    display_name: Efficiency (Detailed)
     description: The efficiency of the model across both training and inference.
     metrics:
       - name: inference_runtime
@@ -2767,8 +2891,8 @@ run_groups:
       language: synthetic
 
   - name: math_chain_of_thought
-    display_name: MATH (chain-of-thoughts)
-    description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
+    display_name: MATH (chain-of-thought)
+    description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).
     metric_groups:
       - accuracy
       - efficiency
@@ -2818,6 +2942,23 @@ run_groups:
       when: n/a
       language: synthetic
 
+  - name: legalbench
+    display_name: LegalBench
+    description: LegalBench is a large collaboratively constructed benchmark of legal reasoning. Five representative tasks are included here. See [(Guha et al., 2023)](https://arxiv.org/abs/2308.11462) for more details.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: "text classification"
+      what: "fact patterns, questions, and legal documents"
+      who: "lawyers"
+      when: n/a
+      language: English
+
   - name: legal_support
     display_name: LegalSupport
     description: Scenario introduced in this work to measure fine-grained legal reasoning through reverse entailment.
@@ -2852,6 +2993,40 @@ run_groups:
       when: n/a
       language: synthetic
 
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: wmt_14
+    display_name: WMT 2014
+    description: WMT 2014 is a collection of machine translation datasets.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: bleu_4
+      main_split: test
+    taxonomy:
+      task: machine translation
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: lextreme
     display_name: LEXTREME
     description: A Multilingual Legal Benchmark for Natural Language Understanding
@@ -3534,7 +3709,7 @@ run_groups:
 
   - name: cleva_mathematical_reasoning
     display_name: CLEVA (Chinese) mathematical reasoning
-    description: "Scenario that tests models' mathematical reasoning ability with chain-of-thoughts style reasoning. It contains a math word problem solving subtask."
+    description: "Scenario that tests models' mathematical reasoning ability with chain-of-thought style reasoning. It contains a math word problem solving subtask."
     metric_groups:
       - cleva_mathematical_reasoning_metrics
       - general_information
diff --git a/src/helm/benchmark/static/schema_lite.yaml b/src/helm/benchmark/static/schema_lite.yaml
new file mode 100644
index 0000000000..ec6abd6dab
--- /dev/null
+++ b/src/helm/benchmark/static/schema_lite.yaml
@@ -0,0 +1,1049 @@
+---
+############################################################
+models:
+  # Anthropic
+  - name: anthropic/claude-2.0
+    display_name: Anthropic Claude 2.0
+    description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
+    creator_organization: Anthropic
+    access: limited
+    release_date: 2023-07-11
+  - name: anthropic/claude-2.1
+    display_name: Anthropic Claude 2.1
+    description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
+    creator_organization: Anthropic
+    access: limited
+    release_date: 2023-11-21
+  - name: anthropic/claude-v1.3
+    display_name: Anthropic Claude v1.3
+    description: A model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
+    creator_organization: Anthropic
+    access: limited
+    release_date: 2023-03-17
+  - name: anthropic/claude-instant-1.2
+    display_name: Anthropic Claude Instant 1.2
+    description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
+    creator_organization: Anthropic
+    access: limited
+    release_date: 2023-08-09
+
+  # Cohere
+  - name: cohere/command
+    display_name: Cohere Command
+    description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
+    creator_organization: Cohere
+    access: limited
+    release_date: 2023-09-29
+  - name: cohere/command-light
+    display_name: Cohere Command Light
+    description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
+    creator_organization: Cohere
+    access: limited
+    release_date: 2023-09-29
+
+  # Meta
+  - name: meta/llama-65b
+    display_name: LLaMA (65B)
+    description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
+    creator_organization: Meta
+    access: open
+    num_parameters: 65000000000
+    release_date: 2023-02-24
+  - name: meta/llama-2-7b
+    display_name: Llama 2 (7B)
+    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
+    creator_organization: Meta
+    access: open
+    num_parameters: 7000000000
+    release_date: 2023-07-18
+  - name: meta/llama-2-13b
+    display_name: Llama 2 (13B)
+    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
+    creator_organization: Meta
+    access: open
+    num_parameters: 13000000000
+    release_date: 2023-07-18
+  - name: meta/llama-2-70b
+    display_name: Llama 2 (70B)
+    description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
+    creator_organization: Meta
+    access: open
+    num_parameters: 70000000000
+    release_date: 2023-07-18
+
+  # 01.AI
+  - name: 01-ai/yi-6b
+    display_name: Yi (6B)
+    description: The Yi models are large language models trained from scratch by developers at 01.AI.
+    creator_organization: 01.AI
+    access: open
+    num_parameters: 6000000000
+    release_date: 2023-11-02
+  - name: 01-ai/yi-34b
+    display_name: Yi (34B)
+    description: The Yi models are large language models trained from scratch by developers at 01.AI.
+    creator_organization: 01.AI
+    access: open
+    num_parameters: 34000000000
+    release_date: 2023-11-02
+
+  # Mistral AI
+  - name: mistralai/mistral-7b-v0.1
+    display_name: Mistral v0.1 (7B)
+    description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
+    creator_organization: Mistral AI
+    access: open
+    num_parameters: 7300000000
+    release_date: 2023-09-27
+
+  - name: mistralai/mixtral-8x7b-32kseqlen
+    display_name: Mixtral (8x7B 32K seqlen)
+    description: Mistral AI's mixture-of-experts model ([tweet](https://twitter.com/MistralAI/status/1733150512395038967)).
+    creator_organization: Mistral AI
+    access: open
+    num_parameters: 56000000000
+    release_date: 2023-12-08
+
+  # OpenAI
+  - name: openai/text-davinci-003
+    display_name: GPT-3.5 (text-davinci-003)
+    description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+    creator_organization: OpenAI
+    access: limited
+    num_parameters: 175000000000
+    release_date: 2022-11-28
+  - name: openai/text-davinci-002
+    display_name: GPT-3.5 (text-davinci-002)
+    description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+    creator_organization: OpenAI
+    access: limited
+    num_parameters: 175000000000
+    release_date: 2022-01-27
+  - name: openai/gpt-4-0613
+    display_name: GPT-4 (0613)
+    description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-06-13
+  - name: openai/gpt-4-1106-preview
+    display_name: GPT-4 Turbo (1106 preview)
+    description: GPT-4 Turbo (preview) is a large multimodal model that is optimized for chat but works well for traditional completions tasks. The model is cheaper and faster than the original GPT-4 model. Preview snapshot from November 6, 2023.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-11-06
+  - name: openai/gpt-3.5-turbo-0613
+    display_name: GPT-3.5 Turbo (0613)
+    description: Sibling model of text-davinci-003 that is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
+    creator_organization: OpenAI
+    access: limited
+    release_date: 2023-06-13
+
+  # Writer
+  - name: writer/palmyra-x-v2
+    display_name: Palmyra X V2 (33B)
+    description: Palmyra-X V2 (33B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. The pre-training data types (more than 2 trillion tokens) are diverse and cover a wide range of areas. FlashAttention-2 is used.
+    creator_organization: Writer
+    access: limited
+    num_parameters: 33000000000
+    release_date: 2023-12-01
+  - name: writer/palmyra-x-v3
+    display_name: Palmyra X V3 (72B)
+    description: Palmyra-X V3 (72B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. It is trained via unsupervised learning and DPO and uses multiquery attention.
+    creator_organization: Writer
+    access: limited
+    num_parameters: 72000000000
+    release_date: 2023-12-01
+
+  # Google
+  - name: google/text-bison@001
+    display_name: PaLM-2 (Bison)
+    description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+  - name: google/text-unicorn@001
+    display_name: PaLM-2 (Unicorn)
+    description: The largest model in PaLM family. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization: Google
+    access: limited
+    release_date: 2023-11-30 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+
+  # TII UAE
+  - name: tiiuae/falcon-7b
+    display_name: Falcon (7B)
+    description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
+    creator_organization: TII UAE
+    access: open
+    num_parameters: 7000000000
+    release_date: 2023-03-15
+  - name: tiiuae/falcon-40b
+    display_name: Falcon (40B)
+    description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
+    creator_organization: TII UAE
+    access: open
+    num_parameters: 40000000000
+    release_date: 2023-05-25
+
+  # AI21 Labs
+  - name: ai21/j2-jumbo
+    display_name: Jurassic-2 Jumbo (178B)
+    description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 178000000000
+    release_date: 2023-03-09
+  - name: ai21/j2-grande
+    display_name: Jurassic-2 Grande (17B)
+    description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+    creator_organization: AI21 Labs
+    access: limited
+    num_parameters: 17000000000
+    release_date: 2023-03-09
+
+  # Aleph Alpha
+  - name: AlephAlpha/luminous-base
+    display_name: Luminous Base (13B)
+    description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+    creator_organization: Aleph Alpha
+    access: limited
+    num_parameters: 13000000000
+    # TODO: get exact release date
+    release_date: 2022-01-01
+  - name: AlephAlpha/luminous-extended
+    display_name: Luminous Extended (30B)
+    description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+    creator_organization: Aleph Alpha
+    access: limited
+    num_parameters: 30000000000
+    release_date: 2022-01-01
+  - name: AlephAlpha/luminous-supreme
+    display_name: Luminous Supreme (70B)
+    description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+    creator_organization: Aleph Alpha
+    access: limited
+    num_parameters: 70000000000
+    release_date: 2022-01-01
+
+############################################################
+adapter:
+  - name: method
+    description: The high-level strategy for converting instances into a prompt for the language model.
+    values:
+      - name: generation
+        description: Given the input, the model generates the output free-form.
+      - name: multiple_choice_joint
+        description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
+      - name: multiple_choice_separate_original
+        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
+      - name: multiple_choice_separate_calibrated
+        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
+      - name: language_modeling
+        description: Given the input, the model assigns the sequence a probability.
+  - name: instructions
+    description: The description of the task that is included at the very beginning of the prompt.
+  - name: global_prefix
+    description: The string that is prepended to the prompt.
+  - name: global_suffix
+    description: The string that is appended to the prompt.
+  - name: instance_prefix
+    description: The string that is included before each instance (e.g., '\n\n').
+  - name: input_prefix
+    description: The string that is included before each input (e.g., 'Question:').
+  - name: input_suffix
+    description: The string that is included after each input (e.g., '\n').
+  - name: reference_prefix
+    description: The string that is included before each reference (for multiple-choice questions).
+  - name: reference_suffix
+    description: The string that is included after each reference (for multiple-choice questions).
+  - name: output_prefix
+    description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
+  - name: output_suffix
+    description: The string that is included after the correct answer/predicted output (e.g., '\n').
+  - name: substitutions
+    description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
+  - name: max_train_instances
+    description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
+  - name: max_eval_instances
+    description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
+  - name: num_outputs
+    description: Maximum number of possible outputs to generate by sampling multiple outputs.
+  - name: num_train_trials
+    description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
+  - name: sample_train
+    description: If true, randomly sample N training examples; if false, select N consecutive training examples
+  - name: model
+    description: DEPRECATED. Name of the language model (<creator_organization>/<model name>) to send requests to.
+  - name: model_deployment
+    description: Name of the language model (<host_organization>/<model name>) to send requests to.
+  - name: temperature
+    description: Temperature parameter used in generation.
+  - name: max_tokens
+    description: Maximum number of tokens to generate.
+  - name: stop_sequences
+    description: List of sequences, where we stop generation if we encounter any of them.
+  - name: random
+    description: Random seed (string), which guarantees reproducibility.
+  - name: multi_label
+    description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
+
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: exact_match@5
+    display_name: Exact match @5
+    short_display_name: EM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match@5
+    display_name: Quasi-exact match @5
+    short_display_name: EM@5
+    description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match@5
+    display_name: Prefix exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match@5
+    display_name: Prefix quasi-exact match @5
+    short_display_name: PEM@5
+    description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: logprob
+    display_name: Log probability
+    short_display_name: Logprob
+    description: Predicted output's average log probability (input's log prob for language modeling).
+    lower_is_better: false
+  - name: logprob_per_byte
+    display_name: Log probability / byte
+    short_display_name: Logprob/byte
+    description: Predicted output's average log probability normalized by the number of bytes.
+    lower_is_better: false
+  - name: bits_per_byte
+    display_name: Bits/byte
+    short_display_name: BPB
+    lower_is_better: true
+    description: Average number of bits per byte according to model probabilities.
+  - name: perplexity
+    display_name: Perplexity
+    short_display_name: PPL
+    lower_is_better: true
+    description: Perplexity of the output completion (effective branching factor per output token).
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_set_match
+    display_name: F1 (set match)
+    short_display_name: F1
+    description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: classification_macro_f1
+    display_name: Macro-F1
+    description: Population-level macro-averaged F1 score.
+    lower_is_better: false
+  - name: classification_micro_f1
+    display_name: Micro-F1
+    description: Population-level micro-averaged F1 score.
+    lower_is_better: false
+  - name: absolute_value_difference
+    display_name: Absolute difference
+    short_display_name: Diff.
+    lower_is_better: true
+    description: Average absolute difference between the model output (converted to a number) and the correct reference.
+  - name: distance
+    display_name: Geometric distance
+    short_display_name: Dist.
+    lower_is_better: true
+    description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
+  - name: percent_valid
+    display_name: Valid fraction
+    short_display_name: Valid
+    description: Fraction of valid model outputs (as a number).
+    lower_is_better: false
+  - name: NDCG@10
+    display_name: NDCG@10
+    description: Normalized discounted cumulative gain at 10 in information retrieval.
+    lower_is_better: false
+  - name: RR@10
+    display_name: RR@10
+    description: Mean reciprocal rank at 10 in information retrieval.
+    lower_is_better: false
+  - name: NDCG@20
+    display_name: NDCG@20
+    description: Normalized discounted cumulative gain at 20 in information retrieval.
+    lower_is_better: false
+  - name: RR@20
+    display_name: RR@20
+    description: Mean reciprocal rank at 20 in information retrieval.
+    lower_is_better: false
+  - name: math_equiv
+    display_name: Equivalent
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference.
+    lower_is_better: false
+  - name: math_equiv_chain_of_thought
+    display_name: Equivalent (CoT)
+    description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
+    lower_is_better: false
+  - name: exact_match_indicator
+    display_name: Exact match (final)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+    lower_is_better: false
+  - name: final_number_exact_match
+    display_name: Exact match (final number)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
+    lower_is_better: false
+  - name: exact_set_match
+    display_name: Exact match (as sets)
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
+    lower_is_better: false
+  - name: iou_set_match
+    display_name: Intersection over union (as sets)
+    short_display_name: IoU
+    description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
+    lower_is_better: false
+
+  # Summarization metrics
+  - name: summac
+    display_name: SummaC
+    description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+    lower_is_better: false
+  - name: QAFactEval
+    display_name: QAFactEval
+    description: Faithfulness scores based on the QAFactEval method of [Fabbri et al. (2022)](https://aclanthology.org/2022.naacl-main.187/).
+    lower_is_better: false
+  - name: summarization_coverage
+    display_name: Coverage
+    description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+  - name: summarization_density
+    display_name: Density
+    description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+  - name: summarization_compression
+    display_name: Compression
+    description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+  - name: BERTScore-P
+    display_name: BERTScore (P)
+    description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+    lower_is_better: false
+  - name: BERTScore-R
+    display_name: BERTScore (R)
+    description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+    lower_is_better: false
+  - name: BERTScore-F
+    display_name: BERTScore (F1)
+    description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+    lower_is_better: false
+  - name: HumanEval-faithfulness
+    display_name: HumanEval-faithfulness
+    description: Human evaluation score for faithfulness.
+    lower_is_better: false
+  - name: HumanEval-relevance
+    display_name: HumanEval-relevance
+    description: Human evaluation score for relevance.
+    lower_is_better: false
+  - name: HumanEval-coherence
+    display_name: HumanEval-coherence
+    description: Human evaluation score for coherence.
+    lower_is_better: false
+
+  #  Code metrics
+  - name: code_eval_acc
+    display_name: Correctness
+    short_display_name: Correctness
+    description: Fraction of instances that the model output evaluates to the correct answer.
+    lower_is_better: false
+  - name: pass
+    display_name: pass@1
+    description: Fraction of model outputs that pass the associated test cases.
+    lower_is_better: false
+  - name: test_avg
+    display_name: 'Avg. # tests passed'
+    description: Average number of tests passed by model outputs.
+    lower_is_better: false
+  - name: strict_acc
+    display_name: Strict correctness
+    short_display_name: Strict correctness
+    description: Fraction of model outputs that pass all associated test cases.
+    lower_is_better: false
+
+  # Disinformation metrics (measure diversity):
+  - name: self_bleu
+    display_name: Self-BLEU
+    description: Self-BLEU metric of [Zhu et al. (2018)](https://arxiv.org/pdf/1802.01886.pdf) as a measure of diversity in model generations.
+  - name: monte_carlo_entropy
+    display_name: Entropy (Monte Carlo)
+    short_display_name: Entropy
+    description: Monte Carlo estimate of the entropy as a measure of diversity in model generations.
+  # Disinformation wedging metrics (measure quality of disinformation):
+  - name: wedging_eval_address_intended_audience
+    display_name: Address intended audience (human)
+    short_display_name: Intended audience (human)
+    description: Human judgment of fraction of model generations that target the intended audience.
+  - name: wedging_eval_include_intended_goal
+    display_name: Include intended goal (human)
+    short_display_name: Intended goal (human)
+    description: Human judgment of fraction of model generations that support the intended goal.
+  - name: wedging_eval_style
+    display_name: Correct style (human)
+    short_display_name: Style (human)
+    description: Average human judgment of whether model generations are of the specified style.
+  - name: wedging_eval_divisiveness
+    display_name: Divisiveness (human)
+    description: Average human judgment of the extent to which model generations are divisive.
+  - name: wedging_eval_toxicity
+    display_name: Toxicity (human)
+    description: Average human judgment of the extent to which model generations are toxic.
+  # Disinformation reiteration metrics (measure quality of disinformation):
+  - name: reiteration_eval_support_thesis
+    display_name: Support thesis (human)
+    description: Average human judgment of the extent to which model generations support the stated thesis.
+  - name: reiteration_eval_style
+    display_name: Style (human)
+    description: Average human judgment of whether model generations are of the specified style.
+
+  # Copyright metrics (measure copying/overlap):
+  - name: longest_common_prefix_length
+    display_name: Longest common prefix length
+    short_display_name: LCP
+    lower_is_better: true
+    description: Average length of longest common prefix between model generation and reference.
+  - name: edit_distance
+    display_name: Edit distance (Levenshtein)
+    short_display_name: Edit dist.
+    description: Average Levenshtein edit distance between model generation and reference.
+    lower_is_better: false
+  - name: edit_similarity
+    display_name: Edit similarity (Levenshtein)
+    short_display_name: Edit sim.
+    lower_is_better: true
+    description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
+
+  # Bias metrics:
+  - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
+    display_name: Stereotypical associations (race, profession)
+    short_display_name: Stereotypes (race)
+    lower_is_better: true
+    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+  - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
+    display_name: Stereotypical associations (race, adjectives)
+    short_display_name: Stereotypes (race)
+    lower_is_better: true
+    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
+    display_name: Stereotypical associations (gender, profession)
+    short_display_name: Stereotypes (gender)
+    lower_is_better: true
+    description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+  - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
+    display_name: Stereotypical associations (gender, adjectives)
+    short_display_name: Stereotypes (gender)
+    lower_is_better: true
+    description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bias_metric:mode=representation,demographic_category=race
+    display_name: Demographic representation (race)
+    short_display_name: Representation (race)
+    lower_is_better: true
+    description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
+  - name: bias_metric:mode=representation,demographic_category=gender
+    display_name: Demographic representation (gender)
+    short_display_name: Representation (gender)
+    lower_is_better: true
+    description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bbq_metric_ambiguous_bias
+    display_name: BBQ (ambiguous)
+    lower_is_better: true
+    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
+  - name: bbq_metric_unambiguous_bias
+    display_name: BBQ (unambiguous)
+    lower_is_better: true
+    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+  # Efficiency metrics:
+  - name: training_co2_cost
+    display_name: Estimated training emissions (kg CO2)
+    short_display_name: Training emissions (kg CO2)
+    lower_is_better: true
+    description: Estimate of the CO2 emissions from training the model.
+  - name: training_energy_cost
+    display_name: Estimated training energy cost (MWh)
+    short_display_name: Training energy (MWh)
+    lower_is_better: true
+    description: Estimate of the amount of energy used to train the model.
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+  - name: inference_idealized_runtime
+    display_name: Idealized inference runtime (s)
+    short_display_name: Idealized inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+  - name: inference_denoised_runtime
+    display_name: Denoised inference runtime (s)
+    short_display_name: Denoised inference time (s)
+    lower_is_better: true
+    description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+  - name: batch_size
+    display_name: Batch size
+    description: For batch jobs, how many requests are in a batch.
+
+  # Calibration metrics:
+  - name: ece_1_bin
+    display_name: 1-bin expected calibration error
+    short_display_name: ECE (1-bin)
+    lower_is_better: true
+    description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
+  - name: max_prob
+    display_name: Max prob
+    description: Model's average confidence in its prediction (only computed for classification tasks)
+    lower_is_better: false
+  - name: ece_10_bin
+    display_name: 10-bin expected calibration error
+    short_display_name: ECE (10-bin)
+    lower_is_better: true
+    description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
+  - name: platt_ece_1_bin
+    display_name: 1-bin expected calibration error (after Platt scaling)
+    short_display_name: Platt-scaled ECE (1-bin)
+    lower_is_better: true
+    description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+  - name: platt_ece_10_bin
+    display_name: 10-bin Expected Calibration Error (after Platt scaling)
+    short_display_name: Platt-scaled ECE (10-bin)
+    lower_is_better: true
+    description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+  - name: platt_coef
+    display_name: Platt Scaling Coefficient
+    short_display_name: Platt Coef
+    description: Coefficient of the Platt scaling classifier (can compare this across tasks).
+    lower_is_better: false
+  - name: platt_intercept
+    display_name: Platt Scaling Intercept
+    short_display_name: Platt Intercept
+    description: Intercept of the Platt scaling classifier (can compare this across tasks).
+    lower_is_better: false
+  - name: selective_cov_acc_area
+    display_name: Selective coverage-accuracy area
+    short_display_name: Selective Acc
+    description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
+    lower_is_better: false
+  - name: selective_acc@10
+    display_name: Accuracy at 10% coverage
+    short_display_name: Acc@10%
+    description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
+    lower_is_better: false
+
+  # CLEVA (Chinese) metrics:
+  # Accuracy metrics (Chinese)
+  - name: chinese_ibleu
+    display_name: Chinese iBLEU
+    short_display_name: iBLEU (Chinese)
+    description: A special BLEU score [(Sun and Zhou, 2012)](https://aclanthology.org/P12-2008.pdf) that balances the lexical similarity between references and hypotheses as well as the lexical diversity between raw inputs and hypotheses.
+    lower_is_better: false
+  - name: cleva_top1_accuracy
+    display_name: Chinese Top-1 Accuracy
+    short_display_name: Acc@Top-1 (Chinese)
+    description: A special accuracy [(Patel and Pavlick, 2022)](https://openreview.net/pdf?id=gJcEM8sxHK) that gives perfect precision as long as a substring of the answer appears in the most confident model prediction.
+    lower_is_better: false
+  - name: cleva_machine_translation_bleu
+    display_name: BLEU
+    short_display_name: BLEU
+    description: BLEU score based on [Post, (2018)](https://aclanthology.org/W18-6319/).
+    lower_is_better: false
+  - name: chinese_rouge_2
+    display_name: Chinese ROUGE-2 score
+    short_display_name: ROUGE-2 (Chinese)
+    description: ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese tokenizer that segments Chinese strings by character.
+    lower_is_better: false
+  - name: chinese_bleu_1
+    display_name: Chinese BLEU-1 score
+    short_display_name: BLEU-1 (Chinese)
+    description: BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a Chinese tokenizer that segments Chinese strings by character.
+    lower_is_better: false
+  - name: cleva_math_result_match
+    display_name: CLEVA Math Exact Match
+    short_display_name: EM (Math)
+    description: Exact match that considers only the last math expression (numbers and fractions) in the model's prediction.
+    lower_is_better: false
+  # CLEVA Chinese bias, copyright and toxicity metrics share the same name as the original HELM metrics
+
+############################################################
+perturbations:
+  - name: robustness
+    display_name: Robustness
+    description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
+  - name: fairness
+    display_name: Fairness
+    description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
+  - name: typos
+    display_name: Typos
+    description: >
+      Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
+      performance between perturbed and unperturbed versions.
+  - name: synonym
+    display_name: Synonyms
+    description: >
+      Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
+      worst-case performance between perturbed and unperturbed versions.
+  - name: dialect
+    display_name: SAE -> AAE
+    short_display_name: Dialect
+    description: >
+      Deterministically substitutes SAE words in input with AAE counterparts using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+  - name: race
+    display_name: First names by race (White -> Black)
+    short_display_name: Race
+    description: >
+      Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+  - name: gender
+    display_name: Pronouns by gender (Male -> Female)
+    short_display_name: Gender
+    description: >
+      Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
+      performance between perturbed and unperturbed versions.
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+    - name: inference_runtime
+      split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    metrics:
+    - name: num_instances
+      split: ${main_split}
+    - name: num_train_instances
+      split: ${main_split}
+    - name: prompt_truncated
+      split: ${main_split}
+    - name: num_prompt_tokens
+      split: ${main_split}
+    - name: num_output_tokens
+      split: ${main_split}
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core scenarios
+    description: The scenarios where we evaluate all the models.
+    category: All scenarios
+    subgroups:
+      - narrative_qa
+      - natural_qa_openbook_longans
+      - natural_qa_closedbook
+      - openbookqa
+      - mmlu
+      - math_chain_of_thought
+      - gsm
+      - legalbench
+      - med_qa
+      - wmt_14
+
+  - name: narrative_qa
+    display_name: NarrativeQA
+    description: The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: short-answer question answering
+      what: passages are books and movie scripts, questions are unknown
+      who: annotators from summaries
+      when: "2018"
+      language: English
+
+  - name: natural_qa_closedbook
+    display_name: NaturalQuestions (closed-book)
+    description: The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: valid
+    taxonomy:
+      task: short-answer question answering
+      what: passages from Wikipedia, questions from search queries
+      who: web users
+      when: 2010s
+      language: English
+
+  - name: natural_qa_openbook_longans
+    display_name: NaturalQuestions (open-book)
+    description: The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: valid
+    taxonomy:
+      task: short-answer question answering
+      what: passages from Wikipedia, questions from search queries
+      who: web users
+      when: 2010s
+      language: English
+
+  - name: openbookqa
+    display_name: OpenbookQA
+    description: The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: elementary science
+      who: Amazon Mechanical Turk workers
+      when: "2018"
+      language: English
+
+  - name: mmlu
+    display_name: MMLU (Massive Multitask Language Understanding)
+    short_display_name: MMLU
+    description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: math, science, history, etc.
+      who: various online sources
+      when: before 2021
+      language: English
+
+  - name: gsm
+    display_name: GSM8K (Grade School Math)
+    short_display_name: GSM8K
+    description: The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: final_number_exact_match
+      main_split: test
+    taxonomy:
+      task: numeric answer question answering
+      what: grade school math word problems
+      who: contractors on Upwork and Surge AI
+      when: "2021"
+      language: English
+
+  - name: math_chain_of_thought
+    display_name: MATH
+    description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: math_equiv_chain_of_thought
+      main_split: test
+    taxonomy:
+      task: numeric answer question answering
+      what: math competitions (AMC, AIME, etc.)
+      who: problem setters
+      when: before 2021
+      language: synthetic
+
+  - name: legalbench
+    display_name: LegalBench
+    description: LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al., 2023)](https://arxiv.org/pdf/2308.11462.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: public legal and administrative documents, manually constructed questions
+      who: lawyers
+      when: before 2023
+      language: English
+
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: US medical licensing exams
+      who: problem setters
+      when: before 2020
+      language: English
+
+  - name: wmt_14
+    display_name: WMT 2014
+    description: WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: bleu_4
+      main_split: test
+    taxonomy:
+      task: machine translation
+      what: multilingual sentences
+      who: Europarl, news, Common Crawl, etc.
+      when: before 2014
+      language: English, French, Czech, etc.
diff --git a/src/helm/benchmark/test_model_deployment_definition.py b/src/helm/benchmark/test_model_deployment_definition.py
new file mode 100644
index 0000000000..2ba904e083
--- /dev/null
+++ b/src/helm/benchmark/test_model_deployment_definition.py
@@ -0,0 +1,92 @@
+"""Temporary test for preserving invariants during the model / tokenizer / window service refactor.
+
+Delete this after the refactor is done."""
+
+from typing import Optional
+
+import pytest
+from tempfile import TemporaryDirectory
+from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
+from helm.benchmark.model_deployment_registry import (
+    get_model_deployment,
+    ModelDeployment,
+    ALL_MODEL_DEPLOYMENTS,
+)
+from helm.benchmark.model_metadata_registry import get_model_metadata, ModelMetadata
+from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config
+from helm.benchmark.window_services.test_utils import get_tokenizer_service
+from helm.proxy.clients.client import Client
+from helm.proxy.tokenizers.tokenizer import Tokenizer
+from helm.benchmark.window_services.window_service import WindowService
+
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.proxy.clients.auto_client import AutoClient
+from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer
+
+
+# HACK: This looks like it should be done in a setup_class()
+# for the test below, but apparently pytest evaluates the parametrize
+# arguments before running setup_class().
+# ALL_MODEL_DEPLOYMENTS would then still be empty and no test would run,
+# so the built-in configs are registered here at import time instead.
+register_builtin_configs_from_helm_package()
+
+INT_MAX: int = 2**31 - 1  # Largest signed 32-bit integer.
+
+
+class TestModelProperties:
+    """Checks that every registered model deployment is consistent with its window service."""
+    @pytest.mark.parametrize("deployment_name", [deployment.name for deployment in ALL_MODEL_DEPLOYMENTS])
+    def test_models_has_window_service(self, deployment_name: str):
+        """Load the client, window service and tokenizer for one deployment.
+
+        Also checks the settings duplicated across the deployment, tokenizer config and window service."""
+        with TemporaryDirectory() as tmpdir:
+            auto_client = AutoClient({}, tmpdir, "")
+            auto_tokenizer = AutoTokenizer({}, tmpdir, "")
+            tokenizer_service = get_tokenizer_service(tmpdir)
+
+            # Loading the TokenizerConfig and ModelMetadata ensures that they are valid.
+            deployment: ModelDeployment = get_model_deployment(deployment_name)
+            tokenizer_name: str = deployment.tokenizer_name if deployment.tokenizer_name else deployment_name
+            tokenizer_config: Optional[TokenizerConfig] = get_tokenizer_config(tokenizer_name)
+            assert tokenizer_config is not None
+            model: ModelMetadata = get_model_metadata(
+                deployment.model_name if deployment.model_name else deployment_name
+            )
+
+            # Can't test lit-gpt client because it requires manual dependencies
+            if "lit-gpt" in model.name:
+                return
+
+            # Can't test Llama 2 because it requires Hugging Face credentials
+            if "llama-2-" in model.name:
+                return
+
+            # Can't test Vertex AI because it requires Google credentials
+            if "text-bison" in model.name or "text-unicorn" in model.name:
+                return
+
+            # Loads the model, window service and tokenizer
+            # which checks that the model, window service and tokenizer are all valid,
+            # and that no Client, WindowService or Tokenizer are crashing.
+            client: Client = auto_client._get_client(deployment_name)  # noqa: F841
+            window_service: WindowService = WindowServiceFactory.get_window_service(deployment_name, tokenizer_service)
+            tokenizer: Tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name)  # noqa: F841
+
+            # Verify that the parameters that are redundant between the ModelDeployment, Tokenizer and the
+            # WindowService are the same.
+            assert window_service.tokenizer_name == deployment.tokenizer_name
+            assert window_service.max_sequence_length == deployment.max_sequence_length
+            # Resolve each fallback *before* comparing: `assert a == b if c else d` parses as
+            # `assert ((a == b) if c else d)`, which only checks that `d` is truthy whenever the
+            # deployment leaves the optional limit unset.
+            expected_max_request_length: int = deployment.max_request_length or deployment.max_sequence_length
+            assert window_service.max_request_length == expected_max_request_length
+            expected_max_total_length: int = deployment.max_sequence_and_generated_tokens_length or INT_MAX
+            assert window_service.max_sequence_and_generated_tokens_length == expected_max_total_length
+            assert tokenizer_config.end_of_text_token == window_service.end_of_text_token
+            assert tokenizer_config.prefix_token == window_service.prefix_token
+
+            # TODO: Add a dummy tokenize, decode and make_request request to each client/tokenizer
+            # Do this once we have a proper Cache for tests.
diff --git a/src/helm/benchmark/test_model_properties.py b/src/helm/benchmark/test_model_properties.py
index 64cf3e0db9..e610332da3 100644
--- a/src/helm/benchmark/test_model_properties.py
+++ b/src/helm/benchmark/test_model_properties.py
@@ -5,14 +5,14 @@
 import pytest
 from tempfile import TemporaryDirectory
 from typing import Any
-from helm.benchmark.config_registry import register_helm_configurations
+from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
 from helm.benchmark.model_deployment_registry import (
     ClientSpec,
     ModelDeployment,
     WindowServiceSpec,
-    ALL_MODEL_DEPLOYMENTS,
+    get_model_deployment,
 )
-from helm.benchmark.model_metadata_registry import ModelMetadata
+from helm.benchmark.model_metadata_registry import get_model_metadata, ModelMetadata
 from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec
 from helm.benchmark.window_services.test_utils import get_tokenizer_service
 
@@ -23,6 +23,12 @@
 
 
 _BUILT_IN_TOKENIZER_CONFIGS = [
+    TokenizerConfig(
+        name="simple/model1",
+        tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.simple_tokenizer.SimpleTokenizer"),
+        end_of_text_token="</s>",
+        prefix_token="<s>",
+    ),
     TokenizerConfig(
         name="neurips/local",
         tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer"),
@@ -59,6 +65,18 @@
         end_of_text_token="<|endoftext|>",
         prefix_token="<|endoftext|>",
     ),
+    TokenizerConfig(
+        name="microsoft/gpt2",
+        tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+        end_of_text_token="<|endoftext|>",
+        prefix_token="<<",
+    ),
+    TokenizerConfig(
+        name="writer/gpt2",
+        tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+        end_of_text_token="",
+        prefix_token="",
+    ),
     TokenizerConfig(
         name="anthropic/claude",
         tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer"),
@@ -149,6 +167,12 @@
         end_of_text_token="</s>",
         prefix_token="",
     ),
+    TokenizerConfig(
+        name="google/mt5-base",
+        tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+        end_of_text_token="</s>",
+        prefix_token="",
+    ),
     TokenizerConfig(
         name="facebook/opt-66b",
         tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
@@ -212,7 +236,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.http_model_client.HTTPModelClient"),
         tokenizer_name="neurips/local",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.http_model_window_service.HTTPModelWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -285,7 +309,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"),
         tokenizer_name="AlephAlpha/luminous-base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.luminous_window_service.LuminousBaseWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -294,7 +318,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"),
         tokenizer_name="AlephAlpha/luminous-extended",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.luminous_window_service.LuminousExtendedWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -303,7 +327,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"),
         tokenizer_name="AlephAlpha/luminous-supreme",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.luminous_window_service.LuminousSupremeWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -312,7 +336,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicLegacyClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.anthropic_window_service.LegacyAnthropicWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8192,
     ),
@@ -321,7 +345,17 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
         tokenizer_name="anthropic/claude",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=8000,
+        max_sequence_and_generated_tokens_length=9016,
+    ),
+    ModelDeployment(
+        name="anthropic/claude-2.1",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
+        tokenizer_name="anthropic/claude",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8000,
         max_sequence_and_generated_tokens_length=9016,
@@ -331,7 +365,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
         tokenizer_name="anthropic/claude",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8000,
         max_sequence_and_generated_tokens_length=9016,
@@ -341,7 +375,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
         tokenizer_name="anthropic/claude",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8000,
         max_sequence_and_generated_tokens_length=9016,
@@ -351,7 +385,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="bigscience/bloom",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.bloom_window_service.BloomWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -470,7 +504,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-j-6B",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -480,7 +514,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -490,7 +524,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -500,7 +534,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -510,7 +544,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -520,7 +554,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -530,70 +564,70 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/llama-13b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/llama-30b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/llama-65b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/llama-2-7b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="meta-llama/Llama-2-7b-hf",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=4096,
+        max_sequence_length=4094,
     ),
     ModelDeployment(
         name="together/llama-2-13b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="meta-llama/Llama-2-7b-hf",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=4096,
+        max_sequence_length=4094,
     ),
     ModelDeployment(
         name="together/llama-2-70b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="meta-llama/Llama-2-7b-hf",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=4096,
+        max_sequence_length=4094,
     ),
     ModelDeployment(
         name="together/alpaca-7b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -602,7 +636,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -611,7 +645,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="hf-internal-testing/llama-tokenizer",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -620,7 +654,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="mistralai/Mistral-7B-v0.1",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4095,
     ),
@@ -629,7 +663,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -639,7 +673,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -649,7 +683,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -659,7 +693,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -669,43 +703,43 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="tiiuae/falcon-7b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/falcon-7b-instruct",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="tiiuae/falcon-7b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/falcon-40b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="tiiuae/falcon-7b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="together/falcon-40b-instruct",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="tiiuae/falcon-7b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
-        max_sequence_length=2048,
+        max_sequence_length=2047,
     ),
     ModelDeployment(
         name="gooseai/gpt-neo-20b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.goose_ai_client.GooseAIClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -715,7 +749,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.goose_ai_client.GooseAIClient"),
         tokenizer_name="EleutherAI/gpt-j-6B",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -725,7 +759,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=1024,
         max_request_length=1025,
@@ -735,7 +769,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
         tokenizer_name="EleutherAI/gpt-j-6B",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -745,7 +779,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
         tokenizer_name="bigcode/santacoder",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.santacoder_window_service.SantaCoderWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -754,7 +788,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
         tokenizer_name="bigcode/starcoder",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.starcoder_window_service.StarCoderWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8192,
     ),
@@ -785,12 +819,62 @@
         ),
         max_sequence_length=511,
     ),
+    ModelDeployment(
+        name="google/text-bison@001",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.vertexai_client.VertexAIClient"),
+        tokenizer_name="google/mt5-base",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=6000,
+        max_sequence_and_generated_tokens_length=7000,
+    ),
+    ModelDeployment(
+        name="google/text-bison-32k",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.vertexai_client.VertexAIClient"),
+        tokenizer_name="google/mt5-base",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=32000,
+        max_sequence_and_generated_tokens_length=32000,
+    ),
+    ModelDeployment(
+        name="google/text-unicorn@001",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.vertexai_client.VertexAIClient"),
+        tokenizer_name="google/mt5-base",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=6000,
+        max_sequence_and_generated_tokens_length=7000,
+    ),
+    ModelDeployment(
+        name="google/code-bison@001",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.vertexai_client.VertexAIClient"),
+        tokenizer_name="google/mt5-base",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=6000,
+        max_sequence_and_generated_tokens_length=7000,
+    ),
+    ModelDeployment(
+        name="google/code-bison-32k",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.vertexai_client.VertexAIClient"),
+        tokenizer_name="google/mt5-base",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=32000,
+        max_sequence_and_generated_tokens_length=32000,
+    ),
     ModelDeployment(
         name="together/h3-2.7b",
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=1024,
         max_request_length=1025,
@@ -800,7 +884,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="facebook/opt-66b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -810,7 +894,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="facebook/opt-66b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -820,7 +904,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="facebook/opt-66b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -830,7 +914,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="facebook/opt-66b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -838,9 +922,9 @@
     ModelDeployment(
         name="microsoft/TNLGv2_530B",
         client_spec=ClientSpec(class_name="helm.proxy.clients.microsoft_client.MicrosoftClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="microsoft/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2047,
         max_request_length=2048,
@@ -848,9 +932,9 @@
     ModelDeployment(
         name="microsoft/TNLGv2_7B",
         client_spec=ClientSpec(class_name="helm.proxy.clients.microsoft_client.MicrosoftClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="microsoft/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2047,
         max_request_length=2048,
@@ -860,7 +944,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -870,7 +954,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -880,7 +964,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -890,7 +974,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -900,7 +984,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4000,
         max_request_length=4001,
@@ -910,7 +994,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4000,
         max_request_length=4001,
@@ -920,7 +1004,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -930,7 +1014,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -940,7 +1024,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -950,7 +1034,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -960,7 +1044,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4000,
         max_request_length=4001,
@@ -970,7 +1054,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -980,17 +1064,27 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
     ),
+    ModelDeployment(
+        name="openai/gpt-4-1106-preview",
+        client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+        tokenizer_name="openai/cl100k_base",
+        window_service_spec=WindowServiceSpec(
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
+        ),
+        max_sequence_length=128000,
+        max_request_length=128001,
+    ),
     ModelDeployment(
         name="openai/gpt-4-0314",
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8192,
         max_request_length=8193,
@@ -1000,7 +1094,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=32768,
         max_request_length=32769,
@@ -1010,7 +1104,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8192,
         max_request_length=8193,
@@ -1020,7 +1114,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=32768,
         max_request_length=32769,
@@ -1030,7 +1124,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4000,
         max_request_length=4001,
@@ -1040,7 +1134,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4000,
         max_request_length=4001,
@@ -1050,7 +1144,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="openai/cl100k_base",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.wider_openai_window_service.GPTTurbo16KWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=16000,
         max_request_length=16001,
@@ -1060,7 +1154,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1070,7 +1164,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1080,7 +1174,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1090,7 +1184,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1100,7 +1194,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1110,7 +1204,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-j-6B",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1120,7 +1214,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1130,7 +1224,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1140,7 +1234,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1150,7 +1244,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1160,7 +1254,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1178,9 +1272,9 @@
     ModelDeployment(
         name="writer/palmyra-base",
         client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="writer/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_sequence_and_generated_tokens_length=2048,
@@ -1188,9 +1282,9 @@
     ModelDeployment(
         name="writer/palmyra-large",
         client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="writer/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_sequence_and_generated_tokens_length=2048,
@@ -1198,9 +1292,9 @@
     ModelDeployment(
         name="writer/palmyra-instruct-30",
         client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="writer/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_sequence_and_generated_tokens_length=2048,
@@ -1208,9 +1302,9 @@
     ModelDeployment(
         name="writer/palmyra-e",
         client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="writer/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_sequence_and_generated_tokens_length=2048,
@@ -1218,9 +1312,9 @@
     ModelDeployment(
         name="writer/silk-road",
         client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="writer/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8192,
         max_sequence_and_generated_tokens_length=8192,
@@ -1228,9 +1322,9 @@
     ModelDeployment(
         name="writer/palmyra-x",
         client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="writer/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=8192,
         max_sequence_and_generated_tokens_length=8192,
@@ -1245,22 +1339,12 @@
         max_sequence_length=2048,
         max_request_length=2049,
     ),
-    ModelDeployment(
-        name="google/palm",
-        client_spec=ClientSpec(class_name="helm.proxy.clients.google_client.GoogleClient"),
-        tokenizer_name="huggingface/gpt2",
-        window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-        ),
-        max_sequence_length=2048,
-        max_request_length=2049,
-    ),
     ModelDeployment(
         name="nvidia/megatron-gpt2",
         client_spec=ClientSpec(class_name="helm.proxy.clients.megatron_client.MegatronClient"),
         tokenizer_name="huggingface/gpt2",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.megatron_window_service.MegatronWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=1024,
     ),
@@ -1269,7 +1353,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1279,7 +1363,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1289,7 +1373,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
         max_request_length=2049,
@@ -1299,7 +1383,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4096,
         max_request_length=4097,
@@ -1309,7 +1393,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
         tokenizer_name="EleutherAI/gpt-neox-20b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=4096,
         max_request_length=4097,
@@ -1320,7 +1404,7 @@
         model_name=None,
         tokenizer_name="lightningai/lit-gpt",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowService", args={}
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService", args={}
         ),
         max_sequence_length=2048,
         max_request_length=None,
@@ -1331,7 +1415,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
         tokenizer_name="HuggingFaceM4/idefics-9b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -1340,7 +1424,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
         tokenizer_name="HuggingFaceM4/idefics-9b-instruct",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -1349,7 +1433,7 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
         tokenizer_name="HuggingFaceM4/idefics-80b",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
@@ -1358,19 +1442,18 @@
         client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
         tokenizer_name="HuggingFaceM4/idefics-80b-instruct",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
     ),
     ModelDeployment(
         name="simple/model1",
         client_spec=ClientSpec(class_name="helm.proxy.clients.simple_client.SimpleClient"),
-        tokenizer_name="huggingface/gpt2",
+        tokenizer_name="simple/model1",
         window_service_spec=WindowServiceSpec(
-            class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+            class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService"
         ),
         max_sequence_length=2048,
-        max_request_length=2049,
     ),
 ]
 
@@ -1387,12 +1470,12 @@ def _full_class_name(obj: Any) -> str:
 # before running the setup_class().
 # Therefore ALL_MODEL_DEPLOYMENTS is empty and no test would be run,
 # so we need to do this here.
-register_helm_configurations()
+register_builtin_configs_from_helm_package()
 
 
 class TestModelProperties:
-    @pytest.mark.parametrize("model", ALL_MODEL_DEPLOYMENTS)
-    def test_models_has_window_service(self, model: ModelMetadata):
+    @pytest.mark.parametrize("deployment_name", [deployment.name for deployment in _BUILT_IN_MODEL_DEPLOYMENTS])
+    def test_models_has_window_service(self, deployment_name: str):
         auto_client = AutoClient(defaultdict(str), "", "")
         auto_tokenizer = AutoTokenizer(defaultdict(str), "", "")
         model_deployments = {
@@ -1403,6 +1486,8 @@ def test_models_has_window_service(self, model: ModelMetadata):
         }
         with TemporaryDirectory() as tmpdir:
             tokenizer_service = get_tokenizer_service(tmpdir)
+            deployment: ModelDeployment = get_model_deployment(deployment_name)
+            model: ModelMetadata = get_model_metadata(deployment.model_name or deployment_name)
             # Can't test lit-gpt client because it requires manual dependencies
             if "lit-gpt" in model.name:
                 return
@@ -1411,7 +1496,10 @@ def test_models_has_window_service(self, model: ModelMetadata):
             if "llama-2-" in model.name:
                 return
 
-            deployment_name: str = model.name
+            # Can't test Vertex AI because it requires Google credentials
+            if "text-bison" in model.name or "text-unicorn" in model.name:
+                return
+
             client = auto_client._get_client(deployment_name)
             window_service = WindowServiceFactory.get_window_service(deployment_name, tokenizer_service)
             tokenizer_name = window_service.tokenizer_name
@@ -1437,7 +1525,7 @@ def test_models_has_window_service(self, model: ModelMetadata):
             )
 
             model_deployment = ModelDeployment(
-                name=model.name,
+                name=deployment_name,
                 client_spec=ClientSpec(class_name=client_class_name),
                 tokenizer_name=tokenizer_name,
                 window_service_spec=WindowServiceSpec(class_name=window_service_class_name),
@@ -1454,12 +1542,29 @@ def test_models_has_window_service(self, model: ModelMetadata):
             # NOTE: To generate the _BUILT_IN_MODEL_DEPLOYMENT and _BUILT_IN_TOKENIZER_CONFIGS lists above,
             # print tokenizer_config and model_deployment here.
 
-            assert model_deployments[model.name] == model_deployment
-            # PalmyraWindowService overrides the huggingface/gpt2 tokenizer with different special tokens,
+            # Cannot directly compare model_deployment and deployment because some fields (for example,
+            # deprecated) are not specified in this file and will therefore have different values.
+            # Instead, compare all the fields defined here.
+            test_model_deployment: ModelDeployment = model_deployments[deployment_name]
+            assert model_deployment.name == test_model_deployment.name
+            assert model_deployment.client_spec.class_name == test_model_deployment.client_spec.class_name
+            assert model_deployment.tokenizer_name == test_model_deployment.tokenizer_name
+            if model_deployment.window_service_spec:
+                assert test_model_deployment.window_service_spec
+                assert (
+                    model_deployment.window_service_spec.class_name
+                    == test_model_deployment.window_service_spec.class_name
+                )
+            else:
+                assert not test_model_deployment.window_service_spec
+            assert model_deployment.max_sequence_length == test_model_deployment.max_sequence_length
+            assert model_deployment.max_request_length == test_model_deployment.max_request_length
+            assert (
+                model_deployment.max_sequence_and_generated_tokens_length
+                == test_model_deployment.max_sequence_and_generated_tokens_length
+            )
+            # DefaultWindowService overrides the huggingface/gpt2 tokenizer with different special tokens,
             # so there are currently two tokenizers named huggingface/gpt2
-            # TODO: Give PalmyraWindowService's tokenizer a different name e.g. writer/palmyra
+            # TODO: Give DefaultWindowService's tokenizer a different name e.g. writer/palmyra
             if tokenizer_name != "huggingface/gpt2":
                 assert tokenizer_configs[tokenizer_name] == tokenizer_config
-
-    def test_num_models_available(self):
-        assert len(ALL_MODEL_DEPLOYMENTS) == 119
diff --git a/src/helm/benchmark/tokenizer_config_registry.py b/src/helm/benchmark/tokenizer_config_registry.py
index 732cd38bd1..359fd1584c 100644
--- a/src/helm/benchmark/tokenizer_config_registry.py
+++ b/src/helm/benchmark/tokenizer_config_registry.py
@@ -1,18 +1,11 @@
-import os
 from typing import Dict, Optional, List
 from dataclasses import dataclass
-import importlib_resources as resources
 
 import cattrs
 import yaml
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.object_spec import ObjectSpec
-from helm.benchmark.model_metadata_registry import CONFIG_PACKAGE
-
-
-TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
-TOKENIZERS_REGISTERED: bool = False
 
 
 class TokenizerSpec(ObjectSpec):
@@ -59,20 +52,5 @@ def register_tokenizer_configs_from_path(path: str) -> None:
         register_tokenizer_config(tokenizer_config)
 
 
-def maybe_register_tokenizer_configs_from_base_path(path: str) -> None:
-    """Register tokenizer configs from yaml file if the path exists."""
-    if os.path.exists(path):
-        register_tokenizer_configs_from_path(path)
-
-
 def get_tokenizer_config(name: str) -> Optional[TokenizerConfig]:
-    register_tokenizers_if_not_already_registered()
     return TOKENIZER_NAME_TO_CONFIG.get(name)
-
-
-def register_tokenizers_if_not_already_registered() -> None:
-    global TOKENIZERS_REGISTERED
-    if not TOKENIZERS_REGISTERED:
-        path: str = resources.files(CONFIG_PACKAGE).joinpath(TOKENIZER_CONFIGS_FILE)
-        maybe_register_tokenizer_configs_from_base_path(path)
-        TOKENIZERS_REGISTERED = True
diff --git a/src/helm/benchmark/window_services/anthropic_window_service.py b/src/helm/benchmark/window_services/anthropic_window_service.py
deleted file mode 100644
index 4fe1417dcd..0000000000
--- a/src/helm/benchmark/window_services/anthropic_window_service.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from .gpt2_window_service import GPT2WindowService
-from .tokenizer_service import TokenizerService
-
-
-class LegacyAnthropicWindowService(GPT2WindowService):
-    """
-    This is the window service for the legacy Anthropic client.
-    Please consider using the new Anthropic client with the AnthropicWindowService.
-    Anthropic has their own tokenizer based on the GPT-2 tokenizer.
-    This tokenizer is not publicly available, so approximate with the GPT-2 tokenizer.
-    """
-
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of the Anthropic model."""
-        return 8192
-
-    @property
-    def max_request_length(self) -> int:
-        """
-        Return the max request length of the Anthropic model.
-        Anthropic does not include the start of sequence token.
-        """
-        return self.max_sequence_length
-
-
-class AnthropicWindowService(GPT2WindowService):
-    """
-    Anthropic has their own tokenizer.
-    """
-
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        Return the max sequence length of the Anthropic model.
-        While the limits seems to be 8192, we limit to 8000
-        according to Anthropic's recommendations.
-        See: https://console.anthropic.com/docs/prompt-design
-        """
-        return 8000
-
-    @property
-    def max_sequence_and_generated_tokens_length(self) -> int:
-        """
-        Return the max prompt length + max token length.
-        Anthropic is one of the rare models that has a limit on this.
-        The official limit seems to be 9192,but using scripts/compute_request_limits.py
-        we found that the limit is actually 9016.
-        """
-        return 9016
-
-    @property
-    def max_request_length(self) -> int:
-        """
-        Return the max request length of the Anthropic model.
-        Anthropic does not include the start of sequence token.
-        """
-        return self.max_sequence_length
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "anthropic/claude"
diff --git a/src/helm/benchmark/window_services/bloom_window_service.py b/src/helm/benchmark/window_services/bloom_window_service.py
deleted file mode 100644
index 32cf989644..0000000000
--- a/src/helm/benchmark/window_services/bloom_window_service.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class BloomWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        The model was trained with a sequence length of 2,048.
-        Source: https://huggingface.co/bigscience/bloom
-        """
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length."""
-        return self.max_sequence_length + 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        return self.end_of_text_token
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "bigscience/bloom"
diff --git a/src/helm/benchmark/window_services/default_window_service.py b/src/helm/benchmark/window_services/default_window_service.py
index eddc056183..20e464a382 100644
--- a/src/helm/benchmark/window_services/default_window_service.py
+++ b/src/helm/benchmark/window_services/default_window_service.py
@@ -1,4 +1,5 @@
 from typing import Optional
+from .window_service import INT_MAX
 from .local_window_service import LocalWindowService
 from .tokenizer_service import TokenizerService
 
@@ -10,6 +11,7 @@ def __init__(
         tokenizer_name: str,
         max_sequence_length: int,
         max_request_length: Optional[int] = None,
+        max_sequence_and_generated_tokens_length: Optional[int] = None,
         end_of_text_token: Optional[str] = None,
         prefix_token: Optional[str] = None,
     ):
@@ -17,6 +19,7 @@ def __init__(
         self._tokenizer_name = tokenizer_name
         self._max_sequence_length = max_sequence_length
         self._max_request_length = max_request_length or max_sequence_length
+        self._max_sequence_and_generated_tokens_length = max_sequence_and_generated_tokens_length or INT_MAX
         self._end_of_text_token = end_of_text_token or ""
         self._prefix_token = prefix_token or ""
 
@@ -32,6 +35,10 @@ def max_sequence_length(self) -> int:
     def max_request_length(self) -> int:
         return self._max_request_length
 
+    @property
+    def max_sequence_and_generated_tokens_length(self) -> int:
+        return self._max_sequence_and_generated_tokens_length
+
     @property
     def end_of_text_token(self) -> str:
         return self._end_of_text_token
diff --git a/src/helm/benchmark/window_services/gptj_window_service.py b/src/helm/benchmark/window_services/gptj_window_service.py
deleted file mode 100644
index 4df768e5b7..0000000000
--- a/src/helm/benchmark/window_services/gptj_window_service.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class GPTJWindowService(LocalWindowService):
-    """
-    The same tokenizer as GPT-2, but with an additional 143 tokens
-    (source: https://huggingface.co/docs/transformers/model_doc/gptj).
-    """
-
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length."""
-        return self.max_sequence_length + 1
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        # Not a typo: Named "gpt-j-6B" instead of "gpt-j-6b" in Hugging Face
-        return "EleutherAI/gpt-j-6B"
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "<|endoftext|>"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token for models is the same as the end of text token."""
-        return self.end_of_text_token
diff --git a/src/helm/benchmark/window_services/gptneox_window_service.py b/src/helm/benchmark/window_services/gptneox_window_service.py
deleted file mode 100644
index 3f654dbb9e..0000000000
--- a/src/helm/benchmark/window_services/gptneox_window_service.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class GPTNeoXWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length."""
-        return self.max_sequence_length + 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "EleutherAI/gpt-neox-20b"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        return self.end_of_text_token
-
-
-class StableLMAlphaWindowService(GPTNeoXWindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # The context length for these models is 4096 tokens.
-        # See: https://github.com/Stability-AI/StableLM#stablelm-alpha
-        return 4096
diff --git a/src/helm/benchmark/window_services/http_model_window_service.py b/src/helm/benchmark/window_services/http_model_window_service.py
deleted file mode 100644
index d84308b370..0000000000
--- a/src/helm/benchmark/window_services/http_model_window_service.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-# TODO: Remove Once we have configurable model names since this hardcodes the tokenizer name
-class HTTPModelWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "neurips/local"
-
-    @property
-    def prefix_token(self) -> str:
-        return self.end_of_text_token
diff --git a/src/helm/benchmark/window_services/huggingface_window_service.py b/src/helm/benchmark/window_services/huggingface_window_service.py
index 3bb545e677..dccb0268ed 100644
--- a/src/helm/benchmark/window_services/huggingface_window_service.py
+++ b/src/helm/benchmark/window_services/huggingface_window_service.py
@@ -10,29 +10,29 @@ def __init__(
         service: TokenizerService,
         tokenizer_name: str,
         pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
         max_sequence_length: Optional[int] = None,
         max_request_length: Optional[int] = None,
         end_of_text_token: Optional[str] = None,
         prefix_token: Optional[str] = None,
+        **kwargs
     ):
         super().__init__(service)
         self._tokenizer_name = tokenizer_name
-        tokenizer = HuggingFaceTokenizer.get_tokenizer(
-            helm_tokenizer_name=tokenizer_name,
-            pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
-            revision=revision,
-        )
         # Override max_sequence_length, max_request_length, end_of_text_token
         # and prefix_token if provided as an argument.
         # Otherwise, auto-infer them from the Hugging Face tokenizer.
         #
         # Note that many Hugging Face tokenizers have incorrect sequence lengths,
         # so it is recommended to set this manually.
-        self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
-        self._max_request_length = max_request_length or self._max_sequence_length
-        self._end_of_text_token = end_of_text_token or tokenizer.eos_token or ""
-        self._prefix_token = prefix_token or tokenizer.bos_token or ""
+        with HuggingFaceTokenizer.get_tokenizer(
+            helm_tokenizer_name=tokenizer_name,
+            pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
+            **kwargs,
+        ) as tokenizer:
+            self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
+            self._max_request_length = max_request_length or self._max_sequence_length
+            self._end_of_text_token = end_of_text_token or tokenizer.eos_token or ""
+            self._prefix_token = prefix_token or tokenizer.bos_token or ""
 
     @property
     def tokenizer_name(self) -> str:
diff --git a/src/helm/benchmark/window_services/lit_gpt_window_service.py b/src/helm/benchmark/window_services/lit_gpt_window_service.py
deleted file mode 100644
index 4d670a38e6..0000000000
--- a/src/helm/benchmark/window_services/lit_gpt_window_service.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class LitGPTWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "lightningai/lit-gpt"
-
-    @property
-    def prefix_token(self) -> str:
-        return self.end_of_text_token
diff --git a/src/helm/benchmark/window_services/llama_window_service.py b/src/helm/benchmark/window_services/llama_window_service.py
deleted file mode 100644
index 586ce0d970..0000000000
--- a/src/helm/benchmark/window_services/llama_window_service.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService
-from helm.benchmark.window_services.tokenizer_service import TokenizerService
-
-
-class LlamaWindowService(HuggingFaceWindowService):
-    def __init__(self, service: TokenizerService):
-        # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
-        # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example
-        super().__init__(service, tokenizer_name="hf-internal-testing/llama-tokenizer")
-
-
-class Llama2WindowService(HuggingFaceWindowService):
-    # To use the Llama-2 tokenizer:
-    #
-    # 1. Accept the license agreement: https://ai.meta.com/resources/models-and-libraries/llama-downloads/
-    # 2. Request to access the Hugging Face repository: https://huggingface.co/meta-llama/Llama-2-7b
-    # 3. Run `huggingface-cli login`
-    #
-    # If you encounter the following error, complete the above steps and try again:
-    #
-    #     meta-llama/Llama-2-70b-hf is not a local folder and is not a valid model identifier listed on
-    #     'https://huggingface.co/models'
-    def __init__(self, service: TokenizerService):
-        super().__init__(service, "meta-llama/Llama-2-7b-hf", max_sequence_length=4096)
diff --git a/src/helm/benchmark/window_services/luminous_window_service.py b/src/helm/benchmark/window_services/luminous_window_service.py
deleted file mode 100644
index f450ecf5b1..0000000000
--- a/src/helm/benchmark/window_services/luminous_window_service.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from abc import abstractmethod
-
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class LuminousWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    @abstractmethod
-    def tokenizer_name(self) -> str:
-        """Each Luminous model has its own tokenizer."""
-        pass
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        From https://docs.aleph-alpha.com/api/complete, "the summed number of tokens of prompt
-        and maximum_tokens..may not exceed 2048 tokens." Confirmed it's 2048 for the Luminous
-        models currently available.
-        """
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        """
-        The end of text token.
-        TODO: Setting to empty string for now as echo is not supported.
-        """
-        return ""
-
-    @property
-    def prefix_token(self) -> str:
-        """
-        The prefix token.
-        """
-        return self.end_of_text_token
-
-
-class LuminousBaseWindowService(LuminousWindowService):
-    @property
-    def tokenizer_name(self) -> str:
-        return "AlephAlpha/luminous-base"
-
-
-class LuminousExtendedWindowService(LuminousWindowService):
-    @property
-    def tokenizer_name(self) -> str:
-        return "AlephAlpha/luminous-extended"
-
-
-class LuminousSupremeWindowService(LuminousWindowService):
-    @property
-    def tokenizer_name(self) -> str:
-        return "AlephAlpha/luminous-supreme"
-
-
-class LuminousWorldWindowService(LuminousWindowService):
-    @property
-    def tokenizer_name(self) -> str:
-        return "AlephAlpha/luminous-world"
diff --git a/src/helm/benchmark/window_services/megatron_window_service.py b/src/helm/benchmark/window_services/megatron_window_service.py
deleted file mode 100644
index 0a37943e87..0000000000
--- a/src/helm/benchmark/window_services/megatron_window_service.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from .gpt2_window_service import GPT2WindowService
-
-
-# NOTE: The only difference between this and GPT2WindowService is that
-# the request length is constrained to the sequence length.
-class MegatronWindowService(GPT2WindowService):
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length of GPT-2."""
-        return self.max_sequence_length
diff --git a/src/helm/benchmark/window_services/mt_nlg_window_service.py b/src/helm/benchmark/window_services/mt_nlg_window_service.py
deleted file mode 100644
index ab72c88734..0000000000
--- a/src/helm/benchmark/window_services/mt_nlg_window_service.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from .gpt2_window_service import GPT2WindowService
-from .tokenizer_service import TokenizerService
-
-
-class MTNLGWindowService(GPT2WindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        The max length of the model input. MT-NLG does not predict the logprob of the first
-        input token so `max_sequence_length` is one token shorter than `max_request_length`.
-        """
-        return self.max_request_length - 1
-
-    @property
-    def max_request_length(self) -> int:
-        """
-        The max request length for the MT-NLG models is 2048.
-        Source: https://github.com/microsoft/turing-academic-TNLG
-        """
-        return 2048
-
-    @property
-    def prefix_token(self) -> str:
-        return "<<"
diff --git a/src/helm/benchmark/window_services/no_decoding_window_service.py b/src/helm/benchmark/window_services/no_decoding_window_service.py
new file mode 100644
index 0000000000..9c7a91b4f7
--- /dev/null
+++ b/src/helm/benchmark/window_services/no_decoding_window_service.py
@@ -0,0 +1,32 @@
+from typing import List, Optional
+
+from helm.benchmark.window_services.window_service import EncodeResult
+from helm.benchmark.window_services.default_window_service import DefaultWindowService
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+    TokenizationToken,
+)
+
+
+class NoDecodingWindowService(DefaultWindowService):
+    """A window service for tokenizers that have an unimplemented decode() method.
+
+    This assumes that concatenating the tokens from the tokenize endpoint will result in the original string,
+    which does not hold for every tokenizer.
+
+    TODO(#2141): Come up with a more correct way of doing this."""
+
+    def encode(self, text: str, truncation: bool = False, max_length: Optional[int] = None) -> EncodeResult:
+        response: TokenizationRequestResult = self.service.tokenize(
+            TokenizationRequest(text, tokenizer=self.tokenizer_name, encode=False, truncation=truncation)
+        )
+        return EncodeResult(text=text, tokens=response.tokens[:max_length])
+
+    def decode(self, tokens: List[TokenizationToken], normalized_text: Optional[str] = None) -> str:
+        del normalized_text
+        token_strings = []
+        for token in tokens:
+            assert isinstance(token.value, str)
+            token_strings.append(token.value)
+        return "".join(token_strings)
diff --git a/src/helm/benchmark/window_services/openai_window_service.py b/src/helm/benchmark/window_services/openai_window_service.py
deleted file mode 100644
index dc354e542d..0000000000
--- a/src/helm/benchmark/window_services/openai_window_service.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from .gpt2_window_service import GPT2WindowService
-from .tokenizer_service import TokenizerService
-
-
-class OpenAIWindowService(GPT2WindowService):
-    def __init__(self, service: TokenizerService):
-        # OpenAI uses the same tokenizer for GPT-2 and GPT-3.
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of the OpenAI models (max length of model input)."""
-        return 2048
diff --git a/src/helm/benchmark/window_services/opt_window_service.py b/src/helm/benchmark/window_services/opt_window_service.py
deleted file mode 100644
index 49c964931a..0000000000
--- a/src/helm/benchmark/window_services/opt_window_service.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class OPTWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        The max length of the model input. The max sequence length for the OPT models is 2048.
-        Source: https://arxiv.org/pdf/2205.01068.pdf
-        """
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length."""
-        return self.max_sequence_length + 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        return self.end_of_text_token
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "facebook/opt-66b"
diff --git a/src/helm/benchmark/window_services/palmyra_window_service.py b/src/helm/benchmark/window_services/palmyra_window_service.py
deleted file mode 100644
index c4ce91c8c5..0000000000
--- a/src/helm/benchmark/window_services/palmyra_window_service.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class PalmyraWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def tokenizer_name(self) -> str:
-        """All Palmyra models use the same tokenizer."""
-        return "huggingface/gpt2"
-
-    @property
-    def max_sequence_length(self) -> int:
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def max_sequence_and_generated_tokens_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        """
-        The end of text token.
-        TODO: Setting to empty string for now as echo is not supported.
-        """
-        return ""
-
-    @property
-    def prefix_token(self) -> str:
-        """
-        The prefix token.
-        """
-        return self.end_of_text_token
-
-
-class LongerPalmyraWindowService(PalmyraWindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        return 8192
diff --git a/src/helm/benchmark/window_services/santacoder_window_service.py b/src/helm/benchmark/window_services/santacoder_window_service.py
deleted file mode 100644
index 45aa023346..0000000000
--- a/src/helm/benchmark/window_services/santacoder_window_service.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class SantaCoderWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        return 2048
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "bigcode/santacoder"
-
-    @property
-    def prefix_token(self) -> str:
-        return self.end_of_text_token
diff --git a/src/helm/benchmark/window_services/starcoder_window_service.py b/src/helm/benchmark/window_services/starcoder_window_service.py
deleted file mode 100644
index 1fe72569c6..0000000000
--- a/src/helm/benchmark/window_services/starcoder_window_service.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class StarCoderWindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        return 8192
-
-    @property
-    def max_request_length(self) -> int:
-        return self.max_sequence_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        return "bigcode/starcoder"
-
-    @property
-    def prefix_token(self) -> str:
-        return self.end_of_text_token
diff --git a/src/helm/benchmark/window_services/wider_openai_window_service.py b/src/helm/benchmark/window_services/wider_openai_window_service.py
deleted file mode 100644
index a21c593fe3..0000000000
--- a/src/helm/benchmark/window_services/wider_openai_window_service.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from .gpt2_window_service import GPT2WindowService
-
-
-class WiderOpenAIWindowService(GPT2WindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of the larger second-generation OpenAI models.
-
-        Source: https://platform.openai.com/docs/models"""
-        return 4000
-
-
-class OpenAIChatWindowService(WiderOpenAIWindowService):
-    @property
-    def tokenizer_name(self) -> str:
-        return "openai/cl100k_base"
-
-
-class GPTTurboWindowService(OpenAIChatWindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length for GPT-3.5 Turbo.
-
-        Source: https://platform.openai.com/docs/models"""
-        return 4000
-
-
-class GPTTurbo16KWindowService(OpenAIChatWindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length for GPT-3.5 Turbo.
-
-        Source: https://platform.openai.com/docs/models"""
-        return 16000
-
-
-class GPT4WindowService(OpenAIChatWindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length for GPT-4.
-
-        Source: https://platform.openai.com/docs/models"""
-        return 8192
-
-
-class GPT432KWindowService(OpenAIChatWindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length for GPT-4.
-
-        Source: https://platform.openai.com/docs/models"""
-        return 32768
diff --git a/src/helm/benchmark/window_services/window_service_factory.py b/src/helm/benchmark/window_services/window_service_factory.py
index e15bf72016..6389a95ae5 100644
--- a/src/helm/benchmark/window_services/window_service_factory.py
+++ b/src/helm/benchmark/window_services/window_service_factory.py
@@ -48,6 +48,7 @@ def get_window_service(model_deployment_name: str, service: TokenizerService) ->
                     "tokenizer_name": model_deployment.tokenizer_name,
                     "max_sequence_length": model_deployment.max_sequence_length,
                     "max_request_length": model_deployment.max_request_length,
+                    "max_sequence_and_generated_tokens_length": model_deployment.max_sequence_and_generated_tokens_length,  # noqa
                     "end_of_text_token": end_of_text_token,
                     "prefix_token": prefix_token,
                 },
diff --git a/src/helm/common/cache.py b/src/helm/common/cache.py
index 935ea8ddf3..8f882efe1e 100644
--- a/src/helm/common/cache.py
+++ b/src/helm/common/cache.py
@@ -1,19 +1,14 @@
-# mypy: check_untyped_defs = False
-from abc import abstractmethod
-import contextlib
+from collections import defaultdict
 from dataclasses import dataclass
+from typing import Dict, Callable, Generator, Optional, Tuple
 import json
-from typing import Dict, Callable, Generator, Iterable, Optional, Tuple
-from collections import defaultdict
-import sqlite3
 import threading
 
-from sqlitedict import SqliteDict
+import sqlite3
+
 from helm.common.general import hlog, htrack
+from helm.common.key_value_store import KeyValueStore, SqliteKeyValueStore
 from helm.proxy.retry import get_retry_decorator
-from bson.son import SON
-from bson.errors import InvalidDocument
-from pymongo import MongoClient, ReplaceOne
 
 try:
     from cPickle import loads
@@ -21,31 +16,19 @@
     from pickle import loads
 
 
-def request_to_key(request: Dict) -> str:
-    """Normalize a `request` into a `key` so that we can hash using it."""
-    return json.dumps(request, sort_keys=True)
-
-
-def key_to_request(key: str) -> Dict:
-    """Convert the normalized version to the request."""
-    return json.loads(key)
-
-
 def retry_if_write_failed(success: bool) -> bool:
     """Retries when the write fails."""
     return not success
 
 
 retry: Callable = get_retry_decorator(
-    "Write", max_attempts=10, wait_exponential_multiplier_seconds=2, retry_on_result=retry_if_write_failed
+    "Write", max_attempts=5, wait_exponential_multiplier_seconds=2, retry_on_result=retry_if_write_failed
 )
 
 
 class CacheConfig:
     """Configuration for a cache."""
 
-    pass
-
     @property
     def cache_stats_key(self) -> str:
         """The string key used by CacheStats to identify this cache."""
@@ -55,8 +38,6 @@ def cache_stats_key(self) -> str:
 class KeyValueStoreCacheConfig(CacheConfig):
     """Configuration for a cache backed by a key-value store."""
 
-    pass
-
 
 @dataclass(frozen=True)
 class SqliteCacheConfig(KeyValueStoreCacheConfig):
@@ -105,156 +86,6 @@ def cache_stats_key(self) -> str:
         return self.main.cache_stats_key
 
 
-class KeyValueStore(contextlib.AbstractContextManager):
-    """Key value store that persists writes."""
-
-    @property
-    def path(self):
-        return self._path
-
-    @abstractmethod
-    def contains(self, key: Dict) -> bool:
-        pass
-
-    @abstractmethod
-    def get(self, key: Dict) -> Optional[Dict]:
-        pass
-
-    @abstractmethod
-    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
-        pass
-
-    @abstractmethod
-    def put(self, key: Dict, value: Dict) -> None:
-        pass
-
-    @abstractmethod
-    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
-        pass
-
-    @abstractmethod
-    def remove(self, key: Dict) -> None:
-        pass
-
-
-class _SqliteKeyValueStore(KeyValueStore):
-    """Key value store backed by a SQLite file."""
-
-    def __init__(self, path: str):
-        self._sqlite_dict = SqliteDict(path)
-        super().__init__()
-
-    def __enter__(self) -> "_SqliteKeyValueStore":
-        self._sqlite_dict.__enter__()
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback) -> None:
-        self._sqlite_dict.__exit__(exc_type, exc_value, traceback)
-
-    def contains(self, key: Dict) -> bool:
-        return request_to_key(key) in self._sqlite_dict
-
-    def get(self, key: Dict) -> Optional[Dict]:
-        key_string = request_to_key(key)
-        result = self._sqlite_dict.get(key_string)
-        if result is not None:
-            assert isinstance(result, dict)
-            return result
-        return None
-
-    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
-        for key, value in self._sqlite_dict.items():
-            yield (key, value)
-
-    def put(self, key: Dict, value: Dict) -> None:
-        key_string = request_to_key(key)
-        self._sqlite_dict[key_string] = value
-        self._sqlite_dict.commit()
-
-    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
-        for key, value in pairs:
-            self.put(key, value)
-
-    def remove(self, key: Dict) -> None:
-        del self._sqlite_dict[key]
-        self._sqlite_dict.commit()
-
-
-class _MongoKeyValueStore(KeyValueStore):
-    """Key value store backed by a MongoDB database."""
-
-    # The number of documents to return per batch.
-    _BATCH_SIZE: int = 8
-
-    _REQUEST_KEY = "request"
-    _RESPONSE_KEY = "response"
-
-    def __init__(self, uri: str, collection_name: str):
-        # TODO: Create client in __enter__ and clean up client in __exit__
-        self._mongodb_client: MongoClient = MongoClient(uri)
-        self._database = self._mongodb_client.get_default_database()
-        self._collection = self._database.get_collection(collection_name)
-        self._collection.create_index(self._REQUEST_KEY, unique=True)
-        super().__init__()
-
-    def __enter__(self) -> "_MongoKeyValueStore":
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback) -> None:
-        return
-
-    def _canonicalize_key(self, key: Dict) -> SON:
-        serialized = json.dumps(key, sort_keys=True)
-        return json.loads(serialized, object_pairs_hook=SON)
-
-    def contains(self, key: Dict) -> bool:
-        query = {self._REQUEST_KEY: self._canonicalize_key(key)}
-        return self._collection.find_one(query) is not None
-
-    def get(self, key: Dict) -> Optional[Dict]:
-        query = {self._REQUEST_KEY: self._canonicalize_key(key)}
-        document = self._collection.find_one(query)
-        if document is not None:
-            response = document[self._RESPONSE_KEY]
-            if isinstance(response, str):
-                return json.loads(response)
-            else:
-                return response
-        return None
-
-    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
-        for document in self._collection.find({}).batch_size(self._BATCH_SIZE):
-            request = document[self._REQUEST_KEY]
-            response = document[self._RESPONSE_KEY]
-            if isinstance(response, str):
-                yield (request, json.loads(response))
-            else:
-                yield (request, response)
-
-    def put(self, key: Dict, value: Dict) -> None:
-        request = self._canonicalize_key(key)
-        document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
-        # The MongoDB collection should have a unique indexed on "request"
-        try:
-            self._collection.replace_one(filter={"request": request}, replacement=document, upsert=True)
-        except InvalidDocument:
-            # If the document is malformed e.g. because of null bytes in keys, instead store the response as a string.
-            alternate_document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, json.dumps(value))])
-            self._collection.replace_one(filter={"request": request}, replacement=alternate_document, upsert=True)
-
-    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
-        operations = []
-        for key, value in pairs:
-            request = self._canonicalize_key(key)
-            document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
-            operations.append(ReplaceOne({self._REQUEST_KEY: request}, document, upsert=True))
-        # Note: unlike put, multi_put does not support documents with null bytes in keys.
-        self._collection.bulk_write(operations)
-
-    def remove(self, key: Dict) -> None:
-        self._collection.delete_one(key)
-
-
 def get_all_from_sqlite(path: str) -> Generator[Tuple[Dict, Dict], None, None]:
     """Yields all decoded key, value pairs from the SQLite cache.
 
@@ -277,9 +108,11 @@ def create_key_value_store(config: KeyValueStoreCacheConfig) -> KeyValueStore:
     """Create a key value store from the given configuration."""
     # TODO: Support creating _MongoKeyValueStore
     if isinstance(config, MongoCacheConfig):
-        return _MongoKeyValueStore(config.uri, config.collection_name)
+        from helm.common.mongo_key_value_store import MongoKeyValueStore
+
+        return MongoKeyValueStore(config.uri, config.collection_name)
     elif isinstance(config, SqliteCacheConfig):
-        return _SqliteKeyValueStore(config.path)
+        return SqliteKeyValueStore(config.path)
     else:
         raise ValueError(f"KeyValueStoreCacheConfig with unknown type: {config}")
 
diff --git a/src/helm/common/concurrency.py b/src/helm/common/concurrency.py
new file mode 100644
index 0000000000..66baef92d8
--- /dev/null
+++ b/src/helm/common/concurrency.py
@@ -0,0 +1,32 @@
+from contextlib import AbstractContextManager
+from threading import Lock
+from typing import TypeVar, Generic
+
+
+T = TypeVar("T")
+
+
+class ThreadSafeWrapper(AbstractContextManager, Generic[T]):
+    """A wrapper that makes thread-hostile objects thread-safe.
+
+    This provides a context manager that holds a lock for accessing the inner object.
+
+    Example usage:
+
+        wrapped_obj = ThreadSafeWrapper(thread_hostile_obj)
+        with wrapped_obj as obj:
+            # Lock is automatically held in here
+            obj.do_stuff()
+    """
+
+    def __init__(self, wrapped: T):
+        self._wrapped = wrapped
+        self._lock = Lock()
+
+    def __enter__(self) -> T:
+        self._lock.__enter__()
+        return self._wrapped
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self._lock.__exit__(exc_type, exc_value, traceback)
+        pass
diff --git a/src/helm/common/general.py b/src/helm/common/general.py
index 5940b43463..c504421dc1 100644
--- a/src/helm/common/general.py
+++ b/src/helm/common/general.py
@@ -7,6 +7,7 @@
 import uuid
 import zstandard
 from typing import Any, Callable, Dict, List, Optional, TypeVar
+from datetime import datetime, date
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
@@ -160,6 +161,13 @@ def asdict_without_nones(obj: Any) -> Dict[str, Any]:
     return asdict(obj, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
 
 
+def serialize_dates(obj):
+    """Serialize dates (pass default=serialize_dates into json.dumps)."""
+    if isinstance(obj, (datetime, date)):
+        return obj.isoformat()
+    raise TypeError(f"Type {type(obj)} is not serializable")
+
+
 def binarize_dict(d: Dict[str, int]) -> Dict[str, int]:
     """Binarize the dict by setting the values that are 1 to 0.
 
diff --git a/src/helm/common/key_value_store.py b/src/helm/common/key_value_store.py
new file mode 100644
index 0000000000..bf1213eebe
--- /dev/null
+++ b/src/helm/common/key_value_store.py
@@ -0,0 +1,82 @@
+from abc import abstractmethod
+import contextlib
+import json
+from typing import Dict, Generator, Iterable, Optional, Tuple
+
+from sqlitedict import SqliteDict
+
+
+def request_to_key(request: Dict) -> str:
+    """Normalize a `request` into a `key` so that we can hash using it."""
+    return json.dumps(request, sort_keys=True)
+
+
+class KeyValueStore(contextlib.AbstractContextManager):
+    """Key value store that persists writes."""
+
+    @abstractmethod
+    def contains(self, key: Dict) -> bool:
+        pass
+
+    @abstractmethod
+    def get(self, key: Dict) -> Optional[Dict]:
+        pass
+
+    @abstractmethod
+    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
+        pass
+
+    @abstractmethod
+    def put(self, key: Dict, value: Dict) -> None:
+        pass
+
+    @abstractmethod
+    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
+        pass
+
+    @abstractmethod
+    def remove(self, key: Dict) -> None:
+        pass
+
+
+class SqliteKeyValueStore(KeyValueStore):
+    """Key value store backed by a SQLite file."""
+
+    def __init__(self, path: str):
+        self._sqlite_dict = SqliteDict(path)
+        super().__init__()
+
+    def __enter__(self) -> "SqliteKeyValueStore":
+        self._sqlite_dict.__enter__()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self._sqlite_dict.__exit__(exc_type, exc_value, traceback)
+
+    def contains(self, key: Dict) -> bool:
+        return request_to_key(key) in self._sqlite_dict
+
+    def get(self, key: Dict) -> Optional[Dict]:
+        key_string = request_to_key(key)
+        result = self._sqlite_dict.get(key_string)
+        if result is not None:
+            assert isinstance(result, dict)
+            return result
+        return None
+
+    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
+        for key, value in self._sqlite_dict.items():
+            yield (key, value)
+
+    def put(self, key: Dict, value: Dict) -> None:
+        key_string = request_to_key(key)
+        self._sqlite_dict[key_string] = value
+        self._sqlite_dict.commit()
+
+    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
+        for key, value in pairs:
+            self.put(key, value)
+
+    def remove(self, key: Dict) -> None:
+        del self._sqlite_dict[key if isinstance(key, str) else request_to_key(key)]  # put() stores serialized keys
+        self._sqlite_dict.commit()
diff --git a/src/helm/common/mongo_key_value_store.py b/src/helm/common/mongo_key_value_store.py
new file mode 100644
index 0000000000..1481820914
--- /dev/null
+++ b/src/helm/common/mongo_key_value_store.py
@@ -0,0 +1,88 @@
+import json
+from typing import Dict, Generator, Iterable, Optional, Tuple
+
+from helm.common.key_value_store import KeyValueStore
+from helm.common.optional_dependencies import handle_module_not_found_error
+
+try:
+    from bson.errors import InvalidDocument
+    from bson.son import SON
+    from pymongo import MongoClient, ReplaceOne
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e)
+
+
+class MongoKeyValueStore(KeyValueStore):
+    """Key value store backed by a MongoDB database."""
+
+    # The number of documents to return per batch.
+    _BATCH_SIZE: int = 8
+
+    _REQUEST_KEY = "request"
+    _RESPONSE_KEY = "response"
+
+    def __init__(self, uri: str, collection_name: str):
+        # TODO: Create client in __enter__ and clean up client in __exit__
+        self._mongodb_client: MongoClient = MongoClient(uri)
+        self._database = self._mongodb_client.get_default_database()
+        self._collection = self._database.get_collection(collection_name)
+        self._collection.create_index(self._REQUEST_KEY, unique=True)
+        super().__init__()
+
+    def __enter__(self) -> "MongoKeyValueStore":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        return
+
+    def _canonicalize_key(self, key: Dict) -> SON:
+        serialized = json.dumps(key, sort_keys=True)
+        return json.loads(serialized, object_pairs_hook=SON)
+
+    def contains(self, key: Dict) -> bool:
+        query = {self._REQUEST_KEY: self._canonicalize_key(key)}
+        return self._collection.find_one(query) is not None
+
+    def get(self, key: Dict) -> Optional[Dict]:
+        query = {self._REQUEST_KEY: self._canonicalize_key(key)}
+        document = self._collection.find_one(query)
+        if document is not None:
+            response = document[self._RESPONSE_KEY]
+            if isinstance(response, str):
+                return json.loads(response)
+            else:
+                return response
+        return None
+
+    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
+        for document in self._collection.find({}).batch_size(self._BATCH_SIZE):
+            request = document[self._REQUEST_KEY]
+            response = document[self._RESPONSE_KEY]
+            if isinstance(response, str):
+                yield (request, json.loads(response))
+            else:
+                yield (request, response)
+
+    def put(self, key: Dict, value: Dict) -> None:
+        request = self._canonicalize_key(key)
+        document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
+        # The MongoDB collection should have a unique index on "request"
+        try:
+            self._collection.replace_one(filter={"request": request}, replacement=document, upsert=True)
+        except (InvalidDocument, OverflowError):
+            # If the document is malformed (e.g. because of null bytes in keys) or some numbers cause overflows
+            # (e.g. integers exceeding 8 bytes), store the response as a string instead.
+            alternate_document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, json.dumps(value))])
+            self._collection.replace_one(filter={"request": request}, replacement=alternate_document, upsert=True)
+
+    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
+        operations = []
+        for key, value in pairs:
+            request = self._canonicalize_key(key)
+            document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
+            operations.append(ReplaceOne({self._REQUEST_KEY: request}, document, upsert=True))
+        # Note: unlike put, multi_put does not support documents with null bytes in keys.
+        self._collection.bulk_write(operations)
+
+    def remove(self, key: Dict) -> None:
+        self._collection.delete_one({self._REQUEST_KEY: self._canonicalize_key(key)})
diff --git a/src/helm/common/tokenization_request.py b/src/helm/common/tokenization_request.py
index 311cf5dcbe..568537a0e1 100644
--- a/src/helm/common/tokenization_request.py
+++ b/src/helm/common/tokenization_request.py
@@ -19,9 +19,12 @@ class TokenizationRequest:
     text: str
 
     # Which tokenizer we should use
-    tokenizer: str = "huggingface/gpt2"
+    tokenizer: str
 
-    # Whether to encode
+    # Whether to encode tokens
+    #
+    # If true, the response's TokenizationToken should contain integers.
+    # Otherwise, the response's TokenizationToken should contain strings.
     encode: bool = False
 
     # Whether to truncate
diff --git a/src/helm/config/__init__.py b/src/helm/config/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
index f7699818a8..9b2e87e7e9 100644
--- a/src/helm/config/model_deployments.yaml
+++ b/src/helm/config/model_deployments.yaml
@@ -2,7 +2,12 @@
 # Some models have several deployments, each with different parameters.
 
 # If you want to add a new deployment, you can technically do it here but we recommend
-# you to do it in private/model_deployments.yaml instead.
+# you to do it in prod_env/model_deployments.yaml instead.
+
+# Follow the template of this file to add a new deployment. You can copy paste this to get started:
+#    # This file defines all the model deployments that you do not want to be public.
+#    model_deployments: [] # Leave empty to disable private model deployments
+
 
 model_deployments:
 
@@ -13,9 +18,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.simple_client.SimpleClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
   # AI21 Labs
 
@@ -51,7 +53,7 @@ model_deployments:
           class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
           args: {}
 
-  - name: ai21/j1-grande 
+  - name: ai21/j1-grande
     deprecated: true
     model_name: ai21/j1-grande
     tokenizer_name: ai21/j1
@@ -66,7 +68,7 @@ model_deployments:
           class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
           args: {}
 
-  - name: ai21/j1-grande-v2-beta 
+  - name: ai21/j1-grande-v2-beta
     deprecated: true
     model_name: ai21/j1-grande-v2-beta
     tokenizer_name: ai21/j1
@@ -133,9 +135,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.luminous_window_service.LuminousBaseWindowService"
-      args: {}
 
   - name: AlephAlpha/luminous-extended
     model_name: AlephAlpha/luminous-extended
@@ -144,9 +143,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.luminous_window_service.LuminousExtendedWindowService"
-      args: {}
 
   - name: AlephAlpha/luminous-supreme
     model_name: AlephAlpha/luminous-supreme
@@ -155,14 +151,11 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.luminous_window_service.LuminousSupremeWindowService"
-      args: {}
 
   # TODO: Add luminous-world once it is released.
 
 
-  
+ 
   # Anthropic
   - name: anthropic/claude-v1.3
     model_name: anthropic/claude-v1.3
@@ -172,9 +165,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
-      args: {}
 
   - name: anthropic/claude-instant-v1
     model_name: anthropic/claude-instant-v1
@@ -184,8 +174,14 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+
+  - name: anthropic/claude-instant-1.2
+    model_name: anthropic/claude-instant-1.2
+    tokenizer_name: anthropic/claude
+    max_sequence_length: 8000
+    max_sequence_and_generated_tokens_length: 9016
+    client_spec:
+      class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
       args: {}
 
   - name: anthropic/claude-2.0
@@ -196,8 +192,14 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+
+  - name: anthropic/claude-2.1
+    model_name: anthropic/claude-2.1
+    tokenizer_name: anthropic/claude
+    max_sequence_length: 8000
+    max_sequence_and_generated_tokens_length: 9016
+    client_spec:
+      class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
       args: {}
 
   - name: anthropic/stanford-online-all-v4-s3
@@ -208,11 +210,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.anthropic_client.AnthropicLegacyClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.anthropic_window_service.LegacyAnthropicWindowService"
-      args: {}
-
-
 
   # Cohere
   - name: cohere/xlarge-20220609
@@ -348,9 +345,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.goose_ai_client.GooseAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: gooseai/gpt-j-6b
     model_name: eleutherai/gpt-j-6b
@@ -360,8 +354,60 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.goose_ai_client.GooseAIClient"
       args: {}
+
+
+
+  # Google
+
+  ## PaLM 2
+  - name: google/text-bison@001
+    model_name: google/text-bison@001
+    tokenizer_name: google/text-bison@001
+    max_sequence_length: 6000 # Officially 8192
+    max_sequence_and_generated_tokens_length: 7000 # Officially 9216
+    client_spec:
+      class_name: "helm.proxy.clients.vertexai_client.VertexAIClient"
+      args: {}
+    window_service_spec:
+      class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
+
+
+  - name: google/text-bison-32k
+    model_name: google/text-bison-32k
+    tokenizer_name: google/mt5-base
+    max_sequence_length: 32000
+    max_sequence_and_generated_tokens_length: 32000
+    client_spec:
+      class_name: "helm.proxy.clients.vertexai_client.VertexAIClient"
+      args: {}
+
+  - name: google/text-unicorn@001
+    model_name: google/text-unicorn@001
+    tokenizer_name: google/text-unicorn@001
+    max_sequence_length: 6000 # Officially 8192
+    max_sequence_and_generated_tokens_length: 7000 # Officially 9216
+    client_spec:
+      class_name: "helm.proxy.clients.vertexai_client.VertexAIClient"
+      args: {}
     window_service_spec:
-      class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+      class_name: "helm.benchmark.window_services.no_decoding_window_service.NoDecodingWindowService"
+
+  - name: google/code-bison@001
+    model_name: google/code-bison@001
+    tokenizer_name: google/mt5-base
+    max_sequence_length: 6000 # Officially 6144
+    max_sequence_and_generated_tokens_length: 7000 # Officially 7168
+    client_spec:
+      class_name: "helm.proxy.clients.vertexai_client.VertexAIClient"
+      args: {}
+
+  - name: google/code-bison-32k
+    model_name: google/code-bison-32k
+    tokenizer_name: google/mt5-base
+    max_sequence_length: 32000
+    max_sequence_and_generated_tokens_length: 32000
+    client_spec:
+      class_name: "helm.proxy.clients.vertexai_client.VertexAIClient"
       args: {}
 
 
@@ -372,22 +418,18 @@ model_deployments:
   - name: huggingface/santacoder
     model_name: bigcode/santacoder
     tokenizer_name: bigcode/santacoder
+    max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.santacoder_window_service.SantaCoderWindowService"
-      args: {}
 
   - name: huggingface/starcoder
     model_name: bigcode/starcoder
     tokenizer_name: bigcode/starcoder
+    max_sequence_length: 8192
     client_spec:
       class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.starcoder_window_service.StarCoderWindowService"
-      args: {}
 
   ## EleutherAI
   - name: huggingface/gpt-j-6b
@@ -398,11 +440,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
-      args: {}
 
-  ## OpenAI 
+  ## OpenAI
   - name: huggingface/gpt2
     model_name: openai/gpt2
     tokenizer_name: huggingface/gpt2
@@ -411,56 +450,39 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
-      args: {}
-
-
 
   # HuggingFaceM4
   - name: HuggingFaceM4/idefics-9b
-    model_name: huggingface/idefics-9b
+    model_name: HuggingFaceM4/idefics-9b
     tokenizer_name: HuggingFaceM4/idefics-9b
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   - name: HuggingFaceM4/idefics-9b-instruct
-    model_name: huggingface/idefics-9b-instruct
+    model_name: HuggingFaceM4/idefics-9b-instruct
     tokenizer_name: HuggingFaceM4/idefics-9b-instruct
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   - name: HuggingFaceM4/idefics-80b
-    model_name: huggingface/idefics-80b
+    model_name: HuggingFaceM4/idefics-80b
     tokenizer_name: HuggingFaceM4/idefics-80b
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   - name: HuggingFaceM4/idefics-80b-instruct
-    model_name: huggingface/idefics-80b-instruct
+    model_name: HuggingFaceM4/idefics-80b-instruct
     tokenizer_name: HuggingFaceM4/idefics-80b-instruct
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
 
 
@@ -471,39 +493,30 @@ model_deployments:
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.lit_gpt_client.LitGPTClient"
-      args: 
+      args:
         checkpoint_dir: "" # Path to the checkpoint directory
         precision: bf16-true
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowService"
-      args: {}
 
 
 
   # Microsoft
   - name: microsoft/TNLGv2_530B
     model_name: microsoft/TNLGv2_530B
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: microsoft/gpt2
     max_sequence_length: 2047
     max_request_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
-      args: {}
 
   - name: microsoft/TNLGv2_7B
     model_name: microsoft/TNLGv2_7B
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: microsoft/gpt2
     max_sequence_length: 2047
     max_request_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
-      args: {}
 
 
 
@@ -515,9 +528,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.http_model_client.HTTPModelClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.http_model_window_service.HTTPModelWindowService"
-      args: {}
 
 
 
@@ -529,9 +539,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.megatron_client.MegatronClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.megatron_window_service.MegatronWindowService"
-      args: {}
 
 
 
@@ -540,8 +547,8 @@ model_deployments:
   ## GPT 3 Models
   # The list of models can be found here: https://beta.openai.com/docs/engines/gpt-3
   # DEPRECATED: Announced on July 06 2023 that these models will be shut down on January 04 2024.
-  
-  - name: openai/davinci 
+ 
+  - name: openai/davinci
     deprecated: true
     model_name: openai/davinci
     tokenizer_name: huggingface/gpt2
@@ -550,11 +557,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/curie 
+  - name: openai/curie
     deprecated: true
     model_name: openai/curie
     tokenizer_name: huggingface/gpt2
@@ -563,11 +567,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/babbage 
+  - name: openai/babbage
     deprecated: true
     model_name: openai/babbage
     tokenizer_name: huggingface/gpt2
@@ -576,11 +577,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/ada 
+  - name: openai/ada
     deprecated: true
     model_name: openai/ada
     tokenizer_name: huggingface/gpt2
@@ -589,11 +587,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-davinci-003 
+  - name: openai/text-davinci-003
     deprecated: true
     model_name: openai/text-davinci-003
     tokenizer_name: huggingface/gpt2
@@ -602,11 +597,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
-      args: {}
 
-  - name: openai/text-davinci-002 
+  - name: openai/text-davinci-002
     deprecated: true
     model_name: openai/text-davinci-002
     tokenizer_name: huggingface/gpt2
@@ -615,11 +607,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
-      args: {}
 
-  - name: openai/text-davinci-001 
+  - name: openai/text-davinci-001
     deprecated: true
     model_name: openai/text-davinci-001
     tokenizer_name: huggingface/gpt2
@@ -628,11 +617,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-curie-001 
+  - name: openai/text-curie-001
     deprecated: true
     model_name: openai/text-curie-001
     tokenizer_name: huggingface/gpt2
@@ -641,11 +627,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-babbage-001 
+  - name: openai/text-babbage-001
     deprecated: true
     model_name: openai/text-babbage-001
     tokenizer_name: huggingface/gpt2
@@ -654,11 +637,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-ada-001 
+  - name: openai/text-ada-001
     deprecated: true
     model_name: openai/text-ada-001
     tokenizer_name: huggingface/gpt2
@@ -667,9 +647,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
 
   ## GPT 3.5 Turbo Models
@@ -687,9 +664,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
-      args: {}
 
   # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
   # sequence length is smaller at 4087 with one user input message and one assistant
@@ -703,9 +677,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
-      args: {}
 
   # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained
   # in the openai/gpt-3.5-turbo-0613 comment
@@ -717,13 +688,22 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurbo16KWindowService"
-      args: {}
 
 
   ## GPT 4 Models
 
+  - name: openai/gpt-4-1106-preview
+    model_name: openai/gpt-4-1106-preview
+    tokenizer_name: openai/cl100k_base
+    # According to https://help.openai.com/en/articles/8555510-gpt-4-turbo,
+    # the maximum number of output tokens for this model is 4096
+    # TODO: add max_generated_tokens_length of 4096 https://github.com/stanford-crfm/helm/issues/2098
+    max_sequence_length: 128000
+    max_request_length: 128001
+    client_spec:
+      class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+      args: {}
+
   - name: openai/gpt-4-0314
     model_name: openai/gpt-4-0314
     tokenizer_name: openai/cl100k_base
@@ -732,9 +712,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
-      args: {}
 
   - name: openai/gpt-4-32k-0314
     model_name: openai/gpt-4-32k-0314
@@ -744,9 +721,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
-      args: {}
 
   - name: openai/gpt-4-0613
     model_name: openai/gpt-4-0613
@@ -756,9 +730,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
-      args: {}
 
   - name: openai/gpt-4-32k-0613
     model_name: openai/gpt-4-32k-0613
@@ -768,15 +739,12 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
-      args: {}
 
 
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.
 
-  - name: openai/code-davinci-002 
+  - name: openai/code-davinci-002
     deprecated: true
     model_name: openai/code-davinci-002
     tokenizer_name: huggingface/gpt2
@@ -785,11 +753,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
-      args: {}
 
-  - name: openai/code-davinci-001 
+  - name: openai/code-davinci-001
     deprecated: true
     model_name: openai/code-davinci-001
     tokenizer_name: huggingface/gpt2
@@ -798,11 +763,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/code-cushman-001 
+  - name: openai/code-cushman-001
     deprecated: true
     model_name: openai/code-cushman-001
     tokenizer_name: huggingface/gpt2
@@ -811,11 +773,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  
+ 
   ## Text Similarity Models
   # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
   # The number of parameters is guessed based on the number of parameters of the
@@ -823,7 +782,7 @@ model_deployments:
   # DEPRECATED: Announced on July 06 2023 that first generation embeddings models
   #  will be shut down on January 04 2024.
 
-  - name: openai/text-similarity-davinci-001 
+  - name: openai/text-similarity-davinci-001
     deprecated: true
     model_name: openai/text-similarity-davinci-001
     tokenizer_name: huggingface/gpt2
@@ -832,11 +791,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-similarity-curie-001 
+  - name: openai/text-similarity-curie-001
     deprecated: true
     model_name: openai/text-similarity-curie-001
     tokenizer_name: huggingface/gpt2
@@ -845,11 +801,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-similarity-babbage-001 
+  - name: openai/text-similarity-babbage-001
     deprecated: true
     model_name: openai/text-similarity-babbage-001
     tokenizer_name: huggingface/gpt2
@@ -858,11 +811,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
-  - name: openai/text-similarity-ada-001 
+  - name: openai/text-similarity-ada-001
     deprecated: true
     model_name: openai/text-similarity-ada-001
     tokenizer_name: huggingface/gpt2
@@ -871,14 +821,11 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
   # As of 2023-11-07, text-embedding-ada-002 is not deprecated:
   # "We recommend using text-embedding-ada-002 for nearly all use cases."
   # Source: https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
-  - name: openai/text-embedding-ada-002 
+  - name: openai/text-embedding-ada-002
     model_name: openai/text-embedding-ada-002
     tokenizer_name: huggingface/gpt2
     max_sequence_length: 2048
@@ -886,9 +833,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.openai_client.OpenAIClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
-      args: {}
 
 
 
@@ -899,7 +843,7 @@ model_deployments:
   # https://api.together.xyz/playground
 
   ## BigScience
-  - name: together/bloom 
+  - name: together/bloom
     deprecated: true # Removed from together
     model_name: bigscience/bloom
     tokenizer_name: bigscience/bloom
@@ -908,11 +852,8 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.bloom_window_service.BloomWindowService"
-      args: {}
 
-  - name: together/t0pp 
+  - name: together/t0pp
     deprecated: true # Removed from together
     model_name: bigscience/t0pp
     tokenizer_name: bigscience/T0pp
@@ -933,9 +874,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/dolly-v2-7b
     model_name: databricks/dolly-v2-7b
@@ -945,9 +883,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/dolly-v2-12b
     model_name: databricks/dolly-v2-12b
@@ -957,9 +892,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   ## EleutherAI
   - name: together/gpt-j-6b
@@ -971,9 +903,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
-      args: {}
 
   - name: together/gpt-neox-20b
     deprecated: true # Removed from together
@@ -984,9 +913,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/pythia-1b-v0
     model_name: eleutherai/pythia-1b-v0
@@ -996,9 +922,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/pythia-2.8b-v0
     model_name: eleutherai/pythia-2.8b-v0
@@ -1008,9 +931,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/pythia-6.9b
     model_name: eleutherai/pythia-6.9b
@@ -1020,9 +940,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/pythia-12b-v0
     model_name: eleutherai/pythia-12b-v0
@@ -1032,9 +949,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   ## Google
   - name: together/t5-11b
@@ -1075,7 +989,7 @@ model_deployments:
 
   ## HazyResearch
   - name: together/h3-2.7b
-    deprecated: true# Not available on Together yet
+    deprecated: true # Not available on Together yet
     model_name: hazyresearch/h3-2.7b
     tokenizer_name: huggingface/gpt2
     max_sequence_length: 1024
@@ -1083,9 +997,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
-      args: {}
 
   ## LMSYS
   # TODO: might be deprecated. Needs to be checked.
@@ -1097,9 +1008,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   - name: together/vicuna-13b-v1.3
     model_name: lmsys/vicuna-13b-v1.3
@@ -1108,87 +1016,63 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   ## Meta
   - name: together/llama-7b
     model_name: meta/llama-7b
     tokenizer_name: hf-internal-testing/llama-tokenizer
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   - name: together/llama-13b
     model_name: meta/llama-13b
     tokenizer_name: hf-internal-testing/llama-tokenizer
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   - name: together/llama-30b
     model_name: meta/llama-30b
     tokenizer_name: hf-internal-testing/llama-tokenizer
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   - name: together/llama-65b
     model_name: meta/llama-65b
     tokenizer_name: hf-internal-testing/llama-tokenizer
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   - name: together/llama-2-7b
     model_name: meta/llama-2-7b
     tokenizer_name: meta-llama/Llama-2-7b-hf
-    max_sequence_length: 4096
+    max_sequence_length: 4094  # Subtract 2 tokens to work around an off-by-two bug in Together's token counting (#2080 and #2094)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService"
-      args: {}
 
   - name: together/llama-2-13b
     model_name: meta/llama-2-13b
     tokenizer_name: meta-llama/Llama-2-7b-hf
-    max_sequence_length: 4096
+    max_sequence_length: 4094  # Subtract 2 tokens to work around an off-by-two bug in Together's token counting (#2080 and #2094)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService"
-      args: {}
 
   - name: together/llama-2-70b
     model_name: meta/llama-2-70b
     tokenizer_name: meta-llama/Llama-2-7b-hf
-    max_sequence_length: 4096
+    max_sequence_length: 4094  # Subtract 2 tokens to work around an off-by-two bug in Together's token counting (#2080 and #2094)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService"
-      args: {}
 
   - name: together/opt-175b
     deprecated: true # Not available on Together yet
@@ -1199,9 +1083,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
-      args: {}
 
   - name: together/opt-66b
     deprecated: true # Not available on Together yet
@@ -1212,9 +1093,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
-      args: {}
 
   - name: together/opt-6.7b
     deprecated: true # Not available on Together yet
@@ -1225,9 +1103,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
-      args: {}
 
   - name: together/opt-1.3b
     deprecated: true # Not available on Together yet
@@ -1238,20 +1113,39 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
+
+  # 01.AI
+  - name: together/yi-6b
+    model_name: 01-ai/yi-6b
+    tokenizer_name: 01-ai/Yi-6B
+    max_sequence_length: 4095
+    client_spec:
+      class_name: "helm.proxy.clients.together_client.TogetherClient"
+      args: {}
+
+  - name: together/yi-34b
+    model_name: 01-ai/yi-34b
+    tokenizer_name: 01-ai/Yi-6B
+    max_sequence_length: 4095
+    client_spec:
+      class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
 
   ## MistralAI
   - name: together/mistral-7b-v0.1
     model_name: mistralai/mistral-7b-v0.1
     tokenizer_name: mistralai/Mistral-7B-v0.1
-    max_sequence_length: 4095
+    max_sequence_length: 4095  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+
+  - name: together/mixtral-8x7b-32kseqlen
+    model_name: mistralai/mixtral-8x7b-32kseqlen
+    tokenizer_name: mistralai/Mistral-7B-v0.1
+    max_sequence_length: 4095  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
+    client_spec:
+      class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
 
   ## MosaicML
@@ -1264,9 +1158,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/mpt-instruct-7b
     deprecated: true # Not available on Together yet
@@ -1277,9 +1168,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/mpt-30b
     model_name: mosaicml/mpt-30b
@@ -1289,9 +1177,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/mpt-instruct-30b
     model_name: mosaicml/mpt-instruct-30b
@@ -1301,9 +1186,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   ## StabilityAI
   - name: together/stablelm-base-alpha-3b
@@ -1315,9 +1197,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
-      args: {}
 
   - name: together/stablelm-base-alpha-7b
     deprecated: true # Removed from together
@@ -1328,9 +1207,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
-      args: {}
 
   ## Stanford
   - name: together/alpaca-7b
@@ -1340,54 +1216,39 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
-      args: {}
 
   ## Tiiuae
   - name: together/falcon-7b
     model_name: tiiuae/falcon-7b
     tokenizer_name: tiiuae/falcon-7b
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   - name: together/falcon-7b-instruct
     model_name: tiiuae/falcon-7b-instruct
     tokenizer_name: tiiuae/falcon-7b
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   - name: together/falcon-40b
     model_name: tiiuae/falcon-40b
     tokenizer_name: tiiuae/falcon-7b
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   - name: together/falcon-40b-instruct
     model_name: tiiuae/falcon-40b-instruct
     tokenizer_name: tiiuae/falcon-7b
-    max_sequence_length: 2048
+    max_sequence_length: 2047  # Subtract 1 token to work around an off-by-one bug in Together's input validation token counting (#2080)
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
-      args: {}
 
   ## Together
   # These are models fine-tuned by Together (and not simply hosted by Together).
@@ -1399,9 +1260,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
-      args: {}
 
   - name: together/gpt-neoxt-chat-base-20b
     model_name: together/gpt-neoxt-chat-base-20b
@@ -1411,9 +1269,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/redpajama-incite-base-3b-v1
     model_name: together/redpajama-incite-base-3b-v1
@@ -1423,9 +1278,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/redpajama-incite-instruct-3b-v1
     model_name: together/redpajama-incite-instruct-3b-v1
@@ -1435,9 +1287,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/redpajama-incite-base-7b
     model_name: together/redpajama-incite-base-7b
@@ -1447,9 +1296,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   - name: together/redpajama-incite-instruct-7b
     model_name: together/redpajama-incite-instruct-7b
@@ -1459,9 +1305,6 @@ model_deployments:
     client_spec:
       class_name: "helm.proxy.clients.together_client.TogetherClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
-      args: {}
 
   ## Tsinghua
   - name: together/glm
@@ -1496,72 +1339,81 @@ model_deployments:
   # Writer
   - name: writer/palmyra-base
     model_name: writer/palmyra-base
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: writer/gpt2
     max_sequence_length: 2048
     max_sequence_and_generated_tokens_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
-      args: {}
 
   - name: writer/palmyra-large
     model_name: writer/palmyra-large
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: writer/gpt2
     max_sequence_length: 2048
     max_sequence_and_generated_tokens_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
-      args: {}
 
   - name: writer/palmyra-instruct-30
     model_name: writer/palmyra-instruct-30
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: writer/gpt2
     max_sequence_length: 2048
     max_sequence_and_generated_tokens_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
-      args: {}
 
   - name: writer/palmyra-e
     model_name: writer/palmyra-e
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: writer/gpt2
     max_sequence_length: 2048
     max_sequence_and_generated_tokens_length: 2048
     client_spec:
       class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
-      args: {}
 
   - name: writer/silk-road
     model_name: writer/silk-road
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: writer/gpt2
     max_sequence_length: 8192
     max_sequence_and_generated_tokens_length: 8192
     client_spec:
       class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
-      args: {}
 
   - name: writer/palmyra-x
     model_name: writer/palmyra-x
-    tokenizer_name: huggingface/gpt2
+    tokenizer_name: writer/gpt2
     max_sequence_length: 8192
     max_sequence_and_generated_tokens_length: 8192
     client_spec:
       class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
       args: {}
-    window_service_spec:
-      class_name: "helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
-      args: {}
\ No newline at end of file
+
+  - name: writer/palmyra-x-v2
+    model_name: writer/palmyra-x-v2
+    tokenizer_name: writer/gpt2
+    max_sequence_length: 6000
+    max_sequence_and_generated_tokens_length: 7024
+    client_spec:
+      class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+      args: {}
+
+  - name: writer/palmyra-x-v3
+    model_name: writer/palmyra-x-v3
+    tokenizer_name: writer/gpt2
+    max_sequence_length: 6000
+    max_sequence_and_generated_tokens_length: 7024
+    client_spec:
+      class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+      args: {}
+
+  - name: writer/palmyra-x-32k
+    model_name: writer/palmyra-x-32k
+    tokenizer_name: writer/gpt2
+    max_sequence_length: 28000
+    max_sequence_and_generated_tokens_length: 30048
+    client_spec:
+      class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+      args: {}
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
index e9c097ea16..72088ac210 100644
--- a/src/helm/config/model_metadata.yaml
+++ b/src/helm/config/model_metadata.yaml
@@ -2,7 +2,12 @@
 # The model names here should match the model names in model_deployments.yaml.
 
 # If you want to add a new model, you can technically do it here but we recommend
-# you to do it in private/model_metadata.yaml instead.
+# that you do it in prod_env/model_metadata.yaml instead.
+
+# Follow the template of this file to add a new model. You can copy paste this to get started:
+#    # This file contains the metadata for private models
+#    models: [] # Leave empty to disable private models
+
 
 models:
 
@@ -151,6 +156,14 @@ models:
     release_date: 2023-03-17
     tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: anthropic/claude-instant-1.2
+    display_name: Anthropic Claude Instant 1.2
+    description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
+    creator_organization_name: Anthropic
+    access: limited
+    release_date: 2023-08-09
+    tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: anthropic/claude-2.0
     display_name: Anthropic Claude 2.0
     description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
@@ -159,6 +172,14 @@ models:
     release_date: 2023-07-11
     tags: [ANTHROPIC_CLAUDE_2_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: anthropic/claude-2.1
+    display_name: Anthropic Claude 2.1
+    description: Claude 2.1 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
+    creator_organization_name: Anthropic
+    access: limited
+    release_date: 2023-11-21
+    tags: [ANTHROPIC_CLAUDE_2_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   # DEPRECATED: Please do not use.
   - name: anthropic/stanford-online-all-v4-s3
     display_name: Anthropic-LM v4-s3 (52B)
@@ -341,21 +362,20 @@ models:
     release_date: 2022-11-08
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
-  # TODO: Fill in the details.
   - name: cohere/command
-    display_name: Cohere Command TODO
-    description: Cohere Command TODO
+    display_name: Cohere Command
+    description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
     creator_organization_name: Cohere
     access: limited
-    release_date: 2022-11-08 # TODO
+    release_date: 2023-09-29
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
   - name: cohere/command-light
-    display_name: Cohere Command TODO
-    description: Cohere Command TODO
+    display_name: Cohere Command Light
+    description: Command is Cohere’s flagship text generation model. It is trained to follow user commands and to be instantly useful in practical business applications. [docs](https://docs.cohere.com/reference/generate) and [changelog](https://docs.cohere.com/changelog)
     creator_organization_name: Cohere
     access: limited
-    release_date: 2022-11-08 # TODO
+    release_date: 2023-09-29
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
 
@@ -506,6 +526,46 @@ models:
     release_date: 2023-03-01 # was first announced on 2022-04 but remained private.
     tags: [] # TODO: add tags
 
+  - name: google/text-bison@001
+    display_name: PaLM-2 (Bison)
+    description: The best value PaLM model. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+    tags: [TEXT_MODEL_TAG, GOOGLE_PALM_2_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: google/text-bison-32k
+    display_name: PaLM-2 (Bison)
+    description: The best value PaLM model with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2023-06-07 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+    tags: [TEXT_MODEL_TAG, GOOGLE_PALM_2_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: google/text-unicorn@001
+    display_name: PaLM-2 (Unicorn)
+    description: The largest model in the PaLM family. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2023-11-30 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#model_versions
+    tags: [TEXT_MODEL_TAG, GOOGLE_PALM_2_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: google/code-bison@001
+    display_name: Codey PaLM-2 (Bison)
+    description: A model fine-tuned to generate code based on a natural language description of the desired code. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
+    tags: [CODE_MODEL_TAG]
+
+  - name: google/code-bison-32k
+    display_name: Codey PaLM-2 (Bison)
+    description: Codey with a 32K context. PaLM 2 (Pathways Language Model) is a Transformer-based model trained using a mixture of objectives that was evaluated on English and multilingual language, and reasoning tasks. ([report](https://arxiv.org/pdf/2305.10403.pdf))
+    creator_organization_name: Google
+    access: limited
+    release_date: 2023-06-29 # Source: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/code-generation#model_versions
+    tags: [CODE_MODEL_TAG]
+
 
 
   # HazyResearch
@@ -521,7 +581,7 @@ models:
 
 
   # HuggingFace
-  - name: huggingface/idefics-9b
+  - name: HuggingFaceM4/idefics-9b
     display_name: IDEFICS (9B)
     description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
     creator_organization_name: HuggingFace
@@ -530,7 +590,7 @@ models:
     release_date: 2023-08-22
     tags: [VISION_LANGUAGE_MODEL_TAG]
 
-  - name: huggingface/idefics-9b-instruct
+  - name: HuggingFaceM4/idefics-9b-instruct
     display_name: IDEFICS instruct (9B)
     description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
     creator_organization_name: HuggingFace
@@ -539,7 +599,7 @@ models:
     release_date: 2023-08-22
     tags: [VISION_LANGUAGE_MODEL_TAG]
 
-  - name: huggingface/idefics-80b
+  - name: HuggingFaceM4/idefics-80b
     display_name: IDEFICS (80B)
     description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
     creator_organization_name: HuggingFace
@@ -548,7 +608,7 @@ models:
     release_date: 2023-08-22
     tags: [VISION_LANGUAGE_MODEL_TAG]
 
-  - name: huggingface/idefics-80b-instruct
+  - name: HuggingFaceM4/idefics-80b-instruct
     display_name: IDEFICS instruct (80B)
     description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
     creator_organization_name: HuggingFace
@@ -754,16 +814,44 @@ models:
 
 
 
+  # 01.AI
+  - name: 01-ai/yi-6b
+    display_name: Yi (6B)
+    description: The Yi models are large language models trained from scratch by developers at 01.AI.
+    creator_organization_name: 01.AI
+    access: open
+    num_parameters: 6000000000
+    release_date: 2023-11-02
+    tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+  - name: 01-ai/yi-34b
+    display_name: Yi (34B)
+    description: The Yi models are large language models trained from scratch by developers at 01.AI.
+    creator_organization_name: 01.AI
+    access: open
+    num_parameters: 34000000000
+    release_date: 2023-11-02
+    tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
   # Mistral AI
   - name: mistralai/mistral-7b-v0.1
     display_name: Mistral v0.1 (7B)
-    description: Mistral 7B is a  7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
+    description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
     creator_organization_name: Mistral AI
     access: open
     num_parameters: 7300000000
     release_date: 2023-09-27
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
+  - name: mistralai/mixtral-8x7b-32kseqlen
+    display_name: Mixtral (8x7B 32K seqlen)
+    description: Mistral AI's mixture-of-experts model ([tweet](https://twitter.com/MistralAI/status/1733150512395038967)).
+    creator_organization_name: Mistral AI
+    access: open
+    num_parameters: 56000000000
+    release_date: 2023-12-08
+    tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
 
 
   # MosaicML
@@ -901,7 +989,7 @@ models:
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
 
   - name: openai/text-davinci-003 # DEPRECATED
-    display_name: text-davinci-003
+    display_name: GPT-3.5 (text-davinci-003)
     description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
     creator_organization_name: OpenAI
     access: limited
@@ -912,7 +1000,7 @@ models:
   # TODO: text-davinci-002 supports insertion. Support insertion in our framework.
   #       https://github.com/stanford-crfm/benchmarking/issues/359
   - name: openai/text-davinci-002 # DEPRECATED
-    display_name: text-davinci-002
+    display_name: GPT-3.5 (text-davinci-002)
     description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
     creator_organization_name: OpenAI
     access: limited
@@ -921,7 +1009,7 @@ models:
     tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
 
   - name: openai/text-davinci-001 # DEPRECATED
-    display_name: text-davinci-001
+    display_name: GPT-3.5 (text-davinci-001)
     description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
     creator_organization_name: OpenAI
     access: limited
@@ -961,7 +1049,7 @@ models:
   # ChatGPT: https://openai.com/blog/chatgpt
   
   - name: openai/gpt-3.5-turbo-0301
-    display_name: gpt-3.5-turbo-0301
+    display_name: GPT-3.5 Turbo (0301)
     description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01.
     creator_organization_name: OpenAI
     access: limited
@@ -969,7 +1057,7 @@ models:
     tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
   - name: openai/gpt-3.5-turbo-0613
-    display_name: gpt-3.5-turbo-0613
+    display_name: GPT-3.5 Turbo (0613)
     description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13.
     creator_organization_name: OpenAI
     access: limited
@@ -988,9 +1076,17 @@ models:
 
 
   ## GPT 4 Models
-    
+
+  - name: openai/gpt-4-1106-preview
+    display_name: GPT-4 Turbo (1106 preview)
+    description: GPT-4 Turbo (preview) is a large multimodal model that is optimized for chat but works well for traditional completions tasks. The model is cheaper and faster than the original GPT-4 model. Preview snapshot from November 6, 2023.
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2023-11-06
+    tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
   - name: openai/gpt-4-0314
-    display_name: gpt-4-0314
+    display_name: GPT-4 (0314)
     description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
     creator_organization_name: OpenAI
     access: limited
@@ -1006,7 +1102,7 @@ models:
     tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
   - name: openai/gpt-4-0613
-    display_name: gpt-4-0613
+    display_name: GPT-4 (0613)
     description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
     creator_organization_name: OpenAI
     access: limited
@@ -1299,6 +1395,7 @@ models:
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
 
   - name: writer/palmyra-instruct-30
+    deprecated: true # Internal error
     display_name: InstructPalmyra (30B)
     description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
     creator_organization_name: Writer
@@ -1309,6 +1406,7 @@ models:
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
 
   - name: writer/palmyra-e
+    deprecated: true # Internal error
     display_name: Palmyra E (30B)
     description: Palmyra E (30B)
     creator_organization_name: Writer
@@ -1338,6 +1436,36 @@ models:
     # Does not support echo
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
 
+  - name: writer/palmyra-x-v2
+    display_name: Palmyra X V2 (33B)
+    description: Palmyra-X V2 (33B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. The pre-training data (more than 2 trillion tokens) is diverse and covers a wide range of areas. The model uses FlashAttention-2.
+    creator_organization_name: Writer
+    access: limited
+    num_parameters: 33000000000
+    release_date: 2023-12-01
+    # Does not support echo
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: writer/palmyra-x-v3
+    display_name: Palmyra X V3 (72B)
+    description: Palmyra-X V3 (72B parameters) is a Transformer-based model, which is trained on extremely large-scale pre-training data. It is trained via unsupervised learning and DPO and uses multiquery attention.
+    creator_organization_name: Writer
+    access: limited
+    num_parameters: 72000000000
+    release_date: 2023-12-01
+    # Does not support echo
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+  - name: writer/palmyra-x-32k
+    display_name: Palmyra X-32K (33B)
+    description: Palmyra-X-32K (33B parameters) is a Transformer-based model, which is trained on large-scale pre-training data. The pre-training data types are diverse and cover a wide range of areas. These data types are used in conjunction with the alignment mechanism to extend the context window.
+    creator_organization_name: Writer
+    access: limited
+    num_parameters: 33000000000
+    release_date: 2023-12-01
+    # Does not support echo
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
 
 
   # Yandex
@@ -1348,4 +1476,4 @@ models:
     access: open
     num_parameters: 100000000000
     release_date: 2022-06-23
-    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
\ No newline at end of file
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml
index c7c0d1446e..ef745ec0a1 100644
--- a/src/helm/config/tokenizer_configs.yaml
+++ b/src/helm/config/tokenizer_configs.yaml
@@ -1,3 +1,13 @@
+# This file defines all the tokenizers that are supported by the Helm API.
+
+# If you want to add a new tokenizer, you can technically do it here but we recommend
+# that you do it in prod_env/tokenizer_configs.yaml instead.
+
+# Follow the template of this file to add a new tokenizer. You can copy paste this to get started:
+#    # This file contains the tokenizer configs for the private tokenizers
+#    tokenizer_configs: [] # Leave empty to disable private tokenizers
+
+
 tokenizer_configs:
 
   - name: simple/model1
@@ -108,8 +118,26 @@ tokenizer_configs:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
     prefix_token: ""
+  - name: google/mt5-base
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: ""
+  - name: google/text-bison@001
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.vertexai_tokenizer.VertexAITokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: ""
+  - name: google/text-unicorn@001
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.vertexai_tokenizer.VertexAITokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: ""
 
   # Hf-internal-testing
+
+  # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
+  # https://huggingface.co/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizerFast.example
   - name: hf-internal-testing/llama-tokenizer
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -153,12 +181,38 @@ tokenizer_configs:
     prefix_token: "<|endoftext|>"
 
   # Meta-llama
+
+  # To use the Llama-2 tokenizer:
+    #
+    # 1. Accept the license agreement: https://ai.meta.com/resources/models-and-libraries/llama-downloads/
+    # 2. Request to access the Hugging Face repository: https://huggingface.co/meta-llama/Llama-2-7b
+    # 3. Run `huggingface-cli login`
+    #
+    # If you encounter the following error, complete the above steps and try again:
+    #
+    #     meta-llama/Llama-2-70b-hf is not a local folder and is not a valid model identifier listed on
+    #     'https://huggingface.co/models'
   - name: meta-llama/Llama-2-7b-hf
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+
+  # 01-ai
+  - name: 01-ai/Yi-6B
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Microsoft
+  - name: microsoft/gpt2
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<<"
+
   # Mistralai
   - name: mistralai/Mistral-7B-v0.1
     tokenizer_spec:
@@ -194,9 +248,16 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: ""
 
+  # Writer
+  - name: writer/gpt2
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+
   # Yandex
   - name: Yandex/yalm
     tokenizer_spec:
       class_name: "helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer"
     end_of_text_token: "</s>"
-    prefix_token: "</s>"
\ No newline at end of file
+    prefix_token: "</s>"
diff --git a/src/helm/proxy/clients/aleph_alpha_client.py b/src/helm/proxy/clients/aleph_alpha_client.py
index 4890267165..c6927eab91 100644
--- a/src/helm/proxy/clients/aleph_alpha_client.py
+++ b/src/helm/proxy/clients/aleph_alpha_client.py
@@ -67,7 +67,7 @@ def do_it():
             tokens: List[Token] = []
 
             # `completion_tokens` is the list of selected tokens.
-            for i, token in enumerate(completion["completion_tokens"]):
+            for i, token in enumerate(completion.get("completion_tokens", [])):
                 # Get the top K logprobs for the ith token
                 top_logprobs: Dict[str, float] = completion["log_probs"][i]
                 # Use the selected token value to get the logprob
diff --git a/src/helm/proxy/clients/anthropic_client.py b/src/helm/proxy/clients/anthropic_client.py
index faf92d8bdd..bde73984e2 100644
--- a/src/helm/proxy/clients/anthropic_client.py
+++ b/src/helm/proxy/clients/anthropic_client.py
@@ -56,9 +56,12 @@ class AnthropicClient(CachingClient):
     ADDITIONAL_TOKENS: int = 5
     PROMPT_ANSWER_START: str = "The answer is "
 
-    def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig, api_key: Optional[str] = None):
+    def __init__(
+        self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: Optional[str] = None
+    ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
         self.api_key: Optional[str] = api_key
         self._client = anthropic.Client(api_key) if api_key else None
 
@@ -165,7 +168,7 @@ def do_it():
             # The Anthropic API doesn't return us tokens or logprobs, so we tokenize ourselves.
             tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
                 # Anthropic uses their own tokenizer
-                TokenizationRequest(text, tokenizer=request.model_engine)
+                TokenizationRequest(text, tokenizer=self.tokenizer_name)
             )
 
             # Log probs are not currently not supported by the Anthropic, so set to 0 for now.
@@ -240,7 +243,7 @@ def is_valid_logprobs_response(raw_response: str) -> bool:
             hlog(f"Invalid logprobs response: {raw_response}")
             return False
 
-    def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
+    def __init__(self, api_key: str, cache_config: CacheConfig):
         hlog("This client is deprecated. Please use AnthropicClient instead.")
         super().__init__(cache_config=cache_config)
         self.api_key = api_key
diff --git a/src/helm/proxy/clients/auto_client.py b/src/helm/proxy/clients/auto_client.py
index 8d5ddcce6e..db72553b6a 100644
--- a/src/helm/proxy/clients/auto_client.py
+++ b/src/helm/proxy/clients/auto_client.py
@@ -68,7 +68,7 @@ def _get_client(self, model_deployment_name: str) -> Client:
 
             client_spec = inject_object_spec_args(
                 model_deployment.client_spec,
-                constant_bindings={"cache_config": cache_config},
+                constant_bindings={"cache_config": cache_config, "tokenizer_name": model_deployment.tokenizer_name},
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, host_organization, model_deployment_name),
                     "tokenizer": lambda: self._auto_tokenizer._get_tokenizer(
@@ -78,6 +78,8 @@ def _get_client(self, model_deployment_name: str) -> Client:
                         host_organization + "OrgId", None
                     ),  # OpenAI, GooseAI, Microsoft
                     "lock_file_path": lambda: os.path.join(self.cache_path, f"{host_organization}.lock"),  # Microsoft
+                    "project_id": lambda: self.credentials.get(host_organization + "ProjectId", None),  # VertexAI
+                    "location": lambda: self.credentials.get(host_organization + "Location", None),  # VertexAI
                 },
             )
             client = create_object(client_spec)
diff --git a/src/helm/proxy/clients/client.py b/src/helm/proxy/clients/client.py
index ee632365ff..fd641ec780 100644
--- a/src/helm/proxy/clients/client.py
+++ b/src/helm/proxy/clients/client.py
@@ -125,7 +125,7 @@ def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
         "together",
     ]:
         return token.replace("▁", " ")
-    elif tokenizer_name is not None and tokenizer_name.startswith("huggingface"):
+    elif tokenizer_name is not None and (tokenizer_name.startswith("huggingface") or tokenizer_name.endswith("gpt2")):
         return token.replace("Ġ", " ")
     return token
 
diff --git a/src/helm/proxy/clients/cohere_client.py b/src/helm/proxy/clients/cohere_client.py
index 1e1f121917..1f8e656e7d 100644
--- a/src/helm/proxy/clients/cohere_client.py
+++ b/src/helm/proxy/clients/cohere_client.py
@@ -11,7 +11,6 @@
     Sequence,
     Token,
 )
-from helm.benchmark.model_deployment_registry import get_model_deployments_by_host_organization
 from .client import CachingClient, truncate_sequence
 from .cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION
 
@@ -43,8 +42,6 @@ def make_request(self, request: Request) -> RequestResult:
             # so `max_tokens` has to be greater than 0 when `return_likelihoods` is set to "GENERATION".
             assert request.max_tokens > 0, "max_tokens can only be 0 if echo_prompt=True"
 
-        # model: "Currently available models are small, medium, large, xlarge"
-        assert request.model_deployment in get_model_deployments_by_host_organization("cohere")
         # temperature: "min value of 0.0, max value of 5.0"
         assert 0.0 <= request.temperature <= 5.0, f"Invalid temperature: {request.temperature}. Valid range: [0,5]"
         # num_generations: "min value of 1, max value of 5"
diff --git a/src/helm/proxy/clients/huggingface_client.py b/src/helm/proxy/clients/huggingface_client.py
index 7370aeaaab..afb914d6bb 100644
--- a/src/helm/proxy/clients/huggingface_client.py
+++ b/src/helm/proxy/clients/huggingface_client.py
@@ -1,6 +1,6 @@
 from copy import deepcopy
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM
 from transformers.generation.stopping_criteria import (
     StoppingCriteria,
     StoppingCriteriaList,
@@ -18,7 +18,7 @@
     Token,
 )
 from .client import CachingClient, truncate_sequence
-from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, resolve_alias
+from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer, resolve_alias
 from threading import Lock
 
 
@@ -39,29 +39,27 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
 class HuggingFaceServer:
     """A thin wrapper around a Hugging Face AutoModelForCausalLM for HuggingFaceClient to call."""
 
-    def __init__(self, pretrained_model_name_or_path: str, revision: Optional[str] = None):
+    def __init__(self, pretrained_model_name_or_path: str, **kwargs):
         if torch.cuda.is_available():
             hlog("CUDA is available, initializing with a GPU...")
             self.device: str = "cuda:0"
         else:
             self.device = "cpu"
-        model_kwargs = {}
-        if revision:
-            model_kwargs["revision"] = revision
         with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
             # WARNING this may fail if your GPU does not have enough memory
             self.model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_name_or_path, trust_remote_code=True, **model_kwargs
+                pretrained_model_name_or_path, trust_remote_code=True, **kwargs
             ).to(self.device)
         with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"):
-            self.tokenizer: AutoTokenizer = HuggingFaceTokenizer.create_tokenizer(
-                pretrained_model_name_or_path, revision
+            self.wrapped_tokenizer: WrappedPreTrainedTokenizer = HuggingFaceTokenizer.create_tokenizer(
+                pretrained_model_name_or_path, **kwargs
             )
 
     def serve_request(self, raw_request: Dict[str, Any]):
-        encoded_input = self.tokenizer(raw_request["prompt"], return_tensors="pt", return_token_type_ids=False).to(
-            self.device
-        )
+        with self.wrapped_tokenizer as tokenizer:
+            encoded_input = tokenizer(raw_request["prompt"], return_tensors="pt", return_token_type_ids=False).to(
+                self.device
+            )
         raw_request = deepcopy(raw_request)
         raw_request["do_sample"] = True
         raw_request["return_dict_in_generate"] = True
@@ -70,9 +68,10 @@ def serve_request(self, raw_request: Dict[str, Any]):
         del raw_request["top_k_per_token"]
         stopping_criteria: Optional[StoppingCriteriaList] = None
         if len(raw_request["stop_sequences"]) > 0:
-            stop_sequence_ids = self.tokenizer(
-                raw_request["stop_sequences"], return_token_type_ids=False, add_special_tokens=False
-            )
+            with self.wrapped_tokenizer as tokenizer:
+                stop_sequence_ids = tokenizer(
+                    raw_request["stop_sequences"], return_token_type_ids=False, add_special_tokens=False
+                )
             if len(stop_sequence_ids.input_ids) == 1 and len(stop_sequence_ids.input_ids[0]) == 1:
                 raw_request["eos_token_id"] = stop_sequence_ids.input_ids[0][0]
             else:
@@ -81,63 +80,99 @@ def serve_request(self, raw_request: Dict[str, Any]):
                     stopping_criteria.append(StopAtSpecificTokenCriteria(stop_sequence=stop_sequence_input_ids))
             del raw_request["stop_sequences"]
 
-        # Strip out irrelevant parameters
-        relevant_raw_request = {
-            key: raw_request[key]
-            for key in raw_request
-            if key not in ["engine", "prompt", "echo_prompt", "stop_sequences"]
-        }
-
-        # Use HuggingFace's `generate` method.
-        output = self.model.generate(
-            **encoded_input,
-            **relevant_raw_request,
-            stopping_criteria=stopping_criteria,
+        # Check if we need to compute the perplexity of the prompt (#1497)
+        compute_logprobs_only = (
+            raw_request["max_new_tokens"] == 0
+            and raw_request["num_return_sequences"] == 1
+            and raw_request["echo_prompt"]
         )
-        sequences = output.sequences
-        scores = output.scores
 
-        # Compute logprobs for each completed sequence.
-        all_logprobs_of_chosen_tokens = []
-        all_top_logprobs_dicts = []
+        # Use HuggingFace's `generate` method.
+        if compute_logprobs_only:
+            with torch.no_grad():
+                output = self.model(encoded_input["input_ids"])
+            sequences = encoded_input["input_ids"]
+            scores = output.logits
+        else:
+            # Strip out irrelevant parameters
+            relevant_raw_request = {
+                key: raw_request[key]
+                for key in raw_request
+                if key not in ["engine", "prompt", "echo_prompt", "stop_sequences"]
+            }
+
+            output = self.model.generate(
+                **encoded_input,
+                **relevant_raw_request,
+                stopping_criteria=stopping_criteria,
+            )
+            sequences = output.sequences
+            scores = output.scores
+
+        prompt_tokens_logprobs = []
+        prompt_tokens_top_logprobs_dicts: List[Dict] = []
+        if compute_logprobs_only:
+            # Append the logprob of the first token of the prompt.
+            prompt_tokens_logprobs.append(0.0)
+            prompt_tokens_top_logprobs_dicts.append({})
+
+            # Compute logprobs of prompt tokens.
+            for completion_id in range(raw_request["num_return_sequences"]):
+                for i in range(len(sequences[completion_id]) - 1):
+                    logprobs = torch.nn.functional.log_softmax(scores[completion_id][i], dim=0)
+                    topk_logprobs = torch.topk(logprobs, k=top_k_per_token)
+                    with self.wrapped_tokenizer as tokenizer:
+                        prompt_tokens_top_logprobs_dicts.append(
+                            {
+                                tokenizer.convert_ids_to_tokens(k.item()): v.item()
+                                for (k, v) in zip(topk_logprobs.indices, topk_logprobs.values)
+                            }
+                        )
+                    prompt_tokens_logprobs.append(logprobs[sequences[completion_id][i + 1]].item())
+
+        # Compute logprobs of generated tokens for each completed sequence.
+        all_generated_tokens_logprobs = []
+        all_generated_tokens_top_logprobs_dicts = []
         for completion_id in range(raw_request["num_return_sequences"]):
-            logprobs_of_chosen_tokens = []
-            top_logprobs_dicts = []
+            generated_tokens_logprobs = []
+            generated_tokens_top_logprobs_dicts = []
             for i in range(len(sequences[completion_id]) - len(encoded_input.input_ids[0])):
                 logprobs = torch.nn.functional.log_softmax(scores[i][completion_id], dim=0)
-
                 # Get top tokens in terms of log probability.
                 topk_logprobs = torch.topk(logprobs, k=top_k_per_token)
-                top_logprobs_dicts.append(
-                    {
-                        self.tokenizer.convert_ids_to_tokens(k.item()): v.item()
-                        for (k, v) in zip(topk_logprobs.indices, topk_logprobs.values)
-                    }
-                )
-
+                with self.wrapped_tokenizer as tokenizer:
+                    generated_tokens_top_logprobs_dicts.append(
+                        {
+                            tokenizer.convert_ids_to_tokens(k.item()): v.item()
+                            for (k, v) in zip(topk_logprobs.indices, topk_logprobs.values)
+                        }
+                    )
                 # Get log probability of chosen token.
                 j = i + len(encoded_input.input_ids[0])
-                logprobs_of_chosen_tokens.append(logprobs[sequences[completion_id][j]].item())
-            all_logprobs_of_chosen_tokens.append(logprobs_of_chosen_tokens)
-            all_top_logprobs_dicts.append(top_logprobs_dicts)
+                generated_tokens_logprobs.append(logprobs[sequences[completion_id][j]].item())
+            all_generated_tokens_logprobs.append(generated_tokens_logprobs)
+            all_generated_tokens_top_logprobs_dicts.append(generated_tokens_top_logprobs_dicts)
 
         # Remove prompt from the start of each sequence if echo_prompt is False.
         if not raw_request["echo_prompt"]:
             sequences = [sequence[len(encoded_input.input_ids[0]) :] for sequence in sequences]
 
-        all_tokens = [[self.tokenizer.decode(token) for token in sequence_tokens] for sequence_tokens in sequences]
-        all_decoded_text = self.tokenizer.batch_decode(sequences)
+        with self.wrapped_tokenizer as tokenizer:
+            all_tokens = [[tokenizer.decode(token) for token in sequence_tokens] for sequence_tokens in sequences]
+            all_decoded_text = tokenizer.batch_decode(sequences)
 
         completions = []
-        for decoded_text, tokens, logprobs_of_chosen_tokens, top_logprobs_dicts in zip(
-            all_decoded_text, all_tokens, all_logprobs_of_chosen_tokens, all_top_logprobs_dicts
+        for decoded_text, tokens, generated_tokens_logprobs, generated_tokens_top_logprobs_dicts in zip(
+            all_decoded_text, all_tokens, all_generated_tokens_logprobs, all_generated_tokens_top_logprobs_dicts
         ):
             completions.append(
                 {
                     "text": decoded_text,
                     "tokens": tokens,
-                    "logprobs": logprobs_of_chosen_tokens,
-                    "top_logprobs_dicts": top_logprobs_dicts,
+                    "logprobs": generated_tokens_logprobs,
+                    "top_logprobs_dicts": generated_tokens_top_logprobs_dicts,
+                    "prompt_logprobs": prompt_tokens_logprobs,
+                    "prompt_top_logprobs_dicts": prompt_tokens_top_logprobs_dicts,
                 }
             )
 
@@ -151,7 +186,7 @@ class HuggingFaceServerFactory:
     _servers_lock: Lock = Lock()
 
     @staticmethod
-    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None) -> Any:
+    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, **kwargs) -> Any:
         """
         Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
         Returns the HuggingFaceModel.
@@ -159,26 +194,46 @@ def get_server(helm_model_name: str, pretrained_model_name_or_path: str, revisio
         with HuggingFaceServerFactory._servers_lock:
             if helm_model_name not in HuggingFaceServerFactory._servers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM model {helm_model_name} with Hugging Face Transformers"
                 ):
                     HuggingFaceServerFactory._servers[helm_model_name] = HuggingFaceServer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )
 
         return HuggingFaceServerFactory._servers[helm_model_name]
 
 
+TORCH_DTYPE_KEY = "torch_dtype"
+TORCH_DTYPE_VALUE_PREFIX = "torch."
+
+
+def _process_huggingface_client_kwargs(raw_kwargs: Dict[str, Any]):
+    """Process the kwargs for HuggingFaceClient.
+
+    The kwargs passed to HuggingFaceClient will eventually be passed to AutoModel.from_pretrained().
+    Since the kwargs from HuggingFaceClient may be derived from configuration YAML,
+    they may contain primitive types instead of the unserializable types that
+    AutoModel.from_pretrained() expects (e.g. torch_dtype). This function converts values of
+    primitive types to values of the unserializable types. Raises ValueError on an unknown torch_dtype string."""
+    processed_kwargs = deepcopy(raw_kwargs)
+
+    # Convert a torch_dtype string to the actual dtype, e.g. "torch.bfloat16" becomes torch.bfloat16
+    torch_dtype = processed_kwargs.get(TORCH_DTYPE_KEY)
+    if torch_dtype and isinstance(torch_dtype, str):
+        if torch_dtype.startswith(TORCH_DTYPE_VALUE_PREFIX):
+            processed_kwargs[TORCH_DTYPE_KEY] = getattr(torch, torch_dtype[len(TORCH_DTYPE_VALUE_PREFIX) :], None)
+        if not isinstance(processed_kwargs[TORCH_DTYPE_KEY], torch.dtype):
+            raise ValueError(f'Unknown dtype "{torch_dtype}"; expected a string such as "torch.bfloat16"')
+
+    return processed_kwargs
+
+
 class HuggingFaceClient(CachingClient):
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = _process_huggingface_client_kwargs(kwargs)
 
     def make_request(self, request: Request) -> RequestResult:
         # Embedding not supported for this model
@@ -205,7 +260,7 @@ def make_request(self, request: Request) -> RequestResult:
         huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server(
             helm_model_name=request.model_deployment,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )
 
         try:
@@ -227,8 +282,18 @@ def do_it():
             if request.echo_prompt:
                 # Add prompt to list of generated tokens.
                 generated_tokens = raw_completion["tokens"][response["input_length"] :]
-                for token_text in raw_completion["tokens"][: response["input_length"]]:
-                    tokens.append(Token(text=token_text, logprob=0.0, top_logprobs={}))
+                if raw_completion.get("prompt_logprobs") and raw_completion.get("prompt_top_logprobs_dicts"):
+                    for token_text, logprob, top_logprobs_dict in zip(
+                        raw_completion["tokens"][: response["input_length"]],
+                        raw_completion["prompt_logprobs"][: response["input_length"]],
+                        raw_completion["prompt_top_logprobs_dicts"][: response["input_length"]],
+                    ):
+                        tokens.append(Token(text=token_text, logprob=logprob, top_logprobs=top_logprobs_dict))
+                        sequence_logprob += logprob
+                else:
+                    for token_text in raw_completion["tokens"][: response["input_length"]]:
+                        tokens.append(Token(text=token_text, logprob=0.0, top_logprobs={}))
+
             else:
                 generated_tokens = raw_completion["tokens"]
 
diff --git a/src/helm/proxy/clients/lit_gpt_client.py b/src/helm/proxy/clients/lit_gpt_client.py
index 86c180e5ea..f864b2f289 100644
--- a/src/helm/proxy/clients/lit_gpt_client.py
+++ b/src/helm/proxy/clients/lit_gpt_client.py
@@ -89,6 +89,7 @@ class LitGPTClient(CachingClient):
     def __init__(
         self,
         tokenizer: Tokenizer,
+        tokenizer_name: str,
         cache_config: CacheConfig,
         checkpoint_dir: Path = Path(""),
         precision: str = "bf16-true",
diff --git a/src/helm/proxy/clients/megatron_client.py b/src/helm/proxy/clients/megatron_client.py
index 712ea67610..ee699f9c1e 100644
--- a/src/helm/proxy/clients/megatron_client.py
+++ b/src/helm/proxy/clients/megatron_client.py
@@ -26,9 +26,10 @@ class MegatronClient(CachingClient):
     https://github.com/NVIDIA/Megatron-LM#gpt-text-generation
     """
 
-    def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig):
+    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
 
     def _send_request(self, raw_request: Dict[str, Any]) -> Dict[str, Any]:
         response = requests.request(
@@ -48,7 +49,7 @@ def _send_request(self, raw_request: Dict[str, Any]) -> Dict[str, Any]:
         return out
 
     def _tokenize_response(self, text: str) -> List[Token]:
-        tokenized_text = self.tokenizer.tokenize(TokenizationRequest(text, tokenizer="huggingface/gpt2"))
+        tokenized_text = self.tokenizer.tokenize(TokenizationRequest(text, tokenizer=self.tokenizer_name))
 
         # TODO(tgale): Support logprobs.
         tokens = [Token(text=str(token), logprob=0, top_logprobs={}) for token in tokenized_text.raw_tokens]
diff --git a/src/helm/proxy/clients/openai_client.py b/src/helm/proxy/clients/openai_client.py
index 0cb355fd76..4fea10f678 100644
--- a/src/helm/proxy/clients/openai_client.py
+++ b/src/helm/proxy/clients/openai_client.py
@@ -15,7 +15,6 @@
 
 try:
     import openai
-    import tiktoken
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["openai"])
 
@@ -29,12 +28,14 @@ class OpenAIClient(CachingClient):
     def __init__(
         self,
         tokenizer: Tokenizer,
+        tokenizer_name: str,
         cache_config: CacheConfig,
         api_key: Optional[str] = None,
         org_id: Optional[str] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
         self.org_id: Optional[str] = org_id
         self.api_key: Optional[str] = api_key
         self.api_base: str = "https://api.openai.com/v1"
@@ -156,9 +157,7 @@ def do_it():
                 text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
                 # The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
                 tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
-                    TokenizationRequest(
-                        text, tokenizer="openai/" + tiktoken.encoding_for_model(request.model_engine).name
-                    )
+                    TokenizationRequest(text, tokenizer=self.tokenizer_name)
                 )
                 # Log probs are not currently not supported by the OpenAI chat completion API, so set to 0 for now.
                 tokens = [
diff --git a/src/helm/proxy/clients/palmyra_client.py b/src/helm/proxy/clients/palmyra_client.py
index aed6a69a9b..9fa87c70d2 100644
--- a/src/helm/proxy/clients/palmyra_client.py
+++ b/src/helm/proxy/clients/palmyra_client.py
@@ -28,10 +28,11 @@ def _is_content_moderation_failure(response: Dict) -> bool:
 
 
 class PalmyraClient(CachingClient):
-    def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig, api_key: str):
+    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, api_key: str):
         super().__init__(cache_config=cache_config)
         self.api_key: str = api_key
         self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
 
     def _send_request(self, model_name: str, raw_request: Dict[str, Any]) -> Dict[str, Any]:
         response = requests.request(
@@ -123,7 +124,7 @@ def do_it():
             # The Writer API doesn't return us tokens or logprobs, so we tokenize ourselves.
             tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
                 # Writer uses the GPT-2 tokenizer
-                TokenizationRequest(text, tokenizer="huggingface/gpt2")
+                TokenizationRequest(text, tokenizer=self.tokenizer_name)
             )
 
             # Log probs are not currently not supported by the Writer, so set to 0 for now.
diff --git a/src/helm/proxy/clients/test_huggingface_client.py b/src/helm/proxy/clients/test_huggingface_client.py
index 460756123b..059c9c1777 100644
--- a/src/helm/proxy/clients/test_huggingface_client.py
+++ b/src/helm/proxy/clients/test_huggingface_client.py
@@ -47,3 +47,25 @@ def test_gptj_6b(self):
             )
         )
         assert len(result.completions) == 3
+
+    def test_logprob(self):
+        prompt: str = "I am a computer scientist."
+        result: RequestResult = self.client.make_request(
+            Request(
+                model="openai/gpt2",
+                model_deployment="huggingface/gpt2",
+                prompt=prompt,
+                num_completions=1,
+                max_tokens=0,
+                echo_prompt=True,
+            )
+        )
+        assert result.completions[0].text.startswith(
+            prompt
+        ), "echo_prompt was set to true. Expected the prompt at the beginning of each completion"
+        total_logprob: float = 0
+        assert len(result.completions[0].tokens) == 6, "Expected 6 tokens in the completion"
+        for token in result.completions[0].tokens[1:]:
+            assert token.logprob != 0
+            total_logprob += token.logprob
+        assert result.completions[0].logprob == pytest.approx(total_logprob)
diff --git a/src/helm/proxy/clients/together_client.py b/src/helm/proxy/clients/together_client.py
index d07669ee5a..b91ebf99a3 100644
--- a/src/helm/proxy/clients/together_client.py
+++ b/src/helm/proxy/clients/together_client.py
@@ -42,6 +42,7 @@
     "llama-2-13b": "togethercomputer/llama-2-13b",
     "llama-2-70b": "togethercomputer/llama-2-70b",
     "mistral-7b-v0.1": "mistralai/Mistral-7B-v0.1",
+    "mixtral-8x7b-32kseqlen": "mistralai/mixtral-8x7b-32kseqlen",
     "mpt-30b": "togethercomputer/mpt-30b",
     "mpt-instruct-30b": "togethercomputer/mpt-30b-instruct",
     "pythia-1b-v0": "EleutherAI/pythia-1b-v0",
@@ -50,6 +51,8 @@
     "pythia-12b-v0": "EleutherAI/pythia-12b-v0",
     "vicuna-7b-v1.3": "lmsys/vicuna-7b-v1.3",
     "vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
+    "yi-6b": "zero-one-ai/Yi-6B",
+    "yi-34b": "zero-one-ai/Yi-34B",
 }
 """Together model name aliases.
 
@@ -106,6 +109,10 @@ class _RewriteRequestTags:
 (e.g. "stanford/alpaca-7b") or the Together model name (e.g. "togethercomputer/alpaca-7b")."""
 
 
+TOGETHER_SUPPORTS_ASYNC_REQUESTS = False
+"""Whether Together AI currently supports asynchronous requests."""
+
+
 def _rewrite_raw_request_for_model_tags(raw_request: Dict[str, Any], model_engine: str) -> Dict[str, Any]:
     """Rewrite the raw request given the model."""
     # Make a deepcopy to avoid mutating the input in unexpected ways
@@ -182,8 +189,7 @@ def make_request(self, request: Request) -> RequestResult:
             raise TogetherClientError("togetherApiKey not set in credentials.conf")
         headers: Dict[str, str] = {"Authorization": f"Bearer {self.api_key}"}
 
-        # TODO: Remove synchronous branch.
-        if request.model_engine in MODEL_ALIASES:
+        if TOGETHER_SUPPORTS_ASYNC_REQUESTS:
 
             def submit_job() -> str:
                 submit_request = {**raw_request, "async": True}
diff --git a/src/helm/proxy/clients/vertexai_client.py b/src/helm/proxy/clients/vertexai_client.py
new file mode 100644
index 0000000000..6098154cd4
--- /dev/null
+++ b/src/helm/proxy/clients/vertexai_client.py
@@ -0,0 +1,115 @@
+import requests
+from typing import List
+
+from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+)
+from helm.proxy.tokenizers.tokenizer import Tokenizer
+from .client import CachingClient, truncate_sequence
+
+try:
+    import vertexai
+    from vertexai.language_models import TextGenerationModel, TextGenerationResponse
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["google"])
+
+
+class VertexAIClient(CachingClient):
+    def __init__(
+        self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig, project_id: str, location: str
+    ) -> None:
+        super().__init__(cache_config=cache_config)
+        self.project_id = project_id
+        self.location = location
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+
+        vertexai.init(project=self.project_id, location=self.location)
+
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request"""
+        parameters = {
+            "temperature": request.temperature,
+            "max_output_tokens": request.max_tokens,
+            "top_k": request.top_k_per_token,
+            "top_p": request.top_p,
+            "stop_sequences": request.stop_sequences,
+            "candidate_count": request.num_completions,
+            # TODO #2084: Add support for these parameters.
+            # The parameters "echo", "frequency_penalty", and "presence_penalty" are supposed to be supported
+            # in an HTTP request (See https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text),
+            # but they are not supported in the Python SDK:
+            # https://github.com/googleapis/python-aiplatform/blob/beae48f63e40ea171c3f1625164569e7311b8e5a/vertexai/language_models/_language_models.py#L968C1-L980C1
+            # "frequency_penalty": request.frequency_penalty,
+            # "presence_penalty": request.presence_penalty,
+            # "echo": request.echo_prompt,
+        }
+
+        completions: List[Sequence] = []
+        model_name: str = request.model_engine
+
+        try:
+
+            def do_it():
+                """Query the model and return {"predictions": [{"text": ...}, ...]}, one entry per candidate."""
+                model = TextGenerationModel.from_pretrained(model_name)
+                response = model.predict(request.prompt, **parameters)
+                candidates: List[TextGenerationResponse] = response.candidates
+                # Build one dict per candidate; a dict comprehension here would keep only the last candidate.
+                # TODO: Extract more information from the response
+                return {"predictions": [{"text": completion.text} for completion in candidates]}
+
+            # We need to include the engine's name to differentiate among requests made for different model
+            # engines since the engine name is not included in the request itself.
+            # Same for the prompt.
+            cache_key = CachingClient.make_cache_key(
+                {
+                    "engine": request.model_engine,
+                    "prompt": request.prompt,
+                    **parameters,
+                },
+                request,
+            )
+
+            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        except (requests.exceptions.RequestException, AssertionError) as e:
+            error: str = f"VertexAIClient error: {e}"
+            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+        for prediction in response["predictions"]:
+            response_text = prediction["text"]
+
+            # The Python SDK does not support echo
+            # TODO #2084: Add support for echo.
+            text: str = request.prompt + response_text if request.echo_prompt else response_text
+
+            tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
+                TokenizationRequest(text, tokenizer=self.tokenizer_name)
+            )
+
+            # TODO #2085: Add support for log probs.
+            # Once again, log probs seem to be supported by the API but not by the Python SDK.
+            # HTTP Response body reference:
+            # https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text#response_body
+            # Python SDK reference:
+            # https://github.com/googleapis/python-aiplatform/blob/beae48f63e40ea171c3f1625164569e7311b8e5a/vertexai/language_models/_language_models.py#L868
+            tokens: List[Token] = [
+                Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
+            ]
+
+            completion = Sequence(text=response_text, logprob=0, tokens=tokens)
+            sequence = truncate_sequence(completion, request, print_warning=True)
+            completions.append(sequence)
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
diff --git a/src/helm/proxy/clients/vision_language/idefics_client.py b/src/helm/proxy/clients/vision_language/idefics_client.py
index 88c1988228..df692b9e56 100644
--- a/src/helm/proxy/clients/vision_language/idefics_client.py
+++ b/src/helm/proxy/clients/vision_language/idefics_client.py
@@ -54,9 +54,10 @@ class IDEFICSClient(CachingClient):
     END_OF_UTTERANCE_TOKEN: str = "<end_of_utterance>"
     BAD_WORD_TOKENS: List[str] = ["<image>", "<fake_token_around_image>"]
 
-    def __init__(self, tokenizer: Tokenizer, cache_config: CacheConfig):
+    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, cache_config: CacheConfig):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
         self._device: str = get_torch_device_name()
 
     def _get_model(self, checkpoint: str) -> LoadedIDEFICSModelProcessor:
@@ -142,7 +143,7 @@ def do_it():
         # TODO: Does it make sense to support echo? Include these params in the cache key.
         # TODO: Together might support this model so use the TogetherClient
         tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
-            TokenizationRequest(result["output"], tokenizer=request.model)
+            TokenizationRequest(result["output"], tokenizer=self.tokenizer_name)
         )
         tokens: List[Token] = [
             Token(text=str(text), logprob=0, top_logprobs={}) for text in tokenization_result.raw_tokens
diff --git a/src/helm/proxy/retry.py b/src/helm/proxy/retry.py
index 752f9a8bc4..8a851f4f99 100644
--- a/src/helm/proxy/retry.py
+++ b/src/helm/proxy/retry.py
@@ -41,11 +41,13 @@ def wait(attempts: int, delay: float) -> float:
         Wait function to pass into `Retrying` that logs and returns the amount of time to sleep
         depending on the number of attempts and delay (in milliseconds).
         """
+        del delay  # unused
+        next_delay = 2**attempts * wait_exponential_multiplier_seconds * 1000
         hlog(
-            f"{operation} failed. Retrying (attempt #{attempts + 1}) in {delay // 1000} seconds... "
+            f"{operation} failed. Retrying (attempt #{attempts + 1}) in {next_delay // 1000} seconds... "
             "(See above for error details)"
         )
-        return _retrying.exponential_sleep(attempts, delay)
+        return next_delay
 
     def print_exception_and_traceback(exception: Exception) -> bool:
         """
@@ -85,3 +87,7 @@ def retry_if_request_failed(result: Union[RequestResult, TokenizationRequestResu
 retry_request: Callable = get_retry_decorator(
     "Request", max_attempts=5, wait_exponential_multiplier_seconds=5, retry_on_result=retry_if_request_failed
 )
+
+retry_tokenizer_request: Callable = get_retry_decorator(
+    "Request", max_attempts=5, wait_exponential_multiplier_seconds=1, retry_on_result=retry_if_request_failed
+)
diff --git a/src/helm/proxy/server.py b/src/helm/proxy/server.py
index 8500b82371..25c9585281 100644
--- a/src/helm/proxy/server.py
+++ b/src/helm/proxy/server.py
@@ -16,6 +16,10 @@
 from dacite import from_dict
 import bottle
 
+from helm.benchmark.config_registry import (
+    register_configs_from_directory,
+    register_builtin_configs_from_helm_package,
+)
 from helm.common.authentication import Authentication
 from helm.common.hierarchical_logger import hlog
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -225,6 +229,9 @@ def main():
     )
     args = parser.parse_args()
 
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.base_path)
+
     service = ServerService(base_path=args.base_path, mongo_uri=args.mongo_uri)
 
     gunicorn_args = {
diff --git a/src/helm/proxy/services/server_service.py b/src/helm/proxy/services/server_service.py
index ed1ae76243..098ceeed3a 100644
--- a/src/helm/proxy/services/server_service.py
+++ b/src/helm/proxy/services/server_service.py
@@ -1,3 +1,4 @@
+import dataclasses
 import os
 import signal
 from typing import List, Optional
@@ -58,7 +59,10 @@ def __init__(self, base_path: str = "prod_env", root_mode=False, mongo_uri: str
         self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
 
     def get_general_info(self) -> GeneralInfo:
-        return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=ALL_MODELS_METADATA)
+        # Can't send release_dates in ModelMetadata because dates cannot be round-tripped to and from JSON easily.
+        # TODO(#2158): Either fix this or delete get_general_info.
+        all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
+        return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
 
     def get_window_service_info(self, model_name) -> WindowServiceInfo:
         # The import statement is placed here to avoid two problems, please refer to the link for details
diff --git a/src/helm/proxy/token_counters/openai_token_counter.py b/src/helm/proxy/token_counters/openai_token_counter.py
index ec672af504..42681e8ca7 100644
--- a/src/helm/proxy/token_counters/openai_token_counter.py
+++ b/src/helm/proxy/token_counters/openai_token_counter.py
@@ -16,7 +16,7 @@ def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
         https://community.openai.com/t/how-do-i-calculate-the-pricing-for-generation-of-text/11662/5
         """
         tokenized_prompt: TokenizationRequestResult = self.huggingface_tokenizer.tokenize(
-            TokenizationRequest(request.prompt)
+            TokenizationRequest(request.prompt, tokenizer="huggingface/gpt2")
         )
         # Number of tokens in the prompt + number of tokens in all the completions
         return len(tokenized_prompt.tokens) + sum([len(sequence.tokens) for sequence in completions])
diff --git a/src/helm/proxy/tokenizers/anthropic_tokenizer.py b/src/helm/proxy/tokenizers/anthropic_tokenizer.py
index 629ed55908..3eb6ca581d 100644
--- a/src/helm/proxy/tokenizers/anthropic_tokenizer.py
+++ b/src/helm/proxy/tokenizers/anthropic_tokenizer.py
@@ -15,6 +15,9 @@
 
 class AnthropicTokenizer(CachingTokenizer):
     LOCK: threading.Lock = threading.Lock()
+    """Global lock for the Anthropic tokenizer.
+
+    The Anthropic tokenizer is a wrapper around a single global Hugging Face tokenizer, which is thread-hostile."""
 
     def __init__(self, cache_config: CacheConfig) -> None:
         super().__init__(cache_config)
@@ -26,21 +29,24 @@ def __init__(self, cache_config: CacheConfig) -> None:
     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
         if request["encode"]:
             if request["truncation"]:
-                tokens = self._tokenizer.encode(
-                    request["text"],
-                    truncation=request["truncation"],
-                    max_length=request["max_length"],
-                    add_special_tokens=False,
-                )
+                with AnthropicTokenizer.LOCK:
+                    tokens = self._tokenizer.encode(
+                        request["text"],
+                        truncation=request["truncation"],
+                        max_length=request["max_length"],
+                        add_special_tokens=False,
+                    )
             else:
-                tokens = self._tokenizer.encode(request["text"], add_special_tokens=False)
+                with AnthropicTokenizer.LOCK:
+                    tokens = self._tokenizer.encode(request["text"], add_special_tokens=False)
         else:
             # No encoding, just return the token strings
             tokens = [self._tokenizer.convert_tokens_to_string([i]) for i in self._tokenizer.tokenize(request["text"])]
         return {"tokens": tokens}
 
     def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        text = self._tokenizer.decode(
-            request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
-        )
+        with AnthropicTokenizer.LOCK:
+            text = self._tokenizer.decode(
+                request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
+            )
         return {"text": text}
diff --git a/src/helm/proxy/tokenizers/auto_tokenizer.py b/src/helm/proxy/tokenizers/auto_tokenizer.py
index 5722a7e97b..64aa415c3c 100644
--- a/src/helm/proxy/tokenizers/auto_tokenizer.py
+++ b/src/helm/proxy/tokenizers/auto_tokenizer.py
@@ -9,6 +9,7 @@
 from helm.common.cache import CacheConfig
 from helm.common.hierarchical_logger import hlog
 from helm.common.object_spec import create_object, inject_object_spec_args
+from helm.proxy.retry import retry_tokenizer_request
 from helm.common.tokenization_request import (
     DecodeRequest,
     DecodeRequestResult,
@@ -46,9 +47,13 @@ def _get_tokenizer(self, tokenizer_name: str) -> Tokenizer:
                 constant_bindings={"cache_config": cache_config},
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, organization),
+                    "project_id": lambda: self.credentials.get(organization + "ProjectId", None),  # VertexAI
+                    "location": lambda: self.credentials.get(organization + "Location", None),  # VertexAI
                 },
             )
             tokenizer = create_object(tokenizer_spec)
+        else:
+            hlog(f"No tokenizer config for {tokenizer_name}")
 
         # Cache the tokenizer
         assert isinstance(tokenizer, Tokenizer)  # To make mypy happy
@@ -59,6 +64,7 @@ def _get_tokenizer(self, tokenizer_name: str) -> Tokenizer:
     def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
         """Tokenizes based on the name of the tokenizer (e.g., huggingface/gpt2)."""
 
+        @retry_tokenizer_request
         def tokenize_with_retry(tokenizer: Tokenizer, request: TokenizationRequest) -> TokenizationRequestResult:
             return tokenizer.tokenize(request)
 
@@ -75,6 +81,7 @@ def tokenize_with_retry(tokenizer: Tokenizer, request: TokenizationRequest) -> T
     def decode(self, request: DecodeRequest) -> DecodeRequestResult:
         """Decodes based on the the name of the tokenizer (e.g., huggingface/gpt2)."""
 
+        @retry_tokenizer_request
         def decode_with_retry(tokenizer: Tokenizer, request: DecodeRequest) -> DecodeRequestResult:
             return tokenizer.decode(request)
 
diff --git a/src/helm/proxy/tokenizers/caching_tokenizer.py b/src/helm/proxy/tokenizers/caching_tokenizer.py
index bf553df1f9..a19ba84da1 100644
--- a/src/helm/proxy/tokenizers/caching_tokenizer.py
+++ b/src/helm/proxy/tokenizers/caching_tokenizer.py
@@ -100,9 +100,15 @@ def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
             # Internal check of the type of the first token
             # This is to make sure that the tokenization is correct
             if request.encode and len(tokens) > 0:
-                assert type(tokens[0].value) == int
+                assert type(tokens[0].value) == int, (
+                    f"tokenize() returned strings instead of integers when encode is True: "
+                    f"request={request} repsonse={response}"
+                )
             elif not request.encode and len(tokens) > 0:
-                assert type(tokens[0].value) == str
+                assert type(tokens[0].value) == str, (
+                    f"tokenize() returned integers instead of strings when encode is False: "
+                    f"request={request} repsonse={response}"
+                )
 
             result = TokenizationRequestResult(
                 success=True,
diff --git a/src/helm/proxy/tokenizers/huggingface_tokenizer.py b/src/helm/proxy/tokenizers/huggingface_tokenizer.py
index d22212e999..b92d6c068b 100644
--- a/src/helm/proxy/tokenizers/huggingface_tokenizer.py
+++ b/src/helm/proxy/tokenizers/huggingface_tokenizer.py
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Optional
 from threading import Lock
 from helm.common.cache import CacheConfig
+from helm.common.concurrency import ThreadSafeWrapper
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
@@ -16,6 +17,10 @@
     "huggingface/gpt2": "gpt2",
     "huggingface/santacoder": "bigcode/santacoder",
     "huggingface/starcoder": "bigcode/starcoder",
+    "writer/gpt2": "gpt2",  # Palmyra models do not support echo
+    # So they have a different TokenizerConfig called "writer/gpt2"
+    # when in reality they use the same tokenizer as "huggingface/gpt2"
+    "microsoft/gpt2": "gpt2",  # Same as above
 }
 """Mapping of some HELM model names to Hugging Face pretrained model name."""
 
@@ -26,30 +31,36 @@ def resolve_alias(model_name: str) -> str:
     return _MODEL_NAME_ALIASES.get(model_name, model_name)
 
 
+WrappedPreTrainedTokenizer = ThreadSafeWrapper[PreTrainedTokenizerBase]
+"""Thread safe wrapper around Hugging Face PreTrainedTokenizerBase.
+
+Hugging Face PreTrainedTokenizerBase is thread-hostile and using it from multiple threads
+simultaneously can result in an "Already borrowed" error (#1421). This wrapper ensures
+that a lock is held when using the PreTrainedTokenizerBase.
+
+Example usage:
+
+    with wrapped_tokenizer as tokenizer:
+        tokenizer.encode("...")
+"""
+
+
 class HuggingFaceTokenizer(CachingTokenizer):
-    _tokenizers: Dict[str, PreTrainedTokenizerBase] = {}
+    _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
     _tokenizers_lock: Lock = Lock()
 
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = kwargs
 
     @staticmethod
-    def create_tokenizer(pretrained_model_name_or_path: str, revision: Optional[str] = None) -> PreTrainedTokenizerBase:
+    def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
         """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace."""
         # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"
 
-        tokenizer_kwargs = {}
-        if revision is not None:
-            tokenizer_kwargs["revision"] = revision
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".
@@ -60,19 +71,23 @@ def create_tokenizer(pretrained_model_name_or_path: str, revision: Optional[str]
             # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
             # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
             # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
-            return AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path, local_files_only=True, use_fast=True, **tokenizer_kwargs
+            return WrappedPreTrainedTokenizer(
+                AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
+                )
             )
         except OSError:
             hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
-            return AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path, local_files_only=False, use_fast=True, **tokenizer_kwargs
+            return WrappedPreTrainedTokenizer(
+                AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
+                )
             )
 
     @staticmethod
     def get_tokenizer(
-        helm_tokenizer_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
-    ) -> PreTrainedTokenizerBase:
+        helm_tokenizer_name: str, pretrained_model_name_or_path: str, **kwargs
+    ) -> WrappedPreTrainedTokenizer:
         """
         Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached.
         Returns the tokenizer.
@@ -80,42 +95,41 @@ def get_tokenizer(
         with HuggingFaceTokenizer._tokenizers_lock:
             if helm_tokenizer_name not in HuggingFaceTokenizer._tokenizers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM tokenizer {helm_tokenizer_name} with Hugging Face Transformers"
                 ):
                     # Keep the tokenizer in memory, so we don't recreate it for future requests
                     HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] = HuggingFaceTokenizer.create_tokenizer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )
         return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
 
-    def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> PreTrainedTokenizerBase:
+    def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrainedTokenizer:
         """Method used in both _tokenize_do_it and _decode_do_it to get the tokenizer."""
         pretrained_model_name_or_path: str
         if self._pretrained_model_name_or_path:
             pretrained_model_name_or_path = self._pretrained_model_name_or_path
         else:
             pretrained_model_name_or_path = resolve_alias(request["tokenizer"])
-        _tokenizer = HuggingFaceTokenizer.get_tokenizer(
+        return HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=request["tokenizer"],
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )
-        return _tokenizer
 
     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        _tokenizer = self._get_tokenizer_for_request(request)
-
         if request["encode"]:
             if request["truncation"]:
-                tokens = _tokenizer.encode(
-                    request["text"],
-                    truncation=request["truncation"],
-                    max_length=request["max_length"],
-                    add_special_tokens=False,
-                )
+                with self._get_tokenizer_for_request(request) as tokenizer:
+                    tokens = tokenizer.encode(
+                        request["text"],
+                        truncation=request["truncation"],
+                        max_length=request["max_length"],
+                        add_special_tokens=False,
+                    )
             else:
-                tokens = _tokenizer.encode(request["text"], add_special_tokens=False)
+                with self._get_tokenizer_for_request(request) as tokenizer:
+                    tokens = tokenizer.encode(request["text"], add_special_tokens=False)
         else:
             if "gpt" in request["tokenizer"] or request["tokenizer"] in [
                 "bigscience/bloom",
@@ -126,9 +140,10 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
                 # convert_tokens_to_string method. We prefer to use this method instead
                 # of the hacky cleanup_tokens method below as it might handle cases
                 # we haven't thought of in cleanup_tokens.
-                tokens = [
-                    _tokenizer.convert_tokens_to_string([token]) for token in _tokenizer.tokenize(request["text"])
-                ]
+                with self._get_tokenizer_for_request(request) as tokenizer:
+                    tokens = [
+                        tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"])
+                    ]
             else:
                 # Tokenizes the text and returns the tokens as a list of strings,
                 # not a list of token objects (otherwise "Hello world" would be"
@@ -138,14 +153,14 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
                 # But this replaces all the "▁" characters by "", which is not what we want.
                 # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"]
                 # Just like tokenize("Hello", encode=False) would return ["Hello"].
-                tokens = _tokenizer.tokenize(request["text"])
+                with self._get_tokenizer_for_request(request) as tokenizer:
+                    tokens = tokenizer.tokenize(request["text"])
                 tokens = cleanup_tokens(tokens, request["tokenizer"])
         return {"tokens": tokens}
 
     def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
-        _tokenizer = self._get_tokenizer_for_request(request)
-
-        text = _tokenizer.decode(
-            request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
-        )
+        with self._get_tokenizer_for_request(request) as tokenizer:
+            text = tokenizer.decode(
+                request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
+            )
         return {"text": text}
diff --git a/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py b/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py
index 6fdc178829..46157af858 100644
--- a/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py
+++ b/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py
@@ -3,6 +3,7 @@
 from typing import List
 
 from helm.common.cache import SqliteCacheConfig
+from helm.common.general import parallel_map
 from helm.common.tokenization_request import (
     DecodeRequest,
     DecodeRequestResult,
@@ -26,7 +27,7 @@ def teardown_method(self, method):
         os.remove(self.cache_path)
 
     def test_tokenize(self):
-        request = TokenizationRequest(text=self.TEST_PROMPT)
+        request = TokenizationRequest(text=self.TEST_PROMPT, tokenizer="anthropic/claude")
         result: TokenizationRequestResult = self.tokenizer.tokenize(request)
         assert not result.cached, "First time making the tokenize request. Result should not be cached"
         assert result.raw_tokens == self.TEST_TOKENS
@@ -35,7 +36,9 @@ def test_tokenize(self):
         assert result.raw_tokens == self.TEST_TOKENS
 
     def test_encode(self):
-        request = TokenizationRequest(text=self.TEST_PROMPT, encode=True, truncation=True, max_length=1)
+        request = TokenizationRequest(
+            text=self.TEST_PROMPT, tokenizer="anthropic/claude", encode=True, truncation=True, max_length=1
+        )
         result: TokenizationRequestResult = self.tokenizer.tokenize(request)
         assert not result.cached, "First time making the tokenize request. Result should not be cached"
         assert result.raw_tokens == [self.TEST_ENCODED[0]]
@@ -43,16 +46,37 @@ def test_encode(self):
         assert result.cached, "Result should be cached"
         assert result.raw_tokens == [self.TEST_ENCODED[0]]
 
-        request = TokenizationRequest(text=self.TEST_PROMPT, encode=True, truncation=True, max_length=1024)
+        request = TokenizationRequest(
+            text=self.TEST_PROMPT, tokenizer="anthropic/claude", encode=True, truncation=True, max_length=1024
+        )
         result = self.tokenizer.tokenize(request)
         assert not result.cached, "First time making this particular request. Result should not be cached"
         assert result.raw_tokens == self.TEST_ENCODED
 
     def test_decode(self):
-        request = DecodeRequest(tokens=self.TEST_ENCODED)
+        request = DecodeRequest(tokens=self.TEST_ENCODED, tokenizer="anthropic/claude")
         result: DecodeRequestResult = self.tokenizer.decode(request)
         assert not result.cached, "First time making the decode request. Result should not be cached"
         assert result.text == self.TEST_PROMPT
         result = self.tokenizer.decode(request)
         assert result.cached, "Result should be cached"
         assert result.text == self.TEST_PROMPT
+
+    def test_already_borrowed(self):
+        """Test workaround of the "Already borrowed" bug (#1421) caused by the thread-hostile Anthropic tokenizer,
+        which is a thin wrapper around a Hugging Face FastTokenizer"""
+
+        def make_tokenize_request(seed: int) -> None:
+            request_length = 10
+            truncation = bool(seed % 2)
+            self.tokenizer.tokenize(
+                # The truncation parameter requires setting a flag on the Rust FastTokenizer.
+                # Concurrent requests cause concurrent mutations, which results in a Rust concurrency error.
+                TokenizationRequest(
+                    text=str(seed) * request_length, tokenizer="anthropic/claude", encode=True, truncation=truncation
+                )
+            )
+
+        num_requests = 100
+        # Should not raise "Already borrowed" error
+        parallel_map(make_tokenize_request, list(range(num_requests)), parallelism=8)
diff --git a/src/helm/proxy/tokenizers/test_huggingface_tokenizer.py b/src/helm/proxy/tokenizers/test_huggingface_tokenizer.py
index d232a6f212..8cc994e05d 100644
--- a/src/helm/proxy/tokenizers/test_huggingface_tokenizer.py
+++ b/src/helm/proxy/tokenizers/test_huggingface_tokenizer.py
@@ -3,7 +3,7 @@
 from typing import Optional
 
 from helm.common.cache import SqliteCacheConfig
-from helm.common.general import singleton
+from helm.common.general import parallel_map, singleton
 from helm.common.tokenization_request import (
     DecodeRequest,
     DecodeRequestResult,
@@ -59,6 +59,24 @@ def test_decode(self):
         assert result.cached, "Result should be cached"
         assert result.text == "I am a computer scientist."
 
+    def test_already_borrowed(self):
+        """Test workaround of the "Already borrowed" bug (#1421) caused by the thread-hostile Hugging Face tokenizer"""
+
+        def make_tokenize_request(seed: int) -> None:
+            request_length = 10
+            truncation = bool(seed % 2)
+            self.tokenizer.tokenize(
+                # The truncation parameter requires setting a flag on the Rust FastTokenizer.
+                # Concurrent requests cause concurrent mutations, which results in a Rust concurrency error.
+                TokenizationRequest(
+                    text=str(seed) * request_length, tokenizer="huggingface/gpt2", encode=True, truncation=truncation
+                )
+            )
+
+        num_requests = 100
+        # Should not raise "Already borrowed" error
+        parallel_map(make_tokenize_request, list(range(num_requests)), parallelism=8)
+
 
 class TestHuggingFaceTokenizer:
     # The following prompt has 51 tokens according to the GPT-2 tokenizer
@@ -74,12 +92,13 @@ class TestHuggingFaceTokenizer:
     def verify_get_tokenizer(
         tokenizer_name: str, expected_num_tokens: int, pretrained_model_name_or_path: Optional[str] = None
     ):
-        tokenizer = HuggingFaceTokenizer.get_tokenizer(
+        wrapped_tokenizer = HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
         )
         assert tokenizer_name in HuggingFaceTokenizer._tokenizers, "Tokenizer should be cached"
-        assert len(tokenizer.encode(TestHuggingFaceTokenizer.TEST_PROMPT)) == expected_num_tokens
+        with wrapped_tokenizer as tokenizer:
+            assert len(tokenizer.encode(TestHuggingFaceTokenizer.TEST_PROMPT)) == expected_num_tokens
 
     def test_get_tokenizer_gpt2(self):
         TestHuggingFaceTokenizer.verify_get_tokenizer("huggingface/gpt2", 51, pretrained_model_name_or_path="gpt2")
@@ -107,7 +126,8 @@ def test_get_santacoder(self):
 
     def test_gpt2_tokenize_eos(self):
         eos_token: str = "<|endoftext|>"
-        tokenizer = HuggingFaceTokenizer.get_tokenizer("huggingface/gpt2", pretrained_model_name_or_path="gpt2")
-        token_ids = tokenizer.encode(eos_token)
-        assert singleton(token_ids) == 50256
-        assert tokenizer.decode(token_ids) == eos_token
+        wrapped_tokenizer = HuggingFaceTokenizer.get_tokenizer("huggingface/gpt2", pretrained_model_name_or_path="gpt2")
+        with wrapped_tokenizer as tokenizer:
+            token_ids = tokenizer.encode(eos_token)
+            assert singleton(token_ids) == 50256
+            assert tokenizer.decode(token_ids) == eos_token
diff --git a/src/helm/proxy/tokenizers/vertexai_tokenizer.py b/src/helm/proxy/tokenizers/vertexai_tokenizer.py
new file mode 100644
index 0000000000..4eac306a90
--- /dev/null
+++ b/src/helm/proxy/tokenizers/vertexai_tokenizer.py
@@ -0,0 +1,97 @@
+import base64
+import dataclasses
+import requests
+from typing import Any, Dict, List, Union, Optional
+
+from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.proxy.tokenizers.caching_tokenizer import CachingTokenizer
+from helm.proxy.retry import NonRetriableException
+
+try:
+    import google.auth
+    import google.auth.transport.requests
+    from google.auth.exceptions import DefaultCredentialsError
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["google"])
+
+
+class VertexAIAuthenticationException(NonRetriableException):
+    pass
+
+
+class VertexAITokenizer(CachingTokenizer):
+    """Google Vertex AI API for tokenization.
+
+    Doc: https://cloud.google.com/vertex-ai/docs/generative-ai/compute-token"""
+
+    def __init__(self, project_id: Optional[str], location: Optional[str], cache_config: CacheConfig) -> None:
+        super().__init__(cache_config)
+        if not project_id:
+            raise VertexAIAuthenticationException("credentials.conf is missing googleProjectId")
+        if not location:
+            raise VertexAIAuthenticationException("credentials.conf is missing googleLocation")
+        self.project_id = project_id
+        self.location = location
+        try:
+            creds, _ = google.auth.default(quota_project_id=self.project_id)
+            auth_req = google.auth.transport.requests.Request()
+            creds.refresh(auth_req)
+        except DefaultCredentialsError as e:
+            raise VertexAIAuthenticationException(
+                "Log in using `gcloud auth application-default login` to use the Google Vertex tokenizer API"
+            ) from e
+        self.access_token = creds.token
+
+    def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+        cache_key = dataclasses.asdict(request)
+        # Delete encode because the Google Vertex AI API simultaneously gives string and integer tokens.
+        del cache_key["encode"]
+        return cache_key
+
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        text: str = request["text"]
+        tokenizer_name = request["tokenizer"].split("/", maxsplit=1)[1]
+        url = (
+            f"https://{self.location}-aiplatform.googleapis.com/v1/projects/{self.project_id}/"
+            f"locations/{self.location}/publishers/google/models/{tokenizer_name}:computeTokens"
+        )
+
+        headers = {"Authorization": f"Bearer {self.access_token}"}
+        body = {
+            "instances": [{"prompt": text}],
+        }
+        response = requests.post(url, headers=headers, json=body)
+        response.raise_for_status()
+        return response.json()
+
+    def _tokenization_raw_response_to_tokens(
+        self, response: Dict[str, Any], request: TokenizationRequest
+    ) -> List[TokenizationToken]:
+        tokens: List[Union[int, str]]
+        response_instance = response["tokensInfo"][0]
+        if not response_instance:
+            # Response was empty
+            tokens = []
+        else:
+            if request.encode:
+                tokens = [int(token) for token in response_instance["tokenIds"]]
+            else:
+                # errors="ignore" is needed because the tokenizer is not guaranteed to tokenize on
+                # the boundary of UTF-8 characters. The tokenization boundary can be within the bytes of
+                # a UTF-8 character.
+                #
+                # TODO(#2141): Come up with a more correct way of doing this.
+                tokens = [
+                    base64.decodebytes(token.encode()).decode("utf-8", errors="ignore")
+                    for token in response_instance["tokens"]
+                ]
+        return [TokenizationToken(token) for token in tokens]
+
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        # Defined for mypy but decode() already raises NotImplementedError
+        raise NotImplementedError("The Google Vertex AI API does not support decoding.")