diff --git a/helm-frontend/src/assets/vhelm/vhelm-aspects.png b/helm-frontend/src/assets/vhelm/vhelm-aspects.png
new file mode 100644
index 0000000000..dde76709e9
Binary files /dev/null and b/helm-frontend/src/assets/vhelm/vhelm-aspects.png differ
diff --git a/helm-frontend/src/assets/vhelm/vhelm-framework.png b/helm-frontend/src/assets/vhelm/vhelm-framework.png
old mode 100755
new mode 100644
index 8845b1e13b..d94c1dc877
Binary files a/helm-frontend/src/assets/vhelm/vhelm-framework.png and b/helm-frontend/src/assets/vhelm/vhelm-framework.png differ
diff --git a/helm-frontend/src/assets/vhelm/vhelm-model.png b/helm-frontend/src/assets/vhelm/vhelm-model.png
old mode 100755
new mode 100644
index 85a141ee91..23b1e6b134
Binary files a/helm-frontend/src/assets/vhelm/vhelm-model.png and b/helm-frontend/src/assets/vhelm/vhelm-model.png differ
diff --git a/helm-frontend/src/components/VHELMLanding.tsx b/helm-frontend/src/components/VHELMLanding.tsx
index 2a58808667..6369df7854 100644
--- a/helm-frontend/src/components/VHELMLanding.tsx
+++ b/helm-frontend/src/components/VHELMLanding.tsx
@@ -1,57 +1,69 @@
-import { useEffect, useState } from "react";
 import { Link } from "react-router-dom";
-import getSchema from "@/services/getSchema";
-import type Schema from "@/types/Schema";
-
-import ModelsList from "@/components/ModelsList";
 import MiniLeaderboard from "@/components/MiniLeaderboard";
-import ScenariosList from "@/components/ScenariosList";
 import vhelmFrameworkImage from "@/assets/vhelm/vhelm-framework.png";
 import vhelmModelImage from "@/assets/vhelm/vhelm-model.png";
+import vhelmAspectsImage from "@/assets/vhelm/vhelm-aspects.png";
 
 export default function VHELMLanding() {
-  const [schema, setSchema] = useState<Schema | undefined>(undefined);
-
-  useEffect(() => {
-    const controller = new AbortController();
-    async function fetchData() {
-      const schema = await getSchema(controller.signal);
-      setSchema(schema);
-    }
-
-    void fetchData();
-    return () => controller.abort();
-  }, []);
-
   return (
     <div>
       <h1>
-        The First Steps to Holistic Evaluation of Vision-Language Models
+        Holistic Evaluation of Vision-Language Models
       </h1>
+      <div>
+        <a href="https://arxiv.org/abs/2410.07112">Paper</a>
+        <a href="https://github.com/stanford-crfm/helm">Github</a>
+      </div>
-          To better understand VLMs, we introduce the first version of
-          Holistic Evaluation of Vision-Language Models (VHELM) by extending
-          the HELM framework with the necessary adaptation methods to assess
-          the performance of 6 prominent VLMs on 3 standard VLM benchmarks.
+          Current benchmarks for assessing vision-language models (VLMs) often
+          focus on their perception or problem-solving capabilities and
+          neglect other critical aspects such as fairness, multilinguality, or
+          toxicity. Furthermore, they differ in their evaluation procedures
+          and the scope of the evaluation, making it difficult to compare
+          models. To address these issues, we introduce VHELM, the Holistic
+          Evaluation of Vision-Language Models, which extends the HELM
+          framework for language models to VLMs. VHELM aggregates various
+          datasets to cover one or more of the 9 aspects: visual perception,
+          bias, fairness, knowledge, multilinguality, reasoning, robustness,
+          safety, and toxicity. In doing so, we produce a comprehensive,
+          multi-dimensional view of the capabilities of VLMs across these
+          important factors. In addition, we standardize the inference
+          parameters, methods of prompting, and evaluation metrics to enable
+          fair comparisons across models. Our framework is designed to be
+          lightweight and automatic so that evaluation runs are cheap and
+          fast. For transparency, we release the raw model generations and
+          complete results on this website.
-          This is ongoing work to achieve holistic evaluation for
-          vision-language models, so please stay tuned!
+          VHELM is intended to be a living benchmark. We hope to continue
+          adding new datasets, models, and metrics over time, so please stay
+          tuned!
-        <img
-          src={vhelmFrameworkImage}
-          alt="An image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'"
-        />
+        <img
+          src={vhelmFrameworkImage}
+          alt="An example of an evaluation for an Aspect (Knowledge): a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed"
+        />
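Note on the copy added in the hunk above: each VHELM evaluation pairs a Scenario with an Aspect, an Adaptation method, a Model, and Metrics, under standardized inference parameters. A minimal TypeScript sketch of that structure, using the Knowledge/MMMU example from the new figure's alt text; the type names, field names, and parameter values are hypothetical illustrations, not HELM's actual API:

// Reviewer sketch (not part of this diff). All names are illustrative.

// The 9 aspects enumerated in the new landing-page copy.
type Aspect =
  | "visual perception"
  | "bias"
  | "fairness"
  | "knowledge"
  | "multilinguality"
  | "reasoning"
  | "robustness"
  | "safety"
  | "toxicity";

// One standardized evaluation run: a scenario is adapted for a model, then
// metrics are computed. Inference parameters are held fixed across models so
// that results are comparable.
interface EvaluationRun {
  aspect: Aspect;
  scenario: string; // e.g. "MMMU"
  adaptation: string; // e.g. "multimodal multiple choice"
  model: string; // e.g. "GPT-4 Vision"
  metrics: string[]; // e.g. ["Exact match"]
  inferenceParams: { temperature: number; maxTokens: number }; // hypothetical values
}

// The example from the figure's alt text, expressed as a run.
const exampleRun: EvaluationRun = {
  aspect: "knowledge",
  scenario: "MMMU",
  adaptation: "multimodal multiple choice",
  model: "GPT-4 Vision",
  metrics: ["Exact match"],
  inferenceParams: { temperature: 0, maxTokens: 512 },
};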
@@ -65,12 +77,13 @@ export default function VHELMLanding() {
-        {schema === undefined ? null : (
-          <div>
-            <ModelsList models={schema.models} />
-            <ScenariosList runGroups={schema.run_groups} />
-          </div>
-        )}
+        <div>
+          <img
+            src={vhelmAspectsImage}
+            alt="An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation, and Safety."
+          />
+        </div>
   );
 }
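For context on the code removed in the first hunk: it followed the standard React pattern of fetching on mount and aborting the in-flight request on unmount. A self-contained sketch of that pattern, assuming a getSchema service like the one the old code imported; the hook name useSchema, the Schema shape, and the fetch endpoint are placeholders, not the actual "@/services/getSchema" implementation:

import { useEffect, useState } from "react";

// Illustrative stand-ins for "@/types/Schema" and "@/services/getSchema".
interface Schema {
  models: unknown[];
  run_groups: unknown[];
}

async function getSchema(signal: AbortSignal): Promise<Schema> {
  // Hypothetical endpoint; the real service resolves its own URL.
  const response = await fetch("/config/schema.json", { signal });
  return (await response.json()) as Schema;
}

// The pattern deleted by this PR: fetch once on mount, cancel the request if
// the component unmounts before it resolves.
function useSchema(): Schema | undefined {
  const [schema, setSchema] = useState<Schema | undefined>(undefined);

  useEffect(() => {
    const controller = new AbortController();
    async function fetchData() {
      setSchema(await getSchema(controller.signal));
    }
    void fetchData().catch(() => {
      /* aborted on unmount; ignore */
    });
    return () => controller.abort();
  }, []);

  return schema;
}

The removal makes sense alongside dropping ModelsList and ScenariosList in the second hunk: once nothing on the page reads the schema, the fetch, the state, and the abort cleanup are all dead code.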