
Commit

Merge branch 'open-compass:main' into main
Skyfall-xzz authored Dec 28, 2023
2 parents 8634042 + 8728287 commit 325b191
Showing 387 changed files with 20,503 additions and 2,479 deletions.
121 changes: 121 additions & 0 deletions .github/workflows/pr-stage-check.yml
@@ -0,0 +1,121 @@
name: pr_stage_test

on:
  pull_request:
    paths-ignore:
      - 'README.md'
      - 'README_zh-CN.md'
      - 'docs/**'
      - 'configs/**'
      - 'tools/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: ['3.10']
        include:
          - torch: 2.0.0
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip
        run: python -m pip install --upgrade pip
      - name: Install PyTorch
        run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
      - name: Install system dependencies
        run: |
          sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
          sudo apt-get update && sudo apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
      - name: Upgrade pip
        run: python -m pip install pip --upgrade
      - name: Install opencompass dependencies
        run: |
          python -m pip install -r requirements.txt
      - name: Build and install
        run: python -m pip install -e .
      - name: Prepare dataset
        run: |
          wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
          unzip OpenCompassData-core-20231110.zip
      - name: Dry run test
        run: |
          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
  build_cu117:
    runs-on: ubuntu-22.04
    container:
      image: pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
    strategy:
      matrix:
        python-version: ['3.10']
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Fetch GPG keys
        run: |
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
      - name: Install Python-dev
        run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev
        if: ${{matrix.python-version != 3.10}}
      - name: Install system dependencies
        run: |
          apt-get update
          apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libc6 libc6-dev
          sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
          apt-get update && apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
      - name: Upgrade pip
        run: python -m pip install pip --upgrade
      - name: Install opencompass dependencies
        run: |
          python -m pip install -r requirements.txt
      - name: Build and install
        run: python -m pip install -e .
      - name: Prepare dataset
        run: |
          wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
          unzip OpenCompassData-core-20231110.zip
      - name: Dry run test
        run: |
          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
  build_windows:
    runs-on: windows-2022
    strategy:
      matrix:
        python-version: ['3.10']
        platform: [cpu]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Upgrade pip
        run: python -m pip install pip --upgrade
      - name: Install PyTorch
        run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
      - name: Install opencompass dependencies
        run: |
          pip install -r requirements.txt
      - name: Build and install
        run: pip install -e .
      - name: Prepare dataset
        run: |
          Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip -OutFile OpenCompassData-core-20231110.zip
          unzip OpenCompassData-core-20231110.zip
      - name: Dry run test
        run: |
          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
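For contributors who want to reproduce the CI check before opening a PR, the commands below mirror the Linux `build` job above. Every command and URL is taken from the workflow itself; this is a sketch for a local Linux shell with Python 3.10, not an official script.

```bash
# Rough local equivalent of the pr_stage_test "build" job.
python -m pip install --upgrade pip
pip install torch==2.0.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
python -m pip install -r requirements.txt
python -m pip install -e .

# Fetch the core dataset release, then run the same dry-run smoke test as CI.
wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
unzip OpenCompassData-core-20231110.zip
python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
```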
8 changes: 8 additions & 0 deletions .gitignore
@@ -4,13 +4,15 @@ outputs/
icl_inference_output/
.vscode/
tmp/
configs/eval_subjective_alignbench_test.py
configs/openai_key.py
configs/secrets.py
configs/datasets/log.json
configs/eval_debug*.py
configs/viz_*.py
data
work_dirs
models/*
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -90,3 +92,9 @@ docs/zh_cn/_build/
# sft config ignore list
configs/sft_cfg/*B_*
configs/cky/

# path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
turbomind/

# ignore the config file for criticbench evaluation
configs/sft_cfg/criticbench_eval/*
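To confirm that the new ignore rules match the intended paths, `git check-ignore` can be run against example path names; the paths below are only illustrative and need not exist in the working tree.

```bash
# -v reports which .gitignore pattern matched each path.
git check-ignore -v configs/secrets.py turbomind/some_model configs/sft_cfg/criticbench_eval/demo.py
```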
3 changes: 2 additions & 1 deletion .pre-commit-config-zh-cn.yaml
@@ -5,7 +5,8 @@ exclude: |
opencompass/utils/internal/|
opencompass/openicl/icl_evaluator/hf_metrics/|
opencompass/datasets/lawbench/utils|
opencompass/datasets/lawbench/evaluation_functions/
opencompass/datasets/lawbench/evaluation_functions/|
opencompass/datasets/medbench
)
repos:
- repo: https://gitee.com/openmmlab/mirrors-flake8
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -5,7 +5,8 @@ exclude: |
opencompass/utils/internal/|
opencompass/openicl/icl_evaluator/hf_metrics/|
opencompass/datasets/lawbench/utils|
opencompass/datasets/lawbench/evaluation_functions/
opencompass/datasets/lawbench/evaluation_functions/|
opencompass/datasets/medbench/
)
repos:
- repo: https://github.com/PyCQA/flake8
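Both hook configurations can be exercised locally before pushing; a typical invocation looks roughly like the following, where the `--config` flag selects the Gitee-mirrored variant intended for restricted networks.

```bash
pip install pre-commit
# Run the default hooks defined in .pre-commit-config.yaml.
pre-commit run --all-files
# Or run the mirrored configuration instead.
pre-commit run --config .pre-commit-config-zh-cn.yaml --all-files
```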
88 changes: 66 additions & 22 deletions README.md
@@ -21,6 +21,18 @@ English | [简体中文](README_zh-CN.md)
👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
</p>

## 📣 OpenCompass 2023 LLM Annual Leaderboard

We are honored to have witnessed, together with the community, the tremendous progress of artificial general intelligence over the past year, and we are pleased that **OpenCompass** has been able to help numerous developers and users.

We are announcing the **OpenCompass 2023 LLM Annual Leaderboard**. We expect to release the annual leaderboard in January 2024, systematically evaluating the performance of LLMs across capabilities such as language, knowledge, reasoning, creation, long text, and agents.

At that time, we will release rankings for both open-source models and commercial API models, aiming to provide a comprehensive, objective, and neutral reference for the industry and research community.

We sincerely invite providers of large models to join the OpenCompass evaluation and showcase their strengths in different fields, and we welcome researchers and developers to contribute suggestions and improvements that jointly advance LLM development. If you have any questions or needs, please feel free to [contact us](mailto:[email protected]). In addition, the relevant evaluation content, performance statistics, and evaluation methods will be open-sourced along with the leaderboard release.

Let's look forward to the release of the OpenCompass 2023 LLM Annual Leaderboard!

## 🧭 Welcome

to **OpenCompass**!
@@ -38,15 +50,18 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details! 🔥🔥🔥.
- **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs), which currently supports 20+ VLMs and 7 multi-modal benchmarks (including the MMBench series). 🔥🔥🔥.
- **\[2023.12.10\]** We have supported Mistral AI's MoE LLM: **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation. 🔥🔥🔥.
- **\[2023.11.22\]** We have supported many API-based models, including **Baidu, ByteDance, Huawei, 360**. Welcome to the [Models](https://opencompass.readthedocs.io/en/latest/user_guides/models.html) section for more details. 🔥🔥🔥.
- **\[2023.11.20\]** Thanks [helloyongyang](https://github.com/helloyongyang) for supporting evaluation with [LightLLM](https://github.com/ModelTC/lightllm) as the backend. Welcome to [Evaluation With LightLLM](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lightllm.html) for more details. 🔥🔥🔥.
- **\[2023.11.13\]** We are delighted to announce the release of OpenCompass v0.1.8. This version enables local loading of evaluation benchmarks, thereby eliminating the need for an internet connection. Please note that with this update, **you must re-download all evaluation datasets** to ensure accurate and up-to-date results.🔥🔥🔥.
- **\[2023.11.06\]** We have supported several API-based models, including **ChatGLM Pro@Zhipu, ABAB-Chat@MiniMax and Xunfei**. Welcome to the [Models](https://opencompass.readthedocs.io/en/latest/user_guides/models.html) section for more details. 🔥🔥🔥.
- **\[2023.10.24\]** We release a new benchmark for evaluating LLMs’ capabilities of having multi-turn dialogues. Welcome to [BotChat](https://github.com/open-compass/BotChat) for more details.
- **\[2023.09.26\]** We update the leaderboard with [Qwen](https://github.com/QwenLM/Qwen), one of the best-performing open-source models currently available; welcome to our [homepage](https://opencompass.org.cn) for more details.
- **\[2023.09.20\]** We update the leaderboard with [InternLM-20B](https://github.com/InternLM/InternLM), welcome to our [homepage](https://opencompass.org.cn) for more details.
- **\[2023.09.19\]** We update the leaderboard with WeMix-LLaMA2-70B/Phi-1.5-1.3B, welcome to our [homepage](https://opencompass.org.cn) for more details.
- **\[2023.09.18\]** We have released [long context evaluation guidance](docs/en/advanced_guides/longeval.md).
- **\[2023.09.08\]** We update the leaderboard with Baichuan-2/Tigerbot-2/Vicuna-v1.5, welcome to our [homepage](https://opencompass.org.cn) for more details.
- **\[2023.09.06\]** [**Baichuan2**](https://github.com/baichuan-inc/Baichuan2) team adopts OpenCompass to evaluate their models systematically. We deeply appreciate the community's dedication to transparency and reproducibility in LLM evaluation.

> [More](docs/en/notes/news.md)
@@ -68,20 +83,40 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide

## 📊 Leaderboard

We provide [OpenCompass Leaderboard](https://opencompass.org.cn/rank) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `[email protected]`.

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation

Below are the steps for quick installation and dataset preparation.

### 💻 Environment Setup

#### Open-source Models with GPU

```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
```
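An optional sanity check, not part of the official steps, is to confirm that the environment picked up a CUDA-enabled PyTorch build:

```bash
# Expect the installed torch version followed by True on a working GPU machine.
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```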

#### API Models with CPU-only

```bash
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# For API models, also install the API requirements via `pip install -r requirements/api.txt` if needed.
```
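For API-based models, credentials are usually supplied through environment variables or the corresponding model config; the snippet below is only an illustration, and the exact variable or config field depends on the provider's entry under `configs/`.

```bash
pip install -r requirements/api.txt
# Hypothetical example of exporting a key for an OpenAI-style API model.
export OPENAI_API_KEY=your_key_here
```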

### 📂 Data Preparation

```bash
# Download dataset to data/ folder
wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
unzip OpenCompassData-core-20231110.zip
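# With the dataset in place under data/, a minimal end-to-end run can be
# launched directly; this mirrors the CI smoke test from the workflow above,
# minus the --dry-run flag, so inference and evaluation are actually executed.
python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl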
```

@@ -411,24 +446,33 @@ Through the command line or configuration files, OpenCompass also supports evalu
<tr valign="top">
<td>

- [InternLM](https://github.com/InternLM/InternLM)
- [LLaMA](https://github.com/facebookresearch/llama)
- [Vicuna](https://github.com/lm-sys/FastChat)
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [Baichuan](https://github.com/baichuan-inc)
- [WizardLM](https://github.com/nlpxucan/WizardLM)
- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- [TigerBot](https://github.com/TigerResearch/TigerBot)
- [Qwen](https://github.com/QwenLM/Qwen)
- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- ...

</td>
<td>

- OpenAI
- Claude
- PaLM (coming soon)
- ZhipuAI(ChatGLM)
- Baichuan
- ByteDance(YunQue)
- Huawei(PanGu)
- 360
- Baidu(ERNIEBot)
- MiniMax(ABAB-Chat)
- SenseTime(nova)
- Xunfei(Spark)
- ……

</td>
@@ -444,17 +488,17 @@ Through the command line or configuration files, OpenCompass also supports evalu
- [ ] Subjective Evaluation
  - [ ] Release CompassArena
  - [ ] Subjective evaluation dataset.
- [x] Long-context
  - [ ] Long-context evaluation with extensive datasets.
  - [ ] Long-context leaderboard.
- [ ] Coding
  - [ ] Coding evaluation leaderboard.
  - [x] Non-python language evaluation service.
- [ ] Agent
  - [ ] Support various agent frameworks.
  - [ ] Evaluation of tool use of the LLMs.
- [x] Robustness
  - [x] Support various attack methods

## 👷‍♂️ Contributing

