Merge branch 'main' into fix_bugs

liushz · May 14, 2024 · a182ca2 · a182ca2
2 parents ec6b394 + aa2dd2b
commit a182ca2
Show file tree

Hide file tree

Showing 805 changed files with 6,828 additions and 8,006 deletions.
diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml
@@ -45,19 +45,18 @@ repos:
             (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
+              configs/.*?/.*\.txt
             )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
             (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
+              configs/.*?/.*\.txt
             )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
-        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -45,19 +45,18 @@ repos:
             (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
+              configs/.*?/.*\.txt
             )
       - id: check-yaml
       - id: end-of-file-fixer
         exclude: |
             (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
+              configs/.*?/.*\.txt
             )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
-        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]

diff --git a/README.md b/README.md
@@ -162,20 +162,11 @@ python tools/list_configs.py llama mmlu
 You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
 
 ```bash
-python run.py --datasets ceval_ppl mmlu_ppl \
---hf-path huggyllama/llama-7b \  # HuggingFace model path
---model-kwargs device_map='auto' \  # Arguments for model construction
---tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \  # Arguments for tokenizer construction
---max-out-len 100 \  # Maximum number of tokens generated
---max-seq-len 2048 \  # Maximum sequence length the model can accept
---batch-size 8 \  # Batch size
---no-batch-padding \  # Don't enable batch padding, infer through for loop to avoid performance loss
---num-gpus 1  # Number of minimum required GPUs
+python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
 ```
 
 > \[!TIP\]
 >
-> To run the command above, you will need to remove the comments starting from `# ` first.
 > Configurations with the `_ppl` suffix are typically designed for base models.
 > Configurations with the `_gen` suffix can be used for both base models and chat models.
 

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -163,20 +163,9 @@ python tools/list_configs.py llama mmlu
 你也可以通过命令行去评测其它 HuggingFace 模型。同样以 LLaMA-7b 为例：
 
 ```bash
-python run.py --datasets ceval_ppl mmlu_ppl \
---hf-path huggyllama/llama-7b \  # HuggingFace 模型地址
---model-kwargs device_map='auto' \  # 构造 model 的参数
---tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \  # 构造 tokenizer 的参数
---max-out-len 100 \  # 最长生成 token 数
---max-seq-len 2048 \  # 模型能接受的最大序列长度
---batch-size 8 \  # 批次大小
---no-batch-padding \  # 不打开 batch padding，通过 for loop 推理，避免精度损失
---num-gpus 1  # 运行该模型所需的最少 gpu 数
+python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
 ```
 
-> **注意**<br />
-> 若需要运行上述命令，你需要删除所有从 `# ` 开始的注释。
-
 通过命令行或配置文件，OpenCompass 还支持评测 API 或自定义模型，以及更多样化的评测策略。请阅读[快速开始](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html)了解如何运行一个评测任务。
 
 更多教程请查看我们的[文档](https://opencompass.readthedocs.io/zh_CN/latest/index.html)。

diff --git a/configs/api_examples/eval_api_360.py b/configs/api_examples/eval_api_360.py
@@ -17,7 +17,7 @@
         abbr='360GPT_S2_V9',
         type=AI360GPT,
         path='360GPT_S2_V9',
-        key="xxxxxxxxxxxx",
+        key='xxxxxxxxxxxx',
         generation_kwargs={
             'temperature': 0.9,
             'max_tokens': 2048,
@@ -40,4 +40,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir ="./output/api_360GPT_S2_V9"
+work_dir ='./output/api_360GPT_S2_V9'
diff --git a/configs/api_examples/eval_api_baichuan.py b/configs/api_examples/eval_api_baichuan.py
@@ -18,8 +18,8 @@
         type=BaiChuan,
         path='Baichuan2-53B',
         api_key='xxxxxx',
-        secret_key="xxxxx",
-        url="xxxxx",
+        secret_key='xxxxx',
+        url='xxxxx',
         generation_kwargs={
             'temperature': 0.3,
             'top_p': 0.85,
@@ -41,4 +41,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_baichuan53b/"
+work_dir = 'outputs/api_baichuan53b/'
diff --git a/configs/api_examples/eval_api_baidu.py b/configs/api_examples/eval_api_baidu.py
@@ -39,4 +39,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_erniebot/"
+work_dir = 'outputs/api_erniebot/'
diff --git a/configs/api_examples/eval_api_bytedance.py b/configs/api_examples/eval_api_bytedance.py
@@ -18,8 +18,8 @@
         abbr='skylark-pro-public',
         type=ByteDance,
         path='skylark-pro-public',
-        accesskey="xxxxxxx",
-        secretkey="xxxxxxx",
+        accesskey='xxxxxxx',
+        secretkey='xxxxxxx',
         url='xxxxxx',
         generation_kwargs={
             'temperature': 0.7,
@@ -41,4 +41,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_bytedance/"
+work_dir = 'outputs/api_bytedance/'
diff --git a/configs/api_examples/eval_api_minimax.py b/configs/api_examples/eval_api_minimax.py
@@ -34,4 +34,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_minimax/"
+work_dir = 'outputs/api_minimax/'
diff --git a/configs/api_examples/eval_api_moonshot.py b/configs/api_examples/eval_api_moonshot.py
@@ -37,4 +37,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_moonshot/"
+work_dir = 'outputs/api_moonshot/'
diff --git a/configs/api_examples/eval_api_nanbeige.py b/configs/api_examples/eval_api_nanbeige.py
@@ -18,7 +18,7 @@
         abbr='nanbeige-plus',
         type=Nanbeige,
         path='nanbeige-plus',
-        key="xxxxxx",
+        key='xxxxxx',
         query_per_second=1,
         max_out_len=2048,
         batch_size=8),
@@ -33,4 +33,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir ="./output/nanbeige-plus"
+work_dir ='./output/nanbeige-plus'
diff --git a/configs/api_examples/eval_api_pangu.py b/configs/api_examples/eval_api_pangu.py
@@ -17,13 +17,13 @@
         abbr='pangu',
         type=PanGu,
         path='pangu',
-        access_key="xxxxxx",
-        secret_key="xxxxxx",
-        url = "xxxxxx",
+        access_key='xxxxxx',
+        secret_key='xxxxxx',
+        url = 'xxxxxx',
         # URL of the token server, used to generate a token, e.g. "https://xxxxxx.myhuaweicloud.com/v3/auth/tokens",
-        token_url = "xxxxxx",
+        token_url = 'xxxxxx',
         # scope-project-name, used to generate the token
-        project_name = "xxxxxx",
+        project_name = 'xxxxxx',
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,
@@ -39,4 +39,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_pangu/"
+work_dir = 'outputs/api_pangu/'
diff --git a/configs/api_examples/eval_api_qwen.py b/configs/api_examples/eval_api_qwen.py
@@ -37,4 +37,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_qwen/"
+work_dir = 'outputs/api_qwen/'
diff --git a/configs/api_examples/eval_api_sensetime.py b/configs/api_examples/eval_api_sensetime.py
@@ -24,17 +24,17 @@
         max_seq_len=2048,
         batch_size=8,
         parameters={
-            "temperature": 0.8,
-            "top_p": 0.7,
-            "max_new_tokens": 1024,
-            "repetition_penalty": 1.05,
-            "know_ids": [],
-            "stream": True,
-            "user": "#*#***TestUser***#*#",
-            "knowledge_config": {
-                "control_level": "normal",
-                "knowledge_base_result": False,
-                "online_search_result": False
+            'temperature': 0.8,
+            'top_p': 0.7,
+            'max_new_tokens': 1024,
+            'repetition_penalty': 1.05,
+            'know_ids': [],
+            'stream': True,
+            'user': '#*#***TestUser***#*#',
+            'knowledge_config': {
+                'control_level': 'normal',
+                'knowledge_base_result': False,
+                'online_search_result': False
             }
         }
     )
@@ -49,4 +49,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_sensetime/"
+work_dir = 'outputs/api_sensetime/'
diff --git a/configs/api_examples/eval_api_xunfei.py b/configs/api_examples/eval_api_xunfei.py
@@ -17,22 +17,22 @@
     dict(
         abbr='Spark-v1-1',
         type=XunFei,
-        appid="xxxx",
+        appid='xxxx',
         path='ws://spark-api.xf-yun.com/v1.1/chat',
-        api_secret = "xxxxxxx",
-        api_key = "xxxxxxx",
+        api_secret = 'xxxxxxx',
+        api_key = 'xxxxxxx',
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,
         batch_size=8),
     dict(
         abbr='Spark-v3-1',
         type=XunFei,
-        appid="xxxx",
+        appid='xxxx',
         domain='generalv3',
         path='ws://spark-api.xf-yun.com/v3.1/chat',
-        api_secret = "xxxxxxxx",
-        api_key = "xxxxxxxxx",
+        api_secret = 'xxxxxxxx',
+        api_key = 'xxxxxxxxx',
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,
@@ -48,4 +48,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_xunfei/"
+work_dir = 'outputs/api_xunfei/'
diff --git a/configs/api_examples/eval_api_zhipu.py b/configs/api_examples/eval_api_zhipu.py
@@ -29,7 +29,7 @@
         abbr='chatglm_pro',
         type=ZhiPuAI,
         path='chatglm_pro',
-        key='xxxxxxxxxxxx', 
+        key='xxxxxxxxxxxx',
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,
@@ -45,4 +45,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_zhipu/"
+work_dir = 'outputs/api_zhipu/'
diff --git a/configs/api_examples/eval_api_zhipu_v2.py b/configs/api_examples/eval_api_zhipu_v2.py
@@ -64,4 +64,4 @@
         task=dict(type=OpenICLInferTask)),
 )
 
-work_dir = "outputs/api_zhipu_v2/"
+work_dir = 'outputs/api_zhipu_v2/'
diff --git a/configs/dataset_collections/chat_OC15.py b/configs/dataset_collections/chat_OC15.py
@@ -0,0 +1,22 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from ..datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+    from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+    from ..datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
+    from ..datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import triviaqa_datasets
+    from ..datasets.nq.nq_open_1shot_gen_01cf41 import nq_datasets
+    from ..datasets.race.race_gen_69ee4f import race_datasets
+    from ..datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
+    from ..datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
+    from ..datasets.bbh.bbh_gen_2879b0 import bbh_datasets
+    from ..datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..datasets.math.math_0shot_gen_393424 import math_datasets
+    from ..datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
+    from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
+    from ..datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
+    from ..datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/configs/datasets/ARC_c/ARC_c_clean_ppl.py b/configs/datasets/ARC_c/ARC_c_clean_ppl.py
@@ -12,29 +12,29 @@
     prompt_template=dict(
         type=PromptTemplate,
         template={
-            "A":
+            'A':
             dict(
                 round=[
-                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
-                    dict(role="BOT", prompt="{textA}")
+                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
+                    dict(role='BOT', prompt='{textA}')
                 ], ),
-            "B":
+            'B':
             dict(
                 round=[
-                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
-                    dict(role="BOT", prompt="{textB}")
+                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
+                    dict(role='BOT', prompt='{textB}')
                 ], ),
-            "C":
+            'C':
             dict(
                 round=[
-                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
-                    dict(role="BOT", prompt="{textC}")
+                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
+                    dict(role='BOT', prompt='{textC}')
                 ], ),
-            "D":
+            'D':
             dict(
                 round=[
-                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
-                    dict(role="BOT", prompt="{textD}")
+                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
+                    dict(role='BOT', prompt='{textD}')
                 ], ),
         }),
     retriever=dict(type=ZeroRetriever),