[Feature] Support 360API and FixKRetriever for CSQA dataset

open-compass · Nov 16, 2023 · 3286b9a · 3286b9a
1 parent 9c883ec
commit 3286b9a
Show file tree

Hide file tree

Showing 6 changed files with 358 additions and 0 deletions.
diff --git a/configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py b/configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py
@@ -0,0 +1,55 @@
+# Use FixKRetriever to avoid the hang caused by Hugging Face.
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import commonsenseqaDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+# Reader: the question plus its five lettered options are the inputs,
+# 'answerKey' is the output column, and the 'validation' split is the test
+# split.
+commonsenseqa_reader_cfg = {
+    'input_columns': ['question', 'A', 'B', 'C', 'D', 'E'],
+    'output_column': 'answerKey',
+    'test_split': 'validation',
+}
+
+# Two-turn prompt: the HUMAN turn lists the question and options, the BOT
+# turn holds the answer key. '</E>' is both the template's `begin` value and
+# the registered `ice_token`.
+_ice_template = {
+    'type': PromptTemplate,
+    'template': {
+        'begin': '</E>',
+        'round': [
+            {
+                'role': 'HUMAN',
+                'prompt': ('{question}\nA. {A}\nB. {B}\nC. {C}\n'
+                           'D. {D}\nE. {E}\nAnswer:'),
+            },
+            {
+                'role': 'BOT',
+                'prompt': '{answerKey}',
+            },
+        ],
+    },
+    'ice_token': '</E>',
+}
+
+# FixKRetriever is given the same eight fixed ids (0-7) for every query.
+commonsenseqa_infer_cfg = {
+    'ice_template': _ice_template,
+    'retriever': {
+        'type': FixKRetriever,
+        'fix_id_list': [0, 1, 2, 3, 4, 5, 6, 7],
+    },
+    'inferencer': {'type': GenInferencer},
+}
+
+# Predictions go through first_capital_postprocess before AccEvaluator.
+commonsenseqa_eval_cfg = {
+    'evaluator': {'type': AccEvaluator},
+    'pred_postprocessor': {'type': first_capital_postprocess},
+}
+
+commonsenseqa_datasets = [
+    {
+        'abbr': 'commonsense_qa',
+        'type': commonsenseqaDataset,
+        'path': './data/commonsenseqa',
+        'reader_cfg': commonsenseqa_reader_cfg,
+        'infer_cfg': commonsenseqa_infer_cfg,
+        'eval_cfg': commonsenseqa_eval_cfg,
+    },
+]
+
+# Drop the temporary so it does not remain in this module's namespace.
+del _ice_template
diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_e51e32.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_e51e32.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import commonsenseqaDataset
+
+# Reader: the question plus its five lettered options are the inputs,
+# 'answerKey' is the output column, and the 'validation' split is the test
+# split.
+commonsenseqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
+    output_column='answerKey',
+    test_split='validation')
+
+# One template per answer letter. Each template shows the question in the
+# HUMAN turn and the text of that letter's option ({A} ... {E}) in the BOT
+# turn. '</E>' is both the template's `begin` value and the registered
+# `ice_token`.
+_ice_template = dict(
+    type=PromptTemplate,
+    template={
+        ans: dict(
+            begin='</E>',
+            round=[
+                dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
+                dict(role="BOT", prompt=ans_token),
+            ])
+        for ans, ans_token in [["A", "{A}"], ["B", "{B}"],
+                               ["C", "{C}"], ["D", "{D}"],
+                               ["E", "{E}"]]
+    },
+    ice_token='</E>')
+
+# FixKRetriever is given the same eight fixed ids (0-7) for every query.
+commonsenseqa_infer_cfg = dict(
+    ice_template=_ice_template,
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4, 5, 6, 7]),
+    inferencer=dict(type=PPLInferencer))
+
+commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+commonsenseqa_datasets = [
+    dict(
+        abbr='commonsense_qa',
+        type=commonsenseqaDataset,
+        path='./data/commonsenseqa',
+        reader_cfg=commonsenseqa_reader_cfg,
+        infer_cfg=commonsenseqa_infer_cfg,
+        eval_cfg=commonsenseqa_eval_cfg)
+]
+
+# Drop the temporary so it does not remain in this module's namespace,
+# matching the sibling generation config for this dataset.
+del _ice_template
diff --git a/configs/eval_360.py b/configs/eval_360.py
@@ -0,0 +1,48 @@
+from mmengine.config import read_base
+from opencompass.models import AI360GPT
+from opencompass.partitioners import NaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners.local_api import LocalAPIRunner
+from opencompass.tasks import OpenICLInferTask
+
+with read_base():
+    # from .datasets.collections.chat_medium import datasets
+    from .summarizers.medium import summarizer
+    from .datasets.ceval.ceval_gen import ceval_datasets
+    # from .datasets.ARC_c.ARC_c_gen import ARC_c_datasets
+    # from .datasets.race.race_gen import race_datasets
+    # from .datasets.commonsenseqa.commonsenseqa_gen_260dab import commonsenseqa_datasets
+    # from .datasets.winogrande.winogrande_gen import winogrande_datasets
+    # from .datasets.gsm8k.gsm8k_gen import gsm8k_datasets
+
+# Datasets to evaluate. Only C-Eval is active; the commented entries pair
+# with the commented imports in the read_base() block.
+datasets = [
+    *ceval_datasets,
+    # *ARC_c_datasets,
+    # *race_datasets,
+    # *commonsenseqa_datasets,
+    # *winogrande_datasets,
+    # *gsm8k_datasets,
+]
+
+models = [
+    {
+        'abbr': '360GPT_S2_V9',
+        'type': AI360GPT,
+        'path': '360GPT_S2_V9',
+        # NOTE(review): presumably a placeholder; supply a real API key.
+        'key': 'xxxxxxxxxxxx',
+        'query_per_second': 1,
+        'max_out_len': 2048,
+        'max_seq_len': 2048,
+        'batch_size': 8,
+    },
+]
+
+# Inference: NaivePartitioner with OpenICLInferTask tasks run by
+# LocalAPIRunner (2 workers, 2 concurrent users).
+infer = {
+    'partitioner': {'type': NaivePartitioner},
+    'runner': {
+        'type': LocalAPIRunner,
+        'max_num_workers': 2,
+        'concurrent_users': 2,
+        'task': {'type': OpenICLInferTask},
+    },
+}
+
+work_dir = './output/360GPT_S2_V9'
diff --git a/configs/summarizers/groups/mathbench.py b/configs/summarizers/groups/mathbench.py
@@ -0,0 +1,19 @@
+mathbench_summary_groups = []
+
+# Each list below is first the subset names of one MathBench level and is
+# then rebound to the full dataset abbreviations for that level.
+# NOTE(review): assumes dataset abbreviations have the form
+# 'mathbench-<level>-<subset>'; the '-' after the level was previously
+# missing, producing names such as 'mathbench-collegesingle_choice_cn'.
+# Confirm against the MathBench dataset config.
+
+mathbench_college = ['single_choice_cn', 'cloze_en']
+mathbench_college = ['mathbench-college-' + s for s in mathbench_college]
+
+mathbench_high = ['single_choice_cn', 'single_choice_en']
+mathbench_high = ['mathbench-high-' + s for s in mathbench_high]
+
+mathbench_middle = ['single_choice_cn']
+mathbench_middle = ['mathbench-middle-' + s for s in mathbench_middle]
+
+mathbench_primary = ['cloze_cn']
+mathbench_primary = ['mathbench-primary-' + s for s in mathbench_primary]
+
+# A single 'mathbench' group covering every subset of every level.
+mathbench_summary_groups.append(
+    {'name': 'mathbench',
+     'subsets': mathbench_college+mathbench_high+mathbench_middle+mathbench_primary}
+)
diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py
@@ -1,3 +1,4 @@
+from .ai360_api import AI360GPT  # noqa: F401
 from .base import BaseModel, LMTemplateParser  # noqa
 from .base_api import APITemplateParser, BaseAPIModel  # noqa
 from .claude_api import Claude  # noqa: F401

diff --git a/opencompass/models/ai360_api.py b/opencompass/models/ai360_api.py
@@ -0,0 +1,193 @@
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Union
+
+import requests
+
+from opencompass.utils.prompt import PromptList
+
+from .base_api import BaseAPIModel
+
+PromptType = Union[PromptList, str]
+
+
+class AI360GPT(BaseAPIModel):
+    """Model wrapper around 360 GPT.
+
+    Documentations: https://ai.360.com/platform/docs/overview
+
+    Args:
+        path (str): Model name sent in the request body,
+            e.g. 360GPT_S2_V9.
+        key (str): API key, sent as a Bearer token in the Authorization
+            header.
+        url (str): Chat-completions endpoint. Defaults to
+            'https://api.360.cn/v1/chat/completions'.
+        query_per_second (int): Forwarded to BaseAPIModel. Defaults to 2.
+        max_seq_len (int): Forwarded to BaseAPIModel. Defaults to 2048.
+        meta_template (Dict, optional): Forwarded to BaseAPIModel.
+            Defaults to None.
+        retry (int): Forwarded to BaseAPIModel; presumably stored as
+            ``self.retry``, which ``_generate`` uses as its attempt
+            budget. Defaults to 2.
+    """
+
+    def __init__(
+        self,
+        path: str,  # model name, e.g.: 360GPT_S2_V9
+        key: str,
+        url: str = 'https://api.360.cn/v1/chat/completions',
+        query_per_second: int = 2,
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+        retry: int = 2,
+    ):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         meta_template=meta_template,
+                         retry=retry)
+        self.headers = {
+            'Authorization': f'Bearer {key}',
+            'Content-Type': 'application/json',
+        }
+        self.model = path
+        self.url = url
+
+    def generate(
+        self,
+        inputs: List[PromptType],
+        max_out_len: int = 512,
+    ) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Each input is handled by ``_generate`` on a thread pool; results
+        are returned in input order.
+
+        Args:
+            inputs (List[str or PromptList]): A list of strings or
+                PromptLists. A PromptList should be organized in
+                OpenCompass' API format.
+            max_out_len (int): The maximum length of the output, passed
+                through to ``_generate``.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._generate, inputs,
+                             [max_out_len] * len(inputs)))
+        self.flush()
+        return results
+
+    def flush(self):
+        """Flush stdout and stderr when concurrent resources exist.
+
+        When using multiprocessing with standard IO redirected to files,
+        the buffered output needs to be flushed so that it can be examined
+        and is not lost if the system breaks.
+        """
+        if hasattr(self, 'tokens'):
+            sys.stdout.flush()
+            sys.stderr.flush()
+
+    def acquire(self):
+        """Acquire concurrent resources if they exist.
+
+        Falls back to ``self.wait()`` (query_per_second throttling) when
+        there are no concurrent resources.
+        """
+        if hasattr(self, 'tokens'):
+            self.tokens.acquire()
+        else:
+            self.wait()
+
+    def release(self):
+        """Release concurrent resources if acquired.
+
+        Does nothing when there are no concurrent resources.
+        """
+        if hasattr(self, 'tokens'):
+            self.tokens.release()
+
+    def _generate(
+        self,
+        input: PromptType,
+        max_out_len: int = 512,
+    ) -> str:
+        """Generate a result for a single input.
+
+        Args:
+            input (str or PromptList): A string or PromptList. A
+                PromptList should be organized in OpenCompass' API format.
+            max_out_len (int): The maximum length of the output.
+                NOTE(review): currently not forwarded; the request always
+                sends ``max_tokens=2048``.
+
+        Returns:
+            str: The generated string.
+
+        Raises:
+            RuntimeError: If no usable completion was obtained within
+                ``self.retry`` counted attempts.
+        """
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            role_map = {
+                'HUMAN': 'user',
+                'BOT': 'assistant',
+                'SYSTEM': 'system',
+            }
+            messages = []
+            for item in input:
+                msg = {'content': item['prompt']}
+                # Items with an unrecognised role are sent without a
+                # 'role' field.
+                if item['role'] in role_map:
+                    msg['role'] = role_map[item['role']]
+                messages.append(msg)
+
+        data = {
+            'model': self.model,
+            'messages': messages,
+            'stream': False,
+            'temperature': 0.9,
+            'max_tokens': 2048,
+            'top_p': 0.5,
+            # NOTE(review): 'tok_k' looks like a misspelling of 'top_k'.
+            # It is kept as-is because correcting it would change the
+            # request that is sent; confirm against the API documentation.
+            'tok_k': 0,
+            'repetition_penalty': 1.05,
+            # "num_beams": 1,
+            # "user": "OpenCompass"
+        }
+
+        raw_response = None
+        num_retries = 0
+        while num_retries < self.retry:
+            self.acquire()
+            try:
+                raw_response = requests.request('POST',
+                                                url=self.url,
+                                                headers=self.headers,
+                                                json=data)
+            except requests.RequestException:
+                raw_response = None
+            finally:
+                # Always hand the concurrency token back, even when the
+                # request itself failed.
+                self.release()
+
+            if raw_response is None:
+                print('Connection error, reconnect.')
+                # if connect error, frequent requests will cause
+                # continuous unstable network, therefore wait here
+                # to slow down the request
+                self.wait()
+                num_retries += 1
+                continue
+
+            try:
+                response = raw_response.json()
+            except ValueError:
+                # The body is not valid JSON (e.g. an HTML error page).
+                response = None
+
+            if raw_response.status_code == 200:
+                try:
+                    return response['choices'][0]['message'][
+                        'content'].strip()
+                except (KeyError, IndexError, TypeError):
+                    error = (response.get('error')
+                             if isinstance(response, dict) else None)
+                    if error is not None:
+                        code = (error.get('code')
+                                if isinstance(error, dict) else None)
+                        # tpm (tokens per minute) limit: back off and
+                        # retry without spending the retry budget
+                        if str(code) == '1005':
+                            time.sleep(1)
+                            continue
+
+                        self.logger.error(
+                            'Find error message in response: %s',
+                            str(error))
+
+            # Rate limiting or a server-side error: retry without
+            # spending the retry budget.
+            if raw_response.status_code in (429, 500):
+                print(raw_response.text)
+                continue
+
+            # sensitive content, prompt overlength or illegal prompt:
+            # these count against the retry budget so that a permanently
+            # rejected request cannot loop forever
+            if raw_response.status_code in (400, 401, 402):
+                print(raw_response.text)
+            else:
+                print(raw_response)
+            num_retries += 1
+
+        if raw_response is None:
+            raise RuntimeError('No response received from the API.')
+        raise RuntimeError(raw_response.text)