diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f14e053c7..7d1d2aacd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,11 +10,11 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python 3.10 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: '3.10' - name: Install pre-commit hook run: | pip install pre-commit diff --git a/docs/en/MMBench.md b/docs/en/MMBench.md index fd2a2afc2..8f5a036b2 100644 --- a/docs/en/MMBench.md +++ b/docs/en/MMBench.md @@ -1,5 +1,4 @@ -# Evalation pipeline on MMBench - +# Evaluation pipeline on MMBench ## Intro to each data sample in MMBench @@ -17,8 +16,8 @@ context (optional): the context to a question, which is optional. answer: the target answer to current question. (only exists in the dev split, and is keep confidential for the test split on our evaluation server) ``` - ## Load MMBench + We provide a code snippet as an example of loading MMBench ```python @@ -77,7 +76,7 @@ class MMBenchDataset(Dataset): 'context': hint, } return data - + def load_from_df(self, idx, key): if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]): return self.df.iloc[idx][key] @@ -85,10 +84,8 @@ class MMBenchDataset(Dataset): return None ``` - - - ## How to construct the inference prompt + ```python if data_sample['context'] is None: prompt = data_sample['context'] + ' ' + data_sample['question'] + ' ' + data_sample['options'] @@ -98,7 +95,7 @@ else: For example: Question: Which category does this image belong to? -A. Oil Paiting +A. Oil Painting B. Sketch C. Digital art D. Photo @@ -107,16 +104,14 @@ D. Photo - - - ``` -prompt = ###Human: Question: Which category does this image belong to? There are several options: A. Oil Paiting, B. Sketch, C. Digital art, D. Photo ###Assistant: +prompt = ###Human: Question: Which category does this image belong to? There are several options: A. Oil Painting, B. Sketch, C. Digital art, D. Photo ###Assistant: ``` -You can make custom modifications to the prompt +You can make custom modifications to the prompt ## How to save results: + You should dump your model's predictions into an excel(.xlsx) file, and this file should contain the following fields: ``` @@ -125,13 +120,10 @@ A: The first choice B: The second choice C: The third choice D: The fourth choice -prediction: The prediction of your model to currrent question +prediction: The prediction of your model to current question category: the leaf category l2_category: the l2-level category index: the l2-level category ``` -If there are any questions with fewer than four options, simply leave those fields blank. - - - +If there are any questions with fewer than four options, simply leave those fields blank. diff --git a/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py b/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py index 16db96a84..239ce0794 100644 --- a/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py @@ -208,11 +208,10 @@ def get_metrics(self, scores: dict) -> dict: length = np.count_nonzero(~np.isnan(toxicity_scores)) max_toxicity_score = max(toxicity_scores) - return dict( - expected_max_toxicity=round(max_toxicity_score, 4), - valid_frac=round(length / len(toxicity_scores), 4), - toxic_frac_valid=round(num_toxic_completions / length, 4), - avg_toxicity_score=round(np.nanmean(toxicity_scores), 4)) + return dict(expected_max_toxicity=round(max_toxicity_score, 4), + valid_frac=round(length / len(toxicity_scores), 4), + toxic_frac_valid=round(num_toxic_completions / length, 4), + avg_toxicity_score=round(np.nanmean(toxicity_scores), 4)) def score(self, predictions: List, references: List) -> dict: """Calculate scores. Reference is not needed. diff --git a/opencompass/openicl/icl_retriever/icl_base_retriever.py b/opencompass/openicl/icl_retriever/icl_base_retriever.py index 94736c7eb..ed3c1d046 100644 --- a/opencompass/openicl/icl_retriever/icl_base_retriever.py +++ b/opencompass/openicl/icl_retriever/icl_base_retriever.py @@ -4,7 +4,7 @@ from mmengine.dist import is_main_process -from opencompass.openicl import PromptTemplate +from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.utils.prompt import PromptList diff --git a/opencompass/openicl/icl_retriever/icl_mdl_retriever.py b/opencompass/openicl/icl_retriever/icl_mdl_retriever.py index 1bd324633..43fe12d1e 100644 --- a/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +++ b/opencompass/openicl/icl_retriever/icl_mdl_retriever.py @@ -7,7 +7,7 @@ import tqdm from transformers import AutoModelForCausalLM -from opencompass.openicl import PromptTemplate +from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever.icl_topk_retriever import TopkRetriever from opencompass.openicl.utils.logging import get_logger from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS