From acae560911d52620680da95d62eeae8f9a0aa9eb Mon Sep 17 00:00:00 2001 From: Mo Li <82895469+DseidLi@users.noreply.github.com> Date: Wed, 17 Jan 2024 13:47:34 +0800 Subject: [PATCH] Added support for multi-needle testing in needle-in-a-haystack test (#802) * Add NeedleInAHaystack Test * Apply pre-commit formatting * Update configs/eval_hf_internlm_chat_20b_cdme.py Co-authored-by: Songyang Zhang * add needle in haystack test * update needle in haystack test * update plot function in tools_needleinahaystack.py * optimizing needleinahaystack dataset generation strategy * modify minor formatting issues * add English version support * change NeedleInAHaystackDataset to dynamic loading * change NeedleInAHaystackDataset to dynamic loading * fix needleinahaystack test eval bug * fix needleinahaystack config bug * Added support for multi-needle testing in needle-in-a-haystack test * Optimize the code for plotting in the needle-in-a-haystack test. * Correct the typo in the dataset parameters. * update needleinahaystack test docs --------- Co-authored-by: Songyang Zhang --- .../cdme/multi_needle/cdme8k_cot2_italy.py | 92 +++++++ .../cdme/multi_needle/cdme8k_cot3_italy.py | 93 +++++++ ..._20b_cdme.py => eval_needleinahaystack.py} | 2 +- configs/eval_needleinahaystack_turbomind.py | 28 ++ .../advanced_guides/needleinahaystack_eval.md | 239 ++++++++++++++---- docs/en/index.rst | 1 + .../advanced_guides/needleinahaystack_eval.md | 198 +++++++++++++-- docs/zh_cn/index.rst | 1 + opencompass/datasets/cdme/cdme_multi.py | 224 ++++++++++++++++ tools/tools_needleinahaystack.py | 37 ++- 10 files changed, 822 insertions(+), 93 deletions(-) create mode 100644 configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py create mode 100644 configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py rename configs/{eval_hf_internlm_chat_20b_cdme.py => eval_needleinahaystack.py} (97%) create mode 100644 configs/eval_needleinahaystack_turbomind.py create mode 100644 opencompass/datasets/cdme/cdme_multi.py diff --git a/configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py b/configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py new file mode 100644 index 000000000..3f5fab2b0 --- /dev/null +++ b/configs/datasets/cdme/multi_needle/cdme8k_cot2_italy.py @@ -0,0 +1,92 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.cdme.cdme_multi import CDMEDataset +from opencompass.datasets.cdme.cdme_multi import CDMEEvaluator +from opencompass.datasets.cdme.cdme_multi import cdme_postprocess +from opencompass.datasets.cdme.cdme_multi import cdme_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +cdme_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + 
template='''{prompt}'''), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +cdme_eval_cfg = dict( + evaluator=dict(type=CDMEEvaluator), + pred_postprocessor=dict(type=cdme_postprocess), + dataset_postprocessor=dict(type=cdme_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(1000, 9000, 1000)) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/CDME' +file_list = ['zh_finance.jsonl'] +cdme_datasets = [] + +needles = ['\n意大利的佛罗伦萨有一家名为"La Giostra"的餐馆,是整个佛罗伦萨中排行第一的餐馆。\n', + '"La Giostra"餐馆的特色菜肴是松露奶酪通心粉。'] +retrieval_question = ("佛罗伦萨中排行第一的餐馆的特色菜肴是?" + "请按照'佛罗伦萨中排行第一的餐馆的特色菜肴是______。'的格式回答。") +answer = "佛罗伦萨中排行第一的餐馆的特色菜肴是松露奶酪通心粉。" + +keyword = "松露奶酪通心粉" +diff = 25 + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'CDME_Length{original_context_length}' + f'Depth{int(depth_percent)}', + 'type': CDMEDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needles': needles, + 'diff': diff, + 'retrieval_question': retrieval_question, + 'answer': answer, + 'keyword': keyword, + 'reader_cfg': cdme_reader_cfg, + 'infer_cfg': cdme_infer_cfg, + 'eval_cfg': cdme_eval_cfg + } + cdme_datasets.append(dataset_dict) diff --git a/configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py b/configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py new file mode 100644 index 000000000..acc82e853 --- /dev/null +++ b/configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py @@ -0,0 +1,93 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.cdme.cdme_multi import CDMEDataset +from opencompass.datasets.cdme.cdme_multi import CDMEEvaluator +from opencompass.datasets.cdme.cdme_multi import cdme_postprocess +from opencompass.datasets.cdme.cdme_multi import cdme_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +cdme_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +cdme_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''{prompt}'''), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +cdme_eval_cfg = dict( + evaluator=dict(type=CDMEEvaluator), + pred_postprocessor=dict(type=cdme_postprocess), + dataset_postprocessor=dict(type=cdme_dataset_postprocess), + pred_role='BOT') + +context_lengths = list(range(1000, 9000, 1000)) +document_depth_percent_intervals = 20 
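+# Sample 20 needle-insertion depths spanning 0% to 100% of the context; the
+# 'linear' interval type spaces them evenly ('sigmoid' spacing is also supported).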
+document_depth_percent_interval_type = "linear" + +base_path = './data/CDME' +file_list = ['zh_finance.jsonl'] +cdme_datasets = [] + +needles = ['\n意大利的佛罗伦萨有一家名为"La Giostra"的餐馆,是整个佛罗伦萨中排行第一的餐馆。\n', + '"La Giostra"餐馆的特色菜肴是松露奶酪通心粉。', + '松露奶酪通心粉是该家餐馆的有着意大利皇室烹饪血统的大厨Jack制作',] +retrieval_question = ("制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫什么?" + "请按照'制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫______。'的格式回答。") +answer = "制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫Jack" + +keyword = "Jack" +diff = 25 + +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'CDME_Length{original_context_length}' + f'Depth{int(depth_percent)}', + 'type': CDMEDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needles': needles, + 'diff': diff, + 'retrieval_question': retrieval_question, + 'answer': answer, + 'keyword': keyword, + 'reader_cfg': cdme_reader_cfg, + 'infer_cfg': cdme_infer_cfg, + 'eval_cfg': cdme_eval_cfg + } + cdme_datasets.append(dataset_dict) diff --git a/configs/eval_hf_internlm_chat_20b_cdme.py b/configs/eval_needleinahaystack.py similarity index 97% rename from configs/eval_hf_internlm_chat_20b_cdme.py rename to configs/eval_needleinahaystack.py index cb7706d4a..533cc4dbf 100644 --- a/configs/eval_hf_internlm_chat_20b_cdme.py +++ b/configs/eval_needleinahaystack.py @@ -31,7 +31,7 @@ trust_remote_code=True, ), max_out_len=100, - max_seq_len=2048, + max_seq_len=8192, batch_size=8, meta_template=_meta_template, run_cfg=dict(num_gpus=2, num_procs=1), diff --git a/configs/eval_needleinahaystack_turbomind.py b/configs/eval_needleinahaystack_turbomind.py new file mode 100644 index 000000000..5f9a2f112 --- /dev/null +++ b/configs/eval_needleinahaystack_turbomind.py @@ -0,0 +1,28 @@ +from opencompass.models.turbomind import TurboMindModel + +from mmengine.config import read_base +with read_base(): + from .datasets.cdme.cdme200k import cdme_datasets + +datasets = [*cdme_datasets] + +internlm_meta_template = dict(round=[ + dict(role='HUMAN', begin='<|User|>:', end='\n'), + dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), +], + eos_token_id=103028) + +models = [ + # config for internlm-chat-20b + dict( + type=TurboMindModel, + abbr='internlm-chat-20b-turbomind', + path='./turbomind', + max_out_len=100, + max_seq_len=201000, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/docs/en/advanced_guides/needleinahaystack_eval.md b/docs/en/advanced_guides/needleinahaystack_eval.md index 329f42c97..f04ba8d3f 100644 --- a/docs/en/advanced_guides/needleinahaystack_eval.md +++ b/docs/en/advanced_guides/needleinahaystack_eval.md @@ -2,24 +2,24 @@ ## Introduction to the Needle In A Haystack Test -The Needle In A Haystack test (inspired by [NeedleInAHaystack](https://github.com/gkamradt/LLMTest_NeedleInAHaystack/blob/main/LLMNeedleHaystackTester.py)) involves embedding key information randomly within a long text to form prompts for large language models (LLMs). This test evaluates the LLM's ability to extract key information from extensive text, reflecting the fundamental capabilities of LLMs in understanding long texts. 
+The Needle In A Haystack test, inspired by [NeedleInAHaystack](https://github.com/gkamradt/LLMTest_NeedleInAHaystack/blob/main/LLMNeedleHaystackTester.py), is a method to evaluate the long-text information extraction ability of Large Language Models (LLMs). It involves randomly inserting key information at various points in a long text to form a prompt for LLMs. This test assesses the fundamental ability of LLMs to understand long texts by extracting critical information from them. -## Dataset Overview +## Dataset Introduction -The `Skywork/ChineseDomainModelingEval` dataset includes high-quality Chinese articles published from September to October 2023, covering multiple domains. These articles ensure a fair and challenging benchmark test. +The `Skywork/ChineseDomainModelingEval` dataset includes high-quality Chinese articles published between September and October 2023, covering multiple domains. These articles ensure a fair and challenging benchmark for testing. ## File Description -The dataset includes files specific to certain domains: +The dataset includes files specific to various domains: - `zh_finance.jsonl` - Finance - `zh_game.jsonl` - Gaming -- `zh_government.jsonl` - Government Affairs +- `zh_government.jsonl` - Government - `zh_movie.jsonl` - Movies - `zh_tech.jsonl` - Technology - `zh_general.jsonl` - General -These files are used to evaluate the LLM's understanding capabilities in different specific areas. +These files are used to assess the LLM's understanding of different specific domains. ### Evaluation Steps @@ -58,51 +58,182 @@ cd opencompass pip install -e . ``` -### Generating the Dataset +### Configuring the Dataset -Run the following command to generate the dataset: +In the latest version, datasets are no longer generated by running scripts but dynamically defined and loaded through configuration files. Users need to specify dataset parameters in the configuration file according to their needs, offering greater flexibility and customization options. -```bash -python tools/tools_needleinahaystack.py \ - --processed_datasets_path './data/CDME/processed' \ - --data_path './data/CDME' \ - --tokenizer_model 'gpt-4' \ - --num_records_per_file 10 \ - --length_buffer 200 \ - --guided True \ - --file_list 'zh_finance.jsonl' \ - --context_lengths 1000 2000 3000 4000 5000 6000 7000 8000 \ - --needle '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n' \ - --retrieval_question '小明最喜欢的实习地点是哪里?你的回答格式应该为“小明最喜欢的实习地点就是________。”' \ - --document_depth_percent_intervals 35 \ +#### Dataset Configuration Example + +Here is an example of dataset configuration, showing how to define a dataset in the `configs/datasets/cdme/cdme8k.py` configuration file. 
This example demonstrates a Chinese dataset configuration with a length of 8000 tokens: + +```python +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'CDME_Length{original_context_length}Depth{int(depth_percent)}', + 'type': CDMEDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n', + 'retrieval_question': '小明最喜欢的实习地点是哪里?请按照“小明最喜欢的实习地点就是________。”的格式回答。', + 'reader_cfg': cdme_reader_cfg, + 'infer_cfg': cdme_infer_cfg, + 'eval_cfg': cdme_eval_cfg + } + cdme_datasets.append(dataset_dict) ``` -You can set specific parameters when launching `tools/tools_needleinahaystack.py` to select the datasets required for your task. Key parameters include: +In this configuration, the main parameters include: + +- `abbr`: Abbreviation of the dataset. +- `type`: Dataset type. +- `path`: Path to the dataset files. +- `length`: Context length in tokens. +- `depth`: Depth percentage of the document. +- `tokenizer_model`: Tokenizer model used. +- `file_list`: List of data source files. +- `num_repeats_per_file`: Number of repeats per file. +- `length_buffer`: Length buffer. +- `guide`: Whether it's a guided dataset. +- `language`: Language of the dataset. +- `needle`: Specific text to find in the dataset (the 'needle'). +- `retrieval_question`: Question used to prompt the model for retrieval. +- `reader_cfg`, `infer_cfg`, `eval_cfg`: Configurations for reading, inference, and evaluation, respectively. -- `needle`: The specific text (needle) to be located within the dataset. -- `retrieval_question`: The question used to prompt the model for retrieval. -- `context_lengths`: Specifies the context lengths (in tokens) for different test scenarios. -- `document_depth_percent_intervals`: The number of interval divisions for document depth to determine where to insert the "needle". +By defining these parameters in the configuration file, you can flexibly create datasets that suit your needs. Configuration files offer a highly customizable and scalable way to manage the generation and use of datasets. + +### Multi-Needle Needle In A Haystack Test + +The latest version introduces the multi-needle Needle In A Haystack test, allowing multiple different needles (text snippets) to be inserted into the same dataset. These needles are inserted in sequence according to a given depth parameter. Compared to the single-needle test, the multi-needle test provides a more complex data processing scenario. + +#### Multi-Needle Dataset Configuration Example + +Here is an example of configuring a multi-needle dataset, showing how to define a multi-needle dataset in the `configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py` configuration file. This example demonstrates a dataset configuration with three needles: + +```python +# Basic dataset configuration +base_path = './data/CDME' +file_list = ['zh_finance.jsonl'] + +# Definition of Needles +needles = [ + '\n意大利的佛罗伦萨有一家名为"La Giostra"的餐馆,是整个佛罗伦萨中排行第一的餐馆。\n', + '"La Giostra"餐馆的特色菜肴是松露奶酪通心粉。', + '松露奶酪通心粉是该家餐馆的有着意大利皇室烹饪血统的大厨Jack制作' +] + + +# Configuration parameters +retrieval_question = ("制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫什么?" 
+ "请按照'制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫______。'的格式回答。") +answer = "制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫Jack" +keyword = "Jack" +diff = 25 + +# Dataset generation loop +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + # Other configuration items... + 'needles': needles, + 'diff': diff, + 'keyword': keyword, + # Other configuration items... + } + cdme_datasets.append(dataset_dict) +``` + +In this configuration, in addition to the standard parameters, the main new parameters include: + +- `needles`: A list containing multiple strings, each representing a needle to be inserted. +- `diff`: Defines the depth increment for subsequent needles relative to the first needle. +- `keyword`: A keyword used for score correction during the evaluation process. + +#### Change in Scoring Mechanism + +In the source code of `opencompass/datasets/cdme/cdme_multi.py`, the scoring mechanism for multi-needle datasets differs. The following code segment has been added to adjust the scores based on the `keyword` in the predictions: + +```python +if keyword in prediction: + print(f'{keyword} is in {prediction}') + score = 100 +else: + print(f'{keyword} is not in {prediction}') + score = 0.2 * score +``` + +This code means that if the keyword is present in the prediction, it will be awarded a high score (e.g., 100). If not, the score will be significantly reduced (20% of the original score). This scoring mechanism places more emphasis on the accuracy of keywords, supplementing the traditional scoring methods. ### Evaluation -For example, to evaluate using the `internlm` model, you can use the following command: +#### Evaluating with the `internlm` Model + +For example, to evaluate using the `internlm` model, the following command can be used: ```bash -python run.py configs/eval_hf_internlm_chat_20b_cdme.py --slurm -p partition_name-q auto --max-num-workers 32 +python run.py configs/eval_needleinahaystack.py --slurm -p partition_name -q auto --max-num-workers 32 ``` -This command initiates the evaluation process, where the model will attempt to find the specified "needle" in the generated dataset. The parameters `-p partition_name-q auto` and `--max-num-workers 32` specify the Slurm queue and the maximum number of worker processes. +This command initiates the evaluation process, where the model attempts to find the specified "needle" in the generated dataset. The parameters `-p partition_name -q auto` and `--max-num-workers 32` specify the Slurm queue and the maximum number of worker processes, respectively. -### Score Calculation Method +#### Large-Scale Text Evaluation with `LMDeploy` + +When evaluating especially long texts (e.g., 200k tokens), conventional methods might lead to memory overload. In such cases, quantized models can be used for evaluation. This can be achieved using the `LMDeploy` tool ([LMDeploy](https://github.com/InternLM/lmdeploy)). + +Detailed information about installing and configuring `LMDeploy` can be found on its GitHub page. Once installed, the `TurboMindModel` defined in the `configs/eval_needleinahaystack_turbomind.py` configuration file can be used for evaluation. 
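+
+If `LMDeploy` is not yet installed, a typical setup is a single pip install (a minimal sketch; the package is published on PyPI as `lmdeploy`, and the exact version and model-conversion steps should be taken from the LMDeploy documentation). The converted TurboMind model directory is then referenced by the `path` field of the model config:
+
+```bash
+pip install lmdeploy
+```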
+ +Below is an example configuration in the `configs/eval_needleinahaystack_turbomind.py` file: + +```python +from opencompass.models.turbomind import TurboMindModel +from mmengine.config import read_base + +with read_base(): + from .datasets.cdme.cdme200k import cdme_datasets + +datasets = [*cdme_datasets] + +internlm_meta_template = dict(round=[ + dict(role='HUMAN', begin=':', end='\n'), + dict(role='BOT', begin=':', end='\n', generate=True), +], + eos_token_id=103028) + +models = [ + dict( + type=TurboMindModel, + abbr='internlm-chat-20b-turbomind', + path='./turbomind', + max_out_len=100, + max_seq_len=2048, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] +``` + +In this configuration, the `TurboMindModel` combines the functionality of `LMDeploy`, suitable for handling large-scale text datasets and effectively reducing memory usage. -In the `CDMEEvaluator` class, we use two main methods to calculate scores: `levenshtein_distance` and `score`. Here is a detailed introduction and implementation +### Score Calculation Method -of these methods. +In the `CDMEEvaluator` class, we use two main methods to calculate scores: `levenshtein_distance` and `score`. Here are detailed explanations and implementations of these methods. #### Levenshtein Distance -Levenshtein distance is a method for measuring the difference between two strings. It represents the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other. +Levenshtein distance is a measure of the difference between two strings. It represents the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other. ```python def levenshtein_distance(self, s1, s2): @@ -127,7 +258,7 @@ def levenshtein_distance(self, s1, s2): #### Score Calculation -The `score` calculation method accepts lists of predictions and references and calculates the edit distance and score for each pair of prediction and reference. +The `score` calculation method accepts two lists of predictions and references and calculates the edit distance and score for each pair of prediction and reference. ```python def score(self, predictions, references): @@ -140,12 +271,14 @@ def score(self, predictions, references): prediction = re.sub(r'\s+', '', prediction) reference = re.sub(r'\s+', '', reference) edit_distance = self.levenshtein_distance(prediction, reference) + + max_len = max(len(prediction), len(reference)) - score = 100 * (1 - edit_distance / max_len) if max_len != 0 else 100 + score = 100 * (1 - edit_distance /max_len) if max_len != 0 else 100 detail = { "pred": prediction, - "answer": reference, + "ref": reference, "edit_distance": edit_distance, "score": score } @@ -153,28 +286,41 @@ def score(self, predictions, references): details.append(detail) average_score = total_score / len(predictions) if predictions else 0 - result = {"score": average_score, "details": details} + result = {"average_score": average_score, "details": details} return result ``` -The method first removes all whitespace characters from the predictions and references, then calculates the Levenshtein distance between them. The score is calculated as 100 minus the percentage loss based on the edit distance. Finally, it returns detailed scores for each prediction and the average score. 
+This scoring method first removes all whitespace characters from both predictions and references and then calculates the Levenshtein distance between them. The score is calculated as 100 minus the percentage loss based on edit distance. Finally, it returns detailed scores for each prediction and the average score overall. ### Visualization -You can visualize the CSV files in the `outputs` folder using the `tools_needleinahaystack.py` script. For example: +The `tools_needleinahaystack.py` script can be used to visualize CSV files. This script supports specifying one or more CSV file paths through the `--path` parameter and can use the `--dataset_length` parameter to specify the length of the dataset. + +#### Usage Examples + +To visualize a single CSV file: ```bash -python tools/tools_needleinahaystack.py \ - --plot \ - --csv_file_paths 'outputs/default/20231216_161457/summary/summary_20231216_161457.csv' 'outputs/default/20231217_022310/summary/summary_20231217_022310.csv' +python tools/tools_needleinahaystack.py --path 'outputs/default/20231216_161457/summary/summary_20231216_161457.csv' ``` -Currently, this approach only supports the CDME dataset, and we welcome community contributions to more datasets. +To visualize multiple CSV files: -If you use this method, please add a citation: +```bash +python tools/tools_needleinahaystack.py --path 'path_to_first_csv.csv' 'path_to_second_csv.csv' +``` -```bibtex +To specify the dataset length for visualization, which is used for generating titles in the visualization charts: +```bash +python tools/tools_needleinahaystack.py --path 'path_to_csv.csv' --dataset_length 200K +``` + +Currently, this approach only supports the CDME dataset, and we welcome community contributions for more datasets. + +If you use this method, please cite as follows: + +```bibtex @misc{2023opencompass, title={OpenCompass: A Universal Evaluation Platform for Foundation Models}, author={OpenCompass Contributors}, @@ -191,11 +337,10 @@ If you use this method, please add a citation: @misc{wei2023skywork, title={Skywork: A More Open Bilingual Foundation Model}, - author={Tianwen Wei and others}, + author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei Lü and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou}, year={2023}, eprint={2310.19341}, archivePrefix={arXiv}, primaryClass={cs.CL} } - ``` diff --git a/docs/en/index.rst b/docs/en/index.rst index fe3792a46..f5e77fa37 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -70,6 +70,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. advanced_guides/subjective_evaluation.md advanced_guides/circular_eval.md advanced_guides/contamination_eval.md + advanced_guides/needleinahaystack_eval.md .. _Tools: .. toctree:: diff --git a/docs/zh_cn/advanced_guides/needleinahaystack_eval.md b/docs/zh_cn/advanced_guides/needleinahaystack_eval.md index c0376d511..236c8944a 100644 --- a/docs/zh_cn/advanced_guides/needleinahaystack_eval.md +++ b/docs/zh_cn/advanced_guides/needleinahaystack_eval.md @@ -58,41 +58,173 @@ cd opencompass pip install -e . 
``` -### 生成数据集 +### 配置数据集 -运行以下命令以生成数据集: +在最新版本中,数据集不再通过运行脚本手动生成,而是通过在配置文件中动态定义和加载。用户需要根据自己的需求,在配置文件中指定数据集的参数。这种方法提供了更大的灵活性和定制化选项。 -```bash -python tools/tools_needleinahaystack.py \ - --processed_datasets_path './data/CDME/processed' \ - --data_path './data/CDME' \ - --tokenizer_model 'gpt-4' \ - --num_records_per_file 10 \ - --length_buffer 200 \ - --guided True \ - --file_list 'zh_finance.jsonl' \ - --context_lengths 1000 2000 3000 4000 5000 6000 7000 8000 \ - --needle '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n' \ - --retrieval_question '小明最喜欢的实习地点是哪里?你的回答格式应该为“小明最喜欢的实习地点就是________。”' \ - --document_depth_percent_intervals 35 \ -``` +#### 数据集配置示例 + +以下是一个数据集配置的示例,展示了如何在配置文件 `configs/datasets/cdme/cdme8k.py` 中定义一个数据集。这个示例展示了一个 8000 tokens 长度的中文数据集配置: -您可以在启动 `tools/tools_needleinahaystack.py` 时设置特定参数,以选择任务所需的数据集。主要参数包括: +```python +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + 'abbr': f'CDME_Length{original_context_length}Depth{int(depth_percent)}', + 'type': CDMEDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle': '\n小明最喜欢的实习的地点就是上海人工智能实验室。\n', + 'retrieval_question': '小明最喜欢的实习地点是哪里?请按照“小明最喜欢的实习地点就是________。”的格式回答。', + 'reader_cfg': cdme_reader_cfg, + 'infer_cfg': cdme_infer_cfg, + 'eval_cfg': cdme_eval_cfg + } + cdme_datasets.append(dataset_dict) +``` -- `needle`: 要在数据集中查找的指定文本(针)。 +在这个配置中,主要参数包括: + +- `abbr`: 数据集的简称。 +- `type`: 数据集类型。 +- `path`: 数据集文件的路径。 +- `length`: 上下文长度(以token为单位)。 +- `depth`: 文档深度百分比。 +- `tokenizer_model`: 使用的tokenizer 模型。 +- `file_list`: 数据源文件列表。 +- `num_repeats_per_file`: 每个文件重复的次数。 +- `length_buffer`: 长度缓冲区。 +- `guide`: 是否为引导式数据集。 +- `language`: 数据集的语言。 +- `needle`: 在数据集中要查找的特定文本(针)。 - `retrieval_question`: 用于提示模型检索的问题。 -- `context_lengths`: 指定不同测试场景的上下文长度(以token为单位)。 -- `document_depth_percent_intervals`: 文档深度的划分间隔数量,用于确定在何处插入“针”。 +- `reader_cfg`, `infer_cfg`, `eval_cfg`: 分别对应读取、推理和评估的配置。 + +通过在配置文件中定义这些参数,您可以灵活地创建适合您需求的数据集。配置文件提供了一种高度可定制和扩展的方式来管理数据集的生成和使用。 + +### 多根针大海捞针测试 + +最新版本中引入了多根针大海捞针测试,允许在同一个数据集中插入多个不同的针(文本片段)。这些针将根据给定的深度参数依次插入数据集中。相对于单针测试,多针测试提供了更复杂的数据处理场景。 + +#### 多针数据集配置示例 + +以下是一个配置多针数据集的示例,展示了如何在配置文件 `configs/datasets/cdme/multi_needle/cdme8k_cot3_italy.py` 中定义多针数据集。这个示例展示了一个包含三根针的数据集配置: + +```python +# 数据集基础配置 +base_path = './data/CDME' +file_list = ['zh_finance.jsonl'] + +# 针(Needles)定义 +needles = [ + '\n意大利的佛罗伦萨有一家名为"La Giostra"的餐馆,是整个佛罗伦萨中排行第一的餐馆。\n', + '"La Giostra"餐馆的特色菜肴是松露奶酪通心粉。', + '松露奶酪通心粉是该家餐馆的有着意大利皇室烹饪血统的大厨Jack制作' +] + +# 配置参数 +retrieval_question = ("制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫什么?" + "请按照'制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫______。'的格式回答。") +answer = "制作佛罗伦萨中排行第一的餐馆的特色菜肴的人叫Jack" +keyword = "Jack" +diff = 25 + +# 数据集生成循环 +for original_context_length in context_lengths: + for depth_percent in generate_depth_percents( + document_depth_percent_intervals, + document_depth_percent_interval_type): + dataset_dict = { + # 其他配置项... + 'needles': needles, + 'diff': diff, + 'keyword': keyword, + # 其他配置项... 
+ } + cdme_datasets.append(dataset_dict) +``` + +在这个配置中,除了标准的参数之外,主要新增了以下几个关键参数: + +- `needles`: 一个包含多个字符串的列表,每个字符串代表一个要插入的针。 +- `diff`: 定义后续针相对于第一根针的插入深度增量。 +- `keyword`: 用于在评分过程中对答案进行校正的关键词。 + +#### 评分机制的改变 + +在 `opencompass/datasets/cdme/cdme_multi.py` 的源代码中,对于多根针的数据集,评分机制有所不同。新增了以下代码段,用于基于 `keyword` 对预测的答案进行评分校正: + +```python +if keyword in prediction: + print(f'{keyword} is in {prediction}') + score = 100 +else: + print(f'{keyword} is not in {prediction}') + score = 0.2 * score +``` + +这段代码意味着如果预测的答案中包含了 `keyword`,则会给予高分(如100分)。如果不包含,则分数会被大幅度降低(原分数的20%)。这种评分机制更加注重关键词的准确性,是对传统评分方法的一个重要补充。 ### 评估 +#### 使用 `internlm` 模型进行评估 + 例如,使用 `internlm` 模型进行评估,可以使用以下命令: ```bash -python run.py configs/eval_hf_internlm_chat_20b_cdme.py --slurm -p partition_name-q auto --max-num-workers 32 +python run.py configs/eval_needleinahaystack.py --slurm -p partition_name -q auto --max-num-workers 32 +``` + +这个命令将启动评估流程,其中模型将试图在生成的数据集中找到指定的“针”。参数 `-p partition_name -q auto` 和 `--max-num-workers 32` 用于指定 Slurm 队列和最大工作进程数。 + +#### 使用 `LMDeploy` 进行大规模文本评估 + +当评估特别长的文本(例如 200k tokens)时,常规方法可能会导致显存不足。在这种情况下,可以使用量化模型进行评估。这可以通过使用 `LMDeploy` 工具([LMDeploy](https://github.com/InternLM/lmdeploy))完成。 + +安装和配置 `LMDeploy` 的详细信息可以在其 GitHub 页面上找到。安装完成后,可以使用 `configs/eval_needleinahaystack_turbomind.py` 配置文件中定义的 `TurboMindModel` 模型进行评估。 + +以下是 `configs/eval_needleinahaystack_turbomind.py` 文件的示例配置: + +```python +from opencompass.models.turbomind import TurboMindModel +from mmengine.config import read_base + +with read_base(): + from .datasets.cdme.cdme200k import cdme_datasets + +datasets = [*cdme_datasets] + +internlm_meta_template = dict(round=[ + dict(role='HUMAN', begin=':', end='\n'), + dict(role='BOT', begin=':', end='\n', generate=True), +], + eos_token_id=103028) + +models = [ + dict( + type=TurboMindModel, + abbr='internlm-chat-20b-turbomind', + path='./turbomind', + max_out_len=100, + max_seq_len=2048, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] ``` -这个命令将启动评估流程,其中模型将试图在生成的数据集中找到指定的“针”。参数 `-p partition_name-q auto` 和 `--max-num-workers 32` 用于指定Slurm队列和最大工作进程数。 +在这个配置中,`TurboMindModel` 模型结合了 `LMDeploy` 的功能,适用于处理大规模文本数据集,有效减少显存的占用。 ### Score计算方法 @@ -159,12 +291,26 @@ def score(self, predictions, references): ### 可视化 -可以使用 `tools_needleinahaystack.py` 脚本,将 `outputs` 文件夹中的 CSV 文件进行可视化绘图。例如 +可以使用 `tools_needleinahaystack.py` 脚本来对 CSV 文件进行可视化绘图。这个脚本支持通过 `--path` 参数指定一个或多个 CSV 文件的路径,并且可以使用 `--dataset_length` 参数来指定数据集的长度。 + +#### 使用示例 + +绘制单个 CSV 文件的可视化: + +```bash +python tools/tools_needleinahaystack.py --path 'outputs/default/20231216_161457/summary/summary_20231216_161457.csv' +``` + +绘制多个 CSV 文件的可视化: + +```bash +python tools/tools_needleinahaystack.py --path 'path_to_first_csv.csv' 'path_to_second_csv.csv' +``` + +指定数据集长度进行可视化,此参数用于生成可视化图中的图表标题: ```bash -python tools/tools_needleinahaystack.py \ - --plot \ - --csv_file_paths 'outputs/default/20231216_161457/summary/summary_20231216_161457.csv' 'outputs/default/20231217_022310/summary/summary_20231217_022310.csv' +python tools/tools_needleinahaystack.py --path 'path_to_csv.csv' --dataset_length 200K ``` 目前该方案仅支持 CDME 数据集,我们欢迎社区贡献更多的数据集。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 16852f56e..0031fe641 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -71,6 +71,7 @@ OpenCompass 上手路线 advanced_guides/circular_eval.md advanced_guides/contamination_eval.md advanced_guides/compassbench_intro.md + advanced_guides/needleinahaystack_eval.md .. _工具: .. 
toctree:: diff --git a/opencompass/datasets/cdme/cdme_multi.py b/opencompass/datasets/cdme/cdme_multi.py new file mode 100644 index 000000000..32b297522 --- /dev/null +++ b/opencompass/datasets/cdme/cdme_multi.py @@ -0,0 +1,224 @@ +import json +import random +import re +from pathlib import Path + +import tiktoken +from datasets import Dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS + + +@LOAD_DATASET.register_module() +class CDMEDataset(BaseDataset): + + @staticmethod + def load( + path: str, + length: int, + depth: int, + tokenizer_model: str, + file_list: 'list[str]', + num_repeats_per_file: int, + length_buffer: int, + guide: bool, + language: str, + needles: 'list[str]', + diff: int, + retrieval_question: str, + answer: str, + keyword: str, + ): + data = {'prompt': [], 'answer': []} + tokenizer = tiktoken.encoding_for_model(tokenizer_model) + + def _generate_context(tokens_context, depth_percent, needles): + tokens_needle = [ + _get_tokens_from_context(needle) for needle in needles + ] + insertion_points = [] + total_length = len(tokens_context) + + for i, needle_tokens in enumerate(tokens_needle): + if i == 0: + insertion_point = int(total_length * (depth_percent / 100)) + else: + insertion_point = int(insertion_points[i - 1] + + len(tokens_needle[i - 1]) + + total_length * (diff / 100)) + insertion_point = min( + insertion_point, + total_length + sum(len(tn) for tn in tokens_needle[:i])) + insertion_points.append(insertion_point) + + for i, needle_tokens in enumerate(tokens_needle): + tokens_context = tokens_context[:insertion_points[i]] \ + + needle_tokens + tokens_context[insertion_points[i]:] + for j in range(i + 1, len(insertion_points)): + insertion_points[j] += len(needle_tokens) + + new_context = _decode_tokens(tokens_context) + return new_context + + def _get_tokens_from_context(context): + if isinstance(context, list): + return [tokenizer.encode(item) for item in context] + else: + return tokenizer.encode(context) + + def _decode_tokens(tokens): + return tokenizer.decode(tokens) + + def _modify_retrieval_question(retrieval_question): + if language == 'Chinese': + parts = retrieval_question.split('请按照') + guide_retrieval_question = (parts[0] + '在回答之前,请思考文档中与此问题' + '最相关的内容是什么。请按照' + parts[1]) + return guide_retrieval_question + elif language == 'English': + parts = retrieval_question.split('Please answer in the format') + guide_retrieval_question = ( + parts[0] + 'Before answering, please consider' + ' what in the document is most relevant to this question.' + ' Please answer in the format' + parts[1]) + return guide_retrieval_question + else: + raise ValueError(f"Language '{language}' is not supported.") + + def _generate_prompt(context, retrieval_question): + if guide: + retrieval_question = _modify_retrieval_question( + retrieval_question) + + if language == 'Chinese': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n' + f'用户现在给你的文档是{context}\n\n' + f'现在请问:{retrieval_question}') + elif language == 'English': + prompt = ('You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. 
Do not' + ' talk about irrelevant topics or repeat your ' + 'answers.\n' + f'The document given to you by the user is {context}' + f'\n\nNow, the question is: {retrieval_question}') + else: + raise ValueError(f"Language '{language}' is not supported.") + + return prompt + + files = Path(path).glob('*.jsonl') + for file in files: + if file.name not in file_list: + continue + + with open(file, 'r', encoding='utf-8') as f: + lines_bak = [json.loads(line.strip()) for line in f] + lines = lines_bak.copy() + for counter in range(num_repeats_per_file): + random.seed(counter) + random.shuffle(lines) + + context_length = length - length_buffer + target_length_per_record = context_length - \ + sum(len(tokens) for tokens + in _get_tokens_from_context(needles)) + + accumulated_tokens = [] + for line in lines: + tokens_current_line = _get_tokens_from_context( + line['text']) + accumulated_tokens.extend(tokens_current_line) + + if len(accumulated_tokens) >= target_length_per_record: + break + + processed_text = _generate_context( + accumulated_tokens[:target_length_per_record], depth, + needles) + + processed_prompt = _generate_prompt(processed_text, + retrieval_question) + + data['prompt'].append(processed_prompt) + data['answer'].append(answer + '*' + keyword) + + dataset = Dataset.from_dict({ + 'prompt': data['prompt'], + 'answer': data['answer'], + }) + return dataset + + +class CDMEEvaluator(BaseEvaluator): + + def levenshtein_distance(self, s1, s2): + if len(s1) < len(s2): + return self.levenshtein_distance(s2, s1) + + if len(s2) == 0: + return len(s1) + + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[j + 1] + 1 + deletions = current_row[j] + 1 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + + return previous_row[-1] + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different lengths' + } + + total_score = 0 + details = [] + for prediction, reference in zip(predictions, references): + keyword = reference.split('*')[1] + reference = reference.split('*')[0] + prediction = re.sub(r'\s+', '', prediction) + reference = re.sub(r'\s+', '', reference) + edit_distance = self.levenshtein_distance(prediction, reference) + max_len = max(len(prediction), len(reference)) + score = 100 * (1 - + edit_distance / max_len) if max_len != 0 else 100 + + if keyword in prediction: + print(f'{keyword} is in {prediction}') + score = 100 + else: + print(f'{keyword} is not in {prediction}') + score = 0.2 * score + + detail = { + 'pred': prediction, + 'answer': reference, + 'edit_distance': edit_distance, + 'score': score + } + total_score += score + details.append(detail) + + average_score = total_score / len(predictions) if predictions else 0 + result = {'score': average_score, 'details': details} + return result + + +@TEXT_POSTPROCESSORS.register_module('cdme') +def cdme_postprocess(text: str) -> str: + return text + + +@TEXT_POSTPROCESSORS.register_module('cdme_dataset') +def cdme_dataset_postprocess(text: str) -> str: + return text diff --git a/tools/tools_needleinahaystack.py b/tools/tools_needleinahaystack.py index e0d5600fb..64a2ba862 100644 --- a/tools/tools_needleinahaystack.py +++ b/tools/tools_needleinahaystack.py @@ -9,12 +9,10 @@ class CDMEDataset(): @staticmethod - def visualize(csv_file_paths): - for file_path in 
csv_file_paths: + def visualize(path: str, dataset_length: str): + for file_path in path: df = pd.read_csv(file_path) - # Split 'dataset' column to - # get 'Context Length' and 'Document Depth' df['Context Length'] = df['dataset'].apply( lambda x: int(x.split('Length')[1].split('Depth')[0])) df['Document Depth'] = df['dataset'].apply( @@ -45,7 +43,7 @@ def visualize(csv_file_paths): overall_score = mean_scores.mean() # Create heatmap and line plot - plt.figure(figsize=(17.5, 8)) + plt.figure(figsize=(15.5, 8)) ax = plt.gca() cmap = LinearSegmentedColormap.from_list( 'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F']) @@ -84,9 +82,9 @@ def visualize(csv_file_paths): ax2.legend(loc='upper left') # Set chart title and labels - ax.set_title(f'{model_name} 8K Context Performance\n' + - 'Fact Retrieval Across Context Lengths ' + - '("Needle In A Haystack")') + ax.set_title(f'{model_name} {dataset_length} Context ' + 'Performance\nFact Retrieval Across ' + 'Context Lengths ("Needle In A Haystack")') ax.set_xlabel('Token Limit') ax.set_ylabel('Depth Percent') ax.set_xticklabels(pivot_table.columns.values, rotation=45) @@ -102,7 +100,9 @@ def visualize(csv_file_paths): # Save heatmap as PNG png_file_path = file_path.replace('.csv', f'_{model_name}.png') - # plt.tight_layout() + plt.tight_layout() + plt.subplots_adjust(right=1) + plt.draw() plt.savefig(png_file_path) plt.show() @@ -116,21 +116,20 @@ def main(): parser = argparse.ArgumentParser(description='Generate NeedleInAHaystack' 'Test Plots') - parser.add_argument('--plot', - action='store_true', - help='Visualize the dataset results') - parser.add_argument('--csv_file_paths', + parser.add_argument('--path', nargs='*', default=['path/to/your/result.csv'], help='Paths to CSV files for visualization') - + parser.add_argument('--dataset_length', + default='8K', + type=str, + help='Dataset_length for visualization') args = parser.parse_args() - if args.plot: - if not args.csv_file_paths: - print("Error: '--csv_file_paths' is required for visualization.") - exit(1) - CDMEDataset.visualize(args.csv_file_paths) + if not args.path: + print("Error: '--path' is required for visualization.") + exit(1) + CDMEDataset.visualize(args.path, args.dataset_length) if __name__ == '__main__':