From 16f29b25f142802c7cd7448d0c077237e4db2313 Mon Sep 17 00:00:00 2001 From: Mo Li <82895469+DseidLi@users.noreply.github.com> Date: Sun, 7 Apr 2024 17:51:13 +0800 Subject: [PATCH] [Fix] Simplify needlebench summarizer (#1024) * Conflicts: configs/summarizers/needlebench.py * fix lint problems --- .../datasets/needlebench/atc/atc_choice_50.py | 43 + .../datasets/needlebench/atc/atc_choice_80.py | 43 + configs/summarizers/needlebench.py | 880 ++++-------------- opencompass/summarizers/needlebench.py | 404 ++++---- 4 files changed, 505 insertions(+), 865 deletions(-) create mode 100644 configs/datasets/needlebench/atc/atc_choice_50.py create mode 100644 configs/datasets/needlebench/atc/atc_choice_80.py diff --git a/configs/datasets/needlebench/atc/atc_choice_50.py b/configs/datasets/needlebench/atc/atc_choice_50.py new file mode 100644 index 000000000..c0ffc0eee --- /dev/null +++ b/configs/datasets/needlebench/atc/atc_choice_50.py @@ -0,0 +1,43 @@ +from mmengine.config import read_base +with read_base(): + from .atc_choice_20 import * + +needle_num_list = list(range(2, 50, 1)) +needlebench_datasets = [] + +for _name in list(single_choice_prompts.keys()): + + needlebench_atc_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=(single_choice_prompts[_name])), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer,), + ) + + needlebench_atc_eval_cfg = dict( + evaluator=dict(type=CircularEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + for num_needles in needle_num_list: + abbr = (f'NeedleBenchATCDataset-' + f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}') + language = "English" if "en" in _name else "Chinese" + if 'reasoning' in _name: + abbr += '-Reasoning' + dataset_dict = { + 'abbr': abbr, + 'type': NeedleBenchATCDataset, + 'path': names_path, + 'num_needles': num_needles, + 'language': language, + 'repeats': repeats, + 'with_circular': with_circular_eval, + 'reader_cfg': needlebench_atc_reader_cfg, + 'infer_cfg': needlebench_atc_infer_cfg, + 'eval_cfg': needlebench_atc_eval_cfg + } + needlebench_datasets.append(dataset_dict) + diff --git a/configs/datasets/needlebench/atc/atc_choice_80.py b/configs/datasets/needlebench/atc/atc_choice_80.py new file mode 100644 index 000000000..c631e0fca --- /dev/null +++ b/configs/datasets/needlebench/atc/atc_choice_80.py @@ -0,0 +1,43 @@ +from mmengine.config import read_base +with read_base(): + from .atc_choice_20 import * + +needle_num_list = list(range(2, 80, 1)) +needlebench_datasets = [] + +for _name in list(single_choice_prompts.keys()): + + needlebench_atc_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=(single_choice_prompts[_name])), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer,), + ) + + needlebench_atc_eval_cfg = dict( + evaluator=dict(type=CircularEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + for num_needles in needle_num_list: + abbr = (f'NeedleBenchATCDataset-' + f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}') + language = "English" if "en" in _name else "Chinese" + if 'reasoning' in _name: + abbr += '-Reasoning' + dataset_dict = { + 'abbr': abbr, + 'type': NeedleBenchATCDataset, + 'path': names_path, + 'num_needles': num_needles, + 'language': language, + 'repeats': repeats, + 'with_circular': with_circular_eval, + 'reader_cfg': needlebench_atc_reader_cfg, + 'infer_cfg': needlebench_atc_infer_cfg, + 
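+            # NOTE: these ATC datasets are built with circular option rotations
+            # ('with_circular'), and CircularEvaluator reports metrics such as
+            # 'acc_1' and 'perf_4', which gen_atc_summarizer in
+            # configs/summarizers/needlebench.py consumes.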
'eval_cfg': needlebench_atc_eval_cfg + } + needlebench_datasets.append(dataset_dict) + diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py index 5fef6f0bc..3ecb9f32a 100644 --- a/configs/summarizers/needlebench.py +++ b/configs/summarizers/needlebench.py @@ -1,653 +1,142 @@ from opencompass.summarizers.needlebench import NeedleBenchSummarizer -from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer -# ----------NeedleBench-4k-summarizer---------- -context_lengths_4k = list(range(1000, 5000, 1000)) -depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100] -depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] - -# Initialize the lists -_needlebench_4k_2needle_en = [] -_needlebench_4k_3needle_en = [] -_needlebench_4k_4needle_en = [] -_needlebench_4k_5needle_en = [] -_needlebench_4k_2needle_zh = [] -_needlebench_4k_3needle_zh = [] -_needlebench_4k_4needle_zh = [] -_needlebench_4k_5needle_zh = [] -_needlebench_4k_origin_en = [] -_needlebench_4k_origin_zh = [] - -# Fill the lists using nested loops -for original_context_length in context_lengths_4k: - for depth_percent in depths: - _needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k') - _needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k') - _needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k') - _needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k') - _needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k') - _needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k') - _needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k') - _needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k') - - _needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k') - _needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k') - -# Concatenate the multi-needle and origin lists -_needlebench_4k_multi_needle_en = _needlebench_4k_2needle_en + _needlebench_4k_3needle_en + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en -_needlebench_4k_multi_needle_zh = _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh -_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh -_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh - -# Repeating the same process for parallel (assuming it's similar to origin_en) -_needlebench_4k_parallel_en = [] -_needlebench_4k_parallel_zh = [] -for original_context_length in context_lengths_4k: - _needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k') -for original_context_length in context_lengths_4k: - _needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k') -_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh - -needlebench_summary_groups = [ - {'name': 'original_version', 'subsets': _needlebench_4k_origin}, - {'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh}, - {'name': 'original_version_en', 
'subsets': _needlebench_4k_origin_en}, - - {'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en}, - {'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en}, - {'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en}, - {'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en}, - {'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en}, - - {'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh}, - {'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh}, - {'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh}, - {'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh}, - {'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh}, - - {'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle}, - - {'name': 'parallel_version', 'subsets': _needlebench_4k_parallel}, - {'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh}, - {'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en}, +def create_m_rs_names_list(context_lengths, depths, needle_counts, + languages, dataset_size): + names_dict = {} + multi_needle_list = [] + multi_needle_en_list = [] + multi_needle_zh_list = [] + + for needle_count in needle_counts: + for language in languages: + key = f"{needle_count}-Needle-{language.upper()}-{dataset_size.upper()}" + names_list = [ + f"Length{length}Depth{int(depth)}_{needle_count}needle_{language}_{dataset_size}" + for length in context_lengths + for depth in depths + ] + names_dict[key] = names_list + + multi_needle_list.extend(names_list) + if language == 'en': + multi_needle_en_list.extend(names_list) + elif language == 'zh': + multi_needle_zh_list.extend(names_list) + names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list + names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list + names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list + + return names_dict + +def create_summarizer(context_lengths, depths, dataset_size, + sparse_depths=None): + needle_counts = ["2", "3", "4", "5"] + languages = ["en", "zh"] + if sparse_depths: + depths = sparse_depths + names_dict = {} + multi_reasoning_names = create_m_rs_names_list( + context_lengths, depths, needle_counts, languages, dataset_size) + + names_dict.update(multi_reasoning_names) + + single_needle_list = [] + single_needle_en_list = [] + single_needle_zh_list = [] + + for language in languages: + names_list = [ + f"Length{length}Depth{int(depth)}_origin_{language}_{dataset_size}" + for length in context_lengths + for depth in depths + ] + single_needle_list.extend(names_list) + if language == 'en': + single_needle_en_list.extend(names_list) + elif language == 'zh': + single_needle_zh_list.extend(names_list) + names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list + names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list + names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list + + parallel_list = [] + parallel_en_list = [] + parallel_zh_list = [] + + for language in languages: + names_list = [ + f"Length{length}_parallel_{language}_{dataset_size}" + for length in context_lengths + ] + parallel_list.extend(names_list) + if language == 'en': + parallel_en_list.extend(names_list) + elif language == 'zh': + parallel_zh_list.extend(names_list) + names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list + names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list + names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list + + 
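+    # Illustration (derived from the f-string patterns above, shown for
+    # dataset_size='4k', context length 1000, depth 0): the groups map e.g.
+    #   '2-Needle-EN-4K'             <- 'Length1000Depth0_2needle_en_4k'
+    #   'Single-Needle-Retrieval-EN' <- 'Length1000Depth0_origin_en_4k'
+    #   'Multi-Needle-Retrieval-EN'  <- 'Length1000_parallel_en_4k'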
summary_groups = [
+        {'name': key, 'subsets': value} for key, value in names_dict.items()
+    ]
+
+    summary_groups.append({
+        'name': 'NeedleBench-Overall-Score',
+        'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
+                    ['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
+                    ['Multi-Needle-Retrieval(M-RT)', 'average_score']],
+        'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
+                    'Multi-Needle-Reasoning(M-RS)': 0.3,
+                    'Multi-Needle-Retrieval(M-RT)': 0.3}})
+    summarizer_config = {
+        'type': NeedleBenchSummarizer,
+        'summary_groups': summary_groups,
+        'dataset_abbrs': [
+            'NeedleBench-Overall-Score',
+            f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
+            'Single-Needle-Retrieval(S-RT)',
+            'Single-Needle-Retrieval-EN',
+            'Single-Needle-Retrieval-ZH',
+            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
+            'Multi-Needle-Retrieval(M-RT)',
+            'Multi-Needle-Retrieval-EN',
+            'Multi-Needle-Retrieval-ZH',
+            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
+            'Multi-Needle-Reasoning(M-RS)',
+            'Multi-Needle-Reasoning-EN',
+            'Multi-Needle-Reasoning-ZH',
+            f'2-Needle-EN-{dataset_size.upper()}',
+            f'2-Needle-ZH-{dataset_size.upper()}',
+            f'3-Needle-EN-{dataset_size.upper()}',
+            f'3-Needle-ZH-{dataset_size.upper()}',
+            f'4-Needle-EN-{dataset_size.upper()}',
+            f'4-Needle-ZH-{dataset_size.upper()}',
+            f'5-Needle-EN-{dataset_size.upper()}',
+            f'5-Needle-ZH-{dataset_size.upper()}',
+        ]
+    }
+    return summarizer_config
-    {'name': 'overall',
-     'subsets': [['original_version', 'naive_average'],
-                 ['multi_needle', 'naive_average'],
-                 ['parallel_version', 'average_score']],
-     'weights': {'original_version': 0.4,
-                 'multi_needle': 0.3,
-                 'parallel_version': 0.3}},
-]
-needlebench_4k_summarizer = dict(
-    type=NeedleBenchSummarizer,
-    dataset_abbrs=[
-        'overall',
-        '--------- NeedleBench-4k Single-Needle ---------',  # category
-        'original_version',
-        'original_version_zh',
-        'original_version_en',
-        '--------- NeedleBench-4k Parallel-Needles ---------',  # category
-        'parallel_version',
-        'parallel_version_zh',
-        'parallel_version_en',
-        '--------- NeedleBench-4k Multi-Needles ---------',  # category
-        'multi_needle',
-        'multi_needle_en',
-        'multi_needle_zh',
-        'multi_needle2_en',
-        'multi_needle3_en',
-        'multi_needle4_en',
-        'multi_needle5_en',
-        'multi_needle2_zh',
-        'multi_needle3_zh',
-        'multi_needle4_zh',
-        'multi_needle5_zh',
-
-        # *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel,
-    ],
-    summary_groups=needlebench_summary_groups,
-)
-# ----------NeedleBench-8k-summarizer----------
+depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
+depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
+context_lengths_4k = list(range(1000, 5000, 1000))
+needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths, "4k")
 context_lengths_8k = list(range(5000, 9000, 1000))
-
-# Initialize the lists
-_needlebench_8k_2needle_en = []
-_needlebench_8k_3needle_en = []
-_needlebench_8k_4needle_en = []
-_needlebench_8k_5needle_en = []
-_needlebench_8k_2needle_zh = []
-_needlebench_8k_3needle_zh = []
-_needlebench_8k_4needle_zh = []
-_needlebench_8k_5needle_zh = []
-_needlebench_8k_origin_en = []
-_needlebench_8k_origin_zh = []
-
-# Fill the lists using nested loops
-for original_context_length in context_lengths_8k:
-    for depth_percent in depths:
-        _needlebench_8k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_8k')
-        _needlebench_8k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_8k')
-
_needlebench_8k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_8k') - _needlebench_8k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_8k') - _needlebench_8k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_8k') - _needlebench_8k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_8k') - _needlebench_8k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_8k') - _needlebench_8k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_8k') - - _needlebench_8k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_8k') - _needlebench_8k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_8k') - -# Concatenate the multi-needle and origin lists -_needlebench_8k_multi_needle_en = _needlebench_8k_2needle_en + _needlebench_8k_3needle_en + _needlebench_8k_4needle_en + _needlebench_8k_5needle_en -_needlebench_8k_multi_needle_zh = _needlebench_8k_2needle_zh + _needlebench_8k_3needle_zh + _needlebench_8k_4needle_zh + _needlebench_8k_5needle_zh -_needlebench_8k_origin = _needlebench_8k_origin_en + _needlebench_8k_origin_zh -_needlebench_8k_multi_needle = _needlebench_8k_multi_needle_en + _needlebench_8k_multi_needle_zh - -# Repeating the same process for parallel (assuming it's similar to origin_en) -_needlebench_8k_parallel_en = [] -_needlebench_8k_parallel_zh = [] -for original_context_length in context_lengths_8k: - _needlebench_8k_parallel_en.append(f'Length{original_context_length}_parallel_en_8k') -for original_context_length in context_lengths_8k: - _needlebench_8k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_8k') -_needlebench_8k_parallel = _needlebench_8k_parallel_en + _needlebench_8k_parallel_zh - -needlebench_summary_groups = [ - {'name': 'original_version', 'subsets': _needlebench_8k_origin}, - {'name': 'original_version_zh', 'subsets': _needlebench_8k_origin_zh}, - {'name': 'original_version_en', 'subsets': _needlebench_8k_origin_en}, - - {'name': 'multi_needle_en', 'subsets': _needlebench_8k_multi_needle_en}, - {'name': 'multi_needle2_en', 'subsets': _needlebench_8k_2needle_en}, - {'name': 'multi_needle3_en', 'subsets': _needlebench_8k_3needle_en}, - {'name': 'multi_needle4_en', 'subsets': _needlebench_8k_4needle_en}, - {'name': 'multi_needle5_en', 'subsets': _needlebench_8k_5needle_en}, - - {'name': 'multi_needle_zh', 'subsets': _needlebench_8k_multi_needle_zh}, - {'name': 'multi_needle2_zh', 'subsets': _needlebench_8k_2needle_zh}, - {'name': 'multi_needle3_zh', 'subsets': _needlebench_8k_3needle_zh}, - {'name': 'multi_needle4_zh', 'subsets': _needlebench_8k_4needle_zh}, - {'name': 'multi_needle5_zh', 'subsets': _needlebench_8k_5needle_zh}, - - {'name': 'multi_needle', 'subsets': _needlebench_8k_multi_needle}, - - {'name': 'parallel_version', 'subsets': _needlebench_8k_parallel}, - {'name': 'parallel_version_zh', 'subsets': _needlebench_8k_parallel_zh}, - {'name': 'parallel_version_en', 'subsets': _needlebench_8k_parallel_en}, - - - {'name': 'overall', - 'subsets': [['original_version', 'naive_average'], - ['multi_needle', 'naive_average'], - ['parallel_version', 'average_score']], - 'weights': {'original_version': 0.4, - 'multi_needle': 0.3, - 'parallel_version': 0.3}}, -] -needlebench_8k_summarizer = dict( - type=NeedleBenchSummarizer, - dataset_abbrs=[ - 'overall', - '--------- 
NeedleBench-8k Single-Needle ---------', # category - 'original_version', - 'original_version_zh', - 'original_version_en', - '--------- NeedleBench-8k Parallel-Needles ---------', # category - 'parallel_version', - 'parallel_version_zh', - 'parallel_version_en', - '--------- NeedleBench-8k Multi-Needles ---------', # category - 'multi_needle', - 'multi_needle_en', - 'multi_needle_zh', - 'multi_needle2_en', - 'multi_needle3_en', - 'multi_needle4_en', - 'multi_needle5_en', - 'multi_needle2_zh', - 'multi_needle3_zh', - 'multi_needle4_zh', - 'multi_needle5_zh', - - # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel, - ], - summary_groups=needlebench_summary_groups, -) - -# ----------NeedleBench-32k-summarizer---------- - +needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths, "8k") context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000] - -# Initialize the lists -_needlebench_32k_2needle_en = [] -_needlebench_32k_3needle_en = [] -_needlebench_32k_4needle_en = [] -_needlebench_32k_5needle_en = [] -_needlebench_32k_2needle_zh = [] -_needlebench_32k_3needle_zh = [] -_needlebench_32k_4needle_zh = [] -_needlebench_32k_5needle_zh = [] -_needlebench_32k_origin_en = [] -_needlebench_32k_origin_zh = [] - -# Fill the lists using nested loops -for original_context_length in context_lengths_32k: - for depth_percent in depths_list_sparse: - _needlebench_32k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_32k') - _needlebench_32k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_32k') - _needlebench_32k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_32k') - _needlebench_32k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_32k') - _needlebench_32k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_32k') - _needlebench_32k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_32k') - _needlebench_32k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_32k') - _needlebench_32k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_32k') - - _needlebench_32k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_32k') - _needlebench_32k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_32k') - -# Concatenate the multi-needle and origin lists -_needlebench_32k_multi_needle_en = _needlebench_32k_2needle_en + _needlebench_32k_3needle_en + _needlebench_32k_4needle_en + _needlebench_32k_5needle_en -_needlebench_32k_multi_needle_zh = _needlebench_32k_2needle_zh + _needlebench_32k_3needle_zh + _needlebench_32k_4needle_zh + _needlebench_32k_5needle_zh -_needlebench_32k_origin = _needlebench_32k_origin_en + _needlebench_32k_origin_zh -_needlebench_32k_multi_needle = _needlebench_32k_multi_needle_en + _needlebench_32k_multi_needle_zh - -# Repeating the same process for parallel (assuming it's similar to origin_en) -_needlebench_32k_parallel_en = [] -_needlebench_32k_parallel_zh = [] -for original_context_length in context_lengths_32k: - _needlebench_32k_parallel_en.append(f'Length{original_context_length}_parallel_en_32k') -for original_context_length in context_lengths_32k: - _needlebench_32k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_32k') -_needlebench_32k_parallel 
= _needlebench_32k_parallel_en + _needlebench_32k_parallel_zh - -needlebench_summary_groups = [ - {'name': 'original_version', 'subsets': _needlebench_32k_origin}, - {'name': 'original_version_zh', 'subsets': _needlebench_32k_origin_zh}, - {'name': 'original_version_en', 'subsets': _needlebench_32k_origin_en}, - - {'name': 'multi_needle_en', 'subsets': _needlebench_32k_multi_needle_en}, - {'name': 'multi_needle2_en', 'subsets': _needlebench_32k_2needle_en}, - {'name': 'multi_needle3_en', 'subsets': _needlebench_32k_3needle_en}, - {'name': 'multi_needle4_en', 'subsets': _needlebench_32k_4needle_en}, - {'name': 'multi_needle5_en', 'subsets': _needlebench_32k_5needle_en}, - - {'name': 'multi_needle_zh', 'subsets': _needlebench_32k_multi_needle_zh}, - {'name': 'multi_needle2_zh', 'subsets': _needlebench_32k_2needle_zh}, - {'name': 'multi_needle3_zh', 'subsets': _needlebench_32k_3needle_zh}, - {'name': 'multi_needle4_zh', 'subsets': _needlebench_32k_4needle_zh}, - {'name': 'multi_needle5_zh', 'subsets': _needlebench_32k_5needle_zh}, - - {'name': 'multi_needle', 'subsets': _needlebench_32k_multi_needle}, - - {'name': 'parallel_version', 'subsets': _needlebench_32k_parallel}, - {'name': 'parallel_version_zh', 'subsets': _needlebench_32k_parallel_zh}, - {'name': 'parallel_version_en', 'subsets': _needlebench_32k_parallel_en}, - - - {'name': 'overall', - 'subsets': [['original_version', 'naive_average'], - ['multi_needle', 'naive_average'], - ['parallel_version', 'average_score']], - 'weights': {'original_version': 0.4, - 'multi_needle': 0.3, - 'parallel_version': 0.3}}, -] -needlebench_32k_summarizer = dict( - type=NeedleBenchSummarizer, - dataset_abbrs=[ - 'overall', - '--------- NeedleBench-32k Single-Needle ---------', # category - 'original_version', - 'original_version_zh', - 'original_version_en', - '--------- NeedleBench-32k Parallel-Needles ---------', # category - 'parallel_version', - 'parallel_version_zh', - 'parallel_version_en', - '--------- NeedleBench-32k Multi-Needles ---------', # category - 'multi_needle', - 'multi_needle_en', - 'multi_needle_zh', - 'multi_needle2_en', - 'multi_needle3_en', - 'multi_needle4_en', - 'multi_needle5_en', - 'multi_needle2_zh', - 'multi_needle3_zh', - 'multi_needle4_zh', - 'multi_needle5_zh', - - # *_needlebench_32k_origin, *_needlebench_32k_multi_needle, *_needlebench_32k_parallel, - ], - summary_groups=needlebench_summary_groups, -) - -# ----------NeedleBench-128k-summarizer---------- - +needlebench_32k_summarizer = create_summarizer(context_lengths_32k, depths_list_sparse, "32k") context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000]) - -# Initialize the lists -_needlebench_128k_2needle_en = [] -_needlebench_128k_3needle_en = [] -_needlebench_128k_4needle_en = [] -_needlebench_128k_5needle_en = [] -_needlebench_128k_2needle_zh = [] -_needlebench_128k_3needle_zh = [] -_needlebench_128k_4needle_zh = [] -_needlebench_128k_5needle_zh = [] -_needlebench_128k_origin_en = [] -_needlebench_128k_origin_zh = [] - -# Fill the lists using nested loops -for original_context_length in context_lengths_128k: - for depth_percent in depths_list_sparse: - _needlebench_128k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_128k') - _needlebench_128k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_128k') - _needlebench_128k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_128k') - 
_needlebench_128k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_128k') - _needlebench_128k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_128k') - _needlebench_128k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_128k') - _needlebench_128k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_128k') - _needlebench_128k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_128k') - - _needlebench_128k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_128k') - _needlebench_128k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_128k') - -# Concatenate the multi-needle and origin lists -_needlebench_128k_multi_needle_en = _needlebench_128k_2needle_en + _needlebench_128k_3needle_en + _needlebench_128k_4needle_en + _needlebench_128k_5needle_en -_needlebench_128k_multi_needle_zh = _needlebench_128k_2needle_zh + _needlebench_128k_3needle_zh + _needlebench_128k_4needle_zh + _needlebench_128k_5needle_zh -_needlebench_128k_origin = _needlebench_128k_origin_en + _needlebench_128k_origin_zh -_needlebench_128k_multi_needle = _needlebench_128k_multi_needle_en + _needlebench_128k_multi_needle_zh - -# Repeating the same process for parallel (assuming it's similar to origin_en) -_needlebench_128k_parallel_en = [] -_needlebench_128k_parallel_zh = [] -for original_context_length in context_lengths_128k: - _needlebench_128k_parallel_en.append(f'Length{original_context_length}_parallel_en_128k') -for original_context_length in context_lengths_128k: - _needlebench_128k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_128k') -_needlebench_128k_parallel = _needlebench_128k_parallel_en + _needlebench_128k_parallel_zh - -needlebench_summary_groups = [ - {'name': 'original_version', 'subsets': _needlebench_128k_origin}, - {'name': 'original_version_zh', 'subsets': _needlebench_128k_origin_zh}, - {'name': 'original_version_en', 'subsets': _needlebench_128k_origin_en}, - - {'name': 'multi_needle_en', 'subsets': _needlebench_128k_multi_needle_en}, - {'name': 'multi_needle2_en', 'subsets': _needlebench_128k_2needle_en}, - {'name': 'multi_needle3_en', 'subsets': _needlebench_128k_3needle_en}, - {'name': 'multi_needle4_en', 'subsets': _needlebench_128k_4needle_en}, - {'name': 'multi_needle5_en', 'subsets': _needlebench_128k_5needle_en}, - - {'name': 'multi_needle_zh', 'subsets': _needlebench_128k_multi_needle_zh}, - {'name': 'multi_needle2_zh', 'subsets': _needlebench_128k_2needle_zh}, - {'name': 'multi_needle3_zh', 'subsets': _needlebench_128k_3needle_zh}, - {'name': 'multi_needle4_zh', 'subsets': _needlebench_128k_4needle_zh}, - {'name': 'multi_needle5_zh', 'subsets': _needlebench_128k_5needle_zh}, - - {'name': 'multi_needle', 'subsets': _needlebench_128k_multi_needle}, - - {'name': 'parallel_version', 'subsets': _needlebench_128k_parallel}, - {'name': 'parallel_version_zh', 'subsets': _needlebench_128k_parallel_zh}, - {'name': 'parallel_version_en', 'subsets': _needlebench_128k_parallel_en}, - - - {'name': 'overall', - 'subsets': [['original_version', 'naive_average'], - ['multi_needle', 'naive_average'], - ['parallel_version', 'average_score']], - 'weights': {'original_version': 0.4, - 'multi_needle': 0.3, - 'parallel_version': 0.3}}, -] -needlebench_128k_summarizer = dict( - type=NeedleBenchSummarizer, - dataset_abbrs=[ - 'overall', - 
'--------- NeedleBench-128k Single-Needle ---------', # category - 'original_version', - 'original_version_zh', - 'original_version_en', - '--------- NeedleBench-128k Parallel-Needles ---------', # category - 'parallel_version', - 'parallel_version_zh', - 'parallel_version_en', - '--------- NeedleBench-128k Multi-Needles ---------', # category - 'multi_needle', - 'multi_needle_en', - 'multi_needle_zh', - 'multi_needle2_en', - 'multi_needle3_en', - 'multi_needle4_en', - 'multi_needle5_en', - 'multi_needle2_zh', - 'multi_needle3_zh', - 'multi_needle4_zh', - 'multi_needle5_zh', - - # *_needlebench_128k_origin, *_needlebench_128k_multi_needle, *_needlebench_128k_parallel, - ], - summary_groups=needlebench_summary_groups, -) - -# ----------NeedleBench-200k-summarizer---------- - +needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k") context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]) -# Initialize the lists -_needlebench_200k_2needle_en = [] -_needlebench_200k_3needle_en = [] -_needlebench_200k_4needle_en = [] -_needlebench_200k_5needle_en = [] -_needlebench_200k_2needle_zh = [] -_needlebench_200k_3needle_zh = [] -_needlebench_200k_4needle_zh = [] -_needlebench_200k_5needle_zh = [] -_needlebench_200k_origin_en = [] -_needlebench_200k_origin_zh = [] - -# Fill the lists using nested loops -for original_context_length in context_lengths_200k: - for depth_percent in depths_list_sparse: - _needlebench_200k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_200k') - _needlebench_200k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_200k') - _needlebench_200k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_200k') - _needlebench_200k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_200k') - _needlebench_200k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_200k') - _needlebench_200k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_200k') - _needlebench_200k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_200k') - _needlebench_200k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_200k') - - _needlebench_200k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_200k') - _needlebench_200k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_200k') - -# Concatenate the multi-needle and origin lists -_needlebench_200k_multi_needle_en = _needlebench_200k_2needle_en + _needlebench_200k_3needle_en + _needlebench_200k_4needle_en + _needlebench_200k_5needle_en -_needlebench_200k_multi_needle_zh = _needlebench_200k_2needle_zh + _needlebench_200k_3needle_zh + _needlebench_200k_4needle_zh + _needlebench_200k_5needle_zh -_needlebench_200k_origin = _needlebench_200k_origin_en + _needlebench_200k_origin_zh -_needlebench_200k_multi_needle = _needlebench_200k_multi_needle_en + _needlebench_200k_multi_needle_zh - -# Repeating the same process for parallel (assuming it's similar to origin_en) -_needlebench_200k_parallel_en = [] -_needlebench_200k_parallel_zh = [] -for original_context_length in context_lengths_200k: - _needlebench_200k_parallel_en.append(f'Length{original_context_length}_parallel_en_200k') -for original_context_length in context_lengths_200k: - 
_needlebench_200k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_200k') -_needlebench_200k_parallel = _needlebench_200k_parallel_en + _needlebench_200k_parallel_zh - -needlebench_summary_groups = [ - {'name': 'original_version', 'subsets': _needlebench_200k_origin}, - {'name': 'original_version_zh', 'subsets': _needlebench_200k_origin_zh}, - {'name': 'original_version_en', 'subsets': _needlebench_200k_origin_en}, - - {'name': 'multi_needle_en', 'subsets': _needlebench_200k_multi_needle_en}, - {'name': 'multi_needle2_en', 'subsets': _needlebench_200k_2needle_en}, - {'name': 'multi_needle3_en', 'subsets': _needlebench_200k_3needle_en}, - {'name': 'multi_needle4_en', 'subsets': _needlebench_200k_4needle_en}, - {'name': 'multi_needle5_en', 'subsets': _needlebench_200k_5needle_en}, - - {'name': 'multi_needle_zh', 'subsets': _needlebench_200k_multi_needle_zh}, - {'name': 'multi_needle2_zh', 'subsets': _needlebench_200k_2needle_zh}, - {'name': 'multi_needle3_zh', 'subsets': _needlebench_200k_3needle_zh}, - {'name': 'multi_needle4_zh', 'subsets': _needlebench_200k_4needle_zh}, - {'name': 'multi_needle5_zh', 'subsets': _needlebench_200k_5needle_zh}, - - {'name': 'multi_needle', 'subsets': _needlebench_200k_multi_needle}, - - {'name': 'parallel_version', 'subsets': _needlebench_200k_parallel}, - {'name': 'parallel_version_zh', 'subsets': _needlebench_200k_parallel_zh}, - {'name': 'parallel_version_en', 'subsets': _needlebench_200k_parallel_en}, - - {'name': 'overall', - 'subsets': [['original_version', 'naive_average'], - ['multi_needle', 'naive_average'], - ['parallel_version', 'average_score']], - 'weights': {'original_version': 0.4, - 'multi_needle': 0.3, - 'parallel_version': 0.3}}, -] -needlebench_200k_summarizer = dict( - type=NeedleBenchSummarizer, - dataset_abbrs=[ - 'overall', - '--------- NeedleBench-200k Single-Needle ---------', # category - 'original_version', - 'original_version_zh', - 'original_version_en', - '--------- NeedleBench-200k Parallel-Needles ---------', # category - 'parallel_version', - 'parallel_version_zh', - 'parallel_version_en', - '--------- NeedleBench-200k Multi-Needles ---------', # category - 'multi_needle', - 'multi_needle_en', - 'multi_needle_zh', - 'multi_needle2_en', - 'multi_needle3_en', - 'multi_needle4_en', - 'multi_needle5_en', - 'multi_needle2_zh', - 'multi_needle3_zh', - 'multi_needle4_zh', - 'multi_needle5_zh', - - # *_needlebench_200k_origin, *_needlebench_200k_multi_needle, *_needlebench_200k_parallel, - ], - summary_groups=needlebench_summary_groups, -) - -# ----------NeedleBench-1000k-summarizer---------- - +needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k") context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) -# Initialize the lists -_needlebench_1000k_2needle_en = [] -_needlebench_1000k_3needle_en = [] -_needlebench_1000k_4needle_en = [] -_needlebench_1000k_5needle_en = [] -_needlebench_1000k_2needle_zh = [] -_needlebench_1000k_3needle_zh = [] -_needlebench_1000k_4needle_zh = [] -_needlebench_1000k_5needle_zh = [] -_needlebench_1000k_origin_en = [] -_needlebench_1000k_origin_zh = [] - -# Fill the lists using nested loops -for original_context_length in context_lengths_1000k: - for depth_percent in depths_list_sparse: - _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k') - 
_needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k') - _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k') - _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k') - _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k') - _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k') - _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k') - _needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k') - - _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k') - _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k') - -# Concatenate the multi-needle and origin lists -_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en -_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh -_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh -_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh - -# Repeating the same process for parallel (assuming it's similar to origin_en) -_needlebench_1000k_parallel_en = [] -_needlebench_1000k_parallel_zh = [] -for original_context_length in context_lengths_1000k: - _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k') -for original_context_length in context_lengths_1000k: - _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k') -_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh +needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k") -needlebench_summary_groups = [ - {'name': 'original_version', 'subsets': _needlebench_1000k_origin}, - {'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh}, - {'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en}, - - {'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en}, - {'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en}, - {'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en}, - {'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en}, - {'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en}, - - {'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh}, - {'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh}, - {'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh}, - {'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh}, - {'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh}, - - {'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle}, - - {'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel}, - {'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh}, - {'name': 'parallel_version_en', 'subsets': 
_needlebench_1000k_parallel_en},
-    {'name': 'overall',
-     'subsets': [['original_version', 'naive_average'],
-                 ['multi_needle', 'naive_average'],
-                 ['parallel_version', 'average_score']],
-     'weights': {'original_version': 0.4,
-                 'multi_needle': 0.3,
-                 'parallel_version': 0.3}},
-]
-needlebench_1000k_summarizer = dict(
-    type=NeedleBenchSummarizer,
-    dataset_abbrs=[
-        'overall',
-        '--------- NeedleBench-1000k Single-Needle ---------',  # category
-        'original_version',
-        'original_version_zh',
-        'original_version_en',
-        '--------- NeedleBench-1000k Parallel-Needles ---------',  # category
-        'parallel_version',
-        'parallel_version_zh',
-        'parallel_version_en',
-        '--------- NeedleBench-1000k Multi-Needles ---------',  # category
-        'multi_needle',
-        'multi_needle_en',
-        'multi_needle_zh',
-        'multi_needle2_en',
-        'multi_needle3_en',
-        'multi_needle4_en',
-        'multi_needle5_en',
-        'multi_needle2_zh',
-        'multi_needle3_zh',
-        'multi_needle4_zh',
-        'multi_needle5_zh',
-
-        # *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel,
-    ],
-    summary_groups=needlebench_summary_groups,
-)
-
-context_lengths_8k = list(range(5000, 9000, 1000))
-# Repeating the same process for parallel (assuming it's similar to origin_en)
 _needlebench_8k_parallel_en_batch1 = []
 _needlebench_8k_parallel_en_batch5 = []
 _needlebench_8k_parallel_en_batch10 = []
@@ -713,7 +202,6 @@
         'parallel_version_en_batch15',
         'parallel_version_zh_batch20',
         'parallel_version_en_batch20',
-        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
     ],
     summary_groups=needlebench_summary_groups,
 )
@@ -754,64 +242,72 @@
         'parallel_version_en_batch15',
         'parallel_version_zh_batch20',
         'parallel_version_en_batch20',
-        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
     ],
     summary_groups=needlebench_summary_groups,
 )
-needle_num_list = list(range(2, 20, 1))
-
-categories = ['ZH', 'EN', 'ZH-Reasoning', 'EN-Reasoning', 'ZH-CircularEval', 'EN-CircularEval', 'ZH-Reasoning-Circular', 'EN-Reasoning-Circular']
-needlebench_atc_summary_groups = []
-
-for category in categories:
-    metric = 'perf_4' if 'CircularEval' in category else 'acc_1'
-    cleaned_category = category.replace('-CircularEval', '').replace('-Circular', '')
-    subsets = [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}'
-               for num_needles in needle_num_list]
+def gen_atc_summarizer(needle_num_list):
+    categories = [
+        'ZH-Direct-CE', 'EN-Direct-CE',
+        'ZH-Reasoning-CE', 'EN-Reasoning-CE'
+    ]
+    needlebench_atc_summary_groups = []
+
+    # Generate one summary group per category
+    for category in categories:
+        # CircularEval (CE) categories are scored with perf_4, the rest with acc_1
+        metric = 'perf_4' if 'CE' in category else 'acc_1'
+        # Dataset names carry no CircularEval tag, so strip it when building subsets
+        cleaned_category = category.replace('-CE', '').replace('-Direct', '')
+        needlebench_atc_summary_groups.append({
+            'name': category,
+            'subsets': [
+                [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}', metric]
+                for num_needles in needle_num_list
+            ],
+            'weights': {f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}': num_needles for num_needles in needle_num_list},
+        })
     needlebench_atc_summary_groups.append({
-        'name': category,
+        'name': 'ATC-CE-Overall',
         'subsets': [
-            [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}',
-             metric]
-            for num_needles in needle_num_list
-        ]
-    })
-
-atc_dataset_abbrs = []
-
-for category in categories:
-    title = f'######## Needlebench-ATC-{category}-Score ########'
-    atc_dataset_abbrs.append(title)
-
-
weighted_average_score_entry = [f'{category}', 'weighted_average'] - atc_dataset_abbrs.append(weighted_average_score_entry) - -if atc_dataset_abbrs[-1] == '------------------------------------------': - atc_dataset_abbrs.pop() - -needlebench_atc_summarizer = dict( - dataset_abbrs=[ - *atc_dataset_abbrs, - '######## Needlebench-ATC Accuracy ########', # category - *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list], - '------------------------------------------', - *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list], - '------------------------------------------', - *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list], - '------------------------------------------', - *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list], - '------------------------------------------', - '######## Needlebench-ATC CircularEval ########', # category - *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list], - '------------------------------------------', - *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list], - '------------------------------------------', - *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list], - '------------------------------------------', - *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list], - '------------------------------------------', - ], - summary_groups=needlebench_atc_summary_groups -) + [f'{category}', 'weighted_average'] for category in categories + ], + }) + atc_dataset_abbrs = [] + atc_dataset_abbrs.append(['ATC-CE-Overall', 'naive_average']) + + for category in categories: + weighted_average_score_entry = [f'{category}', 'weighted_average'] + atc_dataset_abbrs.append(weighted_average_score_entry) + + needlebench_atc_summarizer = dict( + dataset_abbrs=[ + *atc_dataset_abbrs, + '######## Needlebench-ATC Accuracy ########', # category + *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list], + '------------------------------------------', + *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list], + '------------------------------------------', + *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list], + '------------------------------------------', + *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list], + '------------------------------------------', + '######## Needlebench-ATC CircularEval ########', # category + *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list], + '------------------------------------------', + *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list], + '------------------------------------------', + *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list], + '------------------------------------------', + *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list], + '------------------------------------------', + ], + summary_groups=needlebench_atc_summary_groups + ) + return needlebench_atc_summarizer 
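+# Each per-category ATC group above weights its subsets by needle count, so,
+# assuming OpenCompass's 'weighted_average' is the weight-normalised mean,
+# harder high-needle variants count for more: with needle_num_list=[2, 3],
+# scores of 100 on 2 needles and 0 on 3 needles give
+# (100*2 + 0*3) / (2+3) = 40 rather than a plain average of 50.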
+
+
+atc_summarizer_20 = gen_atc_summarizer(list(range(2, 20, 1)))
+atc_summarizer_50 = gen_atc_summarizer(list(range(2, 50, 1)))
+atc_summarizer_80 = gen_atc_summarizer(list(range(2, 80, 1)))
diff --git a/opencompass/summarizers/needlebench.py b/opencompass/summarizers/needlebench.py
index 101785b1d..f811e3d95 100644
--- a/opencompass/summarizers/needlebench.py
+++ b/opencompass/summarizers/needlebench.py
@@ -5,6 +5,7 @@
 import math
 import os
 import os.path as osp
+import shutil
 from datetime import datetime
 from typing import Any, Dict, List, Optional
@@ -26,6 +27,92 @@
                            model_abbr_from_cfg)
 from opencompass.utils.prompt import get_prompt_hash
 
+model_name_mapping = {
+    'llama-2-7b-chat-hf': 'LLaMA-2-7B',
+    'llama-2-13b-chat-hf': 'LLaMA-2-13B',
+    'llama-2-70b-chat-hf': 'LLaMA-2-70B',
+    'baichuan2-7b-chat-hf': 'Baichuan2-7B',
+    'baichuan2-13b-chat-hf': 'Baichuan2-13B',
+    'yi-6b-chat-hf': 'Yi-6B',
+    'yi-34b-chat-hf': 'Yi-34B',
+    'deepseek-67b-chat-hf': 'DeepSeek-67B',
+    'wizardlm-70b-v1.0-vllm': 'WizardLM-70B',
+    'qwen-14b-chat-hf': 'Qwen-14B',
+    'qwen-72b-chat-hf': 'Qwen-72B',
+    'qwen-72b-chat-vllm': 'Qwen-72B-vLLM',
+    'internlm2-chat-7b-turbomind': 'InternLM2-7B-200K',
+    'internlm2-chat-20b-turbomind': 'InternLM2-20B-200K',
+    'internlm2-chat-7b-hf': 'InternLM2-7B',
+    'internlm2-chat-20b-hf': 'InternLM2-20B',
+    'qwen-7b-chat-hf': 'Qwen-7B',
+    'chatglm3-6b-hf': 'ChatGLM3-6B',
+    'chatglm3-6b-32k-hf': 'ChatGLM3-6B-32K',
+    'zephyr-7b-beta-vllm': 'Zephyr-7B Beta',
+    'mistral-7b-instruct-v0.2-vllm': 'Mistral-7B Inst. v0.2',
+    'mistral-7b-instruct-v0.1-vllm': 'Mistral-7B Inst. v0.1',
+    'mixtral-8x7b-instruct-v0.1-vllm': 'Mixtral-8x7B Inst. v0.1',
+    'orionstar-yi-34b-chat-hf': 'OrionStar-Yi-34B',
+    'orionstar-14b-long-chat-vllm': 'Orion-14B-LongChat',
+    'internlm-chat-7b-hf': 'InternLM-7B',
+    'gemma-2b-it-hf': 'Gemma-2B',
+    'gemma-7b-it-hf': 'Gemma-7B',
+    'qwen1.5-0.5b-chat-hf': 'Qwen-1.5-0.5B',
+    'qwen1.5-1.8b-chat-hf': 'Qwen-1.5-1.8B',
+    'qwen1.5-4b-chat-hf': 'Qwen-1.5-4B',
+    'qwen1.5-14b-chat-hf': 'Qwen-1.5-14B',
+    'qwen1.5-72b-chat-hf': 'Qwen-1.5-72B',
+    'qwen1.5-14b-chat-vllm': 'Qwen-1.5-14B-vLLM',
+    'qwen1.5-72b-chat-vllm': 'Qwen-1.5-72B-vLLM',
+    'glm4_notools': 'GLM-4',
+    'claude-3-opus': 'Claude-3-Opus',
+    # Add more mappings as necessary
+}
+
+dataset_mapping_dict = {}
+
+needle_counts = ['2', '3', '4', '5']
+languages = ['en', 'zh']
+sizes = ['4k', '8k', '32k', '128k', '200k', '1000k']
+types = ['origin', 'parallel']
+
+for needle_count in needle_counts:
+    for language in languages:
+        for size in sizes:
+            key = f'{needle_count}needle_{language}_{size}'
+            value = f'{needle_count}-Needle-Reasoning-{language.upper()}-{size.upper()}'
+            dataset_mapping_dict[key] = value
+for t in types:
+    for language in languages:
+        for size in sizes:
+            if t == 'origin':
+                key = f'{t}_{language}_{size}'
+                value = f'Single-Needle-Retrieval-{language.upper()}-{size.upper()}'
+            elif t == 'parallel':
+                key = f'{t}_{language}_{size}'
+                value = f'Multi-Needle-Retrieval-{language.upper()}-{size.upper()}'
+            dataset_mapping_dict[key] = value
+
+
+def calculate_elementwise_average(model_name, merged_df):
+    score_columns = [col for col in merged_df.columns if col != 'dataset']
+
+    origin_columns = [col for col in score_columns if 'origin' in col]
+    parallel_columns = [col for col in score_columns if 'parallel' in col]
+    multi_columns = [col for col in score_columns if 'needle' in col]
+
+    if origin_columns and parallel_columns and multi_columns:
+        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
+        parallel_avg =
merged_df[parallel_columns].mean(axis=1) * 0.3 + multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3 + merged_df[model_name] = origin_avg + parallel_avg + multi_avg + else: + relevant_columns = origin_columns or parallel_columns or multi_columns + if relevant_columns: + merged_df[model_name] = merged_df[relevant_columns].mean(axis=1) + else: + merged_df[model_name] = pd.Series([0] * len(merged_df)) + + return merged_df.iloc[:, [0, -1]] def read_after_specific_line_except_last(file_name, keyword, offset): with open(file_name, 'r', encoding='utf-8') as file: @@ -65,6 +152,12 @@ def create_model_dataframe(nested_dict, model_name, dataset_abbr, parallel=False df = pd.DataFrame(data, columns=['dataset', model_name]) return df +def convert_to_k(value): + try: + return f'{int(value) // 1000}k' + except ValueError: + return value + def parse_model_scores(text): lines = text.split('\n') @@ -82,8 +175,86 @@ def parse_model_scores(text): return result_dict +def remove_empty_subfolders(plot_path): + for folder_name in tqdm(os.listdir(plot_path), + desc='Deleting Empty folders'): + folder_path = os.path.join(plot_path, folder_name) + if os.path.isdir(folder_path): + if not os.listdir(folder_path): + shutil.rmtree(folder_path) + +def save_results_to_plots(txt_results_save_path): + content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2) + parsed_data = parse_model_scores(content) + model_names = get_dict_model_names(parsed_data) + numbers = [2, 3, 4, 5] + languages = ['en', 'zh'] + size_exists = [] + sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k'] + + for size in sizes_origin: + if size in content: + size_exists.append(size) + + multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists] + origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists] + parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists] + + dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \ + parallel_dataset_abbrs + base_path = os.path.dirname(txt_results_save_path) + plot_path = os.path.join(base_path, 'plots') + + model_scores = {} + + for model_name in tqdm(model_names): + model_datasets_scores = {} # Dictionary to store scores for each dataset for the current model + for dataset_abbr in dataset_abbrs: + parallel_flag = 'parallel' in dataset_abbr + + folder_path = os.path.join(plot_path, dataset_mapping_dict[dataset_abbr]) + ensure_directory(folder_path) + + save_path = os.path.join(folder_path, f'{model_name}.png') + + df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag) + + score = visualize(df, save_path, model_name, dataset_abbr) + + model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score) + + overall_dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + parallel_dataset_abbrs + overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png') + merged_df = merge_dataframes(model_name, overall_dataset_abbrs, parsed_data) + averaged_df = calculate_elementwise_average(model_name, merged_df) + overall_score = visualize(averaged_df, overall_score_pic_path, model_name, 'Overall Score') + + # Single-Retrieval + single_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_single_retrieval_overall.png') + single_retrieval_merged_df = merge_dataframes(model_name, origin_dataset_abbrs, parsed_data) + single_retrieval_averaged_df = 
calculate_elementwise_average(model_name, single_retrieval_merged_df) + single_retrieval_overall_score = visualize(single_retrieval_averaged_df, single_retrieval_score_pic_path, model_name, 'Single-Retrieval Overall Score') + + # Multi-Retrieval + multi_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_retrieval_overall.png') + multi_retrieval_merged_df = merge_dataframes(model_name, parallel_dataset_abbrs, parsed_data) + multi_retrieval_averaged_df = calculate_elementwise_average(model_name, multi_retrieval_merged_df) + multi_retrieval_overall_score = visualize(multi_retrieval_averaged_df, multi_retrieval_score_pic_path, model_name, 'Multi-Retrieval Overall Score') + + # Multi-Reasoning + multi_reasoning_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_reasoning_overall.png') + multi_reasoning_merged_df = merge_dataframes(model_name, multi_dataset_abbrs, parsed_data) + multi_reasoning_averaged_df = calculate_elementwise_average(model_name, multi_reasoning_merged_df) + multi_reasoning_overall_score = visualize(multi_reasoning_averaged_df, multi_reasoning_score_pic_path, model_name, 'Multi-Reasoning Overall Score') + + model_scores[model_name] = averaged_df + remove_empty_subfolders(plot_path) + return model_scores + def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str): df = df_raw.copy() + if df.empty: + return -1 df['Context Length'] = df['dataset'].apply( lambda x: int(x.split('Length')[1].split('Depth')[0])) df['Document Depth'] = df['dataset'].apply( @@ -98,144 +269,96 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str): model_df = df[['Document Depth', 'Context Length', model_name]].copy() model_df.rename(columns={model_name: 'Score'}, inplace=True) - - # Create pivot table pivot_table = pd.pivot_table(model_df, - values='Score', - index=['Document Depth'], - columns=['Context Length'], - aggfunc='mean') + values='Score', + index=['Document Depth'], + columns=['Context Length'], + aggfunc='mean') - # Calculate mean scores mean_scores = pivot_table.mean().values - - # Calculate overall score overall_score = mean_scores.mean() - - # Create heatmap and line plot - plt.figure(figsize=(15.5, 8)) + plt.figure(figsize=(10, 6)) ax = plt.gca() cmap = LinearSegmentedColormap.from_list( 'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F']) - # Draw heatmap sns.heatmap(pivot_table, cmap=cmap, ax=ax, - cbar_kws={'label': 'Score'}, vmin=0, vmax=100) - - # Set line plot data + cbar = ax.collections[0].colorbar x_data = [i + 0.5 for i in range(len(mean_scores))] y_data = mean_scores - # Create twin axis for line plot ax2 = ax.twinx() - # Draw line plot ax2.plot(x_data, - y_data, - color='white', - marker='o', - linestyle='-', - linewidth=2, - markersize=8, - label='Average Depth Score') - # Set y-axis range + y_data, + color='white', + marker='o', + linestyle='-', + linewidth=2, + markersize=8, + label='Average Depth Score' + ) ax2.set_ylim(0, 100) - # Hide original y-axis ticks and labels ax2.set_yticklabels([]) ax2.set_yticks([]) - # Add legend - ax2.legend(loc='upper left') - - # Set chart title and labels - ax.set_title(f'{model_name} {dataset_type} Context ' - 'Performance\nFact Retrieval Across ' - 'Context Lengths ("Needle In A Haystack")') - ax.set_xlabel('Token Limit') - ax.set_ylabel('Depth Percent') - ax.set_xticklabels(pivot_table.columns.values, rotation=45) - ax.set_yticklabels(pivot_table.index.values, rotation=0) - # Add overall score as a subtitle - plt.text(0.5, - -0.13, f'Overall Score for {model_name}: ' - 
f'{overall_score:.2f}', - ha='center', - va='center', - transform=ax.transAxes, - fontsize=13) - - plt.tight_layout() - plt.subplots_adjust(right=1) - plt.draw() - plt.savefig(save_path) - print(f'Saved :{save_path}') - plt.close() # Close figure to prevent memory leaks - return overall_score - -def save_results_to_plots(txt_results_save_path): - - content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2) - - parsed_data = parse_model_scores(content) - model_names = get_dict_model_names(parsed_data) - numbers = [2, 3, 4, 5] - languages = ['en', 'zh'] - size_exists = [] - sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k'] - - for size in sizes_origin: - if size in content: - size_exists.append(size) - - multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists] - origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists] - parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists] - - dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \ - parallel_dataset_abbrs - base_path = os.path.dirname(txt_results_save_path) - plot_path = os.path.join(base_path, 'plots') - model_scores = {} - for model_name in tqdm(model_names): - model_datasets_scores = {} # Dictionary to store scores for each dataset for the current model - for dataset_abbr in dataset_abbrs: - parallel_flag = 'parallel' in dataset_abbr + ax2.legend(loc='lower left') - # Create a directory for each dataset_abbr - folder_path = os.path.join(plot_path, dataset_abbr) - ensure_directory(folder_path) + if model_name in model_name_mapping: + title_name = model_name_mapping[model_name] + else: + title_name = model_name - # Construct the full path to save the image - save_path = os.path.join(folder_path, f'{model_name}.png') + ax.set_title(title_name, fontsize=12, fontweight='bold', pad=15) - # Create DataFrame for the model and dataset - df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag) + if dataset_type in dataset_mapping_dict: + dataset_name = dataset_mapping_dict[dataset_type] + else: + dataset_name = dataset_type + + ax.text(0.5, 1.005, f'{dataset_name}:{overall_score:.2f}', + transform=ax.transAxes, + ha='center', + fontsize=12, + fontweight='normal') + ax.set_xlabel('Token Length', fontsize=13, fontweight='normal', labelpad=1) + ax.set_ylabel('Depth Percent(%)', fontsize=13, fontweight='normal', labelpad=1) + converted_labels = [convert_to_k(value) for value in pivot_table.columns.values] + + ax.tick_params(axis='both', which='major', length=1, pad=1) + ax.tick_params(axis='both', which='minor', length=1, pad=1) + ax.set_xticklabels(converted_labels, rotation=45) + index_length = len(pivot_table.index) + + selected_indices = pivot_table.index.values[::2] + labels = [str(int(index)) for index in selected_indices] + ax.set_yticks(np.arange(0, len(pivot_table.index), 2)) + ax.set_yticklabels(labels, rotation=0) + for spine in ax.spines.values(): + spine.set_visible(False) + for spine in ax2.spines.values(): + spine.set_visible(False) - # Generate visualization and get the score - score = visualize(df, save_path, model_name, dataset_abbr) + plt.tight_layout() + plt.draw() + directory_path, original_filename = os.path.split(save_path) - # Store the score in the dictionary - model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score) + filename_suffix = (title_name+'_'+dataset_name).replace(' ', '_') + new_filename = 
-        # Process and visualize the overall score
-        overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png')
-        merged_df = merge_dataframes(model_name, dataset_abbrs, parsed_data)
+    new_save_path = os.path.join(directory_path, new_filename)
 
-        print(merge_dataframes)
-        averaged_df = calculate_elementwise_average(merged_df)
+    plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
+    print(f'Saved :{new_save_path}')
 
-        # Assume visualize returns the average score for the overall visualization
-        overall_score = visualize(averaged_df, overall_score_pic_path, 'weighted_average_score', 'Overall Score')
+    plt.close()
 
-        # Add the overall score to the dictionary
-        model_datasets_scores['Overall'] = '{:.02f}'.format(overall_score)
+    return overall_score
 
-        # Add the model's scores to the main dictionary
-        model_scores[model_name] = model_datasets_scores
 
 def ensure_directory(path):
     if not os.path.exists(path):
@@ -263,29 +386,11 @@ def merge_dataframes(model_name, dataset_abbrs, parsed_data):
     merged_df = reduce(lambda left, right: pd.merge(left, right, on='dataset', how='outer'), dfs)
 
     if merged_df.isnull().any().any():
-        print('Warning: Some rows were filtered out due to NaN values. This is often due to mismatched row counts among DataFrames.')
+        print('Warning: Some rows were filtered out due to NaN values. '
+              'This is often due to mismatched row counts among DataFrames.')
         merged_df = merged_df.dropna()
 
     return merged_df
 
-def calculate_elementwise_average(merged_df):
-    score_columns = [col for col in merged_df.columns if col != 'dataset']
-
-    origin_columns = [col for col in score_columns if 'origin' in col]
-    parallel_columns = [col for col in score_columns if 'parallel' in col]
-    multi_columns = [col for col in score_columns if 'needle' in col]
-
-    if origin_columns and parallel_columns and multi_columns:
-        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
-        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
-        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
-
-        merged_df['weighted_average_score'] = origin_avg + parallel_avg + multi_avg
-    else:
-        merged_df['weighted_average_score'] = pd.Series([0] * len(merged_df))
-
-    return merged_df.iloc[:, [0, -1]]
-
-
 class NeedleBenchSummarizer(DefaultSummarizer):
     """NeedleBench summarizer in OpenCompass.
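[Review note] The deleted helper above hard-coded the 40/30/30 weighting; its replacement in opencompass/summarizers/needlebench.py also takes model_name. For reference, the removed weighting reduces to the following (a sketch restating the deleted code, same column-name heuristics):

    import pandas as pd

    def weighted_average(merged_df: pd.DataFrame) -> pd.Series:
        cols = [c for c in merged_df.columns if c != 'dataset']
        origin = [c for c in cols if 'origin' in c]      # single-needle runs
        parallel = [c for c in cols if 'parallel' in c]  # parallel retrieval
        multi = [c for c in cols if 'needle' in c]       # multi-needle runs
        # 0.4 / 0.3 / 0.3 weights, exactly as in the removed code.
        return (merged_df[origin].mean(axis=1) * 0.4
                + merged_df[parallel].mean(axis=1) * 0.3
                + merged_df[multi].mean(axis=1) * 0.3)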
@@ -303,20 +408,17 @@ def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode):
 
         summarizer_dataset_abbrs = []
         if self.dataset_abbrs is None:
-            # display all dataset metrics included in the config
             for dataset_abbr in dataset_abbrs:
                 if dataset_abbr in dataset_metrics:
                     for metric in dataset_metrics[dataset_abbr]:
                         summarizer_dataset_abbrs.append((dataset_abbr, metric))
                 else:
                     summarizer_dataset_abbrs.append((dataset_abbr, None))
-            # along with all possible group metrics
             for dataset_abbr in dataset_metrics:
                 for metric in dataset_metrics[dataset_abbr]:
                     if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                         summarizer_dataset_abbrs.append((dataset_abbr, metric))
         else:
-            # follow the required order
             for item in self.dataset_abbrs:
                 if isinstance(item, str):
                     summarizer_dataset_abbrs.append((item, None))
@@ -332,6 +434,7 @@ def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode):
 
         for dataset_abbr, metric in summarizer_dataset_abbrs:
             if dataset_abbr not in dataset_metrics:
+                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
                 table.append(header)
                 continue
@@ -378,33 +481,7 @@ def _format_raw_txt(self, raw_results):
         raw_txts = '\n'.join(raw_txts)
         return raw_txts
 
-    def _read_and_sort_dataframe(self, file_path):
-        # Read the file without treating the first row as a header
-        df = pd.read_csv(file_path, header=None)
-
-        # Function to sort columns based on the value of a specific row, excluding the first column
-        def sort_columns_based_on_row_corrected(df, base_row_idx, start_row_idx, end_row_idx):
-            # Extract the rows for sorting
-            sort_values_row = df.iloc[base_row_idx, 1:].replace('-', np.nan).apply(pd.to_numeric, errors='coerce')
-            # Handle NaNs by setting them to a value less than the minimum or using a method to keep them at the end
-            min_possible_value = sort_values_row.min(skipna=True) - 1  # Use min value in the row minus 1 or another method
-            sort_values_row_filled = sort_values_row.fillna(min_possible_value)
-            # Get the sorted order of indices, excluding the first column
-            sorted_col_indices = sort_values_row_filled.sort_values(ascending=False).index
-            # Apply the sorted column indices to the whole DataFrame, adjusting for Python's 0-based index
-            df.iloc[start_row_idx:end_row_idx+1] = df.iloc[start_row_idx:end_row_idx+1, [0] + sorted_col_indices.tolist()]
-
-        # Apply the corrected sorting function based on the description
-        sort_columns_based_on_row_corrected(df, 1, 0, 2)  # For rows 1-2 based on row 2's values
-        sort_columns_based_on_row_corrected(df, 4, 3, 7)  # For rows 4-7 based on row 5's values
-        sort_columns_based_on_row_corrected(df, 9, 8, 12)  # For rows 9-12 based on row 10's values
-        sort_columns_based_on_row_corrected(df, 14, 13, 25)  # For rows 14-25 based on row 15's values
-
-        # Return the sorted DataFrame
-        return df
-
     def _output_to_file(self, output_path, time_str, table, raw_txts):
-        # output to file
         if output_path is None:
             output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt')
             output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv')
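[Review note] For anyone who relied on the deleted `_sorted.csv` output: the core of the removed sort, reordering score columns of a headerless frame by one row's values with '-' treated as missing, is roughly this (toy frame, invented values):

    import numpy as np
    import pandas as pd

    # Headerless summary-style frame: column 0 is the label column.
    df = pd.DataFrame([['metric', 'model_a', 'model_b', 'model_c'],
                       ['score', '70.0', '-', '90.0']])
    row = pd.to_numeric(df.iloc[1, 1:].replace('-', np.nan), errors='coerce')
    # Missing scores are pushed below the true minimum, then columns are
    # reordered by descending score: model_c, model_a, model_b.
    order = (row.fillna(row.min(skipna=True) - 1)
             .sort_values(ascending=False).index)
    df = df.iloc[:, [0] + order.tolist()]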
@@ -436,38 +513,19 @@ def _output_to_file(self, output_path, time_str, table, raw_txts):
             f.write('\n'.join([','.join(row) for row in table]) + '\n')
         self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
 
-        df_sorted = self._read_and_sort_dataframe(output_csv_path)
-
-        sorted_file_path = osp.abspath(output_csv_path).split('.')[0] + '_sorted.csv'
-        df_sorted.to_csv(sorted_file_path, index=False, header=False)
-
-        self.logger.info(f'write sorted csv to {sorted_file_path}')
-
     def summarize(
             self,
             output_path: str = None,
             time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):  # noqa
-        # pick up results
         raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()
-
-        # calculate group metrics
         raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
             self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)
-
-        # format table
         table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode)
-
-        # format raw txt
         raw_txts = self._format_raw_txt(raw_results)
-
-        # output to screen
         print(tabulate.tabulate(table, headers='firstrow'))
-
-        # output to .text / .csv files
         self._output_to_file(output_path, time_str, table, raw_txts)
-
         if self.lark_reporter:
             content = f'{getpass.getuser()} 的'
             content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
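[Review note] The Lark-reporter strings above are Chinese string literals in the code and read roughly "{user}'s detailed evaluation summary has been written to {path}". For completeness, wiring this summarizer into an eval config should follow the usual OpenCompass pattern; the dataset_abbrs below are placeholders, not a recommended set:

    from opencompass.summarizers.needlebench import NeedleBenchSummarizer

    summarizer = dict(
        type=NeedleBenchSummarizer,
        # Placeholder abbrs; in practice these come from the needlebench
        # summarizer config's summary groups.
        dataset_abbrs=['original_version', 'parallel_version'],
    )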