diff --git a/configs/datasets/ARC_c/ARC_c_clean_ppl.py b/configs/datasets/ARC_c/ARC_c_clean_ppl.py
index 5581919d2..44e9e3e6b 100644
--- a/configs/datasets/ARC_c/ARC_c_clean_ppl.py
+++ b/configs/datasets/ARC_c/ARC_c_clean_ppl.py
@@ -40,7 +40,8 @@
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=PPLInferencer))
 
-ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator))
+ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                      analyze_contamination=True)
 
 ARC_c_datasets = [
     dict(
diff --git a/configs/datasets/hellaswag/hellaswag_clean_ppl.py b/configs/datasets/hellaswag/hellaswag_clean_ppl.py
index 2bb054837..1f37f4d6b 100644
--- a/configs/datasets/hellaswag/hellaswag_clean_ppl.py
+++ b/configs/datasets/hellaswag/hellaswag_clean_ppl.py
@@ -21,7 +21,8 @@
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=PPLInferencer))
 
-hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator))
+hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                          analyze_contamination=True)
 
 hellaswag_datasets = [
     dict(
diff --git a/configs/datasets/mmlu/mmlu_clean_ppl.py b/configs/datasets/mmlu/mmlu_clean_ppl.py
index 26cef0383..90a8cf122 100644
--- a/configs/datasets/mmlu/mmlu_clean_ppl.py
+++ b/configs/datasets/mmlu/mmlu_clean_ppl.py
@@ -97,7 +97,8 @@
         inferencer=dict(type=PPLInferencer),
     )
 
-    mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator), )
+    mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                         analyze_contamination=True)
 
     mmlu_datasets.append(
         dict(
diff --git a/configs/summarizers/contamination.py b/configs/summarizers/contamination.py
index 7e0d42b23..96c782288 100644
--- a/configs/summarizers/contamination.py
+++ b/configs/summarizers/contamination.py
@@ -59,6 +59,128 @@
     'physician': {'accuracy - clean': 24, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 24, 'accuracy - not labeled': 0},
 }
 
+mmlu_category_weights = {
+    "business_ethics": {"accuracy - clean": 44, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 1},
+    "security_studies": {"accuracy - clean": 188, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 47, "accuracy - not labeled": 0},
+    "high_school_us_history": {"accuracy - clean": 42, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 161},
+    "moral_disputes": {"accuracy - clean": 105, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 168, "accuracy - not labeled": 59},
+    "philosophy": {"accuracy - clean": 81, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 187, "accuracy - not labeled": 31},
+    "public_relations": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
+    "high_school_microeconomics": {"accuracy - clean": 82, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 146, "accuracy - not labeled": 0},
+    "human_sexuality": {"accuracy - clean": 108, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 4},
+    "professional_accounting": {"accuracy - clean": 88, "accuracy - input contaminated": 40, "accuracy - input-and-label contaminated": 152, "accuracy - not labeled": 1},
+    "high_school_government_and_politics": {"accuracy - clean": 104, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 82, "accuracy - not labeled": 0},
+    "sociology": {"accuracy - clean": 105, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 91, "accuracy - not labeled": 0},
+    "conceptual_physics": {"accuracy - clean": 79, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 147, "accuracy - not labeled": 0},
+    "human_aging": {"accuracy - clean": 208, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 13, "accuracy - not labeled": 0},
+    "high_school_psychology": {"accuracy - clean": 108, "accuracy - input contaminated": 26, "accuracy - input-and-label contaminated": 162, "accuracy - not labeled": 248},
+    "jurisprudence": {"accuracy - clean": 59, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 0},
+    "moral_scenarios": {"accuracy - clean": 320, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 574},
+    "college_medicine": {"accuracy - clean": 107, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 44, "accuracy - not labeled": 5},
+    "high_school_world_history": {"accuracy - clean": 61, "accuracy - input contaminated": 2, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 173},
+    "virology": {"accuracy - clean": 104, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 58, "accuracy - not labeled": 0},
+    "high_school_statistics": {"accuracy - clean": 96, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
+    "nutrition": {"accuracy - clean": 172, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 98, "accuracy - not labeled": 24},
+    "abstract_algebra": {"accuracy - clean": 84, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 7, "accuracy - not labeled": 0},
+    "high_school_geography": {"accuracy - clean": 91, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 105, "accuracy - not labeled": 0},
+    "econometrics": {"accuracy - clean": 62, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 0},
+    "marketing": {"accuracy - clean": 115, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 101, "accuracy - not labeled": 2},
+    "high_school_chemistry": {"accuracy - clean": 108, "accuracy - input contaminated": 25, "accuracy - input-and-label contaminated": 69, "accuracy - not labeled": 0},
+    "prehistory": {"accuracy - clean": 154, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 107, "accuracy - not labeled": 57},
+    "college_physics": {"accuracy - clean": 25, "accuracy - input contaminated": 20, "accuracy - input-and-label contaminated": 57, "accuracy - not labeled": 0},
+    "management": {"accuracy - clean": 35, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 62, "accuracy - not labeled": 0},
+    "college_biology": {"accuracy - clean": 91, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 0},
+    "high_school_biology": {"accuracy - clean": 128, "accuracy - input contaminated": 17, "accuracy - input-and-label contaminated": 135, "accuracy - not labeled": 29},
+    "high_school_physics": {"accuracy - clean": 42, "accuracy - input contaminated": 28, "accuracy - input-and-label contaminated": 80, "accuracy - not labeled": 0},
+    "logical_fallacies": {"accuracy - clean": 133, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 24, "accuracy - not labeled": 0},
+    "medical_genetics": {"accuracy - clean": 49, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 1},
+    "machine_learning": {"accuracy - clean": 71, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
+    "professional_law": {"accuracy - clean": 401, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 1119},
+    "professional_psychology": {"accuracy - clean": 265, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 27, "accuracy - not labeled": 310},
+    "global_facts": {"accuracy - clean": 89, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 0},
+    "us_foreign_policy": {"accuracy - clean": 71, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 25, "accuracy - not labeled": 0},
+    "international_law": {"accuracy - clean": 73, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 46, "accuracy - not labeled": 0},
+    "clinical_knowledge": {"accuracy - clean": 172, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 86, "accuracy - not labeled": 0},
+    "high_school_mathematics": {"accuracy - clean": 178, "accuracy - input contaminated": 59, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
+    "high_school_computer_science": {"accuracy - clean": 62, "accuracy - input contaminated": 7, "accuracy - input-and-label contaminated": 28, "accuracy - not labeled": 2},
+    "college_computer_science": {"accuracy - clean": 68, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 1},
+    "electrical_engineering": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 61, "accuracy - not labeled": 0},
+    "college_mathematics": {"accuracy - clean": 61, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
+    "computer_security": {"accuracy - clean": 55, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
+    "high_school_macroeconomics": {"accuracy - clean": 102, "accuracy - input contaminated": 14, "accuracy - input-and-label contaminated": 173, "accuracy - not labeled": 100},
+    "astronomy": {"accuracy - clean": 112, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 35, "accuracy - not labeled": 0},
+    "college_chemistry": {"accuracy - clean": 46, "accuracy - input contaminated": 19, "accuracy - input-and-label contaminated": 34, "accuracy - not labeled": 0},
+    "high_school_european_history": {"accuracy - clean": 41, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 123},
+    "miscellaneous": {"accuracy - clean": 256, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 477},
+    "formal_logic": {"accuracy - clean": 92, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 21, "accuracy - not labeled": 0},
+    "elementary_mathematics": {"accuracy - clean": 155, "accuracy - input contaminated": 31, "accuracy - input-and-label contaminated": 103, "accuracy - not labeled": 88},
+    "world_religions": {"accuracy - clean": 130, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
+    "professional_medicine": {"accuracy - clean": 191, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 1, "accuracy - not labeled": 36},
+    "anatomy": {"accuracy - clean": 52, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
+}
+
+mmlu_all_sets = [
+    "college_biology",
+    "college_chemistry",
+    "college_computer_science",
+    "college_mathematics",
+    "college_physics",
+    "electrical_engineering",
+    "astronomy",
+    "anatomy",
+    "abstract_algebra",
+    "machine_learning",
+    "clinical_knowledge",
+    "global_facts",
+    "management",
+    "nutrition",
+    "marketing",
+    "professional_accounting",
+    "high_school_geography",
+    "international_law",
+    "moral_scenarios",
+    "computer_security",
+    "high_school_microeconomics",
+    "professional_law",
+    "medical_genetics",
+    "professional_psychology",
+    "jurisprudence",
+    "world_religions",
+    "philosophy",
+    "virology",
+    "high_school_chemistry",
+    "public_relations",
+    "high_school_macroeconomics",
+    "human_sexuality",
+    "elementary_mathematics",
+    "high_school_physics",
+    "high_school_computer_science",
+    "high_school_european_history",
+    "business_ethics",
+    "moral_disputes",
+    "high_school_statistics",
+    "miscellaneous",
+    "formal_logic",
+    "high_school_government_and_politics",
+    "prehistory",
+    "security_studies",
+    "high_school_biology",
+    "logical_fallacies",
+    "high_school_world_history",
+    "professional_medicine",
+    "high_school_mathematics",
+    "college_medicine",
+    "high_school_us_history",
+    "sociology",
+    "econometrics",
+    "high_school_psychology",
+    "human_aging",
+    "us_foreign_policy",
+    "conceptual_physics",
+]
+
+ARC_weights = {'accuracy - clean': 836, 'accuracy - input contaminated': 53, 'accuracy - input-and-label contaminated': 283, 'accuracy - not labeled': 0}
+hellaswag_weights = {'accuracy - clean': 5169, 'accuracy - input contaminated': 37, 'accuracy - input-and-label contaminated': 673, 'accuracy - not labeled': 4163}
 
 ceval_stem = ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics', 'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics', 'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry', 'high_school_biology', 'middle_school_mathematics', 'middle_school_biology', 'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine']
 ceval_social_science = ['college_economics', 'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification', 'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography']
@@ -69,89 +191,56 @@
 
 name_and_subsets = [
     ('ceval', ceval_all),
-    ('ceval-stem', ceval_stem),
-    ('ceval-social-science', ceval_social_science),
-    ('ceval-humanities', ceval_humanities),
-    ('ceval-other', ceval_other),
-    ('ceval-hard', ceval_hard)
+    ('mmlu', mmlu_all_sets),
 ]
 
-ceval_summary_groups = []
+summary_groups = []
 for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accuracy - input-and-label contaminated']:
-    for dataset_abbr, subsets in name_and_subsets:
-        weights = {f'ceval-{i}': ceval_category_weights[i][metric_name] for i in subsets}
-        subsets = [[f'ceval-{i}', metric_name] for i in subsets]
-        ceval_summary_groups.append(
-            {
-                'name': dataset_abbr,
-                'subsets': subsets,
-                'metric': metric_name,
-                'weights': weights,
-            }
-        )
+    ceval_weights = {f'ceval-{i}': ceval_category_weights[i][metric_name] for i in ceval_all}
+    ceval_subsets = [[f'ceval-{i}', metric_name] for i in ceval_all]
+
+    summary_groups.append(
+        {
+            'name': 'ceval',
+            'subsets': ceval_subsets,
+            'metric': metric_name,
+            'weights': ceval_weights,
+        }
+    )
+
+    mmlu_weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in mmlu_all_sets}
+    mmlu_subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in mmlu_all_sets]
+
+    summary_groups.append(
+        {
+            'name': 'mmlu',
+            'subsets': mmlu_subsets,
+            'metric': metric_name,
+            'weights': mmlu_weights,
+        }
+    )
+
+    summary_groups.append(
+        {
+            'name': 'hellaswag',
+            'subsets': [['hellaswag', metric_name]],
+            'metric': metric_name,
+            'weights': {'hellaswag': hellaswag_weights[metric_name]}
+        }
+    )
+
+    summary_groups.append(
+        {
+            'name': 'ARC-c-test',
+            'subsets': [['ARC-c-test', metric_name]],
+            'metric': metric_name,
+            'weights': {'ARC-c-test': ARC_weights[metric_name]}
+        }
+    )
 
-ceval_summarizer = dict(
+summarizer = dict(
     type=CircularSummarizer,
     metric_types=['accuracy - clean', 'accuracy - input contaminated', 'accuracy - input-and-label contaminated'],
-    dataset_abbrs = [
-        'ceval-computer_network',
-        'ceval-operating_system',
-        'ceval-computer_architecture',
-        'ceval-college_programming',
-        'ceval-college_physics',
-        'ceval-college_chemistry',
-        'ceval-advanced_mathematics',
-        'ceval-probability_and_statistics',
-        'ceval-discrete_mathematics',
-        'ceval-electrical_engineer',
-        'ceval-metrology_engineer',
-        'ceval-high_school_mathematics',
-        'ceval-high_school_physics',
-        'ceval-high_school_chemistry',
-        'ceval-high_school_biology',
-        'ceval-middle_school_mathematics',
-        'ceval-middle_school_biology',
-        'ceval-middle_school_physics',
-        'ceval-middle_school_chemistry',
-        'ceval-veterinary_medicine',
-        'ceval-college_economics',
-        'ceval-business_administration',
-        'ceval-marxism',
-        'ceval-mao_zedong_thought',
-        'ceval-education_science',
-        'ceval-teacher_qualification',
-        'ceval-high_school_politics',
-        'ceval-high_school_geography',
-        'ceval-middle_school_politics',
-        'ceval-middle_school_geography',
-        'ceval-modern_chinese_history',
-        'ceval-ideological_and_moral_cultivation',
-        'ceval-logic',
-        'ceval-law',
-        'ceval-chinese_language_and_literature',
-        'ceval-art_studies',
-        'ceval-professional_tour_guide',
-        'ceval-legal_professional',
-        'ceval-high_school_chinese',
-        'ceval-high_school_history',
-        'ceval-middle_school_history',
-        'ceval-civil_servant',
-        'ceval-sports_science',
-        'ceval-plant_protection',
-        'ceval-basic_medicine',
-        'ceval-clinical_medicine',
-        'ceval-urban_and_rural_planner',
-        'ceval-accountant',
-        'ceval-fire_engineer',
-        'ceval-environmental_impact_assessment_engineer',
-        'ceval-tax_accountant',
-        'ceval-physician',
-        'ceval-humanities',
-        'ceval-stem',
-        'ceval-social-science',
-        'ceval-other',
-        'ceval-hard',
-        'ceval',
-    ],
-    summary_groups=ceval_summary_groups,
+    dataset_abbrs = ['ceval', 'mmlu', 'hellaswag', 'ARC-c-test'],
+    summary_groups=summary_groups,
 )
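Note on the new weight tables: each number is the count of test samples that fall into the corresponding contamination bucket (clean / input contaminated / input-and-label contaminated / not labeled) for that subset, so the grouped `mmlu`, `ceval`, `hellaswag`, and `ARC-c-test` rows are intended as sample-count-weighted means rather than plain averages over subsets. A minimal sketch of that aggregation (not OpenCompass internals; the subset accuracies below are made up):

```python
# Sketch of a sample-count-weighted group accuracy, mirroring how the
# 'weights' field in the summary groups above is meant to be applied.
def weighted_group_accuracy(scores: dict, weights: dict) -> float:
    """scores: {dataset_abbr: accuracy}; weights: {dataset_abbr: sample count}."""
    total = sum(weights[abbr] for abbr in scores)
    if total == 0:
        return 0.0
    return sum(scores[abbr] * weights[abbr] for abbr in scores) / total

# Two MMLU subsets under the 'accuracy - clean' bucket: weights are the clean
# sample counts from the config, accuracies are hypothetical.
scores = {'lukaemon_mmlu_anatomy': 55.0, 'lukaemon_mmlu_virology': 48.0}
weights = {'lukaemon_mmlu_anatomy': 52, 'lukaemon_mmlu_virology': 104}
print(round(weighted_group_accuracy(scores, weights), 2))  # 50.33, pulled toward virology
```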