update contamination.py summarizer
liyucheng09 committed Dec 13, 2023
1 parent 1481ba6 commit 0d7ad84
Showing 4 changed files with 174 additions and 82 deletions.
3 changes: 2 additions & 1 deletion configs/datasets/ARC_c/ARC_c_clean_ppl.py
@@ -40,7 +40,8 @@
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))

-ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator))
+ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                      analyze_contamination=True)

ARC_c_datasets = [
dict(
3 changes: 2 additions & 1 deletion configs/datasets/hellaswag/hellaswag_clean_ppl.py
@@ -21,7 +21,8 @@
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))

-hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator))
+hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                          analyze_contamination=True)

hellaswag_datasets = [
dict(
3 changes: 2 additions & 1 deletion configs/datasets/mmlu/mmlu_clean_ppl.py
@@ -97,7 +97,8 @@
inferencer=dict(type=PPLInferencer),
)

-mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator), )
+mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
+                     analyze_contamination=True)

mmlu_datasets.append(
dict(
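All three dataset configs above receive the same two-line change: the evaluator entry keeps AccContaminationEvaluator and gains an analyze_contamination=True flag, which presumably tells the evaluation stage to report accuracy separately for each contamination split rather than as a single number. A minimal sketch of the shared pattern (dataset-specific reader and inferencer settings elided; the import path is assumed, not taken from this diff):

from opencompass.openicl.icl_evaluator import AccContaminationEvaluator  # import path assumed

# Shared shape of the updated eval configs for ARC-c, HellaSwag and MMLU:
# accuracy is reported per contamination split ('clean', 'input contaminated',
# 'input-and-label contaminated', 'not labeled'), which is exactly what the
# contamination summarizer below aggregates.
example_eval_cfg = dict(
    evaluator=dict(type=AccContaminationEvaluator),
    analyze_contamination=True,
)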
247 changes: 168 additions & 79 deletions configs/summarizers/contamination.py
@@ -59,6 +59,128 @@
'physician': {'accuracy - clean': 24, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 24, 'accuracy - not labeled': 0},
}

mmlu_category_weights = {
"business_ethics": {"accuracy - clean": 44, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 1},
"security_studies": {"accuracy - clean": 188, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 47, "accuracy - not labeled": 0},
"high_school_us_history": {"accuracy - clean": 42, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 161},
"moral_disputes": {"accuracy - clean": 105, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 168, "accuracy - not labeled": 59},
"philosophy": {"accuracy - clean": 81, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 187, "accuracy - not labeled": 31},
"public_relations": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
"high_school_microeconomics": {"accuracy - clean": 82, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 146, "accuracy - not labeled": 0},
"human_sexuality": {"accuracy - clean": 108, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 4},
"professional_accounting": {"accuracy - clean": 88, "accuracy - input contaminated": 40, "accuracy - input-and-label contaminated": 152, "accuracy - not labeled": 1},
"high_school_government_and_politics": {"accuracy - clean": 104, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 82, "accuracy - not labeled": 0},
"sociology": {"accuracy - clean": 105, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 91, "accuracy - not labeled": 0},
"conceptual_physics": {"accuracy - clean": 79, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 147, "accuracy - not labeled": 0},
"human_aging": {"accuracy - clean": 208, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 13, "accuracy - not labeled": 0},
"high_school_psychology": {"accuracy - clean": 108, "accuracy - input contaminated": 26, "accuracy - input-and-label contaminated": 162, "accuracy - not labeled": 248},
"jurisprudence": {"accuracy - clean": 59, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 0},
"moral_scenarios": {"accuracy - clean": 320, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 574},
"college_medicine": {"accuracy - clean": 107, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 44, "accuracy - not labeled": 5},
"high_school_world_history": {"accuracy - clean": 61, "accuracy - input contaminated": 2, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 173},
"virology": {"accuracy - clean": 104, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 58, "accuracy - not labeled": 0},
"high_school_statistics": {"accuracy - clean": 96, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
"nutrition": {"accuracy - clean": 172, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 98, "accuracy - not labeled": 24},
"abstract_algebra": {"accuracy - clean": 84, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 7, "accuracy - not labeled": 0},
"high_school_geography": {"accuracy - clean": 91, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 105, "accuracy - not labeled": 0},
"econometrics": {"accuracy - clean": 62, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 0},
"marketing": {"accuracy - clean": 115, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 101, "accuracy - not labeled": 2},
"high_school_chemistry": {"accuracy - clean": 108, "accuracy - input contaminated": 25, "accuracy - input-and-label contaminated": 69, "accuracy - not labeled": 0},
"prehistory": {"accuracy - clean": 154, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 107, "accuracy - not labeled": 57},
"college_physics": {"accuracy - clean": 25, "accuracy - input contaminated": 20, "accuracy - input-and-label contaminated": 57, "accuracy - not labeled": 0},
"management": {"accuracy - clean": 35, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 62, "accuracy - not labeled": 0},
"college_biology": {"accuracy - clean": 91, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 0},
"high_school_biology": {"accuracy - clean": 128, "accuracy - input contaminated": 17, "accuracy - input-and-label contaminated": 135, "accuracy - not labeled": 29},
"high_school_physics": {"accuracy - clean": 42, "accuracy - input contaminated": 28, "accuracy - input-and-label contaminated": 80, "accuracy - not labeled": 0},
"logical_fallacies": {"accuracy - clean": 133, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 24, "accuracy - not labeled": 0},
"medical_genetics": {"accuracy - clean": 49, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 1},
"machine_learning": {"accuracy - clean": 71, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
"professional_law": {"accuracy - clean": 401, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 1119},
"professional_psychology": {"accuracy - clean": 265, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 27, "accuracy - not labeled": 310},
"global_facts": {"accuracy - clean": 89, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 0},
"us_foreign_policy": {"accuracy - clean": 71, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 25, "accuracy - not labeled": 0},
"international_law": {"accuracy - clean": 73, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 46, "accuracy - not labeled": 0},
"clinical_knowledge": {"accuracy - clean": 172, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 86, "accuracy - not labeled": 0},
"high_school_mathematics": {"accuracy - clean": 178, "accuracy - input contaminated": 59, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
"high_school_computer_science": {"accuracy - clean": 62, "accuracy - input contaminated": 7, "accuracy - input-and-label contaminated": 28, "accuracy - not labeled": 2},
"college_computer_science": {"accuracy - clean": 68, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 1},
"electrical_engineering": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 61, "accuracy - not labeled": 0},
"college_mathematics": {"accuracy - clean": 61, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
"computer_security": {"accuracy - clean": 55, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
"high_school_macroeconomics": {"accuracy - clean": 102, "accuracy - input contaminated": 14, "accuracy - input-and-label contaminated": 173, "accuracy - not labeled": 100},
"astronomy": {"accuracy - clean": 112, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 35, "accuracy - not labeled": 0},
"college_chemistry": {"accuracy - clean": 46, "accuracy - input contaminated": 19, "accuracy - input-and-label contaminated": 34, "accuracy - not labeled": 0},
"high_school_european_history": {"accuracy - clean": 41, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 123},
"miscellaneous": {"accuracy - clean": 256, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 477},
"formal_logic": {"accuracy - clean": 92, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 21, "accuracy - not labeled": 0},
"elementary_mathematics": {"accuracy - clean": 155, "accuracy - input contaminated": 31, "accuracy - input-and-label contaminated": 103, "accuracy - not labeled": 88},
"world_religions": {"accuracy - clean": 130, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
"professional_medicine": {"accuracy - clean": 191, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 1, "accuracy - not labeled": 36},
"anatomy": {"accuracy - clean": 52, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
}

mmlu_all_sets = [
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"electrical_engineering",
"astronomy",
"anatomy",
"abstract_algebra",
"machine_learning",
"clinical_knowledge",
"global_facts",
"management",
"nutrition",
"marketing",
"professional_accounting",
"high_school_geography",
"international_law",
"moral_scenarios",
"computer_security",
"high_school_microeconomics",
"professional_law",
"medical_genetics",
"professional_psychology",
"jurisprudence",
"world_religions",
"philosophy",
"virology",
"high_school_chemistry",
"public_relations",
"high_school_macroeconomics",
"human_sexuality",
"elementary_mathematics",
"high_school_physics",
"high_school_computer_science",
"high_school_european_history",
"business_ethics",
"moral_disputes",
"high_school_statistics",
"miscellaneous",
"formal_logic",
"high_school_government_and_politics",
"prehistory",
"security_studies",
"high_school_biology",
"logical_fallacies",
"high_school_world_history",
"professional_medicine",
"high_school_mathematics",
"college_medicine",
"high_school_us_history",
"sociology",
"econometrics",
"high_school_psychology",
"human_aging",
"us_foreign_policy",
"conceptual_physics",
]

ARC_weights = {'accuracy - clean': 836, 'accuracy - input contaminated': 53, 'accuracy - input-and-label contaminated': 283, 'accuracy - not labeled': 0}
hellaswag_weights = {'accuracy - clean': 5169, 'accuracy - input contaminated': 37, 'accuracy - input-and-label contaminated': 673, 'accuracy - not labeled': 4163}

ceval_stem = ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics', 'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics', 'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry', 'high_school_biology', 'middle_school_mathematics', 'middle_school_biology', 'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine']
ceval_social_science = ['college_economics', 'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification', 'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography']
@@ -69,89 +191,56 @@

name_and_subsets = [
('ceval', ceval_all),
-    ('ceval-stem', ceval_stem),
-    ('ceval-social-science', ceval_social_science),
-    ('ceval-humanities', ceval_humanities),
-    ('ceval-other', ceval_other),
-    ('ceval-hard', ceval_hard)
+    ('mmlu', mmlu_all_sets),
]

-ceval_summary_groups = []
+summary_groups = []
for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accuracy - input-and-label contaminated']:
-    for dataset_abbr, subsets in name_and_subsets:
-        weights = {f'ceval-{i}': ceval_category_weights[i][metric_name] for i in subsets}
-        subsets = [[f'ceval-{i}', metric_name] for i in subsets]
-        ceval_summary_groups.append(
-            {
-                'name': dataset_abbr,
-                'subsets': subsets,
-                'metric': metric_name,
-                'weights': weights,
-            }
-        )
+    ceval_weights = {f'ceval-{i}': ceval_category_weights[i][metric_name] for i in ceval_all}
+    ceval_subsets = [[f'ceval-{i}', metric_name] for i in ceval_all]
+
+    summary_groups.append(
+        {
+            'name': 'ceval',
+            'subsets': ceval_subsets,
+            'metric': metric_name,
+            'weights': ceval_weights,
+        }
+    )
+
+    mmlu_weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in mmlu_all_sets}
+    mmlu_subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in mmlu_all_sets]
+
+    summary_groups.append(
+        {
+            'name': 'mmlu',
+            'subsets': mmlu_subsets,
+            'metric': metric_name,
+            'weights': mmlu_weights,
+        }
+    )
+
+    summary_groups.append(
+        {
+            'name': 'hellaswag',
+            'subsets': [['hellaswag', metric_name]],
+            'metric': metric_name,
+            'weights': {'hellaswag': hellaswag_weights[metric_name]}
+        }
+    )
+
+    summary_groups.append(
+        {
+            'name': 'ARC-c-test',
+            'subsets': [['ARC-c-test', metric_name]],
+            'metric': metric_name,
+            'weights': {'ARC-c-test': ARC_weights[metric_name]}
+        }
+    )

-ceval_summarizer = dict(
+summarizer = dict(
type=CircularSummarizer,
metric_types=['accuracy - clean', 'accuracy - input contaminated', 'accuracy - input-and-label contaminated'],
-    dataset_abbrs = [
-        'ceval-computer_network',
-        'ceval-operating_system',
-        'ceval-computer_architecture',
-        'ceval-college_programming',
-        'ceval-college_physics',
-        'ceval-college_chemistry',
-        'ceval-advanced_mathematics',
-        'ceval-probability_and_statistics',
-        'ceval-discrete_mathematics',
-        'ceval-electrical_engineer',
-        'ceval-metrology_engineer',
-        'ceval-high_school_mathematics',
-        'ceval-high_school_physics',
-        'ceval-high_school_chemistry',
-        'ceval-high_school_biology',
-        'ceval-middle_school_mathematics',
-        'ceval-middle_school_biology',
-        'ceval-middle_school_physics',
-        'ceval-middle_school_chemistry',
-        'ceval-veterinary_medicine',
-        'ceval-college_economics',
-        'ceval-business_administration',
-        'ceval-marxism',
-        'ceval-mao_zedong_thought',
-        'ceval-education_science',
-        'ceval-teacher_qualification',
-        'ceval-high_school_politics',
-        'ceval-high_school_geography',
-        'ceval-middle_school_politics',
-        'ceval-middle_school_geography',
-        'ceval-modern_chinese_history',
-        'ceval-ideological_and_moral_cultivation',
-        'ceval-logic',
-        'ceval-law',
-        'ceval-chinese_language_and_literature',
-        'ceval-art_studies',
-        'ceval-professional_tour_guide',
-        'ceval-legal_professional',
-        'ceval-high_school_chinese',
-        'ceval-high_school_history',
-        'ceval-middle_school_history',
-        'ceval-civil_servant',
-        'ceval-sports_science',
-        'ceval-plant_protection',
-        'ceval-basic_medicine',
-        'ceval-clinical_medicine',
-        'ceval-urban_and_rural_planner',
-        'ceval-accountant',
-        'ceval-fire_engineer',
-        'ceval-environmental_impact_assessment_engineer',
-        'ceval-tax_accountant',
-        'ceval-physician',
-        'ceval-humanities',
-        'ceval-stem',
-        'ceval-social-science',
-        'ceval-other',
-        'ceval-hard',
-        'ceval',
-    ],
-    summary_groups=ceval_summary_groups,
+    dataset_abbrs = ['ceval', 'mmlu', 'hellaswag', 'ARC-c-test'],
+    summary_groups=summary_groups,
)
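The various *_weights dicts in this file are per-split question counts (how many items of each subject are annotated as clean, input contaminated, input-and-label contaminated, or not labeled), and each summary group pairs them with the matching per-subset accuracies so the summarizer can form a weighted average. A rough, self-contained sketch of that aggregation, for illustration only (weighted_group_score and the example accuracies are hypothetical; the real reduction is performed by OpenCompass's CircularSummarizer):

def weighted_group_score(per_subset_scores, weights):
    """Sample-count-weighted mean of per-subset accuracies for one metric.

    per_subset_scores: {dataset_abbr: accuracy on one contamination split}
    weights:           {dataset_abbr: number of questions in that split}
    """
    total = sum(weights[abbr] for abbr in per_subset_scores)
    if total == 0:
        return float('nan')  # nothing annotated for this split
    return sum(acc * weights[abbr] for abbr, acc in per_subset_scores.items()) / total

# e.g. two MMLU subjects on the 'accuracy - clean' split; the accuracies are made
# up, the weights are the clean-question counts from mmlu_category_weights above.
clean_scores = {'lukaemon_mmlu_anatomy': 55.0, 'lukaemon_mmlu_astronomy': 62.5}
clean_counts = {'lukaemon_mmlu_anatomy': 52, 'lukaemon_mmlu_astronomy': 112}
print(weighted_group_score(clean_scores, clean_counts))  # -> 60.12..., the weighted mean

In a run config, this summarizer would typically be pulled in via the usual read_base() import of configs/summarizers/contamination.py and passed as the summarizer field.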
