Create annotation check #21

Merged · 3 commits · Aug 11, 2024

6 changes: 5 additions & 1 deletion requirements.txt
@@ -9,4 +9,8 @@ together==1.2.1
cohere==5.6.1
git+https://github.com/lm-sys/FastChat.git@92a6d1fcd69a88ea169c0b01065ce44f1e690a2c
python-dotenv
sentence-splitter==1.4
pandas
huggingface_hub
tabulate
numpy
103 changes: 103 additions & 0 deletions scripts/get_results.py
@@ -0,0 +1,103 @@
import argparse
import json
import logging
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from huggingface_hub import snapshot_download
from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING

logging.basicConfig(level=logging.INFO)


def get_args():
    # fmt: off
    parser = argparse.ArgumentParser(description="Get evaluation results")
    parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.")
    parser.add_argument("--langs", nargs="*", required=False, type=str, help="If set, will only show the results for the particular language codes provided.")
    parser.add_argument("--show_subsets", action="store_true", help="If set, will show subset results instead of per-category results.")
    # fmt: on
    return parser.parse_args()


def main():
    args = get_args()
    dataset_dir = Path(snapshot_download(args.dataset, repo_type="dataset"))
    lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()]

    if args.langs:
        logging.info(f"Only showing detailed results for the following languages: {','.join(args.langs)}")
        for lang_dir in lang_folders:
            if lang_dir.name in args.langs:
                model_scores = get_scores(lang_dir)
                df = pd.DataFrame(model_scores)
                metadata_df = df[["model", "model_type", "score"]]
                key = "subset_scores" if args.show_subsets else "category_scores"
                scores_df = pd.DataFrame(df[key].tolist())
                lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False)
                print(f"\n*** Results for {lang_dir.name} ***\n")
                print(lang_scores_df.to_markdown(tablefmt="github", index=False))

    else:
        logging.info("Showing m-rewardbench scores for all languages")
        lang_scores = {}
        for lang_dir in lang_folders:
            model_scores = get_scores(lang_dir)
            lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores}

        lang_scores_df = pd.DataFrame(lang_scores)
        print(lang_scores_df.to_markdown(tablefmt="github"))


def get_scores(lang_dir: Path) -> List[Dict[str, Any]]:
    """Get the scores for a single language; returns the category scores and per-subset scores for each model."""
    files = [file for file in lang_dir.iterdir() if file.suffix == ".json"]
    logging.debug(f"Found {len(files)} model results for {lang_dir.name}")

    def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
        """Compute each category's score as the average of its subsets, weighted by subset size."""
        category_scores = {}
        for category, subsets in SUBSET_MAPPING.items():
            subset_results = [results[subset] for subset in subsets]
            subset_lengths = [EXAMPLE_COUNTS[subset] for subset in subsets]
            wt_avg = sum(v * w for v, w in zip(subset_results, subset_lengths)) / sum(subset_lengths)
            category_scores[category] = wt_avg
        return category_scores

    model_scores = []
    for file in files:
        with open(file, "r") as f:
            result = json.load(f)
        # The Generative and Classifier RMs have different JSON schemas,
        # so we need to handle them separately.
        if "subset" in result:
            # Most likely generative
            model_scores.append(
                {
                    "model": result["subset"].pop("model"),
                    "model_type": result["subset"].pop("model_type"),
                    "chat_template": result["subset"].pop("chat_template"),
                    # The overall rewardbench score is the unweighted mean of the four weighted category scores
                    "score": sum(result["leaderboard"].values()) / len(result["leaderboard"]),
                    "category_scores": result["leaderboard"],
                    "subset_scores": result["subset"],
                }
            )
        else:
            category_scores = _compute_category_scores(result["extra_results"])
            model_scores.append(
                {
                    "model": result["model"],
                    "model_type": "Sequence Classifier",
                    "chat_template": result["chat_template"],
                    "score": sum(category_scores.values()) / len(category_scores),
                    "category_scores": category_scores,
                    "subset_scores": result["extra_results"],
                }
            )
    return model_scores


if __name__ == "__main__":
    main()
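
To make the scoring in `get_scores` concrete, here is a minimal sketch of how a Sequence Classifier result turns into a single leaderboard score: each category is a size-weighted average of its subsets, and the overall score is the plain mean of the categories. The subset names, counts, and accuracies below are made-up placeholders, not the real values that the script imports from `rewardbench.constants`; the script itself would be run as something like `python scripts/get_results.py --langs <lang_code> --show_subsets`.

```python
# Illustrative only: these mappings are placeholders; the script imports the
# real SUBSET_MAPPING and EXAMPLE_COUNTS from rewardbench.constants.
SUBSET_MAPPING = {"Chat": ["chat-easy", "chat-hard"], "Safety": ["refusals"]}
EXAMPLE_COUNTS = {"chat-easy": 100, "chat-hard": 95, "refusals": 40}
results = {"chat-easy": 0.98, "chat-hard": 0.90, "refusals": 0.85}

category_scores = {}
for category, subsets in SUBSET_MAPPING.items():
    values = [results[s] for s in subsets]
    weights = [EXAMPLE_COUNTS[s] for s in subsets]
    # Each category is the size-weighted average of its subsets.
    category_scores[category] = sum(v * w for v, w in zip(values, weights)) / sum(weights)

# The overall score is the unweighted mean of the category scores.
overall = sum(category_scores.values()) / len(category_scores)
print(category_scores)  # {'Chat': 0.941..., 'Safety': 0.85}
print(overall)          # 0.895...
```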
108 changes: 108 additions & 0 deletions scripts/make_pref_annotation_task.py
@@ -0,0 +1,108 @@
import argparse
import json
import logging
import sys
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import snapshot_download

logging.basicConfig(level=logging.INFO)


def get_args():
    # fmt: off
    parser = argparse.ArgumentParser(description="Create annotation CSV for a given language.")
    parser.add_argument("--output_dir", type=Path, required=True, help="Directory to save the annotation CSV files.")
    parser.add_argument("--langs", nargs="*", required=True, type=str, help="Languages to create annotation files for.")
    parser.add_argument("--pred_dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset containing the results.")
    parser.add_argument("--gold_dataset", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="HuggingFace dataset containing the gold labels.")
    parser.add_argument("--use_model", type=str, default=None, help="If set, will use a model's outputs as the basis for sampling and will sample an equal number of wins/losses/ties. Only works for Generative RMs for now.")
    parser.add_argument("--sample_size", type=int, default=None, help="Total number of instances to sample.")
    # fmt: on
    return parser.parse_args()


def main():
    args = get_args()
    pred_dir = Path(snapshot_download(args.pred_dataset, repo_type="dataset"))
    lang_folders = [d for d in pred_dir.iterdir() if d.is_dir()]

    for lang_dir in lang_folders:
        if lang_dir.name in args.langs:
            lang = lang_dir.name
            gold_dataset = load_dataset(args.gold_dataset, lang, split="filtered")
            annotation_df = gold_dataset.to_pandas()
            if args.use_model:
                logging.info(f"Will sample based on {args.use_model} results")
                scores = get_per_instance_scores(model_name=args.use_model, lang_dir=lang_dir)
                annotation_df["scores"] = scores

                if args.sample_size:
                    logging.info(f"Sampling {args.sample_size} examples")
                    annotation_df = stratified_sampling(annotation_df, n=args.sample_size, column="scores")

            logging.info(f"Number of annotation tasks: {len(annotation_df)}")
            logging.info("Randomly swapping the completions")
            swap_mask = np.random.rand(len(annotation_df)) < 0.5
            annotation_df["swapped"] = swap_mask.astype(int)
            annotation_df = annotation_df.rename(columns={"chosen": "completion_a", "rejected": "completion_b"})

            # Save the answer key before swapping and removing some other columns
            answer_key_df = annotation_df.copy()
            # Start swapping
            annotation_df.loc[swap_mask, ["completion_a", "completion_b"]] = annotation_df.loc[
                swap_mask, ["completion_b", "completion_a"]
            ].values
            # "scores" only exists when --use_model is set, so ignore missing columns
            annotation_df = annotation_df.drop(
                columns=["chosen_model", "rejected_model", "subset", "scores", "swapped"],
                errors="ignore",
            )

            output_dir = Path(args.output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            answer_key_output = output_dir / f"{lang}-answer_key.csv"
            answer_key_df.to_csv(answer_key_output, index=False)
            annotation_file_output = output_dir / f"{lang}-annotation.csv"
            annotation_df.to_csv(annotation_file_output, index=False)
            logging.info(f"Saved answer key and annotation file to {output_dir}")


def get_per_instance_scores(model_name: str, lang_dir: Path) -> List[float]:
    model_file = [
        file for file in lang_dir.iterdir() if file.suffix == ".json" and model_name.replace("/", "___") in str(file)
    ]
    if len(model_file) == 0:
        logging.error(f"Can't find model '{model_name}' in {lang_dir.name} results")
        sys.exit(1)

    with open(model_file[0], "r") as f:
        results = json.load(f)

    scores = results["scores"]["results"]
    return scores


def stratified_sampling(df: "pd.DataFrame", n: int, column: str = "scores") -> "pd.DataFrame":
    counts = df[column].value_counts()
    min_count = counts.min()
    num_categories = len(counts)
    samples_per_category = min(n // num_categories, min_count)

    # Sample the rows
    samples = []
    for score in counts.index:
        score_df = df[df[column] == score]
        sampled_df = score_df.sample(n=samples_per_category, random_state=42)
        samples.append(sampled_df)

    # Concatenate the samples
    sampled_df = pd.concat(samples).reset_index(drop=True)
    return sampled_df


if __name__ == "__main__":
    main()
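
A minimal sketch of the sampling-and-swap step above, using a tiny hand-made DataFrame rather than the HuggingFace datasets the script actually downloads. The column names mirror the script; the prompts, completions, and scores are made up for illustration, and the script itself would be invoked as something like `python scripts/make_pref_annotation_task.py --output_dir out/ --langs <lang_code> --use_model <model> --sample_size 100`.

```python
import numpy as np
import pandas as pd

# Toy stand-in for the gold dataset with per-instance model scores attached
# (1 = model preferred "chosen", 0 = model preferred "rejected"); purely illustrative.
df = pd.DataFrame(
    {
        "prompt": [f"q{i}" for i in range(6)],
        "chosen": [f"good{i}" for i in range(6)],
        "rejected": [f"bad{i}" for i in range(6)],
        "scores": [1, 1, 1, 0, 0, 0],
    }
)

# Stratified sampling: an equal number of rows per score value, as in stratified_sampling().
n, counts = 4, df["scores"].value_counts()
per_category = min(n // len(counts), counts.min())
sampled = pd.concat(
    [df[df["scores"] == s].sample(n=per_category, random_state=42) for s in counts.index]
).reset_index(drop=True)

# Randomly swap roughly half of the pairs so annotators cannot learn a position bias,
# keeping the un-swapped copy plus the swap mask as the answer key.
sampled = sampled.rename(columns={"chosen": "completion_a", "rejected": "completion_b"})
swap_mask = np.random.rand(len(sampled)) < 0.5
sampled["swapped"] = swap_mask.astype(int)
answer_key = sampled.copy()
sampled.loc[swap_mask, ["completion_a", "completion_b"]] = sampled.loc[
    swap_mask, ["completion_b", "completion_a"]
].values
print(sampled.drop(columns=["swapped"]))
```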