Create annotation check #21

Merged · 3 commits · Aug 11, 2024

6 changes: 5 additions & 1 deletion requirements.txt
@@ -9,4 +9,8 @@ together==1.2.1
cohere==5.6.1
git+https://github.com/lm-sys/FastChat.git@92a6d1fcd69a88ea169c0b01065ce44f1e690a2c
python-dotenv
sentence-splitter==1.4
pandas
huggingface_hub
tabulate
numpy
103 changes: 103 additions & 0 deletions scripts/get_results.py
@@ -0,0 +1,103 @@
import argparse
import json
import logging
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from huggingface_hub import snapshot_download
from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING

logging.basicConfig(level=logging.INFO)


def get_args():
    # fmt: off
    parser = argparse.ArgumentParser(description="Get evaluation results")
    parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.")
    parser.add_argument("--langs", nargs="*", required=False, type=str, help="If set, will only show the results for the particular language codes provided.")
    parser.add_argument("--show_subsets", action="store_true", help="If set, will show subset results instead of per-category results.")
    # fmt: on
    return parser.parse_args()


def main():
    args = get_args()
    dataset_dir = Path(snapshot_download(args.dataset, repo_type="dataset"))
    lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()]

    if args.langs:
        logging.info(f"Only showing detailed results for the following languages: {','.join(args.langs)}")
        for lang_dir in lang_folders:
            if lang_dir.name in args.langs:
                model_scores = get_scores(lang_dir)
                df = pd.DataFrame(model_scores)
                metadata_df = df[["model", "model_type", "score"]]
                key = "subset_scores" if args.show_subsets else "category_scores"
                scores_df = pd.DataFrame(df[key].tolist())
                lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False)
                print(f"\n*** Results for {lang_dir.name} ***\n")
                print(lang_scores_df.to_markdown(tablefmt="github", index=False))

    else:
        logging.info("Showing m-rewardbench scores for all languages")
        lang_scores = {}
        for lang_dir in lang_folders:
            model_scores = get_scores(lang_dir)
            lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores}

        lang_scores_df = pd.DataFrame(lang_scores)
        print(lang_scores_df.to_markdown(tablefmt="github"))


def get_scores(lang_dir: Path) -> List[Dict[str, Any]]:
    """Get the scores for a single language; returns the category scores and per-subset scores for each model."""
    files = [file for file in lang_dir.iterdir() if file.suffix == ".json"]
    logging.debug(f"Found {len(files)} model results for {lang_dir.name}")

    def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
        """Compute each category's score as the average of its subsets, weighted by subset size."""
        category_scores = {}
        for category, subsets in SUBSET_MAPPING.items():
            subset_results = [results[subset] for subset in subsets]
            subset_lengths = [EXAMPLE_COUNTS[subset] for subset in subsets]
            wt_avg = sum(v * w for v, w in zip(subset_results, subset_lengths)) / sum(subset_lengths)
            category_scores[category] = wt_avg
        return category_scores

    model_scores = []
    for file in files:
        with open(file, "r") as f:
            result = json.load(f)
        # The Generative and Classifier RMs have different JSON schemas,
        # so we need to handle them separately.
        if "subset" in result:
            # Most likely generative
            model_scores.append(
                {
                    "model": result["subset"].pop("model"),
                    "model_type": result["subset"].pop("model_type"),
                    "chat_template": result["subset"].pop("chat_template"),
                    # The overall rewardbench score is the unweighted mean of the four weighted category scores
                    "score": sum(result["leaderboard"].values()) / len(result["leaderboard"]),
                    "category_scores": result["leaderboard"],
                    "subset_scores": result["subset"],
                }
            )
        else:
            category_scores = _compute_category_scores(result["extra_results"])
            model_scores.append(
                {
                    "model": result["model"],
                    "model_type": "Sequence Classifier",
                    "chat_template": result["chat_template"],
                    "score": sum(category_scores.values()) / len(category_scores),
                    "category_scores": category_scores,
                    "subset_scores": result["extra_results"],
                }
            )
    return model_scores


if __name__ == "__main__":
    main()
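
To make the scoring in `get_scores` concrete, here is a minimal sketch of how a Sequence Classifier result turns into a single leaderboard score: each category is a size-weighted average of its subsets, and the overall score is the plain mean of the categories. The subset names, counts, and accuracies below are made-up placeholders, not the real values that the script imports from `rewardbench.constants`; the script itself would be run as something like `python scripts/get_results.py --langs <lang_code> --show_subsets`.

```python
# Illustrative only: these mappings are placeholders; the script imports the
# real SUBSET_MAPPING and EXAMPLE_COUNTS from rewardbench.constants.
SUBSET_MAPPING = {"Chat": ["chat-easy", "chat-hard"], "Safety": ["refusals"]}
EXAMPLE_COUNTS = {"chat-easy": 100, "chat-hard": 95, "refusals": 40}
results = {"chat-easy": 0.98, "chat-hard": 0.90, "refusals": 0.85}

category_scores = {}
for category, subsets in SUBSET_MAPPING.items():
    values = [results[s] for s in subsets]
    weights = [EXAMPLE_COUNTS[s] for s in subsets]
    # Each category is the size-weighted average of its subsets.
    category_scores[category] = sum(v * w for v, w in zip(values, weights)) / sum(weights)

# The overall score is the unweighted mean of the category scores.
overall = sum(category_scores.values()) / len(category_scores)
print(category_scores)  # {'Chat': 0.941..., 'Safety': 0.85}
print(overall)          # 0.895...
```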
108 changes: 108 additions & 0 deletions scripts/make_pref_annotation_task.py
@@ -0,0 +1,108 @@
import argparse
import json
import logging
import sys
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import snapshot_download

logging.basicConfig(level=logging.INFO)


def get_args():
    # fmt: off
    parser = argparse.ArgumentParser(description="Create annotation CSV for a given language.")
    parser.add_argument("--output_dir", type=Path, required=True, help="Directory to save the annotation CSV files.")
    parser.add_argument("--langs", nargs="*", required=True, type=str, help="Languages to create annotation files for.")
    parser.add_argument("--pred_dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset containing the results.")
    parser.add_argument("--gold_dataset", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="HuggingFace dataset containing the gold labels.")
    parser.add_argument("--use_model", type=str, default=None, help="If set, will use a model's outputs as the basis for sampling and will sample an equal number of wins/losses/ties. Only works for Generative RMs for now.")
    parser.add_argument("--sample_size", type=int, default=None, help="Total number of instances to sample.")
    # fmt: on
    return parser.parse_args()


def main():
    args = get_args()
    pred_dir = Path(snapshot_download(args.pred_dataset, repo_type="dataset"))
    lang_folders = [d for d in pred_dir.iterdir() if d.is_dir()]

    for lang_dir in lang_folders:
        if lang_dir.name in args.langs:
            lang = lang_dir.name
            gold_dataset = load_dataset(args.gold_dataset, lang, split="filtered")
            annotation_df = gold_dataset.to_pandas()
            if args.use_model:
                logging.info(f"Will sample based on {args.use_model} results")
                scores = get_per_instance_scores(model_name=args.use_model, lang_dir=lang_dir)
                annotation_df["scores"] = scores

                if args.sample_size:
                    logging.info(f"Sampling {args.sample_size} examples")
                    annotation_df = stratified_sampling(annotation_df, n=args.sample_size, column="scores")

            logging.info(f"Number of annotation tasks: {len(annotation_df)}")
            logging.info("Randomly swapping the completions")
            swap_mask = np.random.rand(len(annotation_df)) < 0.5
            annotation_df["swapped"] = swap_mask.astype(int)
            annotation_df = annotation_df.rename(columns={"chosen": "completion_a", "rejected": "completion_b"})

            # Save the answer key before swapping and removing some other columns
            answer_key_df = annotation_df.copy()
            # Start swapping
            annotation_df.loc[swap_mask, ["completion_a", "completion_b"]] = annotation_df.loc[
                swap_mask, ["completion_b", "completion_a"]
            ].values
            # "scores" only exists when --use_model is set, so ignore missing columns
            annotation_df = annotation_df.drop(
                columns=["chosen_model", "rejected_model", "subset", "scores", "swapped"],
                errors="ignore",
            )

            output_dir = Path(args.output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            answer_key_output = output_dir / f"{lang}-answer_key.csv"
            answer_key_df.to_csv(answer_key_output, index=False)
            annotation_file_output = output_dir / f"{lang}-annotation.csv"
            annotation_df.to_csv(annotation_file_output, index=False)
            logging.info(f"Saved answer key and annotation file to {output_dir}")


def get_per_instance_scores(model_name: str, lang_dir: Path) -> List[float]:
    model_file = [
        file for file in lang_dir.iterdir() if file.suffix == ".json" and model_name.replace("/", "___") in str(file)
    ]
    if len(model_file) == 0:
        logging.error(f"Can't find model '{model_name}' in {lang_dir.name} results")
        sys.exit(1)

    with open(model_file[0], "r") as f:
        results = json.load(f)

    scores = results["scores"]["results"]
    return scores


def stratified_sampling(df: "pd.DataFrame", n: int, column: str = "scores") -> "pd.DataFrame":
    counts = df[column].value_counts()
    min_count = counts.min()
    num_categories = len(counts)
    samples_per_category = min(n // num_categories, min_count)

    # Sample the rows
    samples = []
    for score in counts.index:
        score_df = df[df[column] == score]
        sampled_df = score_df.sample(n=samples_per_category, random_state=42)
        samples.append(sampled_df)

    # Concatenate the samples
    sampled_df = pd.concat(samples).reset_index(drop=True)
    return sampled_df


if __name__ == "__main__":
    main()
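
A minimal sketch of the sampling-and-swap step above, using a tiny hand-made DataFrame rather than the HuggingFace datasets the script actually downloads. The column names mirror the script; the prompts, completions, and scores are made up for illustration, and the script itself would be invoked as something like `python scripts/make_pref_annotation_task.py --output_dir out/ --langs <lang_code> --use_model <model> --sample_size 100`.

```python
import numpy as np
import pandas as pd

# Toy stand-in for the gold dataset with per-instance model scores attached
# (1 = model preferred "chosen", 0 = model preferred "rejected"); purely illustrative.
df = pd.DataFrame(
    {
        "prompt": [f"q{i}" for i in range(6)],
        "chosen": [f"good{i}" for i in range(6)],
        "rejected": [f"bad{i}" for i in range(6)],
        "scores": [1, 1, 1, 0, 0, 0],
    }
)

# Stratified sampling: an equal number of rows per score value, as in stratified_sampling().
n, counts = 4, df["scores"].value_counts()
per_category = min(n // len(counts), counts.min())
sampled = pd.concat(
    [df[df["scores"] == s].sample(n=per_category, random_state=42) for s in counts.index]
).reset_index(drop=True)

# Randomly swap roughly half of the pairs so annotators cannot learn a position bias,
# keeping the un-swapped copy plus the swap mask as the answer key.
sampled = sampled.rename(columns={"chosen": "completion_a", "rejected": "completion_b"})
swap_mask = np.random.rand(len(sampled)) < 0.5
sampled["swapped"] = swap_mask.astype(int)
answer_key = sampled.copy()
sampled.loc[swap_mask, ["completion_a", "completion_b"]] = sampled.loc[
    swap_mask, ["completion_b", "completion_a"]
].values
print(sampled.drop(columns=["swapped"]))
```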