From b57e6905bfafd9e63b0978db693f14ca20f74ffc Mon Sep 17 00:00:00 2001
From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com>
Date: Sun, 4 Aug 2024 08:22:27 -0700
Subject: [PATCH] Few more bug fixes for the RM script (#16)

* Fix incorrect params

* Fix README

* Pass value of conv when loading model

* Fix output parsing

* Attempt to patch transformers version
---
 Dockerfile                 |  1 +
 README.md                  | 25 ++++++++++++-------------
 scripts/run_rewardbench.py | 34 ++++++++++++++--------------------
 3 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a6ffe8d..1688efc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,6 +9,7 @@ WORKDIR /stage
 RUN apt-get update && apt-get install -y --no-install-recommends git
 COPY requirements.txt /stage
 RUN pip install -r requirements.txt
+RUN pip install transformers==4.43.0

 # Copy all files
 COPY . /stage

diff --git a/README.md b/README.md
index fa23f7c..f5f0f4e 100644
--- a/README.md
+++ b/README.md
@@ -27,19 +27,19 @@ export HF_TOKEN=
 ```

 You can find all runnable experiments in the `scripts` directory.
-Their filename should explicitly tell you their purpose. 
+Their filename should explicitly tell you their purpose.

 ### Running translation

- We currently use [`facebook/nllb-200-3.3B`](https://huggingface.co/facebook/nllb-200-3.3B) for translation. First install sentence splitter using:
+We currently use [`facebook/nllb-200-3.3B`](https://huggingface.co/facebook/nllb-200-3.3B) for translation. First install sentence splitter using:

- ```
- pip install git+https://github.com/mediacloud/sentence-splitter.git
- ```
+```
+pip install git+https://github.com/mediacloud/sentence-splitter.git
+```

- To translate reward bench into [22 Aya languages](https://arxiv.org/abs/2405.15032) run the following:
+To translate reward bench into [22 Aya languages](https://arxiv.org/abs/2405.15032) run the following:

-``` 
+```
 cd scripts
 bash run_nllb.sh
 ```
@@ -56,14 +56,14 @@ For example, if we want to get the reward score of the UltraRM-13b reward model
 python -m scripts.run_rewardbench \
     --model openbmb/UltraRM-13b \
     --chat_template openbmb \
-    --dataset $DATASET \
+    --dataset_name $DATASET \
     --lang_code $LANG_CODE \
     --split "filtered" \
     --output_dir $OUTDIR \
     --batch_size 8 \
     --trust_remote_code \
     --force_truncation \
-    --save_all 
+    --save_all
 ```

 The evaluation parameters can be found in the [allenai/reward-bench](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml) repository.
@@ -97,10 +97,10 @@ python -m scripts.run_generative \
     --model gpt-4-turbo-2024-04-09 \
     --split "filtered" \
     --lang_code $LANG_CODE \
-    --output_dir $OUTDIR 
+    --output_dir $OUTDIR
 ```

-You can also run open-source LMs in a generative fashion. 
+You can also run open-source LMs in a generative fashion.
 The inference is then routed through [vLLM](https://github.com/vllm-project/vllm).
 Here's an example using `meta-llama/Meta-Llama-3-70B-Instruct`:

@@ -128,7 +128,6 @@ python -m scripts/run_generative.py \
     --output_dir $OUTDIR
 ```

-
 ## Testing and Development

 This codebase contains minimal tests, mostly we test functions that were added or patched from RewardBench.
@@ -150,4 +149,4 @@ You can automatically format your code by running:

 ```
 make style
-```
\ No newline at end of file
+```

diff --git a/scripts/run_rewardbench.py b/scripts/run_rewardbench.py
index b271fa9..4543983 100644
--- a/scripts/run_rewardbench.py
+++ b/scripts/run_rewardbench.py
@@ -22,6 +22,7 @@
 import logging
 import os
 import sys
+from pathlib import Path

 import numpy as np
 import torch
@@ -43,7 +44,7 @@ def main():
     parser = argparse.ArgumentParser(description="Evaluate a reward model.")
     # fmt: off
-    parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="the dataset to evaluate on")
+    parser.add_argument("--dataset_name", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="the dataset to evaluate on")
     parser.add_argument("--lang_code", type=str, default=None, help="the language code to use")
     parser.add_argument("--split", type=str, default="filtered", help="the split to evaluate on")
     parser.add_argument("--model", type=str, required=True, help="the model to evaluate")
@@ -143,7 +144,7 @@ def main():
     logger.info("*** Load dataset ***")
     tokenizer_path = args.tokenizer if args.tokenizer else args.model
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=args.trust_remote_code)
-    if args.dataset == "allenai/reward-bench":
+    if args.dataset_name == "allenai/reward-bench":
         logger.info("Running core eval dataset.")
         # primary set compiles slightly more information
         dataset, subsets = load_eval_dataset(
@@ -157,6 +158,7 @@ def main():
     else:
         dataset, subsets = load_multilingual_eval_dataset(
             dataset_name=args.dataset_name,
+            conv=conv,
             lang_code=args.lang_code,
             custom_dialogue_formatting=False,
             tokenizer=tokenizer,
@@ -327,7 +329,7 @@ def main():
         logger.info(f"Mean rejected: {np.mean(scores_rejected)}, std: {np.std(scores_rejected)}")
         logger.info(f"Mean margin: {np.mean(np.array(scores_chosen) - np.array(scores_rejected))}")

-    if "reward-bench" in args.dataset:
+    if "reward-bench" in args.dataset_name:
         logger.info("Computing grouped results")
         out_dataset = dataset.add_column("results", results)
         if args.debug:
@@ -351,15 +353,12 @@ def main():
     # compile scores
     ############################
     # save score in json to args.output_dir + args.model + ".json"
-    output_path = args.output_dir + args.model + args.lang_code + ".json"
-    dirname = os.path.dirname(output_path)
-    os.makedirs(dirname, exist_ok=True)
-
-    # remove old data
-    if os.path.exists(output_path):
-        os.remove(output_path)
+    output_dir = Path(args.output_dir)
+    output_path = output_dir / f"{args.model}-{args.lang_code}.json"
+    output_path.parent.mkdir(parents=True, exist_ok=True)

-    with open(output_path, "w") as f:
+    logger.info(f"Saving to {output_path}")
+    with output_path.open("w") as f:
         json.dump(
             {
                 "accuracy": accuracy,
@@ -368,22 +367,17 @@ def main():
                 "ref_model": args.ref_model,
                 "tokenizer": tokenizer_path,
                 "chat_template": args.chat_template,
-                "extra_results": results_grouped if "reward-bench" in args.dataset else None,
+                "extra_results": results_grouped if "reward-bench" in args.dataset_name else None,
             },
             f,
         )

     # if save_all is passed, save a large jsonl with all scores_chosen, scores_rejected
     if args.save_all:
-        output_path = args.output_dir + args.model + "-" + args.lang_code + "-all.jsonl"
-        dirname = os.path.dirname(output_path)
-        os.makedirs(dirname, exist_ok=True)
-
-        # remove old data
-        if os.path.exists(output_path):
-            os.remove(output_path)
+        output_path = output_dir / f"{args.model}-{args.lang_code}-all.jsonl"
+
         logger.info(f"Saving 'all' results to {output_path}")
-        with open(output_path, "w") as f:
+        with output_path.open("w") as f:
             for chosen, rejected in zip(scores_chosen, scores_rejected):
-                f.write(json.dumps({"chosen": scores_chosen, "rejected": scores_rejected}) + "\n")
+                f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")
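
Two details in the final hunks are easy to miss, so here is a minimal standalone sketch of what the patched save path does. The model ID, language code, and score values below are made-up placeholders, not values from the patch. Note that a model ID such as `openbmb/UltraRM-13b` contains a slash, so the f-string path gains an extra directory level; that is why `output_path.parent` is created with `parents=True` rather than only the output directory itself. The sketch also shows the per-record JSONL write: each iteration dumps the loop variables, not the full score lists.

```
from pathlib import Path
import json

# Hypothetical stand-ins for the script's CLI arguments.
output_dir = Path("output")
model = "openbmb/UltraRM-13b"  # the embedded slash adds a directory level
lang_code = "arb_Arab"

# Mirror the patched path construction: the parent of the final file
# (output/openbmb here) must exist, not just output_dir.
output_path = output_dir / f"{model}-{lang_code}-all.jsonl"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Made-up per-example scores for illustration.
scores_chosen = [1.25, -0.40, 2.10]
scores_rejected = [0.75, -1.10, 0.30]

# One JSON object per line: dump each chosen/rejected pair,
# not the whole lists on every iteration.
with output_path.open("w") as f:
    for chosen, rejected in zip(scores_chosen, scores_rejected):
        f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")
```

Each line of the resulting `.jsonl` then holds a single chosen/rejected pair, which keeps the file readable record-by-record for downstream analysis.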