From b57e6905bfafd9e63b0978db693f14ca20f74ffc Mon Sep 17 00:00:00 2001
From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com>
Date: Sun, 4 Aug 2024 08:22:27 -0700
Subject: [PATCH] Few more bug fixes for the RM script (#16)

* Fix incorrect params

* Fix README

* Pass value of conv when loading model

* Fix output parsing

* Attempt to patch transformers version
---
 Dockerfile                 |  1 +
 README.md                  | 25 ++++++++++++-------------
 scripts/run_rewardbench.py | 34 ++++++++++++++--------------------
 3 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a6ffe8d..1688efc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,6 +9,7 @@ WORKDIR /stage
 RUN apt-get update && apt-get install -y --no-install-recommends git
 COPY requirements.txt /stage
 RUN pip install -r requirements.txt
+RUN pip install transformers==4.43.0

 # Copy all files
 COPY . /stage

diff --git a/README.md b/README.md
index fa23f7c..f5f0f4e 100644
--- a/README.md
+++ b/README.md
@@ -27,19 +27,19 @@ export HF_TOKEN=
 ```

 You can find all runnable experiments in the `scripts` directory.
-Their filename should explicitly tell you their purpose. 
+Their filename should explicitly tell you their purpose.

 ### Running translation

- We currently use [`facebook/nllb-200-3.3B`](https://huggingface.co/facebook/nllb-200-3.3B) for translation. First install sentence splitter using:
+We currently use [`facebook/nllb-200-3.3B`](https://huggingface.co/facebook/nllb-200-3.3B) for translation. First install sentence splitter using:

- ```
- pip install git+https://github.com/mediacloud/sentence-splitter.git
- ```
+```
+pip install git+https://github.com/mediacloud/sentence-splitter.git
+```

- To translate reward bench into [22 Aya languages](https://arxiv.org/abs/2405.15032) run the following:
+To translate reward bench into [22 Aya languages](https://arxiv.org/abs/2405.15032) run the following:

-``` 
+```
 cd scripts
 bash run_nllb.sh
 ```
@@ -56,14 +56,14 @@ For example, if we want to get the reward score of the UltraRM-13b reward model
 python -m scripts.run_rewardbench \
     --model openbmb/UltraRM-13b \
     --chat_template openbmb \
-    --dataset $DATASET \
+    --dataset_name $DATASET \
     --lang_code $LANG_CODE \
     --split "filtered" \
     --output_dir $OUTDIR \
     --batch_size 8 \
     --trust_remote_code \
     --force_truncation \
-    --save_all 
+    --save_all
 ```

 The evaluation parameters can be found in the [allenai/reward-bench](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml) repository.
@@ -97,10 +97,10 @@ python -m scripts.run_generative \
     --model gpt-4-turbo-2024-04-09 \
     --split "filtered" \
     --lang_code $LANG_CODE \
-    --output_dir $OUTDIR 
+    --output_dir $OUTDIR
 ```

-You can also run open-source LMs in a generative fashion. 
+You can also run open-source LMs in a generative fashion.
 The inference is then routed through [vLLM](https://github.com/vllm-project/vllm).
 Here's an example using `meta-llama/Meta-Llama-3-70B-Instruct`:

@@ -128,7 +128,6 @@ python -m scripts/run_generative.py \
     --output_dir $OUTDIR
 ```

-
 ## Testing and Development

 This codebase contains minimal tests, mostly we test functions that were added or patched from RewardBench.
@@ -150,4 +149,4 @@ You can automatically format your code by running:

 ```
 make style
-```
\ No newline at end of file
+```

diff --git a/scripts/run_rewardbench.py b/scripts/run_rewardbench.py
index b271fa9..4543983 100644
--- a/scripts/run_rewardbench.py
+++ b/scripts/run_rewardbench.py
@@ -22,6 +22,7 @@
 import logging
 import os
 import sys
+from pathlib import Path

 import numpy as np
 import torch
@@ -43,7 +44,7 @@ def main():
     parser = argparse.ArgumentParser(description="Evaluate a reward model.")
     # fmt: off
-    parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="the dataset to evaluate on")
+    parser.add_argument("--dataset_name", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="the dataset to evaluate on")
     parser.add_argument("--lang_code", type=str, default=None, help="the language code to use")
     parser.add_argument("--split", type=str, default="filtered", help="the split to evaluate on")
     parser.add_argument("--model", type=str, required=True, help="the model to evaluate")
@@ -143,7 +144,7 @@ def main():
     logger.info("*** Load dataset ***")
     tokenizer_path = args.tokenizer if args.tokenizer else args.model
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=args.trust_remote_code)
-    if args.dataset == "allenai/reward-bench":
+    if args.dataset_name == "allenai/reward-bench":
         logger.info("Running core eval dataset.")
         # primary set compiles slightly more information
         dataset, subsets = load_eval_dataset(
@@ -157,6 +158,7 @@ def main():
     else:
         dataset, subsets = load_multilingual_eval_dataset(
             dataset_name=args.dataset_name,
+            conv=conv,
             lang_code=args.lang_code,
             custom_dialogue_formatting=False,
             tokenizer=tokenizer,
@@ -327,7 +329,7 @@ def main():
         logger.info(f"Mean rejected: {np.mean(scores_rejected)}, std: {np.std(scores_rejected)}")
         logger.info(f"Mean margin: {np.mean(np.array(scores_chosen) - np.array(scores_rejected))}")

-    if "reward-bench" in args.dataset:
+    if "reward-bench" in args.dataset_name:
         logger.info("Computing grouped results")
         out_dataset = dataset.add_column("results", results)
         if args.debug:
@@ -351,15 +353,12 @@ def main():
     # compile scores
     ############################
     # save score in json to args.output_dir + args.model + ".json"
-    output_path = args.output_dir + args.model + args.lang_code + ".json"
-    dirname = os.path.dirname(output_path)
-    os.makedirs(dirname, exist_ok=True)
-
-    # remove old data
-    if os.path.exists(output_path):
-        os.remove(output_path)
+    output_dir = Path(args.output_dir)
+    output_path = output_dir / f"{args.model}-{args.lang_code}.json"
+    output_path.parent.mkdir(parents=True, exist_ok=True)

-    with open(output_path, "w") as f:
+    logger.info(f"Saving to {output_path}")
+    with output_path.open("w") as f:
         json.dump(
             {
                 "accuracy": accuracy,
@@ -368,22 +367,17 @@ def main():
                 "ref_model": args.ref_model,
                 "tokenizer": tokenizer_path,
                 "chat_template": args.chat_template,
-                "extra_results": results_grouped if "reward-bench" in args.dataset else None,
+                "extra_results": results_grouped if "reward-bench" in args.dataset_name else None,
             },
             f,
         )

     # if save_all is passed, save a large jsonl with all scores_chosen, scores_rejected
     if args.save_all:
-        output_path = args.output_dir + args.model + "-" + args.lang_code + "-all.jsonl"
-        dirname = os.path.dirname(output_path)
-        os.makedirs(dirname, exist_ok=True)
-
-        # remove old data
-        if os.path.exists(output_path):
-            os.remove(output_path)
+        output_path = output_dir / f"{args.model}-{args.lang_code}-all.jsonl"
+
         logger.info(f"Saving 'all' results to {output_path}")
-        with open(output_path, "w") as f:
+        with output_path.open("w") as f:
             for chosen, rejected in zip(scores_chosen, scores_rejected):
-                f.write(json.dumps({"chosen": scores_chosen, "rejected": scores_rejected}) + "\n")
+                f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")
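
Two details in the final hunks are easy to miss, so here is a minimal standalone sketch of what the patched save path does. The model ID, language code, and score values below are made-up placeholders, not values from the patch. Note that a model ID such as `openbmb/UltraRM-13b` contains a slash, so the f-string path gains an extra directory level; that is why `output_path.parent` is created with `parents=True` rather than only the output directory itself. The sketch also shows the per-record JSONL write: each iteration dumps the loop variables, not the full score lists.

```
from pathlib import Path
import json

# Hypothetical stand-ins for the script's CLI arguments.
output_dir = Path("output")
model = "openbmb/UltraRM-13b"  # the embedded slash adds a directory level
lang_code = "arb_Arab"

# Mirror the patched path construction: the parent of the final file
# (output/openbmb here) must exist, not just output_dir.
output_path = output_dir / f"{model}-{lang_code}-all.jsonl"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Made-up per-example scores for illustration.
scores_chosen = [1.25, -0.40, 2.10]
scores_rejected = [0.75, -1.10, 0.30]

# One JSON object per line: dump each chosen/rejected pair,
# not the whole lists on every iteration.
with output_path.open("w") as f:
    for chosen, rejected in zip(scores_chosen, scores_rejected):
        f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")
```

Each line of the resulting `.jsonl` then holds a single chosen/rejected pair, which keeps the file readable record-by-record for downstream analysis.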