Few more bug fixes for the RM script (#16)
* Fix incorrect params

* Fix README

* Pass value of conv when loading model

* Fix output parsing

* Attempt to patch transformers version
ljvmiranda921 authored Aug 4, 2024
1 parent f329aa1 commit b57e690
Showing 3 changed files with 27 additions and 33 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -9,6 +9,7 @@ WORKDIR /stage
RUN apt-get update && apt-get install -y --no-install-recommends git
COPY requirements.txt /stage
RUN pip install -r requirements.txt
+ RUN pip install transformers==4.43.0

# Copy all files
COPY . /stage
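
The pinned `transformers==4.43.0` only takes effect once the image is rebuilt. A minimal sketch of rebuilding it (the image tag is an assumption, not something defined in the repository):

```
docker build -t multilingual-rm .
```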
25 changes: 12 additions & 13 deletions README.md
@@ -27,19 +27,19 @@ export HF_TOKEN=<your huggingface token>
```

You can find all runnable experiments in the `scripts` directory.
Their filename should explicitly tell you their purpose.

### Running translation

We currently use [`facebook/nllb-200-3.3B`](https://huggingface.co/facebook/nllb-200-3.3B) for translation. First install sentence splitter using:

```
pip install git+https://github.com/mediacloud/sentence-splitter.git
```

To translate reward bench into [22 Aya languages](https://arxiv.org/abs/2405.15032) run the following:

```
cd scripts
bash run_nllb.sh
```
@@ -56,14 +56,14 @@ For example, if we want to get the reward score of the UltraRM-13b reward model
python -m scripts.run_rewardbench \
--model openbmb/UltraRM-13b \
--chat_template openbmb \
- --dataset $DATASET \
+ --dataset_name $DATASET \
--lang_code $LANG_CODE \
--split "filtered" \
--output_dir $OUTDIR \
--batch_size 8 \
--trust_remote_code \
--force_truncation \
--save_all
```
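
The command above assumes `DATASET`, `LANG_CODE`, and `OUTDIR` are already exported. A sketch with placeholder values (the dataset name is the script's default; the language code and output directory are hypothetical):

```
export DATASET=aya-rm-multilingual/multilingual-reward-bench
export LANG_CODE=eng_Latn  # hypothetical; use one of the dataset's language configs
export OUTDIR=results/
```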

The evaluation parameters can be found in the [allenai/reward-bench](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml) repository.
@@ -97,10 +97,10 @@ python -m scripts.run_generative \
--model gpt-4-turbo-2024-04-09 \
--split "filtered" \
--lang_code $LANG_CODE \
--output_dir $OUTDIR
```

You can also run open-source LMs in a generative fashion.
The inference is then routed through [vLLM](https://github.com/vllm-project/vllm).
Here's an example using `meta-llama/Meta-Llama-3-70B-Instruct`:

@@ -128,7 +128,6 @@ python -m scripts/run_generative.py \
--output_dir $OUTDIR
```


## Testing and Development

This codebase contains minimal tests, mostly we test functions that were added or patched from RewardBench.
@@ -150,4 +149,4 @@ You can automatically format your code by running:

```
make style
```
34 changes: 14 additions & 20 deletions scripts/run_rewardbench.py
@@ -22,6 +22,7 @@
import logging
import os
import sys
+ from pathlib import Path

import numpy as np
import torch
@@ -43,7 +44,7 @@ def main():
parser = argparse.ArgumentParser(description="Evaluate a reward model.")

# fmt: off
- parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="the dataset to evaluate on")
+ parser.add_argument("--dataset_name", type=str, default="aya-rm-multilingual/multilingual-reward-bench", help="the dataset to evaluate on")
parser.add_argument("--lang_code", type=str, default=None, help="the language code to use")
parser.add_argument("--split", type=str, default="filtered", help="the split to evaluate on")
parser.add_argument("--model", type=str, required=True, help="the model to evaluate")
@@ -143,7 +144,7 @@ def main():
logger.info("*** Load dataset ***")
tokenizer_path = args.tokenizer if args.tokenizer else args.model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=args.trust_remote_code)
- if args.dataset == "allenai/reward-bench":
+ if args.dataset_name == "allenai/reward-bench":
logger.info("Running core eval dataset.")
# primary set compiles slightly more information
dataset, subsets = load_eval_dataset(
@@ -157,6 +158,7 @@ def main():
else:
dataset, subsets = load_multilingual_eval_dataset(
dataset_name=args.dataset_name,
+ conv=conv,
lang_code=args.lang_code,
custom_dialogue_formatting=False,
tokenizer=tokenizer,
@@ -327,7 +329,7 @@ def main():
logger.info(f"Mean rejected: {np.mean(scores_rejected)}, std: {np.std(scores_rejected)}")
logger.info(f"Mean margin: {np.mean(np.array(scores_chosen) - np.array(scores_rejected))}")

- if "reward-bench" in args.dataset:
+ if "reward-bench" in args.dataset_name:
logger.info("Computing grouped results")
out_dataset = dataset.add_column("results", results)
if args.debug:
@@ -351,15 +353,12 @@ def main():
# compile scores
############################
# save score in json to args.output_dir + args.model + ".json"
- output_path = args.output_dir + args.model + args.lang_code + ".json"
- dirname = os.path.dirname(output_path)
- os.makedirs(dirname, exist_ok=True)
-
- # remove old data
- if os.path.exists(output_path):
-     os.remove(output_path)
+ output_dir = Path(args.output_dir)
+ output_path = output_dir / f"{args.model}-{args.lang_code}.json"
+ output_path.parent.mkdir(parents=True, exist_ok=True)

- with open(output_path, "w") as f:
+ logger.info(f"Saving to {output_path}")
+ with output_path.open("w") as f:
json.dump(
{
"accuracy": accuracy,
@@ -368,22 +367,17 @@ def main():
"ref_model": args.ref_model,
"tokenizer": tokenizer_path,
"chat_template": args.chat_template,
- "extra_results": results_grouped if "reward-bench" in args.dataset else None,
+ "extra_results": results_grouped if "reward-bench" in args.dataset_name else None,
},
f,
)

# if save_all is passed, save a large jsonl with all scores_chosen, scores_rejected
if args.save_all:
- output_path = args.output_dir + args.model + "-" + args.lang_code + "-all.jsonl"
- dirname = os.path.dirname(output_path)
- os.makedirs(dirname, exist_ok=True)
-
- # remove old data
- if os.path.exists(output_path):
-     os.remove(output_path)
+ output_path = output_dir / f"{args.model}-{args.lang_code}-all.jsonl"
+ logger.info(f"Saving 'all' results to {output_path}")

- with open(output_path, "w") as f:
+ with output_path.open("w") as f:
for chosen, rejected in zip(scores_chosen, scores_rejected):
f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")
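
For reference, the switch from string concatenation to `pathlib` above also changes where results land: model names such as `openbmb/UltraRM-13b` contain a slash, so the `f"{args.model}-{args.lang_code}.json"` filename nests under a model-owner subdirectory, which `output_path.parent.mkdir(parents=True, exist_ok=True)` now creates. A hypothetical listing (output directory, model, and language code are placeholders, not taken from the commit):

```
ls results/openbmb/
# UltraRM-13b-eng_Latn.json       <- accuracy and metadata written by json.dump above
# UltraRM-13b-eng_Latn-all.jsonl  <- per-example scores, written only with --save_all
```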

