From 24190671572515e18bf841073d433b7def4e9adb Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Mon, 8 Jul 2024 13:39:28 -0700 Subject: [PATCH 1/8] Rename RM evals name --- README.md | 4 ++-- scripts/{run_rewardbench.sh => run_rm_evals.sh} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename scripts/{run_rewardbench.sh => run_rm_evals.sh} (100%) diff --git a/README.md b/README.md index 3f0b8a2..fb3c357 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,8 @@ Note that the [`rewardbench`](https://pypi.org/project/rewardbench/) package req You can find all runnable experiments in the `scripts` directory. Their filename should explicitly tell you their purpose. -For example, `scripts/run_rewardbench.sh` runs the RewardBench inference pipeline on a select number of models given a dataset: +For example, `scripts/run_rm_evals.sh` runs the RewardBench inference pipeline on a select number of models given a dataset: ```sh -./scripts/run_rewardbench.sh +./scripts/run_rm_evals.sh ``` \ No newline at end of file diff --git a/scripts/run_rewardbench.sh b/scripts/run_rm_evals.sh similarity index 100% rename from scripts/run_rewardbench.sh rename to scripts/run_rm_evals.sh From a4d50493477eb44fd80d680aee70f8b1d33dd1dc Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Mon, 8 Jul 2024 13:29:35 -0700 Subject: [PATCH 2/8] Implement generative_rm pipeline * Copied the script straight from rewardbench and modified it for any custom preference dataset TODO: Implement Cohere API --- requirements.txt | 8 +- scripts/run_generative.py | 360 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 367 insertions(+), 1 deletion(-) create mode 100644 scripts/run_generative.py diff --git a/requirements.txt b/requirements.txt index e60f6fe..c7ad015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,9 @@ rewardbench +protobuf datasets>=2.20.0 -protobuf>=5.26.1 \ No newline at end of file +vllm>=0.4.1 +openai==1.35.13 +anthropic==0.31.0 +google-generativeai==0.7.2 +together==1.2.1 +git+https://github.com/lm-sys/FastChat.git@92a6d1fcd69a88ea169c0b01065ce44f1e690a2c \ No newline at end of file diff --git a/scripts/run_generative.py b/scripts/run_generative.py new file mode 100644 index 0000000..1bbbe1a --- /dev/null +++ b/scripts/run_generative.py @@ -0,0 +1,360 @@ +""" +Modified version of https://github.com/allenai/reward-bench/blob/045c7f8291f804d193bb102f590fd9db8d52cec3/scripts/run_generative.py +Updated to accommodate custom preference datasets +""" + +# Copyright 2023 AllenAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# run a generative RM. 
For now, this requires openai and anthropic to be installed
# Examples:
# python scripts/run_generative.py --dataset_name <dataset> --model gpt-3.5-turbo
# python scripts/run_generative.py --dataset_name <dataset> --model=claude-3-haiku-20240307

# note: for non-API models, this script uses vllm
# pip install vllm

import argparse
import logging
import json
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from datasets import load_dataset
from fastchat.conversation import get_conv_template
from rewardbench.generative import ANTHROPIC_MODEL_LIST, API_MODEL_LIST
from rewardbench.generative import GEMINI_MODEL_LIST, OPENAI_MODEL_LIST
from rewardbench.generative import format_judge_answers, process_judgement
from rewardbench.generative import run_judge_pair
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# get the token from the HF_TOKEN env variable and fail fast if it is missing
HF_TOKEN = os.getenv("HF_TOKEN", None)
if not HF_TOKEN:
    raise ValueError("Missing value for HF_TOKEN environment variable")


def get_args():
    """
    Parse command-line arguments (model, chat template, dataset, etc.)
    """
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--dataset_name", type=str, required=True, help="name of dataset to test on")
    parser.add_argument("--split", default="test", type=str, required=True, help="dataset split to evaluate")
    parser.add_argument("--model", type=str, nargs="+", required=True, help="name of model to use")
    parser.add_argument("--chat_template", type=str, default=None, help="fastchat chat template (optional)")
    parser.add_argument("--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline")
    parser.add_argument("--num_gpus", type=int, default=1, help="number of gpus to use, for multi-node vllm")
    parser.add_argument("--debug", action="store_true", help="run on debug mode (show additional info, etc.)")
    parser.add_argument("--sample", type=int, default=None, help="sample a few instances for testing")
    parser.add_argument("--num_threads", type=int, default=10, help="number of threads to use for parallel processing of examples")
    parser.add_argument("--force_local", action="store_true", default=False, help="force local run, even if model is on Together API")
    # fmt: on
    args = parser.parse_args()
    return args


def main():
    args = get_args()
    ###############
    # Setup logging
    ###############
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = logging.INFO
    logger.setLevel(log_level)

    logger.info(
        f"Running reward model on {args.model} with chat template {args.chat_template}"
    )

    model_type = "Generative RM"

    # if model is a list, mark the type as PoLL and check that the panel size is odd
    if isinstance(args.model, list) and len(args.model) == 1:
        args.model = args.model[0]
    elif isinstance(args.model, list):
        model_type += " PoLL"
        # an odd number of judges guarantees that majority voting cannot tie
        assert len(args.model) % 2 == 1

    # define variable if is API or local
    is_api_models = (
        isinstance(args.model, list)
        or args.model in API_MODEL_LIST
        or not args.force_local
    )

    # if model isn't API, load via vllm
    if not is_api_models:
        # load model
        model = LLM(
            args.model,
            trust_remote_code=args.trust_remote_code,
            tensor_parallel_size=args.num_gpus,
        )
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        if "Llama-3" in args.model or "llama3-8b" in args.model:
            stop_token_ids = [128009]
        else:
            stop_token_ids = []

        sampling_params = SamplingParams(
            n=1,
            temperature=0,
            top_p=1,
            max_tokens=1024,
            stop_token_ids=stop_token_ids,
        )

    # handle off-case models
    is_prometheus = False  # handles output tokens differently (less flexible)
    # use different prompt for prometheus/gemini models
    if "prometheus" in args.model:
        model_modifier = "prometheus"
        is_prometheus = True
    elif "gemini" in args.model:
        model_modifier = "gemini"
    else:
        model_modifier = None

    ############################
    # Load dataset
    ############################
    logger.info("*** Load dataset ***")
    dataset = load_dataset(args.dataset_name, split=args.split)
    # Rename columns for compatibility with existing API
    dataset = dataset.rename_columns(
        {"chosen": "text_chosen", "rejected": "text_rejected"}
    )

    if args.sample:
        logger.debug(f"Running on first {args.sample} examples")
        dataset = dataset.select(range(args.sample))

    if is_api_models:
        ############################
        # Run inference via API
        ############################
        def update_progress_bar(done, total):
            # Simple text-based progress bar
            progress = int(50 * done / total)  # Calculate progress (50 chars width)
            sys.stdout.write(
                "\r[{}{}] {}/{}".format(
                    "#" * progress, "." * (50 - progress), done, total
                )
            )
            sys.stdout.flush()

        def get_judgement(batch, debug=args.debug):
            mult_turn = True if len(batch["text_chosen"]) > 2 else False
            prompt = batch["prompt"]
            answer_a = batch["text_chosen"]
            answer_b = batch["text_rejected"]

            # shuffle a and b randomly for position bias
            is_shuffled = np.random.rand() > 0.5
            if is_shuffled:
                answer_a, answer_b = answer_b, answer_a
                winner_text = "B"
                loser_text = "A"
            else:
                winner_text = "A"
                loser_text = "B"

            if len(batch["text_chosen"]) <= 4:  # set up only for 1 or 2 turns
                winner, request, judgement = run_judge_pair(
                    prompt,
                    answer_a,
                    answer_b,
                    args.model,
                    multi_turn=mult_turn,
                    model_modifier=model_modifier,
                )
                if debug:
                    print(f"Prompt: {request}")
                    print(f"Judgement: {judgement}")

                # handle voting
                if isinstance(winner, list):
                    # print votes if debug
                    if debug:
                        print(winner)
                    winner = max(set(winner), key=winner.count)

                if winner == winner_text:
                    return 1
                elif winner == loser_text:
                    return 0
                else:  # if "error"
                    return 0.5  # effectively a tie
            else:
                # conversations longer than two turns are not supported;
                # count them as a tie so the accuracy computation still works
                return 0.5

        # Progress bar version
        results = [None] * len(dataset)  # Preallocate results list
        done_tasks = 0  # Counter for completed tasks

        with ThreadPoolExecutor(max_workers=args.num_threads) as executor:
            # Alternative without a progress bar:
            # results = list(executor.map(get_judgement, dataset))

            # Submit all tasks and hold their futures in a list
            future_to_index = {
                executor.submit(get_judgement, x): i for i, x in enumerate(dataset)
            }

            # As tasks complete, update progress and store results in the original order
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                results[index] = future.result()
                done_tasks += 1
                update_progress_bar(done_tasks, len(dataset))

        # Print newline after progress bar
        print()
    else:
        ############################
        # Run model weights with vllm
        ############################

        def format_judgements(batch, optional_chat_template=None):
            # TODO expand this to include fastchat chat templates if needed
            mult_turn = True if len(batch["text_chosen"]) > 2 else False
            prompt = batch["text_chosen"][0]["content"]
            answer_a = batch["text_chosen"]
            answer_b = batch["text_rejected"]

            # shuffle a and b randomly for position bias
            is_shuffled = np.random.rand() > 0.5
            if is_shuffled:
                answer_a, answer_b = answer_b, answer_a

            system_prompt, user_prompt = format_judge_answers(
                prompt,
                answer_a,
                answer_b,
                multi_turn=mult_turn,
                model_modifier=model_modifier,
            )

            if optional_chat_template is not None:
                optional_chat_template.set_system_message(system_prompt)
                optional_chat_template.messages = []
                optional_chat_template.append_message(
                    optional_chat_template.roles[0], user_prompt
                )
                optional_chat_template.append_message(
                    optional_chat_template.roles[1], None
                )
                prompt = optional_chat_template.get_prompt()
            else:
                # fall back to the tokenizer's chat template so the judge prompt
                # is always applied, even for models without a modifier
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            batch["text"] = prompt
            batch["is_shuffled"] = is_shuffled
            return batch

        # format the dataset for the model, with optional fastchat templating
        chat_template = (
            get_conv_template(args.chat_template)
            if args.chat_template is not None
            else None
        )
        dataset_prompts = dataset.map(
            format_judgements, fn_kwargs={"optional_chat_template": chat_template}
        )

        # collect texts of dataset in list
        prompts = dataset_prompts["text"]
        is_shuffled = dataset_prompts["is_shuffled"]

        # generate
        logger.info("*** Run inference ***")
        outputs = model.generate(prompts, sampling_params)
        logger.info("*** Inference done ***")

        answers = [o.outputs[0].text for o in outputs]
        winners = [process_judgement(a, is_prometheus=is_prometheus) for a in answers]

        def process_shuffled(win, shuffle):
            winner_text, loser_text = ("B", "A") if shuffle else ("A", "B")
            if win == winner_text:
                return 1
            elif win == loser_text:
                return 0
            else:  # if "error"
                return 0.5  # effectively a tie

        results = [process_shuffled(w, s) for w, s in zip(winners, is_shuffled)]

    ############################
    # Print & process results
    ############################
    # add column for results for easy printing
    out_dataset = dataset.add_column("results", results)

    # model name concat if list
    if isinstance(args.model, list):
        model_name = "_".join(args.model)
        model_name = "PoLL/" + model_name
    else:
        model_name = args.model
        # if the model is in the OpenAI, Anthropic, or Gemini list, append the org to the model name
        if args.model in OPENAI_MODEL_LIST:
            model_name = "openai/" + model_name
        elif args.model in ANTHROPIC_MODEL_LIST:
            model_name = "anthropic/" + model_name
        elif args.model in GEMINI_MODEL_LIST:
            model_name = "google/" + model_name

    # compute scores
    num_correct = sum(out_dataset["results"])
    num_total = len(out_dataset["results"])
    print(f"{args.dataset_name}: {num_correct}/{num_total} ({num_correct/num_total})")

    # save results
    results_dict = {
        "dataset": args.dataset_name,
        "model": model_name,
        "chat_template": args.chat_template,
        "scores": {
            "accuracy": num_correct / num_total,
            "num_correct": num_correct,
            "num_total": num_total,
            "results": results,
        },
    }

    file_path = f"{model_name.replace('/', '___')}.json"
    with open(file_path, "w") as f:
        json.dump(results_dict, f, indent=4)

    logger.info(f"Saved results to {file_path}")


if __name__ == "__main__":
    main()

From d5f37962351b5f5177307feb8e5fabf81b2559b4 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Wed, 10 Jul 2024 20:02:24 -0700
Subject: [PATCH 3/8] Update README

---
 README.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fb3c357..8192af7 100644
--- a/README.md
+++ b/README.md
@@ -20,10 +20,25 @@ Note that the [`rewardbench`](https://pypi.org/project/rewardbench/) package req
 
 ## Running experiments
 
+First, you need to set a [HuggingFace token](https://huggingface.co/settings/tokens) as an environment variable (`HF_TOKEN`):
+
+```sh
+export HF_TOKEN=
+```
+
+If you're planning to use some closed-source APIs, you also need to set the tokens for each:
+
+
+```sh
+export OPENAI_API_KEY=
+export ANTHROPIC_API_KEY=
+export GEMINI_API_KEY=
+```
+
 You can find all runnable experiments in the `scripts` directory.
 Their filename should explicitly tell you their purpose.
 For example, `scripts/run_rm_evals.sh` runs the RewardBench inference pipeline on a select number of models given a dataset:
 
 ```sh
 ./scripts/run_rm_evals.sh
-```
\ No newline at end of file
+```

From 174c3676b486adebd554c4c9f9f213d2d846e5a0 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Wed, 10 Jul 2024 20:02:33 -0700
Subject: [PATCH 4/8] Add script for running llm evals

---
 scripts/run_llm_evals.sh | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100755 scripts/run_llm_evals.sh

diff --git a/scripts/run_llm_evals.sh b/scripts/run_llm_evals.sh
new file mode 100755
index 0000000..9db4f2a
--- /dev/null
+++ b/scripts/run_llm_evals.sh
@@ -0,0 +1,8 @@
+python3 scripts/run_generative.py \
+    --dataset_name ljvmiranda921/ultrafeedback-multilingual-dpo-test \
+    --model gpt-4-turbo-2024-04-09 \
+    --split test
+python3 scripts/run_generative.py \
+    --dataset_name ljvmiranda921/ultrafeedback-english-dpo-test \
+    --model gpt-4-turbo-2024-04-09 \
+    --split test
\ No newline at end of file

From 843b14fc5e007bbac385dc7287043b43ce65f991 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Wed, 10 Jul 2024 20:18:25 -0700
Subject: [PATCH 5/8] Use prompt directly from the dataset

---
 scripts/run_generative.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_generative.py b/scripts/run_generative.py
index 1bbbe1a..ec57c7b 100644
--- a/scripts/run_generative.py
+++ b/scripts/run_generative.py
@@ -239,7 +239,7 @@ def get_judgement(batch, debug=args.debug):
         def format_judgements(batch, optional_chat_template=None):
             # TODO expand this to include fastchat chat templates if needed
             mult_turn = True if len(batch["text_chosen"]) > 2 else False
-            prompt = batch["text_chosen"][0]["content"]
+            prompt = batch["prompt"]
             answer_a = batch["text_chosen"]
             answer_b = batch["text_rejected"]

From f0e3736e1efcbd94bd005eaaeb7bfc74e6780b2c Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Thu, 11 Jul 2024 09:06:29 -0700
Subject: [PATCH 6/8] Add some more logging

---
 scripts/run_generative.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/run_generative.py b/scripts/run_generative.py
index ec57c7b..dacd86a 100644
--- a/scripts/run_generative.py
+++ b/scripts/run_generative.py
@@ -152,6 +152,8 @@ def main():
         dataset = dataset.select(range(args.sample))
 
     if is_api_models:
+        logger.info("Found an API model, will run inference through their API")
+
         ############################
         # Run inference via API
         ############################
@@ -235,6 +237,7 @@ def get_judgement(batch, debug=args.debug):
         ############################
         # Run model weights with vllm
         ############################
+        logger.info("Not an API model, will run model weights via vLLM")

From 92c644a009e13780ca3c93d2885fabe6debeeb21 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Fri, 12 Jul 2024 23:26:52 -0700
Subject: [PATCH 7/8] Run isort on the code

---
 scripts/run_generative.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_generative.py b/scripts/run_generative.py
index dacd86a..73ea5b8 100644
--- a/scripts/run_generative.py
+++ b/scripts/run_generative.py
@@ -26,8 +26,8 @@
 # pip install vllm
 
 import argparse
-import logging
 import json
+import logging
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed

From dff5259457a2c2fdc39a34f5a8a829a06548199f Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Sat, 13 Jul 2024 11:18:38 -0700
Subject: [PATCH 8/8] Update documentation

---
 README.md                 | 59 +++++++++++++++++++++++++++++++++++----
 scripts/run_generative.py |  6 +++-
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 8192af7..6ea4c58 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,41 @@ First, you need to set a [HuggingFace token](https://huggingface.co/settings/tok
 export HF_TOKEN=
 ```
 
-If you're planning to use some closed-source APIs, you also need to set the tokens for each:
+You can find all runnable experiments in the `scripts` directory.
+Their filename should explicitly tell you their purpose.
 
+### Getting rewards from a Reward Model (RM) on a HuggingFace dataset
+
+Here, we use the `rewardbench` command-line interface and pass a HuggingFace dataset.
+This is useful if the reward model is trained as a Custom classifier (🛠️), Sequence classifier (🔢), or via DPO (🎯).
+For example, if we want to get the reward score of the UltraRM-13b reward model on a preference dataset, we run:
+
+```sh
+rewardbench \
+    --model openbmb/UltraRM-13b \
+    --chat_template openbmb \
+    --dataset $DATASET \
+    --split $SPLIT \
+    --output_dir $OUTDIR \
+    --batch_size 8 \
+    --trust_remote_code \
+    --force_truncation \
+    --save_all
+```
+
+The evaluation parameters can be found in the [allenai/reward-bench](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml) repository.
+This runs the reward model on the (prompt, chosen, rejected) triples and gives us the reward score for each instance.
+The results are saved into a JSON file inside the `$OUTDIR` directory.
+Finally, you can find some experiments in the `scripts/run_rm_evals.sh` script.
+
+### Getting rewards from a Generative RM on a HuggingFace dataset
+
+Here, we use `scripts/run_generative.py`, a modified version of the [same script in RewardBench](https://github.com/allenai/reward-bench/blob/main/scripts/run_generative.py), to obtain rewards from a Generative RM (🗨️).
+The only difference is that this script accepts any arbitrary HuggingFace preference dataset (which we plan to contribute upstream later on) instead of just the RewardBench dataset.
+
+For Generative RMs, we prompt a model in a style akin to LLM-as-a-judge, and then parse the output to obtain the preference.
+This can be done for closed-source APIs (e.g., GPT-4, Claude) or open-source LMs (done via vLLM).
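+
+To make the judging step concrete, here is a minimal, hypothetical sketch of what happens for a single preference pair (the toy prompt and answers below are illustrative; the actual prompt construction and parsing live in `rewardbench.generative`):
+
+```python
+from rewardbench.generative import run_judge_pair
+
+# A toy preference pair; each answer is a list of chat messages.
+prompt = "What is the capital of France?"
+chosen = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Paris."}]
+rejected = [{"role": "user", "content": prompt}, {"role": "assistant", "content": "Lyon."}]
+
+# Build the judge prompt, query the model (requires the matching API key to be set),
+# and parse the raw judgement into a winner: "A", "B", or "error".
+winner, request, judgement = run_judge_pair(
+    prompt, chosen, rejected, "gpt-4-turbo-2024-04-09", multi_turn=False
+)
+print(winner)
+```
+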
+If you're planning to use some closed-source APIs, you also need to set the tokens for each:
 
 ```sh
 export OPENAI_API_KEY=
 export ANTHROPIC_API_KEY=
 export GEMINI_API_KEY=
 ```
 
-You can find all runnable experiments in the `scripts` directory.
-Their filename should explicitly tell you their purpose.
-For example, `scripts/run_rm_evals.sh` runs the RewardBench inference pipeline on a select number of models given a dataset:
+Say we want to obtain the preferences of `gpt-4-turbo-2024-04-09`:
+
+```sh
+export OPENAI_API_KEY=
+python scripts/run_generative.py \
+    --dataset_name $DATASET \
+    --split $SPLIT \
+    --model gpt-4-turbo-2024-04-09 \
+    --output_dir $OUTDIR
+```
+
+You can also run open-source LMs in a generative fashion.
+The inference is then routed through [vLLM](https://github.com/vllm-project/vllm).
+Here's an example using `meta-llama/Meta-Llama-3-70B-Instruct`:
+
 ```sh
-./scripts/run_rm_evals.sh
+python scripts/run_generative.py \
+    --dataset_name $DATASET \
+    --split $SPLIT \
+    --model "meta-llama/Meta-Llama-3-70B-Instruct" \
+    --num_gpus 4 \
+    --output_dir $OUTDIR
 ```

diff --git a/scripts/run_generative.py b/scripts/run_generative.py
index 73ea5b8..0814118 100644
--- a/scripts/run_generative.py
+++ b/scripts/run_generative.py
@@ -21,6 +21,7 @@
 # Examples:
 # python scripts/run_generative.py --dataset_name <dataset> --model gpt-3.5-turbo
 # python scripts/run_generative.py --dataset_name <dataset> --model=claude-3-haiku-20240307
+# python scripts/run_generative.py --dataset_name <dataset> --model=CohereForAI/c4ai-command-r-v01 --num_gpus 2 --force_local
 
 # note: for non-API models, this script uses vllm
 # pip install vllm
 
 import argparse
 import json
 import logging
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
 
 import numpy as np
 from datasets import load_dataset
@@ -57,6 +59,7 @@ def get_args():
     parser.add_argument("--dataset_name", type=str, required=True, help="name of dataset to test on")
     parser.add_argument("--split", default="test", type=str, required=True, help="dataset split to evaluate")
     parser.add_argument("--model", type=str, nargs="+", required=True, help="name of model to use")
+    parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the results.")
     parser.add_argument("--chat_template", type=str, default=None, help="fastchat chat template (optional)")
     parser.add_argument("--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline")
     parser.add_argument("--num_gpus", type=int, default=1, help="number of gpus to use, for multi-node vllm")
@@ -352,7 +355,8 @@ def process_shuffled(win, shuffle):
         },
     }
 
-    file_path = f"{model_name.replace('/', '___')}.json"
+    output_dir = Path(args.output_dir)
+    file_path = output_dir / f"{model_name.replace('/', '___')}.json"
     with open(file_path, "w") as f:
         json.dump(results_dict, f, indent=4)