From a21fec1817dd585bfcb8fc4e09b38161610eb5fb Mon Sep 17 00:00:00 2001
From: Betswish <j.qi@rug.nl>
Date: Sun, 31 Dec 2023 07:44:35 +0100
Subject: [PATCH] Update

---
 .../requirements-usage-example.txt            | 136 ++++++++++++++
 .../tasks/cross_lingual_consistency/task.py   |  79 +++-----
 .../usage_example.py                          | 168 ++++++++++++++++++
 3 files changed, 325 insertions(+), 58 deletions(-)
 create mode 100644 src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt
 create mode 100644 src/genbench/tasks/cross_lingual_consistency/usage_example.py

diff --git a/src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt b/src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt
new file mode 100644
index 0000000..0292afa
--- /dev/null
+++ b/src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt
@@ -0,0 +1,136 @@
+accelerate               0.19.0
+aiohttp                  3.8.4
+aiosignal                1.3.1
+antlr4-python3-runtime   4.9.3
+async-timeout            4.0.2
+attrs                    23.1.0
+bitsandbytes             0.39.0
+bitsandbytes-cuda113     0.30.1
+cachetools               5.3.1
+captum                   0.6.0
+certifi                  2023.5.7
+chardet                  3.0.4
+charset-normalizer       3.1.0
+click                    8.1.3
+cmake                    3.26.3
+colorama                 0.4.6
+commonmark               0.9.1
+contourpy                1.0.7
+cycler                   0.11.0
+Cython                   0.29.34
+datasets                 2.12.0
+dill                     0.3.6
+filelock                 3.12.0
+fonttools                4.39.4
+frozenlist               1.3.3
+fsspec                   2023.5.0
+google-api-core          2.11.0
+google-auth              2.19.1
+google-cloud-core        2.3.2
+google-cloud-translate   3.11.1
+googleapis-common-protos 1.59.0
+googletrans              4.0.0rc1
+grpcio                   1.54.2
+grpcio-status            1.54.2
+h11                      0.9.0
+h2                       3.2.0
+higher                   0.2.1
+hpack                    3.0.0
+hstspreload              2023.1.1
+httpcore                 0.9.1
+httpx                    0.13.3
+huggingface-hub          0.14.1
+hydra-core               1.3.2
+hyperframe               5.2.0
+idna                     2.10
+inquirerpy               0.3.4
+inseq                    0.5.0.dev0
+Jinja2                   3.1.2
+joblib                   1.2.0
+jsonschema               4.17.3
+kiwisolver               1.4.4
+lang2vec                 1.1.6
+lang2vec                 1.1.6
+lang2vec                 1.1.6
+lit                      16.0.5
+MarkupSafe               2.1.2
+matplotlib               3.5.3
+mpmath                   1.3.0
+multidict                6.0.4
+multiprocess             0.70.14
+mypy-extensions          1.0.0
+networkx                 3.1
+numpy                    1.24.3
+nvidia-cublas-cu11       11.10.3.66
+nvidia-cuda-cupti-cu11   11.7.101
+nvidia-cuda-nvrtc-cu11   11.7.99
+nvidia-cuda-runtime-cu11 11.7.99
+nvidia-cudnn-cu11        8.5.0.96
+nvidia-cufft-cu11        10.9.0.58
+nvidia-curand-cu11       10.2.10.91
+nvidia-cusolver-cu11     11.4.0.1
+nvidia-cusparse-cu11     11.7.4.91
+nvidia-nccl-cu11         2.14.3
+nvidia-nvtx-cu11         11.7.91
+omegaconf                2.3.0
+overrides                7.3.1
+packaging                23.1
+pandas                   2.0.1
+pastel                   0.2.1
+pfzy                     0.3.4
+Pillow                   9.5.0
+pip                      23.0.1
+poethepoet               0.13.1
+prompt-toolkit           3.0.38
+proto-plus               1.22.2
+protobuf                 3.20.3
+psutil                   5.9.5
+pyarrow                  12.0.0
+pyasn1                   0.5.0
+pyasn1-modules           0.3.0
+Pygments                 2.15.1
+pyparsing                3.0.9
+pyproject-toml           0.0.10
+pyre-extensions          0.0.23
+pyrsistent               0.19.3
+python-dateutil          2.8.2
+pytz                     2023.3
+PyYAML                   6.0
+regex                    2023.5.5
+requests                 2.30.0
+responses                0.18.0
+rfc3986                  1.5.0
+rich                     10.16.2
+rsa                      4.9
+sacremoses               0.0.53
+safetensors              0.3.1
+scikit-learn             1.2.2
+scipy                    1.10.1
+sentencepiece            0.1.99
+setuptools               66.0.0
+six                      1.16.0
+sniffio                  1.3.0
+sympy                    1.12
+threadpoolctl            3.1.0
+tokenizers               0.13.3
+toml                     0.10.2
+tomli                    2.0.1
+torch                    2.0.1
+torchtyping              0.1.4
+tqdm                     4.65.0
+transformers             4.30.0.dev0
+triton                   2.0.0
+typeguard                2.13.3
+typing_extensions        4.5.0
+typing-inspect           0.8.0
+tzdata                   2023.3
+unicodecsv               0.14.1
+unimorph-inflect         0.0.1
+unimorph-inflect         0.0.1
+unimorph-inflect         0.0.1
+urllib3                  1.26.16
+wcwidth                  0.2.6
+wheel                    0.38.4
+xformers                 0.0.16
+xxhash                   3.2.0
+yarl                     1.9.2
diff --git a/src/genbench/tasks/cross_lingual_consistency/task.py b/src/genbench/tasks/cross_lingual_consistency/task.py
index 9559bae..c29756e 100644
--- a/src/genbench/tasks/cross_lingual_consistency/task.py
+++ b/src/genbench/tasks/cross_lingual_consistency/task.py
@@ -1,4 +1,3 @@
-from collections import defaultdict
 from typing import Any, Dict, List, Mapping, Optional
 
 import datasets
@@ -8,6 +7,7 @@
 from genbench import Task
 from genbench.api import PreparationStrategy
 
+
 class CrossLingualConsistencyTask(Task):
     def _load_data_source(
         self,
@@ -146,80 +146,43 @@ def get_prepared_datasets(
     def evaluate_predictions(
         self,
         *,
-        predictions: List[Mapping[str, Any]] = None,
+        ranked=None,
+        origin=None,
         gold: datasets.Dataset = None,
     ) -> Dict[str, float]:
         def softmax(x):
             """Compute softmax values for each sets of scores in x."""
             return np.exp(x) / np.sum(np.exp(x), axis=0)
 
-        # Make sure that the predictions are in the same order as the gold dataset
-        assert len(predictions) == len(gold)
-
-        # Just to make sure the gold dataset is the same as the one we generated in `get_prepared_datasets`
-        assert "lang" in gold.features
-        assert "_genbnech_idx" in gold.features
-
-        # Also, make sure that predictions contain logprobs for each option
-        assert all(
-            "target_option_logprobs" in pred and len(pred["target_option_logprobs"]) == len(pred["target_options"])
-            for pred in predictions
-        )
-
-        # Group the prediction and instances such that we have:
-        # _genbnech_idx -> {
-        #    "lang_id_1": { ...data_instance..., target_option_logprobs: ... }
-        #    "lang_id_2": { ...data_instance..., target_option_logprobs: ... }
-        # },
-
-        grouped_examples = defaultdict(dict)
-        for pred, gold in zip(predictions, gold):
-            original_idx = gold["_genbnech_idx"]
-            lang = gold["lang"]
-            grouped_examples[original_idx][lang] = {
-                **gold,
-                **pred,
-            }
+        # First half of `ranked`/`origin` is lang1, second half is lang2 (`//`: slice bounds must be ints)
+        lang1_rankings = ranked[: len(ranked) // 2]
+        lang2_rankings = ranked[len(ranked) // 2 :]
+        cand_list1 = [list(i.keys()) for i in origin[: len(origin) // 2]]
+        cand_list2 = [list(i.keys()) for i in origin[len(origin) // 2 :]]
+
+        num_consistent = 0
 
-        CLC_score = 0
-        count = 0
-        langs = []
-        # Now, we compute the cross lingual consistency score
-        for idx, example in grouped_examples.items():
-            # Rerank the options based on the logprobs
-            for lang, data in example.items():
-                if len(langs) < 2:
-                    langs.append(lang)
-
-                logprobs = data["target_option_logprobs"]
-                sorted_options = sorted(
-                    zip(data["target_options"], logprobs),
-                    key=lambda x: x[1],
-                    reverse=False,
-                )
-                sorted_options, logprobs = zip(*sorted_options)
-                grouped_examples[idx][lang]["target_options"] = list(sorted_options)
-                grouped_examples[idx][lang]["target_option_logprobs"] = list(logprobs)
-
-            # Compute the cross lingual consistency score
-            ranking1 = grouped_examples[idx][langs[0]]["target_options"]
-            ranking2 = grouped_examples[idx][langs[1]]["target_options"]
+        for i in range(len(lang1_rankings)):
+            ranking1 = lang1_rankings[i]
+            ranking2 = lang2_rankings[i]
+
+            candidate1 = cand_list1[i]
+            candidate2 = cand_list2[i]
 
             order = [len(ranking1) - i for i in range(len(ranking1))]
             order = np.array(order)
+
             weight = softmax(order)
 
             for j in range(len(ranking1)):
-                set1 = {ranking1.index(i) for i in ranking1[: j + 1]}
-                set2 = {ranking2.index(i) for i in ranking2[: j + 1]}
+                set1 = {candidate1.index(i) for i in ranking1[: j + 1]}
+                set2 = {candidate2.index(i) for i in ranking2[: j + 1]}
 
                 cover = set1.intersection(set2)
-                CLC_score += weight[j] * (len(cover) / len(set1))
-
-            count += 1
-        CLC_score /= count
+                num_consistent += weight[j] * (len(cover) / len(set1))
 
         # Compute the final score
+        CLC_score = num_consistent / len(lang1_rankings)
         result = {
             "cross_lingual_consistency": CLC_score,
         }
diff --git a/src/genbench/tasks/cross_lingual_consistency/usage_example.py b/src/genbench/tasks/cross_lingual_consistency/usage_example.py
new file mode 100644
index 0000000..385866e
--- /dev/null
+++ b/src/genbench/tasks/cross_lingual_consistency/usage_example.py
@@ -0,0 +1,168 @@
+import numpy as np
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForMaskedLM,
+    AutoTokenizer,
+    MT5ForConditionalGeneration,
+    T5ForConditionalGeneration,
+    XGLMTokenizer,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+# Helper: users normally do not need to modify this function.
+# It scores every candidate with a mean negative log-likelihood (lower score = more likely).
+def predict_mask(answer_cand, prompt, mname, lang):
+    answer_pred_probs = dict()
+
+    for answer in answer_cand:
+        answer_cand_probs = []
+
+        if (
+            "t5" not in mname
+            and "xglm" not in mname
+            and "opt" not in mname
+            and "bloom" not in mname
+            and "llama" not in mname
+            and "gpt" not in mname
+        ):
+            answer_tokens = tokenizer(answer)["input_ids"][1:-1]
+
+            if "xlm-roberta" in mname and answer_tokens[0] == 6 and lang == "zh":
+                answer_tokens = answer_tokens[1:]
+
+            new_mask = ["<mask>"] * len(answer_tokens)
+
+            if lang == "zh":
+                new_mask = "".join(new_mask)
+            else:
+                new_mask = " ".join(new_mask)
+
+            prompt_new = prompt.replace("<mask>", new_mask)
+            prompt_new = prompt_new.replace("<mask>", tokenizer.mask_token)
+
+            for j, w_idx in enumerate(answer_tokens):
+                model_inputs = tokenizer(prompt_new, return_tensors="pt").to(device)
+                model_outputs = model(**model_inputs)
+                input_ids = model_inputs["input_ids"][0]
+                outputs = model_outputs["logits"]
+                masked_index = torch.nonzero(input_ids == tokenizer.mask_token_id, as_tuple=False)
+
+                logits = outputs[0, masked_index[0].item(), :]
+                probs = logits.softmax(dim=-1).detach().cpu().numpy()
+                answer_cand_probs.append(-np.log(probs[w_idx]))
+
+                pos = prompt_new.find(tokenizer.mask_token)
+                prompt_new = (
+                    prompt_new[:pos]
+                    + tokenizer.convert_ids_to_tokens(w_idx)
+                    + prompt_new[pos + len(tokenizer.mask_token) :]
+                )
+
+            answer_pred_probs[answer] = np.mean(answer_cand_probs)
+
+        elif "xglm" in mname or "opt" in mname or "bloom" in mname or "llama" in mname or "gpt" in mname:
+            prompt_new = prompt.replace("<mask>", answer)
+
+            model_input = tokenizer(prompt_new, return_tensors="pt").to(device)
+            output = model(**model_input)
+
+            if lang == "zh":
+                logits = output["logits"][0, :-1]
+                token_ids = model_input["input_ids"][0, 1:]
+            else:
+                logits = output["logits"][0, :-2]
+                token_ids = model_input["input_ids"][0, 1:-1]
+
+            answer_pred_probs[answer] = float(torch.nn.CrossEntropyLoss(reduction="mean")(logits, token_ids))
+        else:
+            input_ids = tokenizer(prompt.replace("<mask>", "<extra_id_0>"), return_tensors="pt").input_ids.to(device)
+            labels = tokenizer("<extra_id_0> " + answer + " <extra_id_1>", return_tensors="pt").input_ids.to(device)
+            target_ids = labels[0][1:-2]
+
+            outputs = model(input_ids=input_ids, labels=labels).logits
+            masked_index = torch.tensor(list(range(outputs.size()[1]))[1:-2])
+
+            for idx, t_idx in zip(masked_index, target_ids):
+                logits = outputs[0, idx.item(), :]
+                probs = logits.softmax(dim=-1).detach().cpu().numpy()
+                answer_cand_probs.append(-np.log(probs[t_idx]))
+
+            answer_pred_probs[answer] = np.mean(answer_cand_probs)
+
+    return answer_pred_probs
+
+
+cross_ling_const_task = load_task("cross_lingual_consistency")
+
+# Variables (Input dataset, langs and model)
+mini = True  # True for BMLAMA-17; False for BMLAMA-53
+lang1 = "en"
+lang2 = "es"
+mname = "bigscience/bloom-3b"
+
+# Setup dataset
+# The dataset length is num_languages * num_instances_per_language
+ds = cross_ling_const_task.get_prepared_datasets(
+    preparation_strategy=PreparationStrategy.PROMPT_BASED_TESTING,
+    mini=mini,
+    lang1=lang1,
+    lang2=lang2,
+)
+
+
+# Setup model & tokenizer
+if "xglm" in mname or "opt" in mname or "bloom" in mname or "llama" in mname or "gpt" in mname:
+    model = AutoModelForCausalLM.from_pretrained(mname)
+elif "google/mt5" in mname:
+    model = MT5ForConditionalGeneration.from_pretrained(mname)
+elif "t5" in mname:
+    model = T5ForConditionalGeneration.from_pretrained(mname)
+else:
+    model = AutoModelForMaskedLM.from_pretrained(mname)
+
+if "xglm" in mname:
+    tokenizer = XGLMTokenizer.from_pretrained(mname)
+else:
+    tokenizer = AutoTokenizer.from_pretrained(mname)
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print("Runing on:" + device)
+print()
+model = model.to(device)
+
+# Store the ranked candidates
+ranked_keys_list = []
+# Store the original candidates
+origin_keys_list = []
+
+# Rank each instance's candidates by ascending predict_mask score (most likely candidate first)
+for i, data in enumerate(ds):
+    # The form of each data:
+    # {
+    #   "input": "The capital of Canada ",
+    #   "target": "Ottawa",
+    #   "target_options": [
+    #       "Beijing",
+    #       "Tokyo",
+    #       "Ottawa",
+    #   ],
+    #   "_genbnech_idx": <some index>
+    # }
+
+    logprobs = []
+    lang = lang1 if i < len(ds) / 2 else lang2
+
+    answer_pred_probs = predict_mask(data["target_options"], data["input"], mname, lang)
+    origin_keys_list.append(answer_pred_probs)
+
+    sorted_probs = sorted(answer_pred_probs.items(), key=lambda x: x[1], reverse=False)
+    ranked_keys = [x[0] for x in sorted_probs]
+    ranked_keys_list.append(ranked_keys)
+
+# Evaluate Cross Lingual Consistency (evaluate_predictions only accepts keyword arguments):
+results = cross_ling_const_task.evaluate_predictions(ranked=ranked_keys_list, origin=origin_keys_list)
+print(results)