From a21fec1817dd585bfcb8fc4e09b38161610eb5fb Mon Sep 17 00:00:00 2001
From: Betswish
Date: Sun, 31 Dec 2023 07:44:35 +0100
Subject: [PATCH] Update

---
Reviewer notes (text between the "---" line and the diff is not part of the
commit message):
* task.py, evaluate_predictions: the four slice bounds were written as
  `len(...) / 2`. In Python 3 that is a float and slicing with it raises
  TypeError, so they now use floor division (`//`).
* usage_example.py: evaluate_predictions declares ranked/origin/gold after a
  bare `*` (keyword-only), so the final call now passes `ranked=` and
  `origin=` by keyword instead of positionally.
* Both fixes edit added lines in place; hunk line counts and the diffstat are
  unchanged.
* NOTE(review): line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch. The position of blank context
  lines in the task.py hunks could not be recovered with certainty - verify
  them against the target file before applying.
* NOTE(review): the empty-string literals in predict_mask (for example
  `prompt.replace("", new_mask)`), the `"_genbnech_idx":` example value and
  the sender address look like angle-bracket tokens lost in extraction -
  confirm against the original commit; they are left exactly as received.

 .../requirements-usage-example.txt            | 136 ++++++++++++++
 .../tasks/cross_lingual_consistency/task.py   |  79 +++-----
 .../usage_example.py                          | 168 ++++++++++++++++++
 3 files changed, 325 insertions(+), 58 deletions(-)
 create mode 100644 src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt
 create mode 100644 src/genbench/tasks/cross_lingual_consistency/usage_example.py

diff --git a/src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt b/src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt
new file mode 100644
index 0000000..0292afa
--- /dev/null
+++ b/src/genbench/tasks/cross_lingual_consistency/requirements-usage-example.txt
@@ -0,0 +1,136 @@
+accelerate 0.19.0
+aiohttp 3.8.4
+aiosignal 1.3.1
+antlr4-python3-runtime 4.9.3
+async-timeout 4.0.2
+attrs 23.1.0
+bitsandbytes 0.39.0
+bitsandbytes-cuda113 0.30.1
+cachetools 5.3.1
+captum 0.6.0
+certifi 2023.5.7
+chardet 3.0.4
+charset-normalizer 3.1.0
+click 8.1.3
+cmake 3.26.3
+colorama 0.4.6
+commonmark 0.9.1
+contourpy 1.0.7
+cycler 0.11.0
+Cython 0.29.34
+datasets 2.12.0
+dill 0.3.6
+filelock 3.12.0
+fonttools 4.39.4
+frozenlist 1.3.3
+fsspec 2023.5.0
+google-api-core 2.11.0
+google-auth 2.19.1
+google-cloud-core 2.3.2
+google-cloud-translate 3.11.1
+googleapis-common-protos 1.59.0
+googletrans 4.0.0rc1
+grpcio 1.54.2
+grpcio-status 1.54.2
+h11 0.9.0
+h2 3.2.0
+higher 0.2.1
+hpack 3.0.0
+hstspreload 2023.1.1
+httpcore 0.9.1
+httpx 0.13.3
+huggingface-hub 0.14.1
+hydra-core 1.3.2
+hyperframe 5.2.0
+idna 2.10
+inquirerpy 0.3.4
+inseq 0.5.0.dev0
+Jinja2 3.1.2
+joblib 1.2.0
+jsonschema 4.17.3
+kiwisolver 1.4.4
+lang2vec 1.1.6
+lang2vec 1.1.6
+lang2vec 1.1.6
+lit 16.0.5
+MarkupSafe 2.1.2
+matplotlib 3.5.3
+mpmath 1.3.0
+multidict 6.0.4
+multiprocess 0.70.14
+mypy-extensions 1.0.0
+networkx 3.1
+numpy 1.24.3
+nvidia-cublas-cu11 11.10.3.66
+nvidia-cuda-cupti-cu11 11.7.101
+nvidia-cuda-nvrtc-cu11 11.7.99
+nvidia-cuda-runtime-cu11 11.7.99
+nvidia-cudnn-cu11 8.5.0.96
+nvidia-cufft-cu11 10.9.0.58
+nvidia-curand-cu11 10.2.10.91
+nvidia-cusolver-cu11 11.4.0.1
+nvidia-cusparse-cu11 11.7.4.91
+nvidia-nccl-cu11 2.14.3
+nvidia-nvtx-cu11 11.7.91
+omegaconf 2.3.0
+overrides 7.3.1
+packaging 23.1
+pandas 2.0.1
+pastel 0.2.1
+pfzy 0.3.4
+Pillow 9.5.0
+pip 23.0.1
+poethepoet 0.13.1
+prompt-toolkit 3.0.38
+proto-plus 1.22.2
+protobuf 3.20.3
+psutil 5.9.5
+pyarrow 12.0.0
+pyasn1 0.5.0
+pyasn1-modules 0.3.0
+Pygments 2.15.1
+pyparsing 3.0.9
+pyproject-toml 0.0.10
+pyre-extensions 0.0.23
+pyrsistent 0.19.3
+python-dateutil 2.8.2
+pytz 2023.3
+PyYAML 6.0
+regex 2023.5.5
+requests 2.30.0
+responses 0.18.0
+rfc3986 1.5.0
+rich 10.16.2
+rsa 4.9
+sacremoses 0.0.53
+safetensors 0.3.1
+scikit-learn 1.2.2
+scipy 1.10.1
+sentencepiece 0.1.99
+setuptools 66.0.0
+six 1.16.0
+sniffio 1.3.0
+sympy 1.12
+threadpoolctl 3.1.0
+tokenizers 0.13.3
+toml 0.10.2
+tomli 2.0.1
+torch 2.0.1
+torchtyping 0.1.4
+tqdm 4.65.0
+transformers 4.30.0.dev0
+triton 2.0.0
+typeguard 2.13.3
+typing_extensions 4.5.0
+typing-inspect 0.8.0
+tzdata 2023.3
+unicodecsv 0.14.1
+unimorph-inflect 0.0.1
+unimorph-inflect 0.0.1
+unimorph-inflect 0.0.1
+urllib3 1.26.16
+wcwidth 0.2.6
+wheel 0.38.4
+xformers 0.0.16
+xxhash 3.2.0
+yarl 1.9.2
diff --git a/src/genbench/tasks/cross_lingual_consistency/task.py b/src/genbench/tasks/cross_lingual_consistency/task.py
index 9559bae..c29756e 100644
--- a/src/genbench/tasks/cross_lingual_consistency/task.py
+++ b/src/genbench/tasks/cross_lingual_consistency/task.py
@@ -1,4 +1,3 @@
-from collections import defaultdict
 from typing import Any, Dict, List, Mapping, Optional
 
 import datasets
@@ -8,6 +7,7 @@ from genbench import Task
 from genbench.api import PreparationStrategy
 
 
+
 class CrossLingualConsistencyTask(Task):
     def _load_data_source(
         self,
@@ -146,80 +146,43 @@ def get_prepared_datasets(
     def evaluate_predictions(
         self,
         *,
-        predictions: List[Mapping[str, Any]] = None,
+        ranked=None,
+        origin=None,
         gold: datasets.Dataset = None,
     ) -> Dict[str, float]:
         def softmax(x):
             """Compute softmax values for each sets of scores in x."""
             return np.exp(x) / np.sum(np.exp(x), axis=0)
 
-        # Make sure that the predictions are in the same order as the gold dataset
-        assert len(predictions) == len(gold)
-
-        # Just to make sure the gold dataset is the same as the one we generated in `get_prepared_datasets`
-        assert "lang" in gold.features
-        assert "_genbnech_idx" in gold.features
-
-        # Also, make sure that predictions contain logprobs for each option
-        assert all(
-            "target_option_logprobs" in pred and len(pred["target_option_logprobs"]) == len(pred["target_options"])
-            for pred in predictions
-        )
-
-        # Group the prediction and instances such that we have:
-        # _genbnech_idx -> {
-        #   "lang_id_1": { ...data_instance..., target_option_logprobs: ... }
-        #   "lang_id_2": { ...data_instance..., target_option_logprobs: ... }
-        # },
-
-        grouped_examples = defaultdict(dict)
-        for pred, gold in zip(predictions, gold):
-            original_idx = gold["_genbnech_idx"]
-            lang = gold["lang"]
-            grouped_examples[original_idx][lang] = {
-                **gold,
-                **pred,
-            }
+        # Split candidates of the two langs
+        lang1_rankings = ranked[: len(ranked) // 2]
+        lang2_rankings = ranked[len(ranked) // 2 :]
+        cand_list1 = [[j for j in list(i.keys())] for i in origin[: len(origin) // 2]]
+        cand_list2 = [[j for j in list(i.keys())] for i in origin[len(origin) // 2 :]]
+
+        num_consistent = 0
 
-        CLC_score = 0
-        count = 0
-        langs = []
-        # Now, we compute the cross lingual consistency score
-        for idx, example in grouped_examples.items():
-            # Rerank the options based on the logprobs
-            for lang, data in example.items():
-                if len(langs) < 2:
-                    langs.append(lang)
-
-                logprobs = data["target_option_logprobs"]
-                sorted_options = sorted(
-                    zip(data["target_options"], logprobs),
-                    key=lambda x: x[1],
-                    reverse=False,
-                )
-                sorted_options, logprobs = zip(*sorted_options)
-                grouped_examples[idx][lang]["target_options"] = list(sorted_options)
-                grouped_examples[idx][lang]["target_option_logprobs"] = list(logprobs)
-
-            # Compute the cross lingual consistency score
-            ranking1 = grouped_examples[idx][langs[0]]["target_options"]
-            ranking2 = grouped_examples[idx][langs[1]]["target_options"]
+        for i in range(len(lang1_rankings)):
+            ranking1 = lang1_rankings[i]
+            ranking2 = lang2_rankings[i]
+
+            candidate1 = cand_list1[i]
+            candidate2 = cand_list2[i]
 
             order = [len(ranking1) - i for i in range(len(ranking1))]
             order = np.array(order)
+
             weight = softmax(order)
 
             for j in range(len(ranking1)):
-                set1 = {ranking1.index(i) for i in ranking1[: j + 1]}
-                set2 = {ranking2.index(i) for i in ranking2[: j + 1]}
+                set1 = {candidate1.index(i) for i in ranking1[: j + 1]}
+                set2 = {candidate2.index(i) for i in ranking2[: j + 1]}
 
                 cover = set1.intersection(set2)
-                CLC_score += weight[j] * (len(cover) / len(set1))
-
-            count += 1
-        CLC_score /= count
+                num_consistent += weight[j] * (len(cover) / len(set1))
 
         # Compute the final score
+        CLC_score = num_consistent / len(lang1_rankings)
         result = {
             "cross_lingual_consistency": CLC_score,
         }
diff --git a/src/genbench/tasks/cross_lingual_consistency/usage_example.py b/src/genbench/tasks/cross_lingual_consistency/usage_example.py
new file mode 100644
index 0000000..385866e
--- /dev/null
+++ b/src/genbench/tasks/cross_lingual_consistency/usage_example.py
@@ -0,0 +1,168 @@
+import numpy as np
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForMaskedLM,
+    AutoTokenizer,
+    MT5ForConditionalGeneration,
+    T5ForConditionalGeneration,
+    XGLMTokenizer,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+# No need for changing this function
+# This function calculates the probabilities of each candidate
+def predict_mask(answer_cand, prompt, mname, lang):
+    answer_pred_probs = dict()
+
+    for answer in answer_cand:
+        answer_cand_probs = []
+
+        if (
+            "t5" not in mname
+            and "xglm" not in mname
+            and "opt" not in mname
+            and "bloom" not in mname
+            and "llama" not in mname
+            and "gpt" not in mname
+        ):
+            answer_tokens = tokenizer(answer)["input_ids"][1:-1]
+
+            if "xlm-roberta" in mname and answer_tokens[0] == 6 and lang == "zh":
+                answer_tokens = answer_tokens[1:]
+
+            new_mask = [""] * len(answer_tokens)
+
+            if lang == "zh":
+                new_mask = "".join(new_mask)
+            else:
+                new_mask = " ".join(new_mask)
+
+            prompt_new = prompt.replace("", new_mask)
+            prompt_new = prompt_new.replace("", tokenizer.mask_token)
+
+            for j, w_idx in enumerate(answer_tokens):
+                model_inputs = tokenizer(prompt_new, return_tensors="pt").to(device)
+                model_outputs = model(**model_inputs)
+                input_ids = model_inputs["input_ids"][0]
+                outputs = model_outputs["logits"]
+                masked_index = torch.nonzero(input_ids == tokenizer.mask_token_id, as_tuple=False)
+
+                logits = outputs[0, masked_index[0].item(), :]
+                probs = logits.softmax(dim=-1).detach().cpu().numpy()
+                answer_cand_probs.append(-np.log(probs[w_idx]))
+
+                pos = prompt_new.find(tokenizer.mask_token)
+                prompt_new = (
+                    prompt_new[:pos]
+                    + tokenizer.convert_ids_to_tokens(w_idx)
+                    + prompt_new[pos + len(tokenizer.mask_token) :]
+                )
+
+            answer_pred_probs[answer] = np.mean(answer_cand_probs)
+
+        elif "xglm" in mname or "opt" in mname or "bloom" in mname or "llama" in mname or "gpt" in mname:
+            prompt_new = prompt.replace("", answer)
+
+            model_input = tokenizer(prompt_new, return_tensors="pt").to(device)
+            output = model(**model_input)
+
+            if lang == "zh":
+                logits = output["logits"][0, :-1]
+                token_ids = model_input["input_ids"][0, 1:]
+            else:
+                logits = output["logits"][0, :-2]
+                token_ids = model_input["input_ids"][0, 1:-1]
+
+            answer_pred_probs[answer] = float(torch.nn.CrossEntropyLoss(reduction="mean")(logits, token_ids))
+        else:
+            input_ids = tokenizer(prompt.replace("", ""), return_tensors="pt").input_ids.to(device)
+            labels = tokenizer(" " + answer + " ", return_tensors="pt").input_ids.to(device)
+            target_ids = labels[0][1:-2]
+
+            outputs = model(input_ids=input_ids, labels=labels).logits
+            masked_index = torch.tensor(list(range(outputs.size()[1]))[1:-2])
+
+            for idx, t_idx in zip(masked_index, target_ids):
+                logits = outputs[0, idx.item(), :]
+                probs = logits.softmax(dim=-1).detach().cpu().numpy()
+                answer_cand_probs.append(-np.log(probs[t_idx]))
+
+            answer_pred_probs[answer] = np.mean(answer_cand_probs)
+
+    return answer_pred_probs
+
+
+cross_ling_const_task = load_task("cross_lingual_consistency")
+
+# Variables (Input dataset, langs and model)
+mini = True  # True for BMLAMA-17; False for BMLAMA-53
+lang1 = "en"
+lang2 = "es"
+mname = "bigscience/bloom-3b"
+
+# Setup dataset
+# The dataset length is num_languages * num_instances_per_language
+ds = cross_ling_const_task.get_prepared_datasets(
+    preparation_strategy=PreparationStrategy.PROMPT_BASED_TESTING,
+    mini=mini,
+    lang1=lang1,
+    lang2=lang2,
+)
+
+
+# Setup model & tokenizer
+if "xglm" in mname or "opt" in mname or "bloom" in mname or "llama" in mname or "gpt" in mname:
+    model = AutoModelForCausalLM.from_pretrained(mname)
+elif "google/mt5" in mname:
+    model = MT5ForConditionalGeneration.from_pretrained(mname)
+elif "t5" in mname:
+    model = T5ForConditionalGeneration.from_pretrained(mname)
+else:
+    model = AutoModelForMaskedLM.from_pretrained(mname)
+
+if "xglm" in mname:
+    tokenizer = XGLMTokenizer.from_pretrained(mname)
+else:
+    tokenizer = AutoTokenizer.from_pretrained(mname)
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print("Runing on:" + device)
+print()
+model = model.to(device)
+
+# Store the ranked candidates
+ranked_keys_list = []
+# Store the original candidates
+origin_keys_list = []
+
+# Reranking based on prob
+for i, data in enumerate(ds):
+    # The form of each data:
+    # {
+    #     "input": "The capital of Canada ",
+    #     "target": "Ottawa",
+    #     "target_options": [
+    #         "Beijing",
+    #         "Tokyo",
+    #         "Ottawa",
+    #     ],
+    #     "_genbnech_idx":
+    # }
+
+    logprobs = []
+    lang = lang1 if i < len(ds) / 2 else lang2
+
+    answer_pred_probs = predict_mask(data["target_options"], data["input"], mname, lang)
+    origin_keys_list.append(answer_pred_probs)
+
+    sorted_probs = sorted(answer_pred_probs.items(), key=lambda x: x[1], reverse=False)
+    ranked_keys = [x[0] for x in sorted_probs]
+    ranked_keys_list.append(ranked_keys)
+
+# Evaluate Cross Lingual Consistency:
+results = cross_ling_const_task.evaluate_predictions(ranked=ranked_keys_list, origin=origin_keys_list)
+print(results)