diff --git a/src/genbench/tasks/latent_feature_splits/__init__.py b/src/genbench/tasks/latent_feature_splits/__init__.py
new file mode 100644
index 0000000..8ceca21
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class LatentFeatureSplits(TaskDict):
+    pass
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/__init__.py b/src/genbench/tasks/latent_feature_splits/bert_closest_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet b/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet
new file mode 100644
index 0000000..d5c8c01
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet
@@ -0,0 +1,57 @@
+{
+    name: 'Latent Feature Splits (bert_closest_split)',
+
+    description: "We split hate speech data based on the internal representations of a BERT model.
+        The o.o.d. data split leads to an under-representation of parts of the latent space in the
+        model's training set, making the split more challenging than a random split.",
+
+    keywords: [
+        'non-i.i.d. generalisation',
+        'o.o.d. generalisation',
+        'latent-features',
+        'hate speech'
+    ],
+
+    authors: [
+        'Maike Züfle',
+        'Verna Dankers',
+        'Ivan Titov',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_bert_closest_split_test_new.jsonl',
+        train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_bert_closest_split_train.jsonl'
+    },
+
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            best_score: 1.0,
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+        },
+        {
+            hf_id: 'f1',
+            average: 'macro',
+            best_score: 1.0,
+            git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+        },
+    ],
+
+    preparation_strategies: {
+        // Prepare the model for the task by fine-tuning it on the training split
+        // with a maximum-likelihood objective.
+        finetuning: {
+            objective: 'maximum_likelihood',
+        }
+    },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md b/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md
new file mode 100644
index 0000000..f345ddb
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection (bert_closest_split)
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when a split is developed with one model and used to evaluate another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to classify a social media post as hate speech, offensive speech, or normal speech.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee, and was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases of this hate speech detection task, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain-test locus).
+The split is based on the feature representations of a language model; we therefore assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
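+
+## Loading the task
+The snippet below is a minimal sketch of loading this split with the GenBench API, mirroring `test_hatespeech.py` from this PR; the field names (`input`, `target`, `target_options`) are those shown in the Examples section above.
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+# Load the BERT-based closest split and prepare it for fine-tuning.
+task = load_task("latent_feature_splits:bert_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+print(ds)             # available splits
+print(ds["test"][0])  # {"input": ..., "target": ..., "target_options": [...]}
+```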
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png b/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png
new file mode 100644
index 0000000..5a6877d
Binary files /dev/null and b/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png differ
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py b/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py
new file mode 100644
index 0000000..b7d322d
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py
@@ -0,0 +1,99 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import datasets
+import evaluate
+
+from genbench import Task
+from genbench.api import TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class LatentFeatureSplitBertClosestSplit(Task):
+    def evaluate_predictions(
+        self,
+        *,
+        predictions: List[Mapping[str, Any]] = None,
+        gold: datasets.Dataset = None,
+    ) -> OrderedDict[str, float]:
+        """Evaluate the predictions of the model against the gold data.
+
+        Args:
+            predictions: A list of dictionaries, where each dictionary contains the predicted
+                values for an example. The keys are strings and the values can be any type.
+            gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+        Returns:
+            A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+            values. The keys are strings representing the name of the evaluation metric and the values are
+            floating-point numbers.
+
+        Raises:
+            ValueError: If a metric returns None.
+        """
+        result = OrderedDict()
+        for metric_config in self.config.evaluation_metrics:
+            hf_id = metric_config.hf_id
+            if isinstance(hf_id, str):
+                hf_id = [hf_id]
+
+            metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+            refs_lst = [g["target"] for g in gold]
+            preds_lst = [pred["target"] for pred in predictions]
+
+            ref_type = type(refs_lst[0])
+            pred_type = type(preds_lst[0])
+            if pred_type != ref_type:
+                if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+                    raise ValueError(
+                        f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+                    )
+                # Convert predictions to the same type as the references
+                if pred_type == str and ref_type == int:
+                    logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+                    converted_preds = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                    preds_lst = converted_preds
+                elif pred_type == int and ref_type == str:
+                    logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    refs_lst = converted_refs
+            else:
+                if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+                    # Convert both predictions and references to int
+                    logger.warning(
+                        "Predictions and references have the same type, but it is not int. Converting both to int."
+                    )
+                    converted_preds = []
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    preds_lst = converted_preds
+                    refs_lst = converted_refs
+
+            extra_kwargs = metric_config.compute_extra_kwargs or {}
+            output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+            if output is None:
+                raise ValueError(
+                    f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+                )
+
+            # Update output keys to include the metric id
+            metric_id = "_".join(hf_id)
+            output = {f"hf_{metric_id}__{k}": v for k, v in output.items()}
+
+            result.update(output)
+
+        return result
diff --git a/src/genbench/tasks/latent_feature_splits/config.jsonnet b/src/genbench/tasks/latent_feature_splits/config.jsonnet
new file mode 100644
index 0000000..ef4f553
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/config.jsonnet
@@ -0,0 +1,58 @@
+{
+    name: 'Latent Feature Splits',
+
+    description: "We split hate speech data based on the internal representations of a RoBERTa model.
+        The o.o.d. data split leads to an under-representation of parts of the latent space in the
+        model's training set, making the split more challenging than a random split.",
+
+    keywords: [
+        'non-i.i.d. generalisation',
+        'o.o.d. generalisation',
+        'latent-features',
+        'hate speech'
+    ],
+
+    authors: [
+        'Maike Züfle',
+        'Verna Dankers',
+        'Ivan Titov',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_test.jsonl',
+        train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_train.jsonl'
+    },
+
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            best_score: 1.0,
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+        },
+        {
+            hf_id: 'f1',
+            average: 'macro',
+            best_score: 1.0,
+            git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+        },
+    ],
+
+    preparation_strategies: {
+        // Prepare the model for the task by fine-tuning it on the training split
+        // with a maximum-likelihood objective.
+        finetuning: {
+            objective: 'maximum_likelihood',
+        }
+    },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/doc.md b/src/genbench/tasks/latent_feature_splits/doc.md
new file mode 100644
index 0000000..d51a56e
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when a split is developed with one model and used to evaluate another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to classify a social media post as hate speech, offensive speech, or normal speech.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee, and was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases of this hate speech detection task, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain-test locus).
+The split is based on the feature representations of a language model; we therefore assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
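+
+## Loading the splits
+A minimal sketch of iterating over the two splits shipped with this task, using the `load_task("<task>:<subtask>")` convention from `usage_example.py`; the prepared datasets are assumed to behave like a `DatasetDict`, as they do in that script.
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+for split_name in ["bert_closest_split", "roberta_closest_split"]:
+    task = load_task(f"latent_feature_splits:{split_name}")
+    ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+    # Print the number of examples per split (e.g. "train" and "test").
+    print(split_name, {name: len(split) for name, split in ds.items()})
+```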
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/__init__.py b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet
new file mode 100644
index 0000000..d30afa0
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet
@@ -0,0 +1,57 @@
+{
+    name: 'Latent Feature Splits (roberta_closest_split)',
+
+    description: "We split hate speech data based on the internal representations of a RoBERTa model.
+        The o.o.d. data split leads to an under-representation of parts of the latent space in the
+        model's training set, making the split more challenging than a random split.",
+
+    keywords: [
+        'non-i.i.d. generalisation',
+        'o.o.d. generalisation',
+        'latent-features',
+        'hate speech'
+    ],
+
+    authors: [
+        'Maike Züfle',
+        'Verna Dankers',
+        'Ivan Titov',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_test.jsonl',
+        train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_train.jsonl'
+    },
+
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            best_score: 1.0,
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+        },
+        {
+            hf_id: 'f1',
+            average: 'macro',
+            best_score: 1.0,
+            git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+        },
+    ],
+
+    preparation_strategies: {
+        // Prepare the model for the task by fine-tuning it on the training split
+        // with a maximum-likelihood objective.
+        finetuning: {
+            objective: 'maximum_likelihood',
+        }
+    },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md
new file mode 100644
index 0000000..0956e1d
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection (roberta_closest_split)
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when a split is developed with one model and used to evaluate another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to classify a social media post as hate speech, offensive speech, or normal speech.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee, and was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases of this hate speech detection task, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain-test locus).
+The split is based on the feature representations of a language model; we therefore assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
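+
+## Loading the task
+A minimal sketch of loading this split and mapping the integer `target` back to a label string; it assumes the prepared dataset keeps the `target_options` field shown in the raw JSONL examples above.
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+task = load_task("latent_feature_splits:roberta_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+example = ds["test"][0]
+# "target" indexes into "target_options", e.g. ["hate", "noHate", "offensive"].
+label = example["target_options"][example["target"]]
+print(example["input"], "->", label)
+```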
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png
new file mode 100644
index 0000000..5a6877d
Binary files /dev/null and b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png differ
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py
new file mode 100644
index 0000000..c6ec3fc
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py
@@ -0,0 +1,99 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import datasets
+import evaluate
+
+from genbench import Task
+from genbench.api import TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class LatentFeatureSplitRobertaClosestSplit(Task):
+    def evaluate_predictions(
+        self,
+        *,
+        predictions: List[Mapping[str, Any]] = None,
+        gold: datasets.Dataset = None,
+    ) -> OrderedDict[str, float]:
+        """Evaluate the predictions of the model against the gold data.
+
+        Args:
+            predictions: A list of dictionaries, where each dictionary contains the predicted
+                values for an example. The keys are strings and the values can be any type.
+            gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+        Returns:
+            A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+            values. The keys are strings representing the name of the evaluation metric and the values are
+            floating-point numbers.
+
+        Raises:
+            ValueError: If a metric returns None.
+        """
+        result = OrderedDict()
+        for metric_config in self.config.evaluation_metrics:
+            hf_id = metric_config.hf_id
+            if isinstance(hf_id, str):
+                hf_id = [hf_id]
+
+            metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+            refs_lst = [g["target"] for g in gold]
+            preds_lst = [pred["target"] for pred in predictions]
+
+            ref_type = type(refs_lst[0])
+            pred_type = type(preds_lst[0])
+            if pred_type != ref_type:
+                if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+                    raise ValueError(
+                        f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+                    )
+                # Convert predictions to the same type as the references
+                if pred_type == str and ref_type == int:
+                    logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+                    converted_preds = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                    preds_lst = converted_preds
+                elif pred_type == int and ref_type == str:
+                    logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    refs_lst = converted_refs
+            else:
+                if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+                    # Convert both predictions and references to int
+                    logger.warning(
+                        "Predictions and references have the same type, but it is not int. Converting both to int."
+                    )
+                    converted_preds = []
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    preds_lst = converted_preds
+                    refs_lst = converted_refs
+
+            extra_kwargs = metric_config.compute_extra_kwargs or {}
+            output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+            if output is None:
+                raise ValueError(
+                    f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+                )
+
+            # Update output keys to include the metric id
+            metric_id = "_".join(hf_id)
+            output = {f"hf_{metric_id}__{k}": v for k, v in output.items()}
+
+            result.update(output)
+
+        return result
diff --git a/src/genbench/tasks/latent_feature_splits/test_hatespeech.py b/src/genbench/tasks/latent_feature_splits/test_hatespeech.py
new file mode 100644
index 0000000..523cad1
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/test_hatespeech.py
@@ -0,0 +1,8 @@
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+task = load_task("latent_feature_splits:bert_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+print(ds)
+print(ds["test"][0])
diff --git a/src/genbench/tasks/latent_feature_splits/usage_example.py b/src/genbench/tasks/latent_feature_splits/usage_example.py
new file mode 100644
index 0000000..8aef633
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/usage_example.py
@@ -0,0 +1,91 @@
+import os
+
+import evaluate
+import numpy as np
+from datasets import DatasetDict
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    Trainer,
+    TrainingArguments,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+def compute_metrics(eval_preds):
+    metric = evaluate.load("f1")
+    logits, labels = eval_preds
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels, average="macro")
+
+
+def main(split_name, num_labels, bsz, lr, epochs, checkpoint):
+    """
+    Basic functionality to load data, train and evaluate the model.
+    Args:
+        - split_name: str (bert_closest_split | roberta_closest_split)
+        - num_labels (int)
+        - bsz (int): batch size
+        - lr (float): learning rate
+        - epochs (int): number of epochs
+        - checkpoint (str): should be a valid HF model name
+    """
+
+    def tokenize_function(example):
+        return tokenizer(example["input"])
+
+    # Convert GenBench format to HF dataset format, get devset, preview dataset
+    task = load_task(f"latent_feature_splits:{split_name}")
+    ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+    ds_split = ds["train"].train_test_split(0.1)
+    ds = DatasetDict({"train": ds_split["train"], "validation": ds_split["test"], "test": ds["test"]})
+    ds = ds.rename_column("target", "label")
+    print(ds)
+
+    # Load and preprocess data
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    tokenized_datasets = ds.map(tokenize_function, batch_size=bsz, batched=True)
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # Load model and HF trainer, WITH evaluation during training
+    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
+    training_args = TrainingArguments(
+        "test-trainer",
+        learning_rate=lr,
+        num_train_epochs=epochs,
+        per_device_train_batch_size=bsz,
+        per_device_eval_batch_size=bsz,
+        evaluation_strategy="epoch",
+    )
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+    )
+
+    # Evaluate for random performance level, train, evaluate again
+    predictions = trainer.predict(tokenized_datasets["test"])
+    f1_pre = compute_metrics((predictions.predictions, predictions.label_ids))
+    trainer.train()
+    predictions = trainer.predict(tokenized_datasets["test"])
+    f1_post = compute_metrics((predictions.predictions, predictions.label_ids))
+    print(f"Random f1: {f1_pre}, f1 post-training: {f1_post}")
+
+
+if __name__ == "__main__":
+    os.environ["WANDB_DISABLED"] = "true"
+    split_name = "bert_closest_split"
+    num_labels = 3
+    batch_size = 16
+    lr = 2e-5
+    epochs = 5
+    checkpoint = "bert-base-uncased"
+
+    main(split_name, num_labels, batch_size, lr, epochs, checkpoint)
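For a quick sanity check of the evaluation plumbing in `task.py` without fine-tuning a model, something along these lines should work; the constant-prediction baseline is only illustrative, and the output keys follow the `hf_<metric>__<key>` naming used in `evaluate_predictions`:

```python
from genbench import load_task
from genbench.api import PreparationStrategy

task = load_task("latent_feature_splits:bert_closest_split")
ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)

# Trivial baseline: predict class 0 ("hate") for every test example.
predictions = [{"target": 0} for _ in range(len(ds["test"]))]
scores = task.evaluate_predictions(predictions=predictions, gold=ds["test"])
print(scores)  # e.g. {"hf_accuracy__accuracy": ..., "hf_f1__f1": ...}
```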