diff --git a/src/genbench/tasks/latent_feature_splits/__init__.py b/src/genbench/tasks/latent_feature_splits/__init__.py
new file mode 100644
index 0000000..8ceca21
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class LatentFeatureSplits(TaskDict):
+    pass
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/__init__.py b/src/genbench/tasks/latent_feature_splits/bert_closest_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet b/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet
new file mode 100644
index 0000000..d5c8c01
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet
@@ -0,0 +1,57 @@
+{
+    name: 'Latent Feature Splits (bert_closest_split)',
+
+    description: "We split hate speech data based on the internal representations of a BERT model.
+        The o.o.d. data split leads to an under-representation of parts of the latent space in the
+        model's training set, making the split more challenging than a random split.",
+
+    keywords: [
+        'non-i.i.d. generalisation',
+        'o.o.d. generalisation',
+        'latent-features',
+        'hate speech'
+    ],
+
+    authors: [
+        'Maike Züfle',
+        'Verna Dankers',
+        'Ivan Titov',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_bert_closest_split_test_new.jsonl',
+        train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_bert_closest_split_train.jsonl'
+    },
+
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            best_score: 1.0,
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+        },
+        {
+            hf_id: 'f1',
+            average: 'macro',
+            best_score: 1.0,
+            git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+        },
+    ],
+
+    preparation_strategies: {
+        // Prepare the model for the task by fine-tuning it on the training split
+        // with a maximum-likelihood objective.
+        finetuning: {
+            objective: 'maximum_likelihood',
+        }
+    },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md b/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md
new file mode 100644
index 0000000..f345ddb
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection (bert_closest_split)
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when a split is developed with one model and used to evaluate another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to classify a social media post as hate speech, offensive speech, or normal speech.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee, and was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases of this hate speech detection task, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain-test locus).
+The split is based on the feature representations of a language model; we therefore assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
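+
+## Loading the task
+The snippet below is a minimal sketch of loading this split with the GenBench API, mirroring `test_hatespeech.py` from this PR; the field names (`input`, `target`, `target_options`) are those shown in the Examples section above.
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+# Load the BERT-based closest split and prepare it for fine-tuning.
+task = load_task("latent_feature_splits:bert_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+print(ds)             # available splits
+print(ds["test"][0])  # {"input": ..., "target": ..., "target_options": [...]}
+```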
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png b/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png
new file mode 100644
index 0000000..5a6877d
Binary files /dev/null and b/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png differ
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py b/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py
new file mode 100644
index 0000000..b7d322d
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py
@@ -0,0 +1,99 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import datasets
+import evaluate
+
+from genbench import Task
+from genbench.api import TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class LatentFeatureSplitBertClosestSplit(Task):
+    def evaluate_predictions(
+        self,
+        *,
+        predictions: List[Mapping[str, Any]] = None,
+        gold: datasets.Dataset = None,
+    ) -> OrderedDict[str, float]:
+        """Evaluate the predictions of the model against the gold data.
+
+        Args:
+            predictions: A list of dictionaries, where each dictionary contains the predicted
+                values for an example. The keys are strings and the values can be any type.
+            gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+        Returns:
+            A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+            values. The keys are strings representing the name of the evaluation metric and the values are
+            floating-point numbers.
+
+        Raises:
+            ValueError: If a metric returns None.
+        """
+        result = OrderedDict()
+        for metric_config in self.config.evaluation_metrics:
+            hf_id = metric_config.hf_id
+            if isinstance(hf_id, str):
+                hf_id = [hf_id]
+
+            metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+            refs_lst = [g["target"] for g in gold]
+            preds_lst = [pred["target"] for pred in predictions]
+
+            ref_type = type(refs_lst[0])
+            pred_type = type(preds_lst[0])
+            if pred_type != ref_type:
+                if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+                    raise ValueError(
+                        f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+                    )
+                # Convert predictions to the same type as the references
+                if pred_type == str and ref_type == int:
+                    logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+                    converted_preds = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                    preds_lst = converted_preds
+                elif pred_type == int and ref_type == str:
+                    logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    refs_lst = converted_refs
+            else:
+                if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+                    # Convert both predictions and references to int
+                    logger.warning(
+                        "Predictions and references have the same type, but it is not int. Converting both to int."
+                    )
+                    converted_preds = []
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    preds_lst = converted_preds
+                    refs_lst = converted_refs
+
+            extra_kwargs = metric_config.compute_extra_kwargs or {}
+            output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+            if output is None:
+                raise ValueError(
+                    f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+                )
+
+            # Update output keys to include the metric id
+            metric_id = "_".join(hf_id)
+            output = {f"hf_{metric_id}__{k}": v for k, v in output.items()}
+
+            result.update(output)
+
+        return result
diff --git a/src/genbench/tasks/latent_feature_splits/config.jsonnet b/src/genbench/tasks/latent_feature_splits/config.jsonnet
new file mode 100644
index 0000000..ef4f553
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/config.jsonnet
@@ -0,0 +1,58 @@
+{
+    name: 'Latent Feature Splits',
+
+    description: "We split hate speech data based on the internal representations of a RoBERTa model.
+        The o.o.d. data split leads to an under-representation of parts of the latent space in the
+        model's training set, making the split more challenging than a random split.",
+
+    keywords: [
+        'non-i.i.d. generalisation',
+        'o.o.d. generalisation',
+        'latent-features',
+        'hate speech'
+    ],
+
+    authors: [
+        'Maike Züfle',
+        'Verna Dankers',
+        'Ivan Titov',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_test.jsonl',
+        train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_train.jsonl'
+    },
+
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            best_score: 1.0,
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+        },
+        {
+            hf_id: 'f1',
+            average: 'macro',
+            best_score: 1.0,
+            git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+        },
+    ],
+
+    preparation_strategies: {
+        // Prepare the model for the task by fine-tuning it on the training split
+        // with a maximum-likelihood objective.
+        finetuning: {
+            objective: 'maximum_likelihood',
+        }
+    },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/doc.md b/src/genbench/tasks/latent_feature_splits/doc.md
new file mode 100644
index 0000000..d51a56e
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when a split is developed with one model and used to evaluate another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to classify a social media post as hate speech, offensive speech, or normal speech.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee, and was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases of this hate speech detection task, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain-test locus).
+The split is based on the feature representations of a language model; we therefore assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
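+
+## Loading the splits
+A minimal sketch of iterating over the two splits shipped with this task, using the `load_task("<task>:<subtask>")` convention from `usage_example.py`; the prepared datasets are assumed to behave like a `DatasetDict`, as they do in that script.
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+for split_name in ["bert_closest_split", "roberta_closest_split"]:
+    task = load_task(f"latent_feature_splits:{split_name}")
+    ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+    # Print the number of examples per split (e.g. "train" and "test").
+    print(split_name, {name: len(split) for name, split in ds.items()})
+```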
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/__init__.py b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet
new file mode 100644
index 0000000..d30afa0
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet
@@ -0,0 +1,57 @@
+{
+    name: 'Latent Feature Splits (roberta_closest_split)',
+
+    description: "We split hate speech data based on the internal representations of a RoBERTa model.
+        The o.o.d. data split leads to an under-representation of parts of the latent space in the
+        model's training set, making the split more challenging than a random split.",
+
+    keywords: [
+        'non-i.i.d. generalisation',
+        'o.o.d. generalisation',
+        'latent-features',
+        'hate speech'
+    ],
+
+    authors: [
+        'Maike Züfle',
+        'Verna Dankers',
+        'Ivan Titov',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_test.jsonl',
+        train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_train.jsonl'
+    },
+
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            best_score: 1.0,
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+        },
+        {
+            hf_id: 'f1',
+            average: 'macro',
+            best_score: 1.0,
+            git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+        },
+    ],
+
+    preparation_strategies: {
+        // Prepare the model for the task by fine-tuning it on the training split
+        // with a maximum-likelihood objective.
+        finetuning: {
+            objective: 'maximum_likelihood',
+        }
+    },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md
new file mode 100644
index 0000000..0956e1d
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection (roberta_closest_split)
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when a split is developed with one model and used to evaluate another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to classify a social media post as hate speech, offensive speech, or normal speech.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee, and was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases of this hate speech detection task, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain-test locus).
+The split is based on the feature representations of a language model; we therefore assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
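+
+## Loading the task
+A minimal sketch of loading this split and mapping the integer `target` back to a label string; it assumes the prepared dataset keeps the `target_options` field shown in the raw JSONL examples above.
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+task = load_task("latent_feature_splits:roberta_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+example = ds["test"][0]
+# "target" indexes into "target_options", e.g. ["hate", "noHate", "offensive"].
+label = example["target_options"][example["target"]]
+print(example["input"], "->", label)
+```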
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png
new file mode 100644
index 0000000..5a6877d
Binary files /dev/null and b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png differ
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py
new file mode 100644
index 0000000..c6ec3fc
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py
@@ -0,0 +1,99 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import datasets
+import evaluate
+
+from genbench import Task
+from genbench.api import TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class LatentFeatureSplitRobertaClosestSplit(Task):
+    def evaluate_predictions(
+        self,
+        *,
+        predictions: List[Mapping[str, Any]] = None,
+        gold: datasets.Dataset = None,
+    ) -> OrderedDict[str, float]:
+        """Evaluate the predictions of the model against the gold data.
+
+        Args:
+            predictions: A list of dictionaries, where each dictionary contains the predicted
+                values for an example. The keys are strings and the values can be any type.
+            gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+        Returns:
+            A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+            values. The keys are strings representing the name of the evaluation metric and the values are
+            floating-point numbers.
+
+        Raises:
+            ValueError: If a metric returns None.
+        """
+        result = OrderedDict()
+        for metric_config in self.config.evaluation_metrics:
+            hf_id = metric_config.hf_id
+            if isinstance(hf_id, str):
+                hf_id = [hf_id]
+
+            metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+            refs_lst = [g["target"] for g in gold]
+            preds_lst = [pred["target"] for pred in predictions]
+
+            ref_type = type(refs_lst[0])
+            pred_type = type(preds_lst[0])
+            if pred_type != ref_type:
+                if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+                    raise ValueError(
+                        f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+                    )
+                # Convert predictions to the same type as the references
+                if pred_type == str and ref_type == int:
+                    logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+                    converted_preds = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                    preds_lst = converted_preds
+                elif pred_type == int and ref_type == str:
+                    logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    refs_lst = converted_refs
+            else:
+                if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+                    # Convert both predictions and references to int
+                    logger.warning(
+                        "Predictions and references have the same type, but it is not int. Converting both to int."
+                    )
+                    converted_preds = []
+                    converted_refs = []
+                    for pred, ref in zip(preds_lst, gold):
+                        assert "target_options" in ref
+                        converted_preds.append(ref["target_options"].index(pred))
+                        converted_refs.append(ref["target_options"].index(ref["target"]))
+                    preds_lst = converted_preds
+                    refs_lst = converted_refs
+
+            extra_kwargs = metric_config.compute_extra_kwargs or {}
+            output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+            if output is None:
+                raise ValueError(
+                    f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+                )
+
+            # Update output keys to include the metric id
+            metric_id = "_".join(hf_id)
+            output = {f"hf_{metric_id}__{k}": v for k, v in output.items()}
+
+            result.update(output)
+
+        return result
diff --git a/src/genbench/tasks/latent_feature_splits/test_hatespeech.py b/src/genbench/tasks/latent_feature_splits/test_hatespeech.py
new file mode 100644
index 0000000..523cad1
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/test_hatespeech.py
@@ -0,0 +1,8 @@
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+task = load_task("latent_feature_splits:bert_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+print(ds)
+print(ds["test"][0])
diff --git a/src/genbench/tasks/latent_feature_splits/usage_example.py b/src/genbench/tasks/latent_feature_splits/usage_example.py
new file mode 100644
index 0000000..8aef633
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/usage_example.py
@@ -0,0 +1,91 @@
+import os
+
+import evaluate
+import numpy as np
+from datasets import DatasetDict
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    Trainer,
+    TrainingArguments,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+def compute_metrics(eval_preds):
+    metric = evaluate.load("f1")
+    logits, labels = eval_preds
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels, average="macro")
+
+
+def main(split_name, num_labels, bsz, lr, epochs, checkpoint):
+    """
+    Basic functionality to load data, train and evaluate the model.
+    Args:
+        - split_name: str (bert_closest_split | roberta_closest_split)
+        - num_labels (int)
+        - bsz (int): batch size
+        - lr (float): learning rate
+        - epochs (int): number of epochs
+        - checkpoint (str): should be a valid HF model name
+    """
+
+    def tokenize_function(example):
+        return tokenizer(example["input"])
+
+    # Convert GenBench format to HF dataset format, get devset, preview dataset
+    task = load_task(f"latent_feature_splits:{split_name}")
+    ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+    ds_split = ds["train"].train_test_split(0.1)
+    ds = DatasetDict({"train": ds_split["train"], "validation": ds_split["test"], "test": ds["test"]})
+    ds = ds.rename_column("target", "label")
+    print(ds)
+
+    # Load and preprocess data
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    tokenized_datasets = ds.map(tokenize_function, batch_size=bsz, batched=True)
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    # Load model and HF trainer, WITH evaluation during training
+    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
+    training_args = TrainingArguments(
+        "test-trainer",
+        learning_rate=lr,
+        num_train_epochs=epochs,
+        per_device_train_batch_size=bsz,
+        per_device_eval_batch_size=bsz,
+        evaluation_strategy="epoch",
+    )
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["validation"],
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+    )
+
+    # Evaluate for random performance level, train, evaluate again
+    predictions = trainer.predict(tokenized_datasets["test"])
+    f1_pre = compute_metrics((predictions.predictions, predictions.label_ids))
+    trainer.train()
+    predictions = trainer.predict(tokenized_datasets["test"])
+    f1_post = compute_metrics((predictions.predictions, predictions.label_ids))
+    print(f"Random f1: {f1_pre}, f1 post-training: {f1_post}")
+
+
+if __name__ == "__main__":
+    os.environ["WANDB_DISABLED"] = "true"
+    split_name = "bert_closest_split"
+    num_labels = 3
+    batch_size = 16
+    lr = 2e-5
+    epochs = 5
+    checkpoint = "bert-base-uncased"
+
+    main(split_name, num_labels, batch_size, lr, epochs, checkpoint)
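For a quick sanity check of the evaluation plumbing in `task.py` without fine-tuning a model, something along these lines should work; the constant-prediction baseline is only illustrative, and the output keys follow the `hf_<metric>__<key>` naming used in `evaluate_predictions`:

```python
from genbench import load_task
from genbench.api import PreparationStrategy

task = load_task("latent_feature_splits:bert_closest_split")
ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)

# Trivial baseline: predict class 0 ("hate") for every test example.
predictions = [{"target": 0} for _ in range(len(ds["test"]))]
scores = task.evaluate_predictions(predictions=predictions, gold=ds["test"])
print(scores)  # e.g. {"hf_accuracy__accuracy": ..., "hf_f1__f1": ...}
```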