diff --git a/.github/workflows/task_submission_ci.yml b/.github/workflows/task_submission_ci.yml
index 8f5404e..0539a7a 100644
--- a/.github/workflows/task_submission_ci.yml
+++ b/.github/workflows/task_submission_ci.yml
@@ -59,7 +59,7 @@ jobs:
- name: Parse the Task ID from PR's title
id: pr_task_id
run: |
- task_id=$(echo '${{ github.event.pull_request.title }}' | sed -n -e 's/^\[Task Submission\][[:alnum:][:space:]()]\+[[:space:]]*(`\([^`]*\)`)[[:space:]]*.*/\1/p')
+ task_id=$(echo '${{ github.event.pull_request.title }}' | sed -n -e 's/^\[Task Submission\][[:alnum:][:space:]()_-]\+[[:space:]]*(`\([^`]*\)`)[[:space:]]*.*/\1/p')
echo "Task ID: $task_id"
echo "task_id=$task_id" >> $GITHUB_OUTPUT
shell: bash
@@ -111,4 +111,4 @@ jobs:
- name: Test Task
run: |
- genbench-cli test-task -i ${{ steps.pr_task_id.outputs.task_id }} --tests-dir ./tests
\ No newline at end of file
+ genbench-cli test-task -i ${{ steps.pr_task_id.outputs.task_id }} --tests-dir ./tests
diff --git a/setup.py b/setup.py
index 30daf88..1d0752d 100644
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,7 @@
# Numpy is needed for some of HF's metrics
"numpy",
"typing_extensions>=4.6",
+ "statsmodels>=0.14",
]
diff --git a/src/genbench/tasks/europarl_dbca_splits/__init__.py b/src/genbench/tasks/europarl_dbca_splits/__init__.py
new file mode 100644
index 0000000..eecdf60
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class EuroparlDbcaSplits(TaskDict):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/_base_task.py b/src/genbench/tasks/europarl_dbca_splits/_base_task.py
new file mode 100644
index 0000000..3e4be76
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/_base_task.py
@@ -0,0 +1,116 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import evaluate
+import numpy as np
+from datasets import Dataset
+
+from genbench import Task
+from genbench.api import EvaluationResult, TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class BaseDbcaTask(Task):
+ """This task evaluates how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).
+ """
+
+ def evaluate_predictions(
+ self,
+ *,
+ predictions: List[Mapping[str, Any]] = None,
+ gold: Dataset = None,
+ ) -> EvaluationResult:
+ result = OrderedDict()
+ for metric_config in self.config.evaluation_metrics:
+ hf_id = metric_config.hf_id
+ if isinstance(hf_id, str):
+ hf_id = [hf_id]
+
+ metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+ refs_lst = [g["target"] for g in gold]
+ preds_lst = [pred["target"] for pred in predictions]
+
+ ref_type = type(refs_lst[0])
+ pred_type = type(preds_lst[0])
+ if pred_type != ref_type:
+ if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+ raise ValueError(
+ f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+ )
+ # Convert predictions to the same type as the references
+ if pred_type == str and ref_type == int:
+ logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+ converted_preds = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ preds_lst = converted_preds
+ elif pred_type == int and ref_type == str:
+ logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ refs_lst = converted_refs
+ else:
+ if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+ # Convert both predictions and references to int
+ logger.warning(
+ "Predictions and references have the same type, but it is not int. Converting both to int."
+ )
+ converted_preds = []
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ preds_lst = converted_preds
+ refs_lst = converted_refs
+
+ extra_kwargs = metric_config.compute_extra_kwargs or {}
+ output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+ if output is None:
+ raise ValueError(
+ f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+ )
+
+ # Keep only the aggregate 'score' value and prefix its key with the metric id
+ metric_id = "_".join(hf_id)
+ output = {f"hf_{metric_id}__{k}": v for k, v in output.items() if k == "score"}
+
+ result.update(output)
+
+ return result
+
+ def chernoff_coef(self, vec1, vec2, alpha):
+ """
+ The Chernoff coefficient c is a similarity measure C_{alpha}(P||Q)
+ = sum_k[p_k^alpha * q_k^(1-alpha)] in [0,1] between two (probability)
+ distributions P and Q. The alpha parameter determines if we want to
+ measure whether Q includes elements that are not in P.
+ """
+ if alpha < 0 or alpha > 1:
+ raise ValueError("alpha must be in [0,1]")
+ # use log to avoid underflow
+ return np.sum(np.exp((np.log(vec1) * alpha) + (np.log(vec2) * (1 - alpha))), axis=1)
+
+ def normalize_vector(self, vector):
+ """Normalize a vector to have sum 1."""
+ return np.nan_to_num(np.divide(vector, np.sum(vector)))
+
+ def divergence(self, vec1, vec2, alpha):
+ """
+ Calculate divergence between two vectors.
+ Atom divergence is 1 - Chernoff coefficient, with alpha=0.5.
+ Compound divergence is 1 - Chernoff coefficient, with alpha=0.1.
+ """
+ return float(1 - self.chernoff_coef(self.normalize_vector(vec1), self.normalize_vector(vec2), alpha))
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/config.jsonnet
new file mode 100644
index 0000000..4c9d9bd
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_de)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_de'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/doc.md
new file mode 100644
index 0000000..c6e1e28
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_de)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/task.py
new file mode 100644
index 0000000..898b036
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0De(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/config.jsonnet
new file mode 100644
index 0000000..c8b975b
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_el)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_el'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/doc.md
new file mode 100644
index 0000000..f880163
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_el)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/task.py
new file mode 100644
index 0000000..1124f49
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0El(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/config.jsonnet
new file mode 100644
index 0000000..e97f2bd
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_fi)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_fi'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/doc.md
new file mode 100644
index 0000000..31a0e0d
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_fi)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/task.py
new file mode 100644
index 0000000..7bf9f32
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0Fi(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/config.jsonnet
new file mode 100644
index 0000000..0cf8db9
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_fr)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_fr'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/doc.md
new file mode 100644
index 0000000..79e7f71
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_fr)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/task.py
new file mode 100644
index 0000000..943fe65
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0Fr(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/config.jsonnet
new file mode 100644
index 0000000..837e681
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/config.jsonnet
@@ -0,0 +1,44 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_de)',
+
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_de'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/doc.md
new file mode 100644
index 0000000..58415ce
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_de)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/task.py
new file mode 100644
index 0000000..3b9ec0a
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1De(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/config.jsonnet
new file mode 100644
index 0000000..f6be560
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/config.jsonnet
@@ -0,0 +1,44 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_el)',
+
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_el'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/doc.md
new file mode 100644
index 0000000..90b6a6b
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_el)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/task.py
new file mode 100644
index 0000000..7fcf724
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1El(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/config.jsonnet
new file mode 100644
index 0000000..76976df
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_fi)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_fi'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/doc.md
new file mode 100644
index 0000000..0c5f258
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_fi)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/task.py
new file mode 100644
index 0000000..8fc677b
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1Fi(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/config.jsonnet
new file mode 100644
index 0000000..6d095f4
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_fr)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_fr'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/doc.md
new file mode 100644
index 0000000..eda471f
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_fr)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/task.py
new file mode 100644
index 0000000..8e27ac1
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1Fr(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/config.jsonnet
new file mode 100644
index 0000000..9b01c57
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/config.jsonnet
@@ -0,0 +1,28 @@
+{
+ name: 'Divergent DepRel Distributions',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (lemma, dependency relation, lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ subtasks_order: [
+ 'comdiv0_de',
+ 'comdiv1_de',
+ 'comdiv0_fr',
+ 'comdiv1_fr',
+ 'comdiv0_el',
+ 'comdiv1_el',
+ 'comdiv0_fi',
+ 'comdiv1_fi',
+ ],
+}
diff --git a/src/genbench/tasks/europarl_dbca_splits/doc.md b/src/genbench/tasks/europarl_dbca_splits/doc.md
new file mode 100644
index 0000000..a32d0bf
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/doc.md
@@ -0,0 +1,64 @@
+# Train-test data splits of the Europarl NMT corpus with divergent distributions of dependency relations
+## Abstract
+Compositional generalisation (CG), in NLP and in machine learning generally, has been assessed mostly using artificial datasets. It is important to develop benchmarks to assess CG also in real-world natural language tasks in order to understand the abilities and limitations of systems that are deployed in the wild. In our GenBench Collaborative Benchmarking Task submission, we utilise the distribution-based compositionality assessment (DBCA) framework to split the Europarl translation corpus into a training and test set in a way that translating the test set requires compositional generalisation capacity. Specifically, the training and test sets have divergent distributions of dependency relations, testing the NMT system's capacity to translate dependencies that it has not been trained on.
+
+
+## Examples
+The task is simply sentence-level translation, e.g.:
+```
+"input": "If the House agrees, I shall do as Mr Evans has suggested.", "target": "Jos parlamentin jäsenet kannattavat sitä, teen niin kuin jäsen Evans ehdotti."
+```
+
+
+## Usage
+To use the provided maximum-compound-divergence train-test split for a target language (German=de, French=fr, Greek=el, Finnish=fi), load the data, train a model on the training subset, and evaluate the model's predictions on the test subset:
+```
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+# Load the task
+task = load_task("europarl_dbca_splits")
+ds = task.comdiv1_de.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+# Evaluate predictions
+preds = ...
+print(task.comdiv1_de.evaluate_predictions(
+ predictions=preds,
+ gold=ds['test'],
+ )
+ )
+```
+To compare a model's capacity to generalise, we assess how much the translation accuracy decreases when the compound divergence between train and test sets increases. We keep atom distributions the same between train and test sets to make generalisation possible in principle. This means we should evaluate each model on both low- and high-compound-divergence data splits. To compute the generalisation score as described in the accompanying paper, train two systems on the splits with compound divergence values 0 and 1 (e.g. subtasks "comdiv0_de" and "comdiv1_de"), and take the ratio of the chrF2++ scores: `task.comdiv1_de.evaluate_predictions(predictions_comdiv1_de, gold_comdiv1_de) / task.comdiv0_de.evaluate_predictions(predictions_comdiv0_de, gold_comdiv0_de)`
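+
+For example, assuming predictions and prepared datasets for both subtasks are already available (the variable names below are placeholders), the generalisation score can be computed from the dictionaries returned by `evaluate_predictions`, which use the key `hf_chrf__score` as in `usage_example.py`:
+```
+score_comdiv1 = task.comdiv1_de.evaluate_predictions(predictions=preds_comdiv1, gold=ds_comdiv1['test'])
+score_comdiv0 = task.comdiv0_de.evaluate_predictions(predictions=preds_comdiv0, gold=ds_comdiv0['test'])
+generalisation_score = score_comdiv1['hf_chrf__score'] / score_comdiv0['hf_chrf__score']
+```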
+
+#### Using other data sets
+To compute the atom and compound divergences for any pair of training (pre-training, training and/or fine-tuning) and test data sets, use the method `EuroparlDbcaSplitsComdiv0De.divergence`. To create the atom and compound distributions of the training and test sets, first count the frequencies of each atom and compound in each set. The vectors representing the atom and compound distributions of the train/test sets are then passed to the method to calculate the divergences:
+```
+import numpy as np
+# alpha is 0.5 for atom divergence and 0.1 for compound divergence
+train_set_atom_distribution = np.array([[2,4,10]])
+test_set_atom_distribution = np.array([[1,2,5]])
+atom_divergence = task.comdiv1_de.divergence(train_set_atom_distribution,
+ test_set_atom_distribution,
+ 0.5)
+# atom_divergence = 0.0
+
+train_set_compound_distribution = np.array([[2,0,6]])
+test_set_compound_distribution = np.array([[0,5,5]])
+compound_divergence = task.comdiv1_de.divergence(train_set_compound_distribution,
+ test_set_compound_distribution,
+ 0.1)
+# compound_divergence = 0.4793101280037947
+```
+Each element in the distribution vectors represents the frequency of one type of atom/compound.
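+
+As an illustrative sketch (not part of the task code), the aligned frequency vectors could be built from lists of compounds extracted from each set, e.g. with a dependency parser; `train_compounds` and `test_compounds` below are hypothetical lists of (lemma, dependency relation, lemma) tuples:
+```
+from collections import Counter
+import numpy as np
+
+train_counts = Counter(train_compounds)
+test_counts = Counter(test_compounds)
+
+# align the two frequency vectors on the union of compound types
+all_types = sorted(set(train_counts) | set(test_counts))
+train_vec = np.array([[train_counts[t] for t in all_types]])
+test_vec = np.array([[test_counts[t] for t in all_types]])
+
+compound_divergence = task.comdiv1_de.divergence(train_vec, test_vec, 0.1)
+```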
+
+
+## Data Source
+The original data source is `https://opus.nlpl.eu/Europarl.php`
+
+## Limitations and Bias
+Our goal was to create a benchmark that tests generalisation to novel dependency relations in a comprehensive way, not selecting some specific types of dependency relations and leaving out other types. However, memory requirements of the data splitting algorithm did not permit us to use all of the atoms and compounds in the distribution divergence calculations, so we opted to leave out the most frequent and the most infrequent lemmas, and the dependency relations that include them, which probably affects the results.
+
+## GenBench Eval card
+The motivation is primarily intrinsic: it is important to assess whether translation models learn the systematic rules that characterise natural language, in order to gain some understanding of how the models work. Another motivation is practical: compositional generalisation would make the models more robust. The type of generalisation is compositional, and the shift type is covariate, since the input data distribution changes but the task otherwise remains the same. The shift source is partitioned natural data, since we do not use any artificial data, but the train-test split is artificial. Lastly, the shift locus in our experiments is train-test, but the method and benchmark could also be used as a finetune train-test benchmark, by finetuning a pretrained model on the training set.
+
+![GenBench Eval Card](eval_card.png)
diff --git a/src/genbench/tasks/europarl_dbca_splits/eval_card.png b/src/genbench/tasks/europarl_dbca_splits/eval_card.png
new file mode 100644
index 0000000..6f7cd95
Binary files /dev/null and b/src/genbench/tasks/europarl_dbca_splits/eval_card.png differ
diff --git a/src/genbench/tasks/europarl_dbca_splits/requirements-usage-example.txt b/src/genbench/tasks/europarl_dbca_splits/requirements-usage-example.txt
new file mode 100644
index 0000000..765824a
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/requirements-usage-example.txt
@@ -0,0 +1 @@
+transformers==4.35.2
diff --git a/src/genbench/tasks/europarl_dbca_splits/usage_example.py b/src/genbench/tasks/europarl_dbca_splits/usage_example.py
new file mode 100644
index 0000000..c3c9b12
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/usage_example.py
@@ -0,0 +1,193 @@
+"""
+Usage example for the Europarl DBCA splits task.
+
+Training of the NMT model is mostly based on the HuggingFace NLP course chapter on translation:
+https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt
+"""
+import argparse
+
+from datasets import DatasetDict
+from transformers import (
+ DataCollatorForSeq2Seq,
+ FSMTConfig,
+ FSMTForConditionalGeneration,
+ FSMTTokenizer,
+ Seq2SeqTrainer,
+ Seq2SeqTrainingArguments,
+ pipeline,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+def tokenize_corpus(dataset, save_to_file):
+ """
+ Tokenizes the dataset and saves it to disk.
+ """
+
+ def preprocess_function(examples):
+ inputs = examples["input"]
+ targets = examples["target"]
+ model_inputs = tokenizer(inputs, text_target=targets, max_length=MAX_LENGTH, truncation=True)
+ return model_inputs
+
+ dataset = DatasetDict(dataset)
+ tokenized = dataset.map(
+ preprocess_function,
+ batched=True,
+ )
+ tokenized.save_to_disk(save_to_file)
+ return tokenized
+
+
+def translate_sentences(model_name_or_path, eval_dataset):
+ """
+ Translates the sentences in eval_dataset using the given model.
+ """
+ translator = pipeline(
+ "translation",
+ model=model_name_or_path,
+ device="cuda",
+ batch_size=BATCH_SIZE,
+ )
+ return translator(eval_dataset, max_length=MAX_LENGTH)
+
+
+def train_from_scratch(tokenized_corpus, output_dir_name):
+ """
+ Trains an FSMT model from scratch.
+ Model architecture is similar to that in Vaswani et al. (2017).
+ """
+ config = FSMTConfig(
+ activation_dropout=0.0,
+ activation_function="relu",
+ architectures=["FSMTForConditionalGeneration"],
+ attention_dropout=0.1,
+ bos_token_id=0,
+ d_model=512,
+ decoder={"bos_token_id": 2, "model_type": "fsmt_decoder", "vocab_size": 42024},
+ decoder_attention_heads=8,
+ decoder_ffn_dim=2048,
+ decoder_layerdrop=0,
+ decoder_layers=6,
+ decoder_start_token_id=2,
+ dropout=0.1,
+ encoder_attention_heads=8,
+ encoder_ffn_dim=2048,
+ encoder_layerdrop=0,
+ encoder_layers=6,
+ eos_token_id=2,
+ forced_eos_token_id=2,
+ init_std=0.02,
+ is_encoder_decoder=True,
+ langs=["en", "de"],
+ length_penalty=1.15,
+ max_length=MAX_LENGTH,
+ max_position_embeddings=1024,
+ model_type="fsmt",
+ num_beams=5,
+ num_hidden_layers=6,
+ pad_token_id=1,
+ scale_embedding=True,
+ src_vocab_size=42024,
+ tgt_vocab_size=42024,
+ tie_word_embeddings=False,
+ transformers_version="4.35.2",
+ use_cache=True,
+ )
+ model = FSMTForConditionalGeneration(config=config)
+
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+
+ training_args = Seq2SeqTrainingArguments(
+ output_dir=output_dir_name,
+ evaluation_strategy="steps",
+ eval_steps=5000,
+ save_strategy="steps",
+ save_steps=10000,
+ learning_rate=2e-5,
+ per_device_train_batch_size=BATCH_SIZE,
+ per_device_eval_batch_size=BATCH_SIZE,
+ weight_decay=0.01,
+ save_total_limit=10,
+ max_steps=100000,
+ fp16=True,
+ )
+
+ trainer = Seq2SeqTrainer(
+ model,
+ training_args,
+ train_dataset=tokenized_corpus["train"],
+ eval_dataset=tokenized_corpus["validation"],
+ data_collator=data_collator,
+ tokenizer=tokenizer,
+ )
+ trainer.train()
+
+
+if __name__ == "__main__":
+ argparser = argparse.ArgumentParser()
+ argparser.add_argument("--tokenize", action="store_true")
+ argparser.add_argument("--train", action="store_true")
+ argparser.add_argument("--eval", action="store_true")
+ args = argparser.parse_args()
+
+ # Load the task
+ task = load_task("europarl_dbca_splits")
+
+ # A pretrained multilingual tokenizer, used for both models and both languages
+ tokenizer = FSMTTokenizer.from_pretrained("stas/tiny-wmt19-en-de")
+
+ MAX_LENGTH = 128
+ BATCH_SIZE = 128
+
+ results = []
+ # "comdiv0" is the easy non-compositional data split, with minimal compound divergence
+ # "comdiv1" is the difficult, compositional data split, with maximal compound divergence
+ # English-German corpus is used for this example.
+ # For other target languages, replace "de" with "fr", "el", or "fi" in the subtask name.
+ for comdiv in ["0", "1"]:
+ if comdiv == "0":
+ subtask = task.comdiv0_de
+ else:
+ subtask = task.comdiv1_de
+
+ subtask_dataset = subtask.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+ tokenized_dataset_dir = f"ds_de_comdiv{comdiv}_tokenized"
+ if args.tokenize:
+ tokenized_datasets = tokenize_corpus(subtask_dataset, tokenized_dataset_dir)
+ else:
+ tokenized_datasets = DatasetDict.load_from_disk(tokenized_dataset_dir)
+
+ # Extract a validation set from training set
+ train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.01)
+ tokenized_datasets["train"] = train_val_split["train"]
+ tokenized_datasets["validation"] = train_val_split["test"]
+
+ nmt_model_dir = f"FSMT_en-de_comdiv{comdiv}"
+ if args.train:
+ train_from_scratch(tokenized_datasets, nmt_model_dir)
+
+ if args.eval:
+ cp = "checkpoint-100000"
+ print(f"Results for comdiv{comdiv}, checkpoint {cp}")
+ preds = translate_sentences(nmt_model_dir + "/" + cp, tokenized_datasets["test"]["input"])
+
+ # re-map the keys to match the evaluation script
+ preds = [{"target": pred["translation_text"]} for pred in preds]
+
+ score = subtask.evaluate_predictions(
+ predictions=preds,
+ gold=tokenized_datasets["test"],
+ )
+ print(score)
+ results.append(score)
+
+ if args.eval:
+ print(
+ "Generalisation score (maximum compound divergence score divided by "
+ + "minimum compound divergence score):"
+ )
+ print(results[1]["hf_chrf__score"] / results[0]["hf_chrf__score"])
diff --git a/src/genbench/tasks/icl_consistency_test/GenBench_eval_card.pdf b/src/genbench/tasks/icl_consistency_test/GenBench_eval_card.pdf
new file mode 100644
index 0000000..3a1d066
Binary files /dev/null and b/src/genbench/tasks/icl_consistency_test/GenBench_eval_card.pdf differ
diff --git a/src/genbench/tasks/icl_consistency_test/GenBench_eval_card.png b/src/genbench/tasks/icl_consistency_test/GenBench_eval_card.png
new file mode 100644
index 0000000..f6dac18
Binary files /dev/null and b/src/genbench/tasks/icl_consistency_test/GenBench_eval_card.png differ
diff --git a/src/genbench/tasks/icl_consistency_test/__init__.py b/src/genbench/tasks/icl_consistency_test/__init__.py
new file mode 100644
index 0000000..91b47b8
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class IclConsistencyTest(TaskDict):
+ pass
diff --git a/src/genbench/tasks/icl_consistency_test/anli/__init__.py b/src/genbench/tasks/icl_consistency_test/anli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/icl_consistency_test/anli/config.jsonnet b/src/genbench/tasks/icl_consistency_test/anli/config.jsonnet
new file mode 100644
index 0000000..0dded00
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/anli/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'ICL consistency test (anli)',
+
+ description: 'The ICL consistency test measures the consistency of LLM predictions on the same datapoints across many different setups. Different setups are defined by "factors". On the one hand, factors can be specific attributes of the used prompt (e.g. the number of examples the model is presented with ["n_shots"] or the type of instructions that were used to wrap a specific datapoint ["Instructions"]). On the other hand, the analysis can also be augmented by factors that are related to the way a model is evaluated (e.g. whether a model is calibrated) or the type of model that is evaluated (e.g. the number of parameters or instruction tuning). These external factors can be added into the analysis by using the task.add_factor() method. The output metric is Cohen\'s kappa for each factor across all different conditions. A kappa value close to 1 indicates that the factor does not change the model prediction, while a value close to 0 indicates that the factor strongly changes model predictions. This test evaluates the ANLI-dataset (Nie et al., 2019).',
+
+ keywords: [
+ 'consistency',
+ 'LLM',
+ 'robustness',
+ 'in-context learning',
+ 'prompt-based learning',
+ 'icl',
+ 'anli',
+ 'mnli'
+ ],
+
+ authors: [
+ 'Lucas Weber',
+ 'Elia Bruni',
+ 'Dieuwke Hupkes',
+
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://raw.githubusercontent.com/LucWeber/icl_consistency_data/main/data/genbench_all_anli.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: false,
+
+ task_type: 'free_form',
+
+ preparation_strategies: {
+ // A recipe for preparing the model to perform the task by configuring its prompt.
+ // This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
+ // We provide a few options for configuring the prompt. But, the task creator can
+ // also provide a custom prompt preparation in the task's Python class.
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: '',
+ instruction_few_shot: '',
+ input_prefix: '',
+ output_prefix: '',
+ }
+ },
+ },
+}
diff --git a/src/genbench/tasks/icl_consistency_test/anli/doc.md b/src/genbench/tasks/icl_consistency_test/anli/doc.md
new file mode 100644
index 0000000..ab3ff15
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/anli/doc.md
@@ -0,0 +1,130 @@
+# ICL consistency test
+
+The ICL consistency test measures the consistency of LLM predictions on the same datapoints across many different setups.
+Different setups are defined by "factors". On the one hand, factors can be specific attributes of the used prompt (e.g.
+the number of examples the model is presented with ["n_shots"] or the type of instructions that were used to wrap a
+specific datapoint ["Instructions"]). On the other hand, the analysis can also be augmented by factors that are related
+to the way a model is evaluated (e.g. whether a model is calibrated) or the type of model that is evaluated (e.g. the
+number of parameters or instruction tuning). These external factors can be added into the analysis by using the
+task.add_factor() method. The output metric is Cohen's kappa for each factor across all different conditions.
+A kappa value close to 1 indicates that the factor does not change the model prediction, while a value close to 0
+indicates that the factor strongly changes model predictions. The ICL consistency test has two subtasks, one evaluating the ANLI-dataset (Nie et al., 2019);
+the other the MNLI-dataset (Wang et al., 2017).
+
+*Size*: Each subtask contains 57600 datapoints when using the full 600 data\_IDs. The user can choose to reduce the number of evaluated data\_IDs.
+
+## Abstract
+Just like the previous generation of _task-tuned models_ (TT), _large language models_ (LLMs) that are adapted to tasks via prompt-based methods like _in-context-learning_ (ICL) or _instruction tuning_ (IT) perform well in some setups, but not in others. This lack of consistency in prompt-based learning hints at a lack of robust generalisation. We here introduce the ICL consistency test – a contribution to the GenBench CBT – which evaluates how consistently a model makes predictions across many different setups while using the same data. The test is based on different established natural language inference tasks. We provide preprocessed data that constitutes 96 different ‘setups’ and a metric that estimates model consistency across these setups. The metric is provided on a fine-grained level, to understand what properties of a setup render predictions unstable, and on an aggregated level to compare overall model consistency. We conduct an empirical analysis of eight state-of-the-art models and our consistency metric reveals how all tested LLMs lack robust generalisation.
+
+## Examples
+The test evaluates the same datapoints across many different setups to determine the consistency of a model's predictions. Every datapoint has a data\_ID (specifying the original datapoint) and a setup\_ID (with each digit specifying the presence or absence of a factor).
+
+Example with data\_ID - 1120; setup\_ID - id0_0200020:
+```
+The city's name derives from the Greek words "άργυρος" ("árgyros" meaning
+"silver") and "πόλη" ("poli" meaning "city"). The name's older form was
+"Argyroupolis". The first name of the settlement was "New Argyroupolis",
+given by the refugees from Gümüşhane. Using only the above description
+and what you know about the world, "The city's name derives from Greek words."
+is definitely correct, incorrect, or inconclusive?
+
+ANSWER: Correct.
+
+Undead is a 2003 Australian zombie science fiction horror comedy film
+written and directed by Michael and Peter Spierig and starring Felicity
+Mason, Mungo McKay and Rob Jenkins. It was then-relatively-unknown "Good Game"
+presenter Steven O'Donnell's first film role. Using only the above description
+and what you know about the world, "Steven O'Donnell was not a popular actor before
+the 2003 Zombie movie." is definitely correct, incorrect, or inconclusive?
+
+ANSWER: Correct.
+
+Let the Music Do the Talking is the first of four albums by The Joe Perry
+Project. It was their the most successful, selling approximately 250,000
+copies in the United States. The title track was re-recorded by Joe Perry's
+more successful band Aerosmith on their album "Done With Mirrors", albeit
+with a slightly different melody and Steven Tyler penned lyrics. Using only
+the above description and what you know about the world, ""Done With Mirrors"
+was an album by The Joe Perry Project." is definitely correct, incorrect, or
+inconclusive?
+
+ANSWER:
+```
+
+## Usage
+For an example script copy `example_evaluation.py` into your genbench root directory (`/genbench_cbt`) and run it.
+#### Dataloading
+The task can be loaded through the default GenBench interface as a zero-shot task:
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+task = load_task("icl_consistency_test")
+ds = task.get_prepared_datasets(
+ PreparationStrategy.PROMPT_BASED_TESTING,
+ shot_list=[0]
+ )[0]
+```
+#### Evaluation
+Provide the evaluation function with the model outputs as strings, accompanied by the corresponding setup_IDs and data_IDs
+from the original dataset.
+For the predictions, please use the following format:
+
+`predictions: Dict[setup_ID, Dict[data_ID, model_output]]`
+
+For the gold labels, please provide the original dataset ds:
+
+`gold: datasets.Dataset`
+
+With this input, run the task evaluation like so:
+```python
+results = task.evaluate_predictions(predictions=predictions,
+ gold=ds)
+```
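+
+As a minimal illustration of the expected `predictions` structure (the IDs below are placeholders; real setup_IDs and data_IDs come from the dataset's `setup_ID` and `data_ID` columns, and the values are the raw model output strings):
+```python
+predictions = {
+    "id0_0200020": {           # setup_ID
+        "1120": "Correct",     # data_ID -> model output
+        "1121": "Inconclusive",
+    },
+    # ... one inner dictionary per setup_ID
+}
+```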
+
+#### Adding factors
+External factors can be added via the `task.add_factor()` method.
+```python
+predictions = (predictions_factor_absent, predictions_factor_present)
+predictions = task.add_factor(data=predictions,
+ factor='<name_of_factor>')
+```
+where `predictions_factor_absent` and `predictions_factor_present` are dictionaries of the same format as the original
+predictions dictionary.
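+
+For instance (with illustrative predictions and factor name), `add_factor` merges the two dictionaries and appends one digit to every setup_ID, `0` for the factor being absent and `1` for it being present:
+```python
+predictions_factor_absent = {"id0_0200020": {"1120": "Correct"}}     # e.g. base model
+predictions_factor_present = {"id0_0200020": {"1120": "Incorrect"}}  # e.g. instruction-tuned model
+predictions = task.add_factor(data=(predictions_factor_absent, predictions_factor_present),
+                              factor='instruction_tuning')
+# `predictions` now contains the setup_IDs "id0_02000200" and "id0_02000201"
+```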
+
+#### Removing factors
+Factors can be removed from the dataset and the evaluation by using the `task.remove_factor()` method.
+```python
+predictions = task.remove_factor(data=ds,
+ factor='<name_of_factor>')
+```
+where `ds` is the original dataset as obtained by the `task.get_prepared_datasets()` method. Note that removing factors
+will influence the results on all other factors.
+
+## Data Source
+The original data stems from the ANLI dataset (Nie et al., 2019).
+Prompting templates are taken from promptsource (Bach et al., 2022).
+
+## Limitations and Bias
+We identify the following limitations of the consistency test:
+1. The number of factors is limited and does not cover all possible factors that might influence the predictions. We limited ourselves to factors we deem relevant, to ensure fast evaluation.
+
+2. Currently, the test is only implemented for the ANLI-dataset.
+
+3. External factors such as _Instruction tuning_ or _calibration_ have to be manually added by the user using the `task.add_factor()` method.
+
+
+## GenBench Eval card
+This test can be used to test generalisation in LLMs (pretrain - test locus).
+It is designed to better understand how LLMs generalise (intrinsic motivation) and to give practical hints on relevant prompt-design decisions (practical motivation). It can be used to assess robustness.
+
+![GenBench Eval Card](GenBench_eval_card.png)
+
+
+## References
+
+Bach, S. H., Sanh, V., Yong, Z. X., Webson, A., Raffel, C., Nayak, N. V., ... & Rush, A. M. (2022). Promptsource: An integrated development environment and repository for natural language prompts. arXiv preprint arXiv:2202.01279.
+
+Nie, Y., Williams, A., Dinan, E., Bansal, M., Weston, J., & Kiela, D. (2019). Adversarial NLI: A new benchmark for natural language understanding. arXiv preprint arXiv:1910.14599.
+
+Wang, Z., Hamza, W., & Florian, R. (2017, August). Bilateral multi-perspective matching for natural language sentences. In Proceedings of the 26th International Joint Conference on Artificial Intelligence (pp. 4144-4150).
diff --git a/src/genbench/tasks/icl_consistency_test/anli/task.py b/src/genbench/tasks/icl_consistency_test/anli/task.py
new file mode 100644
index 0000000..16d04b9
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/anli/task.py
@@ -0,0 +1,263 @@
+from typing import Any, Dict, Tuple
+
+import datasets
+import pandas as pd
+import statsmodels.api as sm
+from numpy import ndarray
+from pandas import DataFrame
+from sklearn.metrics import cohen_kappa_score
+
+from genbench import Task
+
+
+LABELS = [
+ ["Correct", "True", "Always", "Yes", "Guaranteed", "Duplicates"], # `correct` labels
+ ["Inconclusive", "Possible", "Sometimes", "Maybe", "Neither"], # `neutral` labels
+ ["Impossible", "Never", "Incorrect", "False", "No", "Not Duplicates"], # `incorrect` labels
+]
+
+LABEL_TO_NUMERIC = {}
+LABEL_TO_NUMERIC.update(dict([(label, i) for i, label_subset in enumerate(LABELS) for label in label_subset]))
+LABEL_TO_NUMERIC.update(dict([(label.lower(), i) for i, label_subset in enumerate(LABELS) for label in label_subset]))
+
+factors = [
+ "balanced_labels",
+ "one_label",
+ "cross_task",
+ "cross_instructions",
+ "n_shots",
+ "instructions",
+ "hp_instructions",
+]
+
+
+class IclConsistencyTestAnli(Task):
+ """Python implementation of the ICL consistency test task."""
+
+ def evaluate_predictions(
+ self,
+ *,
+ predictions: Dict[str, Dict[str, Any]],
+ gold: datasets.Dataset,
+ ) -> Dict[str, Any]:
+ """Evaluate the predictions of the model against the gold data.
+ Calculating exact match accuracy plus consistency across all setups (Cohen's kappa).
+
+ Args:
+ predictions: A dictionary of dictionaries, where the keys of the outer dictionary are
+ the setup_IDs and the keys of the inner dictionaries are the data_IDs. The values of the
+ inner dictionaries are the predictions for the examples. The keys are strings and the values can be any type.
+ gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ self._set_factors()
+
+ gold_pandas = gold.to_pandas()
+ gold_pandas["data_ID"] = gold_pandas["data_ID"].astype(str)
+ gold_labels_numeric = gold_pandas.set_index("data_ID")["target_numeric"].to_dict()
+
+ results_df = self._create_df(predictions, gold_labels_numeric)
+ results_df = results_df.sort_values(by=["setup_ID", "data_ID"])
+ self._assert_equal_data_ids(results_df)
+
+ # Compute the accuracy for each setup
+ accuracies, setup_IDs, setups_by_factor = [], [], []
+ for setup_ID, setup_predictions in results_df.groupby("setup_ID"):
+ accuracy = (setup_predictions["predictions_numeric"] == setup_predictions["target_numeric"]).mean()
+
+ accuracies.append(accuracy)
+ setup_IDs.append(setup_ID)
+ setups_by_factor.append(setup_predictions[self.factors].head(1))
+
+ accuracies_df = DataFrame({"setup_ID": setup_IDs, "accuracy": accuracies})
+ setups_by_factor_df = pd.concat(setups_by_factor, ignore_index=True)
+
+ # Compute main effects for each factor
+ betas, p_values = [], []
+ for factor in self.factors:
+ X = setups_by_factor_df[factor].to_numpy(dtype=int) # X is binary and states if a factor is present or not
+ y = accuracies_df["accuracy"].to_numpy(dtype=float) # y are the acc. scores of the respective setups
+ mask = X != 2 # create mask to ignore setups that are irrelevant to factor (coded as X == 2)
+
+ # fit GLM
+ beta, p_value = self._calculate_main_effects(X[mask], y[mask])
+
+ betas.append(beta)
+ p_values.append(p_value)
+
+ main_effects_df = DataFrame({"factor": self.factors, "beta": betas, "p_value": p_values})
+
+ # Compute Cohen's kappa for consistency
+ kappas = []
+ for factor in self.factors:
+ factor_present = results_df.loc[results_df[factor] == "1"]["predictions_numeric"]
+ factor_absent = results_df.loc[results_df[factor] == "0"]["predictions_numeric"]
+
+ # mask out predictions that are out-of-label-distribution
+ mask = [(f1 != -1 and f2 != -1) for f1, f2 in zip(factor_absent, factor_present)]
+ factor_present, factor_absent = factor_present[mask], factor_absent[mask]
+
+ kappas.append(cohen_kappa_score(factor_present, factor_absent))
+
+ kappas_df = DataFrame({"factor": self.factors, "kappa": kappas})
+
+ # Calculate average kappa
+ kappa_avg = kappas_df["kappa"].mean()
+
+ # Return the evaluation metrics.
+ return {
+ "accuracy": accuracies_df,
+ "main_effects": main_effects_df,
+ "kappas": kappas_df,
+ "kappa_avg": kappa_avg,
+ }
+
+ def add_factor(self, data: Tuple[Dict, Dict], factor: str) -> Dict[str, Dict[str, Any]]:
+ """Concatenate the data with the factor present and absent and update the setup_IDs accordingly. Also add the
+ respective factor to the list of factors.
+
+ Args:
+ data: A tuple containing predictions, where the first element contains the predictions with the factor absent and
+ the second element contains the predictions with the factor present.
+ factor: A string giving the name of the added factor.
+
+ """
+
+ # Update the setup_IDs of the data by appending a 0 when the factor is absent or 1 when the factor is present.
+ setup_ids0 = list(data[0].keys())
+ setup_ids1 = list(data[1].keys())
+
+ for setup_id0, setup_id1 in zip(setup_ids0, setup_ids1):
+ updated_id0 = setup_id0 + "0"
+ updated_id1 = setup_id1 + "1"
+ data[0][updated_id0] = data[0].pop(setup_id0)
+ data[1][updated_id1] = data[1].pop(setup_id1)
+
+ # Add factor to list of factors.
+ self._set_factors()
+ self.factors.append(factor)
+
+ return {**data[0], **data[1]}
+
+ def remove_factor(self, data: datasets.Dataset, factor: str, keep_present: bool = False) -> datasets.Dataset:
+ """Remove data of factor and update the setup_IDs accordingly. Also remove the
+ respective factor from the list of factors. Keep_present determines whether to keep data with the factor
+ present or absent.
+
+ Args:
+ data: The dataset as obtained by the get_prepared_datasets() method.
+ factor: A string with the name of the factor to remove.
+ keep_present: whether to keep data with the factor present or absent.
+ """
+ self._set_factors()
+
+ len_setup_ID_preamble = 4
+ index_factor = self.factors.index(factor) + len_setup_ID_preamble
+ realisation_to_keep = str(int(keep_present))
+
+ # filter out all unwanted datapoints and adapt setup_IDs to exclude factor
+ data = data.filter(lambda x: x["setup_ID"][index_factor] == realisation_to_keep)
+ data = data.map(lambda x: {**x, "setup_ID": x["setup_ID"][:index_factor] + x["setup_ID"][index_factor + 1 :]})
+
+ # Remove factor from list of factors.
+ self.factors.pop(self.factors.index(factor))
+
+ return data
+
+ def _create_df(self, predictions: Dict[str, Dict[str, Any]], gold_labels: Dict[str, int]) -> DataFrame:
+ """Create a dataframe containing all predictions, gold labels and labels.
+
+ Args:
+ predictions: A dictionary of dictionaries, where the keys of the outer dictionary are
+ the setup_IDs and the keys of the inner dictionaries are the data_IDs. The values of the
+ inner dictionaries are the predictions for the examples. The keys are strings and the values can be any type.
+ gold_labels: A dictionary, where the keys are the data_IDs and the values are the gold labels for the examples.
+ The keys are strings and the values can be any type.
+
+ Returns:
+ A pandas dataframe containing the predictions and gold data.
+ """
+ additional_keys = ["predictions_numeric", "target_numeric", "setup_ID", "data_ID"]
+ results_dict = {factor: [] for factor in self.factors + additional_keys}
+
+ for setup_ID, predictions_setup in predictions.items():
+ data_ids = list(predictions_setup.keys())
+ n_datapoints = len(data_ids)
+
+ results_dict["data_ID"].extend(data_ids)
+ results_dict["setup_ID"].extend([setup_ID] * n_datapoints)
+ results_dict["target_numeric"].extend(gold_labels[data_id] for data_id in data_ids)
+ results_dict["predictions_numeric"].extend(
+ self._label_to_numeric(predictions_setup[data_id]) for data_id in data_ids
+ )
+
+ temp = self._convert_numeric_id_to_dict(setup_ID, n_repetitions=n_datapoints)
+ for factor in self.factors:
+ results_dict[factor].extend(temp[factor])
+
+ return DataFrame(results_dict)
+
+ def _set_factors(self):
+ if not hasattr(self, "factors"):
+ self.factors = factors
+
+ def _convert_numeric_id_to_dict(self, setup_id: str, n_repetitions: int = 1) -> Dict[str, Any]:
+ """Convert a numeric setup_ID to an interpretable dict.
+
+ Args:
+ setup_id: A numeric ID of the form `id_1010101`, where each digit represents a factor.
+
+ Returns:
+ A dict containing factors as keys and the factor realisation as value.
+ """
+ setup_id = setup_id.split("_")[1]
+
+ setup_dict = {}
+ for factor, value in zip(self.factors, setup_id):
+ setup_dict[factor] = [value] * n_repetitions
+
+ return setup_dict
+
+ @staticmethod
+ def _calculate_main_effects(X: ndarray, y: ndarray) -> Tuple[ndarray, ndarray]:
+ """
+
+ :return:
+ """
+ # Add a constant column to X for the intercept
+ X = sm.add_constant(X)
+
+ # Fit GLM
+ model = sm.GLM(y, X)
+ results = model.fit()
+
+ return results.params[1], results.pvalues[1]
+
+ @staticmethod
+ def _label_to_numeric(label: str) -> int:
+ """Convert a label to a numeric value.
+
+ Args:
+ label: A label.
+
+ Returns:
+ A numeric label.
+ """
+ return LABEL_TO_NUMERIC[label] if label in LABEL_TO_NUMERIC else -1
+
+ @staticmethod
+ def _assert_equal_data_ids(results_df: DataFrame) -> None:
+ """Assert that all data_IDs are the same for all setups.
+
+ Args:
+ results_df: A pandas dataframe containing the predictions and gold data.
+ """
+ used_data_ids = results_df["data_ID"].unique()
+ for setup_ID in results_df["setup_ID"].unique():
+ # Compare sorted copies (ndarray.sort() sorts in place and returns None).
+ assert sorted(used_data_ids) == sorted(
+ results_df.loc[results_df["setup_ID"] == setup_ID]["data_ID"].unique()
+ ), "Not all data_IDs are the same for all setups. Check for missing predictions!"
diff --git a/src/genbench/tasks/icl_consistency_test/config.jsonnet b/src/genbench/tasks/icl_consistency_test/config.jsonnet
new file mode 100644
index 0000000..43a4af8
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/config.jsonnet
@@ -0,0 +1,29 @@
+{
+ name: 'ICL consistency test',
+
+ description: 'The ICL consistency test measures the consistency of LLM predictions on the same datapoints across many different setups. Different setups are defined by "factors". On the one hand, factors can be specific attributes of the used prompt (e.g. the number of examples the model is presented with ["n_shots"] or the type of instructions that were used to wrap a specific datapoint ["Instructions"]). On the other hand, the analysis can also be augmented by factors that are related to the way a model is evaluated (e.g. whether a model is calibrated) or the type of model that is evaluated (e.g. the number of parameters or instruction tuning). These external factors can be added into the analysis by using the task.add_factor() method. The output metric is Cohen\'s kappa for each factor across all different conditions. A kappa value close to 1 indicates that the factor does not change the model prediction, while a value close to 0 indicates that it strongly changes model predictions. The ICL consistency test has two subtasks, one evaluating the ANLI dataset (Nie et al., 2019); the other the MNLI dataset (Wang et al., 2017).',
+
+ keywords: [
+ 'consistency',
+ 'LLM',
+ 'robustness',
+ 'in-context learning',
+ 'prompt-based learning',
+ 'icl',
+ 'anli',
+ 'mnli'
+ ],
+
+ authors: [
+ 'Lucas Weber',
+ 'Elia Bruni',
+ 'Dieuwke Hupkes',
+
+ ],
+
+ subtasks_order: [
+ 'anli',
+ 'mnli',
+
+ ],
+}
diff --git a/src/genbench/tasks/icl_consistency_test/doc.md b/src/genbench/tasks/icl_consistency_test/doc.md
new file mode 100644
index 0000000..ab3ff15
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/doc.md
@@ -0,0 +1,130 @@
+# ICL consistency test
+
+The ICL consistency test measures the consistency of LLM predictions on the same datapoints across many different setups.
+Different setups are defined by "factors". On the one hand, factors can be specific attributes of the used prompt (e.g.
+the number of examples the model is presented with ["n_shots"] or the type of instructions that were used to wrap a
+specific datapoint ["Instructions"]). On the otherhand, the analysis can also be augmented by factors that are related
+to the way a model is evaluated (e.g. whether a model is calibrated) or the type of model that is evaluated (e.g. the
+number of parameters or instructions tuning). These external factors can be added into analysis by using the
+task.add_factor() method. The output-metric is Cohen\'s kappa for each factor across all different conditions.
+A kappa-value close to 1 indicates that the factors does not change the model prediction, while a factor close to 0
+strongly changes model predictions. The ICL consistency test has two subtasks, one evaluating the ANLI-dataset (Nie et al., 2019);
+the other the MNLI-dataset (Wang et al., 2017).
+
+*Size*: Each subtask contains 57,600 datapoints when using the full 600 data\_IDs. The user can choose to reduce the number of evaluated data\_IDs.
+
+## Abstract
+Just like the previous generation of _task-tuned models_ (TT), _large language models_ (LLMs) that are adapted to tasks via prompt-based methods like _in-context-learning_ (ICL) or _instruction tuning_ (IT) perform well in some setups, but not in others. This lack of consistency in prompt-based learning hints at a lack of robust generalisation. We here introduce the ICL consistency test – a contribution to the GenBench CBT – which evaluates how consistently a model makes predictions across many different setups while using the same data. The test is based on different established natural language inference tasks. We provide preprocessed data that constitutes 96 different ‘setups’ and a metric that estimates model consistency across these setups. The metric is provided on a fine-grained level, to understand what properties of a setup render predictions unstable, and on an aggregated level to compare overall model consistency. We conduct an empirical analysis of eight state-of-the-art models, and our consistency metric reveals how all tested LLMs lack robust generalisation.
+
+## Examples
+The test evaluates the same datapoints across many different setups to determine the consistency of a model's predictions. Every datapoint has a data\_ID (specifying the original datapoint) and a setup\_ID (with each digit specifying the presence or absence of a factor).
+
+Example with data\_ID - 1120; setup\_ID - id0_0200020:
+```
+The city's name derives from the Greek words "άργυρος" ("árgyros" meaning
+"silver") and "πόλη" ("poli" meaning "city"). The name's older form was
+"Argyroupolis". The first name of the settlement was "New Argyroupolis",
+given by the refugees from Gümüşhane. Using only the above description
+and what you know about the world, "The city's name derives from Greek words."
+is definitely correct, incorrect, or inconclusive?
+
+ANSWER: Correct.
+
+Undead is a 2003 Australian zombie science fiction horror comedy film
+written and directed by Michael and Peter Spierig and starring Felicity
+Mason, Mungo McKay and Rob Jenkins. It was then-relatively-unknown "Good Game"
+presenter Steven O'Donnell's first film role. Using only the above description
+and what you know about the world, "Steven O'Donnell was not a popular actor before
+the 2003 Zombie movie." is definitely correct, incorrect, or inconclusive?
+
+ANSWER: Correct.
+
+Let the Music Do the Talking is the first of four albums by The Joe Perry
+Project. It was their the most successful, selling approximately 250,000
+copies in the United States. The title track was re-recorded by Joe Perry's
+more successful band Aerosmith on their album "Done With Mirrors", albeit
+with a slightly different melody and Steven Tyler penned lyrics. Using only
+the above description and what you know about the world, ""Done With Mirrors"
+was an album by The Joe Perry Project." is definitely correct, incorrect, or
+inconclusive?
+
+ANSWER:
+```
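+
+The digits that follow the `id0_` preamble of a setup\_ID encode the factor realisations in a fixed order. As a minimal sketch (the factor order below is assumed from the task implementation), a setup\_ID can be decoded like this:
+
+```python
+# Hedged sketch: decode a setup_ID into factor realisations.
+# Factor order is assumed to follow the `factors` list in the task implementation.
+factors = ["balanced_labels", "one_label", "cross_task", "cross_instructions",
+ "n_shots", "instructions", "hp_instructions"]
+
+setup_id = "id0_0200020"
+digits = setup_id.split("_")[1]  # "0200020"
+# '0'/'1' encode absence/presence; '2' marks setups for which the factor is not applicable.
+print(dict(zip(factors, digits)))
+```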
+
+## Usage
+For an example script, copy `example_evaluation.py` into your genbench root directory (`/genbench_cbt`) and run it.
+#### Dataloading
+The task can be loaded through the default GenBench interface as a zero-shot task:
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+task = load_task("icl_consistency_test")
+ds = task.get_prepared_datasets(
+ PreparationStrategy.PROMPT_BASED_TESTING,
+ shot_list=[0]
+ )[0]
+```
+#### Evaluation
+Provide the evaluation function with the model outputs as strings, accompanied by the corresponding setup-ids and data-ids
+from the original dataset.
+For the predictions, use the following format:
+
+`predictions: Dict[setup_ID, Dict[data_ID, model_output]]`
+
+For the gold labels, please provide the original dataset ds:
+
+`gold: datasets.Dataset`
+
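+For illustration, a hypothetical predictions dictionary with two setups and two datapoints could look as follows (the concrete setup\_IDs and data\_IDs come from the prepared dataset; the label strings are examples):
+
+```python
+predictions = {
+ "id0_0200020": {"1120": "Correct", "1121": "Incorrect"},
+ "id0_1200020": {"1120": "Correct", "1121": "Inconclusive"},
+}
+```
+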
+With this input, run the task evaluation like so:
+```python
+results = task.evaluate_predictions(predictions=predictions,
+ gold=ds)
+```
+
+#### Adding factors
+External factors can be added via the `task.add_factor()` method.
+```python
+predictions = (predictions_factor_absent, predictions_factor_present)
+predictions = task.add_factor(data=predictions,
+ factor='<name_of_external_factor>')
+```
+where `predictions_factor_absent` and `predictions_factor_present` are dictionaries of the same format as the original
+predictions dictionary.
+
+#### Removing factors
+Factors can be removed from the dataset and the evaluation by using the `task.remove_factor()` method.
+```python
+predictions = task.remove_factor(data=ds,
+ factor='<name_of_factor>')
+```
+where `ds` is the original dataset as obtained by the `task.get_prepared_datasets()` method. Note that removing factors
+will influence the results on all other factors.
+
+## Data Source
+The original data stems from the ANLI (Nie et al., 2019) and MNLI (Wang et al., 2017) datasets.
+Prompting templates are taken from promptsource (Bach et al., 2022).
+
+## Limitations and Bias
+We identify the following limitations of the consistency test:
+1. The number of factors is limited and does not cover all possible factors that might influence the predictions. We limited ourselves to factors we deem relevant to ensure fast evaluation.
+
+2. Currently, the test is only implemented for NLI datasets (the ANLI and MNLI subtasks).
+
+3. External factors such as _Instruction tuning_ or _calibration_ have to be manually added by the user using the `task.add_factor()` method.
+
+
+## GenBench Eval card
+This test can be used to test generalisation in LLMs (pretrain - test locus).
+It is designed to better understand how LLMs generalise (intrinsic motivation) and to give practical hints on relevant prompt-design decisions (practical motivation). It can be used to assess robustness.
+
+![GenBench Eval Card](GenBench_eval_card.png)
+
+
+## References
+
+Bach, S. H., Sanh, V., Yong, Z. X., Webson, A., Raffel, C., Nayak, N. V., ... & Rush, A. M. (2022). Promptsource: An integrated development environment and repository for natural language prompts. arXiv preprint arXiv:2202.01279.
+
+Nie, Y., Williams, A., Dinan, E., Bansal, M., Weston, J., & Kiela, D. (2019). Adversarial NLI: A new benchmark for natural language understanding. arXiv preprint arXiv:1910.14599.
+
+Wang, Z., Hamza, W., & Florian, R. (2017, August). Bilateral multi-perspective matching for natural language sentences. In Proceedings of the 26th International Joint Conference on Artificial Intelligence (pp. 4144-4150).
diff --git a/src/genbench/tasks/icl_consistency_test/example_evaluation.py b/src/genbench/tasks/icl_consistency_test/example_evaluation.py
new file mode 100644
index 0000000..b52bf9e
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/example_evaluation.py
@@ -0,0 +1,142 @@
+"""
+EXAMPLE USAGE OF ICL CONSISTENCY TEST
+
+This script requires additional packages to be installed:
+
+pip install torch
+pip install git+https://github.com/huggingface/transformers.git
+pip install bitsandbytes
+pip install accelerate
+
+"""
+import string
+from typing import Dict, List
+
+import torch
+import transformers
+from torch import Tensor
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+DATASET = "anli" # options: {'anli', 'mnli'}
+N_DATAPOINTS = 200
+MODEL_NAME = "huggyllama/llama-7b"
+BATCH_SIZE = 8
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+class Generator:
+ """
+ A simple wrapper to evaluate a given hf-model
+ """
+
+ def __init__(self, model_name="huggyllama/llama-7b"):
+ self.max_new_tokens = 4 # some labels consist of up to 4 tokens
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+ model_name,
+ device_map="auto",
+ padding_side="left",
+ )
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+ self.model = transformers.AutoModelForCausalLM.from_pretrained(
+ model_name,
+ load_in_8bit=True,
+ device_map="auto",
+ ).eval()
+
+ self.generation_config = transformers.GenerationConfig(
+ do_sample=False,
+ return_dict_in_generate=False,
+ output_scores=True,
+ max_new_tokens=self.max_new_tokens,
+ return_full_text=False,
+ )
+
+ def generate(self, prompt) -> List[str]:
+ inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
+
+ input_ids = inputs["input_ids"].to(device)
+ attention_mask = inputs["attention_mask"].to(device)
+
+ generation_output = self.model.generate(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ generation_config=self.generation_config,
+ )
+
+ outputs = self.tokenizer.batch_decode(generation_output[:, input_ids.shape[1] :])
+
+ # post-process: keep only the first whitespace-separated token and strip punctuation
+ outputs = [o.strip().split()[0].translate(str.maketrans("", "", string.punctuation)) for o in outputs]
+
+ return outputs
+
+ def make_predictions(self, dataset, bs=8) -> Dict[str, Dict[str, str]]:
+ out = {}
+ dl = DataLoader(dataset=dataset, batch_size=bs, num_workers=0)
+
+ with torch.no_grad():
+ for batch in tqdm(dl):
+ prediction = self.generate(batch["input"])
+
+ # organize predictions into output dictionary
+ for i, (data_ID, setup_ID) in enumerate(zip(batch["data_ID"], batch["setup_ID"])):
+ data_ID = str(data_ID.item() if isinstance(data_ID, Tensor) else data_ID)
+ if setup_ID in out.keys():
+ out[setup_ID].update({data_ID: prediction[i]})
+ else:
+ out[setup_ID] = {data_ID: prediction[i]}
+
+ return out
+
+
+if __name__ == "__main__":
+ # Load the task
+ task = load_task("icl_consistency_test")[DATASET]
+ ds = task.get_prepared_datasets(PreparationStrategy.PROMPT_BASED_TESTING, shot_list=[0])[0]
+
+ # Select a subset of examples for illustration purposes
+ subset = list(set(ds["data_ID"]))[:N_DATAPOINTS]
+ ds = ds.filter(lambda x: x["data_ID"] in subset)
+
+ # Generate predictions for the dataset
+ generator = Generator(model_name=MODEL_NAME)
+ predictions = generator.make_predictions(ds, bs=BATCH_SIZE)
+
+ # OPTIONAL: The ICL-consistency test provides the option to add factors to the analysis by using the
+ # `add_factor` method.
+ add_external_factor = False
+ if add_external_factor:
+ predictions_external_factor = ... # some function generating alternative predictions
+ predictions = task.add_factor(data=(predictions, predictions_external_factor), factor="")
+
+ # Evaluate the predictions
+ results = task.evaluate_predictions(predictions=predictions, gold=ds)
+
+ print_out = f"""
+ {"#" * 90}
+ EVALUATED SUCCESSFULLY!
+ {"#" * 90}
+
+ {"-" * 90}
+ Accuracies:
+ Mean: {results["accuracy"]["accuracy"].mean()}; std: {results["accuracy"]["accuracy"].std()}
+ {"-" * 90}
+ Main effects:
+ {results["main_effects"]}
+ {"-" * 90}
+ Consistency:
+ {results["kappas"]}
+ {"-" * 90}
+
+ {"#" * 90}
+ Overall consistency: {results["kappa_avg"]}
+ {"#" * 90}
+ """
+
+ print(print_out)
diff --git a/src/genbench/tasks/icl_consistency_test/mnli/__init__.py b/src/genbench/tasks/icl_consistency_test/mnli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/icl_consistency_test/mnli/config.jsonnet b/src/genbench/tasks/icl_consistency_test/mnli/config.jsonnet
new file mode 100644
index 0000000..7896931
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/mnli/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'ICL consistency test (mnli)',
+
+ description: 'The ICL consistency test measures the consistency of LLM predictions on the same datapoints across many different setups. Different setups are defined by "factors". On the one hand, factors can be specific attributes of the used prompt (e.g. the number of examples the model is presented with ["n_shots"] or the type of instructions that were used to wrap a specific datapoint ["Instructions"]). On the other hand, the analysis can also be augmented by factors that are related to the way a model is evaluated (e.g. whether a model is calibrated) or the type of model that is evaluated (e.g. the number of parameters or instruction tuning). These external factors can be added into the analysis by using the task.add_factor() method. The output metric is Cohen\'s kappa for each factor across all different conditions. A kappa value close to 1 indicates that the factor does not change the model prediction, while a value close to 0 indicates that it strongly changes model predictions. This test evaluates the MNLI dataset (Wang et al., 2017).',
+
+ keywords: [
+ 'consistency',
+ 'LLM',
+ 'robustness',
+ 'in-context learning',
+ 'prompt-based learning',
+ 'icl',
+ 'anli',
+ 'mnli'
+ ],
+
+ authors: [
+ 'Lucas Weber',
+ 'Elia Bruni',
+ 'Dieuwke Hupkes',
+
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://raw.githubusercontent.com/LucWeber/icl_consistency_data/main/data/genbench_all_glue+mnli.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: false,
+
+ task_type: 'free_form',
+
+ preparation_strategies: {
+ // A recipe for preparing the model to perform the task by configuring its prompt.
+ // This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
+ // We provide a few options for configuring the prompt. But, the task creator can
+ // also provide a custom prompt preparation in the task's Python class.
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: '',
+ instruction_few_shot: '',
+ input_prefix: '',
+ output_prefix: '',
+ }
+ },
+ },
+}
diff --git a/src/genbench/tasks/icl_consistency_test/mnli/doc.md b/src/genbench/tasks/icl_consistency_test/mnli/doc.md
new file mode 100644
index 0000000..ab3ff15
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/mnli/doc.md
@@ -0,0 +1,130 @@
+# ICL consistency test
+
+The ICL consistency test measures the consistency of LLM predictions on the same datapoints across many different setups.
+Different setups are defined by "factors". On the one hand, factors can be specific attributes of the used prompt (e.g.
+the number of examples the model is presented with ["n_shots"] or the type of instructions that were used to wrap a
+specific datapoint ["Instructions"]). On the otherhand, the analysis can also be augmented by factors that are related
+to the way a model is evaluated (e.g. whether a model is calibrated) or the type of model that is evaluated (e.g. the
+number of parameters or instructions tuning). These external factors can be added into analysis by using the
+task.add_factor() method. The output-metric is Cohen\'s kappa for each factor across all different conditions.
+A kappa-value close to 1 indicates that the factors does not change the model prediction, while a factor close to 0
+strongly changes model predictions. The ICL consistency test has two subtasks, one evaluating the ANLI-dataset (Nie et al., 2019);
+the other the MNLI-dataset (Wang et al., 2017).
+
+*Size*: Each subtask contains 57,600 datapoints when using the full 600 data\_IDs. The user can choose to reduce the number of evaluated data\_IDs.
+
+## Abstract
+Just like the previous generation of _task-tuned models_ (TT), _large language models_ (LLMs) that are adapted to tasks via prompt-based methods like _in-context-learning_ (ICL) or _instruction tuning_ (IT) perform well in some setups, but not in others. This lack of consistency in prompt-based learning hints at a lack of robust generalisation. We here introduce the ICL consistency test – a contribution to the GenBench CBT – which evaluates how consistently a model makes predictions across many different setups while using the same data. The test is based on different established natural language inference tasks. We provide preprocessed data that constitutes 96 different ‘setups’ and a metric that estimates model consistency across these setups. The metric is provided on a fine-grained level, to understand what properties of a setup render predictions unstable, and on an aggregated level to compare overall model consistency. We conduct an empirical analysis of eight state-of-the-art models, and our consistency metric reveals how all tested LLMs lack robust generalisation.
+
+## Examples
+The test evaluates the same datapoints across many different setups to determine the consistency of a model's predictions. Every datapoint has a data\_ID (specifying the original datapoint) and a setup\_ID (with each digit specifying the presence or absence of a factor).
+
+Example with data\_ID - 1120; setup\_ID - id0_0200020:
+```
+The city's name derives from the Greek words "άργυρος" ("árgyros" meaning
+"silver") and "πόλη" ("poli" meaning "city"). The name's older form was
+"Argyroupolis". The first name of the settlement was "New Argyroupolis",
+given by the refugees from Gümüşhane. Using only the above description
+and what you know about the world, "The city's name derives from Greek words."
+is definitely correct, incorrect, or inconclusive?
+
+ANSWER: Correct.
+
+Undead is a 2003 Australian zombie science fiction horror comedy film
+written and directed by Michael and Peter Spierig and starring Felicity
+Mason, Mungo McKay and Rob Jenkins. It was then-relatively-unknown "Good Game"
+presenter Steven O'Donnell's first film role. Using only the above description
+and what you know about the world, "Steven O'Donnell was not a popular actor before
+the 2003 Zombie movie." is definitely correct, incorrect, or inconclusive?
+
+ANSWER: Correct.
+
+Let the Music Do the Talking is the first of four albums by The Joe Perry
+Project. It was their the most successful, selling approximately 250,000
+copies in the United States. The title track was re-recorded by Joe Perry's
+more successful band Aerosmith on their album "Done With Mirrors", albeit
+with a slightly different melody and Steven Tyler penned lyrics. Using only
+the above description and what you know about the world, ""Done With Mirrors"
+was an album by The Joe Perry Project." is definitely correct, incorrect, or
+inconclusive?
+
+ANSWER:
+```
+
+## Usage
+For an example script, copy `example_evaluation.py` into your genbench root directory (`/genbench_cbt`) and run it.
+#### Dataloading
+The task can be loaded through the default GenBench interface as a zero-shot task:
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+task = load_task("icl_consistency_test")
+ds = task.get_prepared_datasets(
+ PreparationStrategy.PROMPT_BASED_TESTING,
+ shot_list=[0]
+ )[0]
+```
+#### Evaluation
+Provide the evaluation function with the model outputs as strings, accompanied by the corresponding setup-ids and data-ids
+from the original dataset.
+For the predictions, use the following format:
+
+`predictions: Dict[setup_ID, Dict[data_ID, model_output]]`
+
+For the gold labels, please provide the original dataset ds:
+
+`gold: datasets.Dataset`
+
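+For illustration, a hypothetical predictions dictionary with two setups and two datapoints could look as follows (the concrete setup\_IDs and data\_IDs come from the prepared dataset; the label strings are examples):
+
+```python
+predictions = {
+ "id0_0200020": {"1120": "Correct", "1121": "Incorrect"},
+ "id0_1200020": {"1120": "Correct", "1121": "Inconclusive"},
+}
+```
+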
+With this input, run the task evaluation like so:
+```python
+results = task.evaluate_predictions(predictions=predictions,
+ gold=ds)
+```
+
+#### Adding factors
+External factors can be added via the `task.add_factor()` method.
+```python
+predictions = (predictions_factor_absent, predictions_factor_present)
+predictions = task.add_factor(data=predictions,
+ factor='<name_of_external_factor>')
+```
+where `predictions_factor_absent` and `predictions_factor_present` are dictionaries of the same format as the original
+predictions dictionary.
+
+#### Removing factors
+Factors can be removed from the dataset and the evaluation by using the `task.remove_factor()` method.
+```python
+predictions = task.remove_factor(data=ds,
+ factor='<name_of_factor>')
+```
+where `ds` is the original dataset as obtained by the `task.get_prepared_datasets()` method. Note that removing factors
+will influence the results on all other factors.
+
+## Data Source
+The original data stems from the ANLI (Nie et al., 2019) and MNLI (Wang et al., 2017) datasets.
+Prompting templates are taken from promptsource (Bach et al., 2022).
+
+## Limitations and Bias
+We identify the following limitations of the consistency test:
+1. The number of factors is limited and does not cover all possible factors that might influence the predictions. We limited ourselves to factors we deem relevant to ensure fast evaluation.
+
+2. Currently, the test is only implemented for NLI datasets (the ANLI and MNLI subtasks).
+
+3. External factors such as _Instruction tuning_ or _calibration_ have to be manually added by the user using the `task.add_factor()` method.
+
+
+## GenBench Eval card
+This test can be used to test generalisation in LLMs (pretrain - test locus).
+It is designed to better understand how LLMs generalise (intrinsic motivation) and to give practical hints on relevant prompt-design decisions (practical motivation). It can be used to assess robustness.
+
+![GenBench Eval Card](GenBench_eval_card.png)
+
+
+## References
+
+Bach, S. H., Sanh, V., Yong, Z. X., Webson, A., Raffel, C., Nayak, N. V., ... & Rush, A. M. (2022). Promptsource: An integrated development environment and repository for natural language prompts. arXiv preprint arXiv:2202.01279.
+
+Nie, Y., Williams, A., Dinan, E., Bansal, M., Weston, J., & Kiela, D. (2019). Adversarial NLI: A new benchmark for natural language understanding. arXiv preprint arXiv:1910.14599.
+
+Wang, Z., Hamza, W., & Florian, R. (2017, August). Bilateral multi-perspective matching for natural language sentences. In Proceedings of the 26th International Joint Conference on Artificial Intelligence (pp. 4144-4150).
diff --git a/src/genbench/tasks/icl_consistency_test/mnli/task.py b/src/genbench/tasks/icl_consistency_test/mnli/task.py
new file mode 100644
index 0000000..70755be
--- /dev/null
+++ b/src/genbench/tasks/icl_consistency_test/mnli/task.py
@@ -0,0 +1,263 @@
+from typing import Any, Dict, Tuple
+
+import datasets
+import pandas as pd
+import statsmodels.api as sm
+from numpy import ndarray
+from pandas import DataFrame
+from sklearn.metrics import cohen_kappa_score
+
+from genbench import Task
+
+
+LABELS = [
+ ["Correct", "True", "Always", "Yes", "Guaranteed", "Duplicates"], # `correct` labels
+ ["Inconclusive", "Possible", "Sometimes", "Maybe", "Neither"], # `neutral` labels
+ ["Impossible", "Never", "Incorrect", "False", "No", "Not Duplicates"], # `incorrect` labels
+]
+
+LABEL_TO_NUMERIC = {}
+LABEL_TO_NUMERIC.update(dict([(label, i) for i, label_subset in enumerate(LABELS) for label in label_subset]))
+LABEL_TO_NUMERIC.update(dict([(label.lower(), i) for i, label_subset in enumerate(LABELS) for label in label_subset]))
+
+factors = [
+ "balanced_labels",
+ "one_label",
+ "cross_task",
+ "cross_instructions",
+ "n_shots",
+ "instructions",
+ "hp_instructions",
+]
+
+
+class IclConsistencyTestWSubtasksMnli(Task):
+ """Python implementation of the ICL consistency test task."""
+
+ def evaluate_predictions(
+ self,
+ *,
+ predictions: Dict[str, Dict[str, Any]],
+ gold: datasets.Dataset,
+ ) -> Dict[str, Any]:
+ """Evaluate the predictions of the model against the gold data.
+ Calculating exact match accuracy plus consistency across all setups (Cohen's kappa).
+
+ Args:
+ predictions: A dictionary of dictionaries, where the keys of the outer dictionary are
+ the setup_IDs and the keys of the inner dictionaries are the data_IDs. The values of the
+ inner dictionaries are the predictions for the example. The keys are strings and the values can be any type.
+ gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ self._set_factors()
+
+ gold_pandas = gold.to_pandas()
+ gold_pandas["data_ID"] = gold_pandas["data_ID"].astype(str)
+ gold_labels_numeric = gold_pandas.set_index("data_ID")["target_numeric"].to_dict()
+
+ results_df = self._create_df(predictions, gold_labels_numeric)
+ results_df = results_df.sort_values(by=["setup_ID", "data_ID"])
+ self._assert_equal_data_ids(results_df)
+
+ # Compute the accuracy for each setup
+ accuracies, setup_IDs, setups_by_factor = [], [], []
+ for setup_ID, setup_predictions in results_df.groupby("setup_ID"):
+ accuracy = (setup_predictions["predictions_numeric"] == setup_predictions["target_numeric"]).mean()
+
+ accuracies.append(accuracy)
+ setup_IDs.append(setup_ID)
+ setups_by_factor.append(setup_predictions[self.factors].head(1))
+
+ accuracies_df = DataFrame({"setup_ID": setup_IDs, "accuracy": accuracies})
+ setups_by_factor_df = pd.concat(setups_by_factor, ignore_index=True)
+
+ # Compute main effects for each factor
+ betas, p_values = [], []
+ for factor in self.factors:
+ X = setups_by_factor_df[factor].to_numpy(dtype=int) # X is binary and states if a factor is present or not
+ y = accuracies_df["accuracy"].to_numpy(dtype=float) # y are the acc. scores of the respective setups
+ mask = X != 2 # create mask to ignore setups that are irrelevant to factor (coded as X == 2)
+
+ # fit GLM
+ beta, p_value = self._calculate_main_effects(X[mask], y[mask])
+
+ betas.append(beta)
+ p_values.append(p_value)
+
+ main_effects_df = DataFrame({"factor": self.factors, "beta": betas, "p_value": p_values})
+
+ # Compute Cohen's kappa for consistency
+ kappas = []
+ for factor in self.factors:
+ factor_present = results_df.loc[results_df[factor] == "1"]["predictions_numeric"]
+ factor_absent = results_df.loc[results_df[factor] == "0"]["predictions_numeric"]
+
+ # mask out predictions that are out-of-label-distribution
+ mask = [(f1 != -1 and f2 != -1) for f1, f2 in zip(factor_absent, factor_present)]
+ factor_present, factor_absent = factor_present[mask], factor_absent[mask]
+
+ kappas.append(cohen_kappa_score(factor_present, factor_absent))
+
+ kappas_df = DataFrame({"factor": self.factors, "kappa": kappas})
+
+ # Calculate average kappa
+ kappa_avg = kappas_df["kappa"].mean()
+
+ # Return the evaluation metrics.
+ return {
+ "accuracy": accuracies_df,
+ "main_effects": main_effects_df,
+ "kappas": kappas_df,
+ "kappa_avg": kappa_avg,
+ }
+
+ def add_factor(self, data: Tuple[Dict, Dict], factor: str) -> Dict[str, Dict[str, Any]]:
+ """Concatenate the data with the factor present and absent and update the setup_IDs accordingly. Also add the
+ respective factor to the list of factors.
+
+ Args:
+ data: A tuple containing predictions, where the first element are predictions with factor absent and the
+ second element are predictions with factor present.
+ factor: A string giving the name of the added factor.
+
+ """
+
+ # Update the setup_IDs of the data by appending a 0 when the factor is absent or 1 when the factor is present.
+ setup_ids0 = list(data[0].keys())
+ setup_ids1 = list(data[1].keys())
+
+ for setup_id0, setup_id1 in zip(setup_ids0, setup_ids1):
+ updated_id0 = setup_id0 + "0"
+ updated_id1 = setup_id1 + "1"
+ data[0][updated_id0] = data[0].pop(setup_id0)
+ data[1][updated_id1] = data[1].pop(setup_id1)
+
+ # Add factor to list of factors.
+ self._set_factors()
+ self.factors.append(factor)
+
+ return {**data[0], **data[1]}
+
+ def remove_factor(self, data: datasets.Dataset, factor: str, keep_present: bool = False) -> datasets.Dataset:
+ """Remove data of factor and update the setup_IDs accordingly. Also remove the
+ respective factor from the list of factors. Keep_present determines whether to keep data with the factor
+ present or absent.
+
+ Args:
+ data: The dataset as obtained by the get_prepared_datasets() method.
+ factor: A string with the name of the factor to remove.
+ keep_present: whether to keep data with the factor present or absent.
+ """
+ self._set_factors()
+
+ len_setup_ID_preamble = 4
+ index_factor = self.factors.index(factor) + len_setup_ID_preamble
+ realisation_to_keep = str(int(keep_present))
+
+ # filter out all unwanted datapoints and adapt setup_IDs to exclude factor
+ data = data.filter(lambda x: x["setup_ID"][index_factor] == realisation_to_keep)
+ data = data.map(lambda x: {**x, "setup_ID": x["setup_ID"][:index_factor] + x["setup_ID"][index_factor + 1 :]})
+
+ # Remove factor from list of factors.
+ self.factors.pop(self.factors.index(factor))
+
+ return data
+
+ def _create_df(self, predictions: Dict[str, Dict[str, Any]], gold_labels: Dict[str, int]) -> DataFrame:
+ """Create a dataframe containing all predictions, gold labels and labels.
+
+ Args:
+ predictions: A dictionary of dictionary, where the keys of the outer dictionary contains
+ the setup_IDs and the inner dictionary the data_IDs. The values of the inner dictionary
+ are the predictions for the example. The keys are strings and the values can be any type.
+ gold: A dictionary, where the keys are the data_IDs and the values are the gold labels for the example.
+ The keys are strings and the values can be any type.
+
+ Returns:
+ A pandas dataframe containing the predictions and gold data.
+ """
+ additional_keys = ["predictions_numeric", "target_numeric", "setup_ID", "data_ID"]
+ results_dict = {factor: [] for factor in self.factors + additional_keys}
+
+ for setup_ID, predictions_setup in predictions.items():
+ data_ids = list(predictions_setup.keys())
+ n_datapoints = len(data_ids)
+
+ results_dict["data_ID"].extend(data_ids)
+ results_dict["setup_ID"].extend([setup_ID] * n_datapoints)
+ results_dict["target_numeric"].extend(gold_labels[data_id] for data_id in data_ids)
+ results_dict["predictions_numeric"].extend(
+ self._label_to_numeric(predictions_setup[data_id]) for data_id in data_ids
+ )
+
+ temp = self._convert_numeric_id_to_dict(setup_ID, n_repetitions=n_datapoints)
+ for factor in self.factors:
+ results_dict[factor].extend(temp[factor])
+
+ return DataFrame(results_dict)
+
+ def _set_factors(self):
+ if not hasattr(self, "factors"):
+ self.factors = factors
+
+ def _convert_numeric_id_to_dict(self, setup_id: str, n_repetitions: int = 1) -> Dict[str, Any]:
+ """Convert a numeric setup_ID to a interpretable dict.
+
+ Args:
+ id: A numeric ID of the form `id_1010101' where each digit represents a factor.
+
+ Returns:
+ A dict containing factors as keys and the factor realisation as value.
+ """
+ setup_id = setup_id.split("_")[1]
+
+ setup_dict = {}
+ for factor, value in zip(self.factors, setup_id):
+ setup_dict[factor] = [value] * n_repetitions
+
+ return setup_dict
+
+ @staticmethod
+ def _calculate_main_effects(X: ndarray, y: ndarray) -> Tuple[ndarray, ndarray]:
+ """
+
+ :return:
+ """
+ # Add a constant column to X for the intercept
+ X = sm.add_constant(X)
+
+ # Fit GLM
+ model = sm.GLM(y, X)
+ results = model.fit()
+
+ return results.params[1], results.pvalues[1]
+
+ @staticmethod
+ def _label_to_numeric(label: str) -> int:
+ """Convert a label to a numeric value.
+
+ Args:
+ label: A label.
+
+ Returns:
+ A numeric label.
+ """
+ return LABEL_TO_NUMERIC[label] if label in LABEL_TO_NUMERIC else -1
+
+ @staticmethod
+ def _assert_equal_data_ids(results_df: DataFrame) -> None:
+ """Assert that all data_IDs are the same for all setups.
+
+ Args:
+ results_df: A pandas dataframe containing the predictions and gold data.
+ """
+ used_data_ids = results_df["data_ID"].unique()
+ for setup_ID in results_df["setup_ID"].unique():
+ # Compare sorted copies (ndarray.sort() sorts in place and returns None).
+ assert sorted(used_data_ids) == sorted(
+ results_df.loc[results_df["setup_ID"] == setup_ID]["data_ID"].unique()
+ ), "Not all data_IDs are the same for all setups. Check for missing predictions!"
diff --git a/src/genbench/tasks/latent_feature_splits/__init__.py b/src/genbench/tasks/latent_feature_splits/__init__.py
new file mode 100644
index 0000000..8ceca21
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class LatentFeatureSplits(TaskDict):
+ pass
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/__init__.py b/src/genbench/tasks/latent_feature_splits/bert_closest_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet b/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet
new file mode 100644
index 0000000..d5c8c01
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/config.jsonnet
@@ -0,0 +1,57 @@
+{
+ name: 'Latent Feature Splits (bert_closest_split)',
+
+ // @TODO: Add a description of the task
+ description: "We split hate speech data based on the internal representations of a RoBERTa model.
+ The o.o.d. data split leads to an under-representation of parts of the latent space in the
+ model's training set, making the split more challenging than a random split.",
+
+ // @TODO: Add a list of keywords that describe the task
+ keywords: [
+ 'non-i.i.d. generalisation',
+ 'o.o.d. generalisation',
+ 'latent-features',
+ 'hate speech'
+ ],
+
+ authors: [
+ 'Maike Züfle',
+ 'Verna Dankers',
+ 'Ivan Titov',
+
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_bert_closest_split_test_new.jsonl',
+ train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_bert_closest_split_train.jsonl'
+ },
+
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ best_score: 1.0,
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ },
+ {
+ hf_id: 'f1',
+ average: 'macro',
+ best_score: 1.0,
+ git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+ },
+ ],
+
+ preparation_strategies: {
+ // A recipe for preparing the model to perform the task by configuring its prompt.
+ // This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
+ // We provide a few options for configuring the prompt. But, the task creator can
+ // also provide a custom prompt preparation in the task's Python class.
+ finetuning: {
+ objective: 'maximum_likelihood',
+ }
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md b/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md
new file mode 100644
index 0000000..f345ddb
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection (bert_closest_split)
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when developing a split with one model and evaluating it on another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to decide whether a social media post contains hate speech, offensive speech, or neither.
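+
+As a minimal sketch of how this subtask could be scored through the GenBench interface (the subtask id and the toy data below are assumptions; gold examples follow the jsonl schema shown above and predictions carry a `target` field):
+
+```python
+from datasets import Dataset
+from genbench import load_task
+
+# Load the TaskDict and pick the subtask (id assumed from the directory layout).
+task = load_task("latent_feature_splits")["bert_closest_split"]
+
+# Toy gold data and predictions, for illustration only.
+gold = Dataset.from_list([
+ {"input": "example post", "target": 1, "target_options": ["hate", "noHate", "offensive"]},
+])
+predictions = [{"target": 1}]
+
+print(task.evaluate_predictions(predictions=predictions, gold=gold))
+```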
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee. It was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases that the Hate Speech Detection has, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in HateSpeech for LLMs (pretrain - test locus).
+The split is based on the feature representations of a language model, therefore we assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png b/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png
new file mode 100644
index 0000000..5a6877d
Binary files /dev/null and b/src/genbench/tasks/latent_feature_splits/bert_closest_split/eval_card.png differ
diff --git a/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py b/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py
new file mode 100644
index 0000000..b7d322d
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/bert_closest_split/task.py
@@ -0,0 +1,99 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import datasets
+import evaluate
+
+from genbench import Task
+from genbench.api import TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class LatentFeatureSplitBertClosestSplit(Task):
+ def evaluate_predictions(
+ self,
+ *,
+ predictions: List[Mapping[str, Any]] = None,
+ gold: datasets.Dataset = None,
+ ) -> OrderedDict[str, float]:
+ """Evaluate the predictions of the model against the gold data.
+
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted
+ values for an example. The keys are strings and the values can be any type.
+ gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+
+ Raises:
+ ValueError: If a metric returns None.
+ """
+ result = OrderedDict()
+ for metric_config in self.config.evaluation_metrics:
+ hf_id = metric_config.hf_id
+ if isinstance(hf_id, str):
+ hf_id = [hf_id]
+
+ metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+ refs_lst = [g["target"] for g in gold]
+ preds_lst = [pred["target"] for pred in predictions]
+
+ ref_type = type(refs_lst[0])
+ pred_type = type(preds_lst[0])
+ if pred_type != ref_type:
+ if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+ raise ValueError(
+ f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+ )
+ # Convert predictions to the same type as the references
+ if pred_type == str and ref_type == int:
+ logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+ converted_preds = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ preds_lst = converted_preds
+ elif pred_type == int and ref_type == str:
+ logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ refs_lst = converted_refs
+ else:
+ if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+ # Convert both predictions and references to int
+ logger.warning(
+ "Predictions and references have the same type, but it is not int. Converting both to int."
+ )
+ converted_preds = []
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ preds_lst = converted_preds
+ refs_lst = converted_refs
+
+ extra_kwargs = metric_config.compute_extra_kwargs or {}
+ output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+ if output is None:
+ raise ValueError(
+ f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+ )
+
+ # Update output keys to include the metric id
+ metric_id = "_".join(hf_id)
+ output = {f"hf_{metric_id}__{k}": v for k, v in output.items()}
+
+ result.update(output)
+
+ return result
diff --git a/src/genbench/tasks/latent_feature_splits/config.jsonnet b/src/genbench/tasks/latent_feature_splits/config.jsonnet
new file mode 100644
index 0000000..ef4f553
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/config.jsonnet
@@ -0,0 +1,58 @@
+{
+ name: 'Latent Feature Split',
+
+ // @TODO: Add a description of the task
+ description: "We split hate speech data based on the internal representations of a RoBERTa model.
+ The o.o.d. data split leads to an under-representation of parts of the latent space in the
+ model's training set, making the split more challenging than a random split.",
+
+ // @TODO: Add a list of keywords that describe the task
+ keywords: [
+ 'non-i.i.d. generalisation',
+ 'o.o.d. generalisation',
+ 'latent-features',
+ 'hate speech'
+ ],
+
+ authors: [
+ 'Maike Züfle',
+ 'Verna Dankers',
+ 'Ivan Titov',
+
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_test.jsonl',
+ train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_train.jsonl'
+ },
+
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ best_score: 1.0,
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ },
+ {
+ hf_id: 'f1',
+ average: 'macro',
+ best_score: 1.0,
+ git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+ },
+
+ ],
+
+ preparation_strategies: {
+ // A recipe for preparing the model to perform the task by configuring its prompt.
+ // This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
+ // We provide a few options for configuring the prompt. But, the task creator can
+ // also provide a custom prompt preparation in the task's Python class.
+ finetuning: {
+ objective: 'maximum_likelihood',
+ }
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/doc.md b/src/genbench/tasks/latent_feature_splits/doc.md
new file mode 100644
index 0000000..d51a56e
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when developing a split with one model and evaluating it on another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to decide whether a social media post contains hate speech, offensive speech, or neither.
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee. It was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases that the Hate Speech Detection has, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in HateSpeech for LLMs (pretrain - test locus).
+The split is based on the feature representations of a language model, therefore we assume that the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/__init__.py b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet
new file mode 100644
index 0000000..d30afa0
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/config.jsonnet
@@ -0,0 +1,57 @@
+{
+ name: 'Latent Feature Splits (roberta_closest_split)',
+
+ // @TODO: Add a description of the task
+ description: "We split hate speech data based on the internal representations of a RoBERTa model.
+ The o.o.d. data split leads to an under-representation of parts of the latent space in the
+ model's training set, making the split more challenging than a random split.",
+
+ // @TODO: Add a list of keywords that describe the task
+ keywords: [
+ 'non-i.i.d. generalisation',
+ 'o.o.d. generalisation',
+ 'latent-features',
+ 'hate speech'
+ ],
+
+ authors: [
+ 'Maike Züfle',
+ 'Verna Dankers',
+ 'Ivan Titov',
+
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_test.jsonl',
+ train: 'https://raw.githubusercontent.com/MaikeZuefle/Latent-Feature-Splits/main/genbench_splits/hatexplain_roberta_closest_split_train.jsonl'
+ },
+
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ best_score: 1.0,
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ },
+ {
+ hf_id: 'f1',
+ average: 'macro',
+ best_score: 1.0,
+ git_commit_sha: '3a4c40f7397dcd7d9dccf0659616dc6b14072dcb',
+ },
+ ],
+
+ preparation_strategies: {
+ // A recipe for preparing the model to perform the task by configuring its prompt.
+ // This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
+ // We provide a few options for configuring the prompt. But, the task creator can
+ // also provide a custom prompt preparation in the task's Python class.
+ finetuning: {
+ objective: 'maximum_likelihood',
+ }
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md
new file mode 100644
index 0000000..0956e1d
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/doc.md
@@ -0,0 +1,52 @@
+# Hate Speech Detection (roberta_closest_split)
+
+## Abstract
+With the ever-growing presence of social media platforms comes the increased spread of harmful content and the need for robust hate speech detection systems.
+Such systems easily overfit to specific targets and keywords, and evaluating them without considering distribution shifts that might occur between train and test data overestimates their benefit.
+We challenge hate speech models via new train-test splits of existing datasets that rely on the clustering of models' hidden representations.
+We present two split variants (Subset-Sum-Split and Closest-Split) that, when applied to two datasets using four models, reveal how models catastrophically fail on blind spots in the latent space.
+This result generalises when developing a split with one model and evaluating it on another.
+Our analysis suggests that there is no clear surface-level property of the data split that correlates with the decreased performance, which underscores that task difficulty is not always humanly interpretable.
+We recommend incorporating latent feature-based splits in model development and release two splits via the GenBench benchmark.
+
+## Examples
+{"input": "wow do not all speak at once niggers", "target": 0, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "how long will jews be a majority in israel", "target": 1, "target_options": ["hate", "noHate", "offensive"]}
+{"input": "sounds like something a moslem would do", "target": 2, "target_options": ["hate", "noHate", "offensive"]}
+
+## Usage
+For this task, the model has to decide whether a social media post contains hate speech, offensive speech, or neither.
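+
+As a minimal sketch of how this subtask could be scored through the GenBench interface (the subtask id and the toy data below are assumptions; gold examples follow the jsonl schema shown above and predictions carry a `target` field):
+
+```python
+from datasets import Dataset
+from genbench import load_task
+
+# Load the TaskDict and pick the subtask (id assumed from the directory layout).
+task = load_task("latent_feature_splits")["roberta_closest_split"]
+
+# Toy gold data and predictions, for illustration only.
+gold = Dataset.from_list([
+ {"input": "example post", "target": 1, "target_options": ["hate", "noHate", "offensive"]},
+])
+predictions = [{"target": 1}]
+
+print(task.evaluate_predictions(predictions=predictions, gold=gold))
+```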
+
+## Data Source
+The dataset was published in `HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection` by Binny Mathew, Punyajoy Saha,
+Seid Muhie Yimam, Chris Biemann, Pawan Goyal and Animesh Mukherjee. It was accepted at AAAI 2021.
+
+It is licensed under the MIT License:
+
+Copyright (c) 2020 Punyajoy Saha
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+## Limitations and Bias
+*Note any known limitations or biases that the Hate Speech Detection has, with links and references if possible.*
+
+## GenBench Eval card
+This method can be used to test generalisation in hate speech detection for LLMs (pretrain - test locus).
+The split is based on the feature representations of a language model; we therefore assume the shift is a covariate shift. The method assesses the robustness of language models and how well they generalise in out-of-distribution settings.
+![GenBench Eval Card](eval_card.png)
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png
new file mode 100644
index 0000000..5a6877d
Binary files /dev/null and b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/eval_card.png differ
diff --git a/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py
new file mode 100644
index 0000000..c6ec3fc
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/roberta_closest_split/task.py
@@ -0,0 +1,99 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import datasets
+import evaluate
+
+from genbench import Task
+from genbench.api import TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class LatentFeatureSplitRobertaClosestSplit(Task):
+ def evaluate_predictions(
+ self,
+ *,
+ predictions: List[Mapping[str, Any]] = None,
+ gold: datasets.Dataset = None,
+ ) -> OrderedDict[str, float]:
+ """Evaluate the predictions of the model against the gold data.
+
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted
+ values for an example. The keys are strings and the values can be any type.
+ gold: A HuggingFace `datasets.Dataset` object containing the ground truth data for the task.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+
+ Raises:
+ ValueError: If a metric returns None.
+ """
+ result = OrderedDict()
+ for metric_config in self.config.evaluation_metrics:
+ hf_id = metric_config.hf_id
+ if isinstance(hf_id, str):
+ hf_id = [hf_id]
+
+ metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+ refs_lst = [g["target"] for g in gold]
+ preds_lst = [pred["target"] for pred in predictions]
+
+ ref_type = type(refs_lst[0])
+ pred_type = type(preds_lst[0])
+ if pred_type != ref_type:
+ if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+ raise ValueError(
+ f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+ )
+ # Convert predictions to the same type as the references
+ if pred_type == str and ref_type == int:
+ logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+ converted_preds = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ preds_lst = converted_preds
+ elif pred_type == int and ref_type == str:
+ logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ refs_lst = converted_refs
+ else:
+ if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+ # Convert both predictions and references to int
+ logger.warning(
+ "Predictions and references have the same type, but it is not int. Converting both to int."
+ )
+ converted_preds = []
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ preds_lst = converted_preds
+ refs_lst = converted_refs
+
+ extra_kwargs = metric_config.compute_extra_kwargs or {}
+ output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+ if output is None:
+ raise ValueError(
+ f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+ )
+
+ # Update output keys to include the metric id
+ metric_id = "_".join(hf_id)
+ output = {f"hf_{metric_id}__{k}": v for k, v in output.items()}
+
+ result.update(output)
+
+ return result
diff --git a/src/genbench/tasks/latent_feature_splits/test_hatespeech.py b/src/genbench/tasks/latent_feature_splits/test_hatespeech.py
new file mode 100644
index 0000000..523cad1
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/test_hatespeech.py
@@ -0,0 +1,8 @@
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+task = load_task("latent_feature_splits:bert_closest_split")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+print(ds)
+print(ds["test"][0])
diff --git a/src/genbench/tasks/latent_feature_splits/usage_example.py b/src/genbench/tasks/latent_feature_splits/usage_example.py
new file mode 100644
index 0000000..8aef633
--- /dev/null
+++ b/src/genbench/tasks/latent_feature_splits/usage_example.py
@@ -0,0 +1,91 @@
+import os
+
+import evaluate
+import numpy as np
+from datasets import DatasetDict
+from transformers import (
+ AutoModelForSequenceClassification,
+ AutoTokenizer,
+ DataCollatorWithPadding,
+ Trainer,
+ TrainingArguments,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+def compute_metrics(eval_preds):
+ metric = evaluate.load("f1")
+ logits, labels = eval_preds
+ predictions = np.argmax(logits, axis=-1)
+ return metric.compute(predictions=predictions, references=labels, average="macro")
+
+
+def main(split_name, num_labels, bsz, lr, epochs, checkpoint):
+ """
+ Basic functionality to load data, train and evaluate the model.
+ Args:
+ - split_name: str (bert_closest_split | roberta_closest_split)
+ - num_labels (int)
+ - bsz (int): batch size
+ - lr (float): learning rate
+ - epochs (int): number of epochs
+ - checkpoint (str): should be a valid HF model name
+ """
+
+ def tokenize_function(example):
+ return tokenizer(example["input"])
+
+ # Convert GenBench format to HF dataset format, get devset, preview dataset
+ task = load_task(f"latent_feature_splits:{split_name}")
+ ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+ ds_split = ds["train"].train_test_split(0.1)
+ ds = DatasetDict({"train": ds_split["train"], "validation": ds_split["test"], "test": ds["test"]})
+ ds = ds.rename_column("target", "label")
+ print(ds)
+
+ # Load and preprocess data
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ tokenized_datasets = ds.map(tokenize_function, batch_size=bsz, batched=True)
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ # Load model and HF trainer, WITH evaluation during training
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
+ training_args = TrainingArguments(
+ "test-trainer",
+ learning_rate=lr,
+ num_train_epochs=epochs,
+ per_device_train_batch_size=bsz,
+ per_device_eval_batch_size=bsz,
+ evaluation_strategy="epoch",
+ )
+ trainer = Trainer(
+ model,
+ training_args,
+ train_dataset=tokenized_datasets["train"],
+ eval_dataset=tokenized_datasets["validation"],
+ data_collator=data_collator,
+ tokenizer=tokenizer,
+ compute_metrics=compute_metrics,
+ )
+
+ # Evaluate for random performance level, train, evaluate again
+ predictions = trainer.predict(tokenized_datasets["test"])
+ f1_pre = compute_metrics((predictions.predictions, predictions.label_ids))
+ trainer.train()
+ predictions = trainer.predict(tokenized_datasets["test"])
+ f1_post = compute_metrics((predictions.predictions, predictions.label_ids))
+ print(f"Random f1: {f1_pre}, f1 post-training: {f1_post}")
+
+
+if __name__ == "__main__":
+ os.environ["WANDB_DISABLED"] = "true"
+ split_name = "bert_closest_split"
+ num_labels = 3
+ batch_size = 16
+ lr = 2e-5
+ epochs = 5
+ checkpoint = "bert-base-uncased"
+
+ main(split_name, num_labels, batch_size, lr, epochs, checkpoint)
diff --git a/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf b/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf
new file mode 100644
index 0000000..3d4e16e
Binary files /dev/null and b/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf differ
diff --git a/src/genbench/tasks/nl_codesearch_clf/__init__.py b/src/genbench/tasks/nl_codesearch_clf/__init__.py
new file mode 100644
index 0000000..b8d3157
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class NlCodesearchClf(TaskDict):
+ pass
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet
new file mode 100644
index 0000000..09feac6
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet
@@ -0,0 +1,58 @@
+{
+ name: 'Natural Language Codesearch Classification (codesearchnet_adv)',
+
+ description: 'Natural Language Codesearch Classification (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'python',
+ 'robustness',
+ 'covariate shift',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and a Python programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md
new file mode 100644
index 0000000..8193db4
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_adv)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_adv).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_adv).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_adv).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_adv) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
new file mode 100644
index 0000000..4e77608
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetAdv(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pairs and the added distractors.
+ For this subtask, negative samples are added to both the train and test splits.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test" or split == "train":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet
new file mode 100644
index 0000000..01715cb
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet
@@ -0,0 +1,56 @@
+{
+ name: 'Natural Language Codesearch Classification (codesearchnet_go)',
+
+ description: 'Natural Language Codesearch Classification (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'go',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and a Go programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md
new file mode 100644
index 0000000..aa3720e
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_go)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_go).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_go).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_go).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_go) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
new file mode 100644
index 0000000..9b880ec
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetGo(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet
new file mode 100644
index 0000000..1ea6599
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet
@@ -0,0 +1,56 @@
+{
+ name: 'Natural Language Codesearch Classification (codesearchnet_java)',
+
+ description: 'Natural Language Codesearch Classification (codesearchnet_java) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'java',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_java.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and a Java programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md
new file mode 100644
index 0000000..16abaa2
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_java)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_java).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_java).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_java).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_java) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
new file mode 100644
index 0000000..292e74c
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetJava(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/config.jsonnet
new file mode 100644
index 0000000..f61ade9
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/config.jsonnet
@@ -0,0 +1,56 @@
+{
+ name: 'Natural Language Codesearch Classification (codesearchnet_javascript)',
+
+ description: 'Natural Language Codesearch Classification (codesearchnet_javascript) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'javascript',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_javascript.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and a JavaScript programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/doc.md
new file mode 100644
index 0000000..86806bc
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_javascript)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_javascript).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_javascript).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_javascript).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_javascript) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py
new file mode 100644
index 0000000..5e201a4
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetJavascript(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/config.jsonnet
new file mode 100644
index 0000000..c4f0b9d
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/config.jsonnet
@@ -0,0 +1,55 @@
+{
+ name: 'Natural Language Codesearch Classification (codesearchnet_php)',
+
+ description: 'Natural Language Codesearch Classification (codesearchnet_php) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'php',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_php.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and a PHP programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/doc.md
new file mode 100644
index 0000000..024058f
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_php)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_php).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_php).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_php).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_php) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py
new file mode 100644
index 0000000..1378ff0
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetPhp(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/config.jsonnet
new file mode 100644
index 0000000..98d7a1e
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/config.jsonnet
@@ -0,0 +1,56 @@
+{
+ name: 'Natural Language Codesearch Classification (codesearchnet_ruby)',
+
+ description: 'Natural Language Codesearch Classification (codesearchnet_ruby) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'ruby',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_ruby.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and a Ruby programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/doc.md
new file mode 100644
index 0000000..012e885
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_ruby)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_ruby).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_ruby).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_ruby).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_ruby) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py
new file mode 100644
index 0000000..7f4db9b
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetRuby(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/config.jsonnet
new file mode 100644
index 0000000..3881142
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/config.jsonnet
@@ -0,0 +1,35 @@
+{
+ name: 'Natural Language Codesearch Classification',
+
+ // @TODO: Add a description of the task
+ description: 'Natural Language Codesearch Classification aims to measure the generalization capabilities of language models in code understanding using binary classification as an evaluation task. It includes multiple subtasks to measure three different types of generalization',
+
+ // @TODO: Add a list of keywords that describe the task
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+
+ ],
+
+ subtasks_order: [
+ 'codesearchnet_adv',
+ 'cosqa',
+ 'codesearchnet_ruby',
+ 'codesearchnet_go',
+ 'codesearchnet_java',
+ 'codesearchnet_javascript',
+ 'codesearchnet_php',
+ 'statcodesearch',
+
+ ],
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/cosqa/__init__.py b/src/genbench/tasks/nl_codesearch_clf/cosqa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/cosqa/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/cosqa/config.jsonnet
new file mode 100644
index 0000000..5e20f63
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/cosqa/config.jsonnet
@@ -0,0 +1,57 @@
+{
+ name: 'Natural Language Codesearch Classification (cosqa)',
+
+ description: 'Natural Language Codesearch Classification (cosqa) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'python',
+ 'robustness',
+ 'covariate shift'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_cosqa.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a search query and a Python programming language code snippet, determine if the query accurately represents the function of the code. Respond with True if the code matches the query and False if it does not. The input format is defined as query [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/cosqa/doc.md b/src/genbench/tasks/nl_codesearch_clf/cosqa/doc.md
new file mode 100644
index 0000000..8973fdb
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/cosqa/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (cosqa)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (cosqa).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (cosqa).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (cosqa).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (cosqa) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py b/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py
new file mode 100644
index 0000000..7d1c292
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCosqa(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/doc.md b/src/genbench/tasks/nl_codesearch_clf/doc.md
new file mode 100644
index 0000000..18fc30b
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/doc.md
@@ -0,0 +1,43 @@
+## Motivation
+Language models can serve as a valuable tool for software developers to increase productivity. Large generative models can be used for code generation and code completion, while smaller encoder-only models are capable of performing code search tasks using natural language queries. These capabilities are heavily influenced by the quality and diversity of the available training data. Source code datasets used for training usually focus on the most popular languages, and testing is mostly conducted on the same distributions, often overlooking low-resource programming languages. Motivated by the NLP generalisation taxonomy proposed by Hupkes et al., we propose a new benchmark dataset called [placeholder] which builds upon existing natural language code search datasets to systematically study the code understanding generalization capabilities of language models. For evaluation and comparison, we collect several baseline results using fine-tuned BERT-style models and GPT-style large language models in a zero-shot setting.
+
+## Examples
+Given a natural language comment or search query and a code snippet, the model must determine whether the snippet matches the description (binary classification).
+
+**match**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def set_sampled_topics ( self , sampled_topics ) : assert sampled_topics . dtype == np . int and len ( sampled_topics . shape ) <= 2 if len ( sampled_topics . shape ) == 1 : self . sampled_topics = sampled_topics . reshape ( 1 , sampled_topics . shape [ 0 ] ) else : self . sampled_topics = sampled_topics self . samples = self . sampled_topics . shape [ 0 ] self . tt = self . tt_comp ( self . sampled_topics ) self . dt = self . dt_comp ( self . sampled_topics )", "target": 1, "target_options": ["no_match", "match"]} \
+**no_match**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def _resolve_entity ( mo ) : ent = mo . group ( \"entity\" ) s = mo . group ( ) if s . startswith ( '' ) : if s [ 2 ] in 'xX' : radix = 16 else : radix = 10 try : num = int ( ent , radix ) except ( ValueError , OverflowError ) : return u'' else : num = name2codepoint . get ( ent ) if num is None or num < 0 : # unknown entity -> ignore return u'' try : return unichr ( num ) except ValueError : return u''", "target": 0, "target_options": ["no_match", "match"]}
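+
+Each subtask can be loaded through the GenBench API. Below is a minimal sketch (shown for the `codesearchnet_adv` subtask; the other subtasks follow the same pattern) that prepares the data for finetuning and prints one test pair:
+
+```python
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+# Load the codesearchnet_adv subtask and prepare it for finetuning
+task = load_task("nl_codesearch_clf:codesearchnet_adv")
+ds = task.get_prepared_datasets(PreparationStrategy.FINETUNING)
+print(ds["test"][0])
+```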
+
+## Data Source
+**CodeSearchNet**: original dataset first published in https://arxiv.org/pdf/1909.09436.pdf , with the Java, JavaScript, Go, Ruby and PHP subsets collected from the huggingface-hub \
+**CodeSearchNet Adv**: a processed version of the CodeSearchNet Python dataset, introduced in the CodeXGLUE benchmark suite https://github.com/microsoft/CodeXGLUE \
+**CoSQA**: Python code snippets from the CodeSearchNet dataset paired with real-world user search engine queries, introduced in https://arxiv.org/pdf/2105.13239.pdf \
+**StatCodeSearch**: R code-comment pair snippets, scraped and extracted from public projects on the Open Science Framework (OSF) by the submission authors
+
+For each comment in each subset, we randomly sampled another code snippet from the same subset to create a fully balanced binary classification dataset.
+The dataset statistics below only count the positive (matching) pairs.
+
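+A minimal sketch of this pairing step, mirroring the `get_dataset_raw` implementations of the subtasks in this submission (the toy items below are made up for illustration):
+
+```python
+# Two hypothetical positive pairs in the "comment [CODESPLIT] code" input format
+item_a = {"input": "sorts a list [CODESPLIT] def sort_list(x): return sorted(x)", "target": 1, "target_options": ["no_match", "match"]}
+item_b = {"input": "reads a file [CODESPLIT] def read_file(p): return open(p).read()", "target": 1, "target_options": ["no_match", "match"]}
+
+# Negative sample: the comment of item_a combined with the code of a randomly chosen other item
+comment_a = item_a["input"].split("[CODESPLIT]")[0]
+code_b = item_b["input"].split("[CODESPLIT]")[1]
+negative = {"input": comment_a + "[CODESPLIT]" + code_b, "target": 0, "target_options": item_a["target_options"]}
+print(negative)
+```
+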
+**Dataset Size**:\
+*Finetuning set:* \
+ -CodeSearchNet Adv train set 251820 \
+*Test sets:* \
+ -CodeSearchNet Adv test set 19210 \
+ -CoSQA 10293\
+ -CodeSearchNet Ruby 2279\
+ -CodeSearchNet Go 14291\
+ -CodeSearchNet Java 26909\
+ -CodeSearchNet Javascript 6483\
+ -CodeSearchNet PHP 29391\
+ -StatCodeSearch 1070 \
+ -Combined test set 109926
+
+## Limitations and Bias
+TBD
+
+## Citation
+TBD
+
+## Further References
+Husain, H., Wu, H. H., Gazit, T., Allamanis, M., & Brockschmidt, M. (2019). Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436.
+
+Lu, S., Guo, D., Ren, S., Huang, J., Svyatkovskiy, A., Blanco, A., & Liu, S. (2021, June). CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).
+
+Huang J., Tang D., Shou L., Gong M., Xu K., Jiang D., Zhou M., Duan N. (2021) CoSQA: 20,000+ web queries for code search and question answering. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing.
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt b/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt
new file mode 100644
index 0000000..b9e2d8a
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt
@@ -0,0 +1,5 @@
+torch==2.1.0
+numpy==1.25.1
+tqdm==4.65.0
+transformers==4.32.0
+scikit-learn==1.3.0
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/__init__.py b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/config.jsonnet
new file mode 100644
index 0000000..bd6eb74
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/config.jsonnet
@@ -0,0 +1,57 @@
+{
+ name: 'Natural Language Codesearch Classification (statcodesearch)',
+
+ description: 'Natural Language Codesearch Classification (statcodesearch) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual and domain generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'binary classification',
+ 'r',
+ 'cross-lingual',
+ 'domain-shift'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_statcodesearch.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ prompt_based_testing: {
+ prompt_builder: {
+ instruction_zero_shot: 'Given a code comment and an R programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+ input_prefix: '',
+ output_prefix: '',
+ choices_prefix: '',
+ append_choices_to_input: false,
+ }
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/doc.md b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/doc.md
new file mode 100644
index 0000000..0826a5c
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (statcodesearch)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (statcodesearch).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (statcodesearch).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (statcodesearch).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (statcodesearch) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py
new file mode 100644
index 0000000..5134760
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfStatcodesearch(Task):
+ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding a negative sample for each code comment/query
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
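+
+
+# Minimal usage sketch (an assumption, not part of the task API): the load_task entry
+# point is the one used in usage_example.py, and true pairs are assumed to keep target 1.
+#
+#   from genbench import load_task
+#   splits = load_task("nl_codesearch_clf:statcodesearch").get_dataset_raw()
+#   # splits["test"] alternates each true comment-code pair with one mismatched pair
+#   # (target 0) built from the same comment and a randomly sampled code snippet.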
diff --git a/src/genbench/tasks/nl_codesearch_clf/usage_example.py b/src/genbench/tasks/nl_codesearch_clf/usage_example.py
new file mode 100644
index 0000000..9641473
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/usage_example.py
@@ -0,0 +1,331 @@
+import argparse
+import json
+import logging
+
+import torch
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, PreTrainedModel, get_scheduler
+
+from genbench import load_task
+
+
+##########################################################
+# Data Loading Utils
+##########################################################
+class Dataset(torch.utils.data.Dataset):
+ def __init__(self, features):
+ self.features = features
+
+ def __getitem__(self, index):
+ return self.features[index]
+
+ def __len__(self):
+ return len(self.features)
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
+
+
+def _convert_examples_to_features(
+ comments,
+ codes,
+ labels,
+ max_seq_length,
+ tokenizer,
+ cls_token="[CLS]",
+ sep_token="[SEP]",
+ pad_token=0,
+ eos_token="",
+ sequence_a_segment_id=0,
+ sequence_b_segment_id=1,
+ cls_token_segment_id=1,
+ pad_token_segment_id=0,
+ mask_padding_with_zero=True,
+):
+ features = []
+ for ex_index, (comment, code, label) in enumerate(zip(comments, codes, labels)):
+ # As was done in CodeBERT
+ tokens_comment = tokenizer.tokenize(comment)[:50]
+ tokens_code = tokenizer.tokenize(code)
+
+ # update max_seq_length to account for [CLS], [SEP], [SEP] tokens (-3)
+ n_special_tokens = 3
+ if cls_token is None:
+ n_special_tokens -= 1
+ s_max_seq_length = max_seq_length - n_special_tokens
+ _truncate_seq_pair(tokens_comment, tokens_code, s_max_seq_length)
+
+ # change sep for eos if no sep_token
+ if sep_token is None:
+ sep_token = eos_token
+
+ # [SEP] inbetween and at the end
+ tokens = tokens_comment + [sep_token] + tokens_code + [sep_token]
+ # CLS at the beginning
+ if cls_token is not None:
+ tokens = [cls_token] + tokens
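+ # resulting layout (when both special tokens are set): [CLS] comment tokens [SEP] code tokens [SEP], padded below to max_seq_length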
+
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # 1 for tokens, 0 for padding
+ input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+ # padding with 0 up to max_seq_length
+ padding_length = max_seq_length - len(input_ids)
+ input_ids = input_ids + ([pad_token] * padding_length)
+ input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+ # check
+ assert len(input_ids) == max_seq_length
+ assert len(input_mask) == max_seq_length
+
+ # convert to tensors
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
+ input_mask = torch.tensor(input_mask, dtype=torch.long)
+ label = torch.tensor(label, dtype=torch.long)
+
+ features.append({"input_ids": input_ids, "attention_mask": input_mask, "labels": label})
+ return features
+
+
+def load_data(tokenizer, batch_size, seq_len, train_file, is_train):
+ # create dataset
+ comments = []
+ codes = []
+ labels = []
+ skipped = 0
+
+ is_sep_token_set = tokenizer.sep_token is not None
+ is_cls_token_set = tokenizer.cls_token is not None
+ is_pad_token_set = tokenizer.pad_token is not None
+ is_eos_token_set = tokenizer.eos_token is not None
+
+ for split, dataset in train_file.items():
+ if is_train and split == "test":
+ continue
+ if not is_train and split == "train":
+ continue
+ for sample in dataset:
+ try:
+ input = sample["input"]
+ # split at [CODESPLIT] token
+ input = input.split("[CODESPLIT]")
+ if len(input) != 2:
+ # skip samples that do not split into exactly one comment and one code part
+ logging.warning(f"Input does not contain exactly one [CODESPLIT] token: {input}")
+ skipped += 1
+ continue
+ # skip every sample that contains special tokens
+ if is_sep_token_set and (tokenizer.sep_token in input[0] or tokenizer.sep_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_cls_token_set and (tokenizer.cls_token in input[0] or tokenizer.cls_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_pad_token_set and (tokenizer.pad_token in input[0] or tokenizer.pad_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_eos_token_set and (tokenizer.eos_token in input[0] or tokenizer.eos_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ comments.append(input[0])
+ codes.append(input[1])
+ labels.append(sample["target"])
+ except json.JSONDecodeError as e:
+ print(f"Error: JSON decoding failed - {e}")
+ continue
+ logging.info(f"Skipped {skipped} samples due to special tokens")
+ # tokenize
+ features = _convert_examples_to_features(
+ comments,
+ codes,
+ labels,
+ max_seq_length=seq_len,
+ tokenizer=tokenizer,
+ cls_token=tokenizer.cls_token,
+ sep_token=tokenizer.sep_token,
+ cls_token_segment_id=tokenizer.cls_token_id,
+ pad_token_segment_id=tokenizer.pad_token_id,
+ eos_token=tokenizer.eos_token,
+ )
+
+ # Convert to Dataset
+ features = Dataset(features)
+
+ return DataLoader(features, batch_size=batch_size, shuffle=True)
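+
+
+# Note: `train_file` is the split dictionary returned by a task's get_dataset_raw().
+# A hypothetical call for evaluation data:
+#   data = load_task("nl_codesearch_clf:statcodesearch").get_dataset_raw()
+#   test_loader = load_data(tokenizer, 32, 512, data, is_train=False)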
+
+
+##############################################################
+# Fine-tune Model
+##############################################################
+
+
+def train(model: PreTrainedModel, dataloader: DataLoader, args: argparse.Namespace):
+ """
+ Fine-tune the model.
+ :param model: the pretrained model to be fine-tuned
+ :param dataloader: an iterable data loader
+ :param args: training arguments (and also some other arguments)
+ :return: the fine-tuned model
+ """
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ device = "cpu"
+ model.to(device)
+ model.train()
+
+ num_training_steps = args.epochs * len(dataloader)
+ progress_bar = tqdm(range(num_training_steps))
+
+ optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
+ lr_scheduler = get_scheduler(
+ name="linear",
+ optimizer=optimizer,
+ num_warmup_steps=args.num_warmup_steps,
+ num_training_steps=num_training_steps,
+ )
+
+ for epoch in range(args.epochs):
+ for batch in dataloader:
+ batch = {k: v.to(device) for k, v in batch.items()}
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+
+
+###########################################################
+# Evaluate Model
+###########################################################
+
+
+def clf(model, dataloader, args):
+ """Predict on test set."""
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+ model.eval()
+ predictions = []
+ labels = []
+ logging.info("Evaluating...")
+ for batch in tqdm(dataloader):
+ batch = {k: v.to(device) for k, v in batch.items()}
+ with torch.no_grad():
+ outputs = model(**batch)
+ predictions.extend(outputs.logits.argmax(-1).cpu().numpy().tolist())
+ labels.extend(batch["labels"].cpu().numpy().tolist())
+
+ metrics = {}
+ # calc metrics
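+ # precision, recall and f1 below rely on scikit-learn's binary-average defaults (positive label 1)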
+
+ # calc accuracy
+ accuracy = accuracy_score(labels, predictions)
+ metrics["accuracy"] = accuracy
+
+ # calc precision
+ precision = precision_score(labels, predictions)
+ metrics["precision"] = precision
+
+ # calc recall
+ recall = recall_score(labels, predictions)
+ metrics["recall"] = recall
+
+ # calc f1
+ f1 = f1_score(labels, predictions)
+ metrics["f1"] = f1
+
+ return metrics
+
+
+##############################################################
+# Run example
+##############################################################
+
+
+def main():
+ """Main function."""
+ # args
+ parser = argparse.ArgumentParser()
+ # parser.add_argument('--dataset', type=str, default='./codesearchnet_adv')
+ parser.add_argument("--model", default="roberta-base")
+ parser.add_argument("--epochs", type=int, default=5)
+ parser.add_argument("--batch_size", type=int, default=32)
+ parser.add_argument("--learning_rate", type=float, default=2e-5)
+ parser.add_argument("--weight_decay", type=float, default=0.01)
+ parser.add_argument("--num_warmup_steps", type=int, default=0)
+ parser.add_argument("--output_dir", type=str, default="models")
+ parser.add_argument("--seq_len", type=int, default=512, help="maximum sequence length")
+ # parser.add_argument("--distractors", type=int, default=99, help="number of distractors per true pair")
+ parser.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO")
+
+ args = parser.parse_args()
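+ # example invocation using the defaults above:
+ #   python usage_example.py --model roberta-base --epochs 5 --batch_size 32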
+
+ TRAIN_FILE = load_task("nl_codesearch_clf:codesearchnet_adv").get_dataset_raw()
+
+ # logging
+ logging.basicConfig(level=args.log_level)
+
+ # load tokenizer
+ logging.info("Loading model...")
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+ # load data
+ logging.info("Loading data...")
+ dataloader = load_data(tokenizer, args.batch_size, args.seq_len, TRAIN_FILE, True)
+
+ model = AutoModelForSequenceClassification.from_pretrained(args.model)
+
+ # train
+ logging.info("Training...")
+ train(model, dataloader, args)
+
+ # save model
+ logging.info("Saving model...")
+ model.save_pretrained(f"{args.output_dir}/{args.model}")
+ # also save the tokenizer
+ tokenizer.save_pretrained(f"{args.output_dir}/{args.model}")
+
+ TEST_FILES = [
+ ["codesearchnetadv", load_task("nl_codesearch_clf:codesearchnet_adv").get_dataset_raw()],
+ ["codesearchnet_ruby", load_task("nl_codesearch_clf:codesearchnet_ruby").get_dataset_raw()],
+ ["codesearchnet_go", load_task("nl_codesearch_clf:codesearchnet_go").get_dataset_raw()],
+ ["codesearchnet_java", load_task("nl_codesearch_clf:codesearchnet_java").get_dataset_raw()],
+ ["codesearchnet_javascript", load_task("nl_codesearch_clf:codesearchnet_javascript").get_dataset_raw()],
+ ["codesearchnet_php", load_task("nl_codesearch_clf:codesearchnet_php").get_dataset_raw()],
+ ["cosqa", load_task("nl_codesearch_clf:cosqa").get_dataset_raw()],
+ ["statcodesearch", load_task("nl_codesearch_clf:statcodesearch").get_dataset_raw()],
+ ]
+
+ results = {}
+ for file in TEST_FILES:
+ logging.info(f"Evaluating on {file[0]}...")
+ dataloader = load_data(tokenizer, args.batch_size, args.seq_len, file[1], False)
+ metrics = clf(model, dataloader, args)
+ results[file[0]] = metrics
+ logging.info(f"Test results for {file[0]}: {metrics}")
+
+ logging.info(f"Test results: {results}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/genbench/tasks/nl_codesearch_mrr/GenBench Evaluation Card.pdf b/src/genbench/tasks/nl_codesearch_mrr/GenBench Evaluation Card.pdf
new file mode 100644
index 0000000..3d4e16e
Binary files /dev/null and b/src/genbench/tasks/nl_codesearch_mrr/GenBench Evaluation Card.pdf differ
diff --git a/src/genbench/tasks/nl_codesearch_mrr/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/__init__.py
new file mode 100644
index 0000000..85a91e5
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class NlCodesearchMrr(TaskDict):
+ pass
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet
new file mode 100644
index 0000000..4272171
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_adv)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'python',
+ 'robustness',
+ 'covariate shift',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md
new file mode 100644
index 0000000..901fc56
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_adv)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_adv).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_adv).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_adv).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_adv) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py
new file mode 100644
index 0000000..52535c5
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py
@@ -0,0 +1,127 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
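+
+ Example (illustrative): list(chunked([1, 2, 3, 4, 5], 2)) yields [[1, 2], [3, 4], [5]].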
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetAdv(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split is augmented with one negative (mismatched comment-code) pair per original pair.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create distractors for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ # Create negative samples for training
+ elif split == "train":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select one other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet
new file mode 100644
index 0000000..990651b
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_go)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'go',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md
new file mode 100644
index 0000000..8bbf5c3
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_go)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_go).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_go).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_go).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_go) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py
new file mode 100644
index 0000000..beff8ca
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetGo(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractor pairs for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/config.jsonnet
new file mode 100644
index 0000000..e97580e
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_java)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_java) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'java',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_java.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/doc.md
new file mode 100644
index 0000000..a18ffab
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_java)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_java).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_java).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_java).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_java) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/task.py
new file mode 100644
index 0000000..b5ec8e0
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetJava(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractor pairs for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/config.jsonnet
new file mode 100644
index 0000000..3a691cb
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_javascript)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_javascript) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'javascript',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_javascript.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/doc.md
new file mode 100644
index 0000000..6b56758
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_javascript)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_javascript).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_javascript).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_javascript).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_javascript) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/task.py
new file mode 100644
index 0000000..aeb2056
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetJavascript(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractor pairs for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/config.jsonnet
new file mode 100644
index 0000000..3f12d27
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_php)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_php) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'php',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_php.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/doc.md
new file mode 100644
index 0000000..9fd3043
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_php)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_php).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_php).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_php).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_php) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/task.py
new file mode 100644
index 0000000..797855b
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetPhp(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractor pairs for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/config.jsonnet
new file mode 100644
index 0000000..e3d7582
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_ruby)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_ruby) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'ruby',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_ruby.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/doc.md
new file mode 100644
index 0000000..a0e0efb
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_ruby)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_ruby).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_ruby).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_ruby).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_ruby) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/task.py
new file mode 100644
index 0000000..f2525c1
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetRuby(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractor pairs for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/config.jsonnet
new file mode 100644
index 0000000..ee9854d
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/config.jsonnet
@@ -0,0 +1,29 @@
+{
+ name: 'Natural Language Codesearch Ranking',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ subtasks_order: [
+ 'codesearchnet_adv',
+ 'cosqa',
+ 'codesearchnet_ruby',
+ 'codesearchnet_go',
+ 'codesearchnet_java',
+ 'codesearchnet_javascript',
+ 'codesearchnet_php',
+ 'statcodesearch',
+ ],
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/cosqa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/cosqa/config.jsonnet
new file mode 100644
index 0000000..846e115
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/cosqa/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'Natural Language Codesearch Ranking (cosqa)',
+
+ description: 'Natural Language Codesearch Ranking (cosqa) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'python',
+ 'robustness',
+ 'covariate shift',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_cosqa.jsonl',
+    train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/doc.md b/src/genbench/tasks/nl_codesearch_mrr/cosqa/doc.md
new file mode 100644
index 0000000..e31666d
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/cosqa/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (cosqa)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task, Natural Language Codesearch Ranking (cosqa), here.*
+
+## Examples
+*Give some examples of Natural Language Codesearch Ranking (cosqa).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for Natural Language Codesearch Ranking (cosqa).*
+
+## Limitations and Bias
+*Note any known limitations or biases that Natural Language Codesearch Ranking (cosqa) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/task.py b/src/genbench/tasks/nl_codesearch_mrr/cosqa/task.py
new file mode 100644
index 0000000..64b959e
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/cosqa/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCosqa(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create distractors for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+                    # Exclude the current item so it is never sampled as its own distractor
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
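+                    # Each distractor keeps the original comment but takes the code part of a
+                    # randomly sampled item, so its target is 0 (non-matching).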
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+            n_distractors: Number of distractor comment-code pairs for each true pair.
+                           Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
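+        # Each chunk starts with the true pair; its rank is the number of scores in the chunk
+        # that are >= its own score (ties count against it), and MRR averages 1/rank over chunks.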
+
+        for batch in batched_predictions:
+            correct_score = batch[0]["score"]
+            scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/doc.md b/src/genbench/tasks/nl_codesearch_mrr/doc.md
new file mode 100644
index 0000000..3cf5ad0
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/doc.md
@@ -0,0 +1,43 @@
+## Motivation
+Language models can serve as a valuable tool for software developers to increase productivity. Large generative models can be used for code generation and code completion, while smaller encoder-only models are capable of performing code search tasks using natural language queries. These capabilities are heavily influenced by the quality and diversity of the available training data. Source code datasets used for training usually focus on the most popular languages, and testing is mostly conducted on the same distributions, often overlooking low-resource programming languages. Motivated by the NLP generalisation taxonomy proposed by Hupkes et al., we propose a new benchmark dataset called [placeholder] which builds upon existing natural language code search datasets to systematically study the code understanding generalization capabilities of language models. For evaluation and comparison, we collect several baseline results using fine-tuned BERT-style models and GPT-style large language models in a zero-shot setting.
+
+## Examples
+Given n code-comment pairs (1 true pair and n-1 distractor pairs, where a comment has been matched with a random code snippet), calculate the MRR score.
+
+**true sample**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def set_sampled_topics ( self , sampled_topics ) : assert sampled_topics . dtype == np . int and len ( sampled_topics . shape ) <= 2 if len ( sampled_topics . shape ) == 1 : self . sampled_topics = sampled_topics . reshape ( 1 , sampled_topics . shape [ 0 ] ) else : self . sampled_topics = sampled_topics self . samples = self . sampled_topics . shape [ 0 ] self . tt = self . tt_comp ( self . sampled_topics ) self . dt = self . dt_comp ( self . sampled_topics )", "target": 1, "target_options": ["no_match", "match"]} \
+**distractor sample**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def _resolve_entity ( mo ) : ent = mo . group ( \"entity\" ) s = mo . group ( ) if s . startswith ( '' ) : if s [ 2 ] in 'xX' : radix = 16 else : radix = 10 try : num = int ( ent , radix ) except ( ValueError , OverflowError ) : return u'' else : num = name2codepoint . get ( ent ) if num is None or num < 0 : # unknown entity -> ignore return u'' try : return unichr ( num ) except ValueError : return u''", "target": 0, "target_options": ["no_match", "match"]}
+
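+The ranking step can be sketched as follows (a minimal illustration with hypothetical scores; it mirrors the chunk-wise rank computation in `evaluate_predictions`, where ties count against the true pair):
+
+```python
+# One chunk: the true pair comes first, followed by its n distractors.
+scores = [0.91, 0.40, 0.73, 0.12]            # hypothetical model scores, true pair first
+rank = sum(s >= scores[0] for s in scores)   # rank of the true pair within its chunk -> 1
+reciprocal_rank = 1.0 / rank                 # MRR is the mean of this value over all chunks
+```
+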
+## Data Source
+**CodeSearchNet** : original dataset first published in https://arxiv.org/pdf/1909.09436.pdf ; Java, JavaScript, Go, Ruby, and PHP subsets collected from the Hugging Face Hub \
+**CodeSearchNet Adv** : a processed version of the CodeSearchNet Python dataset, introduced in the CodeXGLUE benchmark suite https://github.com/microsoft/CodeXGLUE \
+**CoSQA** : Python code snippets from the CodeSearchNet dataset paired with real-world user search engine queries, introduced in https://arxiv.org/pdf/2105.13239.pdf \
+**StatCodeSearch** : R code-comment pair snippets, scraped and extracted from public projects on the Open Science Framework (OSF) by the submission authors
+
+For each comment in each subset we randomly sampled another code snippet from the given subset to create a fully balanced binary classification dataset. \
+For the dataset statistics we only consider the positive (matching) pairs. \
+
+**Dataset Size**: \
+*Finetuning set:* \
+ -CodeSearchNet Adv train set 251820 \
+*Test sets:* \
+ -CodeSearchNet Adv test set 19210 \
+ -CoSQA 10293 \
+ -CodeSearchNet Ruby 2279 \
+ -CodeSearchNet Go 14291 \
+ -CodeSearchNet Java 26909 \
+ -CodeSearchNet Javascript 6483 \
+ -CodeSearchNet PHP 29391 \
+ -StatCodeSearch 1070 \
+ -Combined test set 109926
+
+## Limitations and Bias
+TBD
+
+## Citation
+TBD
+
+## Further References
+Husain, H., Wu, H. H., Gazit, T., Allamanis, M., & Brockschmidt, M. (2019). Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436.
+
+Lu, S., Guo, D., Ren, S., Huang, J., Svyatkovskiy, A., Blanco, A., Liu, S. (2021, June). CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).
+
+Huang, J., Tang, D., Shou, L., Gong, M., Xu, K., Jiang, D., Zhou, M., Duan, N. (2021). CoSQA: 20,000+ web queries for code search and question answering. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/mrr_demo.py b/src/genbench/tasks/nl_codesearch_mrr/mrr_demo.py
new file mode 100644
index 0000000..6246a78
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/mrr_demo.py
@@ -0,0 +1,27 @@
+from genbench import load_task
+
+
+def main():
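+    # With n_distractors=9 below, the 10 scores form a single ranking chunk. Scores 1/1, 1/2, ..., 1/10
+    # are strictly decreasing, so the true (first) pair ranks 1st and the expected MRR is 1.0.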
+ high_mrr_test_list = []
+ for i in range(1, 11):
+ score_dict = dict.fromkeys(["score"])
+ score_dict["score"] = 1 / i
+ high_mrr_test_list.append(score_dict)
+
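+    # Here the scores 1, 2, ..., 10 are increasing, so the true (first) pair scores lowest,
+    # ranks 10th in its chunk, and the expected MRR is 0.1.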
+ low_mrr_test_list = []
+ for i in range(1, 11):
+ score_dict = dict.fromkeys(["score"])
+ score_dict["score"] = 1 * i
+ low_mrr_test_list.append(score_dict)
+
+ task = load_task("nl_codesearch_mrr:statcodesearch")
+
+ high_results = task.evaluate_predictions(high_mrr_test_list, 9)
+ print(high_results)
+
+ low_results = task.evaluate_predictions(low_mrr_test_list, 9)
+ print(low_results)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/genbench/tasks/nl_codesearch_mrr/requirements-usage-example.txt b/src/genbench/tasks/nl_codesearch_mrr/requirements-usage-example.txt
new file mode 100644
index 0000000..ffb4c93
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/requirements-usage-example.txt
@@ -0,0 +1,4 @@
+torch==2.1.0
+numpy==1.25.1
+tqdm==4.65.0
+transformers==4.32.0
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/config.jsonnet
new file mode 100644
index 0000000..0ffe3e7
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'Natural Language Codesearch Ranking (statcodesearch)',
+
+  description: 'Natural Language Codesearch Ranking (statcodesearch) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual and domain generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'r',
+ 'cross-lingual',
+    'domain-shift',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_statcodesearch.jsonl',
+    train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/doc.md b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/doc.md
new file mode 100644
index 0000000..0826a5c
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (statcodesearch)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task, Natural Language Codesearch Ranking (statcodesearch), here.*
+
+## Examples
+*Give some examples of Natural Language Codesearch Ranking (statcodesearch).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for Natural Language Codesearch Ranking (statcodesearch).*
+
+## Limitations and Bias
+*Note any known limitations or biases that Natural Language Codesearch Ranking (statcodesearch) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/task.py b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/task.py
new file mode 100644
index 0000000..2566044
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/task.py
@@ -0,0 +1,108 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrStatcodesearch(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ random.seed(42)
+
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+                    # Exclude the current item so it is never sampled as its own distractor
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
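+                    # Each distractor keeps the original comment but takes the code part of a
+                    # randomly sampled item, so its target is 0 (non-matching).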
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+            n_distractors: Number of distractor comment-code pairs for each true pair.
+                           Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
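+        # Each chunk starts with the true pair; its rank is the number of scores in the chunk
+        # that are >= its own score (ties count against it), and MRR averages 1/rank over chunks.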
+
+        for batch in batched_predictions:
+            correct_score = batch[0]["score"]
+            scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/usage_example.py b/src/genbench/tasks/nl_codesearch_mrr/usage_example.py
new file mode 100644
index 0000000..8bb1455
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/usage_example.py
@@ -0,0 +1,319 @@
+import argparse
+import json
+import logging
+import random
+
+import torch
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, PreTrainedModel, get_scheduler
+
+from genbench import load_task
+
+
+##########################################################
+# Data Loading Utils
+##########################################################
+class Dataset(torch.utils.data.Dataset):
+ def __init__(self, features):
+ self.features = features
+
+ def __getitem__(self, index):
+ return self.features[index]
+
+ def __len__(self):
+ return len(self.features)
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
+
+
+def _convert_examples_to_features(
+ comments,
+ codes,
+ labels,
+ max_seq_length,
+ tokenizer,
+ cls_token="[CLS]",
+ sep_token="[SEP]",
+ pad_token=0,
+ eos_token="",
+ sequence_a_segment_id=0,
+ sequence_b_segment_id=1,
+ cls_token_segment_id=1,
+ pad_token_segment_id=0,
+ mask_padding_with_zero=True,
+):
+ features = []
+ for ex_index, (comment, code, label) in enumerate(zip(comments, codes, labels)):
+ # As was done in CodeBERT
+ tokens_comment = tokenizer.tokenize(comment)[:50]
+ tokens_code = tokenizer.tokenize(code)
+
+ # update max_seq_length to account for [CLS], [SEP], [SEP] tokens (-3)
+ n_special_tokens = 3
+ if cls_token is None:
+ n_special_tokens -= 1
+ s_max_seq_length = max_seq_length - n_special_tokens
+ _truncate_seq_pair(tokens_comment, tokens_code, s_max_seq_length)
+
+        # use the eos token in place of sep if no sep_token is set
+ if sep_token is None:
+ sep_token = eos_token
+
+        # [SEP] in between and at the end
+ tokens = tokens_comment + [sep_token] + tokens_code + [sep_token]
+ # CLS at the beginning
+ if cls_token is not None:
+ tokens = [cls_token] + tokens
+
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # 1 for tokens, 0 for padding
+ input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+ # padding with 0 up to max_seq_length
+ padding_length = max_seq_length - len(input_ids)
+ input_ids = input_ids + ([pad_token] * padding_length)
+ input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+ # check
+ assert len(input_ids) == max_seq_length
+ assert len(input_mask) == max_seq_length
+
+ # convert to tensors
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
+ input_mask = torch.tensor(input_mask, dtype=torch.long)
+ label = torch.tensor(label, dtype=torch.long)
+
+ features.append({"input_ids": input_ids, "attention_mask": input_mask, "labels": label})
+ return features
+
+
+def load_data(tokenizer, batch_size, seq_len, train_file, is_train):
+ # create dataset
+ comments = []
+ codes = []
+ labels = []
+ skipped = 0
+ if is_train:
+ do_shuffle = True
+ else:
+ do_shuffle = False
+
+ is_sep_token_set = tokenizer.sep_token is not None
+ is_cls_token_set = tokenizer.cls_token is not None
+ is_pad_token_set = tokenizer.pad_token is not None
+ is_eos_token_set = tokenizer.eos_token is not None
+
+ for split, dataset in train_file.items():
+ if is_train and split == "test":
+ continue
+ if not is_train and split == "train":
+ continue
+ for sample in dataset:
+ try:
+ input = sample["input"]
+ # split at [CODESPLIT] token
+ input = input.split("[CODESPLIT]")
+ if len(input) != 2:
+                    # skip cases that do not contain exactly one [CODESPLIT] token
+ logging.warning(f"Input contains more than one [CODESPLIT] token: {input}")
+ skipped += 1
+ continue
+ # skip every sample that contains special tokens
+ if is_sep_token_set and (tokenizer.sep_token in input[0] or tokenizer.sep_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_cls_token_set and (tokenizer.cls_token in input[0] or tokenizer.cls_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_pad_token_set and (tokenizer.pad_token in input[0] or tokenizer.pad_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_eos_token_set and (tokenizer.eos_token in input[0] or tokenizer.eos_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ comments.append(input[0])
+ codes.append(input[1])
+ labels.append(sample["target"])
+
+ except json.JSONDecodeError as e:
+ print(f"Error: JSON decoding failed - {e}")
+ continue
+ logging.info(f"Skipped {skipped} samples due to special tokens")
+ # tokenize
+ features = _convert_examples_to_features(
+ comments,
+ codes,
+ labels,
+ max_seq_length=seq_len,
+ tokenizer=tokenizer,
+ cls_token=tokenizer.cls_token,
+ sep_token=tokenizer.sep_token,
+ cls_token_segment_id=tokenizer.cls_token_id,
+ pad_token_segment_id=tokenizer.pad_token_id,
+ eos_token=tokenizer.eos_token,
+ )
+
+ # Convert to Dataset
+ features = Dataset(features)
+
+ return DataLoader(features, batch_size=batch_size, shuffle=do_shuffle)
+
+
+##############################################################
+# Fine-tune Model
+##############################################################
+
+
+def train(model: PreTrainedModel, dataloader: DataLoader, args: argparse.Namespace):
+ """
+ Fine-tune the model.
+ :param model: the pretrained model to be fine-tuned
+ :param dataloader: an iterable data loader
+ :param args: training arguments (and also some other arguments)
+ :return: the fine-tuned model
+ """
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+ model.train()
+
+ num_training_steps = args.epochs * len(dataloader)
+ progress_bar = tqdm(range(num_training_steps))
+
+ optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
+ lr_scheduler = get_scheduler(
+ name="linear",
+ optimizer=optimizer,
+ num_warmup_steps=args.num_warmup_steps,
+ num_training_steps=num_training_steps,
+ )
+
+ for epoch in range(args.epochs):
+ for batch in dataloader:
+ batch = {k: v.to(device) for k, v in batch.items()}
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+
+
+###########################################################
+# Evaluate Model
+###########################################################
+
+
+def get_scores(model, dataloader):
+ random.seed(42)
+ # make predictions for all chunks
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print("Using device:", device)
+ model.to(device)
+ model.eval()
+
+ score_list = []
+ for batch in tqdm(dataloader):
+ batch = {k: v.to(device) for k, v in batch.items()}
+ with torch.no_grad():
+ outputs = model(**batch)
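+        # Store the raw classification logits as the ranking score for this comment-code pair.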
+ score_dict = dict.fromkeys(["score"])
+ score_dict["score"] = outputs.logits.cpu().numpy()
+ score_list.append(score_dict)
+
+ return score_list
+
+
+##############################################################
+# Run example
+##############################################################
+
+
+def main():
+ """Main function."""
+ # args
+ parser = argparse.ArgumentParser()
+ # parser.add_argument('--dataset', type=str, default='./codesearchnet_adv')
+ parser.add_argument("--model", default="roberta-base")
+ parser.add_argument("--epochs", type=int, default=5)
+ parser.add_argument("--batch_size", type=int, default=32)
+ parser.add_argument("--learning_rate", type=float, default=2e-5)
+ parser.add_argument("--weight_decay", type=float, default=0.01)
+ parser.add_argument("--num_warmup_steps", type=int, default=0)
+ parser.add_argument("--output_dir", type=str, default="models")
+ parser.add_argument("--seq_len", type=int, default=512, help="maximum sequence length")
+ parser.add_argument("--distractors", type=int, default=2, help="number of distractors per true pair")
+ parser.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO")
+
+ args = parser.parse_args()
+
+ TRAIN_FILE = load_task("nl_codesearch_mrr:codesearchnet_adv").get_dataset_raw(args.distractors)
+
+ # logging
+ logging.basicConfig(level=args.log_level)
+
+ # load tokenizer
+ logging.info("Loading model...")
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+ # load data
+ logging.info("Loading data...")
+ dataloader = load_data(tokenizer, args.batch_size, args.seq_len, TRAIN_FILE, True)
+
+ model = AutoModelForSequenceClassification.from_pretrained(args.model)
+
+ # train
+ logging.info("Training...")
+ train(model, dataloader, args)
+
+ # save model
+ logging.info("Saving model...")
+ model.save_pretrained(f"{args.output_dir}/{args.model}")
+    # also save the tokenizer
+ tokenizer.save_pretrained(f"{args.output_dir}/{args.model}")
+
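+    # Evaluate MRR on each subtask: every test split is expanded with args.distractors distractors per true pair.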
+ TEST_TASKS = [
+ ["codesearchnetadv", load_task("nl_codesearch_mrr:codesearchnet_adv")],
+ ["codesearchnet_ruby", load_task("nl_codesearch_mrr:codesearchnet_ruby")],
+ ["codesearchnet_go", load_task("nl_codesearch_mrr:codesearchnet_go")],
+ ["codesearchnet_java", load_task("nl_codesearch_mrr:codesearchnet_java")],
+ ["codesearchnet_javascript", load_task("nl_codesearch_mrr:codesearchnet_javascript")],
+ ["codesearchnet_php", load_task("nl_codesearch_mrr:codesearchnet_php")],
+ ["cosqa", load_task("nl_codesearch_mrr:cosqa")],
+ ["statcodesearch", load_task("nl_codesearch_mrr:statcodesearch")],
+ ]
+
+ results = {}
+ for task in TEST_TASKS:
+ logging.info(f"Calculating Logits for MRR {task[0]}...")
+ dataloader = load_data(tokenizer, 1, args.seq_len, task[1].get_dataset_raw(args.distractors), False)
+ scores = get_scores(model, dataloader)
+ mrr_value = task[1].evaluate_predictions(scores, args.distractors)
+ logging.info(f"Test results for {task[0]}: {mrr_value}")
+ results[task[0]] = mrr_value
+
+ logging.info(f"Test results: {results}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_task.py b/tests/test_task.py
index 3c1872b..e8ce065 100644
--- a/tests/test_task.py
+++ b/tests/test_task.py
@@ -68,6 +68,10 @@ def test_split_file(task_obj: Task):
def test_task_config_matches_provided_sets(task_obj: Task):
"""Test case to verify if the task config matches the provided sets"""
+ class_dict = task_obj.__class__.__dict__
+ if "get_datasets_raw" in class_dict and "get_prepared_datasets" in class_dict:
+        pytest.skip("Task has custom get_datasets_raw and get_prepared_datasets builders.")
+
datasets_raw = task_obj.get_datasets_raw()
task_sets = [DatasetSplit.TEST.value]