diff --git a/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf b/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf new file mode 100644 index 0000000..3d4e16e Binary files /dev/null and b/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf differ diff --git a/src/genbench/tasks/nl_codesearch_clf/__init__.py b/src/genbench/tasks/nl_codesearch_clf/__init__.py new file mode 100644 index 0000000..b8d3157 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/__init__.py @@ -0,0 +1,5 @@ +from genbench import TaskDict + + +class NlCodesearchClf(TaskDict): + pass diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet new file mode 100644 index 0000000..09feac6 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet @@ -0,0 +1,58 @@ +{ + name: 'Natural Language Codesearch Classification (codesearchnet_adv)', + + description: 'Natural Language Codesearch Classification (codesearchnet_adv) aims to measure the generalization capabilites of language models in code understanding. 
This subtasks measures robustness against covariate shifts', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'python', + 'robustness', + 'covariate shift', + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and a Python programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
The input format is defined as comment [CODESPLIT] code', + input_prefix: '', + output_prefix: '', + choices_prefix: '', + append_choices_to_input: false, + } + }, + }, +} \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md new file mode 100644 index 0000000..8193db4 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md @@ -0,0 +1,19 @@ +# Natural Language Codesearch Classification (codesearchnet_adv) + +## Abstract +*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_adv).* + +## Examples +*Give some examples of the Natural Language Codesearch Classification (codesearchnet_adv).* + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +## Data Source +*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_adv).* + +## Limitations and Bias +*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_adv) has, with links and references if possible.* + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py new file mode 100644 index 0000000..4e77608 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py @@ -0,0 +1,46 @@ +import random +from typing import Dict + +import datasets + +from genbench import Task + + +class NlCodesearchClfCodesearchnetAdv(Task): + def get_dataset_raw(self) -> Dict[str, datasets.Dataset]: + """Create the dataset adding a negative sample for each code comment/query + + Returns: + A dictionary containing key-value pairs for the raw datasets. + The keys are strings representing the name of the dataset split + (e.g., "train", "validation", "test") and the values are + HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split. + The train split only contains the original dataset. + """ + # Load the raw datasets + raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source() + output: Dict[str, datasets.Dataset] = {} + # Set random seed for consistency + random.seed(42) + for split, dataset in raw_datasets.items(): + if split == "test" or split == "train": + new_dataset = datasets.Dataset.from_dict({}) + for item in dataset: + # Add comment-code pair to new dataset + new_dataset = new_dataset.add_item(item) + other_items = [other_item for other_item in dataset if other_item != item] + # Randomly select other item + random_item = random.sample(other_items, 1) + # Split input into comment and code + input_parts = item["input"].split("[CODESPLIT]") + # Split random input into comment and code + random_input_parts = random_item[0]["input"].split("[CODESPLIT]") + # Combine the "input" fields of the original and random items + new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1] + new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]} + # Add negative sample comment-code pair to new 
dataset + new_dataset = new_dataset.add_item(new_item) + output[split] = new_dataset + else: + output[split] = dataset + return output diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet new file mode 100644 index 0000000..01715cb --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet @@ -0,0 +1,56 @@ +{ + name: 'Natural Language Codesearch Classification (codesearchnet_go)', + + description: 'Natural Language Codesearch Classification (codesearchnet_go) aims to measure the generalization capabilites of language models in code understanding. This subtasks measures cross-lingual generalization', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'go', + 'cross-lingual' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_go.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and a Go programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
The input format is defined as comment [CODESPLIT] code', + input_prefix: '', + output_prefix: '', + choices_prefix: '', + append_choices_to_input: false, + } + }, + }, +} \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md new file mode 100644 index 0000000..aa3720e --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md @@ -0,0 +1,19 @@ +# Natural Language Codesearch Classification (codesearchnet_go) + +## Abstract +*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_go).* + +## Examples +*Give some examples of the Natural Language Codesearch Classification (codesearchnet_go).* + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +## Data Source +*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_go).* + +## Limitations and Bias +*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_go) has, with links and references if possible.* + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py new file mode 100644 index 0000000..9b880ec --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py @@ -0,0 +1,46 @@ +import random +from typing import Dict + +import datasets + +from genbench import Task + + +class NlCodesearchClfCodesearchnetGo(Task): + def get_dataset_raw(self) -> Dict[str, datasets.Dataset]: + """Create the dataset adding a negative sample for each code comment/query + + Returns: + A dictionary containing key-value pairs for the raw datasets. 
+ The keys are strings representing the name of the dataset split + (e.g., "train", "validation", "test") and the values are + HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split. + The train split only contains the original dataset. + """ + # Load the raw datasets + raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source() + output: Dict[str, datasets.Dataset] = {} + # Set random seed for consistency + random.seed(42) + for split, dataset in raw_datasets.items(): + if split == "test": + new_dataset = datasets.Dataset.from_dict({}) + for item in dataset: + # Add comment-code pair to new dataset + new_dataset = new_dataset.add_item(item) + other_items = [other_item for other_item in dataset if other_item != item] + # Randomly select other item + random_item = random.sample(other_items, 1) + # Split input into comment and code + input_parts = item["input"].split("[CODESPLIT]") + # Split random input into comment and code + random_input_parts = random_item[0]["input"].split("[CODESPLIT]") + # Combine the "input" fields of the original and random items + new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1] + new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]} + # Add negative sample comment-code pair to new dataset + new_dataset = new_dataset.add_item(new_item) + output[split] = new_dataset + else: + output[split] = dataset + return output diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet new file mode 100644 index 0000000..1ea6599 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet @@ -0,0 +1,56 @@ +{ + name: 'Natural 
Language Codesearch Classification (codesearchnet_java)', + + description: 'Natural Language Codesearch Classification (codesearchnet_java) aims to measure the generalization capabilites of language models in code understanding. This subtasks measures cross-lingual generalization', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'java', + 'cross-lingual' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_java.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and a Java programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
The input format is defined as comment [CODESPLIT] code', + input_prefix: '', + output_prefix: '', + choices_prefix: '', + append_choices_to_input: false, + } + }, + }, +} \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md new file mode 100644 index 0000000..16abaa2 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md @@ -0,0 +1,19 @@ +# Natural Language Codesearch Classification (codesearchnet_java) + +## Abstract +*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_java).* + +## Examples +*Give some examples of the Natural Language Codesearch Classification (codesearchnet_java).* + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +## Data Source +*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_java).* + +## Limitations and Bias +*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_java) has, with links and references if possible.* + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py new file mode 100644 index 0000000..292e74c --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py @@ -0,0 +1,46 @@ +import random +from typing import Dict + +import datasets + +from genbench import Task + + +class NlCodesearchClfCodesearchnetJava(Task): + def get_dataset_raw(self) -> Dict[str, datasets.Dataset]: + """Create the dataset adding a negative sample for each code comment/query + + Returns: + A dictionary containing key-value pairs for the raw datasets. + The keys are strings representing the name of the dataset split + (e.g., "train", "validation", "test") and the values are + HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split. + The train split only contains the original dataset. + """ + # Load the raw datasets + raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source() + output: Dict[str, datasets.Dataset] = {} + # Set random seed for consistency + random.seed(42) + for split, dataset in raw_datasets.items(): + if split == "test": + new_dataset = datasets.Dataset.from_dict({}) + for item in dataset: + # Add comment-code pair to new dataset + new_dataset = new_dataset.add_item(item) + other_items = [other_item for other_item in dataset if other_item != item] + # Randomly select other item + random_item = random.sample(other_items, 1) + # Split input into comment and code + input_parts = item["input"].split("[CODESPLIT]") + # Split random input into comment and code + random_input_parts = random_item[0]["input"].split("[CODESPLIT]") + # Combine the "input" fields of the original and random items + new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1] + new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]} + # Add negative sample comment-code pair to new dataset + 
new_dataset = new_dataset.add_item(new_item) + output[split] = new_dataset + else: + output[split] = dataset + return output diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/config.jsonnet new file mode 100644 index 0000000..f61ade9 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/config.jsonnet @@ -0,0 +1,56 @@ +{ + name: 'Natural Language Codesearch Classification (codesearchnet_javascript)', + + description: 'Natural Language Codesearch Classification (codesearchnet_javascript) aims to measure the generalization capabilites of language models in code understanding. This subtasks measures cross-lingual generalization', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'javascript', + 'cross-lingual' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_javascript.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and an Javascript programming language code snippet, determine if the comment accurately represents the function of the code. 
Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code', + input_prefix: '', + output_prefix: '', + choices_prefix: '', + append_choices_to_input: false, + } + }, + }, +} \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/doc.md new file mode 100644 index 0000000..86806bc --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/doc.md @@ -0,0 +1,19 @@ +# Natural Language Codesearch Classification (codesearchnet_javascript) + +## Abstract +*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_javascript).* + +## Examples +*Give some examples of the Natural Language Codesearch Classification (codesearchnet_javascript).* + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +## Data Source +*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_javascript).* + +## Limitations and Bias +*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_javascript) has, with links and references if possible.* + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py new file mode 100644 index 0000000..5e201a4 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py @@ -0,0 +1,46 @@ +import random +from typing import Dict + +import datasets + +from genbench import Task + + +class NlCodesearchClfCodesearchnetJavascript(Task): + def get_dataset_raw(self) -> Dict[str, datasets.Dataset]: + """Create the dataset adding a negative sample for each code comment/query + + Returns: + A dictionary containing key-value pairs for the raw datasets. + The keys are strings representing the name of the dataset split + (e.g., "train", "validation", "test") and the values are + HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split. + The train split only contains the original dataset. + """ + # Load the raw datasets + raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source() + output: Dict[str, datasets.Dataset] = {} + # Set random seed for consistency + random.seed(42) + for split, dataset in raw_datasets.items(): + if split == "test": + new_dataset = datasets.Dataset.from_dict({}) + for item in dataset: + # Add comment-code pair to new dataset + new_dataset = new_dataset.add_item(item) + other_items = [other_item for other_item in dataset if other_item != item] + # Randomly select other item + random_item = random.sample(other_items, 1) + # Split input into comment and code + input_parts = item["input"].split("[CODESPLIT]") + # Split random input into comment and code + random_input_parts = random_item[0]["input"].split("[CODESPLIT]") + # Combine the "input" fields of the original and random items + new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1] + new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]} + # Add negative sample comment-code pair to 
new dataset + new_dataset = new_dataset.add_item(new_item) + output[split] = new_dataset + else: + output[split] = dataset + return output diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/config.jsonnet new file mode 100644 index 0000000..c4f0b9d --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/config.jsonnet @@ -0,0 +1,55 @@ +{ + name: 'Natural Language Codesearch Classification (codesearchnet_php)', + + description: 'Natural Language Codesearch Classification (codesearchnet_php) aims to measure the generalization capabilites of language models in code understanding. This subtasks measures cross-lingual generalization', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'php', + 'cross-lingual' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_php.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and a PHP programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
The input format is defined as comment [CODESPLIT] code', + input_prefix: '', + output_prefix: '', + choices_prefix: '', + append_choices_to_input: false, + } + }, + }, +} \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/doc.md new file mode 100644 index 0000000..024058f --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/doc.md @@ -0,0 +1,19 @@ +# Natural Language Codesearch Classification (codesearchnet_php) + +## Abstract +*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_php).* + +## Examples +*Give some examples of the Natural Language Codesearch Classification (codesearchnet_php).* + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +## Data Source +*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_php).* + +## Limitations and Bias +*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_php) has, with links and references if possible.* + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py new file mode 100644 index 0000000..1378ff0 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py @@ -0,0 +1,46 @@ +import random +from typing import Dict + +import datasets + +from genbench import Task + + +class NlCodesearchClfCodesearchnetPhp(Task): + def get_dataset_raw(self) -> Dict[str, datasets.Dataset]: + """Create the dataset adding a negative sample for each code comment/query + + Returns: + A dictionary containing key-value pairs for the raw datasets. + The keys are strings representing the name of the dataset split + (e.g., "train", "validation", "test") and the values are + HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split. + The train split only contains the original dataset. + """ + # Load the raw datasets + raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source() + output: Dict[str, datasets.Dataset] = {} + # Set random seed for consistency + random.seed(42) + for split, dataset in raw_datasets.items(): + if split == "test": + new_dataset = datasets.Dataset.from_dict({}) + for item in dataset: + # Add comment-code pair to new dataset + new_dataset = new_dataset.add_item(item) + other_items = [other_item for other_item in dataset if other_item != item] + # Randomly select other item + random_item = random.sample(other_items, 1) + # Split input into comment and code + input_parts = item["input"].split("[CODESPLIT]") + # Split random input into comment and code + random_input_parts = random_item[0]["input"].split("[CODESPLIT]") + # Combine the "input" fields of the original and random items + new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1] + new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]} + # Add negative sample comment-code pair to new dataset + new_dataset = 
new_dataset.add_item(new_item) + output[split] = new_dataset + else: + output[split] = dataset + return output diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/config.jsonnet new file mode 100644 index 0000000..98d7a1e --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/config.jsonnet @@ -0,0 +1,56 @@ +{ + name: 'Natural Language Codesearch Classification (codesearchnet_ruby)', + + description: 'Natural Language Codesearch Classification (codesearchnet_ruby) aims to measure the generalization capabilites of language models in code understanding. This subtasks measures cross-lingual generalization', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'ruby', + 'cross-lingual' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_ruby.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and a Ruby programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
The input format is defined as comment [CODESPLIT] code', + input_prefix: '', + output_prefix: '', + choices_prefix: '', + append_choices_to_input: false, + } + }, + }, +} \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/doc.md new file mode 100644 index 0000000..012e885 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/doc.md @@ -0,0 +1,19 @@ +# Natural Language Codesearch Classification (codesearchnet_ruby) + +## Abstract +*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_ruby).* + +## Examples +*Give some examples of the Natural Language Codesearch Classification (codesearchnet_ruby).* + +## Usage +*Describe how to load your task and what is required for evaluation, if anything.* + +## Data Source +*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_ruby).* + +## Limitations and Bias +*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_ruby) has, with links and references if possible.* + +## GenBench Eval card +*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*. 
import random
from typing import Dict

import datasets

from genbench import Task


class NlCodesearchClfCodesearchnetRuby(Task):
    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
        """Create the dataset, adding one negative sample per code comment/query.

        Returns:
            A dictionary mapping split names (e.g. "train", "test") to
            HuggingFace `datasets.Dataset` objects. In the test split, every
            original comment-code pair is followed by a distractor pair built
            from the same comment and the code of a random other example
            (target 0). All other splits are passed through unchanged.
        """
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Fixed seed so the sampled distractors are reproducible across runs.
        random.seed(42)
        for split, dataset in raw_datasets.items():
            if split != "test":
                output[split] = dataset
                continue
            items = list(dataset)
            n = len(items)
            rows = []
            for idx, item in enumerate(items):
                # Keep the original (positive) comment-code pair.
                rows.append(item)
                # Draw a random index other than the current one in O(1),
                # instead of rebuilding the full candidate list for every row
                # (the original other_items comprehension was O(n) per item,
                # O(n^2) overall).
                other_idx = random.randrange(n - 1)
                if other_idx >= idx:
                    other_idx += 1
                comment = item["input"].split("[CODESPLIT]")[0]
                distractor_code = items[other_idx]["input"].split("[CODESPLIT]")[1]
                # Negative sample: original comment paired with foreign code.
                rows.append(
                    {
                        "input": comment + "[CODESPLIT]" + distractor_code,
                        "target": 0,
                        "target_options": item["target_options"],
                    }
                )
            # Build the dataset once; the original's per-row `add_item` copied
            # the growing dataset on every call (quadratic time and memory).
            output[split] = datasets.Dataset.from_list(rows)
        return output
This subtasks measures robustness against covariate shifts', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'python', + 'robustness', + 'covariate shift' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_cosqa.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a search query and a Python programming language code snippet, determine if the query accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
import random
from typing import Dict

import datasets

from genbench import Task


class NlCodesearchClfCosqa(Task):
    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
        """Create the dataset, adding one negative sample per code comment/query.

        Returns:
            A dictionary mapping split names (e.g. "train", "test") to
            HuggingFace `datasets.Dataset` objects. In the test split, every
            original comment-code pair is followed by a distractor pair built
            from the same comment and the code of a random other example
            (target 0). All other splits are passed through unchanged.
        """
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Fixed seed so the sampled distractors are reproducible across runs.
        random.seed(42)
        for split, dataset in raw_datasets.items():
            if split != "test":
                output[split] = dataset
                continue
            items = list(dataset)
            n = len(items)
            rows = []
            for idx, item in enumerate(items):
                # Keep the original (positive) comment-code pair.
                rows.append(item)
                # Draw a random index other than the current one in O(1),
                # instead of rebuilding the full candidate list for every row
                # (the original other_items comprehension was O(n) per item,
                # O(n^2) overall).
                other_idx = random.randrange(n - 1)
                if other_idx >= idx:
                    other_idx += 1
                comment = item["input"].split("[CODESPLIT]")[0]
                distractor_code = items[other_idx]["input"].split("[CODESPLIT]")[1]
                # Negative sample: original comment paired with foreign code.
                rows.append(
                    {
                        "input": comment + "[CODESPLIT]" + distractor_code,
                        "target": 0,
                        "target_options": item["target_options"],
                    }
                )
            # Build the dataset once; the original's per-row `add_item` copied
            # the growing dataset on every call (quadratic time and memory).
            output[split] = datasets.Dataset.from_list(rows)
        return output
These capabilities are heavily influenced by the quality and diversity of the available training data. Source code datasets used for training usually focus on the most popular languages and testing is mostly conducted on the same distributions, often overlooking low resource programming languages. Motivated by the NLP generalisation taxonomy proposed by Hupkes et. al., we propose a new benchmark dataset called [placeholder] which builds upon existing natural language code search datasets to systemically study the code understanding generalization capabilities of language models. For evaluation and comparison, we collect several baseline results using fine-tuned BERT-style models and GPT-style large language models in a zero-shot setting. + +## Examples +Given a natural language comment or search query, determine if a given code snippet matches the function of the code. + +**match**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def set_sampled_topics ( self , sampled_topics ) : assert sampled_topics . dtype == np . int and len ( sampled_topics . shape ) <= 2 if len ( sampled_topics . shape ) == 1 : self . sampled_topics = sampled_topics . reshape ( 1 , sampled_topics . shape [ 0 ] ) else : self . sampled_topics = sampled_topics self . samples = self . sampled_topics . shape [ 0 ] self . tt = self . tt_comp ( self . sampled_topics ) self . dt = self . dt_comp ( self . sampled_topics )", "target": 1, "target_options": ["no_match", "match"]} \ +**no_match**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def _resolve_entity ( mo ) : ent = mo . group ( \"entity\" ) s = mo . group ( ) if s . 
startswith ( '&#' ) : if s [ 2 ] in 'xX' : radix = 16 else : radix = 10 try : num = int ( ent , radix ) except ( ValueError , OverflowError ) : return u'' else : num = name2codepoint . get ( ent ) if num is None or num < 0 : # unknown entity -> ignore return u'' try : return unichr ( num ) except ValueError : return u''", "target": 0, "target_options": ["no_match", "match"]} + +## Data Source +**CodeSearchNet** : original dataset first published in https://arxiv.org/pdf/1909.09436.pdf , Java, Javascript, Go, Ruby, PHP subsets collected from huggingface-hub \ +**CodeSearchNet Adv** : a processed version of the CodeSearchNet Python dataset, introduced in the CodeXGLUE benchmark suite https://github.com/microsoft/CodeXGLUE \ +**CoSQA** : Python codesnippets from the CodeSearchNet dataset paired with real world user search engine queries, introduced in https://arxiv.org/pdf/2105.13239.pdf \ +**StatCodeSearch** : R code-comment pair snippets, scraped and extracted from public project on the Open Science Framework (OSF) by the submission authors + +For each comment in each subset we sampled randomly another code snippet from given subset, to create a fully balanced binary classification dataset. \ +For the dataset statistics we only consider the positive (matching) pairs. \ + +**Dataset Size**:\ +*Finetuning set:* \ + -CodeSearchNet Adv train set 251820 \ +*Test sets:* \ + -CodeSearchNet Adv test set 19210 \ + -CoSQA 10293\ + -CodeSearchNet Ruby 2279\ + -CodeSearchNet Go 14291\ + -CodeSearchNet Java 26909\ + -CodeSearchNet Javascript 6483\ + -CodeSearchNet PHP 29391\ + -StatCodeSearch 1070 \ + -Combined test set 109926 +## Limitations and Bias +TBD + +## Citation +TBD + +## Further References +Husain, H., Wu, H. H., Gazit, T., Allamanis, M., & Brockschmidt, M. (2019). Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436. + +Lu, S., Guo, D., Ren, S., Huang, J., Svyatkovskiy, A., Blanco, A., Shujie, L. I. U. (2021, June). 
CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1). + +Huang J., Tang D., Shou L., Gong M., Xu K., Jiang D., Zhou M., Duan N. (2021) CoSQA: 20,000+ web queries for code search and question answering. In Proceedings of the 59th Annual Meeting of Association of Computational Linguistics and the 11th Internationaal Joint Conference on Natural Language Processing. \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt b/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt new file mode 100644 index 0000000..b9e2d8a --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt @@ -0,0 +1,5 @@ +torch v. 2.1.0 +numpy v. 1.25.1 +tqdm v. 4.65.0 +transformers v. 4.32.0 +scikit-learn v. 1.3.0 \ No newline at end of file diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/__init__.py b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/config.jsonnet new file mode 100644 index 0000000..bd6eb74 --- /dev/null +++ b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/config.jsonnet @@ -0,0 +1,57 @@ +{ + name: 'Natural Language Codesearch Classification (statcodesearch)', + + description: 'Natural Language Codesearch Classification (statcodesearch) aims to measure the generalization capabilites of language models in code understanding. 
This subtasks measures cross-lingual and domain generalization', + + keywords: [ + 'codesearch', + 'natural language query', + 'binary classification', + 'r', + 'cross-lingual', + 'domain-shift' + ], + + authors: [ + 'Andor Diera', + 'Abdelhalim Dahou', + 'Lukas Galke', + 'Fabian Karl', + 'Florian Sihler', + 'Ansgar Scherp', + ], + + data_source: { + type: 'manual', + test: 'https://zenodo.org/record/8310891/files/test_statcodesearch.jsonl', + train:'https://zenodo.org/record/8310891/files/train_adv.jsonl', + }, + + has_validation_set: false, + has_train_set: true, + + task_type: 'multiple_choice', + + evaluation_metrics: [ + { + hf_id: 'accuracy', + git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a', + best_score: 1.0, + }, + ], + + preparation_strategies: { + finetuning: { + objective: 'maximum_likelihood', + }, + prompt_based_testing: { + prompt_builder: { + instruction_zero_shot: 'Given a code comment and a R programming language codesnippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. 
import random
from typing import Dict

import datasets

from genbench import Task


class NlCodesearchClfStatcodesearch(Task):
    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
        """Create the dataset, adding one negative sample per code comment/query.

        Returns:
            A dictionary mapping split names (e.g. "train", "test") to
            HuggingFace `datasets.Dataset` objects. In the test split, every
            original comment-code pair is followed by a distractor pair built
            from the same comment and the code of a random other example
            (target 0). All other splits are passed through unchanged.
        """
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Fixed seed so the sampled distractors are reproducible across runs.
        random.seed(42)
        for split, dataset in raw_datasets.items():
            if split != "test":
                output[split] = dataset
                continue
            items = list(dataset)
            n = len(items)
            rows = []
            for idx, item in enumerate(items):
                # Keep the original (positive) comment-code pair.
                rows.append(item)
                # Draw a random index other than the current one in O(1),
                # instead of rebuilding the full candidate list for every row
                # (the original other_items comprehension was O(n) per item,
                # O(n^2) overall).
                other_idx = random.randrange(n - 1)
                if other_idx >= idx:
                    other_idx += 1
                comment = item["input"].split("[CODESPLIT]")[0]
                distractor_code = items[other_idx]["input"].split("[CODESPLIT]")[1]
                # Negative sample: original comment paired with foreign code.
                rows.append(
                    {
                        "input": comment + "[CODESPLIT]" + distractor_code,
                        "target": 0,
                        "target_options": item["target_options"],
                    }
                )
            # Build the dataset once; the original's per-row `add_item` copied
            # the growing dataset on every call (quadratic time and memory).
            output[split] = datasets.Dataset.from_list(rows)
        return output
##########################################################
# Data Loading Utils
##########################################################
class Dataset(torch.utils.data.Dataset):
    """Minimal torch Dataset over a list of pre-built feature dicts."""

    def __init__(self, features):
        self.features = features

    def __getitem__(self, index):
        return self.features[index]

    def __len__(self):
        return len(self.features)


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncate a sequence pair in place (longest-first) to `max_length` total tokens."""
    while len(tokens_a) + len(tokens_b) > max_length:
        # Pop from the currently longer sequence so both are trimmed evenly.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def _convert_examples_to_features(
    comments,
    codes,
    labels,
    max_seq_length,
    tokenizer,
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token=0,
    eos_token="",
    sequence_a_segment_id=0,
    sequence_b_segment_id=1,
    cls_token_segment_id=1,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """Tokenize comment/code pairs into padded input_ids/attention_mask/label tensors.

    Comments are capped at 50 tokens first (as was done in CodeBERT); the pair
    is then truncated jointly so that, together with the special tokens, it
    fits `max_seq_length`.
    """
    features = []
    for comment, code, label in zip(comments, codes, labels):
        # As was done in CodeBERT: cap the comment at 50 tokens.
        tokens_comment = tokenizer.tokenize(comment)[:50]
        tokens_code = tokenizer.tokenize(code)

        # Reserve room for [CLS], [SEP], [SEP] (one fewer when there is no CLS).
        n_special_tokens = 3
        if cls_token is None:
            n_special_tokens -= 1
        _truncate_seq_pair(tokens_comment, tokens_code, max_seq_length - n_special_tokens)

        # Fall back to EOS when the tokenizer has no dedicated SEP token.
        if sep_token is None:
            sep_token = eos_token

        # comment [SEP] code [SEP], optionally prefixed with [CLS]
        tokens = tokens_comment + [sep_token] + tokens_code + [sep_token]
        if cls_token is not None:
            tokens = [cls_token] + tokens

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # 1 for real tokens, 0 for padding (flipped if mask_padding_with_zero is False)
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Right-pad up to max_seq_length
        padding_length = max_seq_length - len(input_ids)
        input_ids = input_ids + [pad_token] * padding_length
        input_mask = input_mask + [0 if mask_padding_with_zero else 1] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length

        features.append(
            {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "attention_mask": torch.tensor(input_mask, dtype=torch.long),
                "labels": torch.tensor(label, dtype=torch.long),
            }
        )
    return features


def load_data(tokenizer, batch_size, seq_len, train_file, is_train):
    """Build a shuffled DataLoader from the raw genbench splits.

    Uses the train split when `is_train` is True, otherwise the test split.
    Samples that do not split into exactly one comment/code pair, or that
    already contain one of the tokenizer's special tokens, are skipped.
    """
    comments = []
    codes = []
    labels = []
    skipped = 0

    # Special tokens whose literal presence in a sample would confuse the model.
    special_tokens = [
        tok
        for tok in (tokenizer.sep_token, tokenizer.cls_token, tokenizer.pad_token, tokenizer.eos_token)
        if tok is not None
    ]

    for split, dataset in train_file.items():
        if is_train and split == "test":
            continue
        if not is_train and split == "train":
            continue
        for sample in dataset:
            try:
                # split at the [CODESPLIT] token into (comment, code)
                parts = sample["input"].split("[CODESPLIT]")
                if len(parts) != 2:
                    # skip cases with more than one [CODESPLIT] token
                    logging.warning(f"Input contains more than one [CODESPLIT] token: {parts}")
                    skipped += 1
                    continue
                # skip every sample that contains special tokens
                if any(tok in parts[0] or tok in parts[1] for tok in special_tokens):
                    logging.warning(f"Input contains special tokens: {parts}")
                    skipped += 1
                    continue
                comments.append(parts[0])
                codes.append(parts[1])
                labels.append(sample["target"])
            except json.JSONDecodeError as e:
                # NOTE(review): nothing in this block parses JSON; kept for
                # parity with the original — confirm whether it can be dropped.
                print(f"Error: JSON decoding failed - {e}")
                continue
    logging.info(f"Skipped {skipped} samples due to special tokens")

    # tokenize
    features = _convert_examples_to_features(
        comments,
        codes,
        labels,
        max_seq_length=seq_len,
        tokenizer=tokenizer,
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=tokenizer.cls_token_id,
        pad_token_segment_id=tokenizer.pad_token_id,
        eos_token=tokenizer.eos_token,
    )

    return DataLoader(Dataset(features), batch_size=batch_size, shuffle=True)


##############################################################
# Fine-tune Model
##############################################################


def train(model: "PreTrainedModel", dataloader: DataLoader, args: argparse.Namespace):
    """
    Fine-tune the model.

    :param model: the pretrained model to be fine-tuned
    :param dataloader: an iterable data loader
    :param args: training arguments (and also some other arguments)
    :return: the fine-tuned model
    """
    # Use the GPU when available. (The original re-assigned device = "cpu"
    # right after this line — a leftover debug override that silently
    # disabled CUDA; removed.)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    num_training_steps = args.epochs * len(dataloader)
    progress_bar = tqdm(range(num_training_steps))

    optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    for _ in range(args.epochs):
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            outputs.loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # The docstring promises the fine-tuned model; return it so callers can use it.
    return model


###########################################################
# Evaluate Model
###########################################################


def clf(model, dataloader, args):
    """Predict on the test set and return accuracy/precision/recall/F1."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    predictions = []
    labels = []
    logging.info("Evaluating...")
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions.extend(outputs.logits.argmax(-1).cpu().numpy().tolist())
        labels.extend(batch["labels"].cpu().numpy().tolist())

    # All four metrics over the collected predictions.
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }


##############################################################
# Run example
##############################################################


def main():
    """Load data, (optionally) fine-tune on codesearchnet_adv, evaluate on every subtask."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="roberta-base")
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--num_warmup_steps", type=int, default=0)
    parser.add_argument("--output_dir", type=str, default="models")
    parser.add_argument("--seq_len", type=int, default=512, help="maximum sequence length")
    parser.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO")

    args = parser.parse_args()

    train_splits = load_task("nl_codesearch_clf:codesearchnet_adv").get_dataset_raw()

    # logging
    logging.basicConfig(level=args.log_level)

    # load tokenizer
    logging.info("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    # load data
    logging.info("Loading data...")
    dataloader = load_data(tokenizer, args.batch_size, args.seq_len, train_splits, True)

    model = AutoModelForSequenceClassification.from_pretrained(args.model)

    logging.info("Training...")
    # train(model, dataloader, args)  # training intentionally disabled in this example

    # save model
    logging.info("Saving model...")
    model.save_pretrained(f"{args.output_dir}/{args.model}")
    # also save the tokenizer
    tokenizer.save_pretrained(f"{args.output_dir}/{args.model}")

    test_files = [
        ["codesearchnetadv", load_task("nl_codesearch_clf:codesearchnet_adv").get_dataset_raw()],
        ["codesearchnet_ruby", load_task("nl_codesearch_clf:codesearchnet_ruby").get_dataset_raw()],
        ["codesearchnet_go", load_task("nl_codesearch_clf:codesearchnet_go").get_dataset_raw()],
        ["codesearchnet_java", load_task("nl_codesearch_clf:codesearchnet_java").get_dataset_raw()],
        ["codesearchnet_javascript", load_task("nl_codesearch_clf:codesearchnet_javascript").get_dataset_raw()],
        ["codesearchnet_php", load_task("nl_codesearch_clf:codesearchnet_php").get_dataset_raw()],
        ["cosqa", load_task("nl_codesearch_clf:cosqa").get_dataset_raw()],
        ["statcodesearch", load_task("nl_codesearch_clf:statcodesearch").get_dataset_raw()],
    ]

    results = {}
    for name, splits in test_files:
        logging.info(f"Evaluating on {name}...")
        dataloader = load_data(tokenizer, args.batch_size, args.seq_len, splits, False)
        metrics = clf(model, dataloader, args)
        results[name] = metrics
        logging.info(f"Test results for {name}: {metrics}")

    logging.info(f"Test results: {results}")


if __name__ == "__main__":
    main()