This repository has been archived by the owner on Jul 23, 2024. It is now read-only.

Merge pull request #27 from drndr/nl_codesearch_mrr
[Task Submission] Natural Language Codesearch Ranking (`nl_codesearch_mrr`)
kazemnejad authored Dec 31, 2023
2 parents 474d98f + b5bec7e commit 5fd5b05
Showing 39 changed files with 1,847 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/__init__.py
@@ -0,0 +1,5 @@
from genbench import TaskDict


class NlCodesearchMrr(TaskDict):
pass
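# NlCodesearchMrr subclasses TaskDict, which groups the per-language subtasks
# added in this commit (e.g. codesearchnet_adv, codesearchnet_go) under the
# single nl_codesearch_mrr task id.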
Empty file.
48 changes: 48 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet
@@ -0,0 +1,48 @@
{
name: 'Natural Language Codesearch Ranking (codesearchnet_adv)',

description: 'Natural Language Codesearch Ranking (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shift',

keywords: [
'codesearch',
'natural language query',
'mean reciprocal rank',
'python',
'robustness',
'covariate shift',
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},
},
}
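For readers wiring this up, here is a minimal loading sketch. It assumes the standard genbench `load_task` entry point, the task id `nl_codesearch_mrr:codesearchnet_adv`, and `n_distractors=49` (the chunk size the comments in task.py suggest); none of these are pinned by this diff, and the codesearchnet_go subtask would load the same way under its own id.

from genbench import load_task

# Hypothetical usage; task id and distractor count are assumptions, not confirmed by this commit.
task = load_task("nl_codesearch_mrr:codesearchnet_adv")
splits = task.get_dataset_raw(n_distractors=49)
test_set = splits["test"]  # each true pair is followed by its 49 distractors, in order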
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md
@@ -0,0 +1,19 @@
# Natural Language Codesearch Ranking (codesearchnet_adv)

## Abstract
*Copy the abstract of your accompanying paper for this task, Natural Language Codesearch Ranking (codesearchnet_adv), here.*

## Examples
*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_adv).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_adv).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_adv) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
127 changes: 127 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py
@@ -0,0 +1,127 @@
import random
from typing import Dict, List

import datasets
import numpy as np

from genbench import Task


def chunked(iterable, chunk_size):
"""
Split an iterable into chunks of a specified size.
Args:
iterable: The iterable to be chunked.
chunk_size: The size of each chunk.
Returns:
A generator that yields chunks of the iterable.
"""
if chunk_size <= 0:
raise ValueError("Chunk size must be greater than zero")

chunk = []
for item in iterable:
chunk.append(item)
if len(chunk) == chunk_size:
yield chunk
chunk = []

if chunk:
yield chunk
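# e.g. list(chunked([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]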


class NlCodesearchMrrCodesearchnetAdv(Task):
def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
"""Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
Args:
n_distractors: the number of randomly sampled distractor code for each ranking chunk
Returns:
A dictionary containing key-value pairs for the raw datasets.
The keys are strings representing the name of the dataset split
(e.g., "train", "validation", "test") and the values are
HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
The train split only contains the original dataset.
"""
# Load the raw datasets
raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
output: Dict[str, datasets.Dataset] = {}
# Set random seed for consistency
random.seed(42)
# Create distractors for each item
for split, dataset in raw_datasets.items():
if split == "test":
# Convert dataset to list for easier manipulation
dataset_list = list(dataset)

new_data = []

for idx, item in enumerate(dataset_list):
new_data.append(item)

# Create other_items list once and then simply exclude the current item during sampling
other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
random_items = random.sample(other_items, n_distractors)

input_parts = item["input"].split("[CODESPLIT]")
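# Inputs are "comment[CODESPLIT]code" strings: keep the true comment
# (input_parts[0]) and pair it with each distractor's code below.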

for random_item in random_items:
random_input_parts = random_item["input"].split("[CODESPLIT]")
new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
new_data.append(new_item)
# Convert list back to HuggingFace dataset
output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
# Create negative samples for training
elif split == "train":
new_dataset = datasets.Dataset.from_dict({})
for item in dataset:
# Add comment-code pair to new dataset
new_dataset = new_dataset.add_item(item)
other_items = [other_item for other_item in dataset if other_item != item]
# Randomly select one other item
random_item = random.sample(other_items, 1)
# Split input into comment and code
input_parts = item["input"].split("[CODESPLIT]")
# Split random input into comment and code
random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
# Combine the "input" fields of the original and random items
new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
# Add negative sample comment-code pair to new dataset
new_dataset = new_dataset.add_item(new_item)
output[split] = new_dataset
else:
output[split] = dataset
return output

def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
"""Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
This function assumes that the predictions were made and passed onto this function unshuffled.
The test data is ordered with each true pair followed by n number of distractors
Args:
predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
The keys are strings and the values are floats (logit scores or similarity values).
n_distractors: Number of distractor comment-code pair for each true pair.
Must be the same number as in the get_dataset_raw function
Returns:
A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
values. The keys are strings representing the name of the evaluation metric and the values are
floating-point numbers.
"""
ranks = []

batched_predictions = chunked(predictions, n_distractors + 1)

for batch in batched_predictions:
correct_score = batch[0]["score"]
scores = np.array([prediction["score"] for prediction in batch])
rank = np.sum(scores >= correct_score)
ranks.append(rank)
mean_mrr = np.mean(1.0 / np.array(ranks))

return {"mean mrr": mean_mrr}
Empty file.
47 changes: 47 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet
@@ -0,0 +1,47 @@
{
name: 'Natural Language Codesearch Ranking (codesearchnet_go)',

description: 'Natural Language Codesearch Ranking (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',

keywords: [
'codesearch',
'natural language query',
'mean reciprocal rank',
'go',
'cross-lingual',
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
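// The train split reuses the Python (adv) data while the test split is Go,
// matching the cross-lingual goal stated in the description.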
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},
},
}
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md
@@ -0,0 +1,19 @@
# Natural Language Codesearch Ranking (codesearchnet_go)

## Abstract
*Copy the abstract of your accompanying paper for this task, Natural Language Codesearch Ranking (codesearchnet_go), here.*

## Examples
*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_go).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_go).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_go) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
109 changes: 109 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py
@@ -0,0 +1,109 @@
import random
from typing import Dict, List

import datasets
import numpy as np

from genbench import Task


def chunked(iterable, chunk_size):
"""
Split an iterable into chunks of a specified size.
Args:
iterable: The iterable to be chunked.
chunk_size: The size of each chunk.
Returns:
A generator that yields chunks of the iterable.
"""
if chunk_size <= 0:
raise ValueError("Chunk size must be greater than zero")

chunk = []
for item in iterable:
chunk.append(item)
if len(chunk) == chunk_size:
yield chunk
chunk = []

if chunk:
yield chunk


class NlCodesearchMrrCodesearchnetGo(Task):
def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
"""Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
Args:
n_distractors: the number of randomly sampled distractor code for each ranking chunk
Returns:
A dictionary containing key-value pairs for the raw datasets.
The keys are strings representing the name of the dataset split
(e.g., "train", "validation", "test") and the values are
HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
The train split only contains the original dataset.
"""
# Load the raw datasets
raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
output: Dict[str, datasets.Dataset] = {}
# Set random seed for consistency
random.seed(42)
# Create n distractors for each item
for split, dataset in raw_datasets.items():
if split == "test":
# Convert dataset to list for easier manipulation
dataset_list = list(dataset)

new_data = []

for idx, item in enumerate(dataset_list):
new_data.append(item)

# Create other_items list once and then simply exclude the current item during sampling
other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
random_items = random.sample(other_items, n_distractors)

input_parts = item["input"].split("[CODESPLIT]")

for random_item in random_items:
random_input_parts = random_item["input"].split("[CODESPLIT]")
new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
new_data.append(new_item)

# Convert list back to HuggingFace dataset
output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
else:
output[split] = dataset
return output

def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
"""Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
This function assumes that the predictions were made and passed onto this function unshuffled.
The test data is ordered with each true pair followed by n number of distractors
Args:
predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
The keys are strings and the values are floats (logit scores or similarity values).
n_distractors: Number of distractor comment-code pair for each true pair.
Must be the same number as in the get_dataset_raw function
Returns:
A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
values. The keys are strings representing the name of the evaluation metric and the values are
floating-point numbers.
"""
ranks = []

batched_predictions = chunked(predictions, n_distractors + 1)

for batch in batched_predictions:
correct_score = batch[0]["score"]
scores = np.array([prediction["score"] for prediction in batch])
rank = np.sum(scores >= correct_score)
ranks.append(rank)
mean_mrr = np.mean(1.0 / np.array(ranks))

return {"mean mrr": mean_mrr}
Empty file.