This repository has been archived by the owner on Jul 23, 2024. It is now read-only.

Merge pull request #27 from drndr/nl_codesearch_mrr
[Task Submission] Natural Language Codesearch Ranking (`nl_codesearch_mrr`)
kazemnejad authored Dec 31, 2023
2 parents 474d98f + b5bec7e commit 5fd5b05
Showing 39 changed files with 1,847 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/__init__.py
@@ -0,0 +1,5 @@
from genbench import TaskDict


class NlCodesearchMrr(TaskDict):
pass
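# NlCodesearchMrr subclasses TaskDict, which groups the per-language subtasks
# added in this commit (e.g. codesearchnet_adv, codesearchnet_go) under the
# single nl_codesearch_mrr task id.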
Empty file.
48 changes: 48 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet
@@ -0,0 +1,48 @@
{
name: 'Natural Language Codesearch Ranking (codesearchnet_adv)',

description: 'Natural Language Codesearch Ranking (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shift',

keywords: [
'codesearch',
'natural language query',
'mean reciprocal rank',
'python',
'robustness',
'covariate shift',
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},
},
}
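For readers wiring this up, here is a minimal loading sketch. It assumes the standard genbench `load_task` entry point, the task id `nl_codesearch_mrr:codesearchnet_adv`, and `n_distractors=49` (the chunk size the comments in task.py suggest); none of these are pinned by this diff, and the codesearchnet_go subtask would load the same way under its own id.

from genbench import load_task

# Hypothetical usage; task id and distractor count are assumptions, not confirmed by this commit.
task = load_task("nl_codesearch_mrr:codesearchnet_adv")
splits = task.get_dataset_raw(n_distractors=49)
test_set = splits["test"]  # each true pair is followed by its 49 distractors, in order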
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md
@@ -0,0 +1,19 @@
# Natural Language Codesearch Ranking (codesearchnet_adv)

## Abstract
*Copy the abstract of your accompanying paper for this task, Natural Language Codesearch Ranking (codesearchnet_adv), here.*

## Examples
*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_adv).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_adv).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_adv) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
127 changes: 127 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py
@@ -0,0 +1,127 @@
import random
from typing import Dict, List

import datasets
import numpy as np

from genbench import Task


def chunked(iterable, chunk_size):
"""
Split an iterable into chunks of a specified size.
Args:
iterable: The iterable to be chunked.
chunk_size: The size of each chunk.
Returns:
A generator that yields chunks of the iterable.
"""
if chunk_size <= 0:
raise ValueError("Chunk size must be greater than zero")

chunk = []
for item in iterable:
chunk.append(item)
if len(chunk) == chunk_size:
yield chunk
chunk = []

if chunk:
yield chunk
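# e.g. list(chunked([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]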


class NlCodesearchMrrCodesearchnetAdv(Task):
def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
"""Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
Args:
n_distractors: the number of randomly sampled distractor code for each ranking chunk
Returns:
A dictionary containing key-value pairs for the raw datasets.
The keys are strings representing the name of the dataset split
(e.g., "train", "validation", "test") and the values are
HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
The train split only contains the original dataset.
"""
# Load the raw datasets
raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
output: Dict[str, datasets.Dataset] = {}
# Set random seed for consistency
random.seed(42)
# Create distractors for each item
for split, dataset in raw_datasets.items():
if split == "test":
# Convert dataset to list for easier manipulation
dataset_list = list(dataset)

new_data = []

for idx, item in enumerate(dataset_list):
new_data.append(item)

# Create other_items list once and then simply exclude the current item during sampling
other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
random_items = random.sample(other_items, n_distractors)

input_parts = item["input"].split("[CODESPLIT]")
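# Inputs are "comment[CODESPLIT]code" strings: keep the true comment
# (input_parts[0]) and pair it with each distractor's code below.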

for random_item in random_items:
random_input_parts = random_item["input"].split("[CODESPLIT]")
new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
new_data.append(new_item)
# Convert list back to HuggingFace dataset
output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
# Create negative samples for training
elif split == "train":
new_dataset = datasets.Dataset.from_dict({})
for item in dataset:
# Add comment-code pair to new dataset
new_dataset = new_dataset.add_item(item)
other_items = [other_item for other_item in dataset if other_item != item]
# Randomly select one other item
random_item = random.sample(other_items, 1)
# Split input into comment and code
input_parts = item["input"].split("[CODESPLIT]")
# Split random input into comment and code
random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
# Combine the "input" fields of the original and random items
new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
# Add negative sample comment-code pair to new dataset
new_dataset = new_dataset.add_item(new_item)
output[split] = new_dataset
else:
output[split] = dataset
return output

def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
"""Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
This function assumes that the predictions were made and passed onto this function unshuffled.
The test data is ordered with each true pair followed by n number of distractors
Args:
predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
The keys are strings and the values are floats (logit scores or similarity values).
n_distractors: Number of distractor comment-code pair for each true pair.
Must be the same number as in the get_dataset_raw function
Returns:
A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
values. The keys are strings representing the name of the evaluation metric and the values are
floating-point numbers.
"""
ranks = []

batched_predictions = chunked(predictions, n_distractors + 1)

for batch in batched_predictions:
correct_score = batch[0]["score"]
scores = np.array([prediction["score"] for prediction in batch])
rank = np.sum(scores >= correct_score)
ranks.append(rank)
mean_mrr = np.mean(1.0 / np.array(ranks))

return {"mean mrr": mean_mrr}
Empty file.
47 changes: 47 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet
@@ -0,0 +1,47 @@
{
name: 'Natural Language Codesearch Ranking (codesearchnet_go)',

description: 'Natural Language Codesearch Ranking (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',

keywords: [
'codesearch',
'natural language query',
'mean reciprocal rank',
'go',
'cross-lingual',
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
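// The train split reuses the Python (adv) data while the test split is Go,
// matching the cross-lingual goal stated in the description.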
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},
},
}
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md
@@ -0,0 +1,19 @@
# Natural Language Codesearch Ranking (codesearchnet_go)

## Abstract
*Copy the abstract of your accompanying paper for this task, Natural Language Codesearch Ranking (codesearchnet_go), here.*

## Examples
*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_go).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_go).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_go) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
109 changes: 109 additions & 0 deletions src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py
@@ -0,0 +1,109 @@
import random
from typing import Dict, List

import datasets
import numpy as np

from genbench import Task


def chunked(iterable, chunk_size):
"""
Split an iterable into chunks of a specified size.
Args:
iterable: The iterable to be chunked.
chunk_size: The size of each chunk.
Returns:
A generator that yields chunks of the iterable.
"""
if chunk_size <= 0:
raise ValueError("Chunk size must be greater than zero")

chunk = []
for item in iterable:
chunk.append(item)
if len(chunk) == chunk_size:
yield chunk
chunk = []

if chunk:
yield chunk


class NlCodesearchMrrCodesearchnetGo(Task):
def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
"""Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
Args:
n_distractors: the number of randomly sampled distractor code for each ranking chunk
Returns:
A dictionary containing key-value pairs for the raw datasets.
The keys are strings representing the name of the dataset split
(e.g., "train", "validation", "test") and the values are
HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
The train split only contains the original dataset.
"""
# Load the raw datasets
raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
output: Dict[str, datasets.Dataset] = {}
# Set random seed for consistency
random.seed(42)
# Create n distractors for each item
for split, dataset in raw_datasets.items():
if split == "test":
# Convert dataset to list for easier manipulation
dataset_list = list(dataset)

new_data = []

for idx, item in enumerate(dataset_list):
new_data.append(item)

# Create other_items list once and then simply exclude the current item during sampling
other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
random_items = random.sample(other_items, n_distractors)

input_parts = item["input"].split("[CODESPLIT]")

for random_item in random_items:
random_input_parts = random_item["input"].split("[CODESPLIT]")
new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
new_data.append(new_item)

# Convert list back to HuggingFace dataset
output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
else:
output[split] = dataset
return output

def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
"""Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
This function assumes that the predictions were made and passed onto this function unshuffled.
The test data is ordered with each true pair followed by n number of distractors
Args:
predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
The keys are strings and the values are floats (logit scores or similarity values).
n_distractors: Number of distractor comment-code pair for each true pair.
Must be the same number as in the get_dataset_raw function
Returns:
A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
values. The keys are strings representing the name of the evaluation metric and the values are
floating-point numbers.
"""
ranks = []

batched_predictions = chunked(predictions, n_distractors + 1)

for batch in batched_predictions:
correct_score = batch[0]["score"]
scores = np.array([prediction["score"] for prediction in batch])
rank = np.sum(scores >= correct_score)
ranks.append(rank)
mean_mrr = np.mean(1.0 / np.array(ranks))

return {"mean mrr": mean_mrr}
Empty file.