Merge pull request #27 from drndr/nl_codesearch_mrr
[Task Submission] Natural Language Codesearch Ranking (`nl_codesearch_mrr`)
Showing 39 changed files with 1,847 additions and 0 deletions.
```python
from genbench import TaskDict


class NlCodesearchMrr(TaskDict):
    pass
```
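This `TaskDict` groups the subtasks below under a single task id. Assuming the standard GenBench loading convention applies (the loader itself is not part of this diff), the task and a subtask would be obtained roughly like this:

```python
from genbench import load_task

# Hypothetical usage; `load_task` and the `task:subtask` id format are
# GenBench conventions assumed here, not shown in this pull request.
task_dict = load_task("nl_codesearch_mrr")                   # the whole TaskDict
subtask = load_task("nl_codesearch_mrr:codesearchnet_adv")   # a single subtask
```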
Empty file.
src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet (48 additions, 0 deletions)
```jsonnet
{
  name: 'Natural Language Codesearch Ranking (codesearchnet_adv)',

  description: 'Natural Language Codesearch Ranking (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts.',

  keywords: [
    'codesearch',
    'natural language query',
    'mean reciprocal rank',
    'python',
    'robustness',
    'covariate shift',
  ],

  authors: [
    'Andor Diera',
    'Abdelhalim Dahou',
    'Lukas Galke',
    'Fabian Karl',
    'Florian Sihler',
    'Ansgar Scherp',
  ],

  data_source: {
    type: 'manual',
    test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
    train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
  },

  has_validation_set: false,
  has_train_set: true,

  task_type: 'multiple_choice',

  evaluation_metrics: [
    {
      hf_id: 'accuracy',
      git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
      best_score: 1.0,
    },
  ],

  preparation_strategies: {
    finetuning: {
      objective: 'maximum_likelihood',
    },
  },
}
```
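For orientation: the task code in this pull request splits each `input` on a `[CODESPLIT]` marker and reads `target` and `target_options` fields, so a record in the linked JSONL files presumably looks roughly like the sketch below (the concrete values are illustrative, not taken from the Zenodo data):

```python
# Illustrative record shape, inferred from how task.py consumes the data;
# the strings and label values here are assumptions, not real dataset content.
record = {
    "input": "Reverse a linked list[CODESPLIT]def reverse(head): ...",
    "target": 1,  # 1 for a true comment-code pair, 0 for a constructed distractor
    "target_options": ["no_match", "match"],  # assumed option labels
}
```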
src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md (19 additions, 0 deletions)
```markdown
# Natural Language Codesearch Ranking (codesearchnet_adv)

## Abstract
*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_adv).*

## Examples
*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_adv).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_adv).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_adv) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
```
src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py (127 additions, 0 deletions)
```python
import random
from typing import Dict, List

import datasets
import numpy as np

from genbench import Task


def chunked(iterable, chunk_size):
    """
    Split an iterable into chunks of a specified size.

    Args:
        iterable: The iterable to be chunked.
        chunk_size: The size of each chunk.

    Returns:
        A generator that yields chunks of the iterable.
    """
    if chunk_size <= 0:
        raise ValueError("Chunk size must be greater than zero")

    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []

    if chunk:
        yield chunk


class NlCodesearchMrrCodesearchnetAdv(Task):
    def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
        """Create the dataset, adding n distractor pairs (original comment, random code snippet) for ranking.

        Args:
            n_distractors: The number of randomly sampled distractor code snippets for each ranking chunk.

        Returns:
            A dictionary containing key-value pairs for the raw datasets.
            The keys are strings representing the name of the dataset split
            (e.g., "train", "validation", "test") and the values are HuggingFace
            `datasets.Dataset` objects. The test split contains the original pairs
            and the distractors; the train split contains the original pairs and
            one negative sample per pair.
        """
        # Load the raw datasets
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Set random seed for consistency
        random.seed(42)
        # Create distractors for each item
        for split, dataset in raw_datasets.items():
            if split == "test":
                # Convert dataset to list for easier manipulation
                dataset_list = list(dataset)

                new_data = []

                for idx, item in enumerate(dataset_list):
                    new_data.append(item)

                    # Create other_items once, excluding the current item, then sample from it
                    other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
                    random_items = random.sample(other_items, n_distractors)

                    input_parts = item["input"].split("[CODESPLIT]")

                    for random_item in random_items:
                        random_input_parts = random_item["input"].split("[CODESPLIT]")
                        new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                        new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
                        new_data.append(new_item)
                # Convert list back to HuggingFace dataset
                output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
            # Create negative samples for training
            elif split == "train":
                new_dataset = datasets.Dataset.from_dict({})
                for item in dataset:
                    # Add the true comment-code pair to the new dataset
                    new_dataset = new_dataset.add_item(item)
                    other_items = [other_item for other_item in dataset if other_item != item]
                    # Randomly select one other item
                    random_item = random.sample(other_items, 1)
                    # Split input into comment and code
                    input_parts = item["input"].split("[CODESPLIT]")
                    # Split random input into comment and code
                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                    # Pair the original comment with the random code snippet
                    new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                    new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
                    # Add the negative comment-code pair to the new dataset
                    new_dataset = new_dataset.add_item(new_item)
                output[split] = new_dataset
            else:
                output[split] = dataset
        return output

    def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
        """Calculate the MRR score in chunks. One chunk consists of a true comment-code pair and n distractors.

        This function assumes that the predictions were made and passed on unshuffled:
        the test data is ordered with each true pair followed by its n distractors.

        Args:
            predictions: A list of dictionaries, where each dictionary contains the predicted
                values for an example. The keys are strings and the values are floats
                (logit scores or similarity values).
            n_distractors: Number of distractor comment-code pairs for each true pair.
                Must be the same number as in get_dataset_raw.

        Returns:
            A dictionary containing key-value pairs for the evaluation metric(s) computed on
            the predicted values. The keys are strings representing the name of the evaluation
            metric and the values are floating-point numbers.
        """
        ranks = []

        batched_predictions = chunked(predictions, n_distractors + 1)

        for batch in batched_predictions:
            # The true pair is always the first element of each chunk
            correct_score = batch[0]["score"]
            scores = np.array([prediction["score"] for prediction in batch])
            rank = np.sum(scores >= correct_score)
            ranks.append(rank)
        mean_mrr = np.mean(1.0 / np.array(ranks))

        return {"mean mrr": mean_mrr}
```
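Concretely, within each chunk the true pair's rank is `np.sum(scores >= correct_score)`, so the mean MRR is the average of 1/rank over chunks; because of the `>=`, a tie with a distractor pushes the true pair's rank down, which is the conservative choice. A minimal self-contained sketch of the same computation with made-up scores (three-item chunks, i.e. n_distractors = 2):

```python
import numpy as np

# Made-up scores: two chunks of [true pair, distractor, distractor].
predictions = [
    {"score": 0.9}, {"score": 0.4}, {"score": 0.1},  # true pair ranks 1st -> 1/1
    {"score": 0.5}, {"score": 0.8}, {"score": 0.2},  # true pair ranks 2nd -> 1/2
]

n_distractors = 2
ranks = []
for i in range(0, len(predictions), n_distractors + 1):
    chunk = predictions[i : i + n_distractors + 1]
    correct_score = chunk[0]["score"]
    ranks.append(sum(p["score"] >= correct_score for p in chunk))

print(np.mean([1.0 / r for r in ranks]))  # (1/1 + 1/2) / 2 = 0.75
```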
Empty file.
src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet (47 additions, 0 deletions)
```jsonnet
{
  name: 'Natural Language Codesearch Ranking (codesearchnet_go)',

  description: 'Natural Language Codesearch Ranking (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',

  keywords: [
    'codesearch',
    'natural language query',
    'mean reciprocal rank',
    'go',
    'cross-lingual',
  ],

  authors: [
    'Andor Diera',
    'Abdelhalim Dahou',
    'Lukas Galke',
    'Fabian Karl',
    'Florian Sihler',
    'Ansgar Scherp',
  ],

  data_source: {
    type: 'manual',
    test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
    // Train reuses the Python (adv) training data, consistent with the
    // cross-lingual setup: train on Python, test on Go.
    train: 'https://zenodo.org/record/8310891/files/train_adv.jsonl',
  },

  has_validation_set: false,
  has_train_set: true,

  task_type: 'multiple_choice',

  evaluation_metrics: [
    {
      hf_id: 'accuracy',
      git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
      best_score: 1.0,
    },
  ],

  preparation_strategies: {
    finetuning: {
      objective: 'maximum_likelihood',
    },
  },
}
```
src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md (19 additions, 0 deletions)
```markdown
# Natural Language Codesearch Ranking (codesearchnet_go)

## Abstract
*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_go).*

## Examples
*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_go).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_go).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_go) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
```
src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py (109 additions, 0 deletions)
```python
import random
from typing import Dict, List

import datasets
import numpy as np

from genbench import Task


def chunked(iterable, chunk_size):
    """
    Split an iterable into chunks of a specified size.

    Args:
        iterable: The iterable to be chunked.
        chunk_size: The size of each chunk.

    Returns:
        A generator that yields chunks of the iterable.
    """
    if chunk_size <= 0:
        raise ValueError("Chunk size must be greater than zero")

    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []

    if chunk:
        yield chunk


class NlCodesearchMrrCodesearchnetGo(Task):
    def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
        """Create the dataset, adding n distractor pairs (original comment, random code snippet) for ranking.

        Args:
            n_distractors: The number of randomly sampled distractor code snippets for each ranking chunk.

        Returns:
            A dictionary containing key-value pairs for the raw datasets.
            The keys are strings representing the name of the dataset split
            (e.g., "train", "validation", "test") and the values are HuggingFace
            `datasets.Dataset` objects. The test split contains the original pairs
            and the distractors; the train split only contains the original dataset.
        """
        # Load the raw datasets
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Set random seed for consistency
        random.seed(42)
        # Create n_distractors distractors for each test item
        for split, dataset in raw_datasets.items():
            if split == "test":
                # Convert dataset to list for easier manipulation
                dataset_list = list(dataset)

                new_data = []

                for idx, item in enumerate(dataset_list):
                    new_data.append(item)

                    # Create other_items once, excluding the current item, then sample from it
                    other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
                    random_items = random.sample(other_items, n_distractors)

                    input_parts = item["input"].split("[CODESPLIT]")

                    for random_item in random_items:
                        random_input_parts = random_item["input"].split("[CODESPLIT]")
                        new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                        new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
                        new_data.append(new_item)

                # Convert list back to HuggingFace dataset
                output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
            else:
                output[split] = dataset
        return output

    def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
        """Calculate the MRR score in chunks. One chunk consists of a true comment-code pair and n distractors.

        This function assumes that the predictions were made and passed on unshuffled:
        the test data is ordered with each true pair followed by its n distractors.

        Args:
            predictions: A list of dictionaries, where each dictionary contains the predicted
                values for an example. The keys are strings and the values are floats
                (logit scores or similarity values).
            n_distractors: Number of distractor comment-code pairs for each true pair.
                Must be the same number as in get_dataset_raw.

        Returns:
            A dictionary containing key-value pairs for the evaluation metric(s) computed on
            the predicted values. The keys are strings representing the name of the evaluation
            metric and the values are floating-point numbers.
        """
        ranks = []

        batched_predictions = chunked(predictions, n_distractors + 1)

        for batch in batched_predictions:
            # The true pair is always the first element of each chunk
            correct_score = batch[0]["score"]
            scores = np.array([prediction["score"] for prediction in batch])
            rank = np.sum(scores >= correct_score)
            ranks.append(rank)
        mean_mrr = np.mean(1.0 / np.array(ranks))

        return {"mean mrr": mean_mrr}
```
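Putting the two methods together: dataset construction and evaluation must share the same `n_distractors`, and predictions must stay in dataset order. A hedged end-to-end sketch follows; the `load_task` call and the toy lexical-overlap scorer are assumptions standing in for the real loader and a real retrieval model:

```python
from genbench import load_task

# Hypothetical flow; the `task:subtask` id format is an assumed GenBench convention.
task = load_task("nl_codesearch_mrr:codesearchnet_go")

n_distractors = 49  # chunk size 50: 1 true pair + 49 distractors, as the original comments suggest
raw = task.get_dataset_raw(n_distractors)  # test split: each true pair followed by its distractors


def score(example):
    # Toy lexical-overlap scorer; a stand-in for a real model's relevance score.
    query, code = example["input"].split("[CODESPLIT]")
    return float(len(set(query.split()) & set(code.split())))


predictions = [{"score": score(ex)} for ex in raw["test"]]
print(task.evaluate_predictions(predictions, n_distractors))  # {'mean mrr': ...}
```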
Empty file.