diff --git a/src/genbench/tasks/europarl_dbca_splits/__init__.py b/src/genbench/tasks/europarl_dbca_splits/__init__.py
new file mode 100644
index 0000000..eecdf60
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class EuroparlDbcaSplits(TaskDict):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/_base_task.py b/src/genbench/tasks/europarl_dbca_splits/_base_task.py
new file mode 100644
index 0000000..3e4be76
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/_base_task.py
@@ -0,0 +1,116 @@
+from collections import OrderedDict
+from typing import Any, List, Mapping
+
+import evaluate
+import numpy as np
+from datasets import Dataset
+
+from genbench import Task
+from genbench.api import EvaluationResult, TaskType
+from genbench.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class BaseDbcaTask(Task):
+ """This task evaluates how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).
+ """
+
+ def evaluate_predictions(
+ self,
+ *,
+ predictions: List[Mapping[str, Any]] = None,
+ gold: Dataset = None,
+ ) -> EvaluationResult:
+ result = OrderedDict()
+ for metric_config in self.config.evaluation_metrics:
+ hf_id = metric_config.hf_id
+ if isinstance(hf_id, str):
+ hf_id = [hf_id]
+
+ metric = evaluate.load(*hf_id, revision=metric_config.git_commit_sha)
+
+ refs_lst = [g["target"] for g in gold]
+ preds_lst = [pred["target"] for pred in predictions]
+
+ ref_type = type(refs_lst[0])
+ pred_type = type(preds_lst[0])
+ if pred_type != ref_type:
+ if self.config.task_type != TaskType.MULTIPLE_CHOICE:
+ raise ValueError(
+ f"Predictions and references have different types: preds: {pred_type} and refs: {ref_type}. "
+ )
+ # Convert predictions to the same type as the references
+ if pred_type == str and ref_type == int:
+ logger.warning("Predictions are strings, but references are ints. Converting predictions to ints.")
+ converted_preds = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ preds_lst = converted_preds
+ elif pred_type == int and ref_type == str:
+ logger.warning("Predictions are ints, but references are strings. Converting references to ints.")
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ refs_lst = converted_refs
+ else:
+ if self.config.task_type == TaskType.MULTIPLE_CHOICE and pred_type != int:
+ # Convert both predictions and references to int
+ logger.warning(
+ "Predictions and references have the same type, but it is not int. Converting both to int."
+ )
+ converted_preds = []
+ converted_refs = []
+ for pred, ref in zip(preds_lst, gold):
+ assert "target_options" in ref
+ converted_preds.append(ref["target_options"].index(pred))
+ converted_refs.append(ref["target_options"].index(ref["target"]))
+ preds_lst = converted_preds
+ refs_lst = converted_refs
+
+ extra_kwargs = metric_config.compute_extra_kwargs or {}
+ output: dict = metric.compute(predictions=preds_lst, references=refs_lst, **extra_kwargs)
+
+ if output is None:
+ raise ValueError(
+ f"Metric {metric_config.hf_id} returned None. " f"Please check the metric implementation."
+ )
+
+ # Update output keys to include the metric id, keeping only the "score" entry
+ metric_id = "_".join(hf_id)
+ output = {f"hf_{metric_id}__{k}": v for k, v in output.items() if k == "score"}
+
+ result.update(output)
+
+ return result
+
+ def chernoff_coef(self, vec1, vec2, alpha):
+ """
+ The Chernoff coefficient c is a similarity measure C_{alpha}(P||Q)
+ = sum_k[p_k^alpha * q_k^(1-alpha)] in [0,1] between two (probability)
+ distributions P and Q. The alpha parameter determines if we want to
+ measure whether Q includes elements that are not in P.
+ """
+ if alpha < 0 or alpha > 1:
+ raise ValueError("alpha must be in [0,1]")
+ # use log to avoid underflow
+ return np.sum(np.exp((np.log(vec1) * alpha) + (np.log(vec2) * (1 - alpha))), axis=1)
+
+ def normalize_vector(self, vector):
+ """Normalize a vector to have sum 1."""
+ return np.nan_to_num(np.divide(vector, np.sum(vector)))
+
+ def divergence(self, vec1, vec2, alpha):
+ """
+ Calculate divergence between two vectors.
+ Atom divergence is 1 - Chernoff coefficient, with alpha=0.5.
+ Compound divergence is 1 - Chernoff coefficient, with alpha=0.1.
+ """
+ return float(1 - self.chernoff_coef(self.normalize_vector(vec1), self.normalize_vector(vec2), alpha))
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/config.jsonnet
new file mode 100644
index 0000000..4c9d9bd
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_de)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_de'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/doc.md
new file mode 100644
index 0000000..c6e1e28
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_de)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/task.py
new file mode 100644
index 0000000..898b036
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_de/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0De(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/config.jsonnet
new file mode 100644
index 0000000..c8b975b
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_el)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_el'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/doc.md
new file mode 100644
index 0000000..f880163
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_el)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/task.py
new file mode 100644
index 0000000..1124f49
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_el/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0El(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/config.jsonnet
new file mode 100644
index 0000000..e97f2bd
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_fi)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_fi'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/doc.md
new file mode 100644
index 0000000..31a0e0d
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_fi)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/task.py
new file mode 100644
index 0000000..7bf9f32
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fi/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0Fi(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/config.jsonnet
new file mode 100644
index 0000000..0cf8db9
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv0_fr)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv0.0_en_fr'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/doc.md
new file mode 100644
index 0000000..79e7f71
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv0_fr)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/task.py
new file mode 100644
index 0000000..943fe65
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv0_fr/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv0Fr(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/config.jsonnet
new file mode 100644
index 0000000..837e681
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/config.jsonnet
@@ -0,0 +1,44 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_de)',
+
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_de'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/doc.md
new file mode 100644
index 0000000..58415ce
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_de)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/task.py
new file mode 100644
index 0000000..3b9ec0a
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_de/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1De(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/config.jsonnet
new file mode 100644
index 0000000..f6be560
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/config.jsonnet
@@ -0,0 +1,44 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_el)',
+
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_el'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/doc.md
new file mode 100644
index 0000000..90b6a6b
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_el)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/task.py
new file mode 100644
index 0000000..7fcf724
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_el/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1El(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/config.jsonnet
new file mode 100644
index 0000000..76976df
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_fi)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_fi'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/doc.md
new file mode 100644
index 0000000..0c5f258
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_fi)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/task.py
new file mode 100644
index 0000000..8fc677b
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fi/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1Fi(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/__init__.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/config.jsonnet
new file mode 100644
index 0000000..6d095f4
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/config.jsonnet
@@ -0,0 +1,43 @@
+{
+ name: 'Europarl DBCA splits (comdiv1_fr)',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ 'dependency relations',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ task_type: 'free_form',
+
+ data_source: {
+ type: 'hf',
+ hf_id: ['Anssi/europarl_dbca_splits', 'comdiv1.0_en_fr'],
+ git_commit_sha: '0dcb7abe8e18aa520cbfcbe9141b916c684912fc'
+ },
+
+ evaluation_metrics: [
+ {
+ hf_id: 'chrf',
+ git_commit_sha: '4b119256e85de9130aa84d87247381c5acb29bc1',
+ best_score: 100.0,
+ }
+ ],
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/doc.md b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/doc.md
new file mode 100644
index 0000000..eda471f
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/doc.md
@@ -0,0 +1,3 @@
+# Europarl DBCA splits (comdiv1_fr)
+
+see ../doc.md
diff --git a/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/task.py b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/task.py
new file mode 100644
index 0000000..8e27ac1
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/comdiv1_fr/task.py
@@ -0,0 +1,5 @@
+from genbench.tasks.europarl_dbca_splits._base_task import BaseDbcaTask
+
+
+class EuroparlDbcaSplitsComdiv1Fr(BaseDbcaTask):
+ pass
diff --git a/src/genbench/tasks/europarl_dbca_splits/config.jsonnet b/src/genbench/tasks/europarl_dbca_splits/config.jsonnet
new file mode 100644
index 0000000..9b01c57
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/config.jsonnet
@@ -0,0 +1,28 @@
+{
+ name: 'Divergent DepRel Distributions',
+
+ description: 'This task aims to measure how well an NMT model generalises to a shifted distribution of
+ dependency relations. In practice, this means that the test set includes novel
+ (head lemma, dependency relation, dependent lemma) tuples (=compounds) that were not seen in
+ the training set, while having similar relative frequencies of the lemmas and dependency
+ relation tags (= elements of the compound tuples = atoms).',
+
+ keywords: [
+ 'translation',
+ ],
+
+ authors: [
+ 'Anssi Moisio',
+ ],
+
+ subtasks_order: [
+ 'comdiv0_de',
+ 'comdiv1_de',
+ 'comdiv0_fr',
+ 'comdiv1_fr',
+ 'comdiv0_el',
+ 'comdiv1_el',
+ 'comdiv0_fi',
+ 'comdiv1_fi',
+ ],
+}
diff --git a/src/genbench/tasks/europarl_dbca_splits/doc.md b/src/genbench/tasks/europarl_dbca_splits/doc.md
new file mode 100644
index 0000000..a32d0bf
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/doc.md
@@ -0,0 +1,64 @@
+# Train-test data splits of the Europarl NMT corpus with divergent distributions of dependency relations
+## Abstract
+Compositional generalisation (CG), in NLP and in machine learning generally, has been assessed mostly using artificial datasets. It is important to develop benchmarks to assess CG also in real-world natural language tasks in order to understand the abilities and limitations of systems that are deployed in the wild. In our GenBench Collaborative Benchmarking Task submission, we utilise the distribution-based compositionality assessment (DBCA) framework to split the Europarl translation corpus into a training and test set in a way that translating the test set requires compositional generalisation capacity. Specifically, the training and test sets have divergent distributions of dependency relations, testing the NMT system's capacity to translate dependencies that they have not been trained on.
+
+
+## Examples
+The task is simply sentence-level translation, e.g.:
+```
+"input": "If the House agrees, I shall do as Mr Evans has suggested.", "target": "Jos parlamentin jäsenet kannattavat sitä, teen niin kuin jäsen Evans ehdotti."
+```
+
+
+## Usage
+To use the provided maximum-compound-divergence train-test split for a target language (German=de, French=fr, Greek=el, Finnish=fi), load the data, train a model on the training subset, and evaluate the model's predictions on the test subset:
+```
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+# Load the task
+task = load_task("europarl_dbca_splits")
+ds = task.comdiv1_de.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+# Evaluate predictions
+preds = ...
+print(task.comdiv1_de.evaluate_predictions(
+ predictions=preds,
+ gold=ds['test'],
+ )
+ )
+```
+To compare a model's capacity to generalise, we assess how much the translation accuracy decreases when the compound divergence between train and test sets increases. We keep atom distributions the same between train and test sets to make generalisation possible in principle. This means we should evaluate each model on both low- and high-compound-divergence data splits. To compute the generalisation score as described in the accompanying paper, train two systems on the splits with compound divergence values 0 and 1 (e.g. subtasks "comdiv0_de" and "comdiv1_de"), and take the ratio of the chrF2++ scores: `task.comdiv1_de.evaluate_predictions(predictions_comdiv1_de, gold_comdiv1_de) / task.comdiv0_de.evaluate_predictions(predictions_comdiv0_de, gold_comdiv0_de)`
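+
+For example, a minimal sketch of that computation, where `preds_comdiv0_de`/`preds_comdiv1_de` and `ds_comdiv0_de`/`ds_comdiv1_de` are placeholder names for the two systems' prediction lists and the two subtasks' prepared datasets:
+```
+# Placeholder names: preds_comdiv*_de are lists of {"target": "<translation>"} dicts,
+# ds_comdiv*_de are the prepared datasets of the comdiv0_de and comdiv1_de subtasks.
+score_comdiv0 = task.comdiv0_de.evaluate_predictions(predictions=preds_comdiv0_de, gold=ds_comdiv0_de["test"])
+score_comdiv1 = task.comdiv1_de.evaluate_predictions(predictions=preds_comdiv1_de, gold=ds_comdiv1_de["test"])
+# chrF2++ is returned under the "hf_chrf__score" key
+generalisation_score = score_comdiv1["hf_chrf__score"] / score_comdiv0["hf_chrf__score"]
+```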
+
+#### Using your own data sets:
+To compute the atom and compound divergences for any pair of training (pre-training, training and/or fine-tuning) and test data sets, use the method `EuroparlDbcaSplitsComdiv0De.divergence`. To create the atom and compound distributions of the training and test sets, the frequencies of each atom and compound in each set first need to be counted. The vectors that represent the atom and compound distributions of the train/test sets are passed to the method to calculate the divergences:
+```
+import numpy as np
+# alpha is 0.5 for atom divergence and 0.1 for compound divergence
+train_set_atom_distribution = np.array([[2,4,10]])
+test_set_atom_distribution = np.array([[1,2,5]])
+atom_divergence = task.comdiv1_de.divergence(train_set_atom_distribution,
+ test_set_atom_distribution,
+ 0.5)
+# atom_divergence = 0.0
+
+train_set_compound_distribution = np.array([[2,0,6]])
+test_set_compound_distribution = np.array([[0,5,5]])
+compound_divergence = task.comdiv1_de.divergence(train_set_compound_distribution,
+ test_set_compound_distribution,
+ 0.1)
+# compound_divergence = 0.4793101280037947
+```
+Each element in the distribution vectors represents the frequency of one type of atom/compound.
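+
+For illustration, here is a minimal sketch of turning counted atoms into such frequency vectors; the atom lists and variable names are hypothetical, and extracting atoms/compounds from dependency parses is outside the scope of this snippet:
+```
+from collections import Counter
+
+import numpy as np
+
+# Hypothetical atom occurrences in a train set and a test set
+train_atoms = ["nsubj", "obj", "nsubj", "det"]
+test_atoms = ["nsubj", "obj", "det", "det"]
+
+# Align both counts over a shared vocabulary so that vector elements correspond
+vocab = sorted(set(train_atoms) | set(test_atoms))
+train_counts, test_counts = Counter(train_atoms), Counter(test_atoms)
+train_vec = np.array([[train_counts[a] for a in vocab]])
+test_vec = np.array([[test_counts[a] for a in vocab]])
+
+atom_divergence = task.comdiv1_de.divergence(train_vec, test_vec, 0.5)
+```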
+
+
+## Data Source
+The original data source is `https://opus.nlpl.eu/Europarl.php`
+
+## Limitations and Bias
+Our goal was to create a benchmark that tests generalisation to novel dependency relations in a comprehensive way, not selecting some specific types of dependency relations and leaving out other types. However, memory requirements of the data splitting algorithm did not permit us to use all of the atoms and compounds in the distribution divergence calculations, so we opted to leave out the most frequent and the most infrequent lemmas, and the dependency relations that include them, which probably affects the results.
+
+## GenBench Eval card
+The motivation is primarily intrinsic: it is important to assess whether translation models learn the systematic rules that characterise natural language, in order to gain some understanding of how the models work. Another motivation is practical: compositional generalisation would make models more robust. The type of generalisation is compositional, and the shift type is covariate, since the input data distribution changes but the task otherwise remains the same. The shift source is partitioned natural data, since we do not use any artificial data, but the train-test split is artificial. Lastly, the shift locus in our experiments is train-test, but the method and benchmark could also be used as a finetune train-test benchmark, by finetuning a pretrained model on the training set.
+
+![GenBench Eval Card](eval_card.png)
diff --git a/src/genbench/tasks/europarl_dbca_splits/eval_card.png b/src/genbench/tasks/europarl_dbca_splits/eval_card.png
new file mode 100644
index 0000000..6f7cd95
Binary files /dev/null and b/src/genbench/tasks/europarl_dbca_splits/eval_card.png differ
diff --git a/src/genbench/tasks/europarl_dbca_splits/requirements-usage-example.txt b/src/genbench/tasks/europarl_dbca_splits/requirements-usage-example.txt
new file mode 100644
index 0000000..765824a
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/requirements-usage-example.txt
@@ -0,0 +1 @@
+transformers==4.35.2
diff --git a/src/genbench/tasks/europarl_dbca_splits/usage_example.py b/src/genbench/tasks/europarl_dbca_splits/usage_example.py
new file mode 100644
index 0000000..c3c9b12
--- /dev/null
+++ b/src/genbench/tasks/europarl_dbca_splits/usage_example.py
@@ -0,0 +1,193 @@
+"""
+Usage example for the Europarl DBCA splits task.
+
+Training of the NMT model is mostly based on the HuggingFace NLP course chapter on translation:
+https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt
+"""
+import argparse
+
+from datasets import DatasetDict
+from transformers import (
+ DataCollatorForSeq2Seq,
+ FSMTConfig,
+ FSMTForConditionalGeneration,
+ FSMTTokenizer,
+ Seq2SeqTrainer,
+ Seq2SeqTrainingArguments,
+ pipeline,
+)
+
+from genbench import load_task
+from genbench.api import PreparationStrategy
+
+
+def tokenize_corpus(dataset, save_to_file):
+ """
+ Tokenizes the dataset and saves it to disk.
+ """
+
+ def preprocess_function(examples):
+ inputs = examples["input"]
+ targets = examples["target"]
+ model_inputs = tokenizer(inputs, text_target=targets, max_length=MAX_LENGTH, truncation=True)
+ return model_inputs
+
+ dataset = DatasetDict(dataset)
+ tokenized = dataset.map(
+ preprocess_function,
+ batched=True,
+ )
+ tokenized.save_to_disk(save_to_file)
+ return tokenized
+
+
+def translate_sentences(model_name_or_path, eval_dataset):
+ """
+ Translates the sentences in eval_dataset using the given model.
+ """
+ translator = pipeline(
+ "translation",
+ model=model_name_or_path,
+ device="cuda",
+ batch_size=BATCH_SIZE,
+ )
+ return translator(eval_dataset, max_length=MAX_LENGTH)
+
+
+def train_from_scratch(tokenized_corpus, output_dir_name):
+ """
+ Trains an FSMT model from scratch.
+ Model architecture is similar to that in Vaswani et al. (2017).
+ """
+ config = FSMTConfig(
+ activation_dropout=0.0,
+ activation_function="relu",
+ architectures=["FSMTForConditionalGeneration"],
+ attention_dropout=0.1,
+ bos_token_id=0,
+ d_model=512,
+ decoder={"bos_token_id": 2, "model_type": "fsmt_decoder", "vocab_size": 42024},
+ decoder_attention_heads=8,
+ decoder_ffn_dim=2048,
+ decoder_layerdrop=0,
+ decoder_layers=6,
+ decoder_start_token_id=2,
+ dropout=0.1,
+ encoder_attention_heads=8,
+ encoder_ffn_dim=2048,
+ encoder_layerdrop=0,
+ encoder_layers=6,
+ eos_token_id=2,
+ forced_eos_token_id=2,
+ init_std=0.02,
+ is_encoder_decoder=True,
+ langs=["en", "de"],
+ length_penalty=1.15,
+ max_length=MAX_LENGTH,
+ max_position_embeddings=1024,
+ model_type="fsmt",
+ num_beams=5,
+ num_hidden_layers=6,
+ pad_token_id=1,
+ scale_embedding=True,
+ src_vocab_size=42024,
+ tgt_vocab_size=42024,
+ tie_word_embeddings=False,
+ transformers_version="4.35.2",
+ use_cache=True,
+ )
+ model = FSMTForConditionalGeneration(config=config)
+
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+
+ training_args = Seq2SeqTrainingArguments(
+ output_dir=output_dir_name,
+ evaluation_strategy="steps",
+ eval_steps=5000,
+ save_strategy="steps",
+ save_steps=10000,
+ learning_rate=2e-5,
+ per_device_train_batch_size=BATCH_SIZE,
+ per_device_eval_batch_size=BATCH_SIZE,
+ weight_decay=0.01,
+ save_total_limit=10,
+ max_steps=100000,
+ fp16=True,
+ )
+
+ trainer = Seq2SeqTrainer(
+ model,
+ training_args,
+ train_dataset=tokenized_corpus["train"],
+ eval_dataset=tokenized_corpus["validation"],
+ data_collator=data_collator,
+ tokenizer=tokenizer,
+ )
+ trainer.train()
+
+
+if __name__ == "__main__":
+ argparser = argparse.ArgumentParser()
+ argparser.add_argument("--tokenize", action="store_true")
+ argparser.add_argument("--train", action="store_true")
+ argparser.add_argument("--eval", action="store_true")
+ args = argparser.parse_args()
+
+ # Load the task
+ task = load_task("europarl_dbca_splits")
+
+ # A pretrained multilingual tokenizer, used for both models and both languages
+ tokenizer = FSMTTokenizer.from_pretrained("stas/tiny-wmt19-en-de")
+
+ MAX_LENGTH = 128
+ BATCH_SIZE = 128
+
+ results = []
+ # "comdiv0" is the easy non-compositional data split, with minimal compound divergence
+ # "comdiv1" is the difficult, compositional data split, with maximal compound divergence
+ # English-German corpus is used for this example.
+ # For other target languages, replace "de" with "fr", "el", or "fi" in the subtask name.
+ for comdiv in ["0", "1"]:
+ if comdiv == "0":
+ subtask = task.comdiv0_de
+ else:
+ subtask = task.comdiv1_de
+
+ subtask_dataset = subtask.get_prepared_datasets(PreparationStrategy.FINETUNING)
+
+ tokenized_dataset_dir = f"ds_de_comdiv{comdiv}_tokenized"
+ if args.tokenize:
+ tokenized_datasets = tokenize_corpus(subtask_dataset, tokenized_dataset_dir)
+ else:
+ tokenized_datasets = DatasetDict.load_from_disk(tokenized_dataset_dir)
+
+ # Extract a validation set from training set
+ train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.01)
+ tokenized_datasets["train"] = train_val_split["train"]
+ tokenized_datasets["validation"] = train_val_split["test"]
+
+ nmt_model_dir = f"FSMT_en-de_comdiv{comdiv}"
+ if args.train:
+ train_from_scratch(tokenized_datasets, nmt_model_dir)
+
+ if args.eval:
+ cp = "checkpoint-100000"
+ print(f"Results for comdiv{comdiv}, checkpoint {cp}")
+ preds = translate_sentences(nmt_model_dir + "/" + cp, tokenized_datasets["test"]["input"])
+
+ # re-map the keys to match the evaluation script
+ preds = [{"target": pred["translation_text"]} for pred in preds]
+
+ score = subtask.evaluate_predictions(
+ predictions=preds,
+ gold=tokenized_datasets["test"],
+ )
+ print(score)
+ results.append(score)
+
+ if args.eval:
+ print(
+ "Generalisation score (maximum compound divergence score divided by "
+ + "minimum compound divergence score):"
+ )
+ print(results[1]["hf_chrf__score"] / results[0]["hf_chrf__score"])
diff --git a/src/genbench/tasks/nl_codesearch_mrr/GenBench Evaluation Card.pdf b/src/genbench/tasks/nl_codesearch_mrr/GenBench Evaluation Card.pdf
new file mode 100644
index 0000000..3d4e16e
Binary files /dev/null and b/src/genbench/tasks/nl_codesearch_mrr/GenBench Evaluation Card.pdf differ
diff --git a/src/genbench/tasks/nl_codesearch_mrr/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/__init__.py
new file mode 100644
index 0000000..85a91e5
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class NlCodesearchMrr(TaskDict):
+ pass
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet
new file mode 100644
index 0000000..4272171
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_adv)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'python',
+ 'robustness',
+ 'covariate shift',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md
new file mode 100644
index 0000000..901fc56
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_adv)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_adv).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_adv).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_adv).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_adv) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py
new file mode 100644
index 0000000..52535c5
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_adv/task.py
@@ -0,0 +1,127 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetAdv(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create distractors for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ # Create negative samples for training
+ elif split == "train":
+ new_dataset = datasets.Dataset.from_dict({})
+ for item in dataset:
+ # Add comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(item)
+ other_items = [other_item for other_item in dataset if other_item != item]
+ # Randomly select one other item
+ random_item = random.sample(other_items, 1)
+ # Split input into comment and code
+ input_parts = item["input"].split("[CODESPLIT]")
+ # Split random input into comment and code
+ random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
+ # Combine the "input" fields of the original and random items
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ # Add negative sample comment-code pair to new dataset
+ new_dataset = new_dataset.add_item(new_item)
+ output[split] = new_dataset
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet
new file mode 100644
index 0000000..990651b
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_go)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'go',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md
new file mode 100644
index 0000000..8bbf5c3
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_go)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_go).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_go).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_go).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_go) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py
new file mode 100644
index 0000000..beff8ca
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_go/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetGo(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractors for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch_idx, predictions in enumerate(batched_predictions):
+ correct_score = predictions[0]["score"]
+ scores = np.array([prediction["score"] for prediction in predictions])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/config.jsonnet
new file mode 100644
index 0000000..e97580e
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_java)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_java) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'java',
+ 'cross-lingual'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_java.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/doc.md
new file mode 100644
index 0000000..a18ffab
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_java)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_java).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_java).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_java).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_java) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/task.py
new file mode 100644
index 0000000..b5ec8e0
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_java/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetJava(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractors for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch in batched_predictions:
+ correct_score = batch[0]["score"]
+ scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/config.jsonnet
new file mode 100644
index 0000000..3a691cb
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_javascript)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_javascript) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'javascript',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_javascript.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/doc.md
new file mode 100644
index 0000000..6b56758
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_javascript)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_javascript).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_javascript).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_javascript).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_javascript) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/task.py
new file mode 100644
index 0000000..aeb2056
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_javascript/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetJavascript(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractors for each test item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch in batched_predictions:
+ correct_score = batch[0]["score"]
+ scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/config.jsonnet
new file mode 100644
index 0000000..3f12d27
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_php)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_php) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'php',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_php.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/doc.md
new file mode 100644
index 0000000..9fd3043
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_php)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_php).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_php).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_php).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_php) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/task.py
new file mode 100644
index 0000000..797855b
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_php/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetPhp(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractors for each test item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch in batched_predictions:
+ correct_score = batch[0]["score"]
+ scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/config.jsonnet
new file mode 100644
index 0000000..e3d7582
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/config.jsonnet
@@ -0,0 +1,47 @@
+{
+ name: 'Natural Language Codesearch Ranking (codesearchnet_ruby)',
+
+ description: 'Natural Language Codesearch Ranking (codesearchnet_ruby) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'ruby',
+ 'cross-lingual',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_ruby.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/doc.md b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/doc.md
new file mode 100644
index 0000000..a0e0efb
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (codesearchnet_ruby)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (codesearchnet_ruby).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (codesearchnet_ruby).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (codesearchnet_ruby).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (codesearchnet_ruby) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/task.py b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/task.py
new file mode 100644
index 0000000..f2525c1
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/codesearchnet_ruby/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCodesearchnetRuby(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create n_distractors distractors for each test item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch in batched_predictions:
+ correct_score = batch[0]["score"]
+ scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/config.jsonnet
new file mode 100644
index 0000000..ee9854d
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/config.jsonnet
@@ -0,0 +1,29 @@
+{
+ name: 'Natural Language Codesearch Ranking',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal ranking',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ subtasks_order: [
+ 'codesearchnet_adv',
+ 'cosqa',
+ 'codesearchnet_ruby',
+ 'codesearchnet_go',
+ 'codesearchnet_java',
+ 'codesearchnet_javascript',
+ 'codesearchnet_php',
+ 'statcodesearch',
+ ],
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/cosqa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/cosqa/config.jsonnet
new file mode 100644
index 0000000..846e115
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/cosqa/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'Natural Language Codesearch Ranking (cosqa)',
+
+ description: 'Natural Language Codesearch Ranking (cosqa) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'python',
+ 'robustness',
+ 'covariate shift',
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_cosqa.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/doc.md b/src/genbench/tasks/nl_codesearch_mrr/cosqa/doc.md
new file mode 100644
index 0000000..e31666d
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/cosqa/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (cosqa)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (cosqa).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (cosqa).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (cosqa).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (cosqa) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/cosqa/task.py b/src/genbench/tasks/nl_codesearch_mrr/cosqa/task.py
new file mode 100644
index 0000000..64b959e
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/cosqa/task.py
@@ -0,0 +1,109 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrCosqa(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ # Load the raw datasets
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ # Set random seed for consistency
+ random.seed(42)
+ # Create distractors for each item
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch in batched_predictions:
+ correct_score = batch[0]["score"]
+ scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/doc.md b/src/genbench/tasks/nl_codesearch_mrr/doc.md
new file mode 100644
index 0000000..3cf5ad0
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/doc.md
@@ -0,0 +1,43 @@
+## Motivation
+Language models can serve as a valuable tool for software developers to increase productivity. Large generative models can be used for code generation and code completion, while smaller encoder-only models are capable of performing code search tasks using natural language queries. These capabilities are heavily influenced by the quality and diversity of the available training data. Source code datasets used for training usually focus on the most popular languages, and testing is mostly conducted on the same distributions, often overlooking low-resource programming languages. Motivated by the NLP generalisation taxonomy proposed by Hupkes et al., we propose a new benchmark dataset called [placeholder] which builds upon existing natural language code search datasets to systematically study the code understanding generalization capabilities of language models. For evaluation and comparison, we collect several baseline results using fine-tuned BERT-style models and GPT-style large language models in a zero-shot setting.
+
+## Examples
+Given n comment-code pairs (1 true pair and n-1 distractor pairs, in which the comment has been matched with a random code snippet), calculate the MRR score.
+
+**true sample**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def set_sampled_topics ( self , sampled_topics ) : assert sampled_topics . dtype == np . int and len ( sampled_topics . shape ) <= 2 if len ( sampled_topics . shape ) == 1 : self . sampled_topics = sampled_topics . reshape ( 1 , sampled_topics . shape [ 0 ] ) else : self . sampled_topics = sampled_topics self . samples = self . sampled_topics . shape [ 0 ] self . tt = self . tt_comp ( self . sampled_topics ) self . dt = self . dt_comp ( self . sampled_topics )", "target": 1, "target_options": ["no_match", "match"]} \
+**distractor sample**: {"input": "Allocate sampled topics to the documents rather than estimate them . Automatically generate term - topic and document - topic matrices . [SEP] def _resolve_entity ( mo ) : ent = mo . group ( \"entity\" ) s = mo . group ( ) if s . startswith ( '' ) : if s [ 2 ] in 'xX' : radix = 16 else : radix = 10 try : num = int ( ent , radix ) except ( ValueError , OverflowError ) : return u'' else : num = name2codepoint . get ( ent ) if num is None or num < 0 : # unknown entity -> ignore return u'' try : return unichr ( num ) except ValueError : return u''", "target": 0, "target_options": ["no_match", "match"]}
+
+## Data Source
+**CodeSearchNet** : original dataset first published in https://arxiv.org/pdf/1909.09436.pdf , Java, Javascript, Go, Ruby, PHP subsets collected from huggingface-hub \
+**CodeSearchNet Adv** : a processed version of the CodeSearchNet Python dataset, introduced in the CodeXGLUE benchmark suite https://github.com/microsoft/CodeXGLUE \
+**CoSQA** : Python codesnippets from the CodeSearchNet dataset paired with real world user search engine queries, introduced in https://arxiv.org/pdf/2105.13239.pdf \
+**StatCodeSearch** : R code-comment pair snippets, scraped and extracted from public projects on the Open Science Framework (OSF) by the submission authors
+
+For each comment in each subset we randomly sampled another code snippet from the given subset to create a fully balanced binary classification dataset. \
+For the dataset statistics we only consider the positive (matching) pairs.
+
+**Dataset Size**:
+
+*Finetuning set:*
+- CodeSearchNet Adv train set: 251820
+
+*Test sets:*
+- CodeSearchNet Adv test set: 19210
+- CoSQA: 10293
+- CodeSearchNet Ruby: 2279
+- CodeSearchNet Go: 14291
+- CodeSearchNet Java: 26909
+- CodeSearchNet Javascript: 6483
+- CodeSearchNet PHP: 29391
+- StatCodeSearch: 1070
+- Combined test set: 109926
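+
+## Usage
+A minimal usage sketch, adapted from `mrr_demo.py` in this task directory (the toy scores are illustrative; in practice they come from a model scoring the test examples in order). The `n_distractors` value passed to `evaluate_predictions` must match the one used with `get_dataset_raw` when the ranking chunks were built:
+
+```python
+from genbench import load_task
+
+task = load_task("nl_codesearch_mrr:statcodesearch")
+
+# one ranking chunk = 1 true pair followed by n_distractors distractors, in dataset order
+n_distractors = 9
+predictions = [{"score": 1.0 / (i + 1)} for i in range(n_distractors + 1)]  # true pair scored highest
+
+print(task.evaluate_predictions(predictions, n_distractors))  # {'mean mrr': 1.0}
+```
+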
+## Limitations and Bias
+TBD
+
+## Citation
+TBD
+
+## Further References
+Husain, H., Wu, H. H., Gazit, T., Allamanis, M., & Brockschmidt, M. (2019). Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436.
+
+Lu, S., Guo, D., Ren, S., Huang, J., Svyatkovskiy, A., Blanco, A., Shujie, L. I. U. (2021, June). CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).
+
+Huang J., Tang D., Shou L., Gong M., Xu K., Jiang D., Zhou M., Duan N. (2021) CoSQA: 20,000+ web queries for code search and question answering. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/mrr_demo.py b/src/genbench/tasks/nl_codesearch_mrr/mrr_demo.py
new file mode 100644
index 0000000..6246a78
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/mrr_demo.py
@@ -0,0 +1,27 @@
+from genbench import load_task
+
+
+def main():
+ # One chunk of 10 scores (1 true pair followed by 9 distractors): decreasing scores rank the true pair first -> MRR 1.0
+ high_mrr_test_list = []
+ for i in range(1, 11):
+ high_mrr_test_list.append({"score": 1 / i})
+
+ # Increasing scores rank the true pair last -> MRR 0.1
+ low_mrr_test_list = []
+ for i in range(1, 11):
+ low_mrr_test_list.append({"score": 1 * i})
+
+ task = load_task("nl_codesearch_mrr:statcodesearch")
+
+ high_results = task.evaluate_predictions(high_mrr_test_list, 9)
+ print(high_results)
+
+ low_results = task.evaluate_predictions(low_mrr_test_list, 9)
+ print(low_results)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/genbench/tasks/nl_codesearch_mrr/requirements-usage-example.txt b/src/genbench/tasks/nl_codesearch_mrr/requirements-usage-example.txt
new file mode 100644
index 0000000..ffb4c93
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/requirements-usage-example.txt
@@ -0,0 +1,4 @@
+torch==2.1.0
+numpy==1.25.1
+tqdm==4.65.0
+transformers==4.32.0
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/__init__.py b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/config.jsonnet b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/config.jsonnet
new file mode 100644
index 0000000..0ffe3e7
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/config.jsonnet
@@ -0,0 +1,48 @@
+{
+ name: 'Natural Language Codesearch Ranking (statcodesearch)',
+
+ description: 'Natural Language Codesearch Ranking (statcodesearch) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual and domain generalization',
+
+ keywords: [
+ 'codesearch',
+ 'natural language query',
+ 'mean reciprocal rank',
+ 'r',
+ 'cross-lingual',
+ 'domain-shift'
+ ],
+
+ authors: [
+ 'Andor Diera',
+ 'Abdelhalim Dahou',
+ 'Lukas Galke',
+ 'Fabian Karl',
+ 'Florian Sihler',
+ 'Ansgar Scherp',
+ ],
+
+ data_source: {
+ type: 'manual',
+ test: 'https://zenodo.org/record/8310891/files/test_statcodesearch.jsonl',
+ train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+ },
+
+ has_validation_set: false,
+ has_train_set: true,
+
+ task_type: 'multiple_choice',
+
+ evaluation_metrics: [
+ {
+ hf_id: 'accuracy',
+ git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+ best_score: 1.0,
+ },
+ ],
+
+ preparation_strategies: {
+ finetuning: {
+ objective: 'maximum_likelihood',
+ },
+ },
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/doc.md b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/doc.md
new file mode 100644
index 0000000..0826a5c
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Ranking (statcodesearch)
+
+## Abstract
+*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Ranking (statcodesearch).*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Ranking (statcodesearch).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Ranking (statcodesearch).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Ranking (statcodesearch) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/task.py b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/task.py
new file mode 100644
index 0000000..2566044
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/statcodesearch/task.py
@@ -0,0 +1,108 @@
+import random
+from typing import Dict, List
+
+import datasets
+import numpy as np
+
+from genbench import Task
+
+
+def chunked(iterable, chunk_size):
+ """
+ Split an iterable into chunks of a specified size.
+
+ Args:
+ iterable: The iterable to be chunked.
+ chunk_size: The size of each chunk.
+
+ Returns:
+ A generator that yields chunks of the iterable.
+ """
+ if chunk_size <= 0:
+ raise ValueError("Chunk size must be greater than zero")
+
+ chunk = []
+ for item in iterable:
+ chunk.append(item)
+ if len(chunk) == chunk_size:
+ yield chunk
+ chunk = []
+
+ if chunk:
+ yield chunk
+
+
+class NlCodesearchMrrStatcodesearch(Task):
+ def get_dataset_raw(self, n_distractors) -> Dict[str, datasets.Dataset]:
+ """Create the dataset adding n distractor pair (original comment, random code snippet) for ranking.
+
+ Args:
+ n_distractors: the number of randomly sampled distractor code snippets for each ranking chunk
+
+ Returns:
+ A dictionary containing key-value pairs for the raw datasets.
+ The keys are strings representing the name of the dataset split
+ (e.g., "train", "validation", "test") and the values are
+ HuggingFace `datasets.Dataset` objects containing the original pair and the distractors for the test split.
+ The train split only contains the original dataset.
+ """
+ raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+ output: Dict[str, datasets.Dataset] = {}
+ random.seed(42)
+
+ for split, dataset in raw_datasets.items():
+ if split == "test":
+ # Convert dataset to list for easier manipulation
+ dataset_list = list(dataset)
+
+ new_data = []
+
+ for idx, item in enumerate(dataset_list):
+ new_data.append(item)
+
+ # Create other_items list once and then simply exclude the current item during sampling
+ other_items = dataset_list[:idx] + dataset_list[idx + 1 :]
+ random_items = random.sample(other_items, n_distractors)
+
+ input_parts = item["input"].split("[CODESPLIT]")
+
+ for random_item in random_items:
+ random_input_parts = random_item["input"].split("[CODESPLIT]")
+ new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
+ new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+ new_data.append(new_item)
+
+ # Convert list back to HuggingFace dataset
+ output[split] = datasets.Dataset.from_dict({k: [dic[k] for dic in new_data] for k in new_data[0]})
+ else:
+ output[split] = dataset
+
+ return output
+
+ def evaluate_predictions(self, predictions: List[Dict[str, float]], n_distractors) -> Dict[str, float]:
+ """Calculate the MRR score in chunks. One chunk consist of a true comment-code pair and n number of distractors
+ This function assumes that the predictions were made and passed onto this function unshuffled.
+ The test data is ordered with each true pair followed by n number of distractors
+ Args:
+ predictions: A list of dictionaries, where each dictionary contains the predicted values for an example.
+ The keys are strings and the values are floats (logit scores or similarity values).
+ n_distractors: Number of distractor comment-code pairs for each true pair.
+ Must be the same number as used in the get_dataset_raw function.
+
+ Returns:
+ A dictionary containing key-value pairs for the evaluation metric(s) computed on the predicted
+ values. The keys are strings representing the name of the evaluation metric and the values are
+ floating-point numbers.
+ """
+ ranks = []
+
+ batched_predictions = chunked(predictions, n_distractors + 1)
+
+ for batch in batched_predictions:
+ correct_score = batch[0]["score"]
+ scores = np.array([prediction["score"] for prediction in batch])
+ rank = np.sum(scores >= correct_score)
+ ranks.append(rank)
+ mean_mrr = np.mean(1.0 / np.array(ranks))
+
+ return {"mean mrr": mean_mrr}
diff --git a/src/genbench/tasks/nl_codesearch_mrr/usage_example.py b/src/genbench/tasks/nl_codesearch_mrr/usage_example.py
new file mode 100644
index 0000000..8bb1455
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_mrr/usage_example.py
@@ -0,0 +1,319 @@
+import argparse
+import json
+import logging
+import random
+
+import torch
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, PreTrainedModel, get_scheduler
+
+from genbench import load_task
+
+
+##########################################################
+# Data Loading Utils
+##########################################################
+class Dataset(torch.utils.data.Dataset):
+ def __init__(self, features):
+ self.features = features
+
+ def __getitem__(self, index):
+ return self.features[index]
+
+ def __len__(self):
+ return len(self.features)
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
+
+
+def _convert_examples_to_features(
+ comments,
+ codes,
+ labels,
+ max_seq_length,
+ tokenizer,
+ cls_token="[CLS]",
+ sep_token="[SEP]",
+ pad_token=0,
+ eos_token="",
+ sequence_a_segment_id=0,
+ sequence_b_segment_id=1,
+ cls_token_segment_id=1,
+ pad_token_segment_id=0,
+ mask_padding_with_zero=True,
+):
+ features = []
+ for ex_index, (comment, code, label) in enumerate(zip(comments, codes, labels)):
+ # As was done in CodeBERT
+ tokens_comment = tokenizer.tokenize(comment)[:50]
+ tokens_code = tokenizer.tokenize(code)
+
+ # update max_seq_length to account for [CLS], [SEP], [SEP] tokens (-3)
+ n_special_tokens = 3
+ if cls_token is None:
+ n_special_tokens -= 1
+ s_max_seq_length = max_seq_length - n_special_tokens
+ _truncate_seq_pair(tokens_comment, tokens_code, s_max_seq_length)
+
+ # change sep for eos if no sep_token
+ if sep_token is None:
+ sep_token = eos_token
+
+ # [SEP] inbetween and at the end
+ tokens = tokens_comment + [sep_token] + tokens_code + [sep_token]
+ # CLS at the beginning
+ if cls_token is not None:
+ tokens = [cls_token] + tokens
+
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # 1 for tokens, 0 for padding
+ input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+ # padding with 0 up to max_seq_length
+ padding_length = max_seq_length - len(input_ids)
+ input_ids = input_ids + ([pad_token] * padding_length)
+ input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+ # check
+ assert len(input_ids) == max_seq_length
+ assert len(input_mask) == max_seq_length
+
+ # convert to tensors
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
+ input_mask = torch.tensor(input_mask, dtype=torch.long)
+ label = torch.tensor(label, dtype=torch.long)
+
+ features.append({"input_ids": input_ids, "attention_mask": input_mask, "labels": label})
+ return features
+
+
+def load_data(tokenizer, batch_size, seq_len, train_file, is_train):
+ # create dataset
+ comments = []
+ codes = []
+ labels = []
+ skipped = 0
+ if is_train:
+ do_shuffle = True
+ else:
+ do_shuffle = False
+
+ is_sep_token_set = tokenizer.sep_token is not None
+ is_cls_token_set = tokenizer.cls_token is not None
+ is_pad_token_set = tokenizer.pad_token is not None
+ is_eos_token_set = tokenizer.eos_token is not None
+
+ for split, dataset in train_file.items():
+ if is_train and split == "test":
+ continue
+ if not is_train and split == "train":
+ continue
+ for sample in dataset:
+ try:
+ input = sample["input"]
+ # split at [CODESPLIT] token
+ input = input.split("[CODESPLIT]")
+ if len(input) != 2:
+ # skip samples that do not contain exactly one [CODESPLIT] token
+ logging.warning(f"Input contains more than one [CODESPLIT] token: {input}")
+ skipped += 1
+ continue
+ # skip every sample that contains special tokens
+ if is_sep_token_set and (tokenizer.sep_token in input[0] or tokenizer.sep_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_cls_token_set and (tokenizer.cls_token in input[0] or tokenizer.cls_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_pad_token_set and (tokenizer.pad_token in input[0] or tokenizer.pad_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ if is_eos_token_set and (tokenizer.eos_token in input[0] or tokenizer.eos_token in input[1]):
+ logging.warning(f"Input contains special tokens: {input}")
+ skipped += 1
+ continue
+ comments.append(input[0])
+ codes.append(input[1])
+ labels.append(sample["target"])
+
+ except json.JSONDecodeError as e:
+ print(f"Error: JSON decoding failed - {e}")
+ continue
+ logging.info(f"Skipped {skipped} samples due to special tokens")
+ # tokenize
+ features = _convert_examples_to_features(
+ comments,
+ codes,
+ labels,
+ max_seq_length=seq_len,
+ tokenizer=tokenizer,
+ cls_token=tokenizer.cls_token,
+ sep_token=tokenizer.sep_token,
+ cls_token_segment_id=tokenizer.cls_token_id,
+ pad_token_segment_id=tokenizer.pad_token_id,
+ eos_token=tokenizer.eos_token,
+ )
+
+ # Convert to Dataset
+ features = Dataset(features)
+
+ return DataLoader(features, batch_size=batch_size, shuffle=do_shuffle)
+
+
+##############################################################
+# Fine-tune Model
+##############################################################
+
+
+def train(model: PreTrainedModel, dataloader: DataLoader, args: argparse.Namespace):
+ """
+ Fine-tune the model.
+ :param model: the pretrained model to be fine-tuned
+ :param dataloader: an iterable data loader
+ :param args: training arguments (and also some other arguments)
+ :return: the fine-tuned model
+ """
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+ model.train()
+
+ num_training_steps = args.epochs * len(dataloader)
+ progress_bar = tqdm(range(num_training_steps))
+
+ optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
+ lr_scheduler = get_scheduler(
+ name="linear",
+ optimizer=optimizer,
+ num_warmup_steps=args.num_warmup_steps,
+ num_training_steps=num_training_steps,
+ )
+
+ for epoch in range(args.epochs):
+ for batch in dataloader:
+ batch = {k: v.to(device) for k, v in batch.items()}
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+
+
+###########################################################
+# Evaluate Model
+###########################################################
+
+
+def get_scores(model, dataloader):
+ random.seed(42)
+ # make predictions for all chunks
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print("Using device:", device)
+ model.to(device)
+ model.eval()
+
+ score_list = []
+ for batch in tqdm(dataloader):
+ batch = {k: v.to(device) for k, v in batch.items()}
+ with torch.no_grad():
+ outputs = model(**batch)
+ score_dict = dict.fromkeys(["score"])
+ score_dict["score"] = outputs.logits.cpu().numpy()
+ score_list.append(score_dict)
+
+ return score_list
+
+
+##############################################################
+# Run example
+##############################################################
+
+
+def main():
+ """Main function."""
+ # args
+ parser = argparse.ArgumentParser()
+ # parser.add_argument('--dataset', type=str, default='./codesearchnet_adv')
+ parser.add_argument("--model", default="roberta-base")
+ parser.add_argument("--epochs", type=int, default=5)
+ parser.add_argument("--batch_size", type=int, default=32)
+ parser.add_argument("--learning_rate", type=float, default=2e-5)
+ parser.add_argument("--weight_decay", type=float, default=0.01)
+ parser.add_argument("--num_warmup_steps", type=int, default=0)
+ parser.add_argument("--output_dir", type=str, default="models")
+ parser.add_argument("--seq_len", type=int, default=512, help="maximum sequence length")
+ parser.add_argument("--distractors", type=int, default=2, help="number of distractors per true pair")
+ parser.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO")
+
+ args = parser.parse_args()
+
+ TRAIN_FILE = load_task("nl_codesearch_mrr:codesearchnet_adv").get_dataset_raw(args.distractors)
+
+ # logging
+ logging.basicConfig(level=args.log_level)
+
+ # load tokenizer
+ logging.info("Loading model...")
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+ # load data
+ logging.info("Loading data...")
+ dataloader = load_data(tokenizer, args.batch_size, args.seq_len, TRAIN_FILE, True)
+
+ model = AutoModelForSequenceClassification.from_pretrained(args.model)
+
+ # train
+ logging.info("Training...")
+ train(model, dataloader, args)
+
+ # save model
+ logging.info("Saving model...")
+ model.save_pretrained(f"{args.output_dir}/{args.model}")
+ # also save the tokenizer
+ tokenizer.save_pretrained(f"{args.output_dir}/{args.model}")
+
+ TEST_TASKS = [
+ ["codesearchnetadv", load_task("nl_codesearch_mrr:codesearchnet_adv")],
+ ["codesearchnet_ruby", load_task("nl_codesearch_mrr:codesearchnet_ruby")],
+ ["codesearchnet_go", load_task("nl_codesearch_mrr:codesearchnet_go")],
+ ["codesearchnet_java", load_task("nl_codesearch_mrr:codesearchnet_java")],
+ ["codesearchnet_javascript", load_task("nl_codesearch_mrr:codesearchnet_javascript")],
+ ["codesearchnet_php", load_task("nl_codesearch_mrr:codesearchnet_php")],
+ ["cosqa", load_task("nl_codesearch_mrr:cosqa")],
+ ["statcodesearch", load_task("nl_codesearch_mrr:statcodesearch")],
+ ]
+
+ results = {}
+ for task in TEST_TASKS:
+ logging.info(f"Calculating Logits for MRR {task[0]}...")
+ dataloader = load_data(tokenizer, 1, args.seq_len, task[1].get_dataset_raw(args.distractors), False)
+ scores = get_scores(model, dataloader)
+ mrr_value = task[1].evaluate_predictions(scores, args.distractors)
+ logging.info(f"Test results for {task[0]}: {mrr_value}")
+ results[task[0]] = mrr_value
+
+ logging.info(f"Test results: {results}")
+
+
+if __name__ == "__main__":
+ main()