diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
index 57bf53a..4e77608 100644
--- a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
index 69eb7b5..9b880ec 100644
--- a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
index 80ff813..292e74c 100644
--- a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py
index 0aa2a35..5e201a4 100644
--- a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
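The same one-character change is applied to each of the language-specific task.py files in this patch. The surrounding code is not shown in these hunks, but random_item is presumably produced by a call such as random.sample(other_items, 1), which returns a one-element list rather than a single example, so the example dict has to be reached through index [0] before the "input" lookup. A minimal sketch of the difference, under that assumption:

    import random

    dataset = [{"input": "sort a list [CODESPLIT] def sort(xs): return sorted(xs)"}]
    random_item = random.sample(dataset, 1)  # a list containing one dict, not a dict
    # random_item["input"] raises TypeError: list indices must be integers or slices
    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")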
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py
index afa9bb3..1378ff0 100644
--- a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_php/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py
index 8941ffc..7f4db9b 100644
--- a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_ruby/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py b/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py
index 8685f27..7d1c292 100644
--- a/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/cosqa/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt b/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt
new file mode 100644
index 0000000..0a35dc8
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/requirements-usage-example.txt
@@ -0,0 +1,5 @@
+torch v. 2.1.0
+numpy v. 1.25.1
+tqdm v. 4.65.0
+transformers v. 4.32.0
+scikit-learn v. 1.3.0
\ No newline at end of file
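A note on requirements-usage-example.txt: the "package v. X.Y.Z" lines appear to document the versions the usage example was written against, but they are not in pip's requirements syntax; to install the same versions directly, the equivalent pins would be torch==2.1.0, numpy==1.25.1, tqdm==4.65.0, transformers==4.32.0 and scikit-learn==1.3.0.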
diff --git a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py
index b0af739..5134760 100644
--- a/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py
+++ b/src/genbench/tasks/nl_codesearch_clf/statcodesearch/task.py
@@ -34,7 +34,7 @@ def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
                     # Split input into comment and code
                     input_parts = item["input"].split("[CODESPLIT]")
                     # Split random input into comment and code
-                    random_input_parts = random_item["input"].split("[CODESPLIT]")
+                    random_input_parts = random_item[0]["input"].split("[CODESPLIT]")
                     # Combine the "input" fields of the original and random items
                     new_input = input_parts[0] + "[CODESPLIT]" + random_input_parts[1]
                     new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
diff --git a/src/genbench/tasks/nl_codesearch_clf/usage_example.py b/src/genbench/tasks/nl_codesearch_clf/usage_example.py
new file mode 100644
index 0000000..4683d98
--- /dev/null
+++ b/src/genbench/tasks/nl_codesearch_clf/usage_example.py
@@ -0,0 +1,400 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+
+import torch
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, PreTrainedModel, get_scheduler
+
+from genbench import TaskConfig
+from genbench.tasks.nl_codesearch_clf.codesearchnet_adv.task import NlCodesearchClfCodesearchnetAdv
+from genbench.tasks.nl_codesearch_clf.codesearchnet_go.task import NlCodesearchClfCodesearchnetGo
+from genbench.tasks.nl_codesearch_clf.codesearchnet_java.task import NlCodesearchClfCodesearchnetJava
+from genbench.tasks.nl_codesearch_clf.codesearchnet_javascript.task import NlCodesearchClfCodesearchnetJavascript
+from genbench.tasks.nl_codesearch_clf.codesearchnet_php.task import NlCodesearchClfCodesearchnetPhp
+from genbench.tasks.nl_codesearch_clf.codesearchnet_ruby.task import NlCodesearchClfCodesearchnetRuby
+from genbench.tasks.nl_codesearch_clf.cosqa.task import NlCodesearchClfCosqa
+from genbench.tasks.nl_codesearch_clf.statcodesearch.task import NlCodesearchClfStatcodesearch
+
+
+##########################################################
+# Data Loading Utils
+##########################################################
+class Dataset(torch.utils.data.Dataset):
+    def __init__(self, features):
+        self.features = features
+
+    def __getitem__(self, index):
+        return self.features[index]
+
+    def __len__(self):
+        return len(self.features)
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def _convert_examples_to_features(
+    comments,
+    codes,
+    labels,
+    max_seq_length,
+    tokenizer,
+    cls_token="[CLS]",
+    sep_token="[SEP]",
+    pad_token=0,
+    eos_token="",
+    sequence_a_segment_id=0,
+    sequence_b_segment_id=1,
+    cls_token_segment_id=1,
+    pad_token_segment_id=0,
+    mask_padding_with_zero=True,
+):
+    features = []
+    for ex_index, (comment, code, label) in enumerate(zip(comments, codes, labels)):
+        # As was done in CodeBERT
+        tokens_comment = tokenizer.tokenize(comment)[:50]
+        tokens_code = tokenizer.tokenize(code)
+
+        # update max_seq_length to account for [CLS], [SEP], [SEP] tokens (-3)
+        n_special_tokens = 3
+        if cls_token is None:
+            n_special_tokens -= 1
+        s_max_seq_length = max_seq_length - n_special_tokens
+        _truncate_seq_pair(tokens_comment, tokens_code, s_max_seq_length)
+
+        # change sep for eos if no sep_token
+        if sep_token is None:
+            sep_token = eos_token
+
+        # [SEP] in between and at the end
+        tokens = tokens_comment + [sep_token] + tokens_code + [sep_token]
+        # CLS at the beginning
+        if cls_token is not None:
+            tokens = [cls_token] + tokens
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # 1 for tokens, 0 for padding
+        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+        # padding with 0 up to max_seq_length
+        padding_length = max_seq_length - len(input_ids)
+        input_ids = input_ids + ([pad_token] * padding_length)
+        input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+        # check
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+
+        # convert to tensors
+        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        input_mask = torch.tensor(input_mask, dtype=torch.long)
+        label = torch.tensor(label, dtype=torch.long)
+
+        features.append({"input_ids": input_ids, "attention_mask": input_mask, "labels": label})
+    return features
+
+
+def load_data(tokenizer, batch_size, seq_len, train_file, is_train):
+    # create dataset
+    comments = []
+    codes = []
+    labels = []
+    skipped = 0
+
+    is_sep_token_set = tokenizer.sep_token is not None
+    is_cls_token_set = tokenizer.cls_token is not None
+    is_pad_token_set = tokenizer.pad_token is not None
+    is_eos_token_set = tokenizer.eos_token is not None
+
+    for split, dataset in train_file.items():
+        if is_train and split == "test":
+            continue
+        if not is_train and split == "train":
+            continue
+        for sample in dataset:
+            try:
+                input = sample["input"]
+                # split at [CODESPLIT] token
+                input = input.split("[CODESPLIT]")
+                if len(input) != 2:
+                    # skip cases with more than one [SEP] token
+                    logging.warning(f"Input contains more than one [CODESPLIT] token: {input}")
+                    skipped += 1
+                    continue
+                # skip every sample that contains special tokens
+                if is_sep_token_set and (tokenizer.sep_token in input[0] or tokenizer.sep_token in input[1]):
+                    logging.warning(f"Input contains special tokens: {input}")
+                    skipped += 1
+                    continue
+                if is_cls_token_set and (tokenizer.cls_token in input[0] or tokenizer.cls_token in input[1]):
+                    logging.warning(f"Input contains special tokens: {input}")
+                    skipped += 1
+                    continue
+                if is_pad_token_set and (tokenizer.pad_token in input[0] or tokenizer.pad_token in input[1]):
+                    logging.warning(f"Input contains special tokens: {input}")
+                    skipped += 1
+                    continue
+                if is_eos_token_set and (tokenizer.eos_token in input[0] or tokenizer.eos_token in input[1]):
+                    logging.warning(f"Input contains special tokens: {input}")
+                    skipped += 1
+                    continue
+                comments.append(input[0])
+                codes.append(input[1])
+                labels.append(sample["target"])
+            except json.JSONDecodeError as e:
+                print(f"Error: JSON decoding failed - {e}")
+                continue
+    logging.info(f"Skipped {skipped} samples due to special tokens")
+    print("siker")
+    # tokenize
+    features = _convert_examples_to_features(
+        comments,
+        codes,
+        labels,
+        max_seq_length=seq_len,
+        tokenizer=tokenizer,
+        cls_token=tokenizer.cls_token,
+        sep_token=tokenizer.sep_token,
+        cls_token_segment_id=tokenizer.cls_token_id,
+        pad_token_segment_id=tokenizer.pad_token_id,
+        eos_token=tokenizer.eos_token,
+    )
+    # Convert to Dataset
+    features = Dataset(features)
+
+    return DataLoader(features, batch_size=batch_size, shuffle=True)
+
+
+##############################################################
+# Fine-tune Model
+##############################################################
+
+
+def train(model: PreTrainedModel, dataloader: DataLoader, args: argparse.Namespace):
+    """
+    Fine-tune the model.
+    :param model: the pretrained model to be fine-tuned
+    :param dataloader: an iterable data loader
+    :param args: training arguments (and also some other arguments)
+    :return: the fine-tuned model
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.train()
+
+    num_training_steps = args.epochs * len(dataloader)
+    progress_bar = tqdm(range(num_training_steps))
+
+    optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
+    lr_scheduler = get_scheduler(
+        name="linear",
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=num_training_steps,
+    )
+
+    for epoch in range(args.epochs):
+        for batch in dataloader:
+            batch = {k: v.to(device) for k, v in batch.items()}
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss.backward()
+
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+            progress_bar.update(1)
+
+
+###########################################################
+# Evaluate Model
+###########################################################
+
+
+def clf(model, dataloader, args):
+    """Predict on test set."""
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+    predictions = []
+    labels = []
+    logging.info("Evaluating...")
+    for batch in tqdm(dataloader):
+        batch = {k: v.to(device) for k, v in batch.items()}
+        with torch.no_grad():
+            outputs = model(**batch)
+        predictions.extend(outputs.logits.argmax(-1).cpu().numpy().tolist())
+        labels.extend(batch["labels"].cpu().numpy().tolist())
+
+    metrics = {}
+    # calc metrics
+
+    # calc accuracy
+    accuracy = accuracy_score(labels, predictions)
+    metrics["accuracy"] = accuracy
+
+    # calc precision
+    precision = precision_score(labels, predictions)
+    metrics["precision"] = precision
+
+    # calc recall
+    recall = recall_score(labels, predictions)
+    metrics["recall"] = recall
+
+    # calc f1
+    f1 = f1_score(labels, predictions)
+    metrics["f1"] = f1
+
+    return metrics
+
+
+##############################################################
+# Run example
+##############################################################
+
+
+def main():
+    """Main function."""
+    # args
+    parser = argparse.ArgumentParser()
+    # parser.add_argument('--dataset', type=str, default='./codesearchnet_adv')
+    parser.add_argument("--model", default="roberta-base")
+    parser.add_argument("--epochs", type=int, default=5)
+    parser.add_argument("--batch_size", type=int, default=32)
+    parser.add_argument("--learning_rate", type=float, default=2e-5)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--num_warmup_steps", type=int, default=0)
+    parser.add_argument("--output_dir", type=str, default="models")
+    parser.add_argument("--seq_len", type=int, default=512, help="maximum sequence length")
+    # parser.add_argument("--distractors", type=int, default=99, help="number of distractors per true pair")
+    parser.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO")
+
+    args = parser.parse_args()
+
"./codesearchnet_adv/train_adv_clf.jsonl" + + TRAIN_FILE = NlCodesearchClfCodesearchnetAdv( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_adv/config.jsonnet")), + "nl_codesearch", + "codesearchnet_adv", + ).get_dataset_raw() + + # logging + logging.basicConfig(level=args.log_level) + + # load tokenizer + logging.info("Loading model...") + tokenizer = AutoTokenizer.from_pretrained(args.model) + + # load data + logging.info("Loading data...") + dataloader = load_data(tokenizer, args.batch_size, args.seq_len, TRAIN_FILE, True) + + model = AutoModelForSequenceClassification.from_pretrained(args.model) + + # train + logging.info("Training...") + train(model, dataloader, args) + + # save model + logging.info("Saving model...") + model.save_pretrained(f"{args.output_dir}/{args.model}") + # also soave tokenizer + tokenizer.save_pretrained(f"{args.output_dir}/{args.model}") + + TEST_FILES = [ + [ + "codesearchnetadv", + NlCodesearchClfCodesearchnetAdv( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_adv/config.jsonnet")), + "nl_codesearch", + "codesearchnet_adv", + ).get_dataset_raw(), + ], + [ + "codesearchnet_ruby", + NlCodesearchClfCodesearchnetRuby( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_ruby/config.jsonnet")), + "nl_codesearch", + "codesearchnet_ruby", + ).get_dataset_raw(), + ], + [ + "codesearchnet_go", + NlCodesearchClfCodesearchnetGo( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_go/config.jsonnet")), + "nl_codesearch", + "codesearchnet_go", + ).get_dataset_raw(), + ], + [ + "codesearchnet_java", + NlCodesearchClfCodesearchnetJava( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_java/config.jsonnet")), + "nl_codesearch", + "codesearchnet_java", + ).get_dataset_raw(), + ], + [ + "codesearchnet_javascript", + NlCodesearchClfCodesearchnetJavascript( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_javascript/config.jsonnet")), + "nl_codesearch", + "codesearchnet_javascript", + ).get_dataset_raw(), + ], + [ + "codesearchnet_php", + NlCodesearchClfCodesearchnetPhp( + TaskConfig.from_jsonnet(jsonnet_path=Path("./codesearchnet_php/config.jsonnet")), + "nl_codesearch", + "codesearchnet_php", + ).get_dataset_raw(), + ], + [ + "cosqa", + NlCodesearchClfCosqa( + TaskConfig.from_jsonnet(jsonnet_path=Path("./cosqa/config.jsonnet")), "nl_codesearch", "cosqa" + ).get_dataset_raw(), + ], + [ + "statcodesearch", + NlCodesearchClfStatcodesearch( + TaskConfig.from_jsonnet(jsonnet_path=Path("./statcodesearch/config.jsonnet")), + "nl_codesearch", + "statcodesearch", + ).get_dataset_raw(), + ], + ] + + results = {} + for file in TEST_FILES: + logging.info(f"Evaluating on {file[0]}...") + dataloader = load_data(tokenizer, args.batch_size, args.seq_len, file[1], False) + metrics = clf(model, dataloader, args) + results[file[0]] = metrics + logging.info(f"Test results for {file[0]}: {metrics}") + + logging.info(f"Test results: {results}") + + +if __name__ == "__main__": + main()