From 21eadea232f957bdfd3ae28ffae0a0191ffe5bc8 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 6 Jan 2020 19:22:23 +0000 Subject: [PATCH 01/24] moved the order of moving to device and creating the optimizer --- utils_nlp/models/transformers/common.py | 46 ++++++++++++++----------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 42aedebfb..1dc66625c 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -102,9 +102,28 @@ def fine_tune( verbose=True, seed=None, ): - + # get device device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1) + # unwrap model + if isinstance(self.model, torch.nn.DataParallel): + self.model = self.model.module + + # wrap in DataParallel or DistributedDataParallel + if local_rank != -1: + self.model = torch.nn.parallel.DistributedDataParallel( + self.model, + device_ids=[local_rank], + output_device=local_rank, + find_unused_parameters=True, + ) + else: + if num_gpus > 1: + self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) + + # move to device + self.model.to(device) + if seed is not None: Transformer.set_seed(seed, num_gpus > 0) @@ -116,6 +135,7 @@ def fine_tune( else: t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs + # set optimizer if optimizer is None: no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ @@ -138,6 +158,7 @@ def fine_tune( ] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) + # set scheduler if scheduler is None: scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total @@ -150,30 +171,16 @@ def fine_tune( raise ImportError("Please install apex from https://www.github.com/nvidia/apex") self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level) - if local_rank != -1: - self.model = torch.nn.parallel.DistributedDataParallel( - self.model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True, - ) - else: - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - self.model.to(device) - self.model.train() - + # init training global_step = 0 tr_loss = 0.0 + self.model.train() self.model.zero_grad() train_iterator = trange( int(num_train_epochs), desc="Epoch", disable=local_rank not in [-1, 0] or not verbose ) + # train for _ in train_iterator: epoch_iterator = tqdm( train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose @@ -214,9 +221,6 @@ def fine_tune( train_iterator.close() break - # empty cache - del [batch] - torch.cuda.empty_cache() return global_step, tr_loss / global_step def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True): From 9ab859958754ab8fc97a3c929fe28bce8905c995 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 7 Jan 2020 20:13:24 +0000 Subject: [PATCH 02/24] added move_model_to_device to pytorch_utils --- utils_nlp/common/pytorch_utils.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index ea09f8768..0410775ac 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -32,6 +32,42 @@ def get_device( return device, num_gpus +def 
move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank=-1): + """Moves a model to the specified device (cpu or gpu/s) + and implements data parallelism when multiple gpus are specified. + + Args: + model (Module): A PyTorch model. + device (torch.device): A PyTorch device. + num_gpus (int): The number of GPUs to be used. + If set to None, all available GPUs will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + local_rank (int): Local GPU ID within a node. Used in distributed environments. + Defaults to -1. + """ + # unwrap model + if isinstance(model, torch.nn.DataParallel): + model = model.module + # wrap in DataParallel or DistributedDataParallel + if local_rank != -1: + self.model = torch.nn.parallel.DistributedDataParallel( + self.model, + device_ids=[local_rank], + output_device=local_rank, + find_unused_parameters=True, + ) + else: + if num_gpus > 1: + if gpu_ids is None: + gpu_ids = list(range(num_gpus)) + model = torch.nn.DataParallel(model, device_ids=gpu_ids) + # move to device + model.to(device) + + def move_to_device(model, device, num_gpus=None): """Moves a model to the specified device (cpu or gpu/s) and implements data parallelism when multiple gpus are specified. From ab4b496558ed53b321bdcbf74aa8a106cdd3cd78 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 7 Jan 2020 20:17:38 +0000 Subject: [PATCH 03/24] moved optim and scheduler init out of fine_tune --- utils_nlp/models/transformers/common.py | 118 +++++++++--------------- 1 file changed, 43 insertions(+), 75 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 1dc66625c..ccaadbf6d 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -22,7 +22,7 @@ from transformers.tokenization_distilbert import DistilBertTokenizer from transformers.tokenization_roberta import RobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer -from utils_nlp.common.pytorch_utils import get_device + TOKENIZER_CLASS = {} TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -81,6 +81,47 @@ def set_seed(seed, cuda=True): if cuda and torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) + @staticmethod + def get_default_optimizer(model, learning_rate, adam_epsilon): + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) + ], + "weight_decay": weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) + return optimizer + + @staticmethod + def get_default_scheduler( + optimizer, warmup_steps, data_loader, max_steps, num_epochs, gradient_accumulation_steps + ): + try: + dataset_length = len(data_loader) + except Exception: + dataset_length = -1 + + if max_steps <= 0: + if dataset_length != -1 and num_epochs > 0: + max_steps = dataset_length // gradient_accumulation_steps * num_epochs + + if max_steps <= 0: + raise Exception("Max steps cannot be determined.") + + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps + ) + return scheduler + def fine_tune( self, train_dataloader, @@ -89,81 +130,18 @@ def 
fine_tune( num_train_epochs=1, max_grad_norm=1.0, gradient_accumulation_steps=1, - n_gpu=1, optimizer=None, scheduler=None, - weight_decay=0.0, - learning_rate=5e-5, - adam_epsilon=1e-8, - warmup_steps=0, fp16=False, fp16_opt_level="O1", local_rank=-1, verbose=True, seed=None, ): - # get device - device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1) - - # unwrap model - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - # wrap in DataParallel or DistributedDataParallel - if local_rank != -1: - self.model = torch.nn.parallel.DistributedDataParallel( - self.model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True, - ) - else: - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - # move to device - self.model.to(device) if seed is not None: Transformer.set_seed(seed, num_gpus > 0) - if max_steps > 0: - t_total = max_steps - num_train_epochs = ( - max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1 - ) - else: - t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs - - # set optimizer - if optimizer is None: - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in self.model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": weight_decay, - }, - { - "params": [ - p - for n, p in self.model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) - - # set scheduler - if scheduler is None: - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total - ) - if fp16: try: from apex import amp @@ -223,18 +201,8 @@ def fine_tune( return global_step, tr_loss / global_step - def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True): - device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1) - - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - self.model.to(device) + def predict(self, eval_dataloader, get_inputs, verbose=True): self.model.eval() - for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): From 8242bcf767aa7f1d2357c8d5ccbdeb5806b0bb84 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Wed, 8 Jan 2020 06:37:42 +0000 Subject: [PATCH 04/24] modified structure of transformer wrapper --- ...st_transformers_sequence_classification.py | 20 +++-- utils_nlp/common/pytorch_utils.py | 6 +- utils_nlp/models/transformers/common.py | 6 +- .../transformers/sequence_classification.py | 73 ++++++++++++++----- 4 files changed, 71 insertions(+), 34 deletions(-) diff --git a/tests/unit/test_transformers_sequence_classification.py b/tests/unit/test_transformers_sequence_classification.py index 156854200..c402d106e 100644 --- a/tests/unit/test_transformers_sequence_classification.py +++ b/tests/unit/test_transformers_sequence_classification.py @@ -19,12 +19,11 @@ def test_classifier(data, tmpdir): num_labels = len(pd.unique(data[1])) model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) - train_dataloader = processor.create_dataloader_from_df( - df, "text", "label", batch_size=2, num_gpus=0 - ) + ds 
= processor.dataset_from_dataframe(df, "text", "label") + dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) - classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=0, verbose=False) - preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False) + classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False) + preds = classifier.predict(dl, num_gpus=0, verbose=False) assert len(preds) == len(data[1]) @@ -35,17 +34,16 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir): num_labels = len(pd.unique(data[1])) model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) - train_dataloader = processor.create_dataloader_from_df( - df, "text", "label", batch_size=2, num_gpus=1 - ) + ds = processor.dataset_from_dataframe(df, "text", "label") + dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) - classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=1, verbose=False) + classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False) assert next(classifier.model.parameters()).is_cuda is True # gpu prediction, no model move - preds = classifier.predict(train_dataloader, num_gpus=1, verbose=False) + preds = classifier.predict(dl, num_gpus=1, verbose=False) assert len(preds) == len(data[1]) # cpu prediction, need model move assert next(classifier.model.parameters()).is_cuda is True - preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False) + preds = classifier.predict(dl, num_gpus=0, verbose=False) assert next(classifier.model.parameters()).is_cuda is False diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 0410775ac..fee66269e 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -47,6 +47,10 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= Defaults to None. local_rank (int): Local GPU ID within a node. Used in distributed environments. Defaults to -1. + + Returns: + Module, DataParallel, DistributedDataParallel: A PyTorch Module or + a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used). 
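+
+    Example:
+        A minimal sketch of the intended usage, assuming a toy model and at most one
+        available GPU (names and values are illustrative):
+
+        >>> import torch
+        >>> model = torch.nn.Linear(8, 2)
+        >>> device, num_gpus = get_device(num_gpus=1)
+        >>> model = move_model_to_device(model, device, num_gpus=num_gpus)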
""" # unwrap model if isinstance(model, torch.nn.DataParallel): @@ -65,7 +69,7 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= gpu_ids = list(range(num_gpus)) model = torch.nn.DataParallel(model, device_ids=gpu_ids) # move to device - model.to(device) + return model.to(device) def move_to_device(model, device, num_gpus=None): diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index ccaadbf6d..d5f4d5588 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -82,7 +82,7 @@ def set_seed(seed, cuda=True): torch.cuda.manual_seed_all(seed) @staticmethod - def get_default_optimizer(model, learning_rate, adam_epsilon): + def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { @@ -125,6 +125,8 @@ def get_default_scheduler( def fine_tune( self, train_dataloader, + device, + num_gpus, get_inputs, max_steps=-1, num_train_epochs=1, @@ -201,7 +203,7 @@ def fine_tune( return global_step, tr_loss / global_step - def predict(self, eval_dataloader, get_inputs, verbose=True): + def predict(self, eval_dataloader, device, get_inputs, verbose=True): self.model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): batch = tuple(t.to(device) for t in batch) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 93668471b..b245383db 100644 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -21,6 +21,7 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification, ) +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet @@ -188,20 +189,11 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): return input_ids, attention_mask, token_type_ids - def create_dataloader_from_df( - self, - df, - text_col, - label_col=None, - text2_col=None, - shuffle=False, - max_len=MAX_SEQ_LEN, - batch_size=32, - num_gpus=None, - distributed=False, + def dataset_from_dataframe( + self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN ): if text2_col is None: - ds = SCDataSet( + return SCDataSet( df, text_col, label_col, @@ -210,7 +202,7 @@ def create_dataloader_from_df( max_len=max_len, ) else: - ds = SPCDataSet( + return SPCDataSet( df, text_col, text2_col, @@ -220,6 +212,9 @@ def create_dataloader_from_df( max_len=max_len, ) + def dataloader_from_dataset( + self, ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False + ): if num_gpus is None: num_gpus = torch.cuda.device_count() @@ -250,7 +245,10 @@ def fit( self, train_dataloader, num_epochs=1, + max_steps=-1, + gradient_accumulation_steps=1, num_gpus=None, + gpu_ids=None, local_rank=-1, weight_decay=0.0, learning_rate=5e-5, @@ -265,9 +263,16 @@ def fit( Args: train_dataloader (Dataloader): Dataloader for the training data. num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. Overrides num_epochs. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. 
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed training. weight_decay (float, optional): Weight decay to apply after each parameter update. @@ -281,20 +286,40 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. """ + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + + # init optimizer and scheduler + optimizer = Transformer.get_default_optimizer( + self.model, weight_decay, learning_rate, adam_epsilon + ) + scheduler = Transformer.get_default_scheduler( + optimizer, + warmup_steps, + train_dataloader, + max_steps, + num_epochs, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + super().fine_tune( train_dataloader=train_dataloader, + device=device, + num_gpus=num_gpus, get_inputs=Processor.get_inputs, - n_gpu=num_gpus, + max_steps=max_steps, num_train_epochs=num_epochs, - weight_decay=weight_decay, - learning_rate=learning_rate, - adam_epsilon=adam_epsilon, - warmup_steps=warmup_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + optimizer=optimizer, + scheduler=scheduler, + local_rank=local_rank, verbose=verbose, seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, verbose=True): + def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. @@ -303,17 +328,25 @@ def predict(self, eval_dataloader, num_gpus=None, verbose=True): num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. verbose (bool, optional): Whether to print out the training log. Defaults to True. Returns 1darray: numpy array of predicted label indices. 
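+
+        Example:
+            A minimal sketch, assuming a fine-tuned classifier, an existing
+            eval_dataloader, and (optionally) a fitted sklearn LabelEncoder;
+            names are illustrative:
+
+            >>> preds = classifier.predict(eval_dataloader, num_gpus=1, verbose=False)
+            >>> predicted_labels = label_encoder.inverse_transform(preds)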
""" + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) + preds = list( super().predict( eval_dataloader=eval_dataloader, + device=device, get_inputs=Processor.get_inputs, - n_gpu=num_gpus, verbose=verbose, ) ) From 9811b8abfb12b2dd4ee1d1e8621a72a02fc3e85b Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 13 Jan 2020 17:45:00 +0000 Subject: [PATCH 05/24] restructuring of common transformers utils --- ...st_transformers_sequence_classification.py | 5 +- utils_nlp/common/pytorch_utils.py | 105 ++++++++---------- utils_nlp/models/transformers/common.py | 91 ++++++--------- .../transformers/sequence_classification.py | 98 ++++++---------- 4 files changed, 125 insertions(+), 174 deletions(-) mode change 100644 => 100755 tests/unit/test_transformers_sequence_classification.py mode change 100644 => 100755 utils_nlp/models/transformers/common.py mode change 100644 => 100755 utils_nlp/models/transformers/sequence_classification.py diff --git a/tests/unit/test_transformers_sequence_classification.py b/tests/unit/test_transformers_sequence_classification.py old mode 100644 new mode 100755 index c402d106e..3ffb1f8b9 --- a/tests/unit/test_transformers_sequence_classification.py +++ b/tests/unit/test_transformers_sequence_classification.py @@ -5,6 +5,7 @@ import pandas as pd from utils_nlp.models.transformers.sequence_classification import SequenceClassifier, Processor +from utils_nlp.common.pytorch_utils import dataloader_from_dataset @pytest.fixture() @@ -20,7 +21,7 @@ def test_classifier(data, tmpdir): model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) ds = processor.dataset_from_dataframe(df, "text", "label") - dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) + dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False) preds = classifier.predict(dl, num_gpus=0, verbose=False) @@ -35,7 +36,7 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir): model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) ds = processor.dataset_from_dataframe(df, "text", "label") - dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) + dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False) diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index fee66269e..89f98ab2a 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -3,9 +3,12 @@ """Common PyTorch utilities that facilitate building Pytorch models.""" +import warnings + import torch import torch.nn as nn -import warnings +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler def get_device( @@ -17,11 +20,7 @@ def get_device( # init_method="file:///distributed", ): if local_rank == -1: - num_gpus = ( - min(num_gpus, torch.cuda.device_count()) - if num_gpus is not None - else torch.cuda.device_count() - ) + num_gpus = min(num_gpus, torch.cuda.device_count()) if num_gpus 
is not None else torch.cuda.device_count() device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu") else: torch.cuda.set_device(local_rank) @@ -58,10 +57,7 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= # wrap in DataParallel or DistributedDataParallel if local_rank != -1: self.model = torch.nn.parallel.DistributedDataParallel( - self.model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True, + self.model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, ) else: if num_gpus > 1: @@ -72,59 +68,56 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= return model.to(device) -def move_to_device(model, device, num_gpus=None): - """Moves a model to the specified device (cpu or gpu/s) - and implements data parallelism when multiple gpus are specified. +def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False): + """Creates a PyTorch DataLoader given a Dataset object. Args: - model (Module): A PyTorch model - device (torch.device): A PyTorch device - num_gpus (int): The number of GPUs to be used. Defaults to None, - all gpus are used. + ds (torch.utils.data.DataSet): A PyTorch dataset. + batch_size (int, optional): Batch size. Defaults to 32. + num_gpus (int, optional): The number of GPUs to be used. Defaults to None. + shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False. + distributed (book, optional): If True, a DistributedSampler is used. Defaults to False. Returns: Module, DataParallel: A PyTorch Module or a DataParallel wrapper (when multiple gpus are used). """ - if isinstance(model, nn.DataParallel): - model = model.module + if num_gpus is None: + num_gpus = torch.cuda.device_count() - if not isinstance(device, torch.device): - raise ValueError("device must be of type torch.device.") - - if device.type == "cuda": - model.to(device) # inplace - if num_gpus == 0: - raise ValueError("num_gpus must be non-zero when device.type is 'cuda'") - elif num_gpus == 1: - return model - else: - # parallelize - num_cuda_devices = torch.cuda.device_count() - if num_cuda_devices < 1: - raise Exception("CUDA devices are not available.") - elif num_cuda_devices < 2: - print("Warning: Only 1 CUDA device is available. Data parallelism is not possible.") - return model - else: - if num_gpus is None: - # use all available devices - return nn.DataParallel(model, device_ids=None) - elif num_gpus > num_cuda_devices: - print( - "Warning: Only {0} devices are available. " - "Setting the number of gpus to {0}".format(num_cuda_devices) - ) - return nn.DataParallel(model, device_ids=None) - else: - return nn.DataParallel(model, device_ids=list(range(num_gpus))) - elif device.type == "cpu": - if num_gpus != 0 and num_gpus is not None: - warnings.warn("Device type is 'cpu'. num_gpus is ignored.") - return model.to(device) + batch_size = batch_size * max(1, num_gpus) + if distributed: + sampler = DistributedSampler(ds) else: - raise Exception( - "Device type '{}' not supported. Currently, only cpu " - "and cuda devices are supported.".format(device.type) - ) + sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds) + + return DataLoader(ds, sampler=sampler, batch_size=batch_size) + +def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1): + """Computes the max training steps given a dataloader. 
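+
+    For example (illustrative numbers): a dataloader with 100 batches, num_epochs=2,
+    and gradient_accumulation_steps=1 yields 100 // 1 * 2 = 200 training steps.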
+ + Args: + dataloader (Dataloader): A PyTorch DataLoader. + num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. + + Returns: + int: The max number of steps to be used in a training loop. + """ + try: + dataset_length = len(dataloader) + except Exception: + dataset_length = -1 + if max_steps <= 0: + if dataset_length != -1 and num_epochs > 0: + max_steps = dataset_length // gradient_accumulation_steps * num_epochs + if max_steps <= 0: + raise Exception("Max steps cannot be determined.") + return max_steps \ No newline at end of file diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py old mode 100644 new mode 100755 index d5f4d5588..ccaf48b46 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -7,13 +7,13 @@ import logging import os import random +import time +from itertools import cycle import numpy as np import torch from tqdm import tqdm, trange -from transformers import AdamW -from transformers import get_linear_schedule_with_warmup - +from transformers import AdamW, get_linear_schedule_with_warmup from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -23,7 +23,6 @@ from transformers.tokenization_roberta import RobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer - TOKENIZER_CLASS = {} TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) TOKENIZER_CLASS.update({k: RobertaTokenizer for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -37,12 +36,7 @@ class Transformer: def __init__( - self, - model_class, - model_name="bert-base-cased", - num_labels=2, - cache_dir=".", - load_model_from_dir=None, + self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None, ): if model_name not in self.list_supported_models(): @@ -86,15 +80,11 @@ def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [ - p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) - ], + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": weight_decay, }, { - "params": [ - p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) - ], + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] @@ -102,23 +92,9 @@ def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon): return optimizer @staticmethod - def get_default_scheduler( - optimizer, warmup_steps, data_loader, max_steps, num_epochs, gradient_accumulation_steps - ): - try: - dataset_length = len(data_loader) - except Exception: - dataset_length = -1 - - if max_steps <= 0: - if dataset_length != -1 and num_epochs > 0: - max_steps = dataset_length // gradient_accumulation_steps * num_epochs - - if max_steps <= 0: - raise Exception("Max steps cannot be 
determined.") - + def get_default_scheduler(optimizer, warmup_steps, num_training_steps): scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps + optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps ) return scheduler @@ -129,7 +105,6 @@ def fine_tune( num_gpus, get_inputs, max_steps=-1, - num_train_epochs=1, max_grad_norm=1.0, gradient_accumulation_steps=1, optimizer=None, @@ -139,6 +114,8 @@ def fine_tune( local_rank=-1, verbose=True, seed=None, + report_every=10, + clip_grad_norm=True, ): if seed is not None: @@ -154,20 +131,16 @@ def fine_tune( # init training global_step = 0 tr_loss = 0.0 + accum_loss = 0 self.model.train() self.model.zero_grad() - train_iterator = trange( - int(num_train_epochs), desc="Epoch", disable=local_rank not in [-1, 0] or not verbose - ) # train - for _ in train_iterator: - epoch_iterator = tqdm( - train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose - ) + start = time.time() + while global_step < max_steps: + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose) for step, batch in enumerate(epoch_iterator): - batch = tuple(t.to(device) for t in batch) - inputs = get_inputs(batch, self.model_name) + inputs = get_inputs(batch, device, self.model_name) outputs = self.model(**inputs) loss = outputs[0] @@ -176,39 +149,47 @@ def fine_tune( if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps - if step % 10 == 0 and verbose: - tqdm.write("Loss:{:.6f}".format(loss)) - if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) + if clip_grad_norm: + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) else: loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) + if clip_grad_norm: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) tr_loss += loss.item() + + accum_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: + global_step += 1 + if global_step % report_every == 0 and verbose: + end = time.time() + print( + "loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format( + accum_loss / report_every, end - start, len(batch), global_step, max_steps, + ) + ) + accum_loss = 0 + start = end + optimizer.step() - scheduler.step() + if scheduler: + scheduler.step() self.model.zero_grad() - global_step += 1 - if max_steps > 0 and global_step > max_steps: + if global_step > max_steps: epoch_iterator.close() break - if max_steps > 0 and global_step > max_steps: - train_iterator.close() - break return global_step, tr_loss / global_step def predict(self, eval_dataloader, device, get_inputs, verbose=True): self.model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): - batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = get_inputs(batch, self.model_name, train_mode=False) + inputs = get_inputs(batch, device, self.model_name, train_mode=False) outputs = self.model(**inputs) logits = outputs[0] yield logits.detach().cpu().numpy() diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py old mode 100644 new mode 100755 index b245383db..5e2e3763e --- a/utils_nlp/models/transformers/sequence_classification.py 
+++ b/utils_nlp/models/transformers/sequence_classification.py @@ -3,8 +3,10 @@ import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler -from torch.utils.data.distributed import DistributedSampler +from transformers.modeling_albert import ( + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForSequenceClassification, +) from transformers.modeling_bert import ( BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification, @@ -21,19 +23,17 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification, ) -from utils_nlp.common.pytorch_utils import get_device, move_model_to_device + +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device, compute_training_steps from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet MODEL_CLASS = {} MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) -MODEL_CLASS.update( - {k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP} -) +MODEL_CLASS.update({k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}) MODEL_CLASS.update({k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}) -MODEL_CLASS.update( - {k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP} -) +MODEL_CLASS.update({k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) +MODEL_CLASS.update({k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) class Processor: @@ -57,13 +57,14 @@ def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."): ) @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ Creates an input dictionary given a model name. Args: batch (tuple): A tuple containing input ids, attention mask, segment ids, and labels tensors. + device (torch.device): A PyTorch device. model_name (bool, optional): Model name used to format the inputs. train_mode (bool, optional): Training mode flag. Defaults to True. @@ -72,7 +73,8 @@ def get_inputs(batch, model_name, train_mode=True): dict: Dictionary containing input ids, segment ids, masks, and labels. Labels are only returned when train_mode is True. 
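+
+        Example:
+            A minimal sketch, assuming a batch yielded by a DataLoader over a
+            sequence-classification dataset and a Hugging Face BERT model
+            (names are illustrative):
+
+            >>> inputs = Processor.get_inputs(batch, device, "bert-base-uncased")
+            >>> outputs = model(**inputs)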
""" - if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert"]: + batch = tuple(t.to(device) for t in batch) + if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert", "albert"]: if train_mode: inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} else: @@ -104,11 +106,7 @@ def text_transform(text, tokenizer, max_len=MAX_SEQ_LEN): print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)) max_len = MAX_SEQ_LEN # truncate and add CLS & SEP markers - tokens = ( - [tokenizer.cls_token] - + tokenizer.tokenize(text)[0 : max_len - 2] - + [tokenizer.sep_token] - ) + tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[0 : max_len - 2] + [tokenizer.sep_token] # get input ids input_ids = tokenizer.convert_tokens_to_ids(tokens) # pad sequence @@ -189,17 +187,10 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): return input_ids, attention_mask, token_type_ids - def dataset_from_dataframe( - self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN - ): + def dataset_from_dataframe(self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN): if text2_col is None: return SCDataSet( - df, - text_col, - label_col, - transform=Processor.text_transform, - tokenizer=self.tokenizer, - max_len=max_len, + df, text_col, label_col, transform=Processor.text_transform, tokenizer=self.tokenizer, max_len=max_len, ) else: return SPCDataSet( @@ -212,29 +203,11 @@ def dataset_from_dataframe( max_len=max_len, ) - def dataloader_from_dataset( - self, ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False - ): - if num_gpus is None: - num_gpus = torch.cuda.device_count() - - batch_size = batch_size * max(1, num_gpus) - - if distributed: - sampler = DistributedSampler(ds) - else: - sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds) - - return DataLoader(ds, sampler=sampler, batch_size=batch_size) - class SequenceClassifier(Transformer): def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."): super().__init__( - model_class=MODEL_CLASS, - model_name=model_name, - num_labels=num_labels, - cache_dir=cache_dir, + model_class=MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir, ) @staticmethod @@ -261,9 +234,12 @@ def fit( Fine-tunes a pre-trained sequence classification model. Args: - train_dataloader (Dataloader): Dataloader for the training data. + train_dataloader (Dataloader): A PyTorch DataLoader to be used for training. num_epochs (int, optional): Number of training epochs. Defaults to 1. - max_steps (int, optional): Total number of training steps. Overrides num_epochs. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. gradient_accumulation_steps (int, optional): Number of steps to accumulate before performing a backward/update pass. Default to 1. 
@@ -288,29 +264,33 @@ def fit( # get device device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + # move model self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer and scheduler - optimizer = Transformer.get_default_optimizer( - self.model, weight_decay, learning_rate, adam_epsilon - ) - scheduler = Transformer.get_default_scheduler( - optimizer, - warmup_steps, + # init optimizer + optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) + + # compute the max number of training steps + max_steps = compute_training_steps( train_dataloader, - max_steps, - num_epochs, + num_epochs=num_epochs, + max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, ) + # inint scheduler + scheduler = Transformer.get_default_scheduler( + optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, + ) + + # fine tune super().fine_tune( train_dataloader=train_dataloader, device=device, num_gpus=num_gpus, get_inputs=Processor.get_inputs, max_steps=max_steps, - num_train_epochs=num_epochs, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, scheduler=scheduler, @@ -344,12 +324,8 @@ def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): preds = list( super().predict( - eval_dataloader=eval_dataloader, - device=device, - get_inputs=Processor.get_inputs, - verbose=verbose, + eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, ) ) preds = np.concatenate(preds) - # todo generator & probs return np.argmax(preds, axis=1) From 74f6ba6662dec87591677a575505b938bcec04dd Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 13 Jan 2020 18:20:20 +0000 Subject: [PATCH 06/24] updated seq classification tests --- utils_nlp/models/transformers/sequence_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 5e2e3763e..a86f27608 100755 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -279,7 +279,7 @@ def fit( gradient_accumulation_steps=gradient_accumulation_steps, ) - # inint scheduler + # inin scheduler scheduler = Transformer.get_default_scheduler( optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, ) From 5611740f960ae4fc9831dbed85a7fa5e23846df3 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 13 Jan 2020 21:00:11 +0000 Subject: [PATCH 07/24] update seq classification examples --- .../tc_mnli_transformers.ipynb | 205 +++++++++----- .../tc_multi_languages_transformers.ipynb | 249 +++++++++++++++--- .../test_notebooks_text_classification.py | 4 +- utils_nlp/dataset/bbc_hindi.py | 82 ++---- utils_nlp/dataset/dac.py | 68 ++--- utils_nlp/dataset/multinli.py | 70 ++--- 6 files changed, 431 insertions(+), 247 deletions(-) diff --git a/examples/text_classification/tc_mnli_transformers.ipynb b/examples/text_classification/tc_mnli_transformers.ipynb index 952f2bafa..bfbd91ffe 100644 --- a/examples/text_classification/tc_mnli_transformers.ipynb +++ b/examples/text_classification/tc_mnli_transformers.ipynb @@ -32,6 +32,7 @@ "from sklearn.preprocessing import LabelEncoder\n", "from tqdm import tqdm\n", "from utils_nlp.common.timer import Timer\n", + "from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n", "from 
utils_nlp.dataset.multinli import load_pandas_df\n", "from utils_nlp.models.transformers.sequence_classification import (\n", " Processor, SequenceClassifier)" @@ -93,7 +94,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n" + "100%|██████████| 222k/222k [01:20<00:00, 2.74kKB/s] \n" ] } ], @@ -196,7 +197,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", " FutureWarning)\n" ] } @@ -232,11 +233,11 @@ { "data": { "text/plain": [ - "telephone 1055\n", - "slate 1003\n", - "travel 961\n", - "fiction 952\n", - "government 938\n", + "telephone 1043\n", + "slate 989\n", + "fiction 968\n", + "travel 964\n", + "government 945\n", "Name: genre, dtype: int64" ] }, @@ -385,32 +386,108 @@ " \n", " \n", " 15\n", - " roberta-base\n", + " bert-base-japanese\n", " \n", " \n", " 16\n", - " roberta-large\n", + " bert-base-japanese-whole-word-masking\n", " \n", " \n", " 17\n", - " roberta-large-mnli\n", + " bert-base-japanese-char\n", " \n", " \n", " 18\n", - " xlnet-base-cased\n", + " bert-base-japanese-char-whole-word-masking\n", " \n", " \n", " 19\n", - " xlnet-large-cased\n", + " bert-base-finnish-cased-v1\n", " \n", " \n", " 20\n", - " distilbert-base-uncased\n", + " bert-base-finnish-uncased-v1\n", " \n", " \n", " 21\n", + " roberta-base\n", + " \n", + " \n", + " 22\n", + " roberta-large\n", + " \n", + " \n", + " 23\n", + " roberta-large-mnli\n", + " \n", + " \n", + " 24\n", + " distilroberta-base\n", + " \n", + " \n", + " 25\n", + " roberta-base-openai-detector\n", + " \n", + " \n", + " 26\n", + " roberta-large-openai-detector\n", + " \n", + " \n", + " 27\n", + " xlnet-base-cased\n", + " \n", + " \n", + " 28\n", + " xlnet-large-cased\n", + " \n", + " \n", + " 29\n", + " distilbert-base-uncased\n", + " \n", + " \n", + " 30\n", " distilbert-base-uncased-distilled-squad\n", " \n", + " \n", + " 31\n", + " distilbert-base-german-cased\n", + " \n", + " \n", + " 32\n", + " distilbert-base-multilingual-cased\n", + " \n", + " \n", + " 33\n", + " albert-base-v1\n", + " \n", + " \n", + " 34\n", + " albert-large-v1\n", + " \n", + " \n", + " 35\n", + " albert-xlarge-v1\n", + " \n", + " \n", + " 36\n", + " albert-xxlarge-v1\n", + " \n", + " \n", + " 37\n", + " albert-base-v2\n", + " \n", + " \n", + " 38\n", + " albert-large-v2\n", + " \n", + " \n", + " 39\n", + " albert-xlarge-v2\n", + " \n", + " \n", + " 40\n", + " albert-xxlarge-v2\n", + " \n", " \n", "\n", "" @@ -432,13 +509,32 @@ "12 bert-base-cased-finetuned-mrpc\n", "13 bert-base-german-dbmdz-cased\n", "14 bert-base-german-dbmdz-uncased\n", - "15 roberta-base\n", - "16 roberta-large\n", - "17 roberta-large-mnli\n", - "18 xlnet-base-cased\n", - "19 xlnet-large-cased\n", - "20 distilbert-base-uncased\n", - "21 distilbert-base-uncased-distilled-squad" + "15 bert-base-japanese\n", + "16 bert-base-japanese-whole-word-masking\n", + "17 bert-base-japanese-char\n", + "18 bert-base-japanese-char-whole-word-masking\n", + "19 bert-base-finnish-cased-v1\n", + "20 bert-base-finnish-uncased-v1\n", + "21 roberta-base\n", + "22 roberta-large\n", + "23 roberta-large-mnli\n", + 
"24 distilroberta-base\n", + "25 roberta-base-openai-detector\n", + "26 roberta-large-openai-detector\n", + "27 xlnet-base-cased\n", + "28 xlnet-large-cased\n", + "29 distilbert-base-uncased\n", + "30 distilbert-base-uncased-distilled-squad\n", + "31 distilbert-base-german-cased\n", + "32 distilbert-base-multilingual-cased\n", + "33 albert-base-v1\n", + "34 albert-large-v1\n", + "35 albert-xlarge-v1\n", + "36 albert-xxlarge-v1\n", + "37 albert-base-v2\n", + "38 albert-large-v2\n", + "39 albert-xlarge-v2\n", + "40 albert-xxlarge-v2" ] }, "execution_count": 10, @@ -492,18 +588,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n", - "100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n", - "100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n", - "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", - " warnings.warn('Was asked to gather along dimension 0, but all '\n", - "100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n", - "100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n", - "100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n", - "100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n", - "100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n", - "100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n", - "100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n" + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" ] } ], @@ -518,11 +604,17 @@ " to_lower=model_name.endswith(\"uncased\"),\n", " cache_dir=CACHE_DIR,\n", " )\n", - " train_dataloader = processor.create_dataloader_from_df(\n", - " df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n", + " train_dataset = processor.dataset_from_dataframe(\n", + " df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n", " )\n", - " test_dataloader = processor.create_dataloader_from_df(\n", - " df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n", + " train_dataloader = dataloader_from_dataset(\n", + " train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n", + " )\n", + " test_dataset = processor.dataset_from_dataframe(\n", + " df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n", + " )\n", + " test_dataloader = dataloader_from_dataset(\n", + " test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n", " )\n", "\n", " # fine-tune\n", @@ -531,17 +623,12 @@ " )\n", " with Timer() as t:\n", " classifier.fit(\n", - " train_dataloader,\n", - " num_epochs=NUM_EPOCHS,\n", - " num_gpus=NUM_GPUS,\n", - " verbose=False,\n", + " train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,\n", " )\n", " train_time = t.interval / 3600\n", "\n", " # predict\n", - " preds = classifier.predict(\n", - " test_dataloader, num_gpus=NUM_GPUS, verbose=False\n", - " )\n", + " preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)\n", "\n", " # eval\n", " accuracy = accuracy_score(df_test[LABEL_COL], preds)\n", @@ 
-600,21 +687,21 @@ " \n", " \n", " accuracy\n", - " 0.895477\n", - " 0.879584\n", - " 0.894866\n", + " 0.889364\n", + " 0.885697\n", + " 0.886308\n", " \n", " \n", " f1-score\n", - " 0.896656\n", - " 0.881218\n", - " 0.896108\n", + " 0.885225\n", + " 0.880926\n", + " 0.881819\n", " \n", " \n", " time(hrs)\n", - " 0.021865\n", - " 0.035351\n", - " 0.046295\n", + " 0.023326\n", + " 0.044209\n", + " 0.052801\n", " \n", " \n", "\n", @@ -622,9 +709,9 @@ ], "text/plain": [ " distilbert-base-uncased roberta-base xlnet-base-cased\n", - "accuracy 0.895477 0.879584 0.894866\n", - "f1-score 0.896656 0.881218 0.896108\n", - "time(hrs) 0.021865 0.035351 0.046295" + "accuracy 0.889364 0.885697 0.886308\n", + "f1-score 0.885225 0.880926 0.881819\n", + "time(hrs) 0.023326 0.044209 0.052801" ] }, "execution_count": 13, @@ -645,7 +732,7 @@ { "data": { "application/scrapbook.scrap.json+json": { - "data": 0.8899755501222494, + "data": 0.887123064384678, "encoder": "json", "name": "accuracy", "version": 1 @@ -663,7 +750,7 @@ { "data": { "application/scrapbook.scrap.json+json": { - "data": 0.8913273009038569, + "data": 0.8826569624491233, "encoder": "json", "name": "f1", "version": 1 @@ -688,9 +775,9 @@ ], "metadata": { "kernelspec": { - "display_name": "nlp_gpu", + "display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)", "language": "python", - "name": "nlp_gpu" + "name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921" }, "language_info": { "codemirror_mode": { diff --git a/examples/text_classification/tc_multi_languages_transformers.ipynb b/examples/text_classification/tc_multi_languages_transformers.ipynb index 437c95cfb..d8dfd9244 100644 --- a/examples/text_classification/tc_multi_languages_transformers.ipynb +++ b/examples/text_classification/tc_multi_languages_transformers.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -183,32 +183,108 @@ " \n", " \n", " 15\n", - " roberta-base\n", + " bert-base-japanese\n", " \n", " \n", " 16\n", - " roberta-large\n", + " bert-base-japanese-whole-word-masking\n", " \n", " \n", " 17\n", - " roberta-large-mnli\n", + " bert-base-japanese-char\n", " \n", " \n", " 18\n", - " xlnet-base-cased\n", + " bert-base-japanese-char-whole-word-masking\n", " \n", " \n", " 19\n", - " xlnet-large-cased\n", + " bert-base-finnish-cased-v1\n", " \n", " \n", " 20\n", - " distilbert-base-uncased\n", + " bert-base-finnish-uncased-v1\n", " \n", " \n", " 21\n", + " roberta-base\n", + " \n", + " \n", + " 22\n", + " roberta-large\n", + " \n", + " \n", + " 23\n", + " roberta-large-mnli\n", + " \n", + " \n", + " 24\n", + " distilroberta-base\n", + " \n", + " \n", + " 25\n", + " roberta-base-openai-detector\n", + " \n", + " \n", + " 26\n", + " roberta-large-openai-detector\n", + " \n", + " \n", + " 27\n", + " xlnet-base-cased\n", + " \n", + " \n", + " 28\n", + " xlnet-large-cased\n", + " \n", + " \n", + " 29\n", + " distilbert-base-uncased\n", + " \n", + " \n", + " 30\n", " distilbert-base-uncased-distilled-squad\n", " \n", + " \n", + " 31\n", + " distilbert-base-german-cased\n", + " \n", + " \n", + " 32\n", + " distilbert-base-multilingual-cased\n", + " \n", + " \n", + " 33\n", + " albert-base-v1\n", + " \n", + " \n", + " 34\n", + " albert-large-v1\n", + " \n", + " \n", + " 35\n", + " albert-xlarge-v1\n", + " \n", + " \n", + " 36\n", + " albert-xxlarge-v1\n", 
+ " \n", + " \n", + " 37\n", + " albert-base-v2\n", + " \n", + " \n", + " 38\n", + " albert-large-v2\n", + " \n", + " \n", + " 39\n", + " albert-xlarge-v2\n", + " \n", + " \n", + " 40\n", + " albert-xxlarge-v2\n", + " \n", " \n", "\n", "" @@ -230,13 +306,32 @@ "12 bert-base-cased-finetuned-mrpc\n", "13 bert-base-german-dbmdz-cased\n", "14 bert-base-german-dbmdz-uncased\n", - "15 roberta-base\n", - "16 roberta-large\n", - "17 roberta-large-mnli\n", - "18 xlnet-base-cased\n", - "19 xlnet-large-cased\n", - "20 distilbert-base-uncased\n", - "21 distilbert-base-uncased-distilled-squad" + "15 bert-base-japanese\n", + "16 bert-base-japanese-whole-word-masking\n", + "17 bert-base-japanese-char\n", + "18 bert-base-japanese-char-whole-word-masking\n", + "19 bert-base-finnish-cased-v1\n", + "20 bert-base-finnish-uncased-v1\n", + "21 roberta-base\n", + "22 roberta-large\n", + "23 roberta-large-mnli\n", + "24 distilroberta-base\n", + "25 roberta-base-openai-detector\n", + "26 roberta-large-openai-detector\n", + "27 xlnet-base-cased\n", + "28 xlnet-large-cased\n", + "29 distilbert-base-uncased\n", + "30 distilbert-base-uncased-distilled-squad\n", + "31 distilbert-base-german-cased\n", + "32 distilbert-base-multilingual-cased\n", + "33 albert-base-v1\n", + "34 albert-large-v1\n", + "35 albert-xlarge-v1\n", + "36 albert-xxlarge-v1\n", + "37 albert-base-v2\n", + "38 albert-large-v2\n", + "39 albert-xlarge-v2\n", + "40 albert-xxlarge-v2" ] }, "execution_count": 3, @@ -264,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -281,7 +376,7 @@ " 'num_train_epochs': 5,\n", " 'num_gpus': 2,\n", " 'batch_size': 16,\n", - " 'verbose': True,\n", + " 'verbose': False,\n", " 'load_dataset_func': None,\n", " 'get_labels_func': None\n", "}\n", @@ -325,9 +420,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 80.1k/80.1k [00:02<00:00, 30.8kKB/s]\n", + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", + " FutureWarning)\n" + ] + } + ], "source": [ "train_dataloader, test_dataloader, label_encoder, test_labels = CONFIG['load_dataset_func'](\n", " local_path=CONFIG['local_path'],\n", @@ -354,11 +459,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training time : 0.190 hrs\n" + ] + } + ], "source": [ "model = SequenceClassifier(\n", " model_name=CONFIG['model_name'],\n", @@ -390,9 +511,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction time : 0.021 hrs\n" + ] + } + ], "source": [ "with Timer() as t:\n", " preds = 
model.predict(\n", @@ -422,11 +551,11 @@ "text": [ " precision recall f1-score support\n", "\n", - " culture 0.89 0.89 0.89 843\n", - " diverse 0.99 0.99 0.99 1738\n", - " economy 0.96 0.96 0.96 661\n", - " politics 0.94 0.94 0.94 530\n", - " sports 0.87 0.87 0.87 580\n", + " culture 0.93 0.94 0.93 548\n", + " diverse 0.94 0.94 0.94 640\n", + " economy 0.90 0.88 0.89 570\n", + " politics 0.87 0.88 0.88 809\n", + " sports 0.99 0.98 0.99 1785\n", "\n", " micro avg 0.94 0.94 0.94 4352\n", " macro avg 0.93 0.93 0.93 4352\n", @@ -449,9 +578,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.94, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.94, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.94, + "encoder": "json", + "name": "f1", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "f1" + } + }, + "output_type": "display_data" + } + ], "source": [ "# for testing\n", "report_splits = report.split('\\n')[-2].split()\n", @@ -463,11 +647,10 @@ } ], "metadata": { - "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)", "language": "python", - "name": "python3" + "name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921" }, "language_info": { "codemirror_mode": { diff --git a/tests/integration/test_notebooks_text_classification.py b/tests/integration/test_notebooks_text_classification.py index 8f00107eb..97eb9d6d7 100644 --- a/tests/integration/test_notebooks_text_classification.py +++ b/tests/integration/test_notebooks_text_classification.py @@ -33,8 +33,8 @@ def test_tc_mnli_transformers(notebooks, tmp): ), ) result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict - assert pytest.approx(result["accuracy"], 0.89, abs=ABS_TOL) - assert pytest.approx(result["f1"], 0.89, abs=ABS_TOL) + assert pytest.approx(result["accuracy"], 0.885, abs=ABS_TOL) + assert pytest.approx(result["f1"], 0.885, abs=ABS_TOL) @pytest.mark.integration diff --git a/utils_nlp/dataset/bbc_hindi.py b/utils_nlp/dataset/bbc_hindi.py index c8212cd63..08a779049 100644 --- a/utils_nlp/dataset/bbc_hindi.py +++ b/utils_nlp/dataset/bbc_hindi.py @@ -7,24 +7,22 @@ https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1 """ -import os -import pandas as pd import logging -import numpy as np +import os import tarfile - from tempfile import TemporaryDirectory + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.url_utils import maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.sequence_classification import Processor -from sklearn.preprocessing import LabelEncoder -from sklearn.model_selection import train_test_split - -URL = ( - 
"https://github.com/NirantK/hindi2vec/releases/" - "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz" -) +URL = "https://github.com/NirantK/hindi2vec/releases/" "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz" def load_pandas_df(local_cache_path=TemporaryDirectory().name): @@ -49,19 +47,9 @@ def load_pandas_df(local_cache_path=TemporaryDirectory().name): train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv") test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv") - train_df = pd.read_csv( - train_csv_file_path, - sep="\t", - encoding='utf-8', - header=None - ) + train_df = pd.read_csv(train_csv_file_path, sep="\t", encoding="utf-8", header=None) - test_df = pd.read_csv( - test_csv_file_path, - sep="\t", - encoding='utf-8', - header=None - ) + test_df = pd.read_csv(test_csv_file_path, sep="\t", encoding="utf-8", header=None) train_df = train_df.fillna("") test_df = test_df.fillna("") @@ -80,7 +68,7 @@ def load_tc_dataset( cache_dir=TemporaryDirectory().name, max_len=MAX_SEQ_LEN, batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the multinli dataset and split into training and testing datasets. @@ -114,9 +102,9 @@ def load_tc_dataset( Returns: tuple. The tuple contains four elements: - train_dataload (DataLoader): a PyTorch DataLoader instance for training. + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. @@ -140,12 +128,8 @@ def load_tc_dataset( if test_fraction < 0 or test_fraction >= 1.0: logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction)) test_fraction = 0.25 - - train_df, test_df = train_test_split( - all_df, - train_size=(1.0 - test_fraction), - random_state=random_seed - ) + + train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed) if train_sample_ratio > 1.0: train_sample_ratio = 1.0 @@ -153,7 +137,7 @@ def load_tc_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -171,35 +155,17 @@ def load_tc_dataset( test_labels = label_encoder.transform(test_df[label_col]) test_df[label_col] = test_labels - processor = Processor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) - train_dataloader = processor.create_dataloader_from_df( - df=train_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=True, - distributed=False + train_dataset = processor.dataset_from_dataframe( + df=train_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True) - test_dataloader = processor.create_dataloader_from_df( - df=test_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=False, - distributed=False + test_dataset = 
processor.dataset_from_dataframe( + df=test_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False) return (train_dataloader, test_dataloader, label_encoder, test_labels) diff --git a/utils_nlp/dataset/dac.py b/utils_nlp/dataset/dac.py index c692dfb56..750e95915 100644 --- a/utils_nlp/dataset/dac.py +++ b/utils_nlp/dataset/dac.py @@ -8,18 +8,19 @@ arabic-text-classification-using-deep-learning-technics/") """ -import os -import pandas as pd import logging +import os +from tempfile import TemporaryDirectory + import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder -from tempfile import TemporaryDirectory +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.url_utils import extract_zip, maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.sequence_classification import Processor -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder - URL = ( "https://data.mendeley.com/datasets/v524p5dhpj/2" @@ -58,7 +59,7 @@ def load_tc_dataset( cache_dir=TemporaryDirectory().name, max_len=MAX_SEQ_LEN, batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the multinli dataset and split into training and testing datasets. @@ -92,9 +93,9 @@ def load_tc_dataset( Returns: tuple. The tuple contains four elements: - train_dataload (DataLoader): a PyTorch DataLoader instance for training. + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. @@ -104,11 +105,8 @@ def load_tc_dataset( label IDs by using the label_encoder.transform function. 
""" - # download and load the original dataset - all_df = load_pandas_df( - local_cache_path=local_path, - num_rows=None - ) + # download and load the original dataset + all_df = load_pandas_df(local_cache_path=local_path, num_rows=None) # set the text and label columns text_col = all_df.columns[0] @@ -123,12 +121,8 @@ def load_tc_dataset( if test_fraction < 0 or test_fraction >= 1.0: logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction)) test_fraction = 0.25 - - train_df, test_df = train_test_split( - all_df, - train_size=(1.0 - test_fraction), - random_state=random_seed - ) + + train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed) if train_sample_ratio > 1.0: train_sample_ratio = 1.0 @@ -136,7 +130,7 @@ def load_tc_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -149,35 +143,17 @@ def load_tc_dataset( if test_sample_ratio < 1.0: test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True) - processor = Processor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) - train_dataloader = processor.create_dataloader_from_df( - df=train_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=True, - distributed=False + train_dataset = processor.dataset_from_dataframe( + df=train_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True) - test_dataloader = processor.create_dataloader_from_df( - df=test_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=False, - distributed=False + test_dataset = processor.dataset_from_dataframe( + df=test_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False) # the DAC dataset already converted the labels to label ID format test_labels = test_df[label_col] diff --git a/utils_nlp/dataset/multinli.py b/utils_nlp/dataset/multinli.py index 62b772cd1..adab4c925 100644 --- a/utils_nlp/dataset/multinli.py +++ b/utils_nlp/dataset/multinli.py @@ -7,18 +7,19 @@ https://www.nyu.edu/projects/bowman/multinli/ """ +import logging import os +from tempfile import TemporaryDirectory import pandas as pd -import logging +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder -from tempfile import TemporaryDirectory +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.data_loaders import DaskJSONLoader from utils_nlp.dataset.url_utils import extract_zip, maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.sequence_classification import Processor -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder URL = "http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip" DATA_FILES = { @@ -63,9 +64,7 @@ def 
load_pandas_df(local_cache_path=".", file_split="train"): return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True) -def get_generator( - local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None -): +def get_generator(local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None): """ Returns an extracted dataset as a random batch generator that yields pandas dataframes. Args: @@ -85,9 +84,7 @@ def get_generator( except Exception as e: raise e - loader = DaskJSONLoader( - os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size - ) + loader = DaskJSONLoader(os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size) return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches) @@ -103,7 +100,7 @@ def load_tc_dataset( cache_dir=TemporaryDirectory().name, max_len=MAX_SEQ_LEN, batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the multinli dataset and split into training and testing datasets. @@ -137,9 +134,9 @@ def load_tc_dataset( Returns: tuple. The tuple contains four elements: - train_dataload (DataLoader): a PyTorch DataLoader instance for training. + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. @@ -150,10 +147,7 @@ def load_tc_dataset( """ # download and load the original dataset - all_df = load_pandas_df( - local_cache_path=local_path, - file_split="train" - ) + all_df = load_pandas_df(local_cache_path=local_path, file_split="train") # select the examples corresponding to one of the entailment labels (neutral # in this case) to avoid duplicate rows, as the sentences are not unique, @@ -169,12 +163,8 @@ def load_tc_dataset( if test_fraction < 0 or test_fraction >= 1.0: logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction)) test_fraction = 0.25 - - train_df, test_df = train_test_split( - all_df, - train_size=(1.0 - test_fraction), - random_state=random_seed - ) + + train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed) if train_sample_ratio > 1.0: train_sample_ratio = 1.0 @@ -182,7 +172,7 @@ def load_tc_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -200,35 +190,17 @@ def load_tc_dataset( test_labels = label_encoder.transform(test_df[label_col]) test_df[label_col] = test_labels - processor = Processor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) - train_dataloader = processor.create_dataloader_from_df( - df=train_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=True, - distributed=False + train_dataset = processor.dataset_from_dataframe( + df=train_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + train_dataloader = 
dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True) - test_dataloader = processor.create_dataloader_from_df( - df=test_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=False, - distributed=False + test_dataset = processor.dataset_from_dataframe( + df=test_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False) return (train_dataloader, test_dataloader, label_encoder, test_labels) From c7d3409dfdc52d56f86027646865aa02d1d6de98 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 14 Jan 2020 17:08:58 +0000 Subject: [PATCH 08/24] update QA utils and tests --- ..._models_transformers_question_answering.py | 76 +++--- .../models/transformers/question_answering.py | 234 +++++++----------- 2 files changed, 128 insertions(+), 182 deletions(-) mode change 100644 => 100755 utils_nlp/models/transformers/question_answering.py diff --git a/tests/unit/test_models_transformers_question_answering.py b/tests/unit/test_models_transformers_question_answering.py index 010bf5c5d..7f14f0d0e 100644 --- a/tests/unit/test_models_transformers_question_answering.py +++ b/tests/unit/test_models_transformers_question_answering.py @@ -1,18 +1,20 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import pytest import os + +import pytest +import torch + +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.models.transformers.datasets import QADataset from utils_nlp.models.transformers.question_answering import ( - QAProcessor, - AnswerExtractor, CACHED_EXAMPLES_TEST_FILE, CACHED_FEATURES_TEST_FILE, + AnswerExtractor, + QAProcessor, ) -import torch - NUM_GPUS = max(1, torch.cuda.device_count()) BATCH_SIZE = 8 @@ -109,9 +111,7 @@ def qa_test_data(qa_test_df, tmp_module): feature_cache_dir=tmp_module, ) - qa_processor_distilbert = QAProcessor( - model_name="distilbert-base-uncased", cache_dir=tmp_module - ) + qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module) train_features_distilbert = qa_processor_distilbert.preprocess( train_dataset, batch_size=BATCH_SIZE, @@ -153,15 +153,9 @@ def qa_test_data(qa_test_df, tmp_module): def test_QAProcessor(qa_test_data, tmp_module): for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]: qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module) - qa_processor.preprocess( - qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module - ) - qa_processor.preprocess( - qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module - ) - qa_processor.preprocess( - qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module - ) + qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module) + qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module) + qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module) # test unsupported model type with pytest.raises(ValueError): @@ -169,51 +163,49 @@ def test_QAProcessor(qa_test_data, tmp_module): # test training data has no ground truth exception with pytest.raises(Exception): - qa_processor.preprocess( - qa_test_data["test_dataset"], 
is_training=True, feature_cache_dir=tmp_module - ) + qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module) # test when answer start is a list, but answer text is not with pytest.raises(Exception): qa_processor.preprocess( - qa_test_data["train_dataset_start_text_mismatch"], - is_training=True, - feature_cache_dir=tmp_module, + qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module, ) # test when training data has multiple answers with pytest.raises(Exception): qa_processor.preprocess( - qa_test_data["train_dataset_multi_answers"], - is_training=True, - feature_cache_dir=tmp_module, + qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module, ) def test_AnswerExtractor(qa_test_data, tmp_module): - # test bert + # bert qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module) - qa_extractor_bert.fit(qa_test_data["train_features_bert"], cache_model=True) + train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"]) + test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False) + qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True) # test saving fine-tuned model model_output_dir = os.path.join(tmp_module, "fine_tuned") assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin")) assert os.path.exists(os.path.join(model_output_dir, "config.json")) - qa_extractor_from_cache = AnswerExtractor( - cache_dir=tmp_module, load_model_from_dir=model_output_dir - ) - qa_extractor_from_cache.predict(qa_test_data["test_features_bert"]) + qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir) + qa_extractor_from_cache.predict(test_loader_bert, verbose=False) + # xlnet + train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"]) + test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False) qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module) - qa_extractor_xlnet.fit(qa_test_data["train_features_xlnet"], cache_model=False) - qa_extractor_xlnet.predict(qa_test_data["test_features_xlnet"]) + qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False) + qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False) - qa_extractor_distilbert = AnswerExtractor( - model_name="distilbert-base-uncased", cache_dir=tmp_module - ) - qa_extractor_distilbert.fit(qa_test_data["train_features_distilbert"], cache_model=False) - qa_extractor_distilbert.predict(qa_test_data["test_features_distilbert"]) + # distilbert + train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"]) + test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False) + qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module) + qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False) + qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False) def test_postprocess_bert_answer(qa_test_data, tmp_module): @@ -226,8 +218,9 @@ def test_postprocess_bert_answer(qa_test_data, tmp_module): doc_stride=32, feature_cache_dir=tmp_module, ) + test_loader = dataloader_from_dataset(test_features, shuffle=False) qa_extractor = AnswerExtractor(cache_dir=tmp_module) - predictions = qa_extractor.predict(test_features) + predictions = qa_extractor.predict(test_loader) 
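For reference, the flow these updated tests exercise is: `QAProcessor.preprocess` now returns a plain `TensorDataset`, batching is an explicit step handled by `dataloader_from_dataset`, and `AnswerExtractor.fit`/`predict` consume the resulting DataLoaders. A minimal sketch of that end-to-end pattern, assuming `train_qa_dataset` and `test_qa_dataset` are `QADataset` objects prepared elsewhere and that the batch size is an arbitrary placeholder:

    from utils_nlp.common.pytorch_utils import dataloader_from_dataset
    from utils_nlp.models.transformers.question_answering import AnswerExtractor, QAProcessor

    # feature creation: preprocess() returns a TensorDataset and no longer builds a DataLoader
    processor = QAProcessor(model_name="bert-base-cased", cache_dir=".")
    train_features = processor.preprocess(train_qa_dataset, is_training=True, feature_cache_dir=".")
    test_features = processor.preprocess(test_qa_dataset, is_training=False, feature_cache_dir=".")

    # batching: a separate, explicit step shared with the other task utilities
    train_loader = dataloader_from_dataset(train_features, batch_size=8, shuffle=True)
    test_loader = dataloader_from_dataset(test_features, batch_size=8, shuffle=False)

    # training and scoring: fit() and predict() take DataLoaders rather than feature datasets
    extractor = AnswerExtractor(cache_dir=".")
    extractor.fit(train_loader, verbose=False, cache_model=False)
    results = extractor.predict(test_loader, verbose=False)
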
qa_processor.postprocess( results=predictions, @@ -260,8 +253,9 @@ def test_postprocess_xlnet_answer(qa_test_data, tmp_module): doc_stride=32, feature_cache_dir=tmp_module, ) + test_loader = dataloader_from_dataset(test_features, shuffle=False) qa_extractor = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module) - predictions = qa_extractor.predict(test_features) + predictions = qa_extractor.predict(test_loader) qa_processor.postprocess( results=predictions, diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py old mode 100644 new mode 100755 index 4f48e58d9..99cd59724 --- a/utils_nlp/models/transformers/question_answering.py +++ b/utils_nlp/models/transformers/question_answering.py @@ -17,38 +17,30 @@ # Modifications copyright © Microsoft Corporation -import os -import logging -from tqdm import tqdm import collections import json +import logging import math -import jsonlines +import os +import jsonlines import torch -from torch.utils.data import TensorDataset, SequentialSampler, DataLoader, RandomSampler -from torch.utils.data.distributed import DistributedSampler - -from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize +from torch.utils.data import TensorDataset +from tqdm import tqdm +from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering -from transformers.modeling_xlnet import ( - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetForQuestionAnswering, -) -from transformers.modeling_distilbert import ( - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForQuestionAnswering, -) +from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering +from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering +from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize -from utils_nlp.common.pytorch_utils import get_device +from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer MODEL_CLASS = {} MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}) -MODEL_CLASS.update( - {k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP} -) +MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) +MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) # cached files during preprocessing # these are used in postprocessing to generate the final answer texts @@ -85,9 +77,7 @@ class QAProcessor: cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". """ - def __init__( - self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="." 
- ): + def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."): self.model_name = model_name self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False @@ -116,13 +106,14 @@ def model_type(self): return self._model_type @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ Creates an input dictionary given a model name. Args: batch (tuple): A tuple containing input ids, attention mask, segment ids, and labels tensors. + device (torch.device): A PyTorch device. model_name (bool, optional): Model name used to format the inputs. train_mode (bool, optional): Training mode flag. Defaults to True. @@ -131,6 +122,7 @@ def get_inputs(batch, model_name, train_mode=True): dict: Dictionary containing input ids, segment ids, masks, and labels. Labels are only returned when train_mode is True. """ + batch = tuple(t.to(device) for t in batch) model_type = model_name.split("-")[0] inputs = {"input_ids": batch[0], "attention_mask": batch[1]} @@ -191,6 +183,8 @@ def preprocess( directory. These files are required during postprocessing to generate the final answer texts from predicted answer start and answer end indices. Defaults to "./cached_qa_features". + Returns: + DataSet: A Pytorch DataSet. """ if not os.path.exists(feature_cache_dir): @@ -223,9 +217,7 @@ def preprocess( qa_examples.append(qa_example_cur) - qa_examples_json.append( - {"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens} - ) + qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}) features_cur = _create_qa_features( qa_example_cur, @@ -271,28 +263,13 @@ def preprocess( start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) qa_dataset = TensorDataset( - input_ids, - input_mask, - segment_ids, - start_positions, - end_positions, - cls_index, - p_mask, + input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, ) else: unique_id_all = torch.tensor(unique_id_all, dtype=torch.long) - qa_dataset = TensorDataset( - input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all - ) - - if num_gpus is not None: - batch_size = batch_size * max(1, num_gpus) - if distributed: - sampler = DistributedSampler(qa_dataset) - else: - sampler = RandomSampler(qa_dataset) if is_training else SequentialSampler(qa_dataset) + qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all) - return DataLoader(qa_dataset, sampler=sampler, batch_size=batch_size) + return qa_dataset def postprocess( self, @@ -420,14 +397,7 @@ class QAResult(QAResult_): QAResultExtended_ = collections.namedtuple( "QAResultExtended", - [ - "unique_id", - "start_top_log_probs", - "start_top_index", - "end_top_log_probs", - "end_top_index", - "cls_logits", - ], + ["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",], ) @@ -489,18 +459,16 @@ def list_supported_models(): def fit( self, train_dataloader, - num_gpus=None, num_epochs=1, - learning_rate=5e-5, - max_grad_norm=1.0, max_steps=-1, gradient_accumulation_steps=1, - warmup_steps=0, + num_gpus=None, + gpu_ids=None, + local_rank=-1, weight_decay=0.0, + learning_rate=5e-5, adam_epsilon=1e-8, - fp16=False, - fp16_opt_level="O1", - 
local_rank=-1, + warmup_steps=0, verbose=True, seed=None, cache_model=True, @@ -509,31 +477,30 @@ def fit( Fine-tune pre-trained transofmer models for question answering. Args: - train_dataloader (Dataloader): Dataloader for the training data. - num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will - be used. If set to 0 or GPUs are not available, CPU device will - be used. Defaults to None. + train_dataloader (Dataloader): A PyTorch DataLoader to be used for training. num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. + num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will + be used. If set to 0 or GPUs are not available, CPU device will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to + -1, which means non-distributed training. + weight_decay (float, optional): Weight decay to apply after each parameter update. + Defaults to 0.0. learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to 5e-5. - max_grad_norm (float, optional): Maximum gradient norm for gradient clipping. - Defaults to 1.0. - max_steps (int, optional): Maximum number of training steps. If specified, - `num_epochs` will be ignored. Defaults to -1. - gradient_accumulation_steps (int, optional): Number of batches to accumulate - gradients on between each model parameter update. Defaults to 1. + adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8. warmup_steps (int, optional): Number of steps taken to increase learning rate from 0 to `learning rate`. Defaults to 0. - weight_decay (float, optional): Weight decay to apply after each parameter update. - Defaults to 0.0. - adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8. - fp16 (bool, optional): Whether to use 16-bit (mixed) precision (through NVIDIA apex) - instead of 32-bit. Defaults to False. - fp16_opt_level (str, optional): For fp16: Apex AMP optimization level selected in - ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html. - Defaults to "O1", - local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to - -1, which means non-distributed training. verbose (bool, optional): Whether to print out the training log. Defaults to True. seed (int, optional): Random seed used to improve reproducibility. Defaults to None. cache_model (bool, optional): Whether to save the fine-tuned model. 
If True, @@ -542,28 +509,47 @@ def fit( """ + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + + # init optimizer + optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) + + # compute the max number of training steps + max_steps = compute_training_steps( + train_dataloader, + num_epochs=num_epochs, + max_steps=max_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + + # inin scheduler + scheduler = Transformer.get_default_scheduler( + optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, + ) + + # fine tune super().fine_tune( train_dataloader=train_dataloader, + device=device, + num_gpus=num_gpus, get_inputs=QAProcessor.get_inputs, max_steps=max_steps, - num_train_epochs=num_epochs, - max_grad_norm=max_grad_norm, gradient_accumulation_steps=gradient_accumulation_steps, - n_gpu=num_gpus, - weight_decay=weight_decay, - learning_rate=learning_rate, - adam_epsilon=adam_epsilon, - warmup_steps=warmup_steps, - fp16=fp16, - fp16_opt_level=fp16_opt_level, + optimizer=optimizer, + scheduler=scheduler, local_rank=local_rank, verbose=verbose, seed=seed, ) + if cache_model: self.save_model() - def predict(self, test_dataloader, num_gpus=None, verbose=True): + def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Predicts answer start and end logits. @@ -573,8 +559,9 @@ def predict(self, test_dataloader, num_gpus=None, verbose=True): num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. - local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to - -1, which means non-distributed. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. verbose (bool, optional): Whether to print out the predicting log. Defaults to True. 
Returns: @@ -584,25 +571,16 @@ def predict(self, test_dataloader, num_gpus=None, verbose=True): def _to_list(tensor): return tensor.detach().cpu().tolist() + # get device device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - self.model.to(device) - self.model.eval() + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) all_results = [] for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose): - batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = QAProcessor.get_inputs(batch, self.model_name, train_mode=False) - + inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False) outputs = self.model(**inputs) - unique_id_tensor = batch[5] for i, u_id in enumerate(unique_id_tensor): @@ -617,9 +595,7 @@ def _to_list(tensor): ) else: result = QAResult( - unique_id=u_id.item(), - start_logits=_to_list(outputs[0][i]), - end_logits=_to_list(outputs[1][i]), + unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]), ) all_results.append(result) torch.cuda.empty_cache() @@ -783,9 +759,7 @@ def postprocess_bert_answer( # Sort by the sum of the start and end logits in ascending order, # so that the first element is the most probable answer - prelim_predictions = sorted( - prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True - ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) seen_predictions = {} nbest = [] @@ -818,19 +792,11 @@ def postprocess_bert_answer( final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit - ) - ) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if unanswerable_exists: if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", start_logit=null_start_logit, end_logit=null_end_logit - ) - ) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. 
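The `predict` method refactored above follows the device-handling sequence now shared across these classes: resolve the device once with `get_device`, move (and, for multiple GPUs, wrap) the model with `move_model_to_device`, and let `get_inputs` move each batch to that device inside the scoring loop. A rough sketch of that shared pattern, where `model`, `loader`, and the model name are placeholders for whatever the caller supplies:

    import torch
    from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
    from utils_nlp.models.transformers.question_answering import QAProcessor

    # resolve the device and the number of usable GPUs (falls back to CPU)
    device, num_gpus = get_device(num_gpus=None, local_rank=-1)

    # move the model; it is wrapped in DataParallel when more than one GPU is used
    model = move_model_to_device(model, device, num_gpus)

    model.eval()
    with torch.no_grad():
        for batch in loader:
            # get_inputs now receives the device and moves the batch tensors itself
            inputs = QAProcessor.get_inputs(batch, device, "bert-base-cased", train_mode=False)
            outputs = model(**inputs)
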
@@ -874,9 +840,7 @@ def postprocess_bert_answer( all_probs[example["qa_id"]] = nbest_json[0]["probability"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = ( - score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) - ) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example["qa_id"]] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example["qa_id"]] = "" @@ -1042,9 +1006,7 @@ def postprocess_xlnet_answer( ) ) - prelim_predictions = sorted( - prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True - ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) seen_predictions = {} nbest = [] @@ -1075,20 +1037,14 @@ def postprocess_xlnet_answer( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = _get_final_text( - tok_text, orig_text, tokenizer.do_lower_case, verbose_logging - ) + final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit - ) - ) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. @@ -1235,9 +1191,7 @@ def _is_whitespace(c): actual_text = " ".join(d_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(a_text)) if actual_text.find(cleaned_answer_text) == -1: - logger.warning( - "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text - ) + logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return else: start_position = -1 @@ -1696,9 +1650,7 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text - ) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using From d0a3a13567ce6280f9090e76401dd7f7bf238347 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 14 Jan 2020 19:47:22 +0000 Subject: [PATCH 09/24] minor edits to seq classification utils --- .../transformers/sequence_classification.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index a86f27608..5199f2d3d 100755 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -2,29 +2,16 @@ # Licensed under the MIT License. 
import numpy as np -import torch -from transformers.modeling_albert import ( - ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - AlbertForSequenceClassification, -) -from transformers.modeling_bert import ( - BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BertForSequenceClassification, -) +from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForSequenceClassification +from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification from transformers.modeling_distilbert import ( DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForSequenceClassification, ) -from transformers.modeling_roberta import ( - ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - RobertaForSequenceClassification, -) -from transformers.modeling_xlnet import ( - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetForSequenceClassification, -) +from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification +from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification -from utils_nlp.common.pytorch_utils import get_device, move_model_to_device, compute_training_steps +from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet @@ -279,7 +266,7 @@ def fit( gradient_accumulation_steps=gradient_accumulation_steps, ) - # inin scheduler + # init scheduler scheduler = Transformer.get_default_scheduler( optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, ) From 8bb1930ccd2f407edcb9fe8b5f869e3243a41449 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 14 Jan 2020 19:58:10 +0000 Subject: [PATCH 10/24] update NER utils --- utils_nlp/models/transformers/common.py | 2 + .../transformers/named_entity_recognition.py | 222 +++++++++--------- 2 files changed, 118 insertions(+), 106 deletions(-) mode change 100644 => 100755 utils_nlp/models/transformers/named_entity_recognition.py diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index ccaf48b46..2fa12af53 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -183,6 +183,8 @@ def fine_tune( epoch_iterator.close() break + #del [batch] + #torch.cuda.empty_cache() return global_step, tr_loss / global_step def predict(self, eval_dataloader, device, get_inputs, verbose=True): diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py old mode 100644 new mode 100755 index 9e11e3e14..169bb21c8 --- a/utils_nlp/models/transformers/named_entity_recognition.py +++ b/utils_nlp/models/transformers/named_entity_recognition.py @@ -2,20 +2,19 @@ # Licensed under the MIT License. 
import logging +from collections import Iterable + import numpy as np import torch -import torch.nn as nn - -from collections import Iterable -from torch.utils.data import TensorDataset from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification -from utils_nlp.common.pytorch_utils import get_device -from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler -from torch.utils.data.distributed import DistributedSampler +from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device +from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer -TC_MODEL_CLASS = {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP} +TC_MODEL_CLASS = {} +TC_MODEL_CLASS.update({k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) +TC_MODEL_CLASS.update({k: DistilBertForTokenClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) class TokenClassificationProcessor: @@ -40,27 +39,36 @@ def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."): ) @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ - Produce a dictionary object for model training or prediction. + Creates an input dictionary given a model name. Args: - model_name (str): The pretained model name. - train_mode (bool, optional): Whether it's for model training. Set it to False if - it's for testing and it won't have the 'labels' data field. - Defaults to True, for model training. + batch (tuple): A tuple containing input ids, attention mask, + segment ids, and labels tensors. + device (torch.device): A PyTorch device. + model_name (bool, optional): Model name used to format the inputs. + train_mode (bool, optional): Training mode flag. + Defaults to True. Returns: - dict: A dictionary object contains all needed information for training or testing. + dict: Dictionary containing input ids, segment ids, masks, and labels. + Labels are only returned when train_mode is True. 
""" + batch = tuple(t.to(device) for t in batch) + if model_name.split("-")[0] in ["bert", "distilbert"]: + if train_mode: + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + else: + inputs = {"input_ids": batch[0], "attention_mask": batch[1]} - if model_name.split("-")[0] not in ["bert"]: - raise ValueError("Model not supported: {}".format(model_name)) + # distilbert doesn't support segment ids + if model_name.split("-")[0] not in ["distilbert"]: + inputs["token_type_ids"] = batch[2] - if train_mode: - return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + return inputs else: - return {"input_ids": batch[0], "attention_mask": batch[1]} + raise ValueError("Model not supported: {}".format(model_name)) @staticmethod def create_label_map(label_lists, trailing_piece_tag="X"): @@ -87,9 +95,7 @@ def create_label_map(label_lists, trailing_piece_tag="X"): label_map[trailing_piece_tag] = len(label_set) return label_map - def preprocess_for_bert( - self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X" - ): + def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"): """ Tokenize and preprocesses input word lists, involving the following steps 0. WordPiece tokenization. @@ -144,9 +150,7 @@ def _is_iterable_but_not_string(obj): return isinstance(obj, Iterable) and not isinstance(obj, str) if max_len > MAX_SEQ_LEN: - logging.warning( - "Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN) - ) + logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)) max_len = MAX_SEQ_LEN if not _is_iterable_but_not_string(text): @@ -179,9 +183,7 @@ def _is_iterable_but_not_string(obj): for t, t_labels in zip(text, labels): if len(t) != len(t_labels): raise ValueError( - "The number of words is {0}, but the number of labels is {1}.".format( - len(t), len(t_labels) - ) + "The number of words is {0}, but the number of labels is {1}.".format(len(t), len(t_labels)) ) new_labels = [] @@ -195,11 +197,7 @@ def _is_iterable_but_not_string(obj): new_tokens.append(sub_word) if len(new_tokens) > max_len: - logging.warn( - "Text after tokenization with length {} has been truncated".format( - len(new_tokens) - ) - ) + logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens))) new_tokens = new_tokens[:max_len] new_labels = new_labels[:max_len] input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens) @@ -216,9 +214,7 @@ def _is_iterable_but_not_string(obj): input_mask += padding new_labels += label_padding - trailing_token_mask_all.append( - [True if label != trailing_piece_tag else False for label in new_labels] - ) + trailing_token_mask_all.append([True if label != trailing_piece_tag else False for label in new_labels]) if label_map: label_ids = [label_map[label] for label in new_labels] @@ -244,21 +240,6 @@ def _is_iterable_but_not_string(obj): ) return td - def create_dataloader_from_dataset( - self, dataset, shuffle=False, batch_size=32, num_gpus=None, distributed=False - ): - if num_gpus is None: - num_gpus = torch.cuda.device_count() - - batch_size = batch_size * max(1, num_gpus) - - if distributed: - sampler = DistributedSampler(dataset) - else: - sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset) - - return DataLoader(dataset, sampler=sampler, batch_size=batch_size) - class TokenClassifier(Transformer): """ @@ -275,10 +256,7 @@ class TokenClassifier(Transformer): def 
__init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."): super().__init__( - model_class=TC_MODEL_CLASS, - model_name=model_name, - num_labels=num_labels, - cache_dir=cache_dir, + model_class=TC_MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir, ) @staticmethod @@ -289,7 +267,10 @@ def fit( self, train_dataloader, num_epochs=1, + max_steps=-1, + gradient_accumulation_steps=1, num_gpus=None, + gpu_ids=None, local_rank=-1, weight_decay=0.0, learning_rate=5e-5, @@ -299,73 +280,104 @@ def fit( seed=None, ): """ - Fit the TokenClassifier model using the given training dataset. + Fine-tunes a pre-trained token classification model. Args: - train_dataloader (DataLoader): DataLoader instance for training. - num_epochs (int, optional): Number of training epochs. - Defaults to 1. + train_dataloader (Dataloader): A PyTorch DataLoader to be used for training. + num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will - be used. If set to 0 or GPUs are not available, CPU device will - be used. Defaults to None. - local_rank (int, optional): Whether need to do distributed training. - Defaults to -1, no distributed training. - weight_decay (float, optional): Weight decay rate. - Defaults to 0. - learning_rate (float, optional): The learning rate. - Defaults to 5e-5. - adam_espilon (float, optional): The 'eps' parameter for the 'AdamW' optimizer. - Defaults to 1e-8. - warmup_steps (int, optional): Number of warmup steps for 'WarmupLinearSchedule'. - Defaults to 0. - verbose (bool, optional): Verbose model. - Defaults to False. - seed (int, optional): The seed for the transformers. - Defaults to None, use the default seed. + be used. If set to 0 or GPUs are not available, CPU device will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to + -1, which means non-distributed training. + weight_decay (float, optional): Weight decay to apply after each parameter update. + Defaults to 0.0. + learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to + 5e-5. + adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8. + warmup_steps (int, optional): Number of steps taken to increase learning rate from 0 + to `learning rate`. Defaults to 0. + verbose (bool, optional): Whether to print out the training log. Defaults to True. + seed (int, optional): Random seed used to improve reproducibility. Defaults to None. 
""" + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + + # init optimizer + optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) + + # compute the max number of training steps + max_steps = compute_training_steps( + train_dataloader, + num_epochs=num_epochs, + max_steps=max_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + + # init scheduler + scheduler = Transformer.get_default_scheduler( + optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, + ) + + # fine tune super().fine_tune( train_dataloader=train_dataloader, - get_inputs=TokenClassificationProcessor.get_inputs, - n_gpu=num_gpus, - num_train_epochs=num_epochs, - weight_decay=weight_decay, - learning_rate=learning_rate, - adam_epsilon=adam_epsilon, - warmup_steps=warmup_steps, + device=device, + num_gpus=num_gpus, + get_inputs=Processor.get_inputs, + max_steps=max_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + optimizer=optimizer, + scheduler=scheduler, + local_rank=local_rank, verbose=verbose, seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, verbose=True): + def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ - Test on an evaluation dataset and get the token label predictions. + Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataset (TensorDataset): A TensorDataset for evaluation. + eval_dataloader (Dataloader): Dataloader for the evaluation data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will - be used. If set to 0 or GPUs are not available, CPU device will - be used. Defaults to None. - verbose (bool, optional): Verbose model. - Defaults to False. - - Returns: - ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is - [number_of_examples, sequence_length, number_of_labels]. Each - value in the ndarray is not normalized. Post-process will be needed - to get the probability for each class label. + be used. If set to 0 or GPUs are not available, CPU device will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + verbose (bool, optional): Whether to print out the training log. Defaults to True. + + Returns + 1darray: numpy array of predicted label indices. 
""" + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) + preds = list( super().predict( - eval_dataloader=eval_dataloader, - get_inputs=TokenClassificationProcessor.get_inputs, - n_gpu=num_gpus, - verbose=verbose, + eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, ) ) - preds_np = np.concatenate(preds) - return preds_np + preds = np.concatenate(preds) + return np.argmax(preds, axis=1) def get_predicted_token_labels(self, predictions, label_map, dataset): """ @@ -386,9 +398,7 @@ def get_predicted_token_labels(self, predictions, label_map, dataset): num_samples = len(dataset.tensors[0]) if num_samples != predictions.shape[0]: raise ValueError( - "Predictions have {0} samples, but got {1} samples in dataset".format( - predictions.shape[0], num_samples - ) + "Predictions have {0} samples, but got {1} samples in dataset".format(predictions.shape[0], num_samples) ) label_id2str = {v: k for k, v in label_map.items()} From 699092593905388378f278f4117bac8ada4b39a6 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Thu, 16 Jan 2020 18:29:15 +0000 Subject: [PATCH 11/24] additional ordering of things --- utils_nlp/dataset/bbc_hindi.py | 11 ++-- utils_nlp/dataset/dac.py | 1 - utils_nlp/dataset/wikigold.py | 56 +++++++------------ utils_nlp/models/transformers/common.py | 40 +++++++++---- .../models/transformers/question_answering.py | 14 ++--- .../transformers/sequence_classification.py | 27 ++++----- 6 files changed, 69 insertions(+), 80 deletions(-) diff --git a/utils_nlp/dataset/bbc_hindi.py b/utils_nlp/dataset/bbc_hindi.py index 08a779049..c24710680 100644 --- a/utils_nlp/dataset/bbc_hindi.py +++ b/utils_nlp/dataset/bbc_hindi.py @@ -12,7 +12,6 @@ import tarfile from tempfile import TemporaryDirectory -import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder @@ -93,7 +92,7 @@ def load_tc_dataset( cache_dir (str, optional): The default folder for saving cache files. Defaults to TemporaryDirectory().name. max_len (int, optional): Maximum length of the list of tokens. Lists longer - than this are truncated and shorter ones are padded with "O"s. + than this are truncated and shorter ones are padded with "O"s. Default value is BERT_MAX_LEN=512. batch_size (int, optional): The batch size for training and testing. Defaults to 32. @@ -105,12 +104,12 @@ def load_tc_dataset( train_dataloader (DataLoader): a PyTorch DataLoader instance for training. test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. - + label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. - + test_labels (Series): a Pandas Series of testing label (in label ID format). If - the labels are in raw label values format, we will need to transform it to + the labels are in raw label values format, we will need to transform it to label IDs by using the label_encoder.transform function. """ @@ -172,7 +171,7 @@ def load_tc_dataset( def get_label_values(label_encoder, label_ids): """ - Get the label values from label IDs. + Get the label values from label IDs. 
Args: label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance diff --git a/utils_nlp/dataset/dac.py b/utils_nlp/dataset/dac.py index 750e95915..c8af1ad87 100644 --- a/utils_nlp/dataset/dac.py +++ b/utils_nlp/dataset/dac.py @@ -12,7 +12,6 @@ import os from tempfile import TemporaryDirectory -import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder diff --git a/utils_nlp/dataset/wikigold.py b/utils_nlp/dataset/wikigold.py index 4713451fb..508d5dc56 100644 --- a/utils_nlp/dataset/wikigold.py +++ b/utils_nlp/dataset/wikigold.py @@ -7,18 +7,19 @@ https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data """ -import random +import logging import os +import random +from tempfile import TemporaryDirectory + import pandas as pd -import logging -from tempfile import TemporaryDirectory -from utils_nlp.dataset.url_utils import maybe_download +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.ner_utils import preprocess_conll +from utils_nlp.dataset.url_utils import maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor - URL = ( "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets" "/master/data/wikigold/CONLL-format/data/wikigold.conll.txt" @@ -91,7 +92,7 @@ def load_dataset( max_len=MAX_SEQ_LEN, trailing_piece_tag="X", batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the wikigold dataset and split into training and testing datasets. @@ -155,9 +156,7 @@ def load_dataset( """ train_df, test_df = load_train_test_dfs( - local_cache_path=local_path, - test_fraction=test_fraction, - random_seed=random_seed + local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed ) if train_sample_ratio > 1.0: @@ -166,7 +165,7 @@ def load_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -179,47 +178,34 @@ def load_dataset( if test_sample_ratio < 1.0: test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True) - processor = TokenClassificationProcessor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) label_map = TokenClassificationProcessor.create_label_map( - label_lists=train_df['labels'], - trailing_piece_tag=trailing_piece_tag + label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag ) train_dataset = processor.preprocess_for_bert( - text=train_df['sentence'], + text=train_df["sentence"], max_len=max_len, - labels=train_df['labels'], + labels=train_df["labels"], label_map=label_map, - trailing_piece_tag=trailing_piece_tag + trailing_piece_tag=trailing_piece_tag, ) test_dataset = processor.preprocess_for_bert( - text=test_df['sentence'], + text=test_df["sentence"], max_len=max_len, - labels=test_df['labels'], + labels=test_df["labels"], label_map=label_map, - trailing_piece_tag=trailing_piece_tag + trailing_piece_tag=trailing_piece_tag, ) - train_dataloader = processor.create_dataloader_from_dataset( - train_dataset, - shuffle=True, - batch_size=batch_size, - 
num_gpus=num_gpus, - distributed=False + train_dataloader = dataloader_from_dataset( + train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False ) - test_dataloader = processor.create_dataloader_from_dataset( - test_dataset, - shuffle=False, - batch_size=batch_size, - num_gpus=num_gpus, - distributed=False + test_dataloader = dataloader_from_dataset( + test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False ) return (train_dataloader, test_dataloader, label_map, test_dataset) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index ccaf48b46..7fce22c6b 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -12,7 +12,7 @@ import numpy as np import torch -from tqdm import tqdm, trange +from tqdm import tqdm from transformers import AdamW, get_linear_schedule_with_warmup from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -23,6 +23,8 @@ from transformers.tokenization_roberta import RobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device + TOKENIZER_CLASS = {} TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) TOKENIZER_CLASS.update({k: RobertaTokenizer for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -101,9 +103,9 @@ def get_default_scheduler(optimizer, warmup_steps, num_training_steps): def fine_tune( self, train_dataloader, - device, - num_gpus, get_inputs, + num_gpus=None, + gpu_ids=None, max_steps=-1, max_grad_norm=1.0, gradient_accumulation_steps=1, @@ -118,6 +120,9 @@ def fine_tune( clip_grad_norm=True, ): + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + if seed is not None: Transformer.set_seed(seed, num_gpus > 0) @@ -128,6 +133,9 @@ def fine_tune( raise ImportError("Please install apex from https://www.github.com/nvidia/apex") self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + # init training global_step = 0 tr_loss = 0.0 @@ -152,22 +160,25 @@ def fine_tune( if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - if clip_grad_norm: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) else: loss.backward() - if clip_grad_norm: - torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) tr_loss += loss.item() - accum_loss += loss.item() + if (step + 1) % gradient_accumulation_steps == 0: global_step += 1 + + if clip_grad_norm: + if fp16: + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) + else: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) + if global_step % report_every == 0 and verbose: end = time.time() print( - "loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format( + "loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format( accum_loss / report_every, end - start, len(batch), global_step, max_steps, ) ) @@ -185,9 +196,16 @@ def fine_tune( return global_step, tr_loss / global_step - def predict(self, eval_dataloader, device, get_inputs, verbose=True): + def predict(self, eval_dataloader, get_inputs, num_gpus, 
gpu_ids, verbose=True): + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) + + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) + + # predict self.model.eval() - for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): + for batch in tqdm(eval_dataloader, desc="Scoring", disable=not verbose): with torch.no_grad(): inputs = get_inputs(batch, device, self.model_name, train_mode=False) outputs = self.model(**inputs) diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py index 99cd59724..c0415a579 100755 --- a/utils_nlp/models/transformers/question_answering.py +++ b/utils_nlp/models/transformers/question_answering.py @@ -184,7 +184,7 @@ def preprocess( answer texts from predicted answer start and answer end indices. Defaults to "./cached_qa_features". Returns: - DataSet: A Pytorch DataSet. + DataSet: A Pytorch DataSet. """ if not os.path.exists(feature_cache_dir): @@ -509,12 +509,6 @@ def fit( """ - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) @@ -534,9 +528,9 @@ def fit( # fine tune super().fine_tune( train_dataloader=train_dataloader, - device=device, - num_gpus=num_gpus, get_inputs=QAProcessor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, @@ -555,7 +549,7 @@ def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): Predicts answer start and end logits. Args: - test_dataloader (QADataset): Dataloader for the testing data. + test_dataloader (DataLoader): DataLoader for scoring the data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 4d26e39f6..e8a4a288b 100755 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -11,7 +11,7 @@ from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification -from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device +from utils_nlp.common.pytorch_utils import compute_training_steps from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet @@ -249,12 +249,6 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. 
""" - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) @@ -274,9 +268,9 @@ def fit( # fine tune super().fine_tune( train_dataloader=train_dataloader, - device=device, - num_gpus=num_gpus, get_inputs=Processor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, @@ -286,12 +280,12 @@ def fit( seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): + def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataloader (Dataloader): Dataloader for the evaluation data. + test_dataloader (DataLoader): DataLoader for scoring the data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. @@ -304,14 +298,13 @@ def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): 1darray: numpy array of predicted label indices. """ - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) - preds = list( super().predict( - eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, + eval_dataloader=test_dataloader, + get_inputs=Processor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, + verbose=verbose, ) ) preds = np.concatenate(preds) From 82816318dca76ac3a459f27a45b4a89df5ac1010 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Thu, 16 Jan 2020 19:14:24 +0000 Subject: [PATCH 12/24] update summarization files --- ...test_notebooks_extractive_summarization.py | 12 +- tests/unit/test_extractive_summarization.py | 45 +++---- utils_nlp/eval/evaluate_summarization.py | 12 +- .../transformers/extractive_summarization.py | 114 ++++++++---------- 4 files changed, 78 insertions(+), 105 deletions(-) diff --git a/tests/integration/test_notebooks_extractive_summarization.py b/tests/integration/test_notebooks_extractive_summarization.py index a39ab0c1d..fdb9cfebf 100644 --- a/tests/integration/test_notebooks_extractive_summarization.py +++ b/tests/integration/test_notebooks_extractive_summarization.py @@ -1,14 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import os -import json -import shutil -import pytest import papermill as pm +import pytest import scrapbook as sb -from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME - +from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK ABS_TOL = 0.02 @@ -31,7 +27,7 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp): CACHE_DIR=tmp, BATCH_SIZE=3000, REPORT_EVERY=50, - MAX_STEPS=1e3, + MAX_STEPS=1000, WARMUP_STEPS=5e2, MODEL_NAME="distilbert-base-uncased", ), @@ -39,5 +35,3 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp): result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict print(result) assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL) - - diff --git a/tests/unit/test_extractive_summarization.py b/tests/unit/test_extractive_summarization.py index 40cacbeca..797e631e5 100644 --- a/tests/unit/test_extractive_summarization.py +++ b/tests/unit/test_extractive_summarization.py @@ -1,14 +1,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import nltk +import os +import nltk nltk.download("punkt") -from nltk import tokenize import pytest -import os -import shutil - +from nltk import tokenize from utils_nlp.models.transformers.datasets import SummarizationDataset from utils_nlp.models.transformers.extractive_summarization import ( @@ -17,6 +15,9 @@ ExtSumProcessor, ) + + + # @pytest.fixture() def source_data(): return ( @@ -48,18 +49,10 @@ def data_to_file(tmp_module): f.write(target) f.close() train_dataset = SummarizationDataset( - source_file, - target_file, - [tokenize.sent_tokenize], - [tokenize.sent_tokenize], - nltk.word_tokenize, + source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize, ) test_dataset = SummarizationDataset( - source_file, - target_file, - [tokenize.sent_tokenize], - [tokenize.sent_tokenize], - nltk.word_tokenize, + source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize, ) processor = ExtSumProcessor( @@ -70,20 +63,12 @@ def data_to_file(tmp_module): min_nsents=0, min_src_ntokens=1, ) - ext_sum_train = processor.preprocess( - train_dataset, train_dataset.get_target(), oracle_mode="greedy" - ) - ext_sum_test = processor.preprocess( - test_dataset, test_dataset.get_target(), oracle_mode="greedy" - ) + ext_sum_train = processor.preprocess(train_dataset, train_dataset.get_target(), oracle_mode="greedy") + ext_sum_test = processor.preprocess(test_dataset, test_dataset.get_target(), oracle_mode="greedy") save_path = os.path.join(tmp_module, "processed") - train_files = ExtSumProcessedData.save_data( - ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000 - ) - test_files = ExtSumProcessedData.save_data( - ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000 - ) + train_files = ExtSumProcessedData.save_data(ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000) + test_files = ExtSumProcessedData.save_data(ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000) print(train_files) print(test_files) assert os.path.exists(train_files[0]) @@ -96,10 +81,10 @@ def test_bert_training(data_to_file, tmp_module): CACHE_DIR = tmp_module ENCODER = "transformer" - BATCH_SIZE = 200 + BATCH_SIZE = 128 LEARNING_RATE = 2e-3 - REPORT_EVERY = 100 - MAX_STEPS = 5e2 + REPORT_EVERY = 50 + MAX_STEPS = 2e2 WARMUP_STEPS = 1e2 DATA_SAVED_PATH = data_to_file result_base_path = "./results" diff --git a/utils_nlp/eval/evaluate_summarization.py 
b/utils_nlp/eval/evaluate_summarization.py index 4912717e9..421d48f13 100644 --- a/utils_nlp/eval/evaluate_summarization.py +++ b/utils_nlp/eval/evaluate_summarization.py @@ -3,22 +3,24 @@ import os from random import random, seed + from bertsum.others.utils import test_rouge -def get_rouge(predictions, targets, temp_dir): +def get_rouge(predictions, targets, temp_dir, random_seed=42): """ function to get the rouge metric for the prediction and the reference. Args: predictions (list of strings): Predictions to be compared. target (list of strings): References - temp_dir (str): Path where temporary folders are created to host the files - generated by ROUGE applicatoin. + temp_dir (str): Path where temporary folders are created to host the files + generated by ROUGE application. + seed (int, optional): Random seed. Defaults to 42. Return: dictionary: rouge metric - + """ def _write_list_to_file(list_items, filename): @@ -27,7 +29,7 @@ def _write_list_to_file(list_items, filename): for item in list_items: filehandle.write("%s\n" % item) - seed(42) + seed(random_seed) random_number = random() os.makedirs(temp_dir, exist_ok=True) candidate_path = os.path.join(temp_dir, "candidate" + str(random_number)) diff --git a/utils_nlp/models/transformers/extractive_summarization.py b/utils_nlp/models/transformers/extractive_summarization.py index 426f9002c..54a7f64ef 100644 --- a/utils_nlp/models/transformers/extractive_summarization.py +++ b/utils_nlp/models/transformers/extractive_summarization.py @@ -5,24 +5,22 @@ import itertools import logging -import numpy as np import os import random + +import numpy as np import torch -import torch.nn as nn -from torch.utils.data import Dataset, IterableDataset -from torch.utils.data import DataLoader, SequentialSampler +from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSampler # from torch.utils.data.distributed import DistributedSampler -from transformers import DistilBertModel, BertModel +from transformers import BertModel, DistilBertModel -from bertsum.models import model_builder, data_loader +from bertsum.models import data_loader, model_builder from bertsum.models.data_loader import Batch from bertsum.models.model_builder import Summarizer - -from utils_nlp.common.pytorch_utils import get_device -from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer +from utils_nlp.common.pytorch_utils import compute_training_steps, get_device from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection +from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": DistilBertModel} @@ -42,8 +40,8 @@ def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000): Args: data_iter (generator): data generator. - shuffle (bool): whether the data is shuffled - is_labeled (bool): it specifies whether the data objects are labeled data. + shuffle (bool): whether the data is shuffled. + is_labeled (bool): specifies whether the data objects are labeled data. batch_size (int): number of tokens per batch. 
Returns: @@ -79,9 +77,7 @@ def get_stream(self): if self.is_shuffle: return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list))) else: - return itertools.chain.from_iterable( - map(get_dataset, itertools.cycle(random.shuffle(self.file_list))) - ) + return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))) def __iter__(self): return self.get_stream() @@ -114,9 +110,7 @@ def __getitem__(self, idx): return self.data[idx] -def get_pred( - example, sent_scores, cal_lead=False, sentence_separator="", block_trigram=True, top_n=3 -): +def get_pred(example, sent_scores, cal_lead=False, sentence_separator="", block_trigram=True, top_n=3): """ Get the summarization prediction for the paragraph example based on the scores returned by the transformer summarization model. @@ -229,9 +223,7 @@ def _chunks(iterable, chunk_size): def _get_files(self, root): train_files = [] test_files = [] - files = [ - os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f)) - ] + files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))] for fname in files: if fname.find("train") != -1: train_files.append(fname) @@ -324,7 +316,7 @@ def model_name(self, value): self._model_name = value @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ Creates an input dictionary given a model name. @@ -332,6 +324,7 @@ def get_inputs(batch, model_name, train_mode=True): batch (object): A Batch containing input ids, segment ids, sentence class ids, masks for the input ids, masks for sentence class ids and source text. If train_model is True, it also contains the labels and target text. + device (torch.device): A PyTorch device. model_name (bool, optional): Model name used to format the inputs. train_mode (bool, optional): Training mode flag. Defaults to True. @@ -345,6 +338,7 @@ def get_inputs(batch, model_name, train_mode=True): if model_name.split("-")[0] in ["bert", "distilbert"]: if train_mode: # labels must be the last + batch = batch.to(device) return { "x": batch.src, "segs": batch.segs, @@ -354,6 +348,14 @@ def get_inputs(batch, model_name, train_mode=True): "labels": batch.labels, } else: + batch["src"] = batch["src"].to(device) + batch["segs"] = batch["segs"].to(device) + batch["clss"] = batch["clss"].to(device) + batch["mask"] = batch["mask"].to(device) + batch["mask_cls"] = batch["mask_cls"].to(device) + if "labels" in batch: + batch["labels"] = batch["labels"].to(device) + batch = Bunch(batch) return { "x": batch.src, "segs": batch.segs, @@ -489,9 +491,7 @@ def __init__(self, model_name="distilbert-base-uncased", encoder="transformer", cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". """ - super().__init__( - model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir - ) + super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir) if model_name not in self.list_supported_models(): raise ValueError( "Model name {} is not supported by ExtractiveSummarizer. " @@ -522,6 +522,7 @@ def fit( self, train_dataset, num_gpus=None, + gpu_ids=None, batch_size=3000, local_rank=-1, max_steps=5e5, @@ -546,6 +547,9 @@ def fit( num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. 
+ gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. batch_size (int, optional): Maximum number of tokens in each batch. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed training. @@ -571,16 +575,7 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. """ - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - def move_batch_to_device(batch, device): - return batch.to(device) - - # if isinstance(self.model, nn.DataParallel): - # self.model.module.to(device) - # else: - self.model.to(device) - + # init optimizer optimizer = model_builder.build_optim( optimization_method, learning_rate, @@ -594,31 +589,34 @@ def move_batch_to_device(batch, device): ) # batch_size is the number of tokens in a batch - train_dataloader = get_dataloader( - train_dataset.get_stream(), is_labeled=True, batch_size=batch_size + train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size) + + # compute the max number of training steps + max_steps = compute_training_steps( + train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, ) super().fine_tune( train_dataloader=train_dataloader, get_inputs=ExtSumProcessor.get_inputs, - move_batch_to_device=move_batch_to_device, - n_gpu=num_gpus, - num_train_epochs=-1, + num_gpus=num_gpus, + gpu_ids=gpu_ids, max_steps=max_steps, - optimizer=optimizer, - warmup_steps=warmup_steps, + max_grad_norm=max_grad_norm, gradient_accumulation_steps=gradient_accumulation_steps, + optimizer=optimizer, + scheduler=None, verbose=verbose, seed=seed, report_every=report_every, clip_grad_norm=False, - max_grad_norm=max_grad_norm, ) def predict( self, test_dataset, num_gpus=1, + gpu_ids=None, batch_size=16, sentence_separator="", top_n=3, @@ -632,6 +630,9 @@ def predict( Args: test_dataset (Dataset): Dataset for which the summary to be predicted num_gpus (int, optional): The number of GPUs used in prediction. Defaults to 1. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. batch_size (int, optional): The number of test examples in each batch. Defaults to 16. sentence_separator (str, optional): String to be inserted between sentences in the prediction. Defaults to ''. @@ -678,10 +679,8 @@ def collate_fn(dict_list): } test_sampler = SequentialSampler(test_dataset) - test_dataloader = DataLoader( - test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn - ) - sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus) + test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn) + sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids) sent_scores_list = list(sent_scores) scores_list = [] for i in sent_scores_list: @@ -699,15 +698,18 @@ def collate_fn(dict_list): prediction.extend(temp_pred) return prediction - def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True): + def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataloader (Dataloader): Dataloader for the evaluation data. + test_dataloader (Dataloader): Dataloader for scoring the data. num_gpus (int, optional): The number of GPUs to use. 
If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. verbose (bool, optional): Whether to print out the training log. Defaults to True. Returns @@ -716,23 +718,13 @@ def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True): device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - def move_batch_to_device(batch, device): - batch["src"] = batch["src"].to(device) - batch["segs"] = batch["segs"].to(device) - batch["clss"] = batch["clss"].to(device) - batch["mask"] = batch["mask"].to(device) - batch["mask_cls"] = batch["mask_cls"].to(device) - if "labels" in batch: - batch["labels"] = batch["labels"].to(device) - return Bunch(batch) - preds = list( super().predict( - eval_dataloader=eval_dataloader, + eval_dataloader=test_dataloader, get_inputs=ExtSumProcessor.get_inputs, - n_gpu=num_gpus, + num_gpus=num_gpus, + gpu_ids=gpu_ids, verbose=verbose, - move_batch_to_device=move_batch_to_device, ) ) return preds From b76750aaa691c0f391c0c384eaff3e851bd533ce Mon Sep 17 00:00:00 2001 From: saidbleik Date: Fri, 17 Jan 2020 02:16:10 +0000 Subject: [PATCH 13/24] NER updates --- .../ner_wikigold_transformer.ipynb | 2 +- tests/unit/test_bert_token_classification.py | 79 ------------------- .../test_transformers_token_classification.py | 31 ++++++++ utils_nlp/dataset/wikigold.py | 14 ++-- utils_nlp/models/transformers/common.py | 3 + .../transformers/named_entity_recognition.py | 47 +++++------ 6 files changed, 60 insertions(+), 116 deletions(-) delete mode 100644 tests/unit/test_bert_token_classification.py create mode 100644 tests/unit/test_transformers_token_classification.py diff --git a/examples/named_entity_recognition/ner_wikigold_transformer.ipynb b/examples/named_entity_recognition/ner_wikigold_transformer.ipynb index 8bbc82a7c..f077f8d62 100644 --- a/examples/named_entity_recognition/ner_wikigold_transformer.ipynb +++ b/examples/named_entity_recognition/ner_wikigold_transformer.ipynb @@ -233,7 +233,7 @@ "source": [ "with Timer() as t:\n", " preds = model.predict(\n", - " eval_dataloader=test_dataloader,\n", + " test_dataloader=test_dataloader,\n", " num_gpus=None,\n", " verbose=True\n", " )\n", diff --git a/tests/unit/test_bert_token_classification.py b/tests/unit/test_bert_token_classification.py deleted file mode 100644 index c3a46584f..000000000 --- a/tests/unit/test_bert_token_classification.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
- -import pytest - -from utils_nlp.models.bert.token_classification import ( - BERTTokenClassifier, - postprocess_token_labels, -) - - -def test_token_classifier_num_labels(): - with pytest.raises(ValueError): - BERTTokenClassifier(num_labels=1) - - -def test_token_classifier_fit_predict(tmp_path, ner_test_data): - token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=tmp_path) - - # test fit, no warmup - token_classifier.fit( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - ) - - # test fit, with warmup - token_classifier.fit( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - warmup_proportion=0.1, - ) - # test predict, no labels - token_classifier.predict( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - ) - - # test predict, with labels - token_classifier.predict( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - ) - - # test output probabilities - predictions = token_classifier.predict( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - probabilities=True, - ) - assert len(predictions.classes) == predictions.probabilities.shape[0] - - -def test_postprocess_token_labels(ner_test_data): - labels_no_padding = postprocess_token_labels( - labels=ner_test_data["PREDICTED_LABELS"], - input_mask=ner_test_data["INPUT_MASK"], - label_map=ner_test_data["LABEL_MAP"], - ) - - assert labels_no_padding == ner_test_data["EXPECTED_TOKENS_NO_PADDING"] - - -def test_postprocess_token_labels_remove_trailing(ner_test_data): - labels_no_padding_no_trailing = postprocess_token_labels( - labels=ner_test_data["PREDICTED_LABELS"], - input_mask=ner_test_data["INPUT_MASK"], - label_map=ner_test_data["LABEL_MAP"], - remove_trailing_word_pieces=True, - trailing_token_mask=ner_test_data["TRAILING_TOKEN_MASK"], - ) - - assert ( - labels_no_padding_no_trailing - == ner_test_data["EXPECTED_TOKENS_NO_PADDING_NO_TRAILING"] - ) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py new file mode 100644 index 000000000..eda90c6d4 --- /dev/null +++ b/tests/unit/test_transformers_token_classification.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import pytest + +from utils_nlp.common.pytorch_utils import dataloader_from_dataset +from utils_nlp.models.transformers.named_entity_recognition import ( + TokenClassificationProcessor, + TokenClassifier, +) + + +def test_token_classifier_num_labels(): + with pytest.raises(ValueError): + TokenClassifier(num_labels=1) + + +def test_token_classifier_fit_predict(tmp_path, ner_test_data): + token_classifier = TokenClassifier(num_labels=6, cache_dir=tmp_path) + processor = TokenClassificationProcessor(cache_dir=tmp_path) + + # test fit, no warmup + train_dataset = processor.preprocess_for_bert( + text=ner_test_data["INPUT_TEXT"], labels=ner_test_data["INPUT_LABELS"], label_map=ner_test_data["LABEL_MAP"], + ) + train_dataloader = dataloader_from_dataset(train_dataset) + token_classifier.fit(train_dataloader) + + # test predict, no labels + preds = token_classifier.predict(train_dataloader, verbose=False) + assert len(preds) == len(ner_test_data["INPUT_LABELS"]) diff --git a/utils_nlp/dataset/wikigold.py b/utils_nlp/dataset/wikigold.py index 508d5dc56..32a0c5420 100644 --- a/utils_nlp/dataset/wikigold.py +++ b/utils_nlp/dataset/wikigold.py @@ -117,7 +117,7 @@ def load_dataset( cache_dir (str, optional): The default folder for saving cache files. Defaults to './temp'. max_len (int, optional): Maximum length of the list of tokens. Lists longer - than this are truncated and shorter ones are padded with "O"s. + than this are truncated and shorter ones are padded with "O"s. Default value is BERT_MAX_LEN=512. trailing_piece_tag (str, optional): Tag used to label trailing word pieces. For example, "criticize" is broken into "critic" and "##ize", "critic" @@ -130,16 +130,12 @@ def load_dataset( Returns: tuple. The tuple contains four elements. - train_dataload (DataLoader): a PyTorch DataLoader instance for training. - - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. - - label_map (dict): A dictionary object to map a label (str) to an ID (int). - + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. + label_map (dict): A dictionary object to map a label (str) to an ID (int). test_dataset (TensorDataset): A TensorDataset containing the following four tensors. 1. input_ids_all: Tensor. Each sublist contains numerical values, - i.e. token ids, corresponding to the tokens in the input - text data. + i.e. token ids, corresponding to the tokens in the input text data. 2. input_mask_all: Tensor. Each sublist contains the attention mask of the input token id list, 1 for input tokens and 0 for padded tokens, so that padded tokens are not attended to. diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 7fce22c6b..9808719a7 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -41,6 +41,9 @@ def __init__( self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None, ): + if num_labels < 2: + raise ValueError("Number of labels should be at least 2.") + if model_name not in self.list_supported_models(): raise ValueError( "Model name {0} is not supported by {1}. 
" diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py index 169bb21c8..76d0b5e37 100755 --- a/utils_nlp/models/transformers/named_entity_recognition.py +++ b/utils_nlp/models/transformers/named_entity_recognition.py @@ -6,10 +6,11 @@ import numpy as np import torch +from torch.utils.data import TensorDataset from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification -from utils_nlp.common.pytorch_utils import get_device, move_model_to_device +from utils_nlp.common.pytorch_utils import compute_training_steps from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer TC_MODEL_CLASS = {} @@ -129,7 +130,7 @@ def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map= Returns: TensorDataset: A TensorDataset containing the following four tensors. 1. input_ids_all: Tensor. Each sublist contains numerical values, - i.e. token ids, corresponding to the tokens in the input + i.e. token ids, corresponding to the tokens in the input text data. 2. input_mask_all: Tensor. Each sublist contains the attention mask of the input token id list, 1 for input tokens and 0 for @@ -229,14 +230,14 @@ def _is_iterable_but_not_string(obj): td = TensorDataset( torch.tensor(input_ids_all, dtype=torch.long), torch.tensor(input_mask_all, dtype=torch.long), - torch.tensor(trailing_token_mask_all, dtype=torch.bool), + torch.tensor(trailing_token_mask_all, dtype=torch.long), torch.tensor(label_ids_all, dtype=torch.long), ) else: td = TensorDataset( torch.tensor(input_ids_all, dtype=torch.long), torch.tensor(input_mask_all, dtype=torch.long), - torch.tensor(trailing_token_mask_all, dtype=torch.bool), + torch.tensor(trailing_token_mask_all, dtype=torch.long), ) return td @@ -311,12 +312,6 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. """ - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) @@ -336,9 +331,9 @@ def fit( # fine tune super().fine_tune( train_dataloader=train_dataloader, - device=device, + get_inputs=TokenClassificationProcessor.get_inputs, num_gpus=num_gpus, - get_inputs=Processor.get_inputs, + gpu_ids=gpu_ids, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, @@ -348,12 +343,12 @@ def fit( seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): + def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataloader (Dataloader): Dataloader for the evaluation data. + test_dataloader (DataLoader): DataLoader for scoring the data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. @@ -366,18 +361,16 @@ def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): 1darray: numpy array of predicted label indices. 
""" - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) - preds = list( super().predict( - eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, + eval_dataloader=test_dataloader, + get_inputs=TokenClassificationProcessor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, + verbose=verbose, ) ) - preds = np.concatenate(preds) - return np.argmax(preds, axis=1) + return np.concatenate(preds) def get_predicted_token_labels(self, predictions, label_map, dataset): """ @@ -386,13 +379,13 @@ def get_predicted_token_labels(self, predictions, label_map, dataset): Args: predictions (ndarray): A numpy ndarray produced from the `predict` function call. The shape of the ndarray is [number_of_examples, sequence_length, number_of_labels]. - label_map (dict): A dictionary object to map a label (str) to an ID (int). + label_map (dict): A dictionary object to map a label (str) to an ID (int). dataset (TensorDataset): The TensorDataset for evaluation. dataset (Dataset): The test Dataset instance. Returns: list: A list of lists. The size of the retured list is the number of testing samples. - Each sublist represents the predicted label for each token. + Each sublist represents the predicted label for each token. """ num_samples = len(dataset.tensors[0]) @@ -417,7 +410,7 @@ def get_predicted_token_labels(self, predictions, label_map, dataset): if attention_mask[sid] == 0: break - if not trailing_mask[sid]: + if not bool(trailing_mask[sid]): continue label_id = seq_probs[sid].argmax() @@ -430,13 +423,13 @@ def get_true_test_labels(self, label_map, dataset): Get the true testing label values. Args: - label_map (dict): A dictionary object to map a label (str) to an ID (int). + label_map (dict): A dictionary object to map a label (str) to an ID (int). dataset (TensorDataset): The TensorDataset for evaluation. dataset (Dataset): The test Dataset instance. Returns: list: A list of lists. The size of the retured list is the number of testing samples. - Each sublist represents the predicted label for each token. + Each sublist represents the predicted label for each token. 
""" num_samples = len(dataset.tensors[0]) From 97f6f0814bd6972a2921f2a3368d14e1655b99a4 Mon Sep 17 00:00:00 2001 From: Said Bleik Date: Fri, 17 Jan 2020 00:06:37 -0500 Subject: [PATCH 14/24] Update test_notebooks_extractive_summarization.py --- tests/integration/test_notebooks_extractive_summarization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_notebooks_extractive_summarization.py b/tests/integration/test_notebooks_extractive_summarization.py index fdb9cfebf..4f9e17f79 100644 --- a/tests/integration/test_notebooks_extractive_summarization.py +++ b/tests/integration/test_notebooks_extractive_summarization.py @@ -33,5 +33,4 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp): ), ) result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict - print(result) assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL) From b6424d173e2f253f2b36e8320edd614bdfdae680 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 06:09:20 +0000 Subject: [PATCH 15/24] update pytorch_utils tests --- tests/unit/test_common_pytorch_utils.py | 31 ++++++++++------------- utils_nlp/common/pytorch_utils.py | 33 ++++++++++++++++--------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py index e2fce1e10..4cdb125c6 100644 --- a/tests/unit/test_common_pytorch_utils.py +++ b/tests/unit/test_common_pytorch_utils.py @@ -5,10 +5,10 @@ import pytest import torch import torch.nn as nn -from torch.nn.parallel.data_parallel import DataParallel from torch.nn.modules.container import Sequential +from torch.nn.parallel.data_parallel import DataParallel -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device @pytest.fixture @@ -55,49 +55,47 @@ def test_get_device_local_rank(): def test_move_to_device_cpu(model): # test when device.type="cpu" - model_cpu = move_to_device(model, torch.device("cpu")) + model_cpu = move_model_to_device(model, torch.device("cpu")) assert isinstance(model_cpu, nn.modules.container.Sequential) def test_move_to_device_cpu_parallelized(model): # test when input model is parallelized model_parallelized = nn.DataParallel(model) - model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu")) + model_parallelized_output = move_model_to_device(model_parallelized, torch.device("cpu")) assert isinstance(model_parallelized_output, nn.modules.container.Sequential) def test_move_to_device_exception_not_torch_device(model): # test when device is not torch.device with pytest.raises(ValueError): - move_to_device(model, "abc") + move_model_to_device(model, "abc") def test_move_to_device_exception_wrong_type(model): # test when device.type is not "cuda" or "cpu" with pytest.raises(Exception): - move_to_device(model, torch.device("opengl")) + move_model_to_device(model, torch.device("opengl")) -@pytest.mark.skipif( - torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine" -) +@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine") def test_move_to_device_exception_gpu_model_on_cpu_machine(model): # test when the model is moved to a gpu but it is a cpu machine with pytest.raises(Exception): - move_to_device(model, torch.device("cuda")) + move_model_to_device(model, torch.device("cuda")) @pytest.mark.gpu def test_move_to_device_exception_cuda_zero_gpus(model): # 
test when device.type is cuda, but num_gpus is 0 with pytest.raises(ValueError): - move_to_device(model, torch.device("cuda"), num_gpus=0) + move_model_to_device(model, torch.device("cuda"), num_gpus=0) @pytest.mark.gpu def test_move_to_device_gpu(model): # test when device.type="cuda" - model_cuda = move_to_device(model, torch.device("cuda")) + model_cuda = move_model_to_device(model, torch.device("cuda")) num_cuda_devices = torch.cuda.device_count() if num_cuda_devices > 1: @@ -105,20 +103,17 @@ def test_move_to_device_gpu(model): else: assert isinstance(model_cuda, Sequential) - model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1) + model_cuda_1_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=1) assert isinstance(model_cuda_1_gpu, Sequential) - model_cuda_1_more_gpu = move_to_device( - model, torch.device("cuda"), num_gpus=num_cuda_devices + 1 - ) + model_cuda_1_more_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices + 1) if num_cuda_devices > 1: assert isinstance(model_cuda_1_more_gpu, DataParallel) else: assert isinstance(model_cuda_1_more_gpu, Sequential) - model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices) + model_cuda_same_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices) if num_cuda_devices > 1: assert isinstance(model_cuda_same_gpu, DataParallel) else: assert isinstance(model_cuda_same_gpu, Sequential) - diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 89f98ab2a..432692380 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -3,10 +3,7 @@ """Common PyTorch utilities that facilitate building Pytorch models.""" -import warnings - import torch -import torch.nn as nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler @@ -42,15 +39,20 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= If set to None, all available GPUs will be used. Defaults to None. gpu_ids (list): List of GPU IDs to be used. - If set to None, the first num_gpus GPUs will be used. + If None, the first num_gpus GPUs will be used. + If not None, overrides num_gpus. Defaults to None. local_rank (int): Local GPU ID within a node. Used in distributed environments. + If not -1, num_gpus and gpu_ids are ignored. Defaults to -1. - + Returns: Module, DataParallel, DistributedDataParallel: A PyTorch Module or a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used). 
""" + if not isinstance(device, torch.device): + raise ValueError("device must be of type torch.device.") + # unwrap model if isinstance(model, torch.nn.DataParallel): model = model.module @@ -60,10 +62,18 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= self.model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, ) else: - if num_gpus > 1: + if device.type == "cuda": + if num_gpus is not None: + if num_gpus < 1: + raise ValueError("num_gpus must be at least 1 or None") + num_cuda_devices = torch.cuda.device_count() + if num_cuda_devices < 1: + raise Exception("CUDA devices are not available.") if gpu_ids is None: + num_gpus = num_cuda_devices if num_gpus is None else min(num_gpus, num_cuda_devices) gpu_ids = list(range(num_gpus)) - model = torch.nn.DataParallel(model, device_ids=gpu_ids) + if len(gpu_ids) > 1: + model = torch.nn.DataParallel(model, device_ids=gpu_ids) # move to device return model.to(device) @@ -94,9 +104,10 @@ def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, dis return DataLoader(ds, sampler=sampler, batch_size=batch_size) + def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1): - """Computes the max training steps given a dataloader. - + """Computes the max training steps given a dataloader. + Args: dataloader (Dataloader): A PyTorch DataLoader. num_epochs (int, optional): Number of training epochs. Defaults to 1. @@ -107,7 +118,7 @@ def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accu gradient_accumulation_steps (int, optional): Number of steps to accumulate before performing a backward/update pass. Default to 1. - + Returns: int: The max number of steps to be used in a training loop. """ @@ -120,4 +131,4 @@ def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accu max_steps = dataset_length // gradient_accumulation_steps * num_epochs if max_steps <= 0: raise Exception("Max steps cannot be determined.") - return max_steps \ No newline at end of file + return max_steps From 2b1736086f1fb3e8be4dd8cb35b8601a4890e278 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 06:15:32 +0000 Subject: [PATCH 16/24] update pytorch utils tests --- tests/unit/test_common_pytorch_utils.py | 1 + utils_nlp/common/pytorch_utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py index 4cdb125c6..7105283aa 100644 --- a/tests/unit/test_common_pytorch_utils.py +++ b/tests/unit/test_common_pytorch_utils.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +"""PyTorch utils tests.""" import pytest import torch diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 432692380..77918d70c 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-"""Common PyTorch utilities that facilitate building Pytorch models.""" +"""Common PyTorch utilities that facilitate building PyTorch models.""" import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler From 371a8582d3cb893709aee18b5068fa8d671984a6 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 17:56:03 +0000 Subject: [PATCH 17/24] update move_to_device refs --- .../models/bert/sequence_classification.py | 4 +- .../sequence_classification_distributed.py | 6 +- utils_nlp/models/bert/sequence_encoding.py | 78 +++++----------- utils_nlp/models/bert/token_classification.py | 6 +- .../models/xlnet/sequence_classification.py | 89 ++++++------------- 5 files changed, 57 insertions(+), 126 deletions(-) diff --git a/utils_nlp/models/bert/sequence_classification.py b/utils_nlp/models/bert/sequence_classification.py index 03a324604..4748ceec3 100644 --- a/utils_nlp/models/bert/sequence_classification.py +++ b/utils_nlp/models/bert/sequence_classification.py @@ -91,7 +91,7 @@ def fit( device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) input_mask_tensor = torch.tensor(input_mask, dtype=torch.long) @@ -211,7 +211,7 @@ def predict( (classes, probabilities) if probabilities is True. """ device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) # score self.model.eval() diff --git a/utils_nlp/models/bert/sequence_classification_distributed.py b/utils_nlp/models/bert/sequence_classification_distributed.py index ee5061158..d448515ba 100644 --- a/utils_nlp/models/bert/sequence_classification_distributed.py +++ b/utils_nlp/models/bert/sequence_classification_distributed.py @@ -14,7 +14,7 @@ from pytorch_pretrained_bert.optimization import BertAdam from tqdm import tqdm -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.bert.common import Language try: @@ -192,7 +192,7 @@ def fit( device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) if bert_optimizer is None: bert_optimizer = self.create_optimizer( @@ -277,7 +277,7 @@ def predict(self, test_loader, num_gpus=None, probabilities=False): a dictionary with classes, target labels, probabilities) if probabilities is True. """ device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) # score self.model.eval() diff --git a/utils_nlp/models/bert/sequence_encoding.py b/utils_nlp/models/bert/sequence_encoding.py index 088a6310d..520c56a3d 100644 --- a/utils_nlp/models/bert/sequence_encoding.py +++ b/utils_nlp/models/bert/sequence_encoding.py @@ -4,19 +4,17 @@ # This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples # /extract_features.py, with necessary modifications. 
-from pytorch_pretrained_bert.modeling import BertModel - -from utils_nlp.common.pytorch_utils import get_device, move_to_device from enum import Enum + import numpy as np import pandas as pd -import os import torch +from cached_property import cached_property +from pytorch_pretrained_bert.modeling import BertModel +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset - +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.bert.common import Language, Tokenizer -from cached_property import cached_property class PoolingStrategy(str, Enum): @@ -43,27 +41,21 @@ def __init__( pooling_strategy=PoolingStrategy.MEAN, ): """Initialize the encoder's underlying model and tokenizer - + Args: bert_model: BERT model to use for encoding. Defaults to pretrained BertModel. tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer. language: The pretrained model's language. Defaults to Language.ENGLISH. - num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used. + num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used. cache_dir: Location of BERT's cache directory. Defaults to "." to_lower: True to lowercase before tokenization. Defaults to False. max_len: Maximum number of tokens. - layer_index: The layer from which to extract features. + layer_index: The layer from which to extract features. Defaults to the last layer; can also be a list of integers for experimentation. pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding. """ - self.model = ( - bert_model.model.bert - if bert_model - else BertModel.from_pretrained(language, cache_dir=cache_dir) - ) - self.tokenizer = ( - tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir) - ) + self.model = bert_model.model.bert if bert_model else BertModel.from_pretrained(language, cache_dir=cache_dir) + self.tokenizer = tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir) self.num_gpus = num_gpus self.max_len = max_len self.layer_index = layer_index @@ -98,16 +90,17 @@ def pooling_strategy(self, pooling_strategy): def get_hidden_states(self, text, batch_size=32): """Extract the hidden states from the pretrained model - + Args: text: List of documents to extract features from. batch_size: Batch size, defaults to 32. - + Returns: - pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]). + pd.DataFrame with columns: + text_index (int), token (str), layer_index (int), values (list[float]). 
""" device, num_gpus = get_device(self.num_gpus) - self.model = move_to_device(self.model, device, self.num_gpus) + self.model = move_model_to_device(self.model, device, self.num_gpus) self.model.eval() @@ -122,9 +115,7 @@ def get_hidden_states(self, text, batch_size=32): input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device) eval_data = TensorDataset(input_ids, input_mask, input_type_ids) - eval_dataloader = DataLoader( - eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size - ) + eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size) hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []} for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader: @@ -142,9 +133,7 @@ def get_hidden_states(self, text, batch_size=32): hidden_states["text_index"].append(example_index.item()) hidden_states["token"].append(token) hidden_states["layer_index"].append(layer_index) - hidden_states["values"].append( - [round(x.item(), 6) for x in layer_output[i]] - ) + hidden_states["values"].append([round(x.item(), 6) for x in layer_output[i]]) # empty cache del [input_ids_tensor, input_mask_tensor, example_indices_tensor] @@ -158,7 +147,7 @@ def get_hidden_states(self, text, batch_size=32): def pool(self, df): """Pooling to aggregate token-wise embeddings to sentence embeddings - + Args: df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]) @@ -167,31 +156,16 @@ def pool(self, df): """ def max_pool(x): - values = np.array( - [ - np.reshape(np.array(x.values[i]), self.embedding_dim) - for i in range(x.values.shape[0]) - ] - ) + values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])]) m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0) return m.numpy() def mean_pool(x): - values = np.array( - [ - np.reshape(np.array(x.values[i]), self.embedding_dim) - for i in range(x.values.shape[0]) - ] - ) + values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])]) return torch.mean(torch.tensor(values, dtype=torch.float), 0).numpy() def cls_pool(x): - values = np.array( - [ - np.reshape(np.array(x.values[i]), self.embedding_dim) - for i in range(x.values.shape[0]) - ] - ) + values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])]) return values[0] try: @@ -206,15 +180,11 @@ def cls_pool(x): except ValueError as ve: print(ve) - return ( - df.groupby(["text_index", "layer_index"])["values"] - .apply(lambda x: pool_func(x)) - .reset_index() - ) + return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index() def encode(self, text, batch_size=32, as_numpy=False): - """Computes sentence encodings - + """Computes sentence encodings + Args: text: List of documents to encode. batch_size: Batch size, defaults to 32. 
diff --git a/utils_nlp/models/bert/token_classification.py b/utils_nlp/models/bert/token_classification.py index 3965c41c1..816cb0216 100644 --- a/utils_nlp/models/bert/token_classification.py +++ b/utils_nlp/models/bert/token_classification.py @@ -16,7 +16,7 @@ from tqdm import tqdm, trange from utils_nlp.models.bert.common import Language, create_data_loader -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from cached_property import cached_property @@ -144,7 +144,7 @@ def fit( device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) if num_gpus is None: num_gpus_used = torch.cuda.device_count() @@ -228,7 +228,7 @@ def predict( ) device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) self.model.eval() eval_loss = 0 diff --git a/utils_nlp/models/xlnet/sequence_classification.py b/utils_nlp/models/xlnet/sequence_classification.py index 32c239866..a7a086ea3 100644 --- a/utils_nlp/models/xlnet/sequence_classification.py +++ b/utils_nlp/models/xlnet/sequence_classification.py @@ -2,23 +2,20 @@ # Licensed under the MIT License. """Utilities for Xlnet Sequence Classification""" -import numpy as np +import os from collections import namedtuple + +import mlflow +import mlflow.pytorch +import numpy as np import torch import torch.nn as nn -from transformers import ( - XLNetConfig, - XLNetForSequenceClassification, - AdamW, - WarmupLinearSchedule, -) -from tqdm import tqdm from torch.utils.data import DataLoader, RandomSampler, TensorDataset -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from tqdm import tqdm +from transformers import AdamW, WarmupLinearSchedule, XLNetConfig, XLNetForSequenceClassification + +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.xlnet.common import Language -import mlflow -import mlflow.pytorch -import os class XLNetSequenceClassifier: @@ -79,9 +76,7 @@ def __init__( self.max_grad_norm = max_grad_norm # create classifier - self.config = XLNetConfig.from_pretrained( - self.language.value, num_labels=num_labels, cache_dir=cache_dir - ) + self.config = XLNetConfig.from_pretrained(self.language.value, num_labels=num_labels, cache_dir=cache_dir) self.model = XLNetForSequenceClassification(self.config) def fit( @@ -114,7 +109,7 @@ def fit( """ device, num_gpus = get_device(self.num_gpus) - self.model = move_to_device(self.model, device, self.num_gpus) + self.model = move_model_to_device(self.model, device, self.num_gpus) token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) input_mask_tensor = torch.tensor(input_mask, dtype=torch.long) @@ -128,24 +123,17 @@ def fit( token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long) val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long) - train_dataset = TensorDataset( - token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor - ) + train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor) val_dataset = TensorDataset( - val_token_ids_tensor, - val_input_mask_tensor, - val_token_type_ids_tensor, - val_labels_tensor, + val_token_ids_tensor, val_input_mask_tensor, val_token_type_ids_tensor, val_labels_tensor, ) else: train_dataset = 
TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor) - val_dataset = TensorDataset( - val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor - ) + val_dataset = TensorDataset(val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor) # define optimizer and model parameters param_optimizer = list(self.model.named_parameters()) @@ -155,10 +143,7 @@ def fit( "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": self.weight_decay, }, - { - "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] val_sampler = RandomSampler(val_dataset) @@ -181,9 +166,7 @@ def fit( train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=self.batch_size - ) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size) tr_loss = 0.0 logging_loss = 0.0 @@ -191,18 +174,13 @@ def fit( for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if token_type_ids: - x_batch, mask_batch, token_type_ids_batch, y_batch = tuple( - t.to(device) for t in batch - ) + x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(t.to(device) for t in batch) else: token_type_ids_batch = None x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch) outputs = self.model( - input_ids=x_batch, - token_type_ids=token_type_ids_batch, - attention_mask=mask_batch, - labels=y_batch, + input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=y_batch, ) loss = outputs[0] # model outputs are always tuple in pytorch-transformers @@ -220,9 +198,7 @@ def fit( if logging_steps > 0 and global_step % logging_steps == 0: mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step) mlflow.log_metric( - "training loss", - (tr_loss - logging_loss) / (logging_steps * self.batch_size), - step=global_step, + "training loss", (tr_loss - logging_loss) / (logging_steps * self.batch_size), step=global_step, ) logging_loss = tr_loss # model checkpointing @@ -245,9 +221,7 @@ def fit( ) else: token_type_ids_batch = None - val_x_batch, val_mask_batch, val_y_batch = tuple( - t.to(device) for t in val_batch - ) + val_x_batch, val_mask_batch, val_y_batch = tuple(t.to(device) for t in val_batch) val_outputs = self.model( input_ids=val_x_batch, token_type_ids=val_token_type_ids_batch, @@ -256,9 +230,7 @@ def fit( ) vloss = val_outputs[0] val_loss += vloss.sum().item() - mlflow.log_metric( - "validation loss", val_loss / len(val_dataset), step=global_step - ) + mlflow.log_metric("validation loss", val_loss / len(val_dataset), step=global_step) self.model.train() if verbose: @@ -300,13 +272,7 @@ def fit( torch.cuda.empty_cache() def predict( - self, - token_ids, - input_mask, - token_type_ids=None, - num_gpus=None, - batch_size=8, - probabilities=False, + self, token_ids, input_mask, token_type_ids=None, num_gpus=None, batch_size=8, probabilities=False, ): """Scores the given dataset and returns the predicted classes. 
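# The fit() loop reformatted above reports metrics through mlflow (learning rate, training
# loss, validation loss). A self-contained sketch of that logging pattern, with toy loss
# values and an explicit run context assumed, might look like:

import mlflow

# one metric point per optimization step, as fit() does for "training loss"
with mlflow.start_run():
    for global_step, tr_loss in enumerate([0.9, 0.7, 0.5], start=1):
        mlflow.log_metric("training loss", tr_loss, step=global_step)

# Metrics logged with a step value can then be plotted against training progress
# in the mlflow UI.
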
@@ -330,7 +296,7 @@ def predict( """ device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) self.model.eval() preds = [] @@ -342,16 +308,11 @@ def predict( x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device) mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device) - token_type_ids_batch = torch.tensor( - token_type_ids[start:end], dtype=torch.long, device=device - ) + token_type_ids_batch = torch.tensor(token_type_ids[start:end], dtype=torch.long, device=device) with torch.no_grad(): pred_batch = self.model( - input_ids=x_batch, - token_type_ids=token_type_ids_batch, - attention_mask=mask_batch, - labels=None, + input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None, ) preds.append(pred_batch[0].cpu()) if i % batch_size == 0: From ea11200338bf8d99e1a718aa1e967f239bca0009 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 18:04:22 +0000 Subject: [PATCH 18/24] rem num_label requirement --- utils_nlp/models/transformers/common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 9808719a7..7fce22c6b 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -41,9 +41,6 @@ def __init__( self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None, ): - if num_labels < 2: - raise ValueError("Number of labels should be at least 2.") - if model_name not in self.list_supported_models(): raise ValueError( "Model name {0} is not supported by {1}. " From 4b2ced52baf31f0a3869b34854c09470ea8a838a Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 18:07:39 +0000 Subject: [PATCH 19/24] rem num_labels check --- tests/unit/test_transformers_token_classification.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py index eda90c6d4..b4da4014e 100644 --- a/tests/unit/test_transformers_token_classification.py +++ b/tests/unit/test_transformers_token_classification.py @@ -1,18 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import pytest - from utils_nlp.common.pytorch_utils import dataloader_from_dataset -from utils_nlp.models.transformers.named_entity_recognition import ( - TokenClassificationProcessor, - TokenClassifier, -) - - -def test_token_classifier_num_labels(): - with pytest.raises(ValueError): - TokenClassifier(num_labels=1) +from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier def test_token_classifier_fit_predict(tmp_path, ner_test_data): From 30a9e0342b4d188903396621d27c03bbb070637d Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 19:38:10 +0000 Subject: [PATCH 20/24] add pytest marker to ner test --- tests/unit/test_transformers_token_classification.py | 9 ++++++--- utils_nlp/models/bert/sequence_classification.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py index b4da4014e..a39a93c66 100644 --- a/tests/unit/test_transformers_token_classification.py +++ b/tests/unit/test_transformers_token_classification.py @@ -1,13 +1,16 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import pytest + from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier -def test_token_classifier_fit_predict(tmp_path, ner_test_data): - token_classifier = TokenClassifier(num_labels=6, cache_dir=tmp_path) - processor = TokenClassificationProcessor(cache_dir=tmp_path) +@pytest.mark.cpu +def test_token_classifier_fit_predict(tmpdir, ner_test_data): + token_classifier = TokenClassifier(num_labels=6, cache_dir=tmpdir) + processor = TokenClassificationProcessor(cache_dir=tmpdir) # test fit, no warmup train_dataset = processor.preprocess_for_bert( diff --git a/utils_nlp/models/bert/sequence_classification.py b/utils_nlp/models/bert/sequence_classification.py index 4748ceec3..8fbe416c8 100644 --- a/utils_nlp/models/bert/sequence_classification.py +++ b/utils_nlp/models/bert/sequence_classification.py @@ -13,7 +13,7 @@ from tqdm import tqdm from utils_nlp.models.bert.common import Language -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device from cached_property import cached_property From 489f5381b8fae5ba0f35ab46420ffd360253090a Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sun, 19 Jan 2020 19:24:10 +0000 Subject: [PATCH 21/24] specify model name in NER test --- tests/unit/test_transformers_token_classification.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py index a39a93c66..e8c780f21 100644 --- a/tests/unit/test_transformers_token_classification.py +++ b/tests/unit/test_transformers_token_classification.py @@ -9,8 +9,8 @@ @pytest.mark.cpu def test_token_classifier_fit_predict(tmpdir, ner_test_data): - token_classifier = TokenClassifier(num_labels=6, cache_dir=tmpdir) - processor = TokenClassificationProcessor(cache_dir=tmpdir) + token_classifier = TokenClassifier(model_name="bert-base-uncased", num_labels=6, cache_dir=tmpdir) + processor = TokenClassificationProcessor(model_name="bert-base-uncased", cache_dir=tmpdir) # test fit, no warmup train_dataset = processor.preprocess_for_bert( @@ -20,5 +20,4 @@ def test_token_classifier_fit_predict(tmpdir, 
ner_test_data): token_classifier.fit(train_dataloader) # test predict, no labels - preds = token_classifier.predict(train_dataloader, verbose=False) - assert len(preds) == len(ner_test_data["INPUT_LABELS"]) + _ = token_classifier.predict(train_dataloader, verbose=False) From 9db5a3708c1fe1eb4a185a17b19238d6a14cff3f Mon Sep 17 00:00:00 2001 From: saidbleik Date: Thu, 23 Jan 2020 23:59:31 +0000 Subject: [PATCH 22/24] minor edits --- tests/smoke/test_gpu_utils.py | 1 - utils_nlp/common/pytorch_utils.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/smoke/test_gpu_utils.py b/tests/smoke/test_gpu_utils.py index 11418ad38..32d04a136 100644 --- a/tests/smoke/test_gpu_utils.py +++ b/tests/smoke/test_gpu_utils.py @@ -9,4 +9,3 @@ @pytest.mark.gpu def test_machine_is_gpu_machine(): assert torch.cuda.is_available() is True - diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 77918d70c..2badb45e5 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -83,7 +83,9 @@ def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, dis Args: ds (torch.utils.data.DataSet): A PyTorch dataset. - batch_size (int, optional): Batch size. Defaults to 32. + batch_size (int, optional): Batch size. + If more than 1 gpu is used, this would be the batch size per gpu. + Defaults to 32. num_gpus (int, optional): The number of GPUs to be used. Defaults to None. shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False. distributed (book, optional): If True, a DistributedSampler is used. Defaults to False. From 186ce2710ad0b8cfe54c0cd9c972d0050a32a5a7 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Fri, 24 Jan 2020 05:50:51 +0000 Subject: [PATCH 23/24] minor edits --- .../transformers/extractive_summarization.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/utils_nlp/models/transformers/extractive_summarization.py b/utils_nlp/models/transformers/extractive_summarization.py index 54a7f64ef..f4a567e7e 100644 --- a/utils_nlp/models/transformers/extractive_summarization.py +++ b/utils_nlp/models/transformers/extractive_summarization.py @@ -336,9 +336,9 @@ def get_inputs(batch, device, model_name, train_mode=True): """ if model_name.split("-")[0] in ["bert", "distilbert"]: + batch = batch.to(device) if train_mode: # labels must be the last - batch = batch.to(device) return { "x": batch.src, "segs": batch.segs, @@ -348,13 +348,6 @@ def get_inputs(batch, device, model_name, train_mode=True): "labels": batch.labels, } else: - batch["src"] = batch["src"].to(device) - batch["segs"] = batch["segs"].to(device) - batch["clss"] = batch["clss"].to(device) - batch["mask"] = batch["mask"].to(device) - batch["mask_cls"] = batch["mask_cls"].to(device) - if "labels" in batch: - batch["labels"] = batch["labels"].to(device) batch = Bunch(batch) return { "x": batch.src, @@ -478,7 +471,7 @@ def __init__(self, model_name="distilbert-base-uncased", encoder="transformer", Args: model_name (str, optional): Transformer model name used in preprocessing. check MODEL_CLASS for supported models. Defaults to "distilbert-base-uncased". - encoder (str, optional): Encoder algorithm used by summarization layer. + encoder (str, optional): Encoder algorithm used by summarization layer. There are four options: - baseline: it used a smaller transformer model to replace the bert model and with transformer summarization layer. 
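# The docstring above lists the four summarization-layer options. Assuming the class defined
# in extractive_summarization.py is ExtractiveSummarizer (the class name is not shown in this
# hunk), a typical instantiation with the defaults visible in the constructor signature would
# be roughly:

from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer

# pretrained DistilBERT encoder with the (default) transformer summarization layer
summarizer = ExtractiveSummarizer(
    model_name="distilbert-base-uncased", encoder="transformer", cache_dir="."
)
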
@@ -487,7 +480,7 @@ def __init__(self, model_name="distilbert-base-uncased", encoder="transformer", - transformer: it uses pretrained BERT and fine-tune BERT with transformer summarization layer. - RNN: it uses pretrained BERT and fine-tune BERT with LSTM summarization layer. - Defaults to "transformer". + Defaults to "transformer". cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". """ @@ -550,7 +543,7 @@ def fit( gpu_ids (list): List of GPU IDs to be used. If set to None, the first num_gpus GPUs will be used. Defaults to None. - batch_size (int, optional): Maximum number of tokens in each batch. + batch_size (int, optional): Maximum number of tokens in each batch. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed training. max_steps (int, optional): Maximum number of training steps. Defaults to 5e5. From 6b35c4917af632fd54a711b5125919134d6e7879 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Fri, 24 Jan 2020 06:54:12 +0000 Subject: [PATCH 24/24] minor edits --- .../models/transformers/extractive_summarization.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/utils_nlp/models/transformers/extractive_summarization.py b/utils_nlp/models/transformers/extractive_summarization.py index f4a567e7e..1defdad5c 100644 --- a/utils_nlp/models/transformers/extractive_summarization.py +++ b/utils_nlp/models/transformers/extractive_summarization.py @@ -336,8 +336,8 @@ def get_inputs(batch, device, model_name, train_mode=True): """ if model_name.split("-")[0] in ["bert", "distilbert"]: - batch = batch.to(device) if train_mode: + batch = batch.to(device) # labels must be the last return { "x": batch.src, @@ -350,11 +350,11 @@ def get_inputs(batch, device, model_name, train_mode=True): else: batch = Bunch(batch) return { - "x": batch.src, - "segs": batch.segs, - "clss": batch.clss, - "mask": batch.mask, - "mask_cls": batch.mask_cls, + "x": batch.src.to(device), + "segs": batch.segs.to(device), + "clss": batch.clss.to(device), + "mask": batch.mask.to(device), + "mask_cls": batch.mask_cls.to(device), } else: raise ValueError("Model not supported: {}".format(model_name))
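# Taken together, the last two patches settle on moving training batches to the device
# wholesale (batch.to(device)), while prediction batches, which arrive as plain dicts wrapped
# in a Bunch, have each tensor moved individually. A generic, self-contained sketch of that
# per-tensor move follows; the to_device helper is illustrative and not part of the repository.

import torch

def to_device(batch, device):
    """Move every tensor value of a dict-style batch to the target device."""
    return {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch = {"src": torch.zeros(2, 8, dtype=torch.long), "mask": torch.ones(2, 8)}
batch = to_device(batch, device)
assert batch["src"].device.type == device.type

# Note that in the prediction branch above only the consumed fields (src, segs, clss, mask,
# mask_cls) are moved, so labels, if present, stay on the CPU.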