From 21eadea232f957bdfd3ae28ffae0a0191ffe5bc8 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 6 Jan 2020 19:22:23 +0000 Subject: [PATCH 01/24] moved the order of moving to device and creating the optimizer --- utils_nlp/models/transformers/common.py | 46 ++++++++++++++----------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 42aedebfb..1dc66625c 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -102,9 +102,28 @@ def fine_tune( verbose=True, seed=None, ): - + # get device device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1) + # unwrap model + if isinstance(self.model, torch.nn.DataParallel): + self.model = self.model.module + + # wrap in DataParallel or DistributedDataParallel + if local_rank != -1: + self.model = torch.nn.parallel.DistributedDataParallel( + self.model, + device_ids=[local_rank], + output_device=local_rank, + find_unused_parameters=True, + ) + else: + if num_gpus > 1: + self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) + + # move to device + self.model.to(device) + if seed is not None: Transformer.set_seed(seed, num_gpus > 0) @@ -116,6 +135,7 @@ def fine_tune( else: t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs + # set optimizer if optimizer is None: no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ @@ -138,6 +158,7 @@ def fine_tune( ] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) + # set scheduler if scheduler is None: scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total @@ -150,30 +171,16 @@ def fine_tune( raise ImportError("Please install apex from https://www.github.com/nvidia/apex") self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level) - if local_rank != -1: - self.model = torch.nn.parallel.DistributedDataParallel( - self.model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True, - ) - else: - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - self.model.to(device) - self.model.train() - + # init training global_step = 0 tr_loss = 0.0 + self.model.train() self.model.zero_grad() train_iterator = trange( int(num_train_epochs), desc="Epoch", disable=local_rank not in [-1, 0] or not verbose ) + # train for _ in train_iterator: epoch_iterator = tqdm( train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose @@ -214,9 +221,6 @@ def fine_tune( train_iterator.close() break - # empty cache - del [batch] - torch.cuda.empty_cache() return global_step, tr_loss / global_step def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True): From 9ab859958754ab8fc97a3c929fe28bce8905c995 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 7 Jan 2020 20:13:24 +0000 Subject: [PATCH 02/24] added move_model_to_device to pytorch_utils --- utils_nlp/common/pytorch_utils.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index ea09f8768..0410775ac 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -32,6 +32,42 @@ def get_device( return device, num_gpus +def 
move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank=-1): + """Moves a model to the specified device (cpu or gpu/s) + and implements data parallelism when multiple gpus are specified. + + Args: + model (Module): A PyTorch model. + device (torch.device): A PyTorch device. + num_gpus (int): The number of GPUs to be used. + If set to None, all available GPUs will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + local_rank (int): Local GPU ID within a node. Used in distributed environments. + Defaults to -1. + """ + # unwrap model + if isinstance(model, torch.nn.DataParallel): + model = model.module + # wrap in DataParallel or DistributedDataParallel + if local_rank != -1: + self.model = torch.nn.parallel.DistributedDataParallel( + self.model, + device_ids=[local_rank], + output_device=local_rank, + find_unused_parameters=True, + ) + else: + if num_gpus > 1: + if gpu_ids is None: + gpu_ids = list(range(num_gpus)) + model = torch.nn.DataParallel(model, device_ids=gpu_ids) + # move to device + model.to(device) + + def move_to_device(model, device, num_gpus=None): """Moves a model to the specified device (cpu or gpu/s) and implements data parallelism when multiple gpus are specified. From ab4b496558ed53b321bdcbf74aa8a106cdd3cd78 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 7 Jan 2020 20:17:38 +0000 Subject: [PATCH 03/24] moved optim and scheduler init out of fine_tune --- utils_nlp/models/transformers/common.py | 118 +++++++++--------------- 1 file changed, 43 insertions(+), 75 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 1dc66625c..ccaadbf6d 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -22,7 +22,7 @@ from transformers.tokenization_distilbert import DistilBertTokenizer from transformers.tokenization_roberta import RobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer -from utils_nlp.common.pytorch_utils import get_device + TOKENIZER_CLASS = {} TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -81,6 +81,47 @@ def set_seed(seed, cuda=True): if cuda and torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) + @staticmethod + def get_default_optimizer(model, learning_rate, adam_epsilon): + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) + ], + "weight_decay": weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) + return optimizer + + @staticmethod + def get_default_scheduler( + optimizer, warmup_steps, data_loader, max_steps, num_epochs, gradient_accumulation_steps + ): + try: + dataset_length = len(data_loader) + except Exception: + dataset_length = -1 + + if max_steps <= 0: + if dataset_length != -1 and num_epochs > 0: + max_steps = dataset_length // gradient_accumulation_steps * num_epochs + + if max_steps <= 0: + raise Exception("Max steps cannot be determined.") + + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps + ) + return scheduler + def fine_tune( self, train_dataloader, @@ -89,81 +130,18 @@ def 
fine_tune( num_train_epochs=1, max_grad_norm=1.0, gradient_accumulation_steps=1, - n_gpu=1, optimizer=None, scheduler=None, - weight_decay=0.0, - learning_rate=5e-5, - adam_epsilon=1e-8, - warmup_steps=0, fp16=False, fp16_opt_level="O1", local_rank=-1, verbose=True, seed=None, ): - # get device - device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1) - - # unwrap model - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - # wrap in DataParallel or DistributedDataParallel - if local_rank != -1: - self.model = torch.nn.parallel.DistributedDataParallel( - self.model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True, - ) - else: - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - # move to device - self.model.to(device) if seed is not None: Transformer.set_seed(seed, num_gpus > 0) - if max_steps > 0: - t_total = max_steps - num_train_epochs = ( - max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1 - ) - else: - t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs - - # set optimizer - if optimizer is None: - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in self.model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": weight_decay, - }, - { - "params": [ - p - for n, p in self.model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) - - # set scheduler - if scheduler is None: - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total - ) - if fp16: try: from apex import amp @@ -223,18 +201,8 @@ def fine_tune( return global_step, tr_loss / global_step - def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True): - device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1) - - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - self.model.to(device) + def predict(self, eval_dataloader, get_inputs, verbose=True): self.model.eval() - for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): From 8242bcf767aa7f1d2357c8d5ccbdeb5806b0bb84 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Wed, 8 Jan 2020 06:37:42 +0000 Subject: [PATCH 04/24] modified structure of transformer wrapper --- ...st_transformers_sequence_classification.py | 20 +++-- utils_nlp/common/pytorch_utils.py | 6 +- utils_nlp/models/transformers/common.py | 6 +- .../transformers/sequence_classification.py | 73 ++++++++++++++----- 4 files changed, 71 insertions(+), 34 deletions(-) diff --git a/tests/unit/test_transformers_sequence_classification.py b/tests/unit/test_transformers_sequence_classification.py index 156854200..c402d106e 100644 --- a/tests/unit/test_transformers_sequence_classification.py +++ b/tests/unit/test_transformers_sequence_classification.py @@ -19,12 +19,11 @@ def test_classifier(data, tmpdir): num_labels = len(pd.unique(data[1])) model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) - train_dataloader = processor.create_dataloader_from_df( - df, "text", "label", batch_size=2, num_gpus=0 - ) + ds 
= processor.dataset_from_dataframe(df, "text", "label") + dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) - classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=0, verbose=False) - preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False) + classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False) + preds = classifier.predict(dl, num_gpus=0, verbose=False) assert len(preds) == len(data[1]) @@ -35,17 +34,16 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir): num_labels = len(pd.unique(data[1])) model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) - train_dataloader = processor.create_dataloader_from_df( - df, "text", "label", batch_size=2, num_gpus=1 - ) + ds = processor.dataset_from_dataframe(df, "text", "label") + dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) - classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=1, verbose=False) + classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False) assert next(classifier.model.parameters()).is_cuda is True # gpu prediction, no model move - preds = classifier.predict(train_dataloader, num_gpus=1, verbose=False) + preds = classifier.predict(dl, num_gpus=1, verbose=False) assert len(preds) == len(data[1]) # cpu prediction, need model move assert next(classifier.model.parameters()).is_cuda is True - preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False) + preds = classifier.predict(dl, num_gpus=0, verbose=False) assert next(classifier.model.parameters()).is_cuda is False diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 0410775ac..fee66269e 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -47,6 +47,10 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= Defaults to None. local_rank (int): Local GPU ID within a node. Used in distributed environments. Defaults to -1. + + Returns: + Module, DataParallel, DistributedDataParallel: A PyTorch Module or + a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used). 
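+
+    Example:
+        A minimal sketch of the intended usage, assuming a toy model and at most one
+        available GPU (names and values are illustrative):
+
+        >>> import torch
+        >>> model = torch.nn.Linear(8, 2)
+        >>> device, num_gpus = get_device(num_gpus=1)
+        >>> model = move_model_to_device(model, device, num_gpus=num_gpus)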
""" # unwrap model if isinstance(model, torch.nn.DataParallel): @@ -65,7 +69,7 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= gpu_ids = list(range(num_gpus)) model = torch.nn.DataParallel(model, device_ids=gpu_ids) # move to device - model.to(device) + return model.to(device) def move_to_device(model, device, num_gpus=None): diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index ccaadbf6d..d5f4d5588 100644 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -82,7 +82,7 @@ def set_seed(seed, cuda=True): torch.cuda.manual_seed_all(seed) @staticmethod - def get_default_optimizer(model, learning_rate, adam_epsilon): + def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { @@ -125,6 +125,8 @@ def get_default_scheduler( def fine_tune( self, train_dataloader, + device, + num_gpus, get_inputs, max_steps=-1, num_train_epochs=1, @@ -201,7 +203,7 @@ def fine_tune( return global_step, tr_loss / global_step - def predict(self, eval_dataloader, get_inputs, verbose=True): + def predict(self, eval_dataloader, device, get_inputs, verbose=True): self.model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): batch = tuple(t.to(device) for t in batch) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 93668471b..b245383db 100644 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -21,6 +21,7 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification, ) +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet @@ -188,20 +189,11 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): return input_ids, attention_mask, token_type_ids - def create_dataloader_from_df( - self, - df, - text_col, - label_col=None, - text2_col=None, - shuffle=False, - max_len=MAX_SEQ_LEN, - batch_size=32, - num_gpus=None, - distributed=False, + def dataset_from_dataframe( + self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN ): if text2_col is None: - ds = SCDataSet( + return SCDataSet( df, text_col, label_col, @@ -210,7 +202,7 @@ def create_dataloader_from_df( max_len=max_len, ) else: - ds = SPCDataSet( + return SPCDataSet( df, text_col, text2_col, @@ -220,6 +212,9 @@ def create_dataloader_from_df( max_len=max_len, ) + def dataloader_from_dataset( + self, ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False + ): if num_gpus is None: num_gpus = torch.cuda.device_count() @@ -250,7 +245,10 @@ def fit( self, train_dataloader, num_epochs=1, + max_steps=-1, + gradient_accumulation_steps=1, num_gpus=None, + gpu_ids=None, local_rank=-1, weight_decay=0.0, learning_rate=5e-5, @@ -265,9 +263,16 @@ def fit( Args: train_dataloader (Dataloader): Dataloader for the training data. num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. Overrides num_epochs. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. 
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed training. weight_decay (float, optional): Weight decay to apply after each parameter update. @@ -281,20 +286,40 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. """ + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + + # init optimizer and scheduler + optimizer = Transformer.get_default_optimizer( + self.model, weight_decay, learning_rate, adam_epsilon + ) + scheduler = Transformer.get_default_scheduler( + optimizer, + warmup_steps, + train_dataloader, + max_steps, + num_epochs, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + super().fine_tune( train_dataloader=train_dataloader, + device=device, + num_gpus=num_gpus, get_inputs=Processor.get_inputs, - n_gpu=num_gpus, + max_steps=max_steps, num_train_epochs=num_epochs, - weight_decay=weight_decay, - learning_rate=learning_rate, - adam_epsilon=adam_epsilon, - warmup_steps=warmup_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + optimizer=optimizer, + scheduler=scheduler, + local_rank=local_rank, verbose=verbose, seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, verbose=True): + def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. @@ -303,17 +328,25 @@ def predict(self, eval_dataloader, num_gpus=None, verbose=True): num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. verbose (bool, optional): Whether to print out the training log. Defaults to True. Returns 1darray: numpy array of predicted label indices. 
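+
+        Example:
+            A minimal sketch, assuming a fine-tuned classifier, an existing
+            eval_dataloader, and (optionally) a fitted sklearn LabelEncoder;
+            names are illustrative:
+
+            >>> preds = classifier.predict(eval_dataloader, num_gpus=1, verbose=False)
+            >>> predicted_labels = label_encoder.inverse_transform(preds)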
""" + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) + preds = list( super().predict( eval_dataloader=eval_dataloader, + device=device, get_inputs=Processor.get_inputs, - n_gpu=num_gpus, verbose=verbose, ) ) From 9811b8abfb12b2dd4ee1d1e8621a72a02fc3e85b Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 13 Jan 2020 17:45:00 +0000 Subject: [PATCH 05/24] restructuring of common transformers utils --- ...st_transformers_sequence_classification.py | 5 +- utils_nlp/common/pytorch_utils.py | 105 ++++++++---------- utils_nlp/models/transformers/common.py | 91 ++++++--------- .../transformers/sequence_classification.py | 98 ++++++---------- 4 files changed, 125 insertions(+), 174 deletions(-) mode change 100644 => 100755 tests/unit/test_transformers_sequence_classification.py mode change 100644 => 100755 utils_nlp/models/transformers/common.py mode change 100644 => 100755 utils_nlp/models/transformers/sequence_classification.py diff --git a/tests/unit/test_transformers_sequence_classification.py b/tests/unit/test_transformers_sequence_classification.py old mode 100644 new mode 100755 index c402d106e..3ffb1f8b9 --- a/tests/unit/test_transformers_sequence_classification.py +++ b/tests/unit/test_transformers_sequence_classification.py @@ -5,6 +5,7 @@ import pandas as pd from utils_nlp.models.transformers.sequence_classification import SequenceClassifier, Processor +from utils_nlp.common.pytorch_utils import dataloader_from_dataset @pytest.fixture() @@ -20,7 +21,7 @@ def test_classifier(data, tmpdir): model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) ds = processor.dataset_from_dataframe(df, "text", "label") - dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) + dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False) preds = classifier.predict(dl, num_gpus=0, verbose=False) @@ -35,7 +36,7 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir): model_name = "bert-base-uncased" processor = Processor(model_name=model_name, cache_dir=tmpdir) ds = processor.dataset_from_dataframe(df, "text", "label") - dl = processor.dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) + dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True) classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir) classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False) diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index fee66269e..89f98ab2a 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -3,9 +3,12 @@ """Common PyTorch utilities that facilitate building Pytorch models.""" +import warnings + import torch import torch.nn as nn -import warnings +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler def get_device( @@ -17,11 +20,7 @@ def get_device( # init_method="file:///distributed", ): if local_rank == -1: - num_gpus = ( - min(num_gpus, torch.cuda.device_count()) - if num_gpus is not None - else torch.cuda.device_count() - ) + num_gpus = min(num_gpus, torch.cuda.device_count()) if num_gpus 
is not None else torch.cuda.device_count() device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu") else: torch.cuda.set_device(local_rank) @@ -58,10 +57,7 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= # wrap in DataParallel or DistributedDataParallel if local_rank != -1: self.model = torch.nn.parallel.DistributedDataParallel( - self.model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True, + self.model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, ) else: if num_gpus > 1: @@ -72,59 +68,56 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= return model.to(device) -def move_to_device(model, device, num_gpus=None): - """Moves a model to the specified device (cpu or gpu/s) - and implements data parallelism when multiple gpus are specified. +def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False): + """Creates a PyTorch DataLoader given a Dataset object. Args: - model (Module): A PyTorch model - device (torch.device): A PyTorch device - num_gpus (int): The number of GPUs to be used. Defaults to None, - all gpus are used. + ds (torch.utils.data.DataSet): A PyTorch dataset. + batch_size (int, optional): Batch size. Defaults to 32. + num_gpus (int, optional): The number of GPUs to be used. Defaults to None. + shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False. + distributed (book, optional): If True, a DistributedSampler is used. Defaults to False. Returns: Module, DataParallel: A PyTorch Module or a DataParallel wrapper (when multiple gpus are used). """ - if isinstance(model, nn.DataParallel): - model = model.module + if num_gpus is None: + num_gpus = torch.cuda.device_count() - if not isinstance(device, torch.device): - raise ValueError("device must be of type torch.device.") - - if device.type == "cuda": - model.to(device) # inplace - if num_gpus == 0: - raise ValueError("num_gpus must be non-zero when device.type is 'cuda'") - elif num_gpus == 1: - return model - else: - # parallelize - num_cuda_devices = torch.cuda.device_count() - if num_cuda_devices < 1: - raise Exception("CUDA devices are not available.") - elif num_cuda_devices < 2: - print("Warning: Only 1 CUDA device is available. Data parallelism is not possible.") - return model - else: - if num_gpus is None: - # use all available devices - return nn.DataParallel(model, device_ids=None) - elif num_gpus > num_cuda_devices: - print( - "Warning: Only {0} devices are available. " - "Setting the number of gpus to {0}".format(num_cuda_devices) - ) - return nn.DataParallel(model, device_ids=None) - else: - return nn.DataParallel(model, device_ids=list(range(num_gpus))) - elif device.type == "cpu": - if num_gpus != 0 and num_gpus is not None: - warnings.warn("Device type is 'cpu'. num_gpus is ignored.") - return model.to(device) + batch_size = batch_size * max(1, num_gpus) + if distributed: + sampler = DistributedSampler(ds) else: - raise Exception( - "Device type '{}' not supported. Currently, only cpu " - "and cuda devices are supported.".format(device.type) - ) + sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds) + + return DataLoader(ds, sampler=sampler, batch_size=batch_size) + +def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1): + """Computes the max training steps given a dataloader. 
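+
+    For example (illustrative numbers): a dataloader with 100 batches, num_epochs=2,
+    and gradient_accumulation_steps=1 yields 100 // 1 * 2 = 200 training steps.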
+ + Args: + dataloader (Dataloader): A PyTorch DataLoader. + num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. + + Returns: + int: The max number of steps to be used in a training loop. + """ + try: + dataset_length = len(dataloader) + except Exception: + dataset_length = -1 + if max_steps <= 0: + if dataset_length != -1 and num_epochs > 0: + max_steps = dataset_length // gradient_accumulation_steps * num_epochs + if max_steps <= 0: + raise Exception("Max steps cannot be determined.") + return max_steps \ No newline at end of file diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py old mode 100644 new mode 100755 index d5f4d5588..ccaf48b46 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -7,13 +7,13 @@ import logging import os import random +import time +from itertools import cycle import numpy as np import torch from tqdm import tqdm, trange -from transformers import AdamW -from transformers import get_linear_schedule_with_warmup - +from transformers import AdamW, get_linear_schedule_with_warmup from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -23,7 +23,6 @@ from transformers.tokenization_roberta import RobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer - TOKENIZER_CLASS = {} TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) TOKENIZER_CLASS.update({k: RobertaTokenizer for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -37,12 +36,7 @@ class Transformer: def __init__( - self, - model_class, - model_name="bert-base-cased", - num_labels=2, - cache_dir=".", - load_model_from_dir=None, + self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None, ): if model_name not in self.list_supported_models(): @@ -86,15 +80,11 @@ def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [ - p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) - ], + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": weight_decay, }, { - "params": [ - p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) - ], + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] @@ -102,23 +92,9 @@ def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon): return optimizer @staticmethod - def get_default_scheduler( - optimizer, warmup_steps, data_loader, max_steps, num_epochs, gradient_accumulation_steps - ): - try: - dataset_length = len(data_loader) - except Exception: - dataset_length = -1 - - if max_steps <= 0: - if dataset_length != -1 and num_epochs > 0: - max_steps = dataset_length // gradient_accumulation_steps * num_epochs - - if max_steps <= 0: - raise Exception("Max steps cannot be 
determined.") - + def get_default_scheduler(optimizer, warmup_steps, num_training_steps): scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps + optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps ) return scheduler @@ -129,7 +105,6 @@ def fine_tune( num_gpus, get_inputs, max_steps=-1, - num_train_epochs=1, max_grad_norm=1.0, gradient_accumulation_steps=1, optimizer=None, @@ -139,6 +114,8 @@ def fine_tune( local_rank=-1, verbose=True, seed=None, + report_every=10, + clip_grad_norm=True, ): if seed is not None: @@ -154,20 +131,16 @@ def fine_tune( # init training global_step = 0 tr_loss = 0.0 + accum_loss = 0 self.model.train() self.model.zero_grad() - train_iterator = trange( - int(num_train_epochs), desc="Epoch", disable=local_rank not in [-1, 0] or not verbose - ) # train - for _ in train_iterator: - epoch_iterator = tqdm( - train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose - ) + start = time.time() + while global_step < max_steps: + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose) for step, batch in enumerate(epoch_iterator): - batch = tuple(t.to(device) for t in batch) - inputs = get_inputs(batch, self.model_name) + inputs = get_inputs(batch, device, self.model_name) outputs = self.model(**inputs) loss = outputs[0] @@ -176,39 +149,47 @@ def fine_tune( if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps - if step % 10 == 0 and verbose: - tqdm.write("Loss:{:.6f}".format(loss)) - if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) + if clip_grad_norm: + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) else: loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) + if clip_grad_norm: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) tr_loss += loss.item() + + accum_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: + global_step += 1 + if global_step % report_every == 0 and verbose: + end = time.time() + print( + "loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format( + accum_loss / report_every, end - start, len(batch), global_step, max_steps, + ) + ) + accum_loss = 0 + start = end + optimizer.step() - scheduler.step() + if scheduler: + scheduler.step() self.model.zero_grad() - global_step += 1 - if max_steps > 0 and global_step > max_steps: + if global_step > max_steps: epoch_iterator.close() break - if max_steps > 0 and global_step > max_steps: - train_iterator.close() - break return global_step, tr_loss / global_step def predict(self, eval_dataloader, device, get_inputs, verbose=True): self.model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): - batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = get_inputs(batch, self.model_name, train_mode=False) + inputs = get_inputs(batch, device, self.model_name, train_mode=False) outputs = self.model(**inputs) logits = outputs[0] yield logits.detach().cpu().numpy() diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py old mode 100644 new mode 100755 index b245383db..5e2e3763e --- a/utils_nlp/models/transformers/sequence_classification.py 
+++ b/utils_nlp/models/transformers/sequence_classification.py @@ -3,8 +3,10 @@ import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler -from torch.utils.data.distributed import DistributedSampler +from transformers.modeling_albert import ( + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForSequenceClassification, +) from transformers.modeling_bert import ( BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification, @@ -21,19 +23,17 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification, ) -from utils_nlp.common.pytorch_utils import get_device, move_model_to_device + +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device, compute_training_steps from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet MODEL_CLASS = {} MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) -MODEL_CLASS.update( - {k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP} -) +MODEL_CLASS.update({k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}) MODEL_CLASS.update({k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}) -MODEL_CLASS.update( - {k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP} -) +MODEL_CLASS.update({k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) +MODEL_CLASS.update({k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) class Processor: @@ -57,13 +57,14 @@ def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."): ) @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ Creates an input dictionary given a model name. Args: batch (tuple): A tuple containing input ids, attention mask, segment ids, and labels tensors. + device (torch.device): A PyTorch device. model_name (bool, optional): Model name used to format the inputs. train_mode (bool, optional): Training mode flag. Defaults to True. @@ -72,7 +73,8 @@ def get_inputs(batch, model_name, train_mode=True): dict: Dictionary containing input ids, segment ids, masks, and labels. Labels are only returned when train_mode is True. 
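+
+        Example:
+            A minimal sketch, assuming a batch yielded by a DataLoader over a
+            sequence-classification dataset and a Hugging Face BERT model
+            (names are illustrative):
+
+            >>> inputs = Processor.get_inputs(batch, device, "bert-base-uncased")
+            >>> outputs = model(**inputs)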
""" - if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert"]: + batch = tuple(t.to(device) for t in batch) + if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert", "albert"]: if train_mode: inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} else: @@ -104,11 +106,7 @@ def text_transform(text, tokenizer, max_len=MAX_SEQ_LEN): print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)) max_len = MAX_SEQ_LEN # truncate and add CLS & SEP markers - tokens = ( - [tokenizer.cls_token] - + tokenizer.tokenize(text)[0 : max_len - 2] - + [tokenizer.sep_token] - ) + tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[0 : max_len - 2] + [tokenizer.sep_token] # get input ids input_ids = tokenizer.convert_tokens_to_ids(tokens) # pad sequence @@ -189,17 +187,10 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): return input_ids, attention_mask, token_type_ids - def dataset_from_dataframe( - self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN - ): + def dataset_from_dataframe(self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN): if text2_col is None: return SCDataSet( - df, - text_col, - label_col, - transform=Processor.text_transform, - tokenizer=self.tokenizer, - max_len=max_len, + df, text_col, label_col, transform=Processor.text_transform, tokenizer=self.tokenizer, max_len=max_len, ) else: return SPCDataSet( @@ -212,29 +203,11 @@ def dataset_from_dataframe( max_len=max_len, ) - def dataloader_from_dataset( - self, ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False - ): - if num_gpus is None: - num_gpus = torch.cuda.device_count() - - batch_size = batch_size * max(1, num_gpus) - - if distributed: - sampler = DistributedSampler(ds) - else: - sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds) - - return DataLoader(ds, sampler=sampler, batch_size=batch_size) - class SequenceClassifier(Transformer): def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."): super().__init__( - model_class=MODEL_CLASS, - model_name=model_name, - num_labels=num_labels, - cache_dir=cache_dir, + model_class=MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir, ) @staticmethod @@ -261,9 +234,12 @@ def fit( Fine-tunes a pre-trained sequence classification model. Args: - train_dataloader (Dataloader): Dataloader for the training data. + train_dataloader (Dataloader): A PyTorch DataLoader to be used for training. num_epochs (int, optional): Number of training epochs. Defaults to 1. - max_steps (int, optional): Total number of training steps. Overrides num_epochs. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. gradient_accumulation_steps (int, optional): Number of steps to accumulate before performing a backward/update pass. Default to 1. 
@@ -288,29 +264,33 @@ def fit( # get device device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + # move model self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer and scheduler - optimizer = Transformer.get_default_optimizer( - self.model, weight_decay, learning_rate, adam_epsilon - ) - scheduler = Transformer.get_default_scheduler( - optimizer, - warmup_steps, + # init optimizer + optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) + + # compute the max number of training steps + max_steps = compute_training_steps( train_dataloader, - max_steps, - num_epochs, + num_epochs=num_epochs, + max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, ) + # inint scheduler + scheduler = Transformer.get_default_scheduler( + optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, + ) + + # fine tune super().fine_tune( train_dataloader=train_dataloader, device=device, num_gpus=num_gpus, get_inputs=Processor.get_inputs, max_steps=max_steps, - num_train_epochs=num_epochs, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, scheduler=scheduler, @@ -344,12 +324,8 @@ def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): preds = list( super().predict( - eval_dataloader=eval_dataloader, - device=device, - get_inputs=Processor.get_inputs, - verbose=verbose, + eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, ) ) preds = np.concatenate(preds) - # todo generator & probs return np.argmax(preds, axis=1) From 74f6ba6662dec87591677a575505b938bcec04dd Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 13 Jan 2020 18:20:20 +0000 Subject: [PATCH 06/24] updated seq classification tests --- utils_nlp/models/transformers/sequence_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 5e2e3763e..a86f27608 100755 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -279,7 +279,7 @@ def fit( gradient_accumulation_steps=gradient_accumulation_steps, ) - # inint scheduler + # inin scheduler scheduler = Transformer.get_default_scheduler( optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, ) From 5611740f960ae4fc9831dbed85a7fa5e23846df3 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Mon, 13 Jan 2020 21:00:11 +0000 Subject: [PATCH 07/24] update seq classification examples --- .../tc_mnli_transformers.ipynb | 205 +++++++++----- .../tc_multi_languages_transformers.ipynb | 249 +++++++++++++++--- .../test_notebooks_text_classification.py | 4 +- utils_nlp/dataset/bbc_hindi.py | 82 ++---- utils_nlp/dataset/dac.py | 68 ++--- utils_nlp/dataset/multinli.py | 70 ++--- 6 files changed, 431 insertions(+), 247 deletions(-) diff --git a/examples/text_classification/tc_mnli_transformers.ipynb b/examples/text_classification/tc_mnli_transformers.ipynb index 952f2bafa..bfbd91ffe 100644 --- a/examples/text_classification/tc_mnli_transformers.ipynb +++ b/examples/text_classification/tc_mnli_transformers.ipynb @@ -32,6 +32,7 @@ "from sklearn.preprocessing import LabelEncoder\n", "from tqdm import tqdm\n", "from utils_nlp.common.timer import Timer\n", + "from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n", "from 
utils_nlp.dataset.multinli import load_pandas_df\n", "from utils_nlp.models.transformers.sequence_classification import (\n", " Processor, SequenceClassifier)" @@ -93,7 +94,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n" + "100%|██████████| 222k/222k [01:20<00:00, 2.74kKB/s] \n" ] } ], @@ -196,7 +197,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", " FutureWarning)\n" ] } @@ -232,11 +233,11 @@ { "data": { "text/plain": [ - "telephone 1055\n", - "slate 1003\n", - "travel 961\n", - "fiction 952\n", - "government 938\n", + "telephone 1043\n", + "slate 989\n", + "fiction 968\n", + "travel 964\n", + "government 945\n", "Name: genre, dtype: int64" ] }, @@ -385,32 +386,108 @@ " \n", " \n", " 15\n", - " roberta-base\n", + " bert-base-japanese\n", " \n", " \n", " 16\n", - " roberta-large\n", + " bert-base-japanese-whole-word-masking\n", " \n", " \n", " 17\n", - " roberta-large-mnli\n", + " bert-base-japanese-char\n", " \n", " \n", " 18\n", - " xlnet-base-cased\n", + " bert-base-japanese-char-whole-word-masking\n", " \n", " \n", " 19\n", - " xlnet-large-cased\n", + " bert-base-finnish-cased-v1\n", " \n", " \n", " 20\n", - " distilbert-base-uncased\n", + " bert-base-finnish-uncased-v1\n", " \n", " \n", " 21\n", + " roberta-base\n", + " \n", + " \n", + " 22\n", + " roberta-large\n", + " \n", + " \n", + " 23\n", + " roberta-large-mnli\n", + " \n", + " \n", + " 24\n", + " distilroberta-base\n", + " \n", + " \n", + " 25\n", + " roberta-base-openai-detector\n", + " \n", + " \n", + " 26\n", + " roberta-large-openai-detector\n", + " \n", + " \n", + " 27\n", + " xlnet-base-cased\n", + " \n", + " \n", + " 28\n", + " xlnet-large-cased\n", + " \n", + " \n", + " 29\n", + " distilbert-base-uncased\n", + " \n", + " \n", + " 30\n", " distilbert-base-uncased-distilled-squad\n", " \n", + " \n", + " 31\n", + " distilbert-base-german-cased\n", + " \n", + " \n", + " 32\n", + " distilbert-base-multilingual-cased\n", + " \n", + " \n", + " 33\n", + " albert-base-v1\n", + " \n", + " \n", + " 34\n", + " albert-large-v1\n", + " \n", + " \n", + " 35\n", + " albert-xlarge-v1\n", + " \n", + " \n", + " 36\n", + " albert-xxlarge-v1\n", + " \n", + " \n", + " 37\n", + " albert-base-v2\n", + " \n", + " \n", + " 38\n", + " albert-large-v2\n", + " \n", + " \n", + " 39\n", + " albert-xlarge-v2\n", + " \n", + " \n", + " 40\n", + " albert-xxlarge-v2\n", + " \n", " \n", "\n", "" @@ -432,13 +509,32 @@ "12 bert-base-cased-finetuned-mrpc\n", "13 bert-base-german-dbmdz-cased\n", "14 bert-base-german-dbmdz-uncased\n", - "15 roberta-base\n", - "16 roberta-large\n", - "17 roberta-large-mnli\n", - "18 xlnet-base-cased\n", - "19 xlnet-large-cased\n", - "20 distilbert-base-uncased\n", - "21 distilbert-base-uncased-distilled-squad" + "15 bert-base-japanese\n", + "16 bert-base-japanese-whole-word-masking\n", + "17 bert-base-japanese-char\n", + "18 bert-base-japanese-char-whole-word-masking\n", + "19 bert-base-finnish-cased-v1\n", + "20 bert-base-finnish-uncased-v1\n", + "21 roberta-base\n", + "22 roberta-large\n", + "23 roberta-large-mnli\n", + 
"24 distilroberta-base\n", + "25 roberta-base-openai-detector\n", + "26 roberta-large-openai-detector\n", + "27 xlnet-base-cased\n", + "28 xlnet-large-cased\n", + "29 distilbert-base-uncased\n", + "30 distilbert-base-uncased-distilled-squad\n", + "31 distilbert-base-german-cased\n", + "32 distilbert-base-multilingual-cased\n", + "33 albert-base-v1\n", + "34 albert-large-v1\n", + "35 albert-xlarge-v1\n", + "36 albert-xxlarge-v1\n", + "37 albert-base-v2\n", + "38 albert-large-v2\n", + "39 albert-xlarge-v2\n", + "40 albert-xxlarge-v2" ] }, "execution_count": 10, @@ -492,18 +588,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n", - "100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n", - "100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n", - "/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", - " warnings.warn('Was asked to gather along dimension 0, but all '\n", - "100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n", - "100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n", - "100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n", - "100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n", - "100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n", - "100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n", - "100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n" + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" ] } ], @@ -518,11 +604,17 @@ " to_lower=model_name.endswith(\"uncased\"),\n", " cache_dir=CACHE_DIR,\n", " )\n", - " train_dataloader = processor.create_dataloader_from_df(\n", - " df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n", + " train_dataset = processor.dataset_from_dataframe(\n", + " df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n", " )\n", - " test_dataloader = processor.create_dataloader_from_df(\n", - " df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n", + " train_dataloader = dataloader_from_dataset(\n", + " train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n", + " )\n", + " test_dataset = processor.dataset_from_dataframe(\n", + " df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n", + " )\n", + " test_dataloader = dataloader_from_dataset(\n", + " test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n", " )\n", "\n", " # fine-tune\n", @@ -531,17 +623,12 @@ " )\n", " with Timer() as t:\n", " classifier.fit(\n", - " train_dataloader,\n", - " num_epochs=NUM_EPOCHS,\n", - " num_gpus=NUM_GPUS,\n", - " verbose=False,\n", + " train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,\n", " )\n", " train_time = t.interval / 3600\n", "\n", " # predict\n", - " preds = classifier.predict(\n", - " test_dataloader, num_gpus=NUM_GPUS, verbose=False\n", - " )\n", + " preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)\n", "\n", " # eval\n", " accuracy = accuracy_score(df_test[LABEL_COL], preds)\n", @@ 
-600,21 +687,21 @@ " \n", " \n", " accuracy\n", - " 0.895477\n", - " 0.879584\n", - " 0.894866\n", + " 0.889364\n", + " 0.885697\n", + " 0.886308\n", " \n", " \n", " f1-score\n", - " 0.896656\n", - " 0.881218\n", - " 0.896108\n", + " 0.885225\n", + " 0.880926\n", + " 0.881819\n", " \n", " \n", " time(hrs)\n", - " 0.021865\n", - " 0.035351\n", - " 0.046295\n", + " 0.023326\n", + " 0.044209\n", + " 0.052801\n", " \n", " \n", "\n", @@ -622,9 +709,9 @@ ], "text/plain": [ " distilbert-base-uncased roberta-base xlnet-base-cased\n", - "accuracy 0.895477 0.879584 0.894866\n", - "f1-score 0.896656 0.881218 0.896108\n", - "time(hrs) 0.021865 0.035351 0.046295" + "accuracy 0.889364 0.885697 0.886308\n", + "f1-score 0.885225 0.880926 0.881819\n", + "time(hrs) 0.023326 0.044209 0.052801" ] }, "execution_count": 13, @@ -645,7 +732,7 @@ { "data": { "application/scrapbook.scrap.json+json": { - "data": 0.8899755501222494, + "data": 0.887123064384678, "encoder": "json", "name": "accuracy", "version": 1 @@ -663,7 +750,7 @@ { "data": { "application/scrapbook.scrap.json+json": { - "data": 0.8913273009038569, + "data": 0.8826569624491233, "encoder": "json", "name": "f1", "version": 1 @@ -688,9 +775,9 @@ ], "metadata": { "kernelspec": { - "display_name": "nlp_gpu", + "display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)", "language": "python", - "name": "nlp_gpu" + "name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921" }, "language_info": { "codemirror_mode": { diff --git a/examples/text_classification/tc_multi_languages_transformers.ipynb b/examples/text_classification/tc_multi_languages_transformers.ipynb index 437c95cfb..d8dfd9244 100644 --- a/examples/text_classification/tc_multi_languages_transformers.ipynb +++ b/examples/text_classification/tc_multi_languages_transformers.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -183,32 +183,108 @@ " \n", " \n", " 15\n", - " roberta-base\n", + " bert-base-japanese\n", " \n", " \n", " 16\n", - " roberta-large\n", + " bert-base-japanese-whole-word-masking\n", " \n", " \n", " 17\n", - " roberta-large-mnli\n", + " bert-base-japanese-char\n", " \n", " \n", " 18\n", - " xlnet-base-cased\n", + " bert-base-japanese-char-whole-word-masking\n", " \n", " \n", " 19\n", - " xlnet-large-cased\n", + " bert-base-finnish-cased-v1\n", " \n", " \n", " 20\n", - " distilbert-base-uncased\n", + " bert-base-finnish-uncased-v1\n", " \n", " \n", " 21\n", + " roberta-base\n", + " \n", + " \n", + " 22\n", + " roberta-large\n", + " \n", + " \n", + " 23\n", + " roberta-large-mnli\n", + " \n", + " \n", + " 24\n", + " distilroberta-base\n", + " \n", + " \n", + " 25\n", + " roberta-base-openai-detector\n", + " \n", + " \n", + " 26\n", + " roberta-large-openai-detector\n", + " \n", + " \n", + " 27\n", + " xlnet-base-cased\n", + " \n", + " \n", + " 28\n", + " xlnet-large-cased\n", + " \n", + " \n", + " 29\n", + " distilbert-base-uncased\n", + " \n", + " \n", + " 30\n", " distilbert-base-uncased-distilled-squad\n", " \n", + " \n", + " 31\n", + " distilbert-base-german-cased\n", + " \n", + " \n", + " 32\n", + " distilbert-base-multilingual-cased\n", + " \n", + " \n", + " 33\n", + " albert-base-v1\n", + " \n", + " \n", + " 34\n", + " albert-large-v1\n", + " \n", + " \n", + " 35\n", + " albert-xlarge-v1\n", + " \n", + " \n", + " 36\n", + " albert-xxlarge-v1\n", 
+ " \n", + " \n", + " 37\n", + " albert-base-v2\n", + " \n", + " \n", + " 38\n", + " albert-large-v2\n", + " \n", + " \n", + " 39\n", + " albert-xlarge-v2\n", + " \n", + " \n", + " 40\n", + " albert-xxlarge-v2\n", + " \n", " \n", "\n", "" @@ -230,13 +306,32 @@ "12 bert-base-cased-finetuned-mrpc\n", "13 bert-base-german-dbmdz-cased\n", "14 bert-base-german-dbmdz-uncased\n", - "15 roberta-base\n", - "16 roberta-large\n", - "17 roberta-large-mnli\n", - "18 xlnet-base-cased\n", - "19 xlnet-large-cased\n", - "20 distilbert-base-uncased\n", - "21 distilbert-base-uncased-distilled-squad" + "15 bert-base-japanese\n", + "16 bert-base-japanese-whole-word-masking\n", + "17 bert-base-japanese-char\n", + "18 bert-base-japanese-char-whole-word-masking\n", + "19 bert-base-finnish-cased-v1\n", + "20 bert-base-finnish-uncased-v1\n", + "21 roberta-base\n", + "22 roberta-large\n", + "23 roberta-large-mnli\n", + "24 distilroberta-base\n", + "25 roberta-base-openai-detector\n", + "26 roberta-large-openai-detector\n", + "27 xlnet-base-cased\n", + "28 xlnet-large-cased\n", + "29 distilbert-base-uncased\n", + "30 distilbert-base-uncased-distilled-squad\n", + "31 distilbert-base-german-cased\n", + "32 distilbert-base-multilingual-cased\n", + "33 albert-base-v1\n", + "34 albert-large-v1\n", + "35 albert-xlarge-v1\n", + "36 albert-xxlarge-v1\n", + "37 albert-base-v2\n", + "38 albert-large-v2\n", + "39 albert-xlarge-v2\n", + "40 albert-xxlarge-v2" ] }, "execution_count": 3, @@ -264,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -281,7 +376,7 @@ " 'num_train_epochs': 5,\n", " 'num_gpus': 2,\n", " 'batch_size': 16,\n", - " 'verbose': True,\n", + " 'verbose': False,\n", " 'load_dataset_func': None,\n", " 'get_labels_func': None\n", "}\n", @@ -325,9 +420,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 80.1k/80.1k [00:02<00:00, 30.8kKB/s]\n", + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", + " FutureWarning)\n" + ] + } + ], "source": [ "train_dataloader, test_dataloader, label_encoder, test_labels = CONFIG['load_dataset_func'](\n", " local_path=CONFIG['local_path'],\n", @@ -354,11 +459,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", + " warnings.warn('Was asked to gather along dimension 0, but all '\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training time : 0.190 hrs\n" + ] + } + ], "source": [ "model = SequenceClassifier(\n", " model_name=CONFIG['model_name'],\n", @@ -390,9 +511,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction time : 0.021 hrs\n" + ] + } + ], "source": [ "with Timer() as t:\n", " preds = 
model.predict(\n", @@ -422,11 +551,11 @@ "text": [ " precision recall f1-score support\n", "\n", - " culture 0.89 0.89 0.89 843\n", - " diverse 0.99 0.99 0.99 1738\n", - " economy 0.96 0.96 0.96 661\n", - " politics 0.94 0.94 0.94 530\n", - " sports 0.87 0.87 0.87 580\n", + " culture 0.93 0.94 0.93 548\n", + " diverse 0.94 0.94 0.94 640\n", + " economy 0.90 0.88 0.89 570\n", + " politics 0.87 0.88 0.88 809\n", + " sports 0.99 0.98 0.99 1785\n", "\n", " micro avg 0.94 0.94 0.94 4352\n", " macro avg 0.93 0.93 0.93 4352\n", @@ -449,9 +578,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.94, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.94, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.94, + "encoder": "json", + "name": "f1", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "f1" + } + }, + "output_type": "display_data" + } + ], "source": [ "# for testing\n", "report_splits = report.split('\\n')[-2].split()\n", @@ -463,11 +647,10 @@ } ], "metadata": { - "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)", "language": "python", - "name": "python3" + "name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921" }, "language_info": { "codemirror_mode": { diff --git a/tests/integration/test_notebooks_text_classification.py b/tests/integration/test_notebooks_text_classification.py index 8f00107eb..97eb9d6d7 100644 --- a/tests/integration/test_notebooks_text_classification.py +++ b/tests/integration/test_notebooks_text_classification.py @@ -33,8 +33,8 @@ def test_tc_mnli_transformers(notebooks, tmp): ), ) result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict - assert pytest.approx(result["accuracy"], 0.89, abs=ABS_TOL) - assert pytest.approx(result["f1"], 0.89, abs=ABS_TOL) + assert pytest.approx(result["accuracy"], 0.885, abs=ABS_TOL) + assert pytest.approx(result["f1"], 0.885, abs=ABS_TOL) @pytest.mark.integration diff --git a/utils_nlp/dataset/bbc_hindi.py b/utils_nlp/dataset/bbc_hindi.py index c8212cd63..08a779049 100644 --- a/utils_nlp/dataset/bbc_hindi.py +++ b/utils_nlp/dataset/bbc_hindi.py @@ -7,24 +7,22 @@ https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1 """ -import os -import pandas as pd import logging -import numpy as np +import os import tarfile - from tempfile import TemporaryDirectory + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.url_utils import maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.sequence_classification import Processor -from sklearn.preprocessing import LabelEncoder -from sklearn.model_selection import train_test_split - -URL = ( - 
"https://github.com/NirantK/hindi2vec/releases/" - "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz" -) +URL = "https://github.com/NirantK/hindi2vec/releases/" "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz" def load_pandas_df(local_cache_path=TemporaryDirectory().name): @@ -49,19 +47,9 @@ def load_pandas_df(local_cache_path=TemporaryDirectory().name): train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv") test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv") - train_df = pd.read_csv( - train_csv_file_path, - sep="\t", - encoding='utf-8', - header=None - ) + train_df = pd.read_csv(train_csv_file_path, sep="\t", encoding="utf-8", header=None) - test_df = pd.read_csv( - test_csv_file_path, - sep="\t", - encoding='utf-8', - header=None - ) + test_df = pd.read_csv(test_csv_file_path, sep="\t", encoding="utf-8", header=None) train_df = train_df.fillna("") test_df = test_df.fillna("") @@ -80,7 +68,7 @@ def load_tc_dataset( cache_dir=TemporaryDirectory().name, max_len=MAX_SEQ_LEN, batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the multinli dataset and split into training and testing datasets. @@ -114,9 +102,9 @@ def load_tc_dataset( Returns: tuple. The tuple contains four elements: - train_dataload (DataLoader): a PyTorch DataLoader instance for training. + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. @@ -140,12 +128,8 @@ def load_tc_dataset( if test_fraction < 0 or test_fraction >= 1.0: logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction)) test_fraction = 0.25 - - train_df, test_df = train_test_split( - all_df, - train_size=(1.0 - test_fraction), - random_state=random_seed - ) + + train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed) if train_sample_ratio > 1.0: train_sample_ratio = 1.0 @@ -153,7 +137,7 @@ def load_tc_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -171,35 +155,17 @@ def load_tc_dataset( test_labels = label_encoder.transform(test_df[label_col]) test_df[label_col] = test_labels - processor = Processor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) - train_dataloader = processor.create_dataloader_from_df( - df=train_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=True, - distributed=False + train_dataset = processor.dataset_from_dataframe( + df=train_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True) - test_dataloader = processor.create_dataloader_from_df( - df=test_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=False, - distributed=False + test_dataset = 
processor.dataset_from_dataframe( + df=test_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False) return (train_dataloader, test_dataloader, label_encoder, test_labels) diff --git a/utils_nlp/dataset/dac.py b/utils_nlp/dataset/dac.py index c692dfb56..750e95915 100644 --- a/utils_nlp/dataset/dac.py +++ b/utils_nlp/dataset/dac.py @@ -8,18 +8,19 @@ arabic-text-classification-using-deep-learning-technics/") """ -import os -import pandas as pd import logging +import os +from tempfile import TemporaryDirectory + import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder -from tempfile import TemporaryDirectory +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.url_utils import extract_zip, maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.sequence_classification import Processor -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder - URL = ( "https://data.mendeley.com/datasets/v524p5dhpj/2" @@ -58,7 +59,7 @@ def load_tc_dataset( cache_dir=TemporaryDirectory().name, max_len=MAX_SEQ_LEN, batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the multinli dataset and split into training and testing datasets. @@ -92,9 +93,9 @@ def load_tc_dataset( Returns: tuple. The tuple contains four elements: - train_dataload (DataLoader): a PyTorch DataLoader instance for training. + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. @@ -104,11 +105,8 @@ def load_tc_dataset( label IDs by using the label_encoder.transform function. 
""" - # download and load the original dataset - all_df = load_pandas_df( - local_cache_path=local_path, - num_rows=None - ) + # download and load the original dataset + all_df = load_pandas_df(local_cache_path=local_path, num_rows=None) # set the text and label columns text_col = all_df.columns[0] @@ -123,12 +121,8 @@ def load_tc_dataset( if test_fraction < 0 or test_fraction >= 1.0: logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction)) test_fraction = 0.25 - - train_df, test_df = train_test_split( - all_df, - train_size=(1.0 - test_fraction), - random_state=random_seed - ) + + train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed) if train_sample_ratio > 1.0: train_sample_ratio = 1.0 @@ -136,7 +130,7 @@ def load_tc_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -149,35 +143,17 @@ def load_tc_dataset( if test_sample_ratio < 1.0: test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True) - processor = Processor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) - train_dataloader = processor.create_dataloader_from_df( - df=train_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=True, - distributed=False + train_dataset = processor.dataset_from_dataframe( + df=train_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True) - test_dataloader = processor.create_dataloader_from_df( - df=test_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=False, - distributed=False + test_dataset = processor.dataset_from_dataframe( + df=test_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False) # the DAC dataset already converted the labels to label ID format test_labels = test_df[label_col] diff --git a/utils_nlp/dataset/multinli.py b/utils_nlp/dataset/multinli.py index 62b772cd1..adab4c925 100644 --- a/utils_nlp/dataset/multinli.py +++ b/utils_nlp/dataset/multinli.py @@ -7,18 +7,19 @@ https://www.nyu.edu/projects/bowman/multinli/ """ +import logging import os +from tempfile import TemporaryDirectory import pandas as pd -import logging +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder -from tempfile import TemporaryDirectory +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.data_loaders import DaskJSONLoader from utils_nlp.dataset.url_utils import extract_zip, maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.sequence_classification import Processor -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder URL = "http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip" DATA_FILES = { @@ -63,9 +64,7 @@ def 
load_pandas_df(local_cache_path=".", file_split="train"): return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True) -def get_generator( - local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None -): +def get_generator(local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None): """ Returns an extracted dataset as a random batch generator that yields pandas dataframes. Args: @@ -85,9 +84,7 @@ def get_generator( except Exception as e: raise e - loader = DaskJSONLoader( - os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size - ) + loader = DaskJSONLoader(os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size) return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches) @@ -103,7 +100,7 @@ def load_tc_dataset( cache_dir=TemporaryDirectory().name, max_len=MAX_SEQ_LEN, batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the multinli dataset and split into training and testing datasets. @@ -137,9 +134,9 @@ def load_tc_dataset( Returns: tuple. The tuple contains four elements: - train_dataload (DataLoader): a PyTorch DataLoader instance for training. + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. @@ -150,10 +147,7 @@ def load_tc_dataset( """ # download and load the original dataset - all_df = load_pandas_df( - local_cache_path=local_path, - file_split="train" - ) + all_df = load_pandas_df(local_cache_path=local_path, file_split="train") # select the examples corresponding to one of the entailment labels (neutral # in this case) to avoid duplicate rows, as the sentences are not unique, @@ -169,12 +163,8 @@ def load_tc_dataset( if test_fraction < 0 or test_fraction >= 1.0: logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction)) test_fraction = 0.25 - - train_df, test_df = train_test_split( - all_df, - train_size=(1.0 - test_fraction), - random_state=random_seed - ) + + train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed) if train_sample_ratio > 1.0: train_sample_ratio = 1.0 @@ -182,7 +172,7 @@ def load_tc_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -200,35 +190,17 @@ def load_tc_dataset( test_labels = label_encoder.transform(test_df[label_col]) test_df[label_col] = test_labels - processor = Processor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) - train_dataloader = processor.create_dataloader_from_df( - df=train_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=True, - distributed=False + train_dataset = processor.dataset_from_dataframe( + df=train_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + train_dataloader = 
dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True) - test_dataloader = processor.create_dataloader_from_df( - df=test_df, - text_col=text_col, - label_col=label_col, - max_len=max_len, - text2_col=None, - batch_size=batch_size, - num_gpus=num_gpus, - shuffle=False, - distributed=False + test_dataset = processor.dataset_from_dataframe( + df=test_df, text_col=text_col, label_col=label_col, max_len=max_len, ) + test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False) return (train_dataloader, test_dataloader, label_encoder, test_labels) From c7d3409dfdc52d56f86027646865aa02d1d6de98 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 14 Jan 2020 17:08:58 +0000 Subject: [PATCH 08/24] update QA utils and tests --- ..._models_transformers_question_answering.py | 76 +++--- .../models/transformers/question_answering.py | 234 +++++++----------- 2 files changed, 128 insertions(+), 182 deletions(-) mode change 100644 => 100755 utils_nlp/models/transformers/question_answering.py diff --git a/tests/unit/test_models_transformers_question_answering.py b/tests/unit/test_models_transformers_question_answering.py index 010bf5c5d..7f14f0d0e 100644 --- a/tests/unit/test_models_transformers_question_answering.py +++ b/tests/unit/test_models_transformers_question_answering.py @@ -1,18 +1,20 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import pytest import os + +import pytest +import torch + +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.models.transformers.datasets import QADataset from utils_nlp.models.transformers.question_answering import ( - QAProcessor, - AnswerExtractor, CACHED_EXAMPLES_TEST_FILE, CACHED_FEATURES_TEST_FILE, + AnswerExtractor, + QAProcessor, ) -import torch - NUM_GPUS = max(1, torch.cuda.device_count()) BATCH_SIZE = 8 @@ -109,9 +111,7 @@ def qa_test_data(qa_test_df, tmp_module): feature_cache_dir=tmp_module, ) - qa_processor_distilbert = QAProcessor( - model_name="distilbert-base-uncased", cache_dir=tmp_module - ) + qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module) train_features_distilbert = qa_processor_distilbert.preprocess( train_dataset, batch_size=BATCH_SIZE, @@ -153,15 +153,9 @@ def qa_test_data(qa_test_df, tmp_module): def test_QAProcessor(qa_test_data, tmp_module): for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]: qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module) - qa_processor.preprocess( - qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module - ) - qa_processor.preprocess( - qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module - ) - qa_processor.preprocess( - qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module - ) + qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module) + qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module) + qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module) # test unsupported model type with pytest.raises(ValueError): @@ -169,51 +163,49 @@ def test_QAProcessor(qa_test_data, tmp_module): # test training data has no ground truth exception with pytest.raises(Exception): - qa_processor.preprocess( - qa_test_data["test_dataset"], 
is_training=True, feature_cache_dir=tmp_module - ) + qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module) # test when answer start is a list, but answer text is not with pytest.raises(Exception): qa_processor.preprocess( - qa_test_data["train_dataset_start_text_mismatch"], - is_training=True, - feature_cache_dir=tmp_module, + qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module, ) # test when training data has multiple answers with pytest.raises(Exception): qa_processor.preprocess( - qa_test_data["train_dataset_multi_answers"], - is_training=True, - feature_cache_dir=tmp_module, + qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module, ) def test_AnswerExtractor(qa_test_data, tmp_module): - # test bert + # bert qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module) - qa_extractor_bert.fit(qa_test_data["train_features_bert"], cache_model=True) + train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"]) + test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False) + qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True) # test saving fine-tuned model model_output_dir = os.path.join(tmp_module, "fine_tuned") assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin")) assert os.path.exists(os.path.join(model_output_dir, "config.json")) - qa_extractor_from_cache = AnswerExtractor( - cache_dir=tmp_module, load_model_from_dir=model_output_dir - ) - qa_extractor_from_cache.predict(qa_test_data["test_features_bert"]) + qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir) + qa_extractor_from_cache.predict(test_loader_bert, verbose=False) + # xlnet + train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"]) + test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False) qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module) - qa_extractor_xlnet.fit(qa_test_data["train_features_xlnet"], cache_model=False) - qa_extractor_xlnet.predict(qa_test_data["test_features_xlnet"]) + qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False) + qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False) - qa_extractor_distilbert = AnswerExtractor( - model_name="distilbert-base-uncased", cache_dir=tmp_module - ) - qa_extractor_distilbert.fit(qa_test_data["train_features_distilbert"], cache_model=False) - qa_extractor_distilbert.predict(qa_test_data["test_features_distilbert"]) + # distilbert + train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"]) + test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False) + qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module) + qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False) + qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False) def test_postprocess_bert_answer(qa_test_data, tmp_module): @@ -226,8 +218,9 @@ def test_postprocess_bert_answer(qa_test_data, tmp_module): doc_stride=32, feature_cache_dir=tmp_module, ) + test_loader = dataloader_from_dataset(test_features, shuffle=False) qa_extractor = AnswerExtractor(cache_dir=tmp_module) - predictions = qa_extractor.predict(test_features) + predictions = qa_extractor.predict(test_loader) 
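For reference, the flow these updated tests exercise is: `QAProcessor.preprocess` now returns a plain `TensorDataset`, batching is an explicit step handled by `dataloader_from_dataset`, and `AnswerExtractor.fit`/`predict` consume the resulting DataLoaders. A minimal sketch of that end-to-end pattern, assuming `train_qa_dataset` and `test_qa_dataset` are `QADataset` objects prepared elsewhere and that the batch size is an arbitrary placeholder:

    from utils_nlp.common.pytorch_utils import dataloader_from_dataset
    from utils_nlp.models.transformers.question_answering import AnswerExtractor, QAProcessor

    # feature creation: preprocess() returns a TensorDataset and no longer builds a DataLoader
    processor = QAProcessor(model_name="bert-base-cased", cache_dir=".")
    train_features = processor.preprocess(train_qa_dataset, is_training=True, feature_cache_dir=".")
    test_features = processor.preprocess(test_qa_dataset, is_training=False, feature_cache_dir=".")

    # batching: a separate, explicit step shared with the other task utilities
    train_loader = dataloader_from_dataset(train_features, batch_size=8, shuffle=True)
    test_loader = dataloader_from_dataset(test_features, batch_size=8, shuffle=False)

    # training and scoring: fit() and predict() take DataLoaders rather than feature datasets
    extractor = AnswerExtractor(cache_dir=".")
    extractor.fit(train_loader, verbose=False, cache_model=False)
    results = extractor.predict(test_loader, verbose=False)
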
qa_processor.postprocess( results=predictions, @@ -260,8 +253,9 @@ def test_postprocess_xlnet_answer(qa_test_data, tmp_module): doc_stride=32, feature_cache_dir=tmp_module, ) + test_loader = dataloader_from_dataset(test_features, shuffle=False) qa_extractor = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module) - predictions = qa_extractor.predict(test_features) + predictions = qa_extractor.predict(test_loader) qa_processor.postprocess( results=predictions, diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py old mode 100644 new mode 100755 index 4f48e58d9..99cd59724 --- a/utils_nlp/models/transformers/question_answering.py +++ b/utils_nlp/models/transformers/question_answering.py @@ -17,38 +17,30 @@ # Modifications copyright © Microsoft Corporation -import os -import logging -from tqdm import tqdm import collections import json +import logging import math -import jsonlines +import os +import jsonlines import torch -from torch.utils.data import TensorDataset, SequentialSampler, DataLoader, RandomSampler -from torch.utils.data.distributed import DistributedSampler - -from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize +from torch.utils.data import TensorDataset +from tqdm import tqdm +from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering -from transformers.modeling_xlnet import ( - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetForQuestionAnswering, -) -from transformers.modeling_distilbert import ( - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForQuestionAnswering, -) +from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering +from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering +from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize -from utils_nlp.common.pytorch_utils import get_device +from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer MODEL_CLASS = {} MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}) -MODEL_CLASS.update( - {k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP} -) +MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) +MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) # cached files during preprocessing # these are used in postprocessing to generate the final answer texts @@ -85,9 +77,7 @@ class QAProcessor: cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". """ - def __init__( - self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="." 
- ): + def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."): self.model_name = model_name self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False @@ -116,13 +106,14 @@ def model_type(self): return self._model_type @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ Creates an input dictionary given a model name. Args: batch (tuple): A tuple containing input ids, attention mask, segment ids, and labels tensors. + device (torch.device): A PyTorch device. model_name (bool, optional): Model name used to format the inputs. train_mode (bool, optional): Training mode flag. Defaults to True. @@ -131,6 +122,7 @@ def get_inputs(batch, model_name, train_mode=True): dict: Dictionary containing input ids, segment ids, masks, and labels. Labels are only returned when train_mode is True. """ + batch = tuple(t.to(device) for t in batch) model_type = model_name.split("-")[0] inputs = {"input_ids": batch[0], "attention_mask": batch[1]} @@ -191,6 +183,8 @@ def preprocess( directory. These files are required during postprocessing to generate the final answer texts from predicted answer start and answer end indices. Defaults to "./cached_qa_features". + Returns: + DataSet: A Pytorch DataSet. """ if not os.path.exists(feature_cache_dir): @@ -223,9 +217,7 @@ def preprocess( qa_examples.append(qa_example_cur) - qa_examples_json.append( - {"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens} - ) + qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}) features_cur = _create_qa_features( qa_example_cur, @@ -271,28 +263,13 @@ def preprocess( start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) qa_dataset = TensorDataset( - input_ids, - input_mask, - segment_ids, - start_positions, - end_positions, - cls_index, - p_mask, + input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, ) else: unique_id_all = torch.tensor(unique_id_all, dtype=torch.long) - qa_dataset = TensorDataset( - input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all - ) - - if num_gpus is not None: - batch_size = batch_size * max(1, num_gpus) - if distributed: - sampler = DistributedSampler(qa_dataset) - else: - sampler = RandomSampler(qa_dataset) if is_training else SequentialSampler(qa_dataset) + qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all) - return DataLoader(qa_dataset, sampler=sampler, batch_size=batch_size) + return qa_dataset def postprocess( self, @@ -420,14 +397,7 @@ class QAResult(QAResult_): QAResultExtended_ = collections.namedtuple( "QAResultExtended", - [ - "unique_id", - "start_top_log_probs", - "start_top_index", - "end_top_log_probs", - "end_top_index", - "cls_logits", - ], + ["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",], ) @@ -489,18 +459,16 @@ def list_supported_models(): def fit( self, train_dataloader, - num_gpus=None, num_epochs=1, - learning_rate=5e-5, - max_grad_norm=1.0, max_steps=-1, gradient_accumulation_steps=1, - warmup_steps=0, + num_gpus=None, + gpu_ids=None, + local_rank=-1, weight_decay=0.0, + learning_rate=5e-5, adam_epsilon=1e-8, - fp16=False, - fp16_opt_level="O1", - 
local_rank=-1, + warmup_steps=0, verbose=True, seed=None, cache_model=True, @@ -509,31 +477,30 @@ def fit( Fine-tune pre-trained transofmer models for question answering. Args: - train_dataloader (Dataloader): Dataloader for the training data. - num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will - be used. If set to 0 or GPUs are not available, CPU device will - be used. Defaults to None. + train_dataloader (Dataloader): A PyTorch DataLoader to be used for training. num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. + num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will + be used. If set to 0 or GPUs are not available, CPU device will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to + -1, which means non-distributed training. + weight_decay (float, optional): Weight decay to apply after each parameter update. + Defaults to 0.0. learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to 5e-5. - max_grad_norm (float, optional): Maximum gradient norm for gradient clipping. - Defaults to 1.0. - max_steps (int, optional): Maximum number of training steps. If specified, - `num_epochs` will be ignored. Defaults to -1. - gradient_accumulation_steps (int, optional): Number of batches to accumulate - gradients on between each model parameter update. Defaults to 1. + adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8. warmup_steps (int, optional): Number of steps taken to increase learning rate from 0 to `learning rate`. Defaults to 0. - weight_decay (float, optional): Weight decay to apply after each parameter update. - Defaults to 0.0. - adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8. - fp16 (bool, optional): Whether to use 16-bit (mixed) precision (through NVIDIA apex) - instead of 32-bit. Defaults to False. - fp16_opt_level (str, optional): For fp16: Apex AMP optimization level selected in - ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html. - Defaults to "O1", - local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to - -1, which means non-distributed training. verbose (bool, optional): Whether to print out the training log. Defaults to True. seed (int, optional): Random seed used to improve reproducibility. Defaults to None. cache_model (bool, optional): Whether to save the fine-tuned model. 
If True, @@ -542,28 +509,47 @@ def fit( """ + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + + # init optimizer + optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) + + # compute the max number of training steps + max_steps = compute_training_steps( + train_dataloader, + num_epochs=num_epochs, + max_steps=max_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + + # inin scheduler + scheduler = Transformer.get_default_scheduler( + optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, + ) + + # fine tune super().fine_tune( train_dataloader=train_dataloader, + device=device, + num_gpus=num_gpus, get_inputs=QAProcessor.get_inputs, max_steps=max_steps, - num_train_epochs=num_epochs, - max_grad_norm=max_grad_norm, gradient_accumulation_steps=gradient_accumulation_steps, - n_gpu=num_gpus, - weight_decay=weight_decay, - learning_rate=learning_rate, - adam_epsilon=adam_epsilon, - warmup_steps=warmup_steps, - fp16=fp16, - fp16_opt_level=fp16_opt_level, + optimizer=optimizer, + scheduler=scheduler, local_rank=local_rank, verbose=verbose, seed=seed, ) + if cache_model: self.save_model() - def predict(self, test_dataloader, num_gpus=None, verbose=True): + def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Predicts answer start and end logits. @@ -573,8 +559,9 @@ def predict(self, test_dataloader, num_gpus=None, verbose=True): num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. - local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to - -1, which means non-distributed. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. verbose (bool, optional): Whether to print out the predicting log. Defaults to True. 
Returns: @@ -584,25 +571,16 @@ def predict(self, test_dataloader, num_gpus=None, verbose=True): def _to_list(tensor): return tensor.detach().cpu().tolist() + # get device device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - - if isinstance(self.model, torch.nn.DataParallel): - self.model = self.model.module - - if num_gpus > 1: - self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus))) - - self.model.to(device) - self.model.eval() + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) all_results = [] for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose): - batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = QAProcessor.get_inputs(batch, self.model_name, train_mode=False) - + inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False) outputs = self.model(**inputs) - unique_id_tensor = batch[5] for i, u_id in enumerate(unique_id_tensor): @@ -617,9 +595,7 @@ def _to_list(tensor): ) else: result = QAResult( - unique_id=u_id.item(), - start_logits=_to_list(outputs[0][i]), - end_logits=_to_list(outputs[1][i]), + unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]), ) all_results.append(result) torch.cuda.empty_cache() @@ -783,9 +759,7 @@ def postprocess_bert_answer( # Sort by the sum of the start and end logits in ascending order, # so that the first element is the most probable answer - prelim_predictions = sorted( - prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True - ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) seen_predictions = {} nbest = [] @@ -818,19 +792,11 @@ def postprocess_bert_answer( final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit - ) - ) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if unanswerable_exists: if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", start_logit=null_start_logit, end_logit=null_end_logit - ) - ) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. 
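The `predict` method refactored above follows the device-handling sequence now shared across these classes: resolve the device once with `get_device`, move (and, for multiple GPUs, wrap) the model with `move_model_to_device`, and let `get_inputs` move each batch to that device inside the scoring loop. A rough sketch of that shared pattern, where `model`, `loader`, and the model name are placeholders for whatever the caller supplies:

    import torch
    from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
    from utils_nlp.models.transformers.question_answering import QAProcessor

    # resolve the device and the number of usable GPUs (falls back to CPU)
    device, num_gpus = get_device(num_gpus=None, local_rank=-1)

    # move the model; it is wrapped in DataParallel when more than one GPU is used
    model = move_model_to_device(model, device, num_gpus)

    model.eval()
    with torch.no_grad():
        for batch in loader:
            # get_inputs now receives the device and moves the batch tensors itself
            inputs = QAProcessor.get_inputs(batch, device, "bert-base-cased", train_mode=False)
            outputs = model(**inputs)
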
@@ -874,9 +840,7 @@ def postprocess_bert_answer( all_probs[example["qa_id"]] = nbest_json[0]["probability"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = ( - score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) - ) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example["qa_id"]] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example["qa_id"]] = "" @@ -1042,9 +1006,7 @@ def postprocess_xlnet_answer( ) ) - prelim_predictions = sorted( - prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True - ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) seen_predictions = {} nbest = [] @@ -1075,20 +1037,14 @@ def postprocess_xlnet_answer( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = _get_final_text( - tok_text, orig_text, tokenizer.do_lower_case, verbose_logging - ) + final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit - ) - ) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. @@ -1235,9 +1191,7 @@ def _is_whitespace(c): actual_text = " ".join(d_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(a_text)) if actual_text.find(cleaned_answer_text) == -1: - logger.warning( - "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text - ) + logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return else: start_position = -1 @@ -1696,9 +1650,7 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text - ) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using From d0a3a13567ce6280f9090e76401dd7f7bf238347 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 14 Jan 2020 19:47:22 +0000 Subject: [PATCH 09/24] minor edits to seq classification utils --- .../transformers/sequence_classification.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index a86f27608..5199f2d3d 100755 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -2,29 +2,16 @@ # Licensed under the MIT License. 
import numpy as np -import torch -from transformers.modeling_albert import ( - ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - AlbertForSequenceClassification, -) -from transformers.modeling_bert import ( - BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BertForSequenceClassification, -) +from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForSequenceClassification +from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification from transformers.modeling_distilbert import ( DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForSequenceClassification, ) -from transformers.modeling_roberta import ( - ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - RobertaForSequenceClassification, -) -from transformers.modeling_xlnet import ( - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetForSequenceClassification, -) +from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification +from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification -from utils_nlp.common.pytorch_utils import get_device, move_model_to_device, compute_training_steps +from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet @@ -279,7 +266,7 @@ def fit( gradient_accumulation_steps=gradient_accumulation_steps, ) - # inin scheduler + # init scheduler scheduler = Transformer.get_default_scheduler( optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, ) From 8bb1930ccd2f407edcb9fe8b5f869e3243a41449 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Tue, 14 Jan 2020 19:58:10 +0000 Subject: [PATCH 10/24] update NER utils --- utils_nlp/models/transformers/common.py | 2 + .../transformers/named_entity_recognition.py | 222 +++++++++--------- 2 files changed, 118 insertions(+), 106 deletions(-) mode change 100644 => 100755 utils_nlp/models/transformers/named_entity_recognition.py diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index ccaf48b46..2fa12af53 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -183,6 +183,8 @@ def fine_tune( epoch_iterator.close() break + #del [batch] + #torch.cuda.empty_cache() return global_step, tr_loss / global_step def predict(self, eval_dataloader, device, get_inputs, verbose=True): diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py old mode 100644 new mode 100755 index 9e11e3e14..169bb21c8 --- a/utils_nlp/models/transformers/named_entity_recognition.py +++ b/utils_nlp/models/transformers/named_entity_recognition.py @@ -2,20 +2,19 @@ # Licensed under the MIT License. 
import logging +from collections import Iterable + import numpy as np import torch -import torch.nn as nn - -from collections import Iterable -from torch.utils.data import TensorDataset from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification -from utils_nlp.common.pytorch_utils import get_device -from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler -from torch.utils.data.distributed import DistributedSampler +from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device +from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer -TC_MODEL_CLASS = {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP} +TC_MODEL_CLASS = {} +TC_MODEL_CLASS.update({k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) +TC_MODEL_CLASS.update({k: DistilBertForTokenClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}) class TokenClassificationProcessor: @@ -40,27 +39,36 @@ def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."): ) @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ - Produce a dictionary object for model training or prediction. + Creates an input dictionary given a model name. Args: - model_name (str): The pretained model name. - train_mode (bool, optional): Whether it's for model training. Set it to False if - it's for testing and it won't have the 'labels' data field. - Defaults to True, for model training. + batch (tuple): A tuple containing input ids, attention mask, + segment ids, and labels tensors. + device (torch.device): A PyTorch device. + model_name (bool, optional): Model name used to format the inputs. + train_mode (bool, optional): Training mode flag. + Defaults to True. Returns: - dict: A dictionary object contains all needed information for training or testing. + dict: Dictionary containing input ids, segment ids, masks, and labels. + Labels are only returned when train_mode is True. 
""" + batch = tuple(t.to(device) for t in batch) + if model_name.split("-")[0] in ["bert", "distilbert"]: + if train_mode: + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + else: + inputs = {"input_ids": batch[0], "attention_mask": batch[1]} - if model_name.split("-")[0] not in ["bert"]: - raise ValueError("Model not supported: {}".format(model_name)) + # distilbert doesn't support segment ids + if model_name.split("-")[0] not in ["distilbert"]: + inputs["token_type_ids"] = batch[2] - if train_mode: - return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + return inputs else: - return {"input_ids": batch[0], "attention_mask": batch[1]} + raise ValueError("Model not supported: {}".format(model_name)) @staticmethod def create_label_map(label_lists, trailing_piece_tag="X"): @@ -87,9 +95,7 @@ def create_label_map(label_lists, trailing_piece_tag="X"): label_map[trailing_piece_tag] = len(label_set) return label_map - def preprocess_for_bert( - self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X" - ): + def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"): """ Tokenize and preprocesses input word lists, involving the following steps 0. WordPiece tokenization. @@ -144,9 +150,7 @@ def _is_iterable_but_not_string(obj): return isinstance(obj, Iterable) and not isinstance(obj, str) if max_len > MAX_SEQ_LEN: - logging.warning( - "Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN) - ) + logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)) max_len = MAX_SEQ_LEN if not _is_iterable_but_not_string(text): @@ -179,9 +183,7 @@ def _is_iterable_but_not_string(obj): for t, t_labels in zip(text, labels): if len(t) != len(t_labels): raise ValueError( - "The number of words is {0}, but the number of labels is {1}.".format( - len(t), len(t_labels) - ) + "The number of words is {0}, but the number of labels is {1}.".format(len(t), len(t_labels)) ) new_labels = [] @@ -195,11 +197,7 @@ def _is_iterable_but_not_string(obj): new_tokens.append(sub_word) if len(new_tokens) > max_len: - logging.warn( - "Text after tokenization with length {} has been truncated".format( - len(new_tokens) - ) - ) + logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens))) new_tokens = new_tokens[:max_len] new_labels = new_labels[:max_len] input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens) @@ -216,9 +214,7 @@ def _is_iterable_but_not_string(obj): input_mask += padding new_labels += label_padding - trailing_token_mask_all.append( - [True if label != trailing_piece_tag else False for label in new_labels] - ) + trailing_token_mask_all.append([True if label != trailing_piece_tag else False for label in new_labels]) if label_map: label_ids = [label_map[label] for label in new_labels] @@ -244,21 +240,6 @@ def _is_iterable_but_not_string(obj): ) return td - def create_dataloader_from_dataset( - self, dataset, shuffle=False, batch_size=32, num_gpus=None, distributed=False - ): - if num_gpus is None: - num_gpus = torch.cuda.device_count() - - batch_size = batch_size * max(1, num_gpus) - - if distributed: - sampler = DistributedSampler(dataset) - else: - sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset) - - return DataLoader(dataset, sampler=sampler, batch_size=batch_size) - class TokenClassifier(Transformer): """ @@ -275,10 +256,7 @@ class TokenClassifier(Transformer): def 
__init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."): super().__init__( - model_class=TC_MODEL_CLASS, - model_name=model_name, - num_labels=num_labels, - cache_dir=cache_dir, + model_class=TC_MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir, ) @staticmethod @@ -289,7 +267,10 @@ def fit( self, train_dataloader, num_epochs=1, + max_steps=-1, + gradient_accumulation_steps=1, num_gpus=None, + gpu_ids=None, local_rank=-1, weight_decay=0.0, learning_rate=5e-5, @@ -299,73 +280,104 @@ def fit( seed=None, ): """ - Fit the TokenClassifier model using the given training dataset. + Fine-tunes a pre-trained token classification model. Args: - train_dataloader (DataLoader): DataLoader instance for training. - num_epochs (int, optional): Number of training epochs. - Defaults to 1. + train_dataloader (Dataloader): A PyTorch DataLoader to be used for training. + num_epochs (int, optional): Number of training epochs. Defaults to 1. + max_steps (int, optional): Total number of training steps. + If set to a positive value, it overrides num_epochs. + Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs. + Defualts to -1. + gradient_accumulation_steps (int, optional): Number of steps to accumulate + before performing a backward/update pass. + Default to 1. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will - be used. If set to 0 or GPUs are not available, CPU device will - be used. Defaults to None. - local_rank (int, optional): Whether need to do distributed training. - Defaults to -1, no distributed training. - weight_decay (float, optional): Weight decay rate. - Defaults to 0. - learning_rate (float, optional): The learning rate. - Defaults to 5e-5. - adam_espilon (float, optional): The 'eps' parameter for the 'AdamW' optimizer. - Defaults to 1e-8. - warmup_steps (int, optional): Number of warmup steps for 'WarmupLinearSchedule'. - Defaults to 0. - verbose (bool, optional): Verbose model. - Defaults to False. - seed (int, optional): The seed for the transformers. - Defaults to None, use the default seed. + be used. If set to 0 or GPUs are not available, CPU device will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to + -1, which means non-distributed training. + weight_decay (float, optional): Weight decay to apply after each parameter update. + Defaults to 0.0. + learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to + 5e-5. + adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8. + warmup_steps (int, optional): Number of steps taken to increase learning rate from 0 + to `learning rate`. Defaults to 0. + verbose (bool, optional): Whether to print out the training log. Defaults to True. + seed (int, optional): Random seed used to improve reproducibility. Defaults to None. 
""" + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + + # init optimizer + optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) + + # compute the max number of training steps + max_steps = compute_training_steps( + train_dataloader, + num_epochs=num_epochs, + max_steps=max_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + + # init scheduler + scheduler = Transformer.get_default_scheduler( + optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, + ) + + # fine tune super().fine_tune( train_dataloader=train_dataloader, - get_inputs=TokenClassificationProcessor.get_inputs, - n_gpu=num_gpus, - num_train_epochs=num_epochs, - weight_decay=weight_decay, - learning_rate=learning_rate, - adam_epsilon=adam_epsilon, - warmup_steps=warmup_steps, + device=device, + num_gpus=num_gpus, + get_inputs=Processor.get_inputs, + max_steps=max_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + optimizer=optimizer, + scheduler=scheduler, + local_rank=local_rank, verbose=verbose, seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, verbose=True): + def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ - Test on an evaluation dataset and get the token label predictions. + Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataset (TensorDataset): A TensorDataset for evaluation. + eval_dataloader (Dataloader): Dataloader for the evaluation data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will - be used. If set to 0 or GPUs are not available, CPU device will - be used. Defaults to None. - verbose (bool, optional): Verbose model. - Defaults to False. - - Returns: - ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is - [number_of_examples, sequence_length, number_of_labels]. Each - value in the ndarray is not normalized. Post-process will be needed - to get the probability for each class label. + be used. If set to 0 or GPUs are not available, CPU device will be used. + Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. + verbose (bool, optional): Whether to print out the training log. Defaults to True. + + Returns + 1darray: numpy array of predicted label indices. 
""" + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) + preds = list( super().predict( - eval_dataloader=eval_dataloader, - get_inputs=TokenClassificationProcessor.get_inputs, - n_gpu=num_gpus, - verbose=verbose, + eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, ) ) - preds_np = np.concatenate(preds) - return preds_np + preds = np.concatenate(preds) + return np.argmax(preds, axis=1) def get_predicted_token_labels(self, predictions, label_map, dataset): """ @@ -386,9 +398,7 @@ def get_predicted_token_labels(self, predictions, label_map, dataset): num_samples = len(dataset.tensors[0]) if num_samples != predictions.shape[0]: raise ValueError( - "Predictions have {0} samples, but got {1} samples in dataset".format( - predictions.shape[0], num_samples - ) + "Predictions have {0} samples, but got {1} samples in dataset".format(predictions.shape[0], num_samples) ) label_id2str = {v: k for k, v in label_map.items()} From 699092593905388378f278f4117bac8ada4b39a6 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Thu, 16 Jan 2020 18:29:15 +0000 Subject: [PATCH 11/24] additional ordering of things --- utils_nlp/dataset/bbc_hindi.py | 11 ++-- utils_nlp/dataset/dac.py | 1 - utils_nlp/dataset/wikigold.py | 56 +++++++------------ utils_nlp/models/transformers/common.py | 40 +++++++++---- .../models/transformers/question_answering.py | 14 ++--- .../transformers/sequence_classification.py | 27 ++++----- 6 files changed, 69 insertions(+), 80 deletions(-) diff --git a/utils_nlp/dataset/bbc_hindi.py b/utils_nlp/dataset/bbc_hindi.py index 08a779049..c24710680 100644 --- a/utils_nlp/dataset/bbc_hindi.py +++ b/utils_nlp/dataset/bbc_hindi.py @@ -12,7 +12,6 @@ import tarfile from tempfile import TemporaryDirectory -import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder @@ -93,7 +92,7 @@ def load_tc_dataset( cache_dir (str, optional): The default folder for saving cache files. Defaults to TemporaryDirectory().name. max_len (int, optional): Maximum length of the list of tokens. Lists longer - than this are truncated and shorter ones are padded with "O"s. + than this are truncated and shorter ones are padded with "O"s. Default value is BERT_MAX_LEN=512. batch_size (int, optional): The batch size for training and testing. Defaults to 32. @@ -105,12 +104,12 @@ def load_tc_dataset( train_dataloader (DataLoader): a PyTorch DataLoader instance for training. test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. - + label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values can be retrieved by calling the `inverse_transform` function. - + test_labels (Series): a Pandas Series of testing label (in label ID format). If - the labels are in raw label values format, we will need to transform it to + the labels are in raw label values format, we will need to transform it to label IDs by using the label_encoder.transform function. """ @@ -172,7 +171,7 @@ def load_tc_dataset( def get_label_values(label_encoder, label_ids): """ - Get the label values from label IDs. + Get the label values from label IDs. 
Args: label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance diff --git a/utils_nlp/dataset/dac.py b/utils_nlp/dataset/dac.py index 750e95915..c8af1ad87 100644 --- a/utils_nlp/dataset/dac.py +++ b/utils_nlp/dataset/dac.py @@ -12,7 +12,6 @@ import os from tempfile import TemporaryDirectory -import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder diff --git a/utils_nlp/dataset/wikigold.py b/utils_nlp/dataset/wikigold.py index 4713451fb..508d5dc56 100644 --- a/utils_nlp/dataset/wikigold.py +++ b/utils_nlp/dataset/wikigold.py @@ -7,18 +7,19 @@ https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data """ -import random +import logging import os +import random +from tempfile import TemporaryDirectory + import pandas as pd -import logging -from tempfile import TemporaryDirectory -from utils_nlp.dataset.url_utils import maybe_download +from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.dataset.ner_utils import preprocess_conll +from utils_nlp.dataset.url_utils import maybe_download from utils_nlp.models.transformers.common import MAX_SEQ_LEN from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor - URL = ( "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets" "/master/data/wikigold/CONLL-format/data/wikigold.conll.txt" @@ -91,7 +92,7 @@ def load_dataset( max_len=MAX_SEQ_LEN, trailing_piece_tag="X", batch_size=32, - num_gpus=None + num_gpus=None, ): """ Load the wikigold dataset and split into training and testing datasets. @@ -155,9 +156,7 @@ def load_dataset( """ train_df, test_df = load_train_test_dfs( - local_cache_path=local_path, - test_fraction=test_fraction, - random_seed=random_seed + local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed ) if train_sample_ratio > 1.0: @@ -166,7 +165,7 @@ def load_dataset( elif train_sample_ratio < 0: logging.error("Invalid training sample ration: {}".format(train_sample_ratio)) raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio)) - + if test_sample_ratio > 1.0: test_sample_ratio = 1.0 logging.warning("Setting the testing sample ratio to 1.0") @@ -179,47 +178,34 @@ def load_dataset( if test_sample_ratio < 1.0: test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True) - processor = TokenClassificationProcessor( - model_name=model_name, - to_lower=to_lower, - cache_dir=cache_dir - ) + processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir) label_map = TokenClassificationProcessor.create_label_map( - label_lists=train_df['labels'], - trailing_piece_tag=trailing_piece_tag + label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag ) train_dataset = processor.preprocess_for_bert( - text=train_df['sentence'], + text=train_df["sentence"], max_len=max_len, - labels=train_df['labels'], + labels=train_df["labels"], label_map=label_map, - trailing_piece_tag=trailing_piece_tag + trailing_piece_tag=trailing_piece_tag, ) test_dataset = processor.preprocess_for_bert( - text=test_df['sentence'], + text=test_df["sentence"], max_len=max_len, - labels=test_df['labels'], + labels=test_df["labels"], label_map=label_map, - trailing_piece_tag=trailing_piece_tag + trailing_piece_tag=trailing_piece_tag, ) - train_dataloader = processor.create_dataloader_from_dataset( - train_dataset, - shuffle=True, - batch_size=batch_size, - 
num_gpus=num_gpus, - distributed=False + train_dataloader = dataloader_from_dataset( + train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False ) - test_dataloader = processor.create_dataloader_from_dataset( - test_dataset, - shuffle=False, - batch_size=batch_size, - num_gpus=num_gpus, - distributed=False + test_dataloader = dataloader_from_dataset( + test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False ) return (train_dataloader, test_dataloader, label_map, test_dataset) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index ccaf48b46..7fce22c6b 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -12,7 +12,7 @@ import numpy as np import torch -from tqdm import tqdm, trange +from tqdm import tqdm from transformers import AdamW, get_linear_schedule_with_warmup from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -23,6 +23,8 @@ from transformers.tokenization_roberta import RobertaTokenizer from transformers.tokenization_xlnet import XLNetTokenizer +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device + TOKENIZER_CLASS = {} TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}) TOKENIZER_CLASS.update({k: RobertaTokenizer for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}) @@ -101,9 +103,9 @@ def get_default_scheduler(optimizer, warmup_steps, num_training_steps): def fine_tune( self, train_dataloader, - device, - num_gpus, get_inputs, + num_gpus=None, + gpu_ids=None, max_steps=-1, max_grad_norm=1.0, gradient_accumulation_steps=1, @@ -118,6 +120,9 @@ def fine_tune( clip_grad_norm=True, ): + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) + if seed is not None: Transformer.set_seed(seed, num_gpus > 0) @@ -128,6 +133,9 @@ def fine_tune( raise ImportError("Please install apex from https://www.github.com/nvidia/apex") self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level) + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) + # init training global_step = 0 tr_loss = 0.0 @@ -152,22 +160,25 @@ def fine_tune( if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - if clip_grad_norm: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) else: loss.backward() - if clip_grad_norm: - torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) tr_loss += loss.item() - accum_loss += loss.item() + if (step + 1) % gradient_accumulation_steps == 0: global_step += 1 + + if clip_grad_norm: + if fp16: + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) + else: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) + if global_step % report_every == 0 and verbose: end = time.time() print( - "loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format( + "loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format( accum_loss / report_every, end - start, len(batch), global_step, max_steps, ) ) @@ -185,9 +196,16 @@ def fine_tune( return global_step, tr_loss / global_step - def predict(self, eval_dataloader, device, get_inputs, verbose=True): + def predict(self, eval_dataloader, get_inputs, num_gpus, 
gpu_ids, verbose=True): + # get device + device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) + + # move model + self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) + + # predict self.model.eval() - for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose): + for batch in tqdm(eval_dataloader, desc="Scoring", disable=not verbose): with torch.no_grad(): inputs = get_inputs(batch, device, self.model_name, train_mode=False) outputs = self.model(**inputs) diff --git a/utils_nlp/models/transformers/question_answering.py b/utils_nlp/models/transformers/question_answering.py index 99cd59724..c0415a579 100755 --- a/utils_nlp/models/transformers/question_answering.py +++ b/utils_nlp/models/transformers/question_answering.py @@ -184,7 +184,7 @@ def preprocess( answer texts from predicted answer start and answer end indices. Defaults to "./cached_qa_features". Returns: - DataSet: A Pytorch DataSet. + DataSet: A Pytorch DataSet. """ if not os.path.exists(feature_cache_dir): @@ -509,12 +509,6 @@ def fit( """ - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) @@ -534,9 +528,9 @@ def fit( # fine tune super().fine_tune( train_dataloader=train_dataloader, - device=device, - num_gpus=num_gpus, get_inputs=QAProcessor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, @@ -555,7 +549,7 @@ def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): Predicts answer start and end logits. Args: - test_dataloader (QADataset): Dataloader for the testing data. + test_dataloader (DataLoader): DataLoader for scoring the data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. diff --git a/utils_nlp/models/transformers/sequence_classification.py b/utils_nlp/models/transformers/sequence_classification.py index 4d26e39f6..e8a4a288b 100755 --- a/utils_nlp/models/transformers/sequence_classification.py +++ b/utils_nlp/models/transformers/sequence_classification.py @@ -11,7 +11,7 @@ from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification -from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device +from utils_nlp.common.pytorch_utils import compute_training_steps from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet @@ -249,12 +249,6 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. 
""" - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) @@ -274,9 +268,9 @@ def fit( # fine tune super().fine_tune( train_dataloader=train_dataloader, - device=device, - num_gpus=num_gpus, get_inputs=Processor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, @@ -286,12 +280,12 @@ def fit( seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): + def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataloader (Dataloader): Dataloader for the evaluation data. + test_dataloader (DataLoader): DataLoader for scoring the data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. @@ -304,14 +298,13 @@ def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): 1darray: numpy array of predicted label indices. """ - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) - preds = list( super().predict( - eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, + eval_dataloader=test_dataloader, + get_inputs=Processor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, + verbose=verbose, ) ) preds = np.concatenate(preds) From 82816318dca76ac3a459f27a45b4a89df5ac1010 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Thu, 16 Jan 2020 19:14:24 +0000 Subject: [PATCH 12/24] update summarization files --- ...test_notebooks_extractive_summarization.py | 12 +- tests/unit/test_extractive_summarization.py | 45 +++---- utils_nlp/eval/evaluate_summarization.py | 12 +- .../transformers/extractive_summarization.py | 114 ++++++++---------- 4 files changed, 78 insertions(+), 105 deletions(-) diff --git a/tests/integration/test_notebooks_extractive_summarization.py b/tests/integration/test_notebooks_extractive_summarization.py index a39ab0c1d..fdb9cfebf 100644 --- a/tests/integration/test_notebooks_extractive_summarization.py +++ b/tests/integration/test_notebooks_extractive_summarization.py @@ -1,14 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import os -import json -import shutil -import pytest import papermill as pm +import pytest import scrapbook as sb -from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME - +from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK ABS_TOL = 0.02 @@ -31,7 +27,7 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp): CACHE_DIR=tmp, BATCH_SIZE=3000, REPORT_EVERY=50, - MAX_STEPS=1e3, + MAX_STEPS=1000, WARMUP_STEPS=5e2, MODEL_NAME="distilbert-base-uncased", ), @@ -39,5 +35,3 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp): result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict print(result) assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL) - - diff --git a/tests/unit/test_extractive_summarization.py b/tests/unit/test_extractive_summarization.py index 40cacbeca..797e631e5 100644 --- a/tests/unit/test_extractive_summarization.py +++ b/tests/unit/test_extractive_summarization.py @@ -1,14 +1,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import nltk +import os +import nltk nltk.download("punkt") -from nltk import tokenize import pytest -import os -import shutil - +from nltk import tokenize from utils_nlp.models.transformers.datasets import SummarizationDataset from utils_nlp.models.transformers.extractive_summarization import ( @@ -17,6 +15,9 @@ ExtSumProcessor, ) + + + # @pytest.fixture() def source_data(): return ( @@ -48,18 +49,10 @@ def data_to_file(tmp_module): f.write(target) f.close() train_dataset = SummarizationDataset( - source_file, - target_file, - [tokenize.sent_tokenize], - [tokenize.sent_tokenize], - nltk.word_tokenize, + source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize, ) test_dataset = SummarizationDataset( - source_file, - target_file, - [tokenize.sent_tokenize], - [tokenize.sent_tokenize], - nltk.word_tokenize, + source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize, ) processor = ExtSumProcessor( @@ -70,20 +63,12 @@ def data_to_file(tmp_module): min_nsents=0, min_src_ntokens=1, ) - ext_sum_train = processor.preprocess( - train_dataset, train_dataset.get_target(), oracle_mode="greedy" - ) - ext_sum_test = processor.preprocess( - test_dataset, test_dataset.get_target(), oracle_mode="greedy" - ) + ext_sum_train = processor.preprocess(train_dataset, train_dataset.get_target(), oracle_mode="greedy") + ext_sum_test = processor.preprocess(test_dataset, test_dataset.get_target(), oracle_mode="greedy") save_path = os.path.join(tmp_module, "processed") - train_files = ExtSumProcessedData.save_data( - ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000 - ) - test_files = ExtSumProcessedData.save_data( - ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000 - ) + train_files = ExtSumProcessedData.save_data(ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000) + test_files = ExtSumProcessedData.save_data(ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000) print(train_files) print(test_files) assert os.path.exists(train_files[0]) @@ -96,10 +81,10 @@ def test_bert_training(data_to_file, tmp_module): CACHE_DIR = tmp_module ENCODER = "transformer" - BATCH_SIZE = 200 + BATCH_SIZE = 128 LEARNING_RATE = 2e-3 - REPORT_EVERY = 100 - MAX_STEPS = 5e2 + REPORT_EVERY = 50 + MAX_STEPS = 2e2 WARMUP_STEPS = 1e2 DATA_SAVED_PATH = data_to_file result_base_path = "./results" diff --git a/utils_nlp/eval/evaluate_summarization.py 
b/utils_nlp/eval/evaluate_summarization.py index 4912717e9..421d48f13 100644 --- a/utils_nlp/eval/evaluate_summarization.py +++ b/utils_nlp/eval/evaluate_summarization.py @@ -3,22 +3,24 @@ import os from random import random, seed + from bertsum.others.utils import test_rouge -def get_rouge(predictions, targets, temp_dir): +def get_rouge(predictions, targets, temp_dir, random_seed=42): """ function to get the rouge metric for the prediction and the reference. Args: predictions (list of strings): Predictions to be compared. target (list of strings): References - temp_dir (str): Path where temporary folders are created to host the files - generated by ROUGE applicatoin. + temp_dir (str): Path where temporary folders are created to host the files + generated by ROUGE application. + seed (int, optional): Random seed. Defaults to 42. Return: dictionary: rouge metric - + """ def _write_list_to_file(list_items, filename): @@ -27,7 +29,7 @@ def _write_list_to_file(list_items, filename): for item in list_items: filehandle.write("%s\n" % item) - seed(42) + seed(random_seed) random_number = random() os.makedirs(temp_dir, exist_ok=True) candidate_path = os.path.join(temp_dir, "candidate" + str(random_number)) diff --git a/utils_nlp/models/transformers/extractive_summarization.py b/utils_nlp/models/transformers/extractive_summarization.py index 426f9002c..54a7f64ef 100644 --- a/utils_nlp/models/transformers/extractive_summarization.py +++ b/utils_nlp/models/transformers/extractive_summarization.py @@ -5,24 +5,22 @@ import itertools import logging -import numpy as np import os import random + +import numpy as np import torch -import torch.nn as nn -from torch.utils.data import Dataset, IterableDataset -from torch.utils.data import DataLoader, SequentialSampler +from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSampler # from torch.utils.data.distributed import DistributedSampler -from transformers import DistilBertModel, BertModel +from transformers import BertModel, DistilBertModel -from bertsum.models import model_builder, data_loader +from bertsum.models import data_loader, model_builder from bertsum.models.data_loader import Batch from bertsum.models.model_builder import Summarizer - -from utils_nlp.common.pytorch_utils import get_device -from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer +from utils_nlp.common.pytorch_utils import compute_training_steps, get_device from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection +from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": DistilBertModel} @@ -42,8 +40,8 @@ def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000): Args: data_iter (generator): data generator. - shuffle (bool): whether the data is shuffled - is_labeled (bool): it specifies whether the data objects are labeled data. + shuffle (bool): whether the data is shuffled. + is_labeled (bool): specifies whether the data objects are labeled data. batch_size (int): number of tokens per batch. 
Returns: @@ -79,9 +77,7 @@ def get_stream(self): if self.is_shuffle: return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list))) else: - return itertools.chain.from_iterable( - map(get_dataset, itertools.cycle(random.shuffle(self.file_list))) - ) + return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))) def __iter__(self): return self.get_stream() @@ -114,9 +110,7 @@ def __getitem__(self, idx): return self.data[idx] -def get_pred( - example, sent_scores, cal_lead=False, sentence_separator="", block_trigram=True, top_n=3 -): +def get_pred(example, sent_scores, cal_lead=False, sentence_separator="", block_trigram=True, top_n=3): """ Get the summarization prediction for the paragraph example based on the scores returned by the transformer summarization model. @@ -229,9 +223,7 @@ def _chunks(iterable, chunk_size): def _get_files(self, root): train_files = [] test_files = [] - files = [ - os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f)) - ] + files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))] for fname in files: if fname.find("train") != -1: train_files.append(fname) @@ -324,7 +316,7 @@ def model_name(self, value): self._model_name = value @staticmethod - def get_inputs(batch, model_name, train_mode=True): + def get_inputs(batch, device, model_name, train_mode=True): """ Creates an input dictionary given a model name. @@ -332,6 +324,7 @@ def get_inputs(batch, model_name, train_mode=True): batch (object): A Batch containing input ids, segment ids, sentence class ids, masks for the input ids, masks for sentence class ids and source text. If train_model is True, it also contains the labels and target text. + device (torch.device): A PyTorch device. model_name (bool, optional): Model name used to format the inputs. train_mode (bool, optional): Training mode flag. Defaults to True. @@ -345,6 +338,7 @@ def get_inputs(batch, model_name, train_mode=True): if model_name.split("-")[0] in ["bert", "distilbert"]: if train_mode: # labels must be the last + batch = batch.to(device) return { "x": batch.src, "segs": batch.segs, @@ -354,6 +348,14 @@ def get_inputs(batch, model_name, train_mode=True): "labels": batch.labels, } else: + batch["src"] = batch["src"].to(device) + batch["segs"] = batch["segs"].to(device) + batch["clss"] = batch["clss"].to(device) + batch["mask"] = batch["mask"].to(device) + batch["mask_cls"] = batch["mask_cls"].to(device) + if "labels" in batch: + batch["labels"] = batch["labels"].to(device) + batch = Bunch(batch) return { "x": batch.src, "segs": batch.segs, @@ -489,9 +491,7 @@ def __init__(self, model_name="distilbert-base-uncased", encoder="transformer", cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". """ - super().__init__( - model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir - ) + super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir) if model_name not in self.list_supported_models(): raise ValueError( "Model name {} is not supported by ExtractiveSummarizer. " @@ -522,6 +522,7 @@ def fit( self, train_dataset, num_gpus=None, + gpu_ids=None, batch_size=3000, local_rank=-1, max_steps=5e5, @@ -546,6 +547,9 @@ def fit( num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. 
+ gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. batch_size (int, optional): Maximum number of tokens in each batch. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed training. @@ -571,16 +575,7 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. """ - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - def move_batch_to_device(batch, device): - return batch.to(device) - - # if isinstance(self.model, nn.DataParallel): - # self.model.module.to(device) - # else: - self.model.to(device) - + # init optimizer optimizer = model_builder.build_optim( optimization_method, learning_rate, @@ -594,31 +589,34 @@ def move_batch_to_device(batch, device): ) # batch_size is the number of tokens in a batch - train_dataloader = get_dataloader( - train_dataset.get_stream(), is_labeled=True, batch_size=batch_size + train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size) + + # compute the max number of training steps + max_steps = compute_training_steps( + train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, ) super().fine_tune( train_dataloader=train_dataloader, get_inputs=ExtSumProcessor.get_inputs, - move_batch_to_device=move_batch_to_device, - n_gpu=num_gpus, - num_train_epochs=-1, + num_gpus=num_gpus, + gpu_ids=gpu_ids, max_steps=max_steps, - optimizer=optimizer, - warmup_steps=warmup_steps, + max_grad_norm=max_grad_norm, gradient_accumulation_steps=gradient_accumulation_steps, + optimizer=optimizer, + scheduler=None, verbose=verbose, seed=seed, report_every=report_every, clip_grad_norm=False, - max_grad_norm=max_grad_norm, ) def predict( self, test_dataset, num_gpus=1, + gpu_ids=None, batch_size=16, sentence_separator="", top_n=3, @@ -632,6 +630,9 @@ def predict( Args: test_dataset (Dataset): Dataset for which the summary to be predicted num_gpus (int, optional): The number of GPUs used in prediction. Defaults to 1. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. batch_size (int, optional): The number of test examples in each batch. Defaults to 16. sentence_separator (str, optional): String to be inserted between sentences in the prediction. Defaults to ''. @@ -678,10 +679,8 @@ def collate_fn(dict_list): } test_sampler = SequentialSampler(test_dataset) - test_dataloader = DataLoader( - test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn - ) - sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus) + test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn) + sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids) sent_scores_list = list(sent_scores) scores_list = [] for i in sent_scores_list: @@ -699,15 +698,18 @@ def collate_fn(dict_list): prediction.extend(temp_pred) return prediction - def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True): + def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataloader (Dataloader): Dataloader for the evaluation data. + test_dataloader (Dataloader): Dataloader for scoring the data. num_gpus (int, optional): The number of GPUs to use. 
If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. + gpu_ids (list): List of GPU IDs to be used. + If set to None, the first num_gpus GPUs will be used. + Defaults to None. verbose (bool, optional): Whether to print out the training log. Defaults to True. Returns @@ -716,23 +718,13 @@ def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True): device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - def move_batch_to_device(batch, device): - batch["src"] = batch["src"].to(device) - batch["segs"] = batch["segs"].to(device) - batch["clss"] = batch["clss"].to(device) - batch["mask"] = batch["mask"].to(device) - batch["mask_cls"] = batch["mask_cls"].to(device) - if "labels" in batch: - batch["labels"] = batch["labels"].to(device) - return Bunch(batch) - preds = list( super().predict( - eval_dataloader=eval_dataloader, + eval_dataloader=test_dataloader, get_inputs=ExtSumProcessor.get_inputs, - n_gpu=num_gpus, + num_gpus=num_gpus, + gpu_ids=gpu_ids, verbose=verbose, - move_batch_to_device=move_batch_to_device, ) ) return preds From b76750aaa691c0f391c0c384eaff3e851bd533ce Mon Sep 17 00:00:00 2001 From: saidbleik Date: Fri, 17 Jan 2020 02:16:10 +0000 Subject: [PATCH 13/24] NER updates --- .../ner_wikigold_transformer.ipynb | 2 +- tests/unit/test_bert_token_classification.py | 79 ------------------- .../test_transformers_token_classification.py | 31 ++++++++ utils_nlp/dataset/wikigold.py | 14 ++-- utils_nlp/models/transformers/common.py | 3 + .../transformers/named_entity_recognition.py | 47 +++++------ 6 files changed, 60 insertions(+), 116 deletions(-) delete mode 100644 tests/unit/test_bert_token_classification.py create mode 100644 tests/unit/test_transformers_token_classification.py diff --git a/examples/named_entity_recognition/ner_wikigold_transformer.ipynb b/examples/named_entity_recognition/ner_wikigold_transformer.ipynb index 8bbc82a7c..f077f8d62 100644 --- a/examples/named_entity_recognition/ner_wikigold_transformer.ipynb +++ b/examples/named_entity_recognition/ner_wikigold_transformer.ipynb @@ -233,7 +233,7 @@ "source": [ "with Timer() as t:\n", " preds = model.predict(\n", - " eval_dataloader=test_dataloader,\n", + " test_dataloader=test_dataloader,\n", " num_gpus=None,\n", " verbose=True\n", " )\n", diff --git a/tests/unit/test_bert_token_classification.py b/tests/unit/test_bert_token_classification.py deleted file mode 100644 index c3a46584f..000000000 --- a/tests/unit/test_bert_token_classification.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
- -import pytest - -from utils_nlp.models.bert.token_classification import ( - BERTTokenClassifier, - postprocess_token_labels, -) - - -def test_token_classifier_num_labels(): - with pytest.raises(ValueError): - BERTTokenClassifier(num_labels=1) - - -def test_token_classifier_fit_predict(tmp_path, ner_test_data): - token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=tmp_path) - - # test fit, no warmup - token_classifier.fit( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - ) - - # test fit, with warmup - token_classifier.fit( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - warmup_proportion=0.1, - ) - # test predict, no labels - token_classifier.predict( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - ) - - # test predict, with labels - token_classifier.predict( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - ) - - # test output probabilities - predictions = token_classifier.predict( - token_ids=ner_test_data["INPUT_TOKEN_IDS"], - input_mask=ner_test_data["INPUT_MASK"], - labels=ner_test_data["INPUT_LABEL_IDS"], - probabilities=True, - ) - assert len(predictions.classes) == predictions.probabilities.shape[0] - - -def test_postprocess_token_labels(ner_test_data): - labels_no_padding = postprocess_token_labels( - labels=ner_test_data["PREDICTED_LABELS"], - input_mask=ner_test_data["INPUT_MASK"], - label_map=ner_test_data["LABEL_MAP"], - ) - - assert labels_no_padding == ner_test_data["EXPECTED_TOKENS_NO_PADDING"] - - -def test_postprocess_token_labels_remove_trailing(ner_test_data): - labels_no_padding_no_trailing = postprocess_token_labels( - labels=ner_test_data["PREDICTED_LABELS"], - input_mask=ner_test_data["INPUT_MASK"], - label_map=ner_test_data["LABEL_MAP"], - remove_trailing_word_pieces=True, - trailing_token_mask=ner_test_data["TRAILING_TOKEN_MASK"], - ) - - assert ( - labels_no_padding_no_trailing - == ner_test_data["EXPECTED_TOKENS_NO_PADDING_NO_TRAILING"] - ) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py new file mode 100644 index 000000000..eda90c6d4 --- /dev/null +++ b/tests/unit/test_transformers_token_classification.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import pytest + +from utils_nlp.common.pytorch_utils import dataloader_from_dataset +from utils_nlp.models.transformers.named_entity_recognition import ( + TokenClassificationProcessor, + TokenClassifier, +) + + +def test_token_classifier_num_labels(): + with pytest.raises(ValueError): + TokenClassifier(num_labels=1) + + +def test_token_classifier_fit_predict(tmp_path, ner_test_data): + token_classifier = TokenClassifier(num_labels=6, cache_dir=tmp_path) + processor = TokenClassificationProcessor(cache_dir=tmp_path) + + # test fit, no warmup + train_dataset = processor.preprocess_for_bert( + text=ner_test_data["INPUT_TEXT"], labels=ner_test_data["INPUT_LABELS"], label_map=ner_test_data["LABEL_MAP"], + ) + train_dataloader = dataloader_from_dataset(train_dataset) + token_classifier.fit(train_dataloader) + + # test predict, no labels + preds = token_classifier.predict(train_dataloader, verbose=False) + assert len(preds) == len(ner_test_data["INPUT_LABELS"]) diff --git a/utils_nlp/dataset/wikigold.py b/utils_nlp/dataset/wikigold.py index 508d5dc56..32a0c5420 100644 --- a/utils_nlp/dataset/wikigold.py +++ b/utils_nlp/dataset/wikigold.py @@ -117,7 +117,7 @@ def load_dataset( cache_dir (str, optional): The default folder for saving cache files. Defaults to './temp'. max_len (int, optional): Maximum length of the list of tokens. Lists longer - than this are truncated and shorter ones are padded with "O"s. + than this are truncated and shorter ones are padded with "O"s. Default value is BERT_MAX_LEN=512. trailing_piece_tag (str, optional): Tag used to label trailing word pieces. For example, "criticize" is broken into "critic" and "##ize", "critic" @@ -130,16 +130,12 @@ def load_dataset( Returns: tuple. The tuple contains four elements. - train_dataload (DataLoader): a PyTorch DataLoader instance for training. - - test_dataload (DataLoader): a PyTorch DataLoader instance for testing. - - label_map (dict): A dictionary object to map a label (str) to an ID (int). - + train_dataloader (DataLoader): a PyTorch DataLoader instance for training. + test_dataloader (DataLoader): a PyTorch DataLoader instance for testing. + label_map (dict): A dictionary object to map a label (str) to an ID (int). test_dataset (TensorDataset): A TensorDataset containing the following four tensors. 1. input_ids_all: Tensor. Each sublist contains numerical values, - i.e. token ids, corresponding to the tokens in the input - text data. + i.e. token ids, corresponding to the tokens in the input text data. 2. input_mask_all: Tensor. Each sublist contains the attention mask of the input token id list, 1 for input tokens and 0 for padded tokens, so that padded tokens are not attended to. diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 7fce22c6b..9808719a7 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -41,6 +41,9 @@ def __init__( self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None, ): + if num_labels < 2: + raise ValueError("Number of labels should be at least 2.") + if model_name not in self.list_supported_models(): raise ValueError( "Model name {0} is not supported by {1}. 
" diff --git a/utils_nlp/models/transformers/named_entity_recognition.py b/utils_nlp/models/transformers/named_entity_recognition.py index 169bb21c8..76d0b5e37 100755 --- a/utils_nlp/models/transformers/named_entity_recognition.py +++ b/utils_nlp/models/transformers/named_entity_recognition.py @@ -6,10 +6,11 @@ import numpy as np import torch +from torch.utils.data import TensorDataset from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification -from utils_nlp.common.pytorch_utils import get_device, move_model_to_device +from utils_nlp.common.pytorch_utils import compute_training_steps from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer TC_MODEL_CLASS = {} @@ -129,7 +130,7 @@ def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map= Returns: TensorDataset: A TensorDataset containing the following four tensors. 1. input_ids_all: Tensor. Each sublist contains numerical values, - i.e. token ids, corresponding to the tokens in the input + i.e. token ids, corresponding to the tokens in the input text data. 2. input_mask_all: Tensor. Each sublist contains the attention mask of the input token id list, 1 for input tokens and 0 for @@ -229,14 +230,14 @@ def _is_iterable_but_not_string(obj): td = TensorDataset( torch.tensor(input_ids_all, dtype=torch.long), torch.tensor(input_mask_all, dtype=torch.long), - torch.tensor(trailing_token_mask_all, dtype=torch.bool), + torch.tensor(trailing_token_mask_all, dtype=torch.long), torch.tensor(label_ids_all, dtype=torch.long), ) else: td = TensorDataset( torch.tensor(input_ids_all, dtype=torch.long), torch.tensor(input_mask_all, dtype=torch.long), - torch.tensor(trailing_token_mask_all, dtype=torch.bool), + torch.tensor(trailing_token_mask_all, dtype=torch.long), ) return td @@ -311,12 +312,6 @@ def fit( seed (int, optional): Random seed used to improve reproducibility. Defaults to None. """ - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank) - - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank) - # init optimizer optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon) @@ -336,9 +331,9 @@ def fit( # fine tune super().fine_tune( train_dataloader=train_dataloader, - device=device, + get_inputs=TokenClassificationProcessor.get_inputs, num_gpus=num_gpus, - get_inputs=Processor.get_inputs, + gpu_ids=gpu_ids, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=optimizer, @@ -348,12 +343,12 @@ def fit( seed=seed, ) - def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): + def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True): """ Scores a dataset using a fine-tuned model and a given dataloader. Args: - eval_dataloader (Dataloader): Dataloader for the evaluation data. + test_dataloader (DataLoader): DataLoader for scoring the data. num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will be used. If set to 0 or GPUs are not available, CPU device will be used. Defaults to None. @@ -366,18 +361,16 @@ def predict(self, eval_dataloader, num_gpus=None, gpu_ids=None, verbose=True): 1darray: numpy array of predicted label indices. 
""" - # get device - device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1) - # move model - self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1) - preds = list( super().predict( - eval_dataloader=eval_dataloader, device=device, get_inputs=Processor.get_inputs, verbose=verbose, + eval_dataloader=test_dataloader, + get_inputs=TokenClassificationProcessor.get_inputs, + num_gpus=num_gpus, + gpu_ids=gpu_ids, + verbose=verbose, ) ) - preds = np.concatenate(preds) - return np.argmax(preds, axis=1) + return np.concatenate(preds) def get_predicted_token_labels(self, predictions, label_map, dataset): """ @@ -386,13 +379,13 @@ def get_predicted_token_labels(self, predictions, label_map, dataset): Args: predictions (ndarray): A numpy ndarray produced from the `predict` function call. The shape of the ndarray is [number_of_examples, sequence_length, number_of_labels]. - label_map (dict): A dictionary object to map a label (str) to an ID (int). + label_map (dict): A dictionary object to map a label (str) to an ID (int). dataset (TensorDataset): The TensorDataset for evaluation. dataset (Dataset): The test Dataset instance. Returns: list: A list of lists. The size of the retured list is the number of testing samples. - Each sublist represents the predicted label for each token. + Each sublist represents the predicted label for each token. """ num_samples = len(dataset.tensors[0]) @@ -417,7 +410,7 @@ def get_predicted_token_labels(self, predictions, label_map, dataset): if attention_mask[sid] == 0: break - if not trailing_mask[sid]: + if not bool(trailing_mask[sid]): continue label_id = seq_probs[sid].argmax() @@ -430,13 +423,13 @@ def get_true_test_labels(self, label_map, dataset): Get the true testing label values. Args: - label_map (dict): A dictionary object to map a label (str) to an ID (int). + label_map (dict): A dictionary object to map a label (str) to an ID (int). dataset (TensorDataset): The TensorDataset for evaluation. dataset (Dataset): The test Dataset instance. Returns: list: A list of lists. The size of the retured list is the number of testing samples. - Each sublist represents the predicted label for each token. + Each sublist represents the predicted label for each token. 
""" num_samples = len(dataset.tensors[0]) From 97f6f0814bd6972a2921f2a3368d14e1655b99a4 Mon Sep 17 00:00:00 2001 From: Said Bleik Date: Fri, 17 Jan 2020 00:06:37 -0500 Subject: [PATCH 14/24] Update test_notebooks_extractive_summarization.py --- tests/integration/test_notebooks_extractive_summarization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_notebooks_extractive_summarization.py b/tests/integration/test_notebooks_extractive_summarization.py index fdb9cfebf..4f9e17f79 100644 --- a/tests/integration/test_notebooks_extractive_summarization.py +++ b/tests/integration/test_notebooks_extractive_summarization.py @@ -33,5 +33,4 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp): ), ) result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict - print(result) assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL) From b6424d173e2f253f2b36e8320edd614bdfdae680 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 06:09:20 +0000 Subject: [PATCH 15/24] update pytorch_utils tests --- tests/unit/test_common_pytorch_utils.py | 31 ++++++++++------------- utils_nlp/common/pytorch_utils.py | 33 ++++++++++++++++--------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py index e2fce1e10..4cdb125c6 100644 --- a/tests/unit/test_common_pytorch_utils.py +++ b/tests/unit/test_common_pytorch_utils.py @@ -5,10 +5,10 @@ import pytest import torch import torch.nn as nn -from torch.nn.parallel.data_parallel import DataParallel from torch.nn.modules.container import Sequential +from torch.nn.parallel.data_parallel import DataParallel -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device @pytest.fixture @@ -55,49 +55,47 @@ def test_get_device_local_rank(): def test_move_to_device_cpu(model): # test when device.type="cpu" - model_cpu = move_to_device(model, torch.device("cpu")) + model_cpu = move_model_to_device(model, torch.device("cpu")) assert isinstance(model_cpu, nn.modules.container.Sequential) def test_move_to_device_cpu_parallelized(model): # test when input model is parallelized model_parallelized = nn.DataParallel(model) - model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu")) + model_parallelized_output = move_model_to_device(model_parallelized, torch.device("cpu")) assert isinstance(model_parallelized_output, nn.modules.container.Sequential) def test_move_to_device_exception_not_torch_device(model): # test when device is not torch.device with pytest.raises(ValueError): - move_to_device(model, "abc") + move_model_to_device(model, "abc") def test_move_to_device_exception_wrong_type(model): # test when device.type is not "cuda" or "cpu" with pytest.raises(Exception): - move_to_device(model, torch.device("opengl")) + move_model_to_device(model, torch.device("opengl")) -@pytest.mark.skipif( - torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine" -) +@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine") def test_move_to_device_exception_gpu_model_on_cpu_machine(model): # test when the model is moved to a gpu but it is a cpu machine with pytest.raises(Exception): - move_to_device(model, torch.device("cuda")) + move_model_to_device(model, torch.device("cuda")) @pytest.mark.gpu def test_move_to_device_exception_cuda_zero_gpus(model): # 
test when device.type is cuda, but num_gpus is 0 with pytest.raises(ValueError): - move_to_device(model, torch.device("cuda"), num_gpus=0) + move_model_to_device(model, torch.device("cuda"), num_gpus=0) @pytest.mark.gpu def test_move_to_device_gpu(model): # test when device.type="cuda" - model_cuda = move_to_device(model, torch.device("cuda")) + model_cuda = move_model_to_device(model, torch.device("cuda")) num_cuda_devices = torch.cuda.device_count() if num_cuda_devices > 1: @@ -105,20 +103,17 @@ def test_move_to_device_gpu(model): else: assert isinstance(model_cuda, Sequential) - model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1) + model_cuda_1_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=1) assert isinstance(model_cuda_1_gpu, Sequential) - model_cuda_1_more_gpu = move_to_device( - model, torch.device("cuda"), num_gpus=num_cuda_devices + 1 - ) + model_cuda_1_more_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices + 1) if num_cuda_devices > 1: assert isinstance(model_cuda_1_more_gpu, DataParallel) else: assert isinstance(model_cuda_1_more_gpu, Sequential) - model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices) + model_cuda_same_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices) if num_cuda_devices > 1: assert isinstance(model_cuda_same_gpu, DataParallel) else: assert isinstance(model_cuda_same_gpu, Sequential) - diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 89f98ab2a..432692380 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -3,10 +3,7 @@ """Common PyTorch utilities that facilitate building Pytorch models.""" -import warnings - import torch -import torch.nn as nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler @@ -42,15 +39,20 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= If set to None, all available GPUs will be used. Defaults to None. gpu_ids (list): List of GPU IDs to be used. - If set to None, the first num_gpus GPUs will be used. + If None, the first num_gpus GPUs will be used. + If not None, overrides num_gpus. Defaults to None. local_rank (int): Local GPU ID within a node. Used in distributed environments. + If not -1, num_gpus and gpu_ids are ignored. Defaults to -1. - + Returns: Module, DataParallel, DistributedDataParallel: A PyTorch Module or a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used). 
""" + if not isinstance(device, torch.device): + raise ValueError("device must be of type torch.device.") + # unwrap model if isinstance(model, torch.nn.DataParallel): model = model.module @@ -60,10 +62,18 @@ def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank= self.model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True, ) else: - if num_gpus > 1: + if device.type == "cuda": + if num_gpus is not None: + if num_gpus < 1: + raise ValueError("num_gpus must be at least 1 or None") + num_cuda_devices = torch.cuda.device_count() + if num_cuda_devices < 1: + raise Exception("CUDA devices are not available.") if gpu_ids is None: + num_gpus = num_cuda_devices if num_gpus is None else min(num_gpus, num_cuda_devices) gpu_ids = list(range(num_gpus)) - model = torch.nn.DataParallel(model, device_ids=gpu_ids) + if len(gpu_ids) > 1: + model = torch.nn.DataParallel(model, device_ids=gpu_ids) # move to device return model.to(device) @@ -94,9 +104,10 @@ def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, dis return DataLoader(ds, sampler=sampler, batch_size=batch_size) + def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1): - """Computes the max training steps given a dataloader. - + """Computes the max training steps given a dataloader. + Args: dataloader (Dataloader): A PyTorch DataLoader. num_epochs (int, optional): Number of training epochs. Defaults to 1. @@ -107,7 +118,7 @@ def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accu gradient_accumulation_steps (int, optional): Number of steps to accumulate before performing a backward/update pass. Default to 1. - + Returns: int: The max number of steps to be used in a training loop. """ @@ -120,4 +131,4 @@ def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accu max_steps = dataset_length // gradient_accumulation_steps * num_epochs if max_steps <= 0: raise Exception("Max steps cannot be determined.") - return max_steps \ No newline at end of file + return max_steps From 2b1736086f1fb3e8be4dd8cb35b8601a4890e278 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 06:15:32 +0000 Subject: [PATCH 16/24] update pytorch utils tests --- tests/unit/test_common_pytorch_utils.py | 1 + utils_nlp/common/pytorch_utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_common_pytorch_utils.py b/tests/unit/test_common_pytorch_utils.py index 4cdb125c6..7105283aa 100644 --- a/tests/unit/test_common_pytorch_utils.py +++ b/tests/unit/test_common_pytorch_utils.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +"""PyTorch utils tests.""" import pytest import torch diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 432692380..77918d70c 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-"""Common PyTorch utilities that facilitate building Pytorch models.""" +"""Common PyTorch utilities that facilitate building PyTorch models.""" import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler From 371a8582d3cb893709aee18b5068fa8d671984a6 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 17:56:03 +0000 Subject: [PATCH 17/24] update move_to_device refs --- .../models/bert/sequence_classification.py | 4 +- .../sequence_classification_distributed.py | 6 +- utils_nlp/models/bert/sequence_encoding.py | 78 +++++----------- utils_nlp/models/bert/token_classification.py | 6 +- .../models/xlnet/sequence_classification.py | 89 ++++++------------- 5 files changed, 57 insertions(+), 126 deletions(-) diff --git a/utils_nlp/models/bert/sequence_classification.py b/utils_nlp/models/bert/sequence_classification.py index 03a324604..4748ceec3 100644 --- a/utils_nlp/models/bert/sequence_classification.py +++ b/utils_nlp/models/bert/sequence_classification.py @@ -91,7 +91,7 @@ def fit( device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) input_mask_tensor = torch.tensor(input_mask, dtype=torch.long) @@ -211,7 +211,7 @@ def predict( (classes, probabilities) if probabilities is True. """ device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) # score self.model.eval() diff --git a/utils_nlp/models/bert/sequence_classification_distributed.py b/utils_nlp/models/bert/sequence_classification_distributed.py index ee5061158..d448515ba 100644 --- a/utils_nlp/models/bert/sequence_classification_distributed.py +++ b/utils_nlp/models/bert/sequence_classification_distributed.py @@ -14,7 +14,7 @@ from pytorch_pretrained_bert.optimization import BertAdam from tqdm import tqdm -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.bert.common import Language try: @@ -192,7 +192,7 @@ def fit( device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) if bert_optimizer is None: bert_optimizer = self.create_optimizer( @@ -277,7 +277,7 @@ def predict(self, test_loader, num_gpus=None, probabilities=False): a dictionary with classes, target labels, probabilities) if probabilities is True. """ device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) # score self.model.eval() diff --git a/utils_nlp/models/bert/sequence_encoding.py b/utils_nlp/models/bert/sequence_encoding.py index 088a6310d..520c56a3d 100644 --- a/utils_nlp/models/bert/sequence_encoding.py +++ b/utils_nlp/models/bert/sequence_encoding.py @@ -4,19 +4,17 @@ # This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples # /extract_features.py, with necessary modifications. 
-from pytorch_pretrained_bert.modeling import BertModel - -from utils_nlp.common.pytorch_utils import get_device, move_to_device from enum import Enum + import numpy as np import pandas as pd -import os import torch +from cached_property import cached_property +from pytorch_pretrained_bert.modeling import BertModel +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset - +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.bert.common import Language, Tokenizer -from cached_property import cached_property class PoolingStrategy(str, Enum): @@ -43,27 +41,21 @@ def __init__( pooling_strategy=PoolingStrategy.MEAN, ): """Initialize the encoder's underlying model and tokenizer - + Args: bert_model: BERT model to use for encoding. Defaults to pretrained BertModel. tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer. language: The pretrained model's language. Defaults to Language.ENGLISH. - num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used. + num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used. cache_dir: Location of BERT's cache directory. Defaults to "." to_lower: True to lowercase before tokenization. Defaults to False. max_len: Maximum number of tokens. - layer_index: The layer from which to extract features. + layer_index: The layer from which to extract features. Defaults to the last layer; can also be a list of integers for experimentation. pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding. """ - self.model = ( - bert_model.model.bert - if bert_model - else BertModel.from_pretrained(language, cache_dir=cache_dir) - ) - self.tokenizer = ( - tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir) - ) + self.model = bert_model.model.bert if bert_model else BertModel.from_pretrained(language, cache_dir=cache_dir) + self.tokenizer = tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir) self.num_gpus = num_gpus self.max_len = max_len self.layer_index = layer_index @@ -98,16 +90,17 @@ def pooling_strategy(self, pooling_strategy): def get_hidden_states(self, text, batch_size=32): """Extract the hidden states from the pretrained model - + Args: text: List of documents to extract features from. batch_size: Batch size, defaults to 32. - + Returns: - pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]). + pd.DataFrame with columns: + text_index (int), token (str), layer_index (int), values (list[float]). 
""" device, num_gpus = get_device(self.num_gpus) - self.model = move_to_device(self.model, device, self.num_gpus) + self.model = move_model_to_device(self.model, device, self.num_gpus) self.model.eval() @@ -122,9 +115,7 @@ def get_hidden_states(self, text, batch_size=32): input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device) eval_data = TensorDataset(input_ids, input_mask, input_type_ids) - eval_dataloader = DataLoader( - eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size - ) + eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size) hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []} for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader: @@ -142,9 +133,7 @@ def get_hidden_states(self, text, batch_size=32): hidden_states["text_index"].append(example_index.item()) hidden_states["token"].append(token) hidden_states["layer_index"].append(layer_index) - hidden_states["values"].append( - [round(x.item(), 6) for x in layer_output[i]] - ) + hidden_states["values"].append([round(x.item(), 6) for x in layer_output[i]]) # empty cache del [input_ids_tensor, input_mask_tensor, example_indices_tensor] @@ -158,7 +147,7 @@ def get_hidden_states(self, text, batch_size=32): def pool(self, df): """Pooling to aggregate token-wise embeddings to sentence embeddings - + Args: df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]) @@ -167,31 +156,16 @@ def pool(self, df): """ def max_pool(x): - values = np.array( - [ - np.reshape(np.array(x.values[i]), self.embedding_dim) - for i in range(x.values.shape[0]) - ] - ) + values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])]) m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0) return m.numpy() def mean_pool(x): - values = np.array( - [ - np.reshape(np.array(x.values[i]), self.embedding_dim) - for i in range(x.values.shape[0]) - ] - ) + values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])]) return torch.mean(torch.tensor(values, dtype=torch.float), 0).numpy() def cls_pool(x): - values = np.array( - [ - np.reshape(np.array(x.values[i]), self.embedding_dim) - for i in range(x.values.shape[0]) - ] - ) + values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])]) return values[0] try: @@ -206,15 +180,11 @@ def cls_pool(x): except ValueError as ve: print(ve) - return ( - df.groupby(["text_index", "layer_index"])["values"] - .apply(lambda x: pool_func(x)) - .reset_index() - ) + return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index() def encode(self, text, batch_size=32, as_numpy=False): - """Computes sentence encodings - + """Computes sentence encodings + Args: text: List of documents to encode. batch_size: Batch size, defaults to 32. 
diff --git a/utils_nlp/models/bert/token_classification.py b/utils_nlp/models/bert/token_classification.py index 3965c41c1..816cb0216 100644 --- a/utils_nlp/models/bert/token_classification.py +++ b/utils_nlp/models/bert/token_classification.py @@ -16,7 +16,7 @@ from tqdm import tqdm, trange from utils_nlp.models.bert.common import Language, create_data_loader -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from cached_property import cached_property @@ -144,7 +144,7 @@ def fit( device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) if num_gpus is None: num_gpus_used = torch.cuda.device_count() @@ -228,7 +228,7 @@ def predict( ) device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) self.model.eval() eval_loss = 0 diff --git a/utils_nlp/models/xlnet/sequence_classification.py b/utils_nlp/models/xlnet/sequence_classification.py index 32c239866..a7a086ea3 100644 --- a/utils_nlp/models/xlnet/sequence_classification.py +++ b/utils_nlp/models/xlnet/sequence_classification.py @@ -2,23 +2,20 @@ # Licensed under the MIT License. """Utilities for Xlnet Sequence Classification""" -import numpy as np +import os from collections import namedtuple + +import mlflow +import mlflow.pytorch +import numpy as np import torch import torch.nn as nn -from transformers import ( - XLNetConfig, - XLNetForSequenceClassification, - AdamW, - WarmupLinearSchedule, -) -from tqdm import tqdm from torch.utils.data import DataLoader, RandomSampler, TensorDataset -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from tqdm import tqdm +from transformers import AdamW, WarmupLinearSchedule, XLNetConfig, XLNetForSequenceClassification + +from utils_nlp.common.pytorch_utils import get_device, move_model_to_device from utils_nlp.models.xlnet.common import Language -import mlflow -import mlflow.pytorch -import os class XLNetSequenceClassifier: @@ -79,9 +76,7 @@ def __init__( self.max_grad_norm = max_grad_norm # create classifier - self.config = XLNetConfig.from_pretrained( - self.language.value, num_labels=num_labels, cache_dir=cache_dir - ) + self.config = XLNetConfig.from_pretrained(self.language.value, num_labels=num_labels, cache_dir=cache_dir) self.model = XLNetForSequenceClassification(self.config) def fit( @@ -114,7 +109,7 @@ def fit( """ device, num_gpus = get_device(self.num_gpus) - self.model = move_to_device(self.model, device, self.num_gpus) + self.model = move_model_to_device(self.model, device, self.num_gpus) token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) input_mask_tensor = torch.tensor(input_mask, dtype=torch.long) @@ -128,24 +123,17 @@ def fit( token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long) val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long) - train_dataset = TensorDataset( - token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor - ) + train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor) val_dataset = TensorDataset( - val_token_ids_tensor, - val_input_mask_tensor, - val_token_type_ids_tensor, - val_labels_tensor, + val_token_ids_tensor, val_input_mask_tensor, val_token_type_ids_tensor, val_labels_tensor, ) else: train_dataset = 
TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor) - val_dataset = TensorDataset( - val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor - ) + val_dataset = TensorDataset(val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor) # define optimizer and model parameters param_optimizer = list(self.model.named_parameters()) @@ -155,10 +143,7 @@ def fit( "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": self.weight_decay, }, - { - "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] val_sampler = RandomSampler(val_dataset) @@ -181,9 +166,7 @@ def fit( train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=self.batch_size - ) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size) tr_loss = 0.0 logging_loss = 0.0 @@ -191,18 +174,13 @@ def fit( for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if token_type_ids: - x_batch, mask_batch, token_type_ids_batch, y_batch = tuple( - t.to(device) for t in batch - ) + x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(t.to(device) for t in batch) else: token_type_ids_batch = None x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch) outputs = self.model( - input_ids=x_batch, - token_type_ids=token_type_ids_batch, - attention_mask=mask_batch, - labels=y_batch, + input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=y_batch, ) loss = outputs[0] # model outputs are always tuple in pytorch-transformers @@ -220,9 +198,7 @@ def fit( if logging_steps > 0 and global_step % logging_steps == 0: mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step) mlflow.log_metric( - "training loss", - (tr_loss - logging_loss) / (logging_steps * self.batch_size), - step=global_step, + "training loss", (tr_loss - logging_loss) / (logging_steps * self.batch_size), step=global_step, ) logging_loss = tr_loss # model checkpointing @@ -245,9 +221,7 @@ def fit( ) else: token_type_ids_batch = None - val_x_batch, val_mask_batch, val_y_batch = tuple( - t.to(device) for t in val_batch - ) + val_x_batch, val_mask_batch, val_y_batch = tuple(t.to(device) for t in val_batch) val_outputs = self.model( input_ids=val_x_batch, token_type_ids=val_token_type_ids_batch, @@ -256,9 +230,7 @@ def fit( ) vloss = val_outputs[0] val_loss += vloss.sum().item() - mlflow.log_metric( - "validation loss", val_loss / len(val_dataset), step=global_step - ) + mlflow.log_metric("validation loss", val_loss / len(val_dataset), step=global_step) self.model.train() if verbose: @@ -300,13 +272,7 @@ def fit( torch.cuda.empty_cache() def predict( - self, - token_ids, - input_mask, - token_type_ids=None, - num_gpus=None, - batch_size=8, - probabilities=False, + self, token_ids, input_mask, token_type_ids=None, num_gpus=None, batch_size=8, probabilities=False, ): """Scores the given dataset and returns the predicted classes. 
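# The fit() loop reformatted above reports metrics through mlflow (learning rate, training
# loss, validation loss). A self-contained sketch of that logging pattern, with toy loss
# values and an explicit run context assumed, might look like:

import mlflow

# one metric point per optimization step, as fit() does for "training loss"
with mlflow.start_run():
    for global_step, tr_loss in enumerate([0.9, 0.7, 0.5], start=1):
        mlflow.log_metric("training loss", tr_loss, step=global_step)

# Metrics logged with a step value can then be plotted against training progress
# in the mlflow UI.
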
@@ -330,7 +296,7 @@ def predict( """ device, num_gpus = get_device(num_gpus) - self.model = move_to_device(self.model, device, num_gpus) + self.model = move_model_to_device(self.model, device, num_gpus) self.model.eval() preds = [] @@ -342,16 +308,11 @@ def predict( x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device) mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device) - token_type_ids_batch = torch.tensor( - token_type_ids[start:end], dtype=torch.long, device=device - ) + token_type_ids_batch = torch.tensor(token_type_ids[start:end], dtype=torch.long, device=device) with torch.no_grad(): pred_batch = self.model( - input_ids=x_batch, - token_type_ids=token_type_ids_batch, - attention_mask=mask_batch, - labels=None, + input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None, ) preds.append(pred_batch[0].cpu()) if i % batch_size == 0: From ea11200338bf8d99e1a718aa1e967f239bca0009 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 18:04:22 +0000 Subject: [PATCH 18/24] rem num_label requirement --- utils_nlp/models/transformers/common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py index 9808719a7..7fce22c6b 100755 --- a/utils_nlp/models/transformers/common.py +++ b/utils_nlp/models/transformers/common.py @@ -41,9 +41,6 @@ def __init__( self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None, ): - if num_labels < 2: - raise ValueError("Number of labels should be at least 2.") - if model_name not in self.list_supported_models(): raise ValueError( "Model name {0} is not supported by {1}. " From 4b2ced52baf31f0a3869b34854c09470ea8a838a Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 18:07:39 +0000 Subject: [PATCH 19/24] rem num_labels check --- tests/unit/test_transformers_token_classification.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py index eda90c6d4..b4da4014e 100644 --- a/tests/unit/test_transformers_token_classification.py +++ b/tests/unit/test_transformers_token_classification.py @@ -1,18 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import pytest - from utils_nlp.common.pytorch_utils import dataloader_from_dataset -from utils_nlp.models.transformers.named_entity_recognition import ( - TokenClassificationProcessor, - TokenClassifier, -) - - -def test_token_classifier_num_labels(): - with pytest.raises(ValueError): - TokenClassifier(num_labels=1) +from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier def test_token_classifier_fit_predict(tmp_path, ner_test_data): From 30a9e0342b4d188903396621d27c03bbb070637d Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sat, 18 Jan 2020 19:38:10 +0000 Subject: [PATCH 20/24] add pytest marker to ner test --- tests/unit/test_transformers_token_classification.py | 9 ++++++--- utils_nlp/models/bert/sequence_classification.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py index b4da4014e..a39a93c66 100644 --- a/tests/unit/test_transformers_token_classification.py +++ b/tests/unit/test_transformers_token_classification.py @@ -1,13 +1,16 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import pytest + from utils_nlp.common.pytorch_utils import dataloader_from_dataset from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier -def test_token_classifier_fit_predict(tmp_path, ner_test_data): - token_classifier = TokenClassifier(num_labels=6, cache_dir=tmp_path) - processor = TokenClassificationProcessor(cache_dir=tmp_path) +@pytest.mark.cpu +def test_token_classifier_fit_predict(tmpdir, ner_test_data): + token_classifier = TokenClassifier(num_labels=6, cache_dir=tmpdir) + processor = TokenClassificationProcessor(cache_dir=tmpdir) # test fit, no warmup train_dataset = processor.preprocess_for_bert( diff --git a/utils_nlp/models/bert/sequence_classification.py b/utils_nlp/models/bert/sequence_classification.py index 4748ceec3..8fbe416c8 100644 --- a/utils_nlp/models/bert/sequence_classification.py +++ b/utils_nlp/models/bert/sequence_classification.py @@ -13,7 +13,7 @@ from tqdm import tqdm from utils_nlp.models.bert.common import Language -from utils_nlp.common.pytorch_utils import get_device, move_to_device +from utils_nlp.common.pytorch_utils import get_device from cached_property import cached_property From 489f5381b8fae5ba0f35ab46420ffd360253090a Mon Sep 17 00:00:00 2001 From: saidbleik Date: Sun, 19 Jan 2020 19:24:10 +0000 Subject: [PATCH 21/24] specify model name in NER test --- tests/unit/test_transformers_token_classification.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_transformers_token_classification.py b/tests/unit/test_transformers_token_classification.py index a39a93c66..e8c780f21 100644 --- a/tests/unit/test_transformers_token_classification.py +++ b/tests/unit/test_transformers_token_classification.py @@ -9,8 +9,8 @@ @pytest.mark.cpu def test_token_classifier_fit_predict(tmpdir, ner_test_data): - token_classifier = TokenClassifier(num_labels=6, cache_dir=tmpdir) - processor = TokenClassificationProcessor(cache_dir=tmpdir) + token_classifier = TokenClassifier(model_name="bert-base-uncased", num_labels=6, cache_dir=tmpdir) + processor = TokenClassificationProcessor(model_name="bert-base-uncased", cache_dir=tmpdir) # test fit, no warmup train_dataset = processor.preprocess_for_bert( @@ -20,5 +20,4 @@ def test_token_classifier_fit_predict(tmpdir, 
ner_test_data): token_classifier.fit(train_dataloader) # test predict, no labels - preds = token_classifier.predict(train_dataloader, verbose=False) - assert len(preds) == len(ner_test_data["INPUT_LABELS"]) + _ = token_classifier.predict(train_dataloader, verbose=False) From 9db5a3708c1fe1eb4a185a17b19238d6a14cff3f Mon Sep 17 00:00:00 2001 From: saidbleik Date: Thu, 23 Jan 2020 23:59:31 +0000 Subject: [PATCH 22/24] minor edits --- tests/smoke/test_gpu_utils.py | 1 - utils_nlp/common/pytorch_utils.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/smoke/test_gpu_utils.py b/tests/smoke/test_gpu_utils.py index 11418ad38..32d04a136 100644 --- a/tests/smoke/test_gpu_utils.py +++ b/tests/smoke/test_gpu_utils.py @@ -9,4 +9,3 @@ @pytest.mark.gpu def test_machine_is_gpu_machine(): assert torch.cuda.is_available() is True - diff --git a/utils_nlp/common/pytorch_utils.py b/utils_nlp/common/pytorch_utils.py index 77918d70c..2badb45e5 100644 --- a/utils_nlp/common/pytorch_utils.py +++ b/utils_nlp/common/pytorch_utils.py @@ -83,7 +83,9 @@ def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, dis Args: ds (torch.utils.data.DataSet): A PyTorch dataset. - batch_size (int, optional): Batch size. Defaults to 32. + batch_size (int, optional): Batch size. + If more than 1 gpu is used, this would be the batch size per gpu. + Defaults to 32. num_gpus (int, optional): The number of GPUs to be used. Defaults to None. shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False. distributed (book, optional): If True, a DistributedSampler is used. Defaults to False. From 186ce2710ad0b8cfe54c0cd9c972d0050a32a5a7 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Fri, 24 Jan 2020 05:50:51 +0000 Subject: [PATCH 23/24] minor edits --- .../transformers/extractive_summarization.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/utils_nlp/models/transformers/extractive_summarization.py b/utils_nlp/models/transformers/extractive_summarization.py index 54a7f64ef..f4a567e7e 100644 --- a/utils_nlp/models/transformers/extractive_summarization.py +++ b/utils_nlp/models/transformers/extractive_summarization.py @@ -336,9 +336,9 @@ def get_inputs(batch, device, model_name, train_mode=True): """ if model_name.split("-")[0] in ["bert", "distilbert"]: + batch = batch.to(device) if train_mode: # labels must be the last - batch = batch.to(device) return { "x": batch.src, "segs": batch.segs, @@ -348,13 +348,6 @@ def get_inputs(batch, device, model_name, train_mode=True): "labels": batch.labels, } else: - batch["src"] = batch["src"].to(device) - batch["segs"] = batch["segs"].to(device) - batch["clss"] = batch["clss"].to(device) - batch["mask"] = batch["mask"].to(device) - batch["mask_cls"] = batch["mask_cls"].to(device) - if "labels" in batch: - batch["labels"] = batch["labels"].to(device) batch = Bunch(batch) return { "x": batch.src, @@ -478,7 +471,7 @@ def __init__(self, model_name="distilbert-base-uncased", encoder="transformer", Args: model_name (str, optional): Transformer model name used in preprocessing. check MODEL_CLASS for supported models. Defaults to "distilbert-base-uncased". - encoder (str, optional): Encoder algorithm used by summarization layer. + encoder (str, optional): Encoder algorithm used by summarization layer. There are four options: - baseline: it used a smaller transformer model to replace the bert model and with transformer summarization layer. 
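# The docstring above lists the four summarization-layer options. Assuming the class defined
# in extractive_summarization.py is ExtractiveSummarizer (the class name is not shown in this
# hunk), a typical instantiation with the defaults visible in the constructor signature would
# be roughly:

from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer

# pretrained DistilBERT encoder with the (default) transformer summarization layer
summarizer = ExtractiveSummarizer(
    model_name="distilbert-base-uncased", encoder="transformer", cache_dir="."
)
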
@@ -487,7 +480,7 @@ def __init__(self, model_name="distilbert-base-uncased", encoder="transformer", - transformer: it uses pretrained BERT and fine-tune BERT with transformer summarization layer. - RNN: it uses pretrained BERT and fine-tune BERT with LSTM summarization layer. - Defaults to "transformer". + Defaults to "transformer". cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". """ @@ -550,7 +543,7 @@ def fit( gpu_ids (list): List of GPU IDs to be used. If set to None, the first num_gpus GPUs will be used. Defaults to None. - batch_size (int, optional): Maximum number of tokens in each batch. + batch_size (int, optional): Maximum number of tokens in each batch. local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to -1, which means non-distributed training. max_steps (int, optional): Maximum number of training steps. Defaults to 5e5. From 6b35c4917af632fd54a711b5125919134d6e7879 Mon Sep 17 00:00:00 2001 From: saidbleik Date: Fri, 24 Jan 2020 06:54:12 +0000 Subject: [PATCH 24/24] minor edits --- .../models/transformers/extractive_summarization.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/utils_nlp/models/transformers/extractive_summarization.py b/utils_nlp/models/transformers/extractive_summarization.py index f4a567e7e..1defdad5c 100644 --- a/utils_nlp/models/transformers/extractive_summarization.py +++ b/utils_nlp/models/transformers/extractive_summarization.py @@ -336,8 +336,8 @@ def get_inputs(batch, device, model_name, train_mode=True): """ if model_name.split("-")[0] in ["bert", "distilbert"]: - batch = batch.to(device) if train_mode: + batch = batch.to(device) # labels must be the last return { "x": batch.src, @@ -350,11 +350,11 @@ def get_inputs(batch, device, model_name, train_mode=True): else: batch = Bunch(batch) return { - "x": batch.src, - "segs": batch.segs, - "clss": batch.clss, - "mask": batch.mask, - "mask_cls": batch.mask_cls, + "x": batch.src.to(device), + "segs": batch.segs.to(device), + "clss": batch.clss.to(device), + "mask": batch.mask.to(device), + "mask_cls": batch.mask_cls.to(device), } else: raise ValueError("Model not supported: {}".format(model_name))
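# Taken together, the last two patches settle on moving training batches to the device
# wholesale (batch.to(device)), while prediction batches, which arrive as plain dicts wrapped
# in a Bunch, have each tensor moved individually. A generic, self-contained sketch of that
# per-tensor move follows; the to_device helper is illustrative and not part of the repository.

import torch

def to_device(batch, device):
    """Move every tensor value of a dict-style batch to the target device."""
    return {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch = {"src": torch.zeros(2, 8, dtype=torch.long), "mask": torch.ones(2, 8)}
batch = to_device(batch, device)
assert batch["src"].device.type == device.type

# Note that in the prediction branch above only the consumed fields (src, segs, clss, mask,
# mask_cls) are moved, so labels, if present, stay on the CPU.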