Implement domain-adaptive pretraining #183

Merged
80 changes: 80 additions & 0 deletions DAPT.md
@@ -0,0 +1,80 @@
# Domain-adaptive pretraining

[Don’t Stop Pretraining: Adapt Language Models to Domains and Tasks](https://aclanthology.org/2020.acl-main.740) (Gururangan et al., ACL 2020)

## Dataset format

`DaptDataset` expects largely the same dataset format as that used by
`ClinicalNlpDataset`. The main restriction is that the dataset must have
a single `text` column; datasets with `text_a` and `text_b` columns
are not accepted.
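
For illustration, here is a minimal sketch of the expected shape of the
data, built with the Hugging Face `datasets` library that `DaptDataset`
uses internally; the example sentences are hypothetical.

```
from datasets import Dataset

# Accepted: a single `text` column (hypothetical example sentences).
corpus = Dataset.from_dict(
    {
        "text": [
            "Patient presents with chest pain radiating to the left arm.",
            "No acute distress; vitals within normal limits.",
        ]
    }
)

# Not accepted: sentence-pair data split across `text_a` and `text_b` columns.
```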


## Usage

Use `cnlpt.dapt` for domain-adaptive pretraining on an existing encoder.

```
$ python -m cnlpt.dapt --help
usage: dapt.py [-h] [--encoder_name ENCODER_NAME]
[--config_name CONFIG_NAME]
[--tokenizer_name TOKENIZER_NAME]
[--output_dir OUTPUT_DIR]
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
[--data_dir DATA_DIR] [--cache_dir CACHE_DIR]
[--chunk_size CHUNK_SIZE]
[--mlm_probability MLM_PROBABILITY]
[--test_size TEST_SIZE] [--seed SEED]
[--no_eval [NO_EVAL]]

optional arguments:
-h, --help show this help message and exit
--encoder_name ENCODER_NAME
Path to pretrained model or model
identifier from huggingface.co/models
(default: roberta-base)
--config_name CONFIG_NAME
Pretrained config name or path if not the
same as model_name (default: None)
--tokenizer_name TOKENIZER_NAME
Pretrained tokenizer name or path if not
the same as model_name (default: None)
--output_dir OUTPUT_DIR
Directory path to write trained model to.
(default: None)
--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
Overwrite the content of the output
directory. Use this to continue training if
output_dir points to a checkpoint
directory. (default: False)
--data_dir DATA_DIR The data dir for domain-adaptive
pretraining. (default: None)
--cache_dir CACHE_DIR
Where do you want to store the pretrained
models downloaded from s3 (default: None)
--chunk_size CHUNK_SIZE
The chunk size for domain-adaptive
pretraining. (default: 128)
--mlm_probability MLM_PROBABILITY
The token masking probability for domain-
adaptive pretraining. (default: 0.15)
--test_size TEST_SIZE
The test split proportion for domain-
adaptive pretraining. (default: 0.2)
--seed SEED The random seed to use for a train/test
split for domain-adaptive pretraining
(requires --dapt-encoder). (default: 42)
--no_eval [NO_EVAL] Don't split into train and test; just
pretrain. (default: False)

```

This will save the adapted encoder to disk at `--output_dir`, from where
it can then be passed to `train_system` as `--encoder_name`.
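
The same arguments can also be supplied programmatically through the
`json_obj` hook on `cnlpt.dapt.main`; this is a minimal sketch, and the
directory paths are placeholders:

```
from cnlpt.dapt import main

# Equivalent to passing these values as command-line flags to
# `python -m cnlpt.dapt`; the paths below are placeholders.
main(
    json_obj={
        "encoder_name": "roberta-base",
        "data_dir": "/path/to/unlabeled/data",
        "output_dir": "/path/to/adapted/encoder",
        "chunk_size": 128,
        "mlm_probability": 0.15,
    }
)
```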

The common workflow is to run `cnlpt.dapt` on a portion of your
unlabeled in-domain data (the task dataset), then run `train_system`
with a labeled in-domain dataset. To evaluate the effectiveness of this
workflow, you can treat a labeled dataset as artificially unlabeled for
pretraining and then evaluate the fine-tuned classifier produced by
`train_system` on the labeled portion of your task dataset, as sketched
below.
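
A rough sketch of setting up such an evaluation, assuming a labeled
dataset loadable with the Hugging Face `datasets` library; the file
names, column names, and split proportion are illustrative, and the
on-disk layout expected by `train_system` may differ:

```
import os
from datasets import load_dataset

# Carve a labeled dataset into an "artificially unlabeled" portion for
# domain-adaptive pretraining and a labeled portion for fine-tuning.
labeled = load_dataset("csv", data_files="task_data.csv")["train"]
splits = labeled.train_test_split(test_size=0.5, seed=42)

os.makedirs("dapt_data", exist_ok=True)
os.makedirs("labeled_data", exist_ok=True)

# Pretraining portion: keep only the text, dropping the (hypothetical) label column.
splits["train"].remove_columns(["label"]).to_csv("dapt_data/train.csv")

# Fine-tuning portion: keep the labels for train_system to fine-tune and evaluate on.
splits["test"].to_csv("labeled_data/train.csv")
```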
67 changes: 67 additions & 0 deletions src/cnlpt/cnlp_args.py
@@ -244,3 +244,70 @@ def to_dict(self):
if k.endswith("_token"):
d[k] = f"<{k.upper()}>"
return d


@dataclass
class DaptArguments:
encoder_name: Optional[str] = field(
default="roberta-base",
metadata={
"help": "Path to pretrained model or model identifier from huggingface.co/models"
},
)
config_name: Optional[str] = field(
default=None,
metadata={
"help": "Pretrained config name or path if not the same as model_name"
},
)
tokenizer_name: Optional[str] = field(
default=None,
metadata={
"help": "Pretrained tokenizer name or path if not the same as model_name"
},
)
output_dir: Optional[str] = field(
default=None, metadata={"help": "Directory path to write trained model to."}
)
overwrite_output_dir: bool = field(
default=False,
metadata={
"help": (
"Overwrite the content of the output directory. "
"Use this to continue training if output_dir points to a checkpoint directory."
)
},
)
data_dir: Optional[str] = field(
default=None, metadata={"help": "The data dir for domain-adaptive pretraining."}
)
cache_dir: Optional[str] = field(
default=None,
metadata={
"help": "Where do you want to store the pretrained models downloaded from s3"
},
)
chunk_size: int = field(
default=128,
metadata={"help": "The chunk size for domain-adaptive pretraining."},
)
mlm_probability: float = field(
default=0.15,
metadata={
"help": "The token masking probability for domain-adaptive pretraining."
},
)
test_size: float = field(
default=0.2,
metadata={"help": "The test split proportion for domain-adaptive pretraining."},
)
seed: int = field(
default=42,
metadata={
"help": "The random seed to use for a train/test split for domain-adaptive pretraining (requires --dapt-encoder)."
},
)
no_eval: bool = field(
default=False,
metadata={"help": "Don't split into train and test; just pretrain."},
)
82 changes: 81 additions & 1 deletion src/cnlpt/cnlp_data.py
@@ -1,6 +1,7 @@
import os
from os.path import basename, dirname
import time
import functools
import logging
import json

@@ -11,12 +12,14 @@
import torch
from torch.utils.data.dataset import Dataset
from transformers import BatchEncoding, InputExample
from transformers import DataCollatorForLanguageModeling
from transformers.tokenization_utils import PreTrainedTokenizer
from datasets import Features
from datasets import Features, DatasetDict, IterableDatasetDict
from dataclasses import dataclass, field, asdict, astuple
import datasets
from enum import Enum

from .cnlp_args import DaptArguments
from .cnlp_processors import classification, tagging, relex, mtl, AutoProcessor

special_tokens = ["<e>", "</e>", "<a1>", "</a1>", "<a2>", "</a2>", "<cr>", "<neg>"]
@@ -927,3 +930,80 @@ def get_labels(self) -> Dict[str, List[str]]:
:return: the dictionary of label lists indexed by task name
"""
return self.tasks_to_labels


def group_texts(chunk_size, examples):
# Concatenate all texts
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
# Compute length of concatenated texts
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the last chunk if it's smaller than chunk_size
total_length = (total_length // chunk_size) * chunk_size
# Split by chunks of max_len
result = {
k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
for k, t in concatenated_examples.items()
}
# Create a new labels column
result["labels"] = result["input_ids"].copy()
return result


def tokenize_fn(tokenizer, examples):
result = tokenizer(examples["text"])
if tokenizer.is_fast:
result["word_ids"] = [
result.word_ids(i) for i in range(len(result["input_ids"]))
]
return result


class DaptDataset(Dataset):
def __getitem__(self, index):
return self.train[index]

def __init__(
self,
args: DaptArguments,
tokenizer: PreTrainedTokenizer,
):
self.args = args
self.tokenizer = tokenizer

processor = AutoProcessor(self.args.data_dir, tasks=None)

# This can probably be refined
dataset: DatasetDict = processor.dataset
remove_columns = {"text", "id", *processor.get_labels()}.intersection(
set(dataset.column_names["train"])
)

dataset = dataset.map(
functools.partial(tokenize_fn, self.tokenizer),
batched=True,
remove_columns=list(remove_columns),
)
dataset = dataset.map(
functools.partial(group_texts, self.args.chunk_size),
batched=True,
)

if isinstance(dataset, (DatasetDict, IterableDatasetDict)) or args.no_eval:
self.dataset = dataset
else:
self.dataset = dataset.train_test_split(
test_size=args.test_size,
seed=args.seed,
)

self.data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm_probability=self.args.mlm_probability
)

@property
def train(self):
return self.dataset["train"]

@property
def test(self):
return self.dataset["test"]
120 changes: 120 additions & 0 deletions src/cnlpt/dapt.py
@@ -0,0 +1,120 @@
"""
Domain-adaptive pretraining (see DAPT.md for details)
"""

import logging
import os
import sys
from typing import Optional, Dict, Any

from transformers import (
Trainer,
AutoTokenizer,
AutoModelForMaskedLM,
set_seed,
HfArgumentParser,
TrainingArguments,
)

from .cnlp_data import DaptDataset
from .cnlp_args import DaptArguments

logger = logging.getLogger(__name__)


def main(json_file: Optional[str] = None, json_obj: Optional[Dict[str, Any]] = None):
"""
Domain-adaptive pretraining.

See :class:`cnlpt.cnlp_args.DaptArguments` for command-line arguments.

:param typing.Optional[str] json_file: if passed, a path to a JSON file
to use as the model, data, and training arguments instead of
retrieving them from the CLI (mutually exclusive with ``json_obj``)
:param typing.Optional[dict] json_obj: if passed, a JSON dictionary
to use as the model, data, and training arguments instead of
retrieving them from the CLI (mutually exclusive with ``json_file``)
:return: None; the adapted encoder is saved to ``--output_dir``
"""
parser = HfArgumentParser((DaptArguments,))
dapt_args: DaptArguments

if json_file is not None and json_obj is not None:
raise ValueError("cannot specify json_file and json_obj")

if json_file is not None:
(dapt_args,) = parser.parse_json_file(json_file=json_file)
elif json_obj is not None:
(dapt_args,) = parser.parse_dict(json_obj)
elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
(dapt_args,) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
(dapt_args,) = parser.parse_args_into_dataclasses()

if (
os.path.exists(dapt_args.output_dir)
and os.listdir(dapt_args.output_dir)
and not dapt_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({dapt_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
)

# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO, # if training_args.local_rank in [-1, 0] else logging.WARN,
)

# logger.warning(
# "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s" %
# (training_args.local_rank,
# training_args.device,
# training_args.n_gpu,
# bool(training_args.local_rank != -1),
# training_args.fp16)
# )
# logger.info("Training/evaluation parameters %s" % training_args)
# logger.info("Data parameters %s" % data_args)
# logger.info("Model parameters %s" % model_args)

logger.info(f"Domain adaptation parameters {dapt_args}")

# Set seed
set_seed(dapt_args.seed)

# Load tokenizer: Need this first for loading the datasets
tokenizer = AutoTokenizer.from_pretrained(
dapt_args.tokenizer_name
if dapt_args.tokenizer_name
else dapt_args.encoder_name,
cache_dir=dapt_args.cache_dir,
add_prefix_space=True,
# additional_special_tokens=['<e>', '</e>', '<a1>', '</a1>', '<a2>', '</a2>', '<cr>', '<neg>']
)

model = AutoModelForMaskedLM.from_pretrained(dapt_args.encoder_name)

dataset = DaptDataset(dapt_args, tokenizer=tokenizer)

trainer = Trainer(
model=model,
args=TrainingArguments(output_dir=dapt_args.output_dir),
train_dataset=dataset.train,
eval_dataset=dataset.test if not dapt_args.no_eval else None,
data_collator=dataset.data_collator,
tokenizer=tokenizer,
)

trainer.train()

# write model out?
trainer.save_model()


if __name__ == "__main__":
main()