diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index 7fdae4687..26340a43b 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -48,6 +48,7 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import NeuronTrainer as Trainer from optimum.neuron import NeuronTrainingArguments as TrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism """ Fine-tuning a 🤗 Transformers model for image classification""" @@ -55,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") @@ -187,6 +188,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) transformers.utils.logging.set_verbosity(log_level) @@ -274,15 +279,17 @@ def compute_metrics(p): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForImageClassification.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForImageClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + ) + image_processor = AutoImageProcessor.from_pretrained( model_args.image_processor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index 4c4687df0..aa0e346c1 100755 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -131,6 +131,15 @@ class ModelArguments: "choices": ["auto", "bfloat16", "float16", "float32"], }, ) + low_cpu_mem_usage: bool = field( + default=False, + metadata={ + "help": ( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "set True will benefit LLM loading time and RAM consumption." 
+ ) + }, + ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -174,7 +183,7 @@ class DataTrainingArguments: ) }, ) - + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) block_size: Optional[int] = field( default=None, metadata={ @@ -203,6 +212,9 @@ class DataTrainingArguments: ) def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: @@ -238,6 +250,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -286,6 +302,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -294,6 +311,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -301,6 +319,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + streaming=data_args.streaming, ) else: data_files = {} @@ -400,10 +419,13 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, torch_dtype=torch_dtype, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) + else: with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): model = AutoModelForCausalLM.from_config(config) + n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") @@ -416,9 +438,9 @@ def main(): # Preprocessing the datasets. # First we tokenize all the texts. 
if training_args.do_train: - column_names = raw_datasets["train"].column_names + column_names = list(raw_datasets["train"].features) else: - column_names = raw_datasets["validation"].column_names + column_names = list(raw_datasets["validation"].features) text_column_name = "text" if "text" in column_names else column_names[0] # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function @@ -436,21 +458,29 @@ def tokenize_function(examples): return output with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) block_size = 1024 else: @@ -466,10 +496,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. 
result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] @@ -486,13 +515,19 @@ def group_texts(examples): # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map with training_args.main_process_first(desc="grouping texts together"): - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) + if not data_args.streaming: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 7ba3b94bc..083694c0e 100755 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -50,10 +50,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import NeuronTrainer as Trainer from optimum.neuron import NeuronTrainingArguments as TrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -116,6 +117,15 @@ class ModelArguments: ) }, ) + low_cpu_mem_usage: bool = field( + default=False, + metadata={ + "help": ( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "set True will benefit LLM loading time and RAM consumption." + ) + }, + ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -197,8 +207,12 @@ class DataTrainingArguments: ) }, ) + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: @@ -236,6 +250,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -285,6 +303,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + streaming=data_args.streaming, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -293,6 +312,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + streaming=data_args.streaming, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -300,6 +320,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + streaming=data_args.streaming, ) else: data_files = {} @@ -377,17 +398,21 @@ def main(): ) if model_args.model_name_or_path: - model = AutoModelForMaskedLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForMaskedLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, + ) + else: logger.info("Training new model from scratch") - model = AutoModelForMaskedLM.from_config(config) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForMaskedLM.from_config(config) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -398,17 +423,18 @@ def main(): # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: - column_names = raw_datasets["train"].column_names + column_names = list(raw_datasets["train"].features) else: - column_names = raw_datasets["validation"].column_names + column_names = list(raw_datasets["validation"].features) text_column_name = "text" if "text" in column_names else column_names[0] if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." 
) max_seq_length = 1024 else: @@ -439,14 +465,21 @@ def tokenize_function(examples): ) with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=[text_column_name], + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset line_by_line", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=[text_column_name], + ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more @@ -455,14 +488,21 @@ def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on every text in dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. @@ -470,10 +510,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= max_seq_length: - total_length = (total_length // max_seq_length) * max_seq_length + # We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. 
result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] @@ -489,13 +528,19 @@ def group_texts(examples): # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map with training_args.main_process_first(desc="grouping texts together"): - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) + if not data_args.streaming: + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) + else: + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: diff --git a/examples/multiple-choice/run_swag.py b/examples/multiple-choice/run_swag.py index 881d73ba9..cd522127a 100755 --- a/examples/multiple-choice/run_swag.py +++ b/examples/multiple-choice/run_swag.py @@ -44,10 +44,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import NeuronTrainer as Trainer from optimum.neuron import NeuronTrainingArguments as TrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") logger = logging.getLogger(__name__) @@ -235,6 +236,11 @@ def main(): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -318,14 +324,15 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForMultipleChoice.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForMultipleChoice.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) # When using your own dataset or a different dataset from swag, you will probably need to change this. ending_names = [f"ending{i}" for i in range(4)] @@ -336,8 +343,9 @@ def main(): max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. 
If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) max_seq_length = 1024 else: diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index f4088b9f5..b369571e9 100755 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -46,10 +46,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import NeuronTrainingArguments as TrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -238,6 +239,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -327,14 +332,15 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForQuestionAnswering.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForQuestionAnswering.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): @@ -586,12 +592,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()] - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index 7c1e51642..fe5213a8d 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -26,6 +26,7 @@ import datasets import evaluate +import numpy as np import transformers from datasets import load_dataset from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer @@ -42,10 +43,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import Seq2SeqNeuronTrainingArguments as Seq2SeqTrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") @@ -283,6 +285,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -371,14 +377,15 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -610,6 +617,8 @@ def post_processing_function( preds = outputs.predictions if isinstance(preds, tuple): preds = preds[0] + # Replace -100s used for padding as we can't decode them + preds = np.where(preds != -100, preds, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) # Build a map example to its corresponding features. 
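Note: the str(k) coercion and the np.where line added above address two decode-time pitfalls: SQuAD-style metrics expect string ids, and predictions padded with -100 cannot be decoded. A minimal sketch of the padding replacement, using an illustrative tokenizer checkpoint and a toy prediction array (only the np.where pattern itself comes from the patch):

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # illustrative checkpoint, not from the patch
preds = np.array([[71, 12, -100, -100], [2032, 5, 1, -100]])  # toy ids with -100 padding

# Swap the -100 padding sentinel for a real token id before decoding,
# as the patched post-processing / compute_metrics functions now do.
preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
print(tokenizer.batch_decode(preds, skip_special_tokens=True))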
diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 8d6309af2..4b05b3b08 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -49,10 +49,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import Seq2SeqNeuronTrainer as Seq2SeqTrainer from optimum.neuron import Seq2SeqNeuronTrainingArguments as Seq2SeqTrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -262,8 +263,13 @@ class DataTrainingArguments: ) def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training, validation, or test file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] @@ -271,6 +277,9 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length @@ -314,6 +323,11 @@ def main(): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -413,14 +427,15 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -462,10 +477,16 @@ def main(): # Preprocessing the datasets. # We need to tokenize inputs and targets. 
if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") column_names = raw_datasets["train"].column_names elif training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") column_names = raw_datasets["validation"].column_names elif training_args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") column_names = raw_datasets["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") @@ -541,8 +562,6 @@ def preprocess_function(examples): return model_inputs if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) @@ -559,8 +578,6 @@ def preprocess_function(examples): if training_args.do_eval: max_target_length = data_args.val_max_target_length - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) @@ -577,8 +594,6 @@ def preprocess_function(examples): if training_args.do_predict: max_target_length = data_args.val_max_target_length - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) @@ -619,10 +634,10 @@ def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] + # Replace -100s used for padding as we can't decode them + preds = np.where(preds != -100, preds, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - if data_args.ignore_pad_token_for_loss: - # Replace -100 in the labels as we can't decode them. 
- labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing @@ -634,6 +649,16 @@ def compute_metrics(eval_preds): result["gen_len"] = np.mean(prediction_lens) return result + # Override the decoding parameters of Seq2SeqTrainer + training_args.generation_max_length = ( + training_args.generation_max_length + if training_args.generation_max_length is not None + else data_args.val_max_target_length + ) + training_args.generation_num_beams = ( + data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams + ) + # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, @@ -667,15 +692,9 @@ def compute_metrics(eval_preds): # Evaluation results = {} - max_length = ( - training_args.generation_max_length - if training_args.generation_max_length is not None - else data_args.val_max_target_length - ) - num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") + metrics = trainer.evaluate(metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) @@ -685,9 +704,7 @@ def compute_metrics(eval_preds): if training_args.do_predict: logger.info("*** Predict ***") - predict_results = trainer.predict( - predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams - ) + predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict") metrics = predict_results.metrics max_predict_samples = ( data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) @@ -699,8 +716,10 @@ def compute_metrics(eval_preds): if trainer.is_world_process_zero(): if training_args.predict_with_generate: + predictions = predict_results.predictions + predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) predictions = tokenizer.batch_decode( - predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index c26fed518..31d2cc67a 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -228,6 +228,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -484,6 +488,8 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) + elif is_regression: + metric = evaluate.load("mse") else: metric = evaluate.load("accuracy") @@ -492,15 +498,10 @@ def preprocess_function(examples): def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - if data_args.task_name is not None: - result = metric.compute(predictions=preds, references=p.label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - elif is_regression: - return {"mse": ((preds - p.label_ids) ** 2).mean().item()} - else: - return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if # we already did the padding. diff --git a/examples/text-classification/run_xnli.py b/examples/text-classification/run_xnli.py index 145225c20..339a649fe 100755 --- a/examples/text-classification/run_xnli.py +++ b/examples/text-classification/run_xnli.py @@ -45,10 +45,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import NeuronTrainer as Trainer from optimum.neuron import NeuronTrainingArguments as TrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -186,6 +187,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -269,6 +274,8 @@ def main(): config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, + id2label={str(i): label for i, label in enumerate(label_list)}, + label2id={label: i for i, label in enumerate(label_list)}, finetuning_task="xnli", cache_dir=model_args.cache_dir, revision=model_args.model_revision, @@ -282,15 +289,16 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForSequenceClassification.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + ) # Preprocessing the datasets # Padding strategy diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index 17dd883ea..ba33cd4a5 100755 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -46,10 +46,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import NeuronTrainer as Trainer from optimum.neuron import NeuronTrainingArguments as TrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") @@ -228,6 +229,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
+ transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -366,15 +371,16 @@ def get_label_list(labels): use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForTokenClassification.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForTokenClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + ) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index defa8f958..cc1d79239 100755 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -49,10 +49,11 @@ from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser from optimum.neuron import Seq2SeqNeuronTrainer as Seq2SeqTrainer from optimum.neuron import Seq2SeqNeuronTrainingArguments as Seq2SeqTrainingArguments +from optimum.neuron.distributed import lazy_load_for_parallelism # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.26.0") +check_min_version("4.31.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") @@ -272,6 +273,10 @@ def main(): handlers=[logging.StreamHandler(sys.stdout)], ) + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -371,14 +376,15 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size): + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. 
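The change above applies to run_translation.py the same wrapping this patch adds around every AutoModel*.from_pretrained(...) call: the load runs inside optimum-neuron's lazy_load_for_parallelism context manager so that full weight materialization can be deferred when tensor parallelism is enabled. A minimal standalone sketch of the pattern, assuming optimum-neuron is installed; the helper name and checkpoint are illustrative, not part of the patch:

from transformers import AutoModelForSeq2SeqLM

from optimum.neuron.distributed import lazy_load_for_parallelism


def load_model(model_name_or_path: str, tensor_parallel_size: int):
    # Same shape as the patched scripts: from_pretrained is called inside the
    # context manager, which receives the tensor parallel size that the
    # examples read from NeuronTrainingArguments.
    with lazy_load_for_parallelism(tensor_parallel_size=tensor_parallel_size):
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
    return model


# In the examples the size comes from the training arguments, e.g.:
#     model = load_model(model_args.model_name_or_path, training_args.tensor_parallel_size)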
@@ -539,10 +545,10 @@ def compute_metrics(eval_preds):
         preds, labels = eval_preds
         if isinstance(preds, tuple):
             preds = preds[0]
+        # Replace -100s used for padding as we can't decode them
+        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-        if data_args.ignore_pad_token_for_loss:
-            # Replace -100 in the labels as we can't decode them.
-            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
         # Some simple post-processing
@@ -622,8 +628,10 @@ def compute_metrics(eval_preds):
 
     if trainer.is_world_process_zero():
         if training_args.predict_with_generate:
+            predictions = predict_results.predictions
+            predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
             predictions = tokenizer.batch_decode(
-                predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
             )
             predictions = [pred.strip() for pred in predictions]
             output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
diff --git a/tests/distributed/test_model_parallelization.py b/tests/distributed/test_model_parallelization.py
index 5f3bc4de3..05ef41bf5 100644
--- a/tests/distributed/test_model_parallelization.py
+++ b/tests/distributed/test_model_parallelization.py
@@ -40,6 +40,8 @@
     MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES,
 )
 
+from ..test_utils import is_trainium_test
+
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -101,6 +103,7 @@ def _generate_supported_model_class_names(
         MODELS_TO_TEST.append((model_class_name, model_name_or_path))
 
 
+@is_trainium_test
 class ModelParallelizationTestCase(unittest.TestCase):
     def get_parallel_test_python_file_content(
         self,
diff --git a/tools/create_examples_from_transformers.py b/tools/create_examples_from_transformers.py
index b2a5660a4..61d25030d 100755
--- a/tools/create_examples_from_transformers.py
+++ b/tools/create_examples_from_transformers.py
@@ -98,6 +98,7 @@ def predicate(_):
 
         path_prefix = Path(tmpdirname) / "examples" / "pytorch"
         dest_dir.mkdir(parents=True, exist_ok=True)
+
        for example in example_names:
            example_dir = path_prefix / example
            for file_path in example_dir.iterdir():
@@ -146,6 +147,43 @@ def generate_new_import_code(*optimum_neuron_imports: str) -> str:
     return " ".join(import_line)
 
+def wrap_with_lazy_load_for_parallelism(file_content: str) -> str:
+    model_loading_pattern = r"\w+ = AutoModel[\w.]+"
+    shift = 0
+    for m in re.finditer(model_loading_pattern, file_content):
+        position = m.end(0) + shift
+        opened = 1
+        if file_content[position] != "(":
+            raise ValueError(f"Did not find an opening parenthesis, match: {m}")
+        while opened > 0:
+            position += 1
+            if file_content[position] == ")":
+                opened -= 1
+            elif file_content[position] == "(":
+                opened += 1
+
+        start = m.start(0) + shift
+        model_loading_content = file_content[start : position + 1]
+        initial_length = len(model_loading_content)
+        model_loading_content = model_loading_content.replace("\n", "\n    ")
+        number_of_spaces = 0
+        for i in range(start - 1, 0, -1):
+            if file_content[i] == "\n":
+                break
+            elif file_content[i] == "\t":
+                number_of_spaces += 4
+            else:
+                number_of_spaces += 1
+        # Adding one tab to indent from the lazy_load_for_parallelism context manager.
+        number_of_spaces += 4
+        model_loading_content = " " * number_of_spaces + model_loading_content
+        new_content = f"with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):\n{model_loading_content}\n"
+        file_content = file_content[:start] + new_content + file_content[position + 1 :]
+        shift += len(new_content) - initial_length
+
+    return file_content
+
+
 def parse_args():
     parser = ArgumentParser(
         description="Tool to download and prepare 🤗 Transformers example training scripts for AWS Trainium instances."
     )
@@ -181,6 +219,8 @@ def main():
         if example_dir.is_file():
             continue
         for file_path in example_dir.iterdir():
+            if file_path.name == "run_generation.py":
+                continue
             if "run" in file_path.name and file_path.suffix == ".py":
                 if file_path.name == "run_qa.py":
                     trainer_file_path = file_path.parent / "trainer_qa.py"
@@ -216,11 +256,17 @@ def main():
                     TRAINING_ARGUMENTS_IMPORT_PATTERN, file_content
                 )
                 code = generate_new_import_code(AWS_CODE[training_args_cls])
-                code = f"\n{code}\n"
+                code = f"\n{code}\nfrom optimum.neuron.distributed import lazy_load_for_parallelism\n"
                 processed_content = insert_code_at_position(code, processed_content, import_end_index)
                 with open(training_argument_file_path, "w") as fp:
                     fp.write(processed_content)
+
+                with open(training_argument_file_path, "r") as fp:
+                    file_content = fp.read()
+                processed_content = wrap_with_lazy_load_for_parallelism(file_content)
+                with open(training_argument_file_path, "w") as fp:
+                    fp.write(processed_content)
+
             elif file_path.name == "requirements.txt":
                 with open(file_path, "r") as fp:
                     file_content = fp.read()
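For reference, a hypothetical smoke test of the wrap_with_lazy_load_for_parallelism helper added above. The import path assumes the snippet is run from the tools/ directory and that the module imports without side effects; the input snippet and the expected output in the trailing comment are illustrative, and the exact whitespace may differ:

from create_examples_from_transformers import wrap_with_lazy_load_for_parallelism  # assumed import path

# A model-loading call shaped like the ones in the example scripts.
snippet = (
    "\n"
    "    model = AutoModelForCausalLM.from_pretrained(\n"
    "        model_args.model_name_or_path,\n"
    "        config=config,\n"
    "    )\n"
)
print(wrap_with_lazy_load_for_parallelism(snippet))
# Expected shape of the result:
#     with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):
#         model = AutoModelForCausalLM.from_pretrained(
#             model_args.model_name_or_path,
#             config=config,
#         )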