From 049b00f61c9bb17bd2b20a3b77d04cc4c0f20d86 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:51:01 +0200 Subject: [PATCH] Add Transformers v4.45 support (#2023) * transformers v4.45 support * fix transformers v4.45 compatibility * update opset * update model * Add generation config saving * fix codegen * bump default opset m2m100 * fix codegen * fix bettertransformers * add warnign deprecation bettertransformer * bettertransformers fixes * disable transformers 4.45 for onnx export * update model ID --- Makefile | 4 +- optimum/bettertransformer/models/attention.py | 84 +++++++++++++++++-- .../models/decoder_models.py | 35 +++++++- optimum/bettertransformer/transformation.py | 4 + optimum/exporters/onnx/convert.py | 18 ++++ optimum/exporters/onnx/model_configs.py | 11 +-- optimum/modeling_base.py | 3 + optimum/onnxruntime/modeling_decoder.py | 58 ++++++++----- optimum/onnxruntime/modeling_ort.py | 3 - optimum/onnxruntime/modeling_seq2seq.py | 64 +++++++------- optimum/onnxruntime/optimization.py | 12 ++- setup.py | 9 +- tests/bettertransformer/testing_utils.py | 4 +- tests/onnxruntime/utils_onnxruntime_tests.py | 6 +- 14 files changed, 223 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index e2c2126303..824ef3d0cf 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: black --check . - ruff . + ruff check . style: black . - ruff . --fix + ruff check . --fix # Run tests for the library test: diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 9dfa57844d..22b8faf1c2 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -92,6 +92,71 @@ def gpt2_wrapped_scaled_dot_product( return sdpa_result, None +# Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn +def gptj_wrapped_scaled_dot_product( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, +): + raise_on_head_mask(head_mask) + batch_size = query.shape[0] + + mask_value = torch.finfo(value.dtype).min + mask_value = torch.full([], mask_value, dtype=value.dtype) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + query = query.to(value.dtype) + key = key.to(value.dtype) + + if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: + raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + + dropout_p = self.dropout_prob_attn if self.training else 0.0 + if batch_size == 1 or self.training: + if query.shape[2] > 1: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True + ) + else: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False + ) + else: + query_length, key_length = query.size(-2), key.size(-2) + + # causal_mask is always [True, ..., True] otherwise, so executing this + # is unnecessary + if query_length > 1: + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + + 
causal_mask = torch.where(causal_mask, 0, mask_value) + + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + if attention_mask is not None: + attention_mask = causal_mask + attention_mask + + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + ) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + sdpa_result = sdpa_result.to(value.dtype) + + return sdpa_result, None + + # Adapted from transformers.models.bark.modeling_bark.BarkSelfAttention._attn def bark_wrapped_scaled_dot_product( self, @@ -195,7 +260,7 @@ def codegen_wrapped_scaled_dot_product( query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True ) else: - # in this case, which is the later decoding steps, the `causal_mask`` in + # in this case, which is the later decoding steps, the `causal_mask` in # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195 # is [True, ..., True] so actually not causal sdpa_result = torch.nn.functional.scaled_dot_product_attention( @@ -207,15 +272,20 @@ def codegen_wrapped_scaled_dot_product( # causal_mask is always [True, ..., True] otherwise, so executing this # is unnecessary if query_length > 1: - causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to( + torch.bool + ) - causal_mask = torch.where(causal_mask, 0, mask_value) + causal_mask = torch.where(causal_mask, 0, mask_value) - # torch.Tensor.expand does no memory copy - causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) - # we use torch.min to avoid having tensor(-inf) - attention_mask = torch.min(causal_mask, attention_mask) + # we use torch.min to avoid having tensor(-inf) + attention_mask = torch.min(causal_mask, attention_mask) + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] sdpa_result = torch.nn.functional.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index b64b7f5a1e..52d28d076d 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -44,6 +44,7 @@ codegen_wrapped_scaled_dot_product, gpt2_wrapped_scaled_dot_product, gpt_neo_wrapped_scaled_dot_product, + gptj_wrapped_scaled_dot_product, opt_forward, t5_forward, ) @@ -82,7 +83,7 @@ def forward(self, *args, **kwargs): class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): - _attn = gpt2_wrapped_scaled_dot_product + _attn = gptj_wrapped_scaled_dot_product def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super().__init__(config) @@ -96,14 +97,22 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): "out_proj", "attn_dropout", "resid_dropout", - "bias", "scale_attn", - "masked_bias", ] # Attribute only for transformers>=4.28 if hasattr(layer, 
"embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "bias"): + submodules.append("bias") + if hasattr(layer, "masked_bias"): + submodules.append("masked_bias") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -127,6 +136,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["rotary_emb", "query_key_value", "dense", "bias", "masked_bias", "norm_factor"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -155,6 +169,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_id"): + submodules.append("layer_id") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -238,12 +257,20 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super(BetterTransformerBaseLayer, self).__init__(config) self.module_mapping = None - submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "causal_mask", "scale_attn"] + submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "scale_attn"] # Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "causal_mask"): + submodules.append("causal_mask") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index 2105e19987..a101757b6f 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -206,6 +206,10 @@ def transform( The converted model if the conversion has been successful. """ + logger.warning( + "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release." 
+ ) + hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 63a9067b90..f2bf95f3e3 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,6 +26,7 @@ import numpy as np import onnx +import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -34,6 +35,7 @@ DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, TORCH_MINIMUM_VERSION, + check_if_transformers_greater, is_diffusers_available, is_torch_onnx_support_available, logging, @@ -999,6 +1001,10 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ + if check_if_transformers_greater("4.44.99"): + raise ImportError( + f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" + ) TasksManager.standardize_model_attributes(model) @@ -1120,6 +1126,18 @@ def onnx_export_from_model( if isinstance(atol, dict): atol = atol[task.replace("-with-past", "")] + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + if model.can_generate() and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) + # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4b15b2968..36963a986d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -119,7 +119,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class ConvBertOnnxConfig(BertOnnxConfig): @@ -171,11 +171,11 @@ class MPNetOnnxConfig(DistilBertOnnxConfig): class RobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class CamembertOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class FlaubertOnnxConfig(BertOnnxConfig): @@ -187,7 +187,7 @@ class IBertOnnxConfig(DistilBertOnnxConfig): class XLMRobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
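The opset bumps in optimum/exporters/onnx/model_configs.py follow from transformers dispatching to torch.nn.functional.scaled_dot_product_attention by default for torch>=2.1.1, and that operator only gains an ONNX symbolic at opset 14. A minimal standalone sketch of the constraint (illustration only, not part of the patch; the tiny module and tensor shapes are arbitrary):

    import torch

    class TinySDPA(torch.nn.Module):
        def forward(self, q, k, v):
            # the attention primitive the bumped configs now assume by default
            return torch.nn.functional.scaled_dot_product_attention(q, k, v)

    q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
    # exporting with opset_version < 14 fails on aten::scaled_dot_product_attention;
    # opset 14, the new DEFAULT_ONNX_OPSET, is the minimum that exports it
    torch.onnx.export(TinySDPA(), (q, k, v), "sdpa.onnx", opset_version=14)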
class DebertaOnnxConfig(BertOnnxConfig): @@ -257,7 +257,7 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") @@ -564,6 +564,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class M2M100OnnxConfig(TextSeq2SeqOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 3da2d9d0d2..29521b7c0c 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -371,6 +371,9 @@ def from_pretrained( export = from_transformers if len(model_id.split("@")) == 2: + logger.warning( + f"Specifying the `revision` as @{model_id.split('@')[1]} is deprecated and will be removed in v1.23, please use the `revision` argument instead." + ) if revision is not None: logger.warning( f"The argument `revision` was set to {revision} but will be ignored for {model_id.split('@')[1]}" diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index f6d4b7e20a..bda3ec98d9 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -14,7 +14,6 @@ """Classes handling causal-lm related architectures in ONNX Runtime.""" import logging -import warnings from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union @@ -149,6 +148,19 @@ def __init__( generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + self.onnx_paths = [self.model_path] self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type @@ -393,7 +405,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -410,15 +421,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -586,6 +589,22 @@ def _from_pretrained( else: init_cls = ORTModelForCausalLM + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + return init_cls( model=model, config=config, @@ -593,6 +612,7 @@ def _from_pretrained( model_save_dir=model_save_dir, preprocessors=preprocessors, use_cache=use_cache, + generation_config=generation_config, ) @classmethod @@ -600,7 +620,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -616,15 +635,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - file_name = ONNX_WEIGHTS_NAME if use_merged: @@ -655,8 +665,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -712,6 +720,10 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> for layer_past in past ) + def _save_pretrained(self, save_directory: Union[str, Path]): + super()._save_pretrained(save_directory) + self.generation_config.save_pretrained(save_directory) + class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9166f7c2cb..17bd3e2a4e 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -663,8 +663,6 @@ def _export( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -1171,7 +1169,6 @@ def _export( library_name="transformers", ) - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 3cecadafe3..fda3ca82bb 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -18,7 +18,6 @@ import logging import shutil -import warnings from abc import ABC, abstractmethod from 
pathlib import Path from tempfile import TemporaryDirectory @@ -706,6 +705,18 @@ def show_deprecated_argument(arg_name): generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + @abstractmethod def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: pass @@ -780,7 +791,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -799,15 +809,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -996,19 +998,21 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - generation_config = None - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - ) - except OSError: - logger.info("Generation config file not found, using a generation config created from the model config.") + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) onnx_paths = [encoder_path] if use_merged is False: @@ -1035,7 +1039,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -1051,15 +1054,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForConditionalGeneration": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if use_cache is False and use_merged is True: raise ValueError( "The incompatible arguments use_cache=False, use_merged=True were passed to" @@ -1091,8 +1085,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py index 9e62a3f324..fd6958bba7 100644 --- a/optimum/onnxruntime/optimization.py +++ b/optimum/onnxruntime/optimization.py @@ -20,6 +20,7 @@ import onnx from onnx import load_model +from transformers import GenerationConfig from transformers.models.auto.configuration_auto import AutoConfig from onnxruntime.transformers.onnx_model_bert import BertOnnxModel @@ -152,10 +153,6 @@ def optimize( save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) ORTConfigManager.check_optimization_supported_model(self.model_type, optimization_config) - - self.config.save_pretrained(save_dir) - maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) - model_type = ORTConfigManager.get_model_ort_type(self.config.model_type) optimization_options = optimization_config.create_fusion_options(model_type) @@ -236,6 +233,13 @@ def optimize( # Save the model configuration self.config.save_pretrained(save_dir) ort_config.save_pretrained(save_dir) + maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) + + try: + generation_config = GenerationConfig.from_pretrained(self.onnx_model_path[0].parent) + generation_config.save_pretrained(save_dir) + except Exception: + pass logger.info( f"Optimized model saved at: {save_dir} (external data format: " diff --git a/setup.py b/setup.py index ac5db71a74..24c1ae1cd4 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29,<4.45.0", + "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
+ "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -75,6 +77,7 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", + "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index e9e2edd979..098882180a 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -59,12 +59,12 @@ # "llama": "fxmarty/tiny-llama-fast-tokenizer", # "llama-gqa": "noamwies/llama-test-gqa-with-better-transformer", "m2m_100": "hf-internal-testing/tiny-random-nllb", - "marian": "fxmarty/tiny-marian", # the other tiny ones have a too small max_position_embeddings + "marian": "optimum-internal-testing/tiny-random-marian", # the other tiny ones have a too small max_position_embeddings "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", - "prophetnet": "hirotasoshu/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings + "prophetnet": "optimum-internal-testing/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings "rembert": "hf-internal-testing/tiny-random-RemBertModel", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "rocbert": "hf-internal-testing/tiny-random-RoCBertModel", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 0790f6329d..17f3b391b0 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -112,9 +112,9 @@ "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", - "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", @@ -152,7 +152,7 @@ "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel", "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2", "vit": "hf-internal-testing/tiny-random-vit", - "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken + "whisper": "optimum-internal-testing/tiny-random-whisper", "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "wavlm": "hf-internal-testing/tiny-random-WavlmModel",