diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index e77f649f69..9e97908464 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -72,7 +72,6 @@
 )
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
 from .model_patcher import (
-    CLIPModelPatcher,
     FalconModelPatcher,
     MistralModelPatcher,
     MusicgenModelPatcher,
@@ -907,6 +906,7 @@ class CLIPNormalizedConfig(NormalizedTextAndVisionConfig):
 
 class CLIPVisionModelOnnxConfig(VisionOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -920,16 +920,10 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
 
         return common_outputs
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class CLIPOnnxConfig(TextAndVisionOnnxConfig):
     NORMALIZED_CONFIG_CLASS = CLIPNormalizedConfig
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -948,13 +942,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
             "image_embeds": {0: "image_batch_size"},
         }
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class SentenceTransformersCLIPOnnxConfig(CLIPOnnxConfig):
     @property
@@ -1000,13 +987,6 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
 
         return common_outputs
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class CLIPTextOnnxConfig(CLIPTextWithProjectionOnnxConfig):
     @property
@@ -1031,13 +1011,6 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
             dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int32)
         return dummy_inputs
 
-    def patch_model_for_export(
-        self,
-        model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> "ModelPatcher":
-        return CLIPModelPatcher(self, model, model_kwargs=model_kwargs)
-
 
 class UNetOnnxConfig(VisionOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-3
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 34ed5fcae4..4c1f845893 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -1138,20 +1138,3 @@ def __init__(
                 self._update_causal_mask_original = self._model.model._update_causal_mask
             else:
                 self._update_causal_mask_original = self._model._update_causal_mask
-
-
-class CLIPModelPatcher(ModelPatcher):
-    def __enter__(self):
-        super().__enter__()
-
-        if _transformers_version >= version.parse("4.43"):
-            from transformers.models.clip.modeling_clip import CLIPAttention, CLIPSdpaAttention
-
-            self.original_sdpa_forward, CLIPSdpaAttention.forward = CLIPSdpaAttention.forward, CLIPAttention.forward
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        super().__exit__(exc_type, exc_value, traceback)
-        if _transformers_version >= version.parse("4.43"):
-            from transformers.models.clip.modeling_clip import CLIPSdpaAttention
-
-            CLIPSdpaAttention.forward = self.original_sdpa_forward