From 305b41a08327bc816d1b6153f7a86bf62b006621 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 6 Jun 2024 12:40:11 +0200 Subject: [PATCH] remove attention_mask creation as ORTModelForxxx's corresponding processors will create it --- optimum/onnxruntime/modeling_ort.py | 57 +++++------------------------ 1 file changed, 10 insertions(+), 47 deletions(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 84d36df9aa..734c9b6551 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -267,7 +267,6 @@ def __init__( **kwargs, ) - # why are these not lists ? self.input_names = {input_key.name: idx for idx, input_key in enumerate(model.get_inputs())} self.input_dtypes = {input_key.name: input_key.type for input_key in model.get_inputs()} @@ -740,7 +739,7 @@ def _output_shape_inference(self, axis_name: Union[str, int], dimensions: Dict[s # exception. return int(eval(" ".join(tokens))) - # this method is bloated with state arguments (that are accesible using self) why ? + # TODO: this method is bloated with state arguments (that are accessible using self) why ? 
def _prepare_io_binding( self, model: ort.InferenceSession, @@ -1013,15 +1012,12 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(input_ids) - else: - attention_mask = np.ones_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, attention_mask, token_type_ids, ordered_input_names=self._ordered_input_names + input_ids, + attention_mask, + token_type_ids, + ordered_input_names=self._ordered_input_names, ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1037,7 +1033,7 @@ def forward( onnx_outputs = self.model.run(None, onnx_inputs) model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # why do we only return last_hidden_state? why not all outputs? + # TODO: why do we only return last_hidden_state? why not all outputs? # that way, there will be less need for ORTModelForCustomTask in cases where # we just want to extend model outputs with attentions, hidden_states, etc. 
last_hidden_state = model_outputs["last_hidden_state"] @@ -1161,15 +1157,12 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(input_ids) - else: - attention_mask = np.ones_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, attention_mask, token_type_ids, ordered_input_names=self._ordered_input_names + input_ids, + attention_mask, + token_type_ids, + ordered_input_names=self._ordered_input_names, ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1253,12 +1246,6 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(input_ids) - else: - attention_mask = np.ones_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1366,12 +1353,6 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(input_ids) - else: - attention_mask = np.ones_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1462,12 +1443,6 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(input_ids) - else: - attention_mask = np.ones_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1551,12 +1526,6 @@ def forward( 
use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(input_ids) - else: - attention_mask = np.ones_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1862,12 +1831,6 @@ def forward( use_torch = isinstance(model_input, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if attention_mask is None: - if use_torch: - attention_mask = torch.ones_like(model_input) - else: - attention_mask = np.ones_like(model_input) - if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( model_input,