diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 4893b681a6..291a3b0833 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -50,6 +50,8 @@ jobs: pytest onnxruntime -m "run_in_series" --durations=0 -vvvv -s - name: Test with pytest (in parallel) + env: + FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} working-directory: tests run: | pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml new file mode 100644 index 0000000000..c71afbbb45 --- /dev/null +++ b/.github/workflows/trufflehog.yml @@ -0,0 +1,17 @@ +on: + push: + +name: Secret Leaks + +jobs: + trufflehog: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Secret Scanning + uses: trufflesecurity/trufflehog@main + + diff --git a/optimum/commands/__init__.py b/optimum/commands/__init__.py index 540ea4dd86..8a2a276d1c 100644 --- a/optimum/commands/__init__.py +++ b/optimum/commands/__init__.py @@ -15,5 +15,4 @@ from .base import BaseOptimumCLICommand, CommandInfo, RootOptimumCLICommand from .env import EnvironmentCommand from .export import ExportCommand, ONNXExportCommand, TFLiteExportCommand -from .onnxruntime import ONNXRuntimeCommand, ONNXRuntimeOptimizeCommand, ONNXRuntimeQuantizeCommand -from .optimum_cli import register_optimum_cli_subcommand +from .optimum_cli import optimum_cli_subcommand diff --git a/optimum/commands/optimum_cli.py b/optimum/commands/optimum_cli.py index 4bae9bb5f8..64a7075c6c 100644 --- a/optimum/commands/optimum_cli.py +++ b/optimum/commands/optimum_cli.py @@ -17,16 +17,57 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple, Type, Union +from ..subpackages import load_subpackages from ..utils import logging from .base import BaseOptimumCLICommand, CommandInfo, RootOptimumCLICommand from .env import EnvironmentCommand from .export import ExportCommand -from .onnxruntime import ONNXRuntimeCommand logger = logging.get_logger() -OPTIMUM_CLI_SUBCOMMANDS = [ExportCommand, EnvironmentCommand, ONNXRuntimeCommand] +# The table below contains the optimum-cli root subcommands provided by the optimum package +OPTIMUM_CLI_ROOT_SUBCOMMANDS = [ExportCommand, EnvironmentCommand] + +# The table below is dynamically populated when loading subpackages +_OPTIMUM_CLI_SUBCOMMANDS = [] + + +def optimum_cli_subcommand(parent_command: Optional[Type[BaseOptimumCLICommand]] = None): + """ + A decorator to declare optimum-cli subcommands. + + The declaration of an optimum-cli subcommand looks like this: + + ``` + @optimum_cli_subcommand() + class MySubcommand(BaseOptimumCLICommand): + + ``` + + or + + ``` + @optimum_cli_subcommand(ExportCommand) + class MySubcommand(BaseOptimumCLICommand): + + ``` + + Args: + parent_command: (`Optional[Type[BaseOptimumCLICommand]]`): + The class of the parent command or None if this is a top-level command. Defaults to None. 
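+
+    The decorated class is only added to an internal registry here; it is attached to the `optimum-cli` parser
+    when the CLI entry point runs and registers all declared subcommands.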
+ + """ + + if parent_command is not None and not issubclass(parent_command, BaseOptimumCLICommand): + raise ValueError(f"The parent command {parent_command} must be a subclass of BaseOptimumCLICommand") + + def wrapper(subcommand): + if not issubclass(subcommand, BaseOptimumCLICommand): + raise ValueError(f"The subcommand {subcommand} must be a subclass of BaseOptimumCLICommand") + _OPTIMUM_CLI_SUBCOMMANDS.append((subcommand, parent_command)) + + return wrapper def resolve_command_to_command_instance( @@ -137,15 +178,19 @@ def main(): root = RootOptimumCLICommand("Optimum CLI tool", usage="optimum-cli") parser = root.parser - for subcommand_cls in OPTIMUM_CLI_SUBCOMMANDS: + for subcommand_cls in OPTIMUM_CLI_ROOT_SUBCOMMANDS: register_optimum_cli_subcommand(subcommand_cls, parent_command=root) - commands_in_register = dynamic_load_commands_in_register() + # Load subpackages to give them a chance to declare their own subcommands + load_subpackages() + + # Register subcommands declared by the subpackages or found in the register files under commands/register + commands_to_register = _OPTIMUM_CLI_SUBCOMMANDS + dynamic_load_commands_in_register() command2command_instance = resolve_command_to_command_instance( - root, [parent_command_cls for _, parent_command_cls in commands_in_register if parent_command_cls is not None] + root, [parent_command_cls for _, parent_command_cls in commands_to_register if parent_command_cls is not None] ) - for command_or_command_info, parent_command in commands_in_register: + for command_or_command_info, parent_command in commands_to_register: if parent_command is None: parent_command_instance = root else: diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index 37a42714fc..b8734da478 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -182,40 +182,11 @@ def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train") def get_ptb(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): - if split == "train": - data = load_dataset("ptb_text_only", "penn_treebank", split="train") - elif split == "validation": - data = load_dataset("ptb_text_only", "penn_treebank", split="validation") - - enc = tokenizer(" ".join(data["sentence"]), return_tensors="pt") - - dataset = [] - for _ in range(nsamples): - i = random.randint(0, enc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = enc.input_ids[:, i:j] - attention_mask = torch.ones_like(inp) - dataset.append({"input_ids": inp, "attention_mask": attention_mask}) - - return dataset + raise RuntimeError("Loading the `ptb` dataset was deprecated") def get_ptb_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): - if split == "train": - data = load_dataset("ptb_text_only", "penn_treebank", split="train") - elif split == "validation": - data = load_dataset("ptb_text_only", "penn_treebank", split="test") - - enc = tokenizer(" ".join(data["sentence"]), return_tensors="pt") - - dataset = [] - for _ in range(nsamples): - i = random.randint(0, enc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = enc.input_ids[:, i:j] - attention_mask = torch.ones_like(inp) - dataset.append({"input_ids": inp, "attention_mask": attention_mask}) - return dataset + raise RuntimeError("Loading the `ptb` dataset was deprecated") def get_dataset( @@ -226,7 +197,7 @@ def get_dataset( Args: dataset_name (`str`): - Dataset name. Available options are `['wikitext2', 'c4', 'ptb', 'c4-new', 'ptb_new']`. + Dataset name. Available options are `['wikitext2', 'c4', 'c4-new']`. 
tokenizer (`Any`): Tokenizer of the model nsamples (`int`, defaults to `128`): @@ -247,11 +218,13 @@ def get_dataset( "wikitext2": get_wikitext2, "c4": get_c4, "c4-new": get_c4_new, - "ptb": get_ptb, - "ptb-new": get_ptb_new, } if split not in ["train", "validation"]: raise ValueError(f"The split need to be 'train' or 'validation' but found {split}") + if dataset_name in {"ptb", "ptb-new"}: + raise ValueError( + f"{dataset_name} dataset was deprecated, only the following dataset are supported : {list(get_dataset_map)}" + ) if dataset_name not in get_dataset_map: raise ValueError(f"Expected a value in {list(get_dataset_map.keys())} but found {dataset_name}") get_dataset_fn = get_dataset_map[dataset_name] diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 2c2c9d7e71..902af87bbb 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -432,7 +432,10 @@ def store_input_hook(_, input, *args): for data in dataset: for k, v in data.items(): # put the data on gpu, we won't put them back to cpu - data[k] = v.to(0) + if not has_device_map or device.type == "cpu": + data[k] = v.to(0) + else: + data[k] = v.to(device) try: model(**data) except ValueError: @@ -458,7 +461,10 @@ def store_input_hook(_, input, *args): for data in dataset: for k, v in data.items(): # put the data on gpu, we won't put them back to cpu - data[k] = v.to(0) + if not has_device_map or device.type == "cpu": + data[k] = v.to(0) + else: + data[k] = v.to(device) try: model(**data) except ValueError: diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index bf9c80a86c..16461dce95 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -14,7 +14,7 @@ """Defines the base classes that are used to perform inference with ONNX Runtime of Transformers models.""" from abc import abstractmethod -from typing import TYPE_CHECKING, Dict, Optional, Set, Tuple, Union +from typing import Dict, Optional, Set, Tuple, Union import numpy as np import torch @@ -24,22 +24,22 @@ from ..utils import NormalizedConfigManager from ..utils.logging import warn_once +from .modeling_ort import ORTModel from .utils import get_ordered_input_names, logging logger = logging.get_logger(__name__) -if TYPE_CHECKING: - from .modeling_ort import ORTModel - - class ORTModelPart: """ For multi-file ONNX models, such as encoder-decoder models, represents a part of the model. It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. 
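+    Input and output dtypes are recorded so that `_prepare_onnx_inputs` / `_prepare_onnx_outputs` (borrowed from
+    `ORTModel`) can convert between torch tensors and numpy arrays with the dtypes expected by the session.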
""" + _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs + _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs + def __init__( self, session: InferenceSession, @@ -53,6 +53,8 @@ def __init__( self.main_input_name = self.parent_model.main_input_name self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward) @@ -98,25 +100,13 @@ def forward( last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) else: - if use_torch: - onnx_inputs = {"input_ids": input_ids.cpu().detach().numpy()} - - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask.cpu().detach().numpy() - else: - onnx_inputs = {"input_ids": input_ids} + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # Run inference - outputs = self.session.run(None, onnx_inputs) - - last_hidden_state = outputs[self.output_names["last_hidden_state"]] - if use_torch: - last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device) + last_hidden_state = model_outputs["last_hidden_state"] return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -350,83 +340,29 @@ def forward( else: raise ValueError("Unsupported num_pkv") else: - if use_torch: - onnx_inputs = { - "input_ids": input_ids.cpu().detach().numpy(), - } - - # Add the encoder_hidden_states inputs when needed - if "encoder_hidden_states" in self.input_names: - onnx_inputs["encoder_hidden_states"] = encoder_hidden_states.cpu().detach().numpy() - - # Add the decoder_attention_mask inputs when needed - if "decoder_attention_mask" in self.input_names: - onnx_inputs["decoder_attention_mask"] = decoder_attention_mask.cpu().detach().numpy() - - # Add the encoder_attention_mask inputs when needed - if "encoder_attention_mask" in self.input_names: - onnx_inputs["encoder_attention_mask"] = encoder_attention_mask.cpu().detach().numpy() - - if past_key_values is not None: - # Add the past_key_values to the decoder inputs - for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): - onnx_inputs[input_name] = past_key_value.cpu().detach().numpy() - - if "labels" in self.input_names: - # TODO: Any preprocessing like `self._shift_right(labels)`? 
- onnx_inputs["labels"] = labels.cpu().detach().numpy() - - if self.parent_model.use_merged is True: - onnx_inputs["use_cache_branch"] = use_cache_branch_tensor.cpu().detach().numpy() - else: - onnx_inputs = { - "input_ids": input_ids, - } - - # Add the encoder_hidden_states inputs when needed - if "encoder_hidden_states" in self.input_names: - onnx_inputs["encoder_hidden_states"] = encoder_hidden_states - - # Add the decoder_attention_mask inputs when needed - if "decoder_attention_mask" in self.input_names: - onnx_inputs["decoder_attention_mask"] = decoder_attention_mask - - # Add the encoder_attention_mask inputs when needed - if "encoder_attention_mask" in self.input_names: - onnx_inputs["encoder_attention_mask"] = encoder_attention_mask - - if past_key_values is not None: - # Add the past_key_values to the decoder inputs - for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): - onnx_inputs[input_name] = past_key_value - - if "labels" in self.input_names: - # TODO: Any preprocessing like `self._shift_right(labels)`? - onnx_inputs["labels"] = labels - - if self.parent_model.use_merged is True: - onnx_inputs["use_cache_branch"] = use_cache_branch_tensor + model_inputs = { + "input_ids": input_ids, + "encoder_hidden_states": encoder_hidden_states, + "decoder_attention_mask": decoder_attention_mask, + "encoder_attention_mask": encoder_attention_mask, + "use_cache_branch": use_cache_branch_tensor, + "labels": labels, + } + if past_key_values is not None: + model_inputs.update(zip(self.key_value_input_names, past_key_values)) - # Run inference - outputs = self.session.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # TODO: using two loops here is probably unefficient + # TODO: using a new variable out_past_key_values is memory inefficient, + # past_key_values is not used anymore at this point # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) - out_past_key_values = tuple( - torch.from_numpy(outputs[self.output_names[key]]).to(self.device) - for key in self.key_value_output_names - ) - - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + out_past_key_values = tuple(model_outputs[output_name] for output_name in self.key_value_output_names) - loss = None - if "loss" in self.output_names: - loss = outputs[self.output_names["loss"]] - if use_torch: - loss = torch.from_numpy(loss).to(self.device) + loss = model_outputs.get("loss", None) + logits = model_outputs["logits"] # TODO: this is extremely ugly and unreadable. What if cross-attention k/v change? 
# Tuple of tuple of length `n_layers`, with each tuple of length equal to: diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 2d9be2d757..5d4bbe184e 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -46,7 +46,7 @@ if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: - from transformers.generation_utils import GenerationMixin + from transformers.generation_utils import GenerationMixin # type: ignore # noqa: F401 logger = logging.getLogger(__name__) @@ -139,15 +139,16 @@ def __init__( self.num_pkv = 2 self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) - self.key_value_input_names = [key for key in self.inputs_names if (".key" in key) or (".value" in key)] + self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)] self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] self.use_cache = len(self.key_value_input_names) > 0 if generation_config is None: generation_config = GenerationConfig.from_model_config(config) + self.generation_config = generation_config self.onnx_paths = [self.model_path] - self.use_merged = "use_cache_branch" in self.inputs_names + self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type self.use_fp16 = False @@ -160,7 +161,7 @@ def __init__( # Reference: https://github.com/huggingface/optimum/pull/1381 model_type = config.model_type.replace("_", "-") - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and "position_ids" not in self.inputs_names: + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and "position_ids" not in self.input_names: logger.warning( f"ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture {model_type}. " "We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support." @@ -202,7 +203,6 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - inputs = {} known_output_shapes = {} use_cache_branch = None loss = None @@ -226,10 +226,10 @@ def forward( # I suspect the reason is the contiguous python list that messes something up? 
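            # the inputs below are gathered as a positional list; prepare_io_binding pairs them with
            # self._ordered_input_names to bind each tensor to the matching session input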
model_inputs = [input_ids.contiguous()] - if "attention_mask" in self.inputs_names: + if "attention_mask" in self.input_names: model_inputs.append(attention_mask) - if "position_ids" in self.inputs_names: + if "position_ids" in self.input_names: if position_ids is None: raise ValueError("position_ids was not passed but is a required input for this ONNX model.") model_inputs.append(position_ids.contiguous()) @@ -240,12 +240,11 @@ def forward( if use_cache_branch is not None: model_inputs.append(use_cache_branch) - if "labels" in self.inputs_names: + if "labels" in self.input_names: model_inputs.append(labels) known_output_shapes.update({"loss": []}) - io_binding, output_shapes, output_buffers = self._prepare_io_binding( - self.model, + io_binding, output_shapes, output_buffers = self.prepare_io_binding( *model_inputs, known_output_shapes=known_output_shapes, ordered_input_names=self._ordered_input_names, @@ -259,53 +258,41 @@ def forward( io_binding.synchronize_outputs() if self.use_cache: - # Tuple of length equal to : number of layer * number of past_key_value per decoder layer(2) - past_key_values = () - for name in self.key_value_output_names: - past_key_values += (output_buffers[name].view(output_shapes[name]),) + # Tuple of length equal to : number of layer * number of past_key_value per decoder layer(2 for the self-attention) + past_key_values = tuple( + output_buffers[name].view(output_shapes[name]) for name in self.key_value_output_names + ) logits = output_buffers["logits"].view(output_shapes["logits"]) if "loss" in self.output_names: loss = output_buffers["loss"].view(output_shapes["loss"]) else: - inputs["input_ids"] = input_ids.cpu().detach().numpy() if use_torch else input_ids - - if "attention_mask" in self.inputs_names: - inputs["attention_mask"] = attention_mask.cpu().detach().numpy() if use_torch else attention_mask - - if "labels" in self.inputs_names: - inputs["labels"] = labels.cpu().detach().numpy() if use_torch else labels - - if "position_ids" in self.inputs_names: - if position_ids is None: - raise ValueError("position_ids was not passed but is a required input for this ONNX model.") - inputs["position_ids"] = position_ids.cpu().detach().numpy() if use_torch else position_ids - - # Add the past_key_values to the decoder inputs + model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "use_cache_branch": use_cache_branch, + "labels": labels, + } if past_key_values is not None: - for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): - inputs[input_name] = past_key_value.cpu().detach().numpy() if use_torch else past_key_value + model_inputs.update( + zip(self.key_value_input_names, past_key_values), + ) - if use_cache_branch is not None: - inputs["use_cache_branch"] = use_cache_branch.cpu().detach().numpy() if use_torch else use_cache_branch + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - outputs = self.model.run(None, inputs) + loss = model_outputs.get("loss", None) + logits = model_outputs["logits"] if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 for the self-attention) - past_key_values = tuple( - torch.from_numpy(outputs[self.output_names[key]]).to(self.device) - for key in self.key_value_output_names - ) - - logits = 
torch.from_numpy(outputs[self.output_names["logits"]]).to(self.device) - if "loss" in self.output_names: - loss = torch.from_numpy(outputs[self.output_names["loss"]]).to(self.device) + past_key_values = tuple(model_outputs[output_name] for output_name in self.key_value_output_names) if self.use_cache and self.model_type != "gpt_bigcode": - # Tuple of tuple of length `n_layers`, with each tuple of length equal to the number of self-attention and - # per decoder layer + # Tuple of tuple of length `n_layers`, with each tuple of length equal to the number of self-attention and per decoder layer past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) ) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index eb38a7fef1..734c9b6551 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -267,10 +267,13 @@ def __init__( **kwargs, ) - self.inputs_names = {input_key.name: idx for idx, input_key in enumerate(model.get_inputs())} + self.input_names = {input_key.name: idx for idx, input_key in enumerate(model.get_inputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in model.get_inputs()} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(model.get_outputs())} + self.output_dtypes = {output_key.name: output_key.type for output_key in model.get_outputs()} - self._ordered_input_names = get_ordered_input_names(self.inputs_names.keys(), func=self.forward) + self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward) # TODO: why do we make device a property since we are only access the value, and do not do any check when setting the value? @property @@ -736,6 +739,7 @@ def _output_shape_inference(self, axis_name: Union[str, int], dimensions: Dict[s # exception. return int(eval(" ".join(tokens))) + # TODO: this method is bloated with state arguments (that are accesible using self) why ? def _prepare_io_binding( self, model: ort.InferenceSession, @@ -833,9 +837,15 @@ def _prepare_io_binding( return io_binding, output_shapes, output_buffers - def prepare_io_binding(self, *model_inputs, ordered_input_names, known_output_shapes=None): + def prepare_io_binding( + self, *model_inputs, ordered_input_names, outputs_to_not_bind=None, known_output_shapes=None + ): return self._prepare_io_binding( - self.model, ordered_input_names=ordered_input_names, known_output_shapes=known_output_shapes, *model_inputs + self.model, + *model_inputs, + ordered_input_names=ordered_input_names, + known_output_shapes=known_output_shapes, + outputs_to_not_bind=outputs_to_not_bind, ) def raise_on_numpy_input_io_binding(self, use_torch: bool): @@ -852,6 +862,39 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool): " with model.use_io_binding = False, or pass torch.Tensor inputs instead." 
) + def _prepare_onnx_inputs( + self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray] + ) -> Dict[str, np.ndarray]: + onnx_inputs = {} + + # converts pytorch inputs into numpy inputs for onnx + for input_name in self.input_names.keys(): + onnx_inputs[input_name] = inputs.pop(input_name) + + if use_torch: + onnx_inputs[input_name] = onnx_inputs[input_name].cpu().detach().numpy() + + if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: + onnx_inputs[input_name] = onnx_inputs[input_name].astype( + TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) + ) + + return onnx_inputs + + def _prepare_onnx_outputs( + self, use_torch: bool, *onnx_outputs: np.ndarray + ) -> Dict[str, Union[torch.Tensor, np.ndarray]]: + model_outputs = {} + + # converts onnxruntime outputs into tensor for standard outputs + for output_name, idx in self.output_names.items(): + model_outputs[output_name] = onnx_outputs[idx] + + if use_torch: + model_outputs[output_name] = torch.from_numpy(model_outputs[output_name]).to(self.device) + + return model_outputs + @staticmethod def _cached_file( model_path: Union[Path, str], @@ -970,9 +1013,6 @@ def forward( self.raise_on_numpy_input_io_binding(use_torch) if self.device.type == "cuda" and self.use_io_binding: - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, attention_mask, @@ -985,35 +1025,21 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return BaseModelOutput( - last_hidden_state=output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) - ) + last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - if attention_mask is None: - attention_mask = np.ones_like(input_ids) - else: - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - last_hidden_state = outputs[self.output_names["last_hidden_state"]] - if use_torch: - last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device) + # TODO: why do we only return last_hidden_state? why not all outputs? + # that way, there will be less need for ORTModelForCustomTask in cases where + # we just want to extend model outputs with attentions, hidden_states, etc. 
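+            # when use_torch is True, _prepare_onnx_outputs has already converted the outputs back to torch tensors on self.device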
+ last_hidden_state = model_outputs["last_hidden_state"] - # converts output to namedtuple for pipelines post-processing - return BaseModelOutput(last_hidden_state=last_hidden_state) + # converts output to namedtuple for pipelines post-processing + return BaseModelOutput(last_hidden_state=last_hidden_state) @classmethod def _export( @@ -1144,32 +1170,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return MaskedLMOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = outputs[self.output_names["logits"]] + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return MaskedLMOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return MaskedLMOutput(logits=logits) QUESTION_ANSWERING_EXAMPLE = r""" @@ -1247,37 +1259,21 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return QuestionAnsweringModelOutput( - start_logits=output_buffers["start_logits"].view(output_shapes["start_logits"]), - end_logits=output_buffers["end_logits"].view(output_shapes["end_logits"]), - ) + # TODO: this is the same routine in all io binding branches, should we refactor it into a prepare_io_binding_outputs method? 
+ start_logits = output_buffers["start_logits"].view(output_shapes["start_logits"]) + end_logits = output_buffers["end_logits"].view(output_shapes["end_logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - - start_logits = outputs[self.output_names["start_logits"]] - end_logits = outputs[self.output_names["end_logits"]] - if use_torch: - start_logits = torch.from_numpy(start_logits).to(self.device) - end_logits = torch.from_numpy(end_logits).to(self.device) + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # converts output to namedtuple for pipelines post-processing - return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) + start_logits = model_outputs["start_logits"] + end_logits = model_outputs["end_logits"] + + # converts output to namedtuple for pipelines post-processing + return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) SEQUENCE_CLASSIFICATION_EXAMPLE = r""" @@ -1370,30 +1366,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - outputs = self.model.run(None, onnx_inputs) + logits = model_outputs["logits"] - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return SequenceClassifierOutput(logits=logits) TOKEN_CLASSIFICATION_EXAMPLE = r""" @@ -1472,32 +1456,17 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return TokenClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - 
attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = outputs[self.output_names["logits"]] + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - # converts output to namedtuple for pipelines post-processing - return TokenClassifierOutput(logits=logits) + logits = model_outputs["logits"] + + return TokenClassifierOutput(logits=logits) MULTIPLE_CHOICE_EXAMPLE = r""" @@ -1570,31 +1539,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return MultipleChoiceModelOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - input_ids = input_ids.cpu().detach().numpy() - attention_mask = attention_mask.cpu().detach().numpy() - if token_type_ids is not None: - token_type_ids = token_type_ids.cpu().detach().numpy() - - onnx_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - if token_type_ids is not None: - onnx_inputs["token_type_ids"] = token_type_ids - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = outputs[self.output_names["logits"]] + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return MultipleChoiceModelOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return MultipleChoiceModelOutput(logits=logits) IMAGE_CLASSIFICATION_EXAMPLE = r""" @@ -1662,7 +1618,8 @@ def forward( if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( - pixel_values, ordered_input_names=self._ordered_input_names + pixel_values, + ordered_input_names=self._ordered_input_names, ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1670,25 +1627,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return ImageClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - pixel_values = pixel_values.cpu().detach().numpy() + model_inputs = {"pixel_values": pixel_values} - onnx_inputs = { - "pixel_values": pixel_values, - } - - # run inference - outputs = self.model.run(None, onnx_inputs) - logits = 
outputs[self.output_names["logits"]] + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return ImageClassifierOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return ImageClassifierOutput(logits=logits) SEMANTIC_SEGMENTATION_EXAMPLE = r""" @@ -1746,51 +1696,37 @@ class ORTModelForSemanticSegmentation(ORTModel): checkpoint="optimum/segformer-b0-finetuned-ade-512-512", ) ) - def forward(self, **kwargs): - use_torch = isinstance(next(iter(kwargs.values())), torch.Tensor) + def forward( + self, + pixel_values: Union[torch.Tensor, np.ndarray], + **kwargs, + ): + use_torch = isinstance(pixel_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) if self.device.type == "cuda" and self.use_io_binding: - io_binding = IOBindingHelper.prepare_io_binding( - self, - **kwargs, + io_binding, output_shapes, output_buffers = self.prepare_io_binding( + pixel_values, ordered_input_names=self._ordered_input_names, ) - # run inference with binding + # run inference with binding & synchronize in case of multiple CUDA streams io_binding.synchronize_inputs() self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - outputs = {} - for name, output in zip(self.output_names.keys(), io_binding._iobinding.get_outputs()): - outputs[name] = IOBindingHelper.to_pytorch(output) - - # converts output to namedtuple for pipelines post-processing - return SemanticSegmenterOutput(logits=outputs["logits"]) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - onnx_inputs = self._prepare_onnx_inputs(use_torch=use_torch, **kwargs) + model_inputs = {"pixel_values": pixel_values} - # run inference + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = onnx_outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return SemanticSegmenterOutput(logits=logits) - - def _prepare_onnx_inputs(self, use_torch: bool, **kwargs): - onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx - for input in self.inputs_names.keys(): - onnx_inputs[input] = kwargs.pop(input) - - if use_torch: - onnx_inputs[input] = onnx_inputs[input].cpu().detach().numpy() - - return onnx_inputs + # converts output to namedtuple for pipelines post-processing + return SemanticSegmenterOutput(logits=logits) AUDIO_CLASSIFICATION_EXAMPLE = r""" @@ -1878,18 +1814,28 @@ def __init__( ) def forward( self, - input_values: Optional[torch.Tensor] = None, - attenton_mask: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, + attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, + input_features: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): - if input_values is None: - # Whisper uses input_features and not input_values. 
- input_values = kwargs["input_features"] - use_torch = isinstance(input_values, torch.Tensor) + if self.input_name == "input_features": + assert input_features is not None, "input_features must be provided for this model" + model_input = input_features + elif self.input_name == "input_values": + assert input_values is not None, "input_values must be provided for this model" + model_input = input_values + else: + raise ValueError(f"Input {self.input_name} not supported for Audio Classification") + + use_torch = isinstance(model_input, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_values, ordered_input_names=self._ordered_input_names + model_input, + attention_mask, + ordered_input_names=self._ordered_input_names, ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1897,28 +1843,18 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - self.input_name: input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { - self.input_name: input_values, - } + model_inputs = {self.input_name: model_input, "attention_mask": attention_mask} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) + logits = model_outputs["logits"] - # converts output to namedtuple for pipelines post-processing - return SequenceClassifierOutput(logits=logits) + # converts output to namedtuple for pipelines post-processing + return SequenceClassifierOutput(logits=logits) CTC_EXAMPLE = r""" @@ -1966,11 +1902,12 @@ class ORTModelForCTC(ORTModel): ) def forward( self, - input_values: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: input_size = input_values.shape[1] output_sizes = [] @@ -1985,9 +1922,7 @@ def _conv_output_size(input_size, kernel_size, stride): known_output_shapes = {"logits": [input_values.shape[0], output_sizes[-1], self.config.vocab_size]} io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_values, - ordered_input_names=self._ordered_input_names, - known_output_shapes=known_output_shapes, + input_values, ordered_input_names=self._ordered_input_names, known_output_shapes=known_output_shapes ) # run inference with binding & synchronize in case of multiple CUDA streams @@ -1995,28 +1930,18 @@ def _conv_output_size(input_size, kernel_size, stride): self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - outputs = {} - - return CausalLMOutput(logits=output_buffers["logits"].view(output_shapes["logits"])) + logits = output_buffers["logits"].view(output_shapes["logits"]) else: - if use_torch: - # converts 
pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_values": input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { - "input_values": input_values, - } + model_inputs = {"input_values": input_values} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - # converts output to namedtuple for pipelines post-processing - return CausalLMOutput(logits=logits) + logits = model_outputs["logits"] + + # converts output to namedtuple for pipelines post-processing + return CausalLMOutput(logits=logits) AUDIO_XVECTOR_EXAMPLE = r""" @@ -2072,11 +1997,12 @@ class ORTModelForAudioXVector(ORTModel): ) def forward( self, - input_values: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_values, ordered_input_names=self._ordered_input_names @@ -2087,33 +2013,21 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - # converts output to namedtuple for pipelines post-processing - return XVectorOutput( - logits=output_buffers["logits"].view(output_shapes["logits"]), - embeddings=output_buffers["embeddings"].view(output_shapes["embeddings"]), - ) + logits = output_buffers["logits"].view(output_shapes["logits"]) + embeddings = output_buffers["embeddings"].view(output_shapes["embeddings"]) + else: - if use_torch: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_values": input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { - "input_values": input_values, - } + model_inputs = {"input_values": input_values} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - embeddings = outputs[self.output_names["embeddings"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - embeddings = torch.from_numpy(embeddings).to(self.device) + logits = model_outputs["logits"] + embeddings = model_outputs["embeddings"] - # converts output to namedtuple for pipelines post-processing - return XVectorOutput(logits=logits, embeddings=embeddings) + # converts output to namedtuple for pipelines post-processing + return XVectorOutput(logits=logits, embeddings=embeddings) AUDIO_FRAME_CLASSIFICATION_EXAMPLE = r""" @@ -2161,7 +2075,7 @@ class ORTModelForAudioFrameClassification(ORTModel): ) def forward( self, - input_values: Optional[torch.Tensor] = None, + input_values: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): use_torch = isinstance(input_values, torch.Tensor) @@ -2170,24 +2084,16 @@ def forward( if self.device.type == "cuda" and self.use_io_binding: raise NotImplementedError() else: - if use_torch: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = { - "input_values": input_values.cpu().detach().numpy(), - } - else: - onnx_inputs = { 
- "input_values": input_values, - } + model_inputs = {"input_values": input_values} - # run inference - outputs = self.model.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - logits = outputs[self.output_names["logits"]] - if use_torch: - logits = torch.from_numpy(logits).to(self.device) - # converts output to namedtuple for pipelines post-processing - return TokenClassifierOutput(logits=logits) + logits = model_outputs["logits"] + + # converts output to namedtuple for pipelines post-processing + return TokenClassifierOutput(logits=logits) CUSTOM_TASKS_EXAMPLE = r""" @@ -2236,57 +2142,27 @@ class ORTModelForCustomTasks(ORTModel): checkpoint="optimum/sbert-all-MiniLM-L6-with-pooler", ) ) - def forward(self, **kwargs): - use_torch = isinstance(next(iter(kwargs.values())), torch.Tensor) + def forward(self, **model_inputs: Union[torch.Tensor, np.ndarray]): + use_torch = isinstance(next(iter(model_inputs.values())), torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) if self.device.type == "cuda" and self.use_io_binding: - io_binding = IOBindingHelper.prepare_io_binding( - self, - **kwargs, - ordered_input_names=self._ordered_input_names, - ) + # TODO: should this be used in favor of `model.prepare_io_binding`? + io_binding = IOBindingHelper.prepare_io_binding(self, **model_inputs) # run inference with binding io_binding.synchronize_inputs() self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() - outputs = {} + model_outputs = {} for name, output in zip(self.output_names.keys(), io_binding._iobinding.get_outputs()): - outputs[name] = IOBindingHelper.to_pytorch(output) + model_outputs[name] = IOBindingHelper.to_pytorch(output) - # converts output to namedtuple for pipelines post-processing - return ModelOutput(**outputs) else: - # converts pytorch inputs into numpy inputs for onnx - onnx_inputs = self._prepare_onnx_inputs(use_torch=use_torch, **kwargs) - - # run inference + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - outputs = self._prepare_onnx_outputs(onnx_outputs, use_torch=use_torch) - - # converts output to namedtuple for pipelines post-processing - return ModelOutput(outputs) - - def _prepare_onnx_inputs(self, use_torch: bool, **kwargs): - onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx - for input in self.inputs_names.keys(): - onnx_inputs[input] = kwargs.pop(input) - - if use_torch: - onnx_inputs[input] = onnx_inputs[input].cpu().detach().numpy() - - return onnx_inputs - - def _prepare_onnx_outputs(self, onnx_outputs, use_torch: bool): - outputs = {} - # converts onnxruntime outputs into tensor for standard outputs - for output, idx in self.output_names.items(): - outputs[output] = onnx_outputs[idx] - - if use_torch: - outputs[output] = torch.from_numpy(outputs[output]).to(self.device) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) - return outputs + # converts output to namedtuple for pipelines post-processing + return ModelOutput(**model_outputs) diff --git a/optimum/onnxruntime/subpackage/__init__.py b/optimum/onnxruntime/subpackage/__init__.py new file mode 100644 index 0000000000..7029af7132 --- /dev/null +++ b/optimum/onnxruntime/subpackage/__init__.py @@ -0,0 +1 @@ +from .commands import ONNXRuntimeCommand diff --git a/optimum/commands/onnxruntime/__init__.py 
b/optimum/onnxruntime/subpackage/commands/__init__.py similarity index 87% rename from optimum/commands/onnxruntime/__init__.py rename to optimum/onnxruntime/subpackage/commands/__init__.py index 1b9c24c3b2..44facf5ea5 100644 --- a/optimum/commands/onnxruntime/__init__.py +++ b/optimum/onnxruntime/subpackage/commands/__init__.py @@ -14,5 +14,3 @@ # limitations under the License. from .base import ONNXRuntimeCommand -from .optimize import ONNXRuntimeOptimizeCommand -from .quantize import ONNXRuntimeQuantizeCommand diff --git a/optimum/commands/onnxruntime/base.py b/optimum/onnxruntime/subpackage/commands/base.py similarity index 91% rename from optimum/commands/onnxruntime/base.py rename to optimum/onnxruntime/subpackage/commands/base.py index 53e3245ea4..df4414c19d 100644 --- a/optimum/commands/onnxruntime/base.py +++ b/optimum/onnxruntime/subpackage/commands/base.py @@ -14,11 +14,13 @@ # limitations under the License. """optimum.onnxruntime command-line interface base classes.""" -from .. import BaseOptimumCLICommand, CommandInfo +from optimum.commands import BaseOptimumCLICommand, CommandInfo, optimum_cli_subcommand + from .optimize import ONNXRuntimeOptimizeCommand from .quantize import ONNXRuntimeQuantizeCommand +@optimum_cli_subcommand() class ONNXRuntimeCommand(BaseOptimumCLICommand): COMMAND = CommandInfo( name="onnxruntime", diff --git a/optimum/commands/onnxruntime/optimize.py b/optimum/onnxruntime/subpackage/commands/optimize.py similarity index 96% rename from optimum/commands/onnxruntime/optimize.py rename to optimum/onnxruntime/subpackage/commands/optimize.py index 5890e0a07c..1dd82f0ee2 100644 --- a/optimum/commands/onnxruntime/optimize.py +++ b/optimum/onnxruntime/subpackage/commands/optimize.py @@ -75,8 +75,8 @@ def parse_args(parser: "ArgumentParser"): return parse_args_onnxruntime_optimize(parser) def run(self): - from ...onnxruntime.configuration import AutoOptimizationConfig, ORTConfig - from ...onnxruntime.optimization import ORTOptimizer + from ...configuration import AutoOptimizationConfig, ORTConfig + from ...optimization import ORTOptimizer if self.args.output == self.args.onnx_model: raise ValueError("The output directory must be different than the directory hosting the ONNX model.") diff --git a/optimum/commands/onnxruntime/quantize.py b/optimum/onnxruntime/subpackage/commands/quantize.py similarity index 95% rename from optimum/commands/onnxruntime/quantize.py rename to optimum/onnxruntime/subpackage/commands/quantize.py index 2613cb33ba..6f6d843cc7 100644 --- a/optimum/commands/onnxruntime/quantize.py +++ b/optimum/onnxruntime/subpackage/commands/quantize.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from .. 
import BaseOptimumCLICommand +from optimum.commands import BaseOptimumCLICommand if TYPE_CHECKING: @@ -69,8 +69,8 @@ def parse_args(parser: "ArgumentParser"): return parse_args_onnxruntime_quantize(parser) def run(self): - from ...onnxruntime.configuration import AutoQuantizationConfig, ORTConfig - from ...onnxruntime.quantization import ORTQuantizer + from ...configuration import AutoQuantizationConfig, ORTConfig + from ...quantization import ORTQuantizer if self.args.output == self.args.onnx_model: raise ValueError("The output directory must be different than the directory hosting the ONNX model.") diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 0e1da447a6..37d0feefcc 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -128,6 +128,7 @@ class ORTConfigManager: "nystromformer": "bert", "pegasus": "bert", "roberta": "bert", + "segformer": "vit", "t5": "bert", "vit": "vit", "whisper": "bart", diff --git a/optimum/subpackages.py b/optimum/subpackages.py new file mode 100644 index 0000000000..8729581521 --- /dev/null +++ b/optimum/subpackages.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import logging +import sys + + +if sys.version_info >= (3, 8): + from importlib import metadata as importlib_metadata +else: + import importlib_metadata +from importlib.util import find_spec, module_from_spec + +from .utils import is_onnxruntime_available + + +logger = logging.getLogger(__name__) + + +def load_namespace_modules(namespace: str, module: str): + """Load modules with a specific name inside a namespace + + This method operates on namespace packages: + https://packaging.python.org/en/latest/guides/packaging-namespace-packages/ + + For each package inside the specified `namespace`, it looks for the specified `module` and loads it. + + Args: + namespace (`str`): + The namespace containing modules to be loaded. + module (`str`): + The name of the module to load in each namespace package. + """ + for dist in importlib_metadata.distributions(): + dist_name = dist.metadata["Name"] + if not dist_name.startswith(f"{namespace}-"): + continue + package_import_name = dist_name.replace("-", ".") + module_import_name = f"{package_import_name}.{module}" + if module_import_name in sys.modules: + # Module already loaded + continue + backend_spec = find_spec(module_import_name) + if backend_spec is None: + continue + try: + imported_module = module_from_spec(backend_spec) + sys.modules[module_import_name] = imported_module + backend_spec.loader.exec_module(imported_module) + logger.debug(f"Successfully loaded {module_import_name}") + except Exception as e: + logger.error(f"An exception occured while loading {module_import_name}: {e}.") + + +def load_subpackages(): + """Load optimum subpackages + + This method goes through packages inside the `optimum` namespace and loads the `subpackage` module if it exists. 
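+    For instance, a hypothetical `optimum-foo` distribution would be picked up through its `optimum.foo.subpackage` module.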
+ + This module is then in charge of registering the subpackage commands. + """ + SUBPACKAGE_LOADER = "subpackage" + load_namespace_modules("optimum", SUBPACKAGE_LOADER) + + # Load subpackages from internal modules not explicitly defined as namespace packages + loader_name = "." + SUBPACKAGE_LOADER + if is_onnxruntime_available(): + importlib.import_module(loader_name, package="optimum.onnxruntime") diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 682f70e3ca..81207b7649 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -102,6 +102,19 @@ class NormalizedVisionConfig(NormalizedConfig): INPUT_SIZE = "input_size" +class NormalizedSegformerConfig(NormalizedVisionConfig): + NUM_ATTENTION_HEADS = "num_attention_heads" + HIDDEN_SIZE = "hidden_sizes" + + # If the attribute is a list, return 0 + # 0 means let the optimizer infer the correct value based on the model graph + def __getattr__(self, attr_name): + attr_value = super().__getattr__(attr_name) + if isinstance(attr_value, list): + attr_value = 0 + return attr_value + + class NormalizedTextAndVisionConfig(NormalizedTextConfig, NormalizedVisionConfig): TEXT_CONFIG = None VISION_CONFIG = None @@ -203,7 +216,6 @@ class NormalizedConfigManager: 'owlvit', 'perceiver', 'roformer', - 'segformer', 'squeezebert', 'table-transformer', """ @@ -258,6 +270,7 @@ class NormalizedConfigManager: "regnet": NormalizedVisionConfig, "resnet": NormalizedVisionConfig, "roberta": NormalizedTextConfig, + "segformer": NormalizedSegformerConfig, "speech-to-text": SpeechToTextLikeNormalizedTextConfig, "splinter": NormalizedTextConfig, "t5": T5LikeNormalizedTextConfig, diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index f1c2f668e3..41bd140d86 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -16,6 +16,7 @@ import importlib.util import itertools import os +import shutil import subprocess import sys import unittest @@ -36,9 +37,6 @@ # Used to test the hub USER = "__DUMMY_OPTIMUM_USER__" -# Not critical, only usable on the sandboxed CI instance. -TOKEN = "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" - def flatten_dict(dictionary: Dict): """ @@ -184,3 +182,16 @@ def grid_parameters( else: returned_list = [test_name] + list(params) if add_test_name is True else list(params) yield returned_list + + +def remove_directory(dirpath): + """ + Remove a directory and its content. + This is a cross-platform solution to remove a directory and its content that avoids the use of `shutil.rmtree` on Windows. 
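+    On Windows the directory is removed with `rmdir /S /Q`; on other platforms `shutil.rmtree` is used.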
+ Reference: https://github.com/python/cpython/issues/107408 + """ + if os.path.exists(dirpath) and os.path.isdir(dirpath): + if os.name == "nt": + os.system(f"rmdir /S /Q {dirpath}") + else: + shutil.rmtree(dirpath) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 0c070f8c9e..5ed1619fde 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -394,7 +394,7 @@ class GPTQDataTest(unittest.TestCase): def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) - @parameterized.expand(["wikitext2", "c4", "ptb", "c4-new", "ptb-new"]) + @parameterized.expand(["wikitext2", "c4", "c4-new"]) def test_dataset(self, dataset): train_dataset = get_dataset( dataset, self.tokenizer, nsamples=self.NBSAMPLES, seqlen=self.SEQLEN, split="train" diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 3fe2c5e14d..6c88fddb40 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -14,7 +14,6 @@ # limitations under the License. import gc import os -import shutil import subprocess import tempfile import time @@ -109,7 +108,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.testing_utils import grid_parameters, require_hf_token, require_ort_rocm +from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm logger = logging.get_logger() @@ -184,9 +183,8 @@ def test_load_model_from_cache(self): def test_load_model_from_empty_cache(self): dirpath = os.path.join(default_cache_path, "models--" + self.TINY_ONNX_MODEL_ID.replace("/", "--")) + remove_directory(dirpath) - if os.path.exists(dirpath) and os.path.isdir(dirpath): - shutil.rmtree(dirpath) with self.assertRaises(Exception): _ = ORTModel.from_pretrained(self.TINY_ONNX_MODEL_ID, local_files_only=True) @@ -202,9 +200,8 @@ def test_load_seq2seq_model_from_cache(self): def test_load_seq2seq_model_from_empty_cache(self): dirpath = os.path.join(default_cache_path, "models--" + self.TINY_ONNX_SEQ2SEQ_MODEL_ID.replace("/", "--")) + remove_directory(dirpath) - if os.path.exists(dirpath) and os.path.isdir(dirpath): - shutil.rmtree(dirpath) with self.assertRaises(Exception): _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True) @@ -225,9 +222,8 @@ def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--") ) + remove_directory(dirpath) - if os.path.exists(dirpath) and os.path.isdir(dirpath): - shutil.rmtree(dirpath) with self.assertRaises(Exception): _ = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True @@ -938,11 +934,14 @@ def test_stable_diffusion_model_on_rocm_ep_str(self): self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) def test_load_model_from_hub_private(self): - subprocess.run("huggingface-cli logout", shell=True) - # Read token of fxmartyclone (dummy user). 
- token = "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" + token = os.environ.get("HF_HUB_READ_TOKEN", None) - model = ORTModelForCustomTasks.from_pretrained("fxmartyclone/tiny-onnx-private-2", use_auth_token=token) + if token is None: + self.skipTest("Test requires a token for fxmartyclone in the environment variable `HF_HUB_READ_TOKEN`.") + + model = ORTModelForCustomTasks.from_pretrained( + "optimum-internal-testing/tiny-random-phi-private", use_auth_token=token + ) self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) @@ -1005,6 +1004,7 @@ def test_save_load_ort_model_with_external_data(self): # verify loading from local folder works model = ORTModelForSequenceClassification.from_pretrained(tmpdirname, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + remove_directory(tmpdirname) @parameterized.expand([(False,), (True,)]) @pytest.mark.run_slow @@ -1012,11 +1012,7 @@ def test_save_load_ort_model_with_external_data(self): def test_save_load_decoder_model_with_external_data(self, use_cache: bool): with tempfile.TemporaryDirectory() as tmpdirname: model = ORTModelForCausalLM.from_pretrained( - "gpt2-large", - use_cache=use_cache, - export=True, - use_merged=False, - use_io_binding=False, + "gpt2-large", use_cache=use_cache, export=True, use_merged=False, use_io_binding=False ) model.save_pretrained(tmpdirname) @@ -1030,6 +1026,7 @@ def test_save_load_decoder_model_with_external_data(self, use_cache: bool): model = ORTModelForCausalLM.from_pretrained( tmpdirname, use_cache=use_cache, export=False, use_io_binding=False ) + remove_directory(tmpdirname) @parameterized.expand([(False,), (True,)]) def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): @@ -1052,6 +1049,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): # verify loading from local folder works model = ORTModelForSeq2SeqLM.from_pretrained(tmpdirname, use_cache=use_cache, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + remove_directory(tmpdirname) def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: @@ -1073,6 +1071,7 @@ def test_save_load_stable_diffusion_model_with_external_data(self): # verify loading from local folder works model = ORTStableDiffusionPipeline.from_pretrained(tmpdirname, export=False) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + remove_directory(tmpdirname) @parameterized.expand([(False,), (True,)]) @unittest.skip("Skipping as this test consumes too much memory") @@ -2275,6 +2274,8 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): @parameterized.expand([(False,), (True,)]) @pytest.mark.run_in_series + # TODO: still gotta find out why this needs to be ran in series / why it fails in parallel + # my guess is that the model surgery is happening in parallel and that's causing the issue def test_inference_old_onnx_model(self, use_cache): tokenizer = get_preprocessor("gpt2") model = AutoModelForCausalLM.from_pretrained("gpt2") @@ -2287,9 +2288,9 @@ def test_inference_old_onnx_model(self, use_cache): tokens = tokenizer(text, return_tensors="pt") onnx_outputs = onnx_model.generate( - **tokens, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=10 + **tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30 ) - outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=10) + outputs = model.generate(**tokens, num_beams=1, do_sample=False, 
min_new_tokens=30, max_new_tokens=30) onnx_text_outputs = tokenizer.decode(onnx_outputs[0], skip_special_tokens=True) text_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True) self.assertEqual(onnx_text_outputs, text_outputs) @@ -3602,13 +3603,20 @@ def _get_onnx_model_dir(self, model_id, model_arch, test_name): @pytest.mark.run_in_series def test_inference_old_onnx_model(self): - model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") + tokenizer = get_preprocessor("t5-small") + model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") + onnx_model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") - tokenizer = get_preprocessor("optimum/t5-small") text = "This is a sample output" tokens = tokenizer(text, return_tensors="pt") - model.generate(**tokens) + outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30) + onnx_outputs = onnx_model.generate( + **tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30 + ) + onnx_text_outputs = tokenizer.decode(onnx_outputs[0], skip_special_tokens=True) + text_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True) + self.assertEqual(onnx_text_outputs, text_outputs) def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -4757,6 +4765,9 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertTrue("logits" in onnx_outputs) self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + self.assertTrue( + torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3) + ) if use_cache: self.assertEqual( @@ -4765,19 +4776,17 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertEqual( len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) ) - for i, _ in enumerate(onnx_outputs["past_key_values"]): - for j, ort_pkv in enumerate(onnx_outputs["past_key_values"][i]): - trfs_pkv = transformers_outputs["past_key_values"][i][j] + for i in range(len(onnx_outputs["past_key_values"])): + print(onnx_outputs["past_key_values"][i]) + for ort_pkv, trfs_pkv in zip( + onnx_outputs["past_key_values"][i], transformers_outputs["past_key_values"][i] + ): + ort_pkv = torch.Tensor(ort_pkv) self.assertTrue( torch.allclose(ort_pkv, trfs_pkv, atol=1e-3), f" Maxdiff: {torch.abs(ort_pkv - trfs_pkv).max()}", ) - # Compare tensor outputs - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3) - ) - gc.collect() @parameterized.expand(grid_parameters(FULL_GRID)) diff --git a/tests/onnxruntime/test_optimization.py b/tests/onnxruntime/test_optimization.py index c9cadbaa82..82109fcd11 100644 --- a/tests/onnxruntime/test_optimization.py +++ b/tests/onnxruntime/test_optimization.py @@ -36,6 +36,7 @@ AutoOptimizationConfig, ORTConfig, ORTModelForImageClassification, + ORTModelForSemanticSegmentation, ORTModelForSequenceClassification, ORTOptimizer, ) @@ -171,6 +172,7 @@ def test_compare_original_seq2seq_model_with_optimized_model(self, model_cls, mo # Contribution note: Please add test models in alphabetical order. Find test models here: https://huggingface.co/hf-internal-testing. 
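Together with the `segformer` entries added to `ORTConfigManager` and `NormalizedConfigManager`, registering `ORTModelForSemanticSegmentation` here extends graph optimization to semantic segmentation; a rough sketch of the flow the new test exercises, using the tiny test checkpoint listed below (the save directory name is arbitrary):

```python
from optimum.onnxruntime import AutoOptimizationConfig, ORTModelForSemanticSegmentation, ORTOptimizer

model_id = "hf-internal-testing/tiny-random-segformer"

# Export the checkpoint to ONNX on the fly.
model = ORTModelForSemanticSegmentation.from_pretrained(model_id, export=True)

# Apply a standard optimization level and save the optimized graph.
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(save_dir="segformer_optimized", optimization_config=AutoOptimizationConfig.O2())

# The optimized model can be reloaded from the save directory.
optimized_model = ORTModelForSemanticSegmentation.from_pretrained("segformer_optimized")
```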
SUPPORTED_IMAGE_ARCHITECTURES_WITH_MODEL_ID = ( + (ORTModelForSemanticSegmentation, "hf-internal-testing/tiny-random-segformer"), (ORTModelForImageClassification, "hf-internal-testing/tiny-random-vit"), ) diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index af89aec2b9..1656704807 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -50,7 +50,7 @@ "dataset_data_keys": {"question": "question", "context": "answer"}, }, "image-classification": { - "dataset_args": "mnist", + "dataset_args": "sasha/dog-food", "dataset_data_keys": {"image": "image"}, }, } @@ -232,6 +232,11 @@ def test_load_dataset_with_max_length(self): input_ids = dataset[0]["input_ids"] self.assertEqual(len(input_ids), max_length) + def test_load_default_dataset(self): + self.skipTest( + "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)" + ) + class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase): TASK_NAME = "question-answering"
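For reference, the default-dataset test above is skipped because `conll2003` ships a loading script, and recent versions of `datasets` only execute such scripts after an explicit opt-in; a minimal sketch of what running it manually would look like:

```python
from datasets import load_dataset

# Opting in to running the dataset's loading script; only do this for sources you trust.
dataset = load_dataset("conll2003", split="train", trust_remote_code=True)
print(dataset[0]["tokens"])
```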