diff --git a/examples/onnxruntime/optimization/multiple-choice/run_swag.py b/examples/onnxruntime/optimization/multiple-choice/run_swag.py
index 3c43846b9a..bcddc97590 100644
--- a/examples/onnxruntime/optimization/multiple-choice/run_swag.py
+++ b/examples/onnxruntime/optimization/multiple-choice/run_swag.py
@@ -37,7 +37,7 @@
 from optimum.onnxruntime import ORTModelForMultipleChoice, ORTOptimizer
 from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0
@@ -236,7 +236,6 @@ def main():
     )
     os.makedirs(training_args.output_dir, exist_ok=True)
-    optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")
     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)
@@ -254,13 +253,18 @@ def main():
     optimizer = ORTOptimizer.from_pretrained(model)
     # Optimize the model
-    optimizer.optimize(
+    optimized_model_path = optimizer.optimize(
         optimization_config=optimization_config,
         save_dir=training_args.output_dir,
         use_external_data_format=onnx_export_args.use_external_data_format,
         one_external_file=onnx_export_args.one_external_file,
     )
+    model = ORTModelForMultipleChoice.from_pretrained(
+        optimized_model_path,
+        provider=optim_args.execution_provider,
+    )
+
     if training_args.do_eval:
         # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the
         # prediction step(s)
@@ -339,13 +343,12 @@ def compute_metrics(eval_predictions):
         # Evaluation
         logger.info("*** Evaluate ***")
-        ort_model = ORTModel(
-            optimized_model_path,
-            execution_provider=optim_args.execution_provider,
-            compute_metrics=compute_metrics,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             label_names=["label"],
+            compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         # Save evaluation metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
diff --git a/examples/onnxruntime/optimization/question-answering/run_qa.py b/examples/onnxruntime/optimization/question-answering/run_qa.py
index 04a9bd34f3..407714cb01 100644
--- a/examples/onnxruntime/optimization/question-answering/run_qa.py
+++ b/examples/onnxruntime/optimization/question-answering/run_qa.py
@@ -37,7 +37,7 @@
 from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTOptimizer
 from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -305,7 +305,6 @@ def main():
     )
     os.makedirs(training_args.output_dir, exist_ok=True)
-    optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")
     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)
@@ -323,13 +322,15 @@ def main():
     optimizer = ORTOptimizer.from_pretrained(model)
     # Optimize the model
-    optimizer.optimize(
+    optimized_model_path = optimizer.optimize(
         optimization_config=optimization_config,
         save_dir=training_args.output_dir,
         use_external_data_format=onnx_export_args.use_external_data_format,
         one_external_file=onnx_export_args.one_external_file,
     )
+    model = ORTModelForQuestionAnswering.from_pretrained(optimized_model_path, provider=optim_args.execution_provider)
+
     # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the
     # prediction step(s)
     if training_args.do_eval or training_args.do_predict:
@@ -478,13 +479,12 @@ def compute_metrics(p: EvalPrediction):
             # During Feature creation dataset samples might increase, we will select required samples again
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        ort_model = ORTModel(
-            optimized_model_path,
-            execution_provider=optim_args.execution_provider,
-            compute_metrics=compute_metrics,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             label_names=["start_positions", "end_positions"],
+            compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions)
         metrics = compute_metrics(predictions)
@@ -514,12 +514,12 @@ def compute_metrics(p: EvalPrediction):
             # During Feature creation dataset samples might increase, we will select required samples again
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        ort_model = ORTModel(
-            optimized_model_path,
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=predict_dataset,
             label_names=["start_positions", "end_positions"],
+            compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(predict_dataset)
         predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions)
         metrics = compute_metrics(predictions)
diff --git a/examples/onnxruntime/optimization/text-classification/README.md b/examples/onnxruntime/optimization/text-classification/README.md
index 42a99cc73d..3a7dce2b59 100644
--- a/examples/onnxruntime/optimization/text-classification/README.md
+++ b/examples/onnxruntime/optimization/text-classification/README.md
@@ -14,13 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
-# Text classification 
+# Text classification
 ## GLUE tasks
-The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py)
-allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as
-the ones from the [GLUE benchmark](https://gluebenchmark.com/).
+The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/).
 The following example applies graph optimization on a DistilBERT fine-tuned on the sst-2 task. Here the optimization level is selected to be 1, enabling basic optimizations such as redundant node eliminations and constant folding. Higher optimization level will result in hardware dependent optimized graph.
diff --git a/examples/onnxruntime/optimization/text-classification/run_glue.py b/examples/onnxruntime/optimization/text-classification/run_glue.py
index a07193915b..222dda1507 100644
--- a/examples/onnxruntime/optimization/text-classification/run_glue.py
+++ b/examples/onnxruntime/optimization/text-classification/run_glue.py
@@ -42,7 +42,7 @@
 from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
 from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -250,7 +250,6 @@ def main():
     )
     os.makedirs(training_args.output_dir, exist_ok=True)
-    optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
@@ -268,13 +267,17 @@ def main():
     optimizer = ORTOptimizer.from_pretrained(model)
     # Optimize the model
-    optimizer.optimize(
+    optimized_model_path = optimizer.optimize(
         optimization_config=optimization_config,
         save_dir=training_args.output_dir,
         use_external_data_format=onnx_export_args.use_external_data_format,
         one_external_file=onnx_export_args.one_external_file,
     )
+    model = ORTModelForSequenceClassification.from_pretrained(
+        optimized_model_path, provider=optim_args.execution_provider
+    )
+
     # Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the
     # prediction step(s)
     if training_args.do_eval or training_args.do_predict:
@@ -408,13 +411,13 @@ def compute_metrics(p: EvalPrediction):
             desc="Running tokenizer on the evaluation dataset",
         )
-        ort_model = ORTModel(
-            optimized_model_path,
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             compute_metrics=compute_metrics,
             label_names=["label"],
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
+
         # Save metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
             json.dump(outputs.metrics, f, indent=4, sort_keys=True)
@@ -436,10 +439,12 @@ def compute_metrics(p: EvalPrediction):
             desc="Running tokenizer on the test dataset",
         )
-        ort_model = ORTModel(
-            optimized_model_path, execution_provider=optim_args.execution_provider, label_names=["label"]
+        outputs = evaluation_loop(
+            model=model,
+            dataset=predict_dataset,
+            compute_metrics=compute_metrics,
+            label_names=["label"],
         )
-        outputs = ort_model.evaluation_loop(predict_dataset)
         predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1)
         # Save predictions
diff --git a/examples/onnxruntime/optimization/token-classification/run_ner.py b/examples/onnxruntime/optimization/token-classification/run_ner.py
index 73db3671d2..2e7b63792c 100644
--- a/examples/onnxruntime/optimization/token-classification/run_ner.py
+++ b/examples/onnxruntime/optimization/token-classification/run_ner.py
@@ -38,7 +38,7 @@
 from optimum.onnxruntime import ORTModelForTokenClassification, ORTOptimizer
 from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -276,7 +276,6 @@ def main():
     )
     os.makedirs(training_args.output_dir, exist_ok=True)
-    optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")
     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)
@@ -480,12 +479,11 @@ def compute_metrics(p):
             desc="Running tokenizer on the validation dataset",
         )
-        ort_model = ORTModel(
-            optimized_model_path,
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         # Save evaluation metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
@@ -509,12 +507,11 @@ def compute_metrics(p):
             desc="Running tokenizer on the prediction dataset",
         )
-        ort_model = ORTModel(
-            optimized_model_path,
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=predict_dataset,
             compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(predict_dataset)
         predictions = np.argmax(outputs.predictions, axis=2)
         # Remove ignored index (special tokens)
diff --git a/examples/onnxruntime/quantization/image-classification/run_image_classification.py b/examples/onnxruntime/quantization/image-classification/run_image_classification.py
index 3d0fa72882..6feaaef4f3 100644
--- a/examples/onnxruntime/quantization/image-classification/run_image_classification.py
+++ b/examples/onnxruntime/quantization/image-classification/run_image_classification.py
@@ -22,7 +22,6 @@
 import sys
 from dataclasses import dataclass, field
 from functools import partial
-from pathlib import Path
 from typing import Optional
 import datasets
@@ -38,7 +37,6 @@
 from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig
-from optimum.onnxruntime.model import ORTModel
 from optimum.onnxruntime.modeling_ort import ORTModelForImageClassification
 from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
 from optimum.onnxruntime.preprocessors.passes import (
@@ -47,6 +45,7 @@
     ExcludeNodeAfter,
     ExcludeNodeFollowedBy,
 )
+from optimum.onnxruntime.utils import evaluation_loop
 logger = logging.getLogger(__name__)
@@ -378,13 +377,16 @@ def compute_metrics(p: EvalPrediction):
     quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))
     # Apply quantization on the model
-    quantizer.quantize(
+    quantized_model_path = quantizer.quantize(
         save_dir=training_args.output_dir,
         calibration_tensors_range=ranges,
         quantization_config=qconfig,
         preprocessor=quantization_preprocessor,
         use_external_data_format=onnx_export_args.use_external_data_format,
     )
+    model = ORTModelForImageClassification.from_pretrained(
+        quantized_model_path, provider=optim_args.execution_provider
+    )
     # Evaluation
     if training_args.do_eval:
@@ -409,13 +411,12 @@ def compute_metrics(p: EvalPrediction):
         # Set the validation transforms
         eval_dataset = eval_dataset.with_transform(preprocess_function)
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
-            compute_metrics=compute_metrics,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             label_names=[labels_column],
+            compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         # Save metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
             json.dump(outputs.metrics, f, indent=4, sort_keys=True)
diff --git a/examples/onnxruntime/quantization/multiple-choice/run_swag.py b/examples/onnxruntime/quantization/multiple-choice/run_swag.py
index 9d9642c12d..9a8423f836 100644
--- a/examples/onnxruntime/quantization/multiple-choice/run_swag.py
+++ b/examples/onnxruntime/quantization/multiple-choice/run_swag.py
@@ -38,7 +38,6 @@
 from optimum.onnxruntime import ORTModelForMultipleChoice, ORTQuantizer
 from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig
-from optimum.onnxruntime.model import ORTModel
 from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
 from optimum.onnxruntime.preprocessors.passes import (
     ExcludeGeLUNodes,
@@ -46,6 +45,7 @@
     ExcludeNodeAfter,
     ExcludeNodeFollowedBy,
 )
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0
@@ -409,13 +409,14 @@ def compute_metrics(eval_predictions):
     quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))
     # Apply quantization on the model
-    quantizer.quantize(
+    quantized_model_path = quantizer.quantize(
         save_dir=training_args.output_dir,
         calibration_tensors_range=ranges,
         quantization_config=qconfig,
         preprocessor=quantization_preprocessor,
         use_external_data_format=onnx_export_args.use_external_data_format,
     )
+    model = ORTModelForMultipleChoice.from_pretrained(quantized_model_path, provider=optim_args.execution_provider)
     # Evaluation
     if training_args.do_eval:
@@ -436,13 +437,12 @@ def compute_metrics(eval_predictions):
             load_from_cache_file=not data_args.overwrite_cache,
         )
-        ort_model = ORTModel(
-            os.path.join(training_args.output_dir, "model_quantized.onnx"),
-            execution_provider=optim_args.execution_provider,
-            compute_metrics=compute_metrics,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             label_names=["label"],
+            compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         # Save evaluation metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
diff --git a/examples/onnxruntime/quantization/question-answering/README.md b/examples/onnxruntime/quantization/question-answering/README.md
index 380afff8ca..8345ca8e4d 100644
--- a/examples/onnxruntime/quantization/question-answering/README.md
+++ b/examples/onnxruntime/quantization/question-answering/README.md
@@ -16,13 +16,9 @@ limitations under the License.
 # Question answering
+The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks.
-The script [`run_qa.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/question-answering/run_qa.py)
-allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph
-optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for question answering tasks.
-
-Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along
-the flag `--version_2_with_negative`.
+Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`.
 The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the SQuAD1.0 dataset.
diff --git a/examples/onnxruntime/quantization/question-answering/run_qa.py b/examples/onnxruntime/quantization/question-answering/run_qa.py
index 4a6a854fd9..50661b7b42 100644
--- a/examples/onnxruntime/quantization/question-answering/run_qa.py
+++ b/examples/onnxruntime/quantization/question-answering/run_qa.py
@@ -24,7 +24,6 @@
 import sys
 from dataclasses import dataclass, field
 from functools import partial
-from pathlib import Path
 from typing import Optional
 import datasets
@@ -39,7 +38,6 @@
 from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig
-from optimum.onnxruntime.model import ORTModel
 from optimum.onnxruntime.modeling_ort import ORTModelForQuestionAnswering
 from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
 from optimum.onnxruntime.preprocessors.passes import (
@@ -48,6 +46,7 @@
     ExcludeNodeAfter,
     ExcludeNodeFollowedBy,
 )
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -651,25 +650,25 @@ def compute_metrics(p: EvalPrediction):
     quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))
     # Apply quantization on the model
-    quantizer.quantize(
+    quantized_model_path = quantizer.quantize(
         save_dir=training_args.output_dir,
         calibration_tensors_range=ranges,
         quantization_config=qconfig,
         preprocessor=quantization_preprocessor,
         use_external_data_format=onnx_export_args.use_external_data_format,
     )
+    model = ORTModelForQuestionAnswering.from_pretrained(quantized_model_path, provider=optim_args.execution_provider)
     # Evaluation
     if training_args.do_eval:
         logger.info("*** Evaluate ***")
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             compute_metrics=compute_metrics,
             label_names=["start_positions", "end_positions"],
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions)
         metrics = compute_metrics(predictions)
@@ -681,12 +680,11 @@ def compute_metrics(p: EvalPrediction):
     if training_args.do_predict:
         logger.info("*** Predict ***")
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=predict_dataset,
             label_names=["start_positions", "end_positions"],
         )
-        outputs = ort_model.evaluation_loop(predict_dataset)
         predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions)
         metrics = compute_metrics(predictions)
diff --git a/examples/onnxruntime/quantization/text-classification/README.md b/examples/onnxruntime/quantization/text-classification/README.md
index 460bb56fba..95fd333517 100644
--- a/examples/onnxruntime/quantization/text-classification/README.md
+++ b/examples/onnxruntime/quantization/text-classification/README.md
@@ -18,10 +18,7 @@ limitations under the License.
 ## GLUE tasks
-The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py)
-allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph
-optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as
-the ones from the [GLUE benchmark](https://gluebenchmark.com/).
+The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/text-classification/run_glue.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/).
 The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the sst-2 task.
diff --git a/examples/onnxruntime/quantization/text-classification/run_glue.py b/examples/onnxruntime/quantization/text-classification/run_glue.py
index bc141b2194..4b9ee0403c 100644
--- a/examples/onnxruntime/quantization/text-classification/run_glue.py
+++ b/examples/onnxruntime/quantization/text-classification/run_glue.py
@@ -23,7 +23,6 @@
 import sys
 from dataclasses import dataclass, field
 from functools import partial
-from pathlib import Path
 from typing import Optional
 import datasets
@@ -44,7 +43,6 @@
 from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig
-from optimum.onnxruntime.model import ORTModel
 from optimum.onnxruntime.modeling_ort import ORTModelForSequenceClassification
 from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
 from optimum.onnxruntime.preprocessors.passes import (
@@ -53,6 +51,7 @@
     ExcludeNodeAfter,
     ExcludeNodeFollowedBy,
 )
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -476,13 +475,16 @@ def compute_metrics(p: EvalPrediction):
     quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))
     # Apply quantization on the model
-    quantizer.quantize(
+    quantized_model_path = quantizer.quantize(
         save_dir=training_args.output_dir,
         calibration_tensors_range=ranges,
         quantization_config=qconfig,
         preprocessor=quantization_preprocessor,
         use_external_data_format=onnx_export_args.use_external_data_format,
     )
+    model = ORTModelForSequenceClassification.from_pretrained(
+        quantized_model_path, provider=optim_args.execution_provider
+    )
     # Evaluation
     if training_args.do_eval:
@@ -504,13 +506,13 @@ def compute_metrics(p: EvalPrediction):
                 f" Evaluation results may suffer from a wrong matching."
             )
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             compute_metrics=compute_metrics,
             label_names=["label"],
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
+
         # Save metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
             json.dump(outputs.metrics, f, indent=4, sort_keys=True)
@@ -525,12 +527,11 @@ def compute_metrics(p: EvalPrediction):
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=predict_dataset,
             label_names=["label"],
         )
-        outputs = ort_model.evaluation_loop(predict_dataset)
         predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1)
         # Save predictions
diff --git a/examples/onnxruntime/quantization/token-classification/README.md b/examples/onnxruntime/quantization/token-classification/README.md
index f56388ed3c..540b3cbe2d 100644
--- a/examples/onnxruntime/quantization/token-classification/README.md
+++ b/examples/onnxruntime/quantization/token-classification/README.md
@@ -16,10 +16,7 @@ limitations under the License.
 # Token classification
-
-The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py)
-allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph
-optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks.
+The script [`run_ner.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/quantization/token-classification/run_ner.py) allows us to apply different quantization approaches (such as dynamic and static quantization) as well as graph optimizations using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for token classification tasks.
 The following example applies post-training dynamic quantization on a DistilBERT fine-tuned on the CoNLL-2003 task
diff --git a/examples/onnxruntime/quantization/token-classification/run_ner.py b/examples/onnxruntime/quantization/token-classification/run_ner.py
index 1cc12d3fbc..3a5798c57a 100644
--- a/examples/onnxruntime/quantization/token-classification/run_ner.py
+++ b/examples/onnxruntime/quantization/token-classification/run_ner.py
@@ -25,7 +25,6 @@
 import sys
 from dataclasses import dataclass, field
 from functools import partial
-from pathlib import Path
 from typing import Optional
 import datasets
@@ -40,7 +39,6 @@
 from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig
-from optimum.onnxruntime.model import ORTModel
 from optimum.onnxruntime.modeling_ort import ORTModelForTokenClassification
 from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
 from optimum.onnxruntime.preprocessors.passes import (
@@ -49,6 +47,7 @@
     ExcludeNodeAfter,
     ExcludeNodeFollowedBy,
 )
+from optimum.onnxruntime.utils import evaluation_loop
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -551,13 +550,16 @@ def compute_metrics(p):
     quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))
     # Apply quantization on the model
-    quantizer.quantize(
+    quantized_model_path = quantizer.quantize(
         save_dir=training_args.output_dir,
         calibration_tensors_range=ranges,
         quantization_config=qconfig,
         preprocessor=quantization_preprocessor,
         use_external_data_format=onnx_export_args.use_external_data_format,
     )
+    model = ORTModelForTokenClassification.from_pretrained(
+        quantized_model_path, provider=optim_args.execution_provider
+    )
     # Evaluation
     if training_args.do_eval:
@@ -572,12 +574,11 @@ def compute_metrics(p):
             desc="Running tokenizer on the validation dataset",
         )
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=eval_dataset,
             compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(eval_dataset)
         # Save evaluation metrics
         with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
@@ -602,12 +603,11 @@ def compute_metrics(p):
             desc="Running tokenizer on the prediction dataset",
         )
-        ort_model = ORTModel(
-            Path(training_args.output_dir) / "model_quantized.onnx",
-            execution_provider=optim_args.execution_provider,
+        outputs = evaluation_loop(
+            model=model,
+            dataset=predict_dataset,
             compute_metrics=compute_metrics,
         )
-        outputs = ort_model.evaluation_loop(predict_dataset)
         predictions = np.argmax(outputs.predictions, axis=2)
         # Remove ignored index (special tokens)
diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py
index 23ca6e5e6a..caa662f382 100644
--- a/optimum/onnxruntime/model.py
+++ b/optimum/onnxruntime/model.py
@@ -49,6 +49,11 @@ def __init__(
             label_names (`List[str]`, `optional`):
                 The list of keys in your dictionary of inputs that correspond to the labels.
         """
+
+        logger.warning(
+            "The class `optimum.onnxruntime.model.ORTModel` is deprecated and will be removed in the next release."
+        )
+
         self.compute_metrics = compute_metrics
         self.label_names = ["labels"] if label_names is None else label_names
         self.session = InferenceSession(str(model_path), providers=[execution_provider])
diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py
index 37d0feefcc..ad40af92b9 100644
--- a/optimum/onnxruntime/utils.py
+++ b/optimum/onnxruntime/utils.py
@@ -17,11 +17,15 @@
 import re
 from enum import Enum
 from inspect import signature
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from packaging import version
+from tqdm import tqdm
+from transformers import EvalPrediction
+from transformers.trainer_pt_utils import nested_concat
+from transformers.trainer_utils import EvalLoopOutput
 from transformers.utils import logging
 import onnxruntime as ort
@@ -30,6 +34,12 @@
 from ..utils.import_utils import _is_package_available
+if TYPE_CHECKING:
+    from datasets import Dataset
+
+    from .modeling_ort import ORTModel
+
+
 logger = logging.get_logger(__name__)
 ONNX_WEIGHTS_NAME = "model.onnx"
@@ -341,3 +351,53 @@ class ORTQuantizableOperator(Enum):
     Resize = "Resize"
     AveragePool = "AveragePool"
     Concat = "Concat"
+
+
+def evaluation_loop(
+    model: "ORTModel",
+    dataset: "Dataset",
+    label_names: Optional[List[str]] = None,
+    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
+):
+    """
+    Run evaluation and return metrics and predictions.
+
+    Args:
+        model (`ORTModel`):
+            The ONNXRuntime model to use for the evaluation step.
+        dataset (`datasets.Dataset`):
+            Dataset to use for the evaluation step.
+        label_names (`List[str]`, `optional`):
+            The list of keys in your dictionary of inputs that correspond to the labels.
+        compute_metrics (`Callable[[EvalPrediction], Dict]`, `optional`):
+            The function that will be used to compute metrics at evaluation. Must take an `EvalPrediction` and
+            return a dictionary mapping metric names to values.
+    """
+    # Default to the conventional "labels" key so that iterating over `label_names` below never receives None.
+    label_names = ["labels"] if label_names is None else label_names
+
+    all_preds = None
+    all_labels = None
+
+    for inputs in tqdm(dataset, desc="Evaluation"):
+        has_labels = all(inputs.get(k) is not None for k in label_names)
+        if has_labels:
+            labels = tuple(np.array([inputs.get(name)]) for name in label_names)
+            if len(labels) == 1:
+                labels = labels[0]
+        else:
+            labels = None
+
+        inputs = {key: np.array([inputs[key]]) for key in model.input_names if key in inputs}
+        preds = model(**inputs)
+
+        if len(preds) == 1:
+            preds = preds[0]
+
+        all_preds = preds if all_preds is None else nested_concat(all_preds, preds, padding_index=-100)
+        all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
+
+    if compute_metrics is not None and all_preds is not None and all_labels is not None:
+        metrics = compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
+    else:
+        metrics = {}
+
+    return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset))
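
For reference, the snippet below is a minimal, self-contained sketch of how the new `evaluation_loop` helper introduced above can be called outside of the example scripts. The checkpoint name, the GLUE/SST-2 validation slice, the `export=True` flag, and the hand-rolled accuracy metric are illustrative assumptions and are not part of this diff; only `evaluation_loop`, `ORTModelForSequenceClassification`, and the returned `EvalLoopOutput` come from the code being changed.

```python
# Illustrative sketch only: model id, dataset slice and metric are assumptions, not part of this diff.
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime.utils import evaluation_loop

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # assumed example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Export the PyTorch checkpoint to ONNX and wrap it in an ORTModel subclass.
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)

# Tokenize a small evaluation split; evaluation_loop feeds samples to the model one at a time.
dataset = load_dataset("glue", "sst2", split="validation[:32]")
dataset = dataset.map(lambda ex: tokenizer(ex["sentence"], truncation=True), batched=True)

def compute_metrics(p):
    # p is a transformers.EvalPrediction holding the stacked predictions and label ids.
    preds = np.argmax(p.predictions, axis=-1)
    return {"accuracy": float((preds == np.asarray(p.label_ids)).mean())}

outputs = evaluation_loop(
    model=model,
    dataset=dataset,
    label_names=["label"],  # column holding the references in this dataset
    compute_metrics=compute_metrics,
)
print(outputs.metrics)
```

As in the deprecated `ORTModel.evaluation_loop`, samples are run one at a time and predictions are accumulated with `nested_concat`, so the helper trades throughput for simplicity; the updated example scripts keep exactly that behaviour while sourcing the model from the path returned by `optimize()` or `quantize()`.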