From 096386299309ec9c4c737c9e5ce260008c48fad7 Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Fri, 25 Aug 2023 21:18:48 +0900
Subject: [PATCH 1/2] Add GPTQ prefill benchmark (#1313)

add prefill bench
---
 tests/benchmark/README.md         | 63 ++++++++++++++++++++++++++++++-
 tests/benchmark/benchmark_gptq.py | 14 +++++--
 2 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md
index 44aae5eb34..3315af5c47 100644
--- a/tests/benchmark/README.md
+++ b/tests/benchmark/README.md
@@ -4,6 +4,8 @@ Please refer to https://medium.com/pytorch/bettertransformer-out-of-the-box-perf

 # GPTQ benchmark

+## Generation benchmark results
+
 Run

 ```shell
@@ -26,8 +28,6 @@ CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-cha
 CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama
 ```

-## Benchmark results
-
 Here are results obtained on a single NVIDIA A100-SXM4-80GB GPU. We use a prompt length of 512, and generate exactly 512 new tokens. Each generation is repeated for 4 batches, and metrics are averaged over the number of batches and generation length.

 Additional benchmarks could be done in the act-order case.
@@ -75,3 +75,62 @@ From the benchmark, it appears that the Exllama kernel is the best-in-class for GPTQ
 |False|None |None|None |None |26.0 |69.94 |228.76 |53986.51 |
 |True |False |4 |128 |exllama|36.2 |95.41 |167.68 |34777.04 |
 |True |False |4 |128 |autogptq-cuda-old|36.2 |192.48 |83.12 |35497.62 |
+
+
+## Prefill-only benchmark results
+
+Run
+
+```shell
+# pytorch fp16
+CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill
+
+# exllama kernel
+CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill
+
+# cuda-old kernel
+CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama
+```
+
+The benchmark below is for a prompt length of 512, measuring only the prefill step on a single NVIDIA A100-SXM4-80GB GPU. The forward pass is repeated 10 times. This benchmark typically corresponds to the forward pass during training (with the difference that `generate` is called here, which adds some overhead).
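For illustration, here is a minimal sketch of what this prefill-only measurement amounts to. It is not the code of `benchmark_gptq.py`; the random prompt construction and the CUDA-event timing below are assumptions, but the idea mirrors the description above: call `generate` over a 512-token prompt and produce exactly one new token.

```python
# Hedged sketch of a prefill-only measurement (not the actual benchmark_gptq.py logic).
# Assumptions: fp16 checkpoint as in the commands above, a random 512-token prompt,
# and CUDA-event timing around a one-token `generate` call.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "daryl149/llama-2-13b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

batch_size, prompt_length = 1, 512
input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, prompt_length), device="cuda")
attention_mask = torch.ones_like(input_ids)

# Warmup run, then time the prompt forward (prefill) by generating exactly one token.
model.generate(input_ids, attention_mask=attention_mask, min_new_tokens=1, max_new_tokens=1)
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
model.generate(input_ids, attention_mask=attention_mask, min_new_tokens=1, max_new_tokens=1)
end.record()
torch.cuda.synchronize()
print(f"Prefill latency: {start.elapsed_time(end):.2f} ms")
```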
+
+### Batch size = 1
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |96.38 |10.38 |27999.54 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |112.54 |8.89 |9330.89 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |368.13 |2.72 |9474.19 |
+
+### Batch size = 2
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |169.95 |11.77 |28524.37 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |190.44 |10.50 |9855.71 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |443.80 |4.51 |9928.23 |
+
+### Batch size = 4
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |305.99 |13.07 |29574.01 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |345.54 |11.58 |10905.35 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |597.24 |6.70 |10838.42 |
+
+### Batch size = 8
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |600.47 |13.32 |31673.30 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |659.61 |12.13 |13004.64 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |909.09 |8.80 |12862.18 |
+
+### Batch size = 16
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |1209.07 |13.23 |35871.88 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |1280.25 |12.50 |17203.22 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |1533.54 |10.43 |17060.76 |
+
diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py
index 50adfbde7e..52b8ba162c 100644
--- a/tests/benchmark/benchmark_gptq.py
+++ b/tests/benchmark/benchmark_gptq.py
@@ -58,6 +58,11 @@ def get_parser():
         default=256,
         help="",
     )
+    parser.add_argument(
+        "--prefill",
+        action="store_true",
+        help="For decoder models, benchmark only the prefill step with `prompt_length`.",
+    )
     parser.add_argument(
         "--gptq",
         action="store_true",
@@ -231,10 +236,13 @@ def benchmark_memory(
     prompt_lengths = [512]
     new_tokens = [512]
 else:
-    batch_sizes = args.batch_size
-    prompt_lengths = args.prompt_length
-    new_tokens = args.new_tokens
+    batch_sizes = [args.batch_size]
+    prompt_lengths = [args.prompt_length]
+    new_tokens = [args.new_tokens]
+
+if args.prefill:
print("Running the prefill benchmark: generating only one new token.") + new_tokens = [1] if not torch.cuda.is_available(): raise ValueError("A cuda device is necessary to benchmark GPTQ.") From d7d17eb82c636e4459d5d689a4474855f9b2d87f Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 25 Aug 2023 23:46:40 +0900 Subject: [PATCH 2/2] Precise ORTModel documentation (#1268) * fix doc ortmodel * style * Update docs/source/exporters/onnx/usage_guides/export_a_model.mdx Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> * Update optimum/onnxruntime/modeling_ort.py Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --------- Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --- .../onnx/usage_guides/export_a_model.mdx | 2 + optimum/onnxruntime/modeling_ort.py | 136 +++++------------- 2 files changed, 35 insertions(+), 103 deletions(-) diff --git a/docs/source/exporters/onnx/usage_guides/export_a_model.mdx b/docs/source/exporters/onnx/usage_guides/export_a_model.mdx index c6d37b2404..1ff74cb11c 100644 --- a/docs/source/exporters/onnx/usage_guides/export_a_model.mdx +++ b/docs/source/exporters/onnx/usage_guides/export_a_model.mdx @@ -320,6 +320,8 @@ main_export( ) ``` +For tasks that require only a single ONNX file (e.g. encoder-only), an exported model with custom inputs/outputs can then be used with the class [`optimum.onnxruntime.ORTModelForCustomTasks`] for inference with ONNX Runtime on CPU or GPU. + ### Customize the export of Transformers models with custom modeling Optimum supports the export of Transformers models with custom modeling that use [`trust_remote_code=True`](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoModel.from_pretrained.trust_remote_code), not officially supported in the Transormers library but usable with its functionality as [pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines) and [generation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.generate). diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 1784766c6a..75e44bd243 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -39,7 +39,7 @@ AutoModelForSequenceClassification, AutoModelForTokenClassification, ) -from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward +from transformers.file_utils import add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -85,16 +85,11 @@ _FEATURE_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" _PROCESSOR_FOR_DOC = "AutoProcessor" -ONNX_MODEL_START_DOCSTRING = r""" +ONNX_MODEL_END_DOCSTRING = r""" This model inherits from [`~onnxruntime.modeling_ort.ORTModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving) - Args: - config (`transformers.PretrainedConfig`): [PretrainedConfig](https://huggingface.co/docs/transformers/main_classes/configuration#transformers.PretrainedConfig) is the Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~onnxruntime.modeling_ort.ORTModel.from_pretrained`] method to load the model weights. 
- model (`onnxruntime.InferenceSession`): [onnxruntime.InferenceSession](https://onnxruntime.ai/docs/api/python/api_summary.html#inferencesession) is the main class used to run a model. Check out the [`~onnxruntime.modeling_ort.ORTModel.load_model`] method for more information. - use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`. + This class should be initialized using the [`onnxruntime.modeling_ort.ORTModel.from_pretrained`] method. """ ONNX_TEXT_INPUTS_DOCSTRING = r""" @@ -863,15 +858,10 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool): """ -@add_start_docstrings( - """ - Onnx Model with a BaseModelOutput for feature-extraction tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForFeatureExtraction(ORTModel): """ - Feature Extraction model for ONNX. + ONNX Model for feature-extraction task. """ auto_model_class = AutoModel @@ -976,15 +966,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a MaskedLMOutput for masked language modeling tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForMaskedLM(ORTModel): """ - Masked language model for ONNX. + ONNX Model with a MaskedLMOutput for masked language modeling tasks. """ auto_model_class = AutoModelForMaskedLM @@ -1084,15 +1069,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a QuestionAnsweringModelOutput for extractive question-answering tasks like SQuAD. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForQuestionAnswering(ORTModel): """ - Question Answering model for ONNX. + ONNX Model with a QuestionAnsweringModelOutput for extractive question-answering tasks like SQuAD. """ auto_model_class = AutoModelForQuestionAnswering @@ -1211,16 +1191,11 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSequenceClassification(ORTModel): """ - Sequence Classification model for ONNX. + ONNX Model with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. """ auto_model_class = AutoModelForSequenceClassification @@ -1317,16 +1292,11 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForTokenClassification(ORTModel): """ - Token Classification model for ONNX. + ONNX Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. """ auto_model_class = AutoModelForTokenClassification @@ -1420,16 +1390,11 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. 
- """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForMultipleChoice(ORTModel): """ - Multiple choice model for ONNX. + ONNX Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. """ auto_model_class = AutoModelForMultipleChoice @@ -1531,15 +1496,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model for image-classification tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForImageClassification(ORTModel): """ - Image Classification model for ONNX. + ONNX Model for image-classification tasks. """ auto_model_class = AutoModelForImageClassification @@ -1630,15 +1590,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with an all-MLP decode head on top e.g. for ADE20k, CityScapes. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSemanticSegmentation(ORTModel): """ - Semantic Segmentation model for ONNX. + ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. """ auto_model_class = AutoModelForSemanticSegmentation @@ -1741,16 +1696,11 @@ def _prepare_onnx_inputs(self, use_torch: bool, **kwargs): """ -@add_start_docstrings( - """ - Onnx Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like - SUPERB Keyword Spotting. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForAudioClassification(ORTModel): """ - Audio Classification model for ONNX. + ONNX Model for audio-classification, with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. """ auto_model_class = AutoModelForAudioClassification @@ -1832,15 +1782,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a language modeling head on top for Connectionist Temporal Classification (CTC). - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForCTC(ORTModel): """ - CTC model for ONNX. + ONNX Model with a language modeling head on top for Connectionist Temporal Classification (CTC). """ auto_model_class = AutoModelForCTC @@ -1920,15 +1865,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with an XVector feature extraction head on top for tasks like Speaker Verification. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForAudioXVector(ORTModel): """ - Audio XVector model for ONNX. + ONNX Model with an XVector feature extraction head on top for tasks like Speaker Verification. """ auto_model_class = AutoModelForAudioXVector @@ -2014,15 +1954,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model for with a frame classification head on top for tasks like Speaker Diarization. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForAudioFrameClassification(ORTModel): """ - Audio Frame Classification model for ONNX. + ONNX Model with a frame classification head on top for tasks like Speaker Diarization. """ auto_model_class = AutoModelForAudioFrameClassification @@ -2099,15 +2034,10 @@ def forward( """ -@add_start_docstrings( - """ - ONNX Model for any custom tasks. It can be used to leverage the inference acceleration for any single-file ONNX model. 
-    """,
-    ONNX_MODEL_START_DOCSTRING,
-)
+@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTModelForCustomTasks(ORTModel):
     """
-    Model for any custom tasks if the ONNX model is stored in a single file.
+    ONNX Model for any custom tasks. It can be used to leverage inference acceleration for any single-file ONNX model that may use custom inputs and outputs.
     """

     @add_start_docstrings_to_model_forward(
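As a usage note for the `ORTModelForCustomTasks` docstring above, the following is a hedged sketch of how a single-file ONNX model with custom outputs can be run. The checkpoint name is only an illustrative example of a model exported with a pooler output, not something referenced in this patch, and the exact output names depend on the exported ONNX graph.

```python
# Hedged usage sketch for ORTModelForCustomTasks (illustrative, not part of the patch above).
# Assumption: the repository below holds a single ONNX file exported with custom outputs.
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCustomTasks

model_id = "optimum/sbert-all-MiniLM-L6-with-pooler"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCustomTasks.from_pretrained(model_id)

inputs = tokenizer("I love burritos!", return_tensors="pt")
outputs = model(**inputs)
# Output names follow the ONNX graph's custom outputs, e.g. last_hidden_state / pooler_output.
print({name: value.shape for name, value in outputs.items()})
```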