From 096386299309ec9c4c737c9e5ce260008c48fad7 Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Fri, 25 Aug 2023 21:18:48 +0900
Subject: [PATCH 1/2] Add GPTQ prefill benchmark (#1313)

add prefill bench
---
 tests/benchmark/README.md         | 63 ++++++++++++++++++++++++++++++-
 tests/benchmark/benchmark_gptq.py | 14 +++++--
 2 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md
index 44aae5eb34..3315af5c47 100644
--- a/tests/benchmark/README.md
+++ b/tests/benchmark/README.md
@@ -4,6 +4,8 @@ Please refer to https://medium.com/pytorch/bettertransformer-out-of-the-box-perf

 # GPTQ benchmark

+## Generation benchmark results
+
 Run

 ```shell
@@ -26,8 +28,6 @@ CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-cha
 CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama
 ```

-## Benchmark results
-
 Here are results obtained on a single NVIDIA A100-SXM4-80GB GPU. We use a prompt length of 512, and generate exactly 512 new tokens. Each generation is repeated for 4 batches, and metrics are averaged over the number of batches and generation length.

 Additional benchmarks could be done in the act-order case.
@@ -75,3 +75,62 @@ From the benchmark, it appears that the Exllama kernel is the best-in-class for GPTQ
 |False|None |None|None |None |26.0 |69.94 |228.76 |53986.51 |
 |True |False |4 |128 |exllama|36.2 |95.41 |167.68 |34777.04 |
 |True |False |4 |128 |autogptq-cuda-old|36.2 |192.48 |83.12 |35497.62 |
+
+
+## Prefill-only benchmark results
+
+Run
+
+```shell
+# pytorch fp16
+CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill
+
+# exllama kernel
+CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill
+
+# cuda-old kernel
+CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama
+```
+
+The benchmark below is for a prompt length of 512, measuring only the prefill step on a single NVIDIA A100-SXM4-80GB GPU. The forward pass is repeated 10 times. This benchmark typically corresponds to the forward pass during training (with the difference that `generate` is called here, which adds some overhead).
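For illustration, here is a minimal sketch of what this prefill-only measurement amounts to. It is not the code of `benchmark_gptq.py`; the random prompt construction and the CUDA-event timing below are assumptions, but the idea mirrors the description above: call `generate` over a 512-token prompt and produce exactly one new token.

```python
# Hedged sketch of a prefill-only measurement (not the actual benchmark_gptq.py logic).
# Assumptions: fp16 checkpoint as in the commands above, a random 512-token prompt,
# and CUDA-event timing around a one-token `generate` call.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "daryl149/llama-2-13b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

batch_size, prompt_length = 1, 512
input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, prompt_length), device="cuda")
attention_mask = torch.ones_like(input_ids)

# Warmup run, then time the prompt forward (prefill) by generating exactly one token.
model.generate(input_ids, attention_mask=attention_mask, min_new_tokens=1, max_new_tokens=1)
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
model.generate(input_ids, attention_mask=attention_mask, min_new_tokens=1, max_new_tokens=1)
end.record()
torch.cuda.synchronize()
print(f"Prefill latency: {start.elapsed_time(end):.2f} ms")
```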
+
+### Batch size = 1
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |96.38 |10.38 |27999.54 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |112.54 |8.89 |9330.89 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |368.13 |2.72 |9474.19 |
+
+### Batch size = 2
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |169.95 |11.77 |28524.37 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |190.44 |10.50 |9855.71 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |443.80 |4.51 |9928.23 |
+
+### Batch size = 4
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |305.99 |13.07 |29574.01 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |345.54 |11.58 |10905.35 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |597.24 |6.70 |10838.42 |
+
+### Batch size = 8
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |600.47 |13.32 |31673.30 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |659.61 |12.13 |13004.64 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |909.09 |8.80 |12862.18 |
+
+### Batch size = 16
+
+|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
+|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
+|False|None |None|None |None |512 |1 |27.22 |1209.07 |13.23 |35871.88 |
+|True |False |4 |128 |exllama |512 |1 |38.35 |1280.25 |12.50 |17203.22 |
+|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |1533.54 |10.43 |17060.76 |
+
diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py
index 50adfbde7e..52b8ba162c 100644
--- a/tests/benchmark/benchmark_gptq.py
+++ b/tests/benchmark/benchmark_gptq.py
@@ -58,6 +58,11 @@ def get_parser():
         default=256,
         help="",
     )
+    parser.add_argument(
+        "--prefill",
+        action="store_true",
+        help="For decoder models, benchmark only the prefill step with `prompt_length`.",
+    )
     parser.add_argument(
         "--gptq",
         action="store_true",
@@ -231,10 +236,13 @@ def benchmark_memory(
     prompt_lengths = [512]
     new_tokens = [512]
 else:
-    batch_sizes = args.batch_size
-    prompt_lengths = args.prompt_length
-    new_tokens = args.new_tokens
+    batch_sizes = [args.batch_size]
+    prompt_lengths = [args.prompt_length]
+    new_tokens = [args.new_tokens]
+
+if args.prefill:
print("Running the prefill benchmark: generating only one new token.") + new_tokens = [1] if not torch.cuda.is_available(): raise ValueError("A cuda device is necessary to benchmark GPTQ.") From d7d17eb82c636e4459d5d689a4474855f9b2d87f Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 25 Aug 2023 23:46:40 +0900 Subject: [PATCH 2/2] Precise ORTModel documentation (#1268) * fix doc ortmodel * style * Update docs/source/exporters/onnx/usage_guides/export_a_model.mdx Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> * Update optimum/onnxruntime/modeling_ort.py Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --------- Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --- .../onnx/usage_guides/export_a_model.mdx | 2 + optimum/onnxruntime/modeling_ort.py | 136 +++++------------- 2 files changed, 35 insertions(+), 103 deletions(-) diff --git a/docs/source/exporters/onnx/usage_guides/export_a_model.mdx b/docs/source/exporters/onnx/usage_guides/export_a_model.mdx index c6d37b2404..1ff74cb11c 100644 --- a/docs/source/exporters/onnx/usage_guides/export_a_model.mdx +++ b/docs/source/exporters/onnx/usage_guides/export_a_model.mdx @@ -320,6 +320,8 @@ main_export( ) ``` +For tasks that require only a single ONNX file (e.g. encoder-only), an exported model with custom inputs/outputs can then be used with the class [`optimum.onnxruntime.ORTModelForCustomTasks`] for inference with ONNX Runtime on CPU or GPU. + ### Customize the export of Transformers models with custom modeling Optimum supports the export of Transformers models with custom modeling that use [`trust_remote_code=True`](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoModel.from_pretrained.trust_remote_code), not officially supported in the Transormers library but usable with its functionality as [pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines) and [generation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.generate). diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 1784766c6a..75e44bd243 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -39,7 +39,7 @@ AutoModelForSequenceClassification, AutoModelForTokenClassification, ) -from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward +from transformers.file_utils import add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from transformers.modeling_outputs import ( BaseModelOutput, CausalLMOutput, @@ -85,16 +85,11 @@ _FEATURE_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor" _PROCESSOR_FOR_DOC = "AutoProcessor" -ONNX_MODEL_START_DOCSTRING = r""" +ONNX_MODEL_END_DOCSTRING = r""" This model inherits from [`~onnxruntime.modeling_ort.ORTModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving) - Args: - config (`transformers.PretrainedConfig`): [PretrainedConfig](https://huggingface.co/docs/transformers/main_classes/configuration#transformers.PretrainedConfig) is the Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~onnxruntime.modeling_ort.ORTModel.from_pretrained`] method to load the model weights. 
- model (`onnxruntime.InferenceSession`): [onnxruntime.InferenceSession](https://onnxruntime.ai/docs/api/python/api_summary.html#inferencesession) is the main class used to run a model. Check out the [`~onnxruntime.modeling_ort.ORTModel.load_model`] method for more information. - use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`. + This class should be initialized using the [`onnxruntime.modeling_ort.ORTModel.from_pretrained`] method. """ ONNX_TEXT_INPUTS_DOCSTRING = r""" @@ -863,15 +858,10 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool): """ -@add_start_docstrings( - """ - Onnx Model with a BaseModelOutput for feature-extraction tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForFeatureExtraction(ORTModel): """ - Feature Extraction model for ONNX. + ONNX Model for feature-extraction task. """ auto_model_class = AutoModel @@ -976,15 +966,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a MaskedLMOutput for masked language modeling tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForMaskedLM(ORTModel): """ - Masked language model for ONNX. + ONNX Model with a MaskedLMOutput for masked language modeling tasks. """ auto_model_class = AutoModelForMaskedLM @@ -1084,15 +1069,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a QuestionAnsweringModelOutput for extractive question-answering tasks like SQuAD. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForQuestionAnswering(ORTModel): """ - Question Answering model for ONNX. + ONNX Model with a QuestionAnsweringModelOutput for extractive question-answering tasks like SQuAD. """ auto_model_class = AutoModelForQuestionAnswering @@ -1211,16 +1191,11 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSequenceClassification(ORTModel): """ - Sequence Classification model for ONNX. + ONNX Model with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. """ auto_model_class = AutoModelForSequenceClassification @@ -1317,16 +1292,11 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForTokenClassification(ORTModel): """ - Token Classification model for ONNX. + ONNX Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. """ auto_model_class = AutoModelForTokenClassification @@ -1420,16 +1390,11 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. 
- """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForMultipleChoice(ORTModel): """ - Multiple choice model for ONNX. + ONNX Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. """ auto_model_class = AutoModelForMultipleChoice @@ -1531,15 +1496,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model for image-classification tasks. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForImageClassification(ORTModel): """ - Image Classification model for ONNX. + ONNX Model for image-classification tasks. """ auto_model_class = AutoModelForImageClassification @@ -1630,15 +1590,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with an all-MLP decode head on top e.g. for ADE20k, CityScapes. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSemanticSegmentation(ORTModel): """ - Semantic Segmentation model for ONNX. + ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. """ auto_model_class = AutoModelForSemanticSegmentation @@ -1741,16 +1696,11 @@ def _prepare_onnx_inputs(self, use_torch: bool, **kwargs): """ -@add_start_docstrings( - """ - Onnx Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like - SUPERB Keyword Spotting. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForAudioClassification(ORTModel): """ - Audio Classification model for ONNX. + ONNX Model for audio-classification, with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. """ auto_model_class = AutoModelForAudioClassification @@ -1832,15 +1782,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with a language modeling head on top for Connectionist Temporal Classification (CTC). - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForCTC(ORTModel): """ - CTC model for ONNX. + ONNX Model with a language modeling head on top for Connectionist Temporal Classification (CTC). """ auto_model_class = AutoModelForCTC @@ -1920,15 +1865,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model with an XVector feature extraction head on top for tasks like Speaker Verification. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForAudioXVector(ORTModel): """ - Audio XVector model for ONNX. + ONNX Model with an XVector feature extraction head on top for tasks like Speaker Verification. """ auto_model_class = AutoModelForAudioXVector @@ -2014,15 +1954,10 @@ def forward( """ -@add_start_docstrings( - """ - Onnx Model for with a frame classification head on top for tasks like Speaker Diarization. - """, - ONNX_MODEL_START_DOCSTRING, -) +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForAudioFrameClassification(ORTModel): """ - Audio Frame Classification model for ONNX. + ONNX Model with a frame classification head on top for tasks like Speaker Diarization. """ auto_model_class = AutoModelForAudioFrameClassification @@ -2099,15 +2034,10 @@ def forward( """ -@add_start_docstrings( - """ - ONNX Model for any custom tasks. It can be used to leverage the inference acceleration for any single-file ONNX model. 
-    """,
-    ONNX_MODEL_START_DOCSTRING,
-)
+@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTModelForCustomTasks(ORTModel):
     """
-    Model for any custom tasks if the ONNX model is stored in a single file.
+    ONNX Model for any custom tasks. It can be used to leverage inference acceleration for any single-file ONNX model that may use custom inputs and outputs.
     """

     @add_start_docstrings_to_model_forward(
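As a usage note for the `ORTModelForCustomTasks` docstring above, the following is a hedged sketch of how a single-file ONNX model with custom outputs can be run. The checkpoint name is only an illustrative example of a model exported with a pooler output, not something referenced in this patch, and the exact output names depend on the exported ONNX graph.

```python
# Hedged usage sketch for ORTModelForCustomTasks (illustrative, not part of the patch above).
# Assumption: the repository below holds a single ONNX file exported with custom outputs.
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCustomTasks

model_id = "optimum/sbert-all-MiniLM-L6-with-pooler"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCustomTasks.from_pretrained(model_id)

inputs = tokenizer("I love burritos!", return_tensors="pt")
outputs = model(**inputs)
# Output names follow the ONNX graph's custom outputs, e.g. last_hidden_state / pooler_output.
print({name: value.shape for name, value in outputs.items()})
```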