
Commit

Merge branch 'master' into fix-sd-doc
fxmarty committed Aug 25, 2023
2 parents 28d19d3 + d7d17eb commit d962bf2
Showing 4 changed files with 107 additions and 108 deletions.
2 changes: 2 additions & 0 deletions docs/source/exporters/onnx/usage_guides/export_a_model.mdx
@@ -320,6 +320,8 @@ main_export(
)
```

For tasks that require only a single ONNX file (e.g. encoder-only), an exported model with custom inputs/outputs can then be used with the class [`optimum.onnxruntime.ORTModelForCustomTasks`] for inference with ONNX Runtime on CPU or GPU.
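A minimal sketch of such an inference run (assuming the custom export produced a local directory, here named `custom_onnx_model/` for illustration, containing the ONNX file along with its tokenizer):

```python
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCustomTasks

# Load the exported single-file ONNX model and its tokenizer from a local directory
model = ORTModelForCustomTasks.from_pretrained("custom_onnx_model")
tokenizer = AutoTokenizer.from_pretrained("custom_onnx_model")

inputs = tokenizer("ONNX Runtime makes inference fast!", return_tensors="pt")
outputs = model(**inputs)  # output keys follow the custom output names declared at export time
```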

### Customize the export of Transformers models with custom modeling

Optimum supports the export of Transformers models with custom modeling code that rely on [`trust_remote_code=True`](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoModel.from_pretrained.trust_remote_code). Such models are not officially supported in the Transformers library, but can still be used with its features such as [pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines) and [generation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.generate).
136 changes: 33 additions & 103 deletions optimum/onnxruntime/modeling_ort.py
@@ -39,7 +39,7 @@
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
)
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.file_utils import add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
@@ -85,16 +85,11 @@
_FEATURE_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
_PROCESSOR_FOR_DOC = "AutoProcessor"

ONNX_MODEL_START_DOCSTRING = r"""
ONNX_MODEL_END_DOCSTRING = r"""
This model inherits from [`~onnxruntime.modeling_ort.ORTModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving)
Args:
config (`transformers.PretrainedConfig`): [PretrainedConfig](https://huggingface.co/docs/transformers/main_classes/configuration#transformers.PretrainedConfig) is the Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~onnxruntime.modeling_ort.ORTModel.from_pretrained`] method to load the model weights.
model (`onnxruntime.InferenceSession`): [onnxruntime.InferenceSession](https://onnxruntime.ai/docs/api/python/api_summary.html#inferencesession) is the main class used to run a model. Check out the [`~onnxruntime.modeling_ort.ORTModel.load_model`] method for more information.
use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`.
This class should be initialized using the [`onnxruntime.modeling_ort.ORTModel.from_pretrained`] method.
"""

ONNX_TEXT_INPUTS_DOCSTRING = r"""
@@ -863,15 +858,10 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool):
"""


@add_start_docstrings(
"""
Onnx Model with a BaseModelOutput for feature-extraction tasks.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForFeatureExtraction(ORTModel):
"""
Feature Extraction model for ONNX.
ONNX Model for feature-extraction task.
"""

auto_model_class = AutoModel
@@ -976,15 +966,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with a MaskedLMOutput for masked language modeling tasks.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForMaskedLM(ORTModel):
"""
Masked language model for ONNX.
ONNX Model with a MaskedLMOutput for masked language modeling tasks.
"""

auto_model_class = AutoModelForMaskedLM
@@ -1084,15 +1069,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with a QuestionAnsweringModelOutput for extractive question-answering tasks like SQuAD.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForQuestionAnswering(ORTModel):
"""
Question Answering model for ONNX.
ONNX Model with a QuestionAnsweringModelOutput for extractive question-answering tasks like SQuAD.
"""

auto_model_class = AutoModelForQuestionAnswering
@@ -1211,16 +1191,11 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForSequenceClassification(ORTModel):
"""
Sequence Classification model for ONNX.
ONNX Model with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
"""

auto_model_class = AutoModelForSequenceClassification
@@ -1317,16 +1292,11 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForTokenClassification(ORTModel):
"""
Token Classification model for ONNX.
ONNX Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
"""

auto_model_class = AutoModelForTokenClassification
@@ -1420,16 +1390,11 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForMultipleChoice(ORTModel):
"""
Multiple choice model for ONNX.
ONNX Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""

auto_model_class = AutoModelForMultipleChoice
@@ -1531,15 +1496,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model for image-classification tasks.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForImageClassification(ORTModel):
"""
Image Classification model for ONNX.
ONNX Model for image-classification tasks.
"""

auto_model_class = AutoModelForImageClassification
@@ -1630,15 +1590,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with an all-MLP decode head on top e.g. for ADE20k, CityScapes.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForSemanticSegmentation(ORTModel):
"""
Semantic Segmentation model for ONNX.
ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes.
"""

auto_model_class = AutoModelForSemanticSegmentation
@@ -1741,16 +1696,11 @@ def _prepare_onnx_inputs(self, use_torch: bool, **kwargs):
"""


@add_start_docstrings(
"""
Onnx Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForAudioClassification(ORTModel):
"""
Audio Classification model for ONNX.
ONNX Model for audio-classification, with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.
"""

auto_model_class = AutoModelForAudioClassification
@@ -1832,15 +1782,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with a language modeling head on top for Connectionist Temporal Classification (CTC).
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForCTC(ORTModel):
"""
CTC model for ONNX.
ONNX Model with a language modeling head on top for Connectionist Temporal Classification (CTC).
"""

auto_model_class = AutoModelForCTC
@@ -1920,15 +1865,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model with an XVector feature extraction head on top for tasks like Speaker Verification.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForAudioXVector(ORTModel):
"""
Audio XVector model for ONNX.
ONNX Model with an XVector feature extraction head on top for tasks like Speaker Verification.
"""

auto_model_class = AutoModelForAudioXVector
@@ -2014,15 +1954,10 @@ def forward(
"""


@add_start_docstrings(
"""
Onnx Model for with a frame classification head on top for tasks like Speaker Diarization.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForAudioFrameClassification(ORTModel):
"""
Audio Frame Classification model for ONNX.
ONNX Model with a frame classification head on top for tasks like Speaker Diarization.
"""

auto_model_class = AutoModelForAudioFrameClassification
@@ -2099,15 +2034,10 @@ def forward(
"""


@add_start_docstrings(
"""
ONNX Model for any custom tasks. It can be used to leverage the inference acceleration for any single-file ONNX model.
""",
ONNX_MODEL_START_DOCSTRING,
)
@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForCustomTasks(ORTModel):
"""
Model for any custom tasks if the ONNX model is stored in a single file.
ONNX Model for any custom tasks. It can be used to leverage the inference acceleration for any single-file ONNX model, that may use custom inputs and outputs.
"""

@add_start_docstrings_to_model_forward(
63 changes: 61 additions & 2 deletions tests/benchmark/README.md
@@ -4,6 +4,8 @@ Please refer to https://medium.com/pytorch/bettertransformer-out-of-the-box-perf

# GPTQ benchmark

## Generation benchmark results

Run

```shell
@@ -26,8 +28,6 @@ CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-cha
CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama
```

## Benchmark results

Here are results obtained on a single NVIDIA A100-SXM4-80GB GPU. We use a prompt length of 512, and generate exactly 512 new tokens. Each generation is repeated for 4 batches, and metrics are averaged over the number of batches and generation length.

Additional benchmarks could be done in the act-order case.
@@ -75,3 +75,62 @@ From the benchmark, it appears that the exllama kernel is best-in-class for GPTQ
|False|None |None|None |None |26.0 |69.94 |228.76 |53986.51 |
|True |False |4 |128 |exllama|36.2 |95.41 |167.68 |34777.04 |
|True |False |4 |128 |autogptq-cuda-old|36.2 |192.48 |83.12 |35497.62 |


## Prefill-only benchmark results

Run

```shell
# pytorch fp16
CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill

# exllama kernel
CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill

# cuda-old kernel
CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama
```

The benchmark below is for a prompt length of 512, measuring only the prefill step on a single NVIDIA A100-SXM4-80GB GPU. The forward pass is repeated 10 times. This benchmark typically corresponds to the forward pass during training (with the difference that `generate` is called here, which adds some overhead).
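A minimal sketch of what the prefill-only measurement boils down to (simplified, not the actual benchmark script; warmup, batching and the `--sweep` loop are omitted, and the model id is the one used in the commands above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "daryl149/llama-2-13b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# A 512-token prompt: with a single new token, the timing is dominated by the
# prefill forward pass over the prompt.
input_ids = torch.randint(low=0, high=tokenizer.vocab_size, size=(1, 512), device="cuda")

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
model.generate(input_ids, min_new_tokens=1, max_new_tokens=1)
end.record()
torch.cuda.synchronize()
print(f"Prefill latency: {start.elapsed_time(end):.2f} ms")
```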

### Batch size = 1

|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
|False|None |None|None |None |512 |1 |27.22 |96.38 |10.38 |27999.54 |
|True |False |4 |128 |exllama |512 |1 |38.35 |112.54 |8.89 |9330.89 |
|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |368.13 |2.72 |9474.19 |

### Batch size = 2

|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
|False|None |None|None |None |512 |1 |27.22 |169.95 |11.77 |28524.37 |
|True |False |4 |128 |exllama |512 |1 |38.35 |190.44 |10.50 |9855.71 |
|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |443.80 |4.51 |9928.23 |

### Batch size = 4

|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
|False|None |None|None |None |512 |1 |27.22 |305.99 |13.07 |29574.01 |
|True |False |4 |128 |exllama |512 |1 |38.35 |345.54 |11.58 |10905.35 |
|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |597.24 |6.70 |10838.42 |

### Batch size = 8

|gptq |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
|-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------|
|False|None |None|None |None |512 |1 |27.22 |600.47 |13.32 |31673.30 |
|True |False |4 |128 |exllama |512 |1 |38.35 |659.61 |12.13 |13004.64 |
|True |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |909.09 |8.80 |12862.18 |

### Batch size = 16

|gptq |act_order|bits|group_size|kernel |num_batches|batch_size|prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)|
|-----|---------|----|----------|-----------------|-----------|----------|-------------|----------|-------------|----------------------|------------------|---------------|
|True |False |4 |128 |exllama |10 |16 |512 |1 |38.35 |1280.25 |12.50 |17203.22 |
|False|None |None|None |None |10 |16 |512 |1 |27.22 |1209.07 |13.23 |35871.88 |
|True |False |4 |128 |autogptq-cuda-old|10 |16 |512 |1 |43.94 |1533.54 |10.43 |17060.76 |

14 changes: 11 additions & 3 deletions tests/benchmark/benchmark_gptq.py
@@ -58,6 +58,11 @@ def get_parser():
default=256,
help="",
)
parser.add_argument(
"--prefill",
action="store_true",
help="For decoder models, benchmark only the prefill step with `prompt_length`.",
)
parser.add_argument(
"--gptq",
action="store_true",
@@ -231,10 +236,13 @@ def benchmark_memory(
prompt_lengths = [512]
new_tokens = [512]
else:
batch_sizes = args.batch_size
prompt_lengths = args.prompt_length
new_tokens = args.new_tokens
batch_sizes = [args.batch_size]
prompt_lengths = [args.prompt_length]
new_tokens = [args.new_tokens]

if args.prefill:
print("Running the prefill benchmark: generating only one new token.")
new_tokens = [1]

if not torch.cuda.is_available():
raise ValueError("A cuda device is necessary to benchmark GPTQ.")
