From d99a418db26997c33c91be32855d1d3704fa7823 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Tue, 22 Aug 2023 12:39:19 -0400
Subject: [PATCH] Fix gptq params (#1284)

* fix bits

* space

* fix damp
---
 .../llm_quantization/usage_guides/quantization.mdx |  3 ++-
 optimum/gptq/quantizer.py                          | 14 +++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx
index 87f21bce01..2ec8d1f668 100644
--- a/docs/source/llm_quantization/usage_guides/quantization.mdx
+++ b/docs/source/llm_quantization/usage_guides/quantization.mdx
@@ -2,13 +2,14 @@
 
 ## AutoGPTQ Integration
 
-🤗 Optimum collaborated with [AutoGPTQ library](https://github.com/PanQiWei/AutoGPTQ) to provide a simple API that apply GPTQ quantization on language models. With GPTQ quantization, you can quantize your favorite language model to 8, 6, 4 or even 2 bits. This comes without a big drop of performance and with faster inference speed. This is supported by most GPU hardwares.
+🤗 Optimum collaborated with [AutoGPTQ library](https://github.com/PanQiWei/AutoGPTQ) to provide a simple API that apply GPTQ quantization on language models. With GPTQ quantization, you can quantize your favorite language model to 8, 4, 3 or even 2 bits. This comes without a big drop of performance and with faster inference speed. This is supported by most GPU hardwares.
 
 If you want to quantize 🤗 Transformers models with GPTQ, follow this [documentation](https://huggingface.co/docs/transformers/main_classes/quantization).
 
 To learn more about the quantization technique used in GPTQ, please refer to:
 - the [GPTQ](https://arxiv.org/pdf/2210.17323.pdf) paper
 - the [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library used as the backend
+
 Note that the AutoGPTQ library provides more advanced usage (triton backend, fused attention, fused MLP) that are not integrated with Optimum. For now, we leverage only the CUDA kernel for GPTQ.
 
 ### Requirements
diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 1352946b60..c38f8be997 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -58,8 +58,8 @@ def __init__(
         bits: int,
         dataset: Optional[Union[List[str], str]] = None,
         group_size: int = 128,
-        damp_percent: float = 0.01,
-        desc_act: bool = True,
+        damp_percent: float = 0.1,
+        desc_act: bool = False,
         sym: bool = True,
         true_sequential: bool = True,
         use_cuda_fp16: bool = False,
@@ -81,9 +81,9 @@ def __init__(
                 in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
             group_size (int, defaults to 128):
                 The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
-            damp_percent (`float`, defaults to `0.01`):
-                The percent of the average Hessian diagonal to use for dampening, recommended value is 0.01.
-            desc_act (`bool`, defaults to `True`):
+            damp_percent (`float`, defaults to `0.1`):
+                The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
+            desc_act (`bool`, defaults to `False`):
                 Whether to quantize columns in order of decreasing activation size.
                 Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
                 Also known as act-order.
@@ -124,8 +124,8 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.disable_exllama = disable_exllama
 
-        if self.bits not in [2, 4, 6, 8]:
-            raise ValueError("only support quantize to [2,4,6,8] bits.")
+        if self.bits not in [2, 3, 4, 8]:
+            raise ValueError("only support quantize to [2,3,4,8] bits.")
         if self.group_size != -1 and self.group_size <= 0:
             raise ValueError("group_size must be greater than 0 or equal to -1")
         if not (0 < self.damp_percent < 1):
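For context, a minimal usage sketch (not part of the patch) showing how the parameters touched above are passed to `GPTQQuantizer`; the model name and calibration dataset are illustrative choices, not something this patch prescribes:

```python
# Illustrative sketch, assuming the optimum.gptq API described in quantization.mdx.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

model_name = "facebook/opt-125m"  # example model, not prescribed by the patch
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# bits must be one of [2, 3, 4, 8] after this patch; damp_percent and desc_act
# are shown at their new defaults (0.1 and False).
quantizer = GPTQQuantizer(
    bits=4,
    dataset="c4",        # one of the calibration datasets listed in the docstring
    group_size=128,
    damp_percent=0.1,
    desc_act=False,
)
quantized_model = quantizer.quantize_model(model, tokenizer)
```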