diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index d5cea9c54d..289e325682 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -354,7 +354,8 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         self.use_cuda_fp16 = model.dtype == torch.float16
 
         if self.model_seqlen is None:
-            self.model_seqlen = get_seqlen(model)
+            # We allow a max value of 4028 to avoid passing data with huge length to the model during the calibration step
+            self.model_seqlen = min(4028, get_seqlen(model))
 
         device = get_device(model)
 
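
A minimal sketch of the capping behavior this diff introduces, for illustration only: `resolve_model_seqlen` and `configured_seqlen` are hypothetical stand-ins, not part of optimum. `configured_seqlen` plays the role of whatever `get_seqlen(model)` reads from the model config (e.g. max_position_embeddings).

    # Illustrative sketch, not the optimum implementation.
    MAX_CALIBRATION_SEQLEN = 4028  # cap applied when no seqlen was given explicitly

    def resolve_model_seqlen(model_seqlen, configured_seqlen):
        """Return the sequence length used for calibration data.

        An explicitly provided model_seqlen is kept as-is; otherwise the
        model's configured length is capped so that very long contexts
        (e.g. 32k) don't inflate calibration memory and time.
        """
        if model_seqlen is None:
            return min(MAX_CALIBRATION_SEQLEN, configured_seqlen)
        return model_seqlen

    assert resolve_model_seqlen(None, 2048) == 2048    # short models unaffected
    assert resolve_model_seqlen(None, 32768) == 4028   # long contexts capped
    assert resolve_model_seqlen(512, 32768) == 512     # explicit value wins

Note that the cap only applies on the `self.model_seqlen is None` path, so a user-supplied sequence length still takes precedence over the 4028 limit.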