Skip to content

Commit

Permalink
Fix quant_primitive dtype that caused perf regression (pytorch#253)
Browse files Browse the repository at this point in the history
Summary:
The choose_qparams_affine call in quantize_activation_per_token_absmax did not exactly
preserve the original code's behavior: it always forced scale_dtype=torch.float. This PR fixes that.

Test Plan:
need to check perf with torchbenchmarks

Reviewers:

Subscribers:

Tasks:

Tags:
  • Loading branch information
jerryzh168 authored May 18, 2024
1 parent 5741aa2 commit e9e5fae
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
16 changes: 16 additions & 0 deletions test/quantization/test_quant_primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,22 @@ def test_quantize_activation_per_token_abs_max_zero_input(self):
quantized_ref, scale_ref = quantize_activation_per_token_absmax(input)


@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
def test_quantize_activation_per_token_abs_max_dtype(self):
    """Check the dtype of the scale returned by quantize_activation_per_token_absmax.

    For each input dtype, the returned scale must have the expected dtype
    listed in the table below (bfloat16 and float32 inputs keep their dtype;
    float16 inputs are expected to yield a float32 scale).
    """
    from torchao.quantization.quant_primitives import quantize_activation_per_token_absmax

    # (input dtype, expected dtype of the returned scale)
    expected_scale_dtypes = (
        (torch.bfloat16, torch.bfloat16),
        (torch.float32, torch.float32),
        (torch.float16, torch.float32),
    )
    for input_dtype, expected_dtype in expected_scale_dtypes:
        # subTest reports every failing dtype instead of stopping at the first.
        with self.subTest(input_dtype=input_dtype):
            t = torch.zeros(10, 10, dtype=input_dtype)
            _, scale_ref = quantize_activation_per_token_absmax(t)
            # assertEqual, not assertTrue: assertTrue(a, b) treats b as the
            # failure message and only checks that a is truthy, so it would
            # pass for any dtype.
            self.assertEqual(scale_ref.dtype, expected_dtype)


@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
def test_quantize_dequantize_group_sym(self):
input = torch.randn(10, 10)
Expand Down
4 changes: 3 additions & 1 deletion torchao/quantization/quant_primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,9 @@ def quantize_activation_per_token_absmax(t):
# if we don't clamp. TODO(future) look into this further.
quant_min = -127
quant_max = 127
scale, zero_point = choose_qparams_affine(t, mapping_type, block_size, dtype, quant_min, quant_max, eps, scale_dtype=torch.float)
scale_dtype = torch.float32 if t.dtype == torch.float16 else None

scale, zero_point = choose_qparams_affine(t, mapping_type, block_size, dtype, quant_min, quant_max, eps, scale_dtype=scale_dtype)

quantized = quantize_affine(t, block_size, scale, zero_point, dtype, quant_min, quant_max)

Expand Down

0 comments on commit e9e5fae

Please sign in to comment.