Fix quant_primitive dtype that caused perf regression (pytorch#253)

Summary: The API call in quantize_activation_per_token_absmax does not exactly preserve the original code; this PR fixes that. Test Plan: need to check perf with torchbenchmarks. Reviewers: Subscribers: Tasks: Tags:
jerryzh168 · May 18, 2024 · e9e5fae · e9e5fae
1 parent 5741aa2
commit e9e5fae
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 1 deletion.
diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py
@@ -156,6 +156,22 @@ def test_quantize_activation_per_token_abs_max_zero_input(self):
         quantized_ref, scale_ref = quantize_activation_per_token_absmax(input)
 
 
+    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
+    def test_quantize_activation_per_token_abs_max_dtype(self):
+        """Check the scale dtype returned for bfloat16, float32 and float16 inputs."""
+        from torchao.quantization.quant_primitives import quantize_activation_per_token_absmax
+        # assertEqual is required: assertTrue(x, y) uses y as the message and passes for any dtype.
+        for input_dtype, expected_scale_dtype in (
+            (torch.bfloat16, torch.bfloat16),
+            (torch.float32, torch.float32),
+            (torch.float16, torch.float32),  # float16 input is expected to yield a float32 scale
+        ):
+            with self.subTest(input_dtype=input_dtype):
+                t = torch.zeros(10, 10, dtype=input_dtype)
+                _, scale_ref = quantize_activation_per_token_absmax(t)
+                self.assertEqual(scale_ref.dtype, expected_scale_dtype)
+
+
     @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
     def test_quantize_dequantize_group_sym(self):
         input = torch.randn(10, 10)

diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
@@ -416,7 +416,9 @@ def quantize_activation_per_token_absmax(t):
     # if we don't clamp.  TODO(future) look into this further.
     quant_min = -127
     quant_max = 127
-    scale, zero_point = choose_qparams_affine(t, mapping_type, block_size, dtype, quant_min, quant_max, eps, scale_dtype=torch.float)
+    scale_dtype = torch.float32 if t.dtype == torch.float16 else None
+
+    scale, zero_point = choose_qparams_affine(t, mapping_type, block_size, dtype, quant_min, quant_max, eps, scale_dtype=scale_dtype)
 
     quantized = quantize_affine(t, block_size, scale, zero_point, dtype, quant_min, quant_max)