[Dev] Fix a bug within FP8 E4M3 Fast Decoding #54

Merged
merged 29 commits into main from lei/splitk
Jun 6, 2024

Commits
75d2f3d
improve e4m3 decoding.
May 21, 2024
dd744d0
Merge branch 'main' of https://github.com/microsoft/BitBLAS into main
May 23, 2024
00bfa31
append fp16xint1
May 25, 2024
8cd8b10
Update submodule commit reference
Jun 1, 2024
9122ff7
chore: Update shared memory scope for float32 output dtype
Jun 1, 2024
b508acc
BUGFIX: UINT8/INT8 Decoding
Jun 2, 2024
58d55b7
feat: Add rasterization options for roller module
Jun 5, 2024
e7547ce
Refactor tensorcore_legalization method to optimize tensor core usage
Jun 5, 2024
678a2e1
feat: Add function to collect variables from expression, improve for …
Jun 5, 2024
3088b35
chore: Update typing import in __init__.py
Jun 5, 2024
5d206b3
chore: Refactor CPU execution of operators
Jun 5, 2024
e06ce10
Refactor matmul implementation for splitk layout
Jun 5, 2024
d67cc6d
Refactor matmul implementation for splitk layout
Jun 5, 2024
9e36b6d
Refactor matmul implementation for splitk layout
Jun 5, 2024
e1a0149
chore: Update version to 0.0.1.dev8
Jun 5, 2024
df0ed7a
chore: Enable debug output in bitblas.set_debug_level()
Jun 5, 2024
a0f651a
Refactor Linear module matmul implementation for splitk layout
Jun 5, 2024
88295a7
Refactor matmul implementation for splitk layout
Jun 5, 2024
3366dce
Merge branch 'main' of https://github.com/microsoft/BitBLAS into lei/…
Jun 5, 2024
25b5c63
Refactor CUDA kernel launch string for dynamic symbolic set
Jun 5, 2024
26a9f1b
Bump version to v0.0.1.dev9
Jun 5, 2024
251bf08
Merge branch 'main' of https://github.com/microsoft/BitBLAS into lei/…
Jun 5, 2024
e0cf62c
Refactor CUDA kernel launch string for dynamic symbolic set
Jun 6, 2024
2e4e8dd
Bump version to v0.0.1.dev10
Jun 6, 2024
0dec7d8
Merge branch 'main' into lei/splitk
LeiWang1999 Jun 6, 2024
81f5b9a
Refactor CUDA kernel launch string for dynamic symbolic set
Jun 6, 2024
ec64f91
Merge branch 'lei/splitk' of https://github.com/LeiWang1999/MSBitBLAS…
Jun 6, 2024
5e71163
Bump version to v0.0.1.dev12 and add MatmulConfigWithSplitK and Matmu…
Jun 6, 2024
d0e0726
Merge branch 'main' into lei/splitk
LeiWang1999 Jun 6, 2024
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-0.0.1.dev10
+0.0.1.dev12
3 changes: 2 additions & 1 deletion python/bitblas/__init__.py
@@ -33,6 +33,7 @@
 from . import testing # noqa: F401
 from .utils import auto_detect_nvidia_target # noqa: F401
 from .ops.general_matmul import MatmulConfig, Matmul # noqa: F401
+from .ops.general_matmul_splitk import MatmulConfigWithSplitK, MatmulWithSplitK # noqa: F401
 from .ops.matmul_dequantize import MatmulWeightOnlyDequantizeConfig, MatmulWeightOnlyDequantize # noqa: F401
 from .module import Linear # noqa: F401
@@ -81,4 +82,4 @@ def _init_logger():

 _init_logger()

-__version__ = "0.0.1.dev10"
+__version__ = "0.0.1.dev12"
15 changes: 12 additions & 3 deletions python/bitblas/quantization/quantization.py
@@ -139,14 +139,23 @@ def _tir_u32_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype
     return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, "float16"), val_f16)


-def _tir_u8_to_f8_e4m3_to_f16(nbit: int, val: tir.PrimExpr, dtype: str):
+def _tir_u8_to_f8_e4m3_to_f16_naive(nbit: int, val: tir.PrimExpr, dtype: str):
     assert nbit == 8
     assert dtype == "float16"
     s_f16 = (val >> tir.const(7, "uint16")) << tir.const(15, "uint16")
-    prefix = tir.Select(s_f16 == 0, tir.const(0x2000, "uint16"), tir.const(0xc000, "uint16"))
-    e_f16 = (((val & tir.const(127, "uint16")) << tir.const(7, "uint16"))) | prefix
+    e4 = val & tir.const(0x40, "uint16")
+    prefix = tir.Select(e4 == tir.const(0, "uint16"), tir.const(0x2000, "uint16"), tir.const(0x4000, "uint16"))
+    e_f16 = (((val & tir.const(63, "uint16")) << tir.const(7, "uint16"))) | prefix
     return tir.reinterpret("float16", s_f16 | e_f16)

+def _tir_u8_to_f8_e4m3_to_f16(nbit: int, val: tir.PrimExpr, dtype: str):
+    assert nbit == 8
+    assert dtype == "float16"
+    s_f16 = (val >> tir.const(7, "uint16")) << tir.const(15, "uint16")
+    e4 = val & tir.const(0x40, "uint16")
+    e_f16 = (((val & tir.const(63, "uint16")) << tir.const(7, "uint16"))) | (e4 << tir.const(8, "uint16")) | (e4 << tir.const(7, "uint16"))
+    e_f16 = e_f16 ^ tir.const(0x2000, "uint16")
+    return tir.reinterpret("float16", s_f16 | e_f16)

 def _tir_u8_to_f8_e5m2_to_f16(nbit: int, val: tir.PrimExpr, dtype: str):
     assert nbit == 8
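The bug: the old decoder chose the exponent prefix from the sign bit (s_f16 == 0) and OR'ed in 0xc000, and it shifted all seven low bits (val & 127), so any value with the top exponent bit set, and every negative value, decoded incorrectly. The fix tests the exponent's top bit (val & 0x40) instead: E4M3 stores exponents with bias 7 and FP16 with bias 15, so decoding adds 8 to the exponent field, giving FP16 exponent pattern 01xxx (prefix 0x2000) when that bit is 0 and 10xxx (prefix 0x4000) when it is 1. The fast variant folds the Select away by placing the bit at positions 14 and 13 and XOR-ing bit 13, making the decode branchless. Below is a standalone sketch (plain Python + NumPy, not from the repo) that checks the branchless trick against a straightforward reference; zero, subnormal, and NaN encodings are skipped since neither decoder special-cases them.

import numpy as np

def decode_e4m3_fast(val: int) -> float:
    # Branchless trick from the PR, transcribed to plain Python.
    s_f16 = ((val >> 7) & 1) << 15   # sign: bit 7 -> bit 15
    e4 = val & 0x40                  # top bit of the 4-bit exponent
    # Placing e4 at bits 14 and 13 and XOR-ing bit 13 adds 8 to the
    # exponent field, i.e. rebias from E4M3 (bias 7) to FP16 (bias 15).
    e_f16 = ((val & 63) << 7) | (e4 << 8) | (e4 << 7)
    e_f16 ^= 0x2000
    return float(np.uint16(s_f16 | e_f16).view(np.float16))

def decode_e4m3_ref(val: int) -> float:
    # Straightforward reference for normal E4M3 encodings.
    sign = -1.0 if val & 0x80 else 1.0
    exp = (val >> 3) & 0xF
    man = val & 0x7
    return sign * (1.0 + man / 8.0) * 2.0 ** (exp - 7)

for v in range(256):
    exp = (v >> 3) & 0xF
    if exp == 0 or (exp == 0xF and (v & 7) == 7):
        continue  # zero/subnormal and NaN encodings: outside the fast mapping
    assert decode_e4m3_fast(v) == decode_e4m3_ref(v), hex(v)
print("branchless decode matches the reference for all normal encodings")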
4 changes: 4 additions & 0 deletions testing/python/operators/test_general_matmul_splitk_ops.py
@@ -143,6 +143,10 @@ def map_torch_type(intype):
     matmul.forward(torch_a, torch_b, output=bitblas_out)
     print("torch_ref_out", ref_out)
     print("bitblas_out", bitblas_out)

+    matmul.forward(torch_a, torch_b, output=bitblas_out)
+    print("torch_ref_out", ref_out)
+    print("bitblas_out", bitblas_out)
+
     torch.testing.assert_close(bitblas_out, ref_out, rtol=1e0, atol=1e-1)