[Dev] Fix GEMV Dynamic Scheduling with Splitk (#52)

* improve e4m3 decoding. * append fp16xint1 * Update submodule commit reference * chore: Update shared memory scope for float32 output dtype * BUGFIX: UINT8/INT8 Decoding * feat: Add rasterization options for roller module * Refactor tensorcore_legalization method to optimize tensor core usage * feat: Add function to collect variables from expression, improve for splitk * chore: Update typing import in __init__.py * chore: Refactor CPU execution of operators * Refactor matmul implementation for splitk layout * Refactor matmul implementation for splitk layout * Refactor matmul implementation for splitk layout * chore: Update version to 0.0.1.dev8 * chore: Enable debug output in bitblas.set_debug_level() * Refactor Linear module matmul implementation for splitk layout * Refactor matmul implementation for splitk layout * Refactor CUDA kernel launch string for dynamic symbolic set * Bumpt version to v0.0.1.dev9 * Refactor CUDA kernel launch string for dynamic symbolic set * Bump version to v0.0.1.dev10 --------- Co-authored-by: LeiWang199 <leiwang199>
microsoft · Jun 6, 2024 · da9695a · da9695a
1 parent b78dcfe
commit da9695a
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 3 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.0.1.dev9
+0.0.1.dev10
diff --git a/python/bitblas/__init__.py b/python/bitblas/__init__.py
@@ -81,4 +81,4 @@ def _init_logger():
 
 _init_logger()
 
-__version__ = "0.0.1.dev9"
+__version__ = "0.0.1.dev10"
diff --git a/python/bitblas/gpu/gemv.py b/python/bitblas/gpu/gemv.py
@@ -775,7 +775,8 @@ def apply_config(  # pylint: disable=too-many-locals,missing-docstring
             return None
 
         block_info = block_infos[0]
-        if len(block_info.iters) not in [2, 3]:
+        if len(block_info.iters) not in [2, 3, 4]:
+            # either [SK, B, S, R] = [SK, B, S, R] * [SK, B, R]
             # either [B, S, R] = [B, S, R] * [B, R]
             # or [S, R] = [S, R] * [R]
             return None

diff --git a/python/bitblas/gpu/gemv_dequantize.py b/python/bitblas/gpu/gemv_dequantize.py
@@ -110,6 +110,11 @@ def get_vectorize_factor(target_format):
         if len(sch.get_loops(block_b)) == 3:
             i = sch.get_loops(block_b)[0]
             sch.bind(i, "blockIdx.z")
+        elif len(sch.get_loops(block_b)) == 4:
+            # splitk case
+            sk, i = sch.get_loops(block_b)[:2]
+            sch.bind(sk, "blockIdx.y")
+            sch.bind(i, "blockIdx.z")
 
         # get target dequantize buffer's idx
         def get_idx(weight_decode_info: Dict):
@@ -274,6 +279,14 @@ def get_vectorize_factor(target_format):
         if len(sch.get_loops(block_b)) == 3:
             i = sch.get_loops(block_b)[0]
             sch.bind(i, "blockIdx.z")
+        elif len(sch.get_loops(block_b)) == 4:
+            # splitk case
+            sk, i = sch.get_loops(block_b)[:2]
+            sch.bind(sk, "blockIdx.y")
+            sch.bind(i, "blockIdx.z")
+            assert len(config.thread) == 2, "SplitK only support 2D thread config"
+            num_warps = int(num_warps // config.thread[0])
+
 
         # get target dequantize buffer's idx
         def get_idx(weight_decode_info: Dict):