Skip to content

Commit

Permalink
[Dev] Fix GEMV Dynamic Scheduling with Splitk (#52)
Browse files Browse the repository at this point in the history
* improve e4m3 decoding.

* append fp16xint1

* Update submodule commit reference

* chore: Update shared memory scope for float32 output dtype

* BUGFIX: UINT8/INT8 Decoding

* feat: Add rasterization options for roller module

* Refactor tensorcore_legalization method to optimize tensor core usage

* feat: Add function to collect variables from expression, improve for splitk

* chore: Update typing import in __init__.py

* chore: Refactor CPU execution of operators

* Refactor matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* chore: Update version to 0.0.1.dev8

* chore: Enable debug output in bitblas.set_debug_level()

* Refactor Linear module matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* Refactor CUDA kernel launch string for dynamic symbolic set

* Bumpt version to v0.0.1.dev9

* Refactor CUDA kernel launch string for dynamic symbolic set

* Bump version to v0.0.1.dev10

---------

Co-authored-by: LeiWang199 <leiwang199>
  • Loading branch information
LeiWang1999 authored Jun 6, 2024
1 parent b78dcfe commit da9695a
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.1.dev9
0.0.1.dev10
2 changes: 1 addition & 1 deletion python/bitblas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,4 @@ def _init_logger():

_init_logger()

__version__ = "0.0.1.dev9"
__version__ = "0.0.1.dev10"
3 changes: 2 additions & 1 deletion python/bitblas/gpu/gemv.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,8 @@ def apply_config( # pylint: disable=too-many-locals,missing-docstring
return None

block_info = block_infos[0]
if len(block_info.iters) not in [2, 3]:
if len(block_info.iters) not in [2, 3, 4]:
# either [SK, B, S, R] = [SK, B, S, R] * [SK, B, R]
# either [B, S, R] = [B, S, R] * [B, R]
# or [S, R] = [S, R] * [R]
return None
Expand Down
13 changes: 13 additions & 0 deletions python/bitblas/gpu/gemv_dequantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ def get_vectorize_factor(target_format):
if len(sch.get_loops(block_b)) == 3:
i = sch.get_loops(block_b)[0]
sch.bind(i, "blockIdx.z")
elif len(sch.get_loops(block_b)) == 4:
# splitk case
sk, i = sch.get_loops(block_b)[:2]
sch.bind(sk, "blockIdx.y")
sch.bind(i, "blockIdx.z")

# get target dequantize buffer's idx
def get_idx(weight_decode_info: Dict):
Expand Down Expand Up @@ -274,6 +279,14 @@ def get_vectorize_factor(target_format):
if len(sch.get_loops(block_b)) == 3:
i = sch.get_loops(block_b)[0]
sch.bind(i, "blockIdx.z")
elif len(sch.get_loops(block_b)) == 4:
# splitk case
sk, i = sch.get_loops(block_b)[:2]
sch.bind(sk, "blockIdx.y")
sch.bind(i, "blockIdx.z")
assert len(config.thread) == 2, "SplitK only support 2D thread config"
num_warps = int(num_warps // config.thread[0])


# get target dequantize buffer's idx
def get_idx(weight_decode_info: Dict):
Expand Down

0 comments on commit da9695a

Please sign in to comment.