[Dev] Transform 3rdparty tvm from bitblas into bitblas_tl #95

Merged
merged 60 commits on Jul 22, 2024

Commits (60)
d8884e6
Refactor BatchMatMulEmitter and BatchMatMulSelector for improved read…
LeiWang1999 Jul 5, 2024
fc84173
Refactor import statements for improved readability and maintainability
LeiWang1999 Jul 5, 2024
02f64de
Refactor import statements for improved readability and maintainability
LeiWang1999 Jul 5, 2024
397eee6
disable failure email for ci
LeiWang1999 Jul 5, 2024
20f6ad1
remove email notifications.
LeiWang1999 Jul 6, 2024
b93c394
move relax pass from testing to mlc_llm
LeiWang1999 Jul 6, 2024
ba6a6df
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into main
LeiWang1999 Jul 6, 2024
257693a
Refactor scripts with the check_eual_ref_scripts_with_emitter function
LeiWang1999 Jul 6, 2024
9bb7f49
Lint Fix
LeiWang1999 Jul 6, 2024
39e7614
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into main
LeiWang1999 Jul 6, 2024
93eb5a5
Refactor scripts with the check_eual_ref_scripts_with_emitter function
LeiWang1999 Jul 6, 2024
aa66a90
bug fix in test
LeiWang1999 Jul 6, 2024
ae14a53
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 6, 2024
79b08e4
lint fix.
LeiWang1999 Jul 6, 2024
86fd036
test cuda i4 kernel
LeiWang1999 Jul 7, 2024
6b73a21
Refactor copyright notice in i4matmul.hpp
LeiWang1999 Jul 7, 2024
0ba90c1
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 7, 2024
086d208
Refactor BitBLASLinear test module for improved readability and maint…
LeiWang1999 Jul 7, 2024
47a3abd
refactor test as version below python 3.9 cannot handle int32 overflow.
LeiWang1999 Jul 8, 2024
024b247
format lint for test
LeiWang1999 Jul 8, 2024
bfedeaa
Refactor test_int4b_fp16_convert.py for improved readability and main…
LeiWang1999 Jul 8, 2024
e672a23
remove unused design file
LeiWang1999 Jul 8, 2024
21e5430
move tile device from package to base
LeiWang1999 Jul 8, 2024
fd11940
dummy impl for codegen
LeiWang1999 Jul 8, 2024
9ccfa85
Refactor file structure for ladder_permutate module
LeiWang1999 Jul 8, 2024
7c7d73e
Refactor backend class and fix typos in comments
LeiWang1999 Jul 8, 2024
47d5fc5
Deep refactor Lib related code.
LeiWang1999 Jul 8, 2024
53dd0dd
remove ci pull.
LeiWang1999 Jul 10, 2024
d58ac43
LintFix
LeiWang1999 Jul 10, 2024
37cb07c
refactor builder for whl build
LeiWang1999 Jul 10, 2024
f5b9999
Refactor TIRWrapper.wrap() method to include an assertion for the opt…
LeiWang1999 Jul 11, 2024
fb78244
Refactor lib_generator to set library and source paths
LeiWang1999 Jul 11, 2024
706e227
lint fix
LeiWang1999 Jul 11, 2024
63f5515
BitNet vllm integration
LeiWang1999 Jul 16, 2024
de91c0d
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 16, 2024
b9655fd
chore: update codespell to version 2.3.0
LeiWang1999 Jul 16, 2024
fff385f
Lintfix
LeiWang1999 Jul 16, 2024
72a98e7
Bump version to 0.0.1.dev13
LeiWang1999 Jul 18, 2024
5646ab5
lint fix
LeiWang1999 Jul 18, 2024
b965863
disable fast decoding [u]int4xint8 by default.
LeiWang1999 Jul 21, 2024
1198fc7
optimize from dict design in Hint
LeiWang1999 Jul 21, 2024
014213c
Implement SplitK
LeiWang1999 Jul 21, 2024
e0ca752
bitnet benchmark generation.
LeiWang1999 Jul 21, 2024
81b9cf0
Add benchmark script for BitNet integration
LeiWang1999 Jul 21, 2024
02edc0b
AtomicAdd Support
LeiWang1999 Jul 21, 2024
1a70c2d
LintFix
LeiWang1999 Jul 21, 2024
28d851c
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 21, 2024
c447a95
ci fix when 3rdparty tvm is initialized.
LeiWang1999 Jul 21, 2024
79a001b
bug fix for setup
LeiWang1999 Jul 21, 2024
31813b2
fix a bug in block reduce
LeiWang1999 Jul 21, 2024
78b6a3d
typo fix
LeiWang1999 Jul 21, 2024
9c55218
BUG Fix for block reduce.
LeiWang1999 Jul 22, 2024
1aa8868
Lint fix
LeiWang1999 Jul 22, 2024
22f70bf
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 22, 2024
5f082a5
Refactor block reduce schedule template
LeiWang1999 Jul 22, 2024
b4fb31e
transform branch from bitblas to bitblas_tl
LeiWang1999 Jul 22, 2024
35eaa00
Fix subproject commit reference in 3rdparty/tvm
LeiWang1999 Jul 22, 2024
254dd74
chore: update submodule branch from bitblas to bitblas_tl
LeiWang1999 Jul 22, 2024
31a44aa
force update config.cmake
LeiWang1999 Jul 22, 2024
427800e
Bug fix
LeiWang1999 Jul 22, 2024
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,4 +1,4 @@
[submodule "3rdparty/tvm"]
path = 3rdparty/tvm
url = https://github.com/LeiWang1999/tvm
branch = bitblas
branch = bitblas_tl
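Because the submodule now tracks the `bitblas_tl` branch, existing checkouts have to re-sync against `.gitmodules` before the new TileLang sources appear. A minimal sketch of that update step, assuming a plain git checkout (the subprocess wrapper below is illustrative, not part of this PR):

```python
# Hypothetical helper, not part of this PR: re-sync 3rdparty/tvm after
# its tracking branch changed from bitblas to bitblas_tl.
import subprocess

for cmd in (
    ["git", "submodule", "sync", "--recursive"],              # pick up the new branch from .gitmodules
    ["git", "submodule", "update", "--init", "--recursive"],  # check out the pinned commit
):
    subprocess.run(cmd, check=True)
```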
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated 88 files
+10 −0 CMakeLists.txt
+63 −35 README.md
+3 −31 cmake/config.cmake
+1 −0 include/tvm/runtime/c_runtime_api.h
+2 −0 include/tvm/runtime/data_type.h
+1 −1 python/tvm/script/parser/core/evaluator.py
+20 −0 python/tvm/tl/__init__.py
+21 −0 python/tvm/tl/_ffi_api.py
+99 −0 python/tvm/tl/autotuner.py
+142 −0 python/tvm/tl/engine.py
+281 −0 python/tvm/tl/language.py
+94 −0 python/tvm/tl/layout.py
+108 −0 python/tvm/tl/transform.py
+249 −0 python/tvm/tl/utils.py
+9 −1 src/runtime/pack_args.h
+18 −2 src/tir/analysis/block_access_region_detector.cc
+1 −1 src/tir/transforms/lower_device_kernel_launch.cc
+7 −3 src/tir/transforms/merge_shared_memory_allocations.cc
+1 −1 src/tir/transforms/storage_access.h
+52 −7 src/tir/transforms/thread_storage_sync.cc
+135 −0 src/tl/ir.cc
+348 −0 src/tl/layout/gemm_layouts.cc
+412 −0 src/tl/layout/layout.cc
+167 −0 src/tl/layout/layout.h
+116 −0 src/tl/layout/swizzle.cc
+91 −0 src/tl/layout/swizzle.h
+262 −0 src/tl/layout/utils.cc
+76 −0 src/tl/layout/utils.h
+98 −0 src/tl/op/builtin.cc
+152 −0 src/tl/op/builtin.h
+393 −0 src/tl/op/bulk_copy.cc
+82 −0 src/tl/op/bulk_copy.h
+355 −0 src/tl/op/elem.cc
+82 −0 src/tl/op/elem.h
+207 −0 src/tl/op/gemm.cc
+62 −0 src/tl/op/gemm.h
+102 −0 src/tl/op/op.cc
+113 −0 src/tl/op/op.h
+190 −0 src/tl/op/parallel.cc
+88 −0 src/tl/op/parallel.h
+222 −0 src/tl/op/reduce.cc
+61 −0 src/tl/op/reduce.h
+203 −0 src/tl/runtime/runtime.cc
+37 −0 src/tl/runtime/runtime.h
+1,058 −0 src/tl/target/codegen.cc
+88 −0 src/tl/target/codegen.h
+104 −0 src/tl/target/rt_mod.cc
+85 −0 src/tl/target/utils.cc
+48 −0 src/tl/target/utils.h
+41 −0 src/tl/tl_templates/common.h
+73 −0 src/tl/tl_templates/copy.h
+217 −0 src/tl/tl_templates/copy_sm90.h
+10 −0 src/tl/tl_templates/gemm.h
+160 −0 src/tl/tl_templates/gemm_sm70.h
+314 −0 src/tl/tl_templates/gemm_sm80.h
+147 −0 src/tl/tl_templates/gemm_sm90.h
+100 −0 src/tl/tl_templates/ldsm.h
+53 −0 src/tl/tl_templates/reduce.h
+39 −0 src/tl/tl_templates/threadblock_swizzle.h
+133 −0 src/tl/transform/cluster_planning.cc
+94 −0 src/tl/transform/frontend_legalize.cc
+934 −0 src/tl/transform/inject_pipeline.cc
+291 −0 src/tl/transform/layout_inference.cc
+164 −0 src/tl/transform/loop_partition.cc
+46 −0 src/tl/transform/loop_partition.h
+451 −0 src/tl/transform/loop_vectorize.cc
+45 −0 src/tl/transform/loop_vectorize.h
+157 −0 src/tl/transform/lower_hopper_intrin.cc
+185 −0 src/tl/transform/lower_tile_op.cc
+242 −0 src/tl/transform/pipeline_planning.cc
+842 −0 src/tl/transform/warp_specialized_pipeline.cc
+25 −0 tl_doc/flash_perf.md
+61 −0 tl_doc/language_ref.md
+82 −0 tl_scripts/conv_example.py
+86 −0 tl_scripts/gemm_example.py
+48 −0 tl_scripts/layout_anno_example.py
+103 −0 tl_scripts/mamba_example.py
+321 −0 tl_scripts/mha_bwd_example.py
+120 −0 tl_scripts/mha_example.py
+61 −0 tl_scripts/profile.py
+41 −0 tl_scripts/reduce_example.py
+103 −0 tl_scripts/retnet_example.py
+75 −0 tl_scripts/rms_norm.py
+59 −0 tl_scripts/splitk_example.py
+54 −0 tl_scripts/test.py
+191 −0 tl_scripts/test_gemm.py
+255 −0 tl_scripts/triton_gemm.py
+673 −0 tl_scripts/triton_mha.py
30 changes: 18 additions & 12 deletions bitblas/base/roller/policy/tensorcore.py
@@ -4,14 +4,16 @@
from bitblas import tvm
from typing import Dict, List, Tuple, Optional
import numpy as np

import logging
from ...arch import TileDevice
from ..hint import Hint, Stride, TileDict, IntrinInfo
from ..node import PrimFuncNode
from .common import coalesced_factor, factorize, get_all_factors
from .default import DefaultPolicy
from ..rasterization import NoRasterization, Rasterization2DColumn

logger = logging.getLogger(__name__)


class TensorCorePolicy(DefaultPolicy):

@@ -47,9 +49,9 @@ def _legalize_info(self):
self.use_async_copy = False
# TODO: block reduction depth is not used for now,
# as there still exist some performance issues with block reduction.
# block_reduction_depth = self.prim_func_node.get_tag("block_reduction_depth")
# if block_reduction_depth:
# self.block_reduction_depth = block_reduction_depth
block_reduction_depth = self.prim_func_node.get_tag("block_reduction_depth")
if block_reduction_depth:
self.block_reduction_depth = block_reduction_depth

def _compute_tc_strides(
self,
@@ -120,7 +122,6 @@ def _check_small_tile(td: TileDict):

smem_limit = min(self.arch.max_smem_usage // td.block_per_SM, self.arch.smem_cap)
rstep_map = td.rstep_map.copy()
is_block_reduction = self.block_reduction_depth is not None

def _optimize(node, rstep):
all_steps = self.get_node_reduce_step_candidates(node)
@@ -185,12 +186,12 @@ def _enlarge(rstep_id):
rstep = _optimize(node, rstep_map)
rstep_map = rstep

if is_block_reduction:
# If block reduction, we should constrain the max value to 64.
# Otherwise it will introduce an issue of CUDA invalid args.
MAX_REDUCE_K = 64
for k in rstep_map:
rstep_map[k] = min(rstep_map[k], MAX_REDUCE_K)
# if is_block_reduction:
# # If block reduction, we should constrain the max value to 64.
# # Otherwise it will introduce an issue of CUDA invalid args.
# MAX_REDUCE_K = 64
# for k in rstep_map:
# rstep_map[k] = min(rstep_map[k], MAX_REDUCE_K)
td.rstep_map = rstep_map
td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td)
return
@@ -315,7 +316,12 @@ def _score(node, thread): # small is better
if intrin_info["out_dtype"] in ["float32"]:
codegen_dict.shared_scope = "shared.dyn"
# smem capacity
if td.smem_cost > self.arch.smem_cap:
# TODO: This is a dummy multiplier which avoids reusing some shared memory.
# Should be removed in the future.
if td.smem_cost > (self.arch.smem_cap * 1.3):
info_message = f"Tile Dict: {td.output_tile} Shared memory exceeds the static capacity," \
" use dynamic shared memory."
logger.info(info_message)
codegen_dict.shared_scope = "shared.dyn"

codegen_dict.complete_config(node)
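The last hunk above widens the static shared-memory check by a 1.3× slack factor and logs when the kernel falls back to dynamic shared memory. A minimal, self-contained sketch of that decision, where `Arch` and `pick_shared_scope` are hypothetical stand-ins for the objects used in tensorcore.py:

```python
# Illustrative sketch only; Arch and pick_shared_scope are hypothetical
# stand-ins, not BitBLAS APIs.
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)

SMEM_SLACK = 1.3  # the dummy multiplier from the diff


@dataclass
class Arch:
    smem_cap: int  # static shared-memory capacity per block, in bytes


def pick_shared_scope(smem_cost: int, arch: Arch, out_dtype: str) -> str:
    # float32 accumulation always uses dynamic shared memory
    if out_dtype == "float32":
        return "shared.dyn"
    # fall back to dynamic shared memory once the estimated cost exceeds
    # the slack-adjusted static capacity, and log the fallback
    if smem_cost > arch.smem_cap * SMEM_SLACK:
        logger.info("Shared memory exceeds the static capacity, use dynamic shared memory.")
        return "shared.dyn"
    return "shared"


# e.g. ~200 kB of tiles on a 96 kB static-smem part selects "shared.dyn"
print(pick_shared_scope(200_000, Arch(smem_cap=96 * 1024), "float16"))
```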
6 changes: 5 additions & 1 deletion bitblas/gpu/matmul_mma_dequantize.py
@@ -1986,7 +1986,7 @@ def get_param_indices(
k0, kr = sch.split(k0, [None, reduce_k])

sch.reorder(i0, j0, i1, j1, i2, j2, kr, k0, k1, i3, j3)
# sch.reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3)

block_idy = sch.fuse(i0, j0)
block_idx = sch.fuse(i1, j1)
thread_idy = i2
@@ -1998,6 +1998,10 @@
thread_idz = j2 = thread_idy = sch.fuse(thread_idy, thread_idz)
sch.bind(thread_idy, "threadIdx.y")

# Put the thread binding after the shared memory prefetch.
# Otherwise there's an axis-missing bug in tvm.
sch.bind(kr, "threadIdx.z")

def smooth_layout_recover(block, scope, l=16, r=16, enable=True): # noqa: E741
if not enable:
return
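The new `sch.bind(kr, "threadIdx.z")` is deliberately issued after the other thread axes are fused and bound, per the comment above. A toy `tvm.tir.Schedule` sketch of the same split-then-bind pattern on a stand-alone matmul (the shapes and the `reduce_k = 2` factor are illustrative, not taken from this PR):

```python
# Toy illustration of binding a reduction split (kr) to threadIdx.z;
# not BitBLAS code, and the matmul below is deliberately tiny.
import tvm
from tvm.script import tir as T


@T.prim_func
def matmul(A: T.Buffer((128, 128), "float16"),
           B: T.Buffer((128, 128), "float16"),
           C: T.Buffer((128, 128), "float16")):
    for i, j, k in T.grid(128, 128, 128):
        with T.block("C"):
            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
            with T.init():
                C[vi, vj] = T.float16(0)
            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]


sch = tvm.tir.Schedule(matmul)
blk = sch.get_block("C")
i, j, k = sch.get_loops(blk)
k0, kr = sch.split(k, factors=[None, 2])  # mirrors the k0/kr reduce_k split
sch.reorder(i, j, kr, k0)                 # kr ahead of the serial k0 loop
sch.bind(i, "blockIdx.x")
sch.bind(j, "threadIdx.y")
sch.bind(kr, "threadIdx.z")  # bound last, after the spatial thread axes
print(sch.mod.script())
```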