[Dev] Refactor testing scripts and fix security issues #72

Merged
20 commits · merged Jul 4, 2024
1 change: 1 addition & 0 deletions bitblas/ops/impl/ladder_permutate_impl.py
@@ -49,6 +49,7 @@ def select_implementation(
inp = te.placeholder((M, N // scaling_factor), name="inp", dtype=storage_dtype)
args = [inp]

assert transform_kind != 0, "Permute only apply when transform_kind >= 1"
if transform_kind >= 1:
arg = args[-1]

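The effect of the added assert, for reviewers skimming the hunk: a call that previously fell through silently with transform_kind == 0 now fails fast before any permute layout is selected. A minimal sketch of that behavior, using a hypothetical stand-in function rather than the real select_implementation (whose full signature is not shown in this hunk):

import pytest


def select_permute_impl(transform_kind: int) -> str:
    # Hypothetical stand-in mirroring the guard added in ladder_permutate_impl.py.
    assert transform_kind != 0, "Permute only apply when transform_kind >= 1"
    # transform_kind >= 1 selects an actual permutation implementation.
    return f"permute_impl_kind_{transform_kind}"


def test_transform_kind_zero_is_rejected():
    # transform_kind=0 now raises instead of producing an op with no permutation.
    with pytest.raises(AssertionError):
        select_permute_impl(0)
    assert select_permute_impl(2) == "permute_impl_kind_2"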
3 changes: 1 addition & 2 deletions bitblas/ops/operator.py
@@ -106,8 +106,7 @@ def tvm_callback_cuda_postproc(code, _):
**self.pass_context
}):
rt_mod = tvm.build(self.optimized_func, target=target, name=self.name)
except Exception as e:
rt_build_error = e # noqa
except Exception: # noqa: F841
logger.debug(
"Failed to build optimized function for CUDA target with default schedule, Please consider enable hardware aware tuning!"
)
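For context on the F841 change: the old code bound the exception to rt_build_error purely to keep it around, which is exactly what flake8 F841 ("local variable is assigned to but never used") flags; dropping the binding and logging at debug level is the simpler fix. A generic sketch of the pattern, not the bitblas operator code itself:

import logging

logger = logging.getLogger(__name__)


def build_with_fallback(build_fn):
    # Illustrative only: attempt an optimized build and fall back quietly,
    # without binding the exception to a name that is never read (flake8 F841).
    try:
        return build_fn()
    except Exception:
        logger.debug(
            "Failed to build the optimized function with the default schedule; "
            "consider enabling hardware-aware tuning.")
        return None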
2 changes: 1 addition & 1 deletion integration/BitNet/modeling_bitnet.py
@@ -54,7 +54,7 @@

if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa: F401


logger = logging.get_logger(__name__)
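The noqa change is small but deliberate: a bare # noqa suppresses every flake8 code on the line, while # noqa: F401 suppresses only "module imported but unused", so any other problem flake8 finds on that import line is still reported. A short illustration with standard-library modules standing in for the flash_attn imports:

# Bare "# noqa": silences every flake8 code on this line, including real problems.
import json  # noqa

# Scoped "# noqa: F401": only the intentional unused import is allowed;
# anything else flake8 detects on this line is still reported.
import string  # noqa: F401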
55 changes: 25 additions & 30 deletions testing/python/operators/test_general_matmul_splitk_ops.py
@@ -11,16 +11,7 @@ def get_codegen_result(ops):


# fmt: off
@pytest.mark.parametrize(
"M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode",
[
(1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
None),
(16, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
None),
],
)
def test_matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout,
def matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout,
with_bias, group_size, with_scaling, with_zeros, zeros_mode):

matmul_config = MatmulConfigWithSplitK(
@@ -37,21 +28,21 @@ def test_matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtyp
with_scaling=with_scaling,
with_zeros=with_zeros,
zeros_mode=zeros_mode,
propagate_a=False,
propagate_b=False,
)
matmul = MatmulWithSplitK(config=matmul_config, enable_tuning=False)
assert get_codegen_result(matmul)


@pytest.mark.parametrize(
"SPlitK,M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode",
[
(1, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
False, None),
(4, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
False, None),
],
)
def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
def test_matmul_codegen_default():
matmul_codegen_default(1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
None)
matmul_codegen_default(16, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
None)


def matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
layout, with_bias, group_size, with_scaling, with_zeros,
zeros_mode):
import torch
@@ -71,6 +62,8 @@ def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accu
with_scaling=with_scaling,
with_zeros=with_zeros,
zeros_mode=zeros_mode,
propagate_a=False,
propagate_b=False,
)
matmul = MatmulWithSplitK(config=matmul_config, enable_tuning=False)

@@ -84,17 +77,13 @@ def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accu
output_torch = torch.matmul(inputs[0], inputs[1].t() if layout == "nt" else inputs[1])
torch.testing.assert_close(output_bitblas, output_torch, rtol=1e-2, atol=1e-1)

def test_matmul_torch_forward_consistent():
matmul_torch_forward_consistent(1, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
False, None)
matmul_torch_forward_consistent(4, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
False, None)

@pytest.mark.parametrize(
"SPlitK,M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode",
[
(1, 16, 4096, 12800, "float16", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
False, None),
(4, 16, 4096, 12800, "float16", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
False, None),
],
)
def test_matmul_torch_forward_fp8e4m3(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
def matmul_torch_forward_fp8e4m3(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
layout, with_bias, group_size, with_scaling, with_zeros,
zeros_mode):
import torch
@@ -157,6 +146,12 @@ def map_torch_type(intype):

torch.testing.assert_close(bitblas_out, ref_out, rtol=1e0, atol=1e-1)

@bitblas.testing.requires_cuda_compute_version(8, 9)
def test_matmul_torch_forward_fp8e4m3():
matmul_torch_forward_fp8e4m3(1, 16, 4096, 12800, "e4m3_float8", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
False, None)
matmul_torch_forward_fp8e4m3(4, 16, 4096, 12800, "e4m3_float8", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
False, None)

# fmt: on
if __name__ == "__main__":
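The pattern applied throughout this file: each @pytest.mark.parametrize block becomes a plain helper plus an explicit test_ wrapper that enumerates the cases, and the float8 wrapper is additionally gated by bitblas.testing.requires_cuda_compute_version(8, 9) so it only runs on hardware that supports e4m3. A hedged, self-contained sketch of the same shape, with a hypothetical helper and an environment-variable gate standing in for the compute-capability check:

import os

import pytest


def matmul_case(m: int, n: int, k: int, dtype: str) -> str:
    # Hypothetical helper standing in for the matmul consistency checks above:
    # it takes plain arguments instead of parametrize-injected ones.
    return f"{m}x{n}x{k}:{dtype}"


def test_matmul_case():
    # Explicit calls replace @pytest.mark.parametrize; pytest now sees a single
    # test item, and the first failing case stops the remaining ones.
    assert matmul_case(1, 4096, 12800, "float16")
    assert matmul_case(16, 4096, 12800, "float16")


# Stand-in for bitblas.testing.requires_cuda_compute_version(8, 9): the e4m3
# float8 cases only make sense on GPUs with compute capability 8.9 or newer.
@pytest.mark.skipif(os.environ.get("HAS_SM89") != "1", reason="needs compute capability 8.9+")
def test_matmul_case_fp8e4m3():
    assert matmul_case(16, 4096, 12800, "e4m3_float8")

The trade-off is visible in the diff: per-case test IDs and independent pass/fail reporting are lost, in exchange for wrappers that can carry hardware gates and be called directly from the __main__ block.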
32 changes: 12 additions & 20 deletions testing/python/operators/test_ladder_permutate_ops.py
@@ -9,16 +9,7 @@


# fmt: off
@pytest.mark.parametrize(
"M,N,datatype,dequantize_bits,storage_dtype,propagate_kind,transpose_matrix,transform_kind,target_instruction",
[
(1024, 1024, "float16", -1, "float16", "B", True, 0, "nvidia-mma"),
(1024, 1024, "float16", -1, "float16", "B", True, 1, "nvidia-mma"),
(1024, 1024, "float16", -1, "float16", "B", True, 2, "nvidia-mma"),
# dequantize propagation
(1024, 1024, "float16", 4, "uint32", "B", True, 2, "nvidia-mma"),
])
def test_ladder_permutate_profile_latency(
def ladder_permutate_profile_latency(
M,
N,
datatype,
@@ -49,16 +40,13 @@ def test_ladder_permutate_profile_latency(
assert latency


@pytest.mark.parametrize(
"M,N,datatype,dequantize_bits,storage_dtype,propagate_kind,transpose_matrix,transform_kind,target_instruction",
[
(1024, 1024, "float16", -1, "float16", "A", True, 0, "nvidia-mma"),
(1024, 1024, "float16", -1, "float16", "A", True, 1, "nvidia-mma"),
(1024, 1024, "float16", -1, "float16", "A", True, 2, "nvidia-mma"),
# dequantize propagation
(1024, 1024, "float16", 4, "uint32", "A", True, 2, "nvidia-mma"),
])
def test_ladder_permutate_profile_latency_cuda(
def test_ladder_permutate_profile_latency():
ladder_permutate_profile_latency(1024, 1024, "float16", -1, "float16", "B", True, 1, "nvidia-mma")
ladder_permutate_profile_latency(1024, 1024, "float16", -1, "float16", "B", True, 2, "nvidia-mma")
ladder_permutate_profile_latency(1024, 1024, "float16", 4, "uint32", "B", True, 2, "nvidia-mma")


def ladder_permutate_profile_latency_cuda(
M,
N,
datatype,
@@ -91,6 +79,10 @@ def test_ladder_permutate_profile_latency_cuda(
assert latency


def test_ladder_permutate_profile_latency_cuda():
ladder_permutate_profile_latency_cuda(1024, 1024, "float16", -1, "float16", "A", True, 1, "nvidia-mma")
ladder_permutate_profile_latency_cuda(1024, 1024, "float16", -1, "float16", "A", True, 2, "nvidia-mma")
ladder_permutate_profile_latency_cuda(1024, 1024, "float16", 4, "uint32", "A", True, 2, "nvidia-mma")
# fmt: on

if __name__ == "__main__":
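One cross-file detail worth noting: the transform_kind=0 cases from the old parametrize lists are gone on purpose, since ladder_permutate_impl.py now asserts transform_kind >= 1. If the case tables grow again, an equivalent arrangement keeps the tuples in one place while still calling the helper directly; this sketch assumes the ladder_permutate_profile_latency helper defined above in this file:

def test_ladder_permutate_profile_latency():
    # transform_kind=0 is intentionally absent: the implementation now asserts
    # transform_kind >= 1 before selecting a permute layout.
    cases = [
        (1024, 1024, "float16", -1, "float16", "B", True, 1, "nvidia-mma"),
        (1024, 1024, "float16", -1, "float16", "B", True, 2, "nvidia-mma"),
        (1024, 1024, "float16", 4, "uint32", "B", True, 2, "nvidia-mma"),
    ]
    for case in cases:
        ladder_permutate_profile_latency(*case)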