[FRONTEND] fix the silent return issue in AOT launcher (#2013)
Currently, link.py produces launcher code like the following:

```c++
CUresult matmul_fp16xfp16_16x16x16(CUstream stream, unsigned int gX, unsigned int gY, unsigned int gZ, CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int32_t stride_cm, int32_t stride_am, int32_t stride_bk){
  if ((C % 16 == 0) && (A % 16 == 0) && (B % 16 == 0) && (stride_cm % 16 == 0))
    return matmul_fp16xfp16_16x16x16_688cc413_0d1d2d3d45d(stream, gX, gY, gZ, C, A, B, stride_cm, stride_am, stride_bk);
  // ...
  if ((C % 16 == 0) && (A % 16 == 0) && (B % 16 == 0))
    return matmul_fp16xfp16_16x16x16_7c0255bf_0d1d2d345(stream, gX, gY, gZ, C, A, B, stride_cm, stride_am, stride_bk);
}
```
Note that when the input does not match any of the `if` branches, the function
falls off the end without an explicit `return`. For a non-void function this is
undefined behavior in C; in practice the caller typically observes 0, which
equals `CUDA_SUCCESS`, so a failed dispatch silently looks like a successful
launch. This does not match the expectation.
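
As a self-contained illustration of the hazard (not from this PR; plain C,
compiled outside of Triton):

```c
#include <stdio.h>

// Illustration only: falling off the end of a non-void function is
// undefined behavior in C. Many builds happen to leave 0 in the return
// register, so the missed-dispatch path reads as 0 == CUDA_SUCCESS.
static int dispatch(int key) {
    if (key == 42)
        return 1;
    // no return here -- gcc/clang warn with -Wreturn-type
}

int main(void) {
    printf("%d\n", dispatch(0));  // unspecified value; often prints 0
    return 0;
}
```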

This PR adds a `return CUDA_ERROR_INVALID_VALUE;` at the tail of each
launcher, so it produces code like:

```c++
CUresult matmul_fp16xfp16_16x16x16(CUstream stream, unsigned int gX, unsigned int gY, unsigned int gZ, CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int32_t stride_cm, int32_t stride_cn, int32_t stride_am, int32_t stride_ak, int32_t stride_bk, int32_t stride_bn){
  if ((C % 16 == 0) && (A % 16 == 0) && (B % 16 == 0) && (stride_cm == 1) && (stride_cn == 1) && (stride_am == 1) && (stride_ak == 1) && (stride_bk % 16 == 0) && (stride_bn == 1))
    return matmul_fp16xfp16_16x16x16_1f18a6da_0d1d2d3c4c5c6c7d8c(stream, gX, gY, gZ, C, A, B, stride_bk);

  return CUDA_ERROR_INVALID_VALUE;
}
```

Users are then expected to check the result in their application, which I
think matches the original AOT design intent.
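
For example, a caller-side check could look like the following sketch (not
part of this PR; the argument list follows the first generated header above,
and `cuGetErrorString` is the standard CUDA driver-API helper for turning a
`CUresult` into a message):

```c
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include "kernel.h"  // generated by link.py; declares matmul_fp16xfp16_16x16x16

// Sketch: launch through the AOT dispatcher and surface a failed dispatch.
static int launch_checked(CUstream stream, unsigned int gX, unsigned int gY, unsigned int gZ,
                          CUdeviceptr C, CUdeviceptr A, CUdeviceptr B,
                          int32_t stride_cm, int32_t stride_am, int32_t stride_bk) {
    CUresult ret = matmul_fp16xfp16_16x16x16(stream, gX, gY, gZ, C, A, B,
                                             stride_cm, stride_am, stride_bk);
    if (ret != CUDA_SUCCESS) {
        const char *msg = NULL;
        cuGetErrorString(ret, &msg);  // CUDA_ERROR_INVALID_VALUE -> "invalid argument"
        fprintf(stderr, "AOT launcher matched no compiled kernel: %s\n", msg ? msg : "unknown");
        return -1;
    }
    return 0;
}
```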
Superjomn authored Jul 31, 2023
1 parent 52c146f commit 89b0b79
Showing 2 changed files with 90 additions and 48 deletions.
136 changes: 88 additions & 48 deletions python/test/unit/tools/test_aot.py
```diff
@@ -40,17 +40,20 @@ def kernel(C, A, B,
     tl.store(C + ms[:, None] * stride_cm + ns[None, :] * stride_cn, c)
 """
 
-test_src = """
+
+def gen_test_bin(dir, M, N, K, BM, BN, BK):
+    test_src = '''
 #include <cuda.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
+#include <assert.h>
 #include "kernel.h"
 
 static void write_buffer_to_csv(char *filename, int32_t *buffer, int size) {
     FILE *file = fopen(filename, "w");
     if (file == NULL) {
-        printf(\"Could not open file %s\\n\", filename);
+        printf("Could not open file %s\\n", filename);
         return;
     }
     for (int i = 0; i < size; i++) {
@@ -65,19 +68,20 @@ def kernel(C, A, B,
 static void read_csv_to_buffer(char *filename, int16_t *buffer, int size) {
     FILE *file = fopen(filename, "r");
     if (file == NULL) {
-        printf(\"Could not open file %s\\n\", filename);
+        printf("Could not open file %s\\n", filename);
         return;
     }
     int index = 0;
     while (fscanf(file, "%hd,", &buffer[index]) != EOF && index < size) {
         index++;
     }
     fclose(file);
-}
+}'''
 
-int main(int argc, char **argv) {
-    int M = 16, N = 16, K = 16;
-    int BM = 16, BN = 16, BK = 16;
+    test_src += f'''
+int main(int argc, char **argv) {{
+    int M = {M}, N = {N}, K = {K};
+    int BM = {M}, BN = {N}, BK = {K};
 
     // initialize CUDA handles
     CUdevice dev;
@@ -105,9 +109,11 @@ def kernel(C, A, B,
     cuMemcpyHtoD(B, hB, K*N*2);
 
     // launch kernel
     int gX = 1, gY = 1, gZ = 1;
     cuStreamSynchronize(stream);
-    matmul_fp16xfp16_16x16x16(stream, M/BM, N/BN, 1, C, A, B, N, 1, K, 1, N, 1);
+    CUresult ret = matmul_fp16xfp16_16x16x16(stream, M/BM, N/BN, 1, C, A, B, N, 1, K, 1, N, 1);
+    if (ret != 0) fprintf(stderr, "kernel launch failed\\n");
+    assert(ret == 0);
     cuStreamSynchronize(stream);
 
     // read data
@@ -116,65 +122,75 @@ def kernel(C, A, B,
     cuMemcpyDtoH(hC, C, M*N*4);
     write_buffer_to_csv(argv[3], hC, M*N);
 
     // free cuda handles
     unload_matmul_fp16xfp16_16x16x16();
     cuMemFree(A);
     cuMemFree(B);
     cuMemFree(C);
     cuCtxDestroy(ctx);
-}
-"""
+}}
+'''
+
+    with open(os.path.join(dir, "test.c"), "w") as file:
+        file.write(test_src)
+    c_files = glob.glob(os.path.join(dir, "*.c"))
+    subprocess.run(["gcc"] + c_files + ["-I", cuda_include_dir(),
+                                        "-L", libcuda_dirs()[0],
+                                        "-l", "cuda",
+                                        "-o", "test"], check=True, cwd=dir)
+
+
+def generate_matmul_launcher(dir, dtype, BM, BN, BK, ha_hb_hints):
+    kernel_path = os.path.join(dir, "kernel.py")
+    with open(kernel_path, "w") as file:
+        file.write(kernel_src)
+
+    kernel_utils_path = os.path.join(dir, "kernel_utils.py")
+    with open(kernel_utils_path, "w") as file:
+        file.write(kernel_utils_src)
+
+    compiler_path = os.path.join(triton.tools.__path__[0], "compile.py")
+    linker_path = os.path.join(triton.tools.__path__[0], "link.py")
+
+    # compile all desired configs
+    for ha in ha_hb_hints:
+        for hb in ha_hb_hints:
+            sig = f'*fp32:16, *{dtype}:16, *{dtype}:16, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}'
+            name = f"matmul_{dtype}x{dtype}_{BM}x{BN}x{BK}"
+            subprocess.run([sys.executable, compiler_path, "-n", "kernel", "--signature", sig, "--out-name", name, "-o", name, "-w", "1", kernel_path], check=True, cwd=dir)
+
+    # link all desired configs
+    h_files = glob.glob(os.path.join(dir, "*.h"))
+    subprocess.run([sys.executable, linker_path] + h_files + ["-o", "kernel"], check=True, cwd=dir)
+
+
+def generate_matmul_test_data(dir, M, N, K):
+    a = np.random.randn(M * K).astype(np.float16).reshape((M, K))
+    b = np.random.randn(M * K).astype(np.float16).reshape((K, N))
+    a_path = os.path.join(dir, "a.csv")
+    b_path = os.path.join(dir, "b.csv")
+    c_path = os.path.join(dir, "c.csv")
+    for x, path in [(a, a_path), (b, b_path)]:
+        x.view(np.int16).ravel().tofile(path, sep=",")
+    return a, b, a_path, b_path, c_path
+
+
 def test_compile_link_matmul():
     np.random.seed(3)
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        kernel_path = os.path.join(tmp_dir, "kernel.py")
-        with open(kernel_path, "w") as file:
-            file.write(kernel_src)
-
-        kernel_utils_path = os.path.join(tmp_dir, "kernel_utils.py")
-        with open(kernel_utils_path, "w") as file:
-            file.write(kernel_utils_src)
-
-        compiler_path = os.path.join(triton.tools.__path__[0], "compile.py")
-        linker_path = os.path.join(triton.tools.__path__[0], "link.py")
-
         dtype = "fp16"
-        M, N, K = 16, 16, 16
         BM, BN, BK = 16, 16, 16
 
-        # compile all desired configs
-        hints = [":16", ""]
-        for ha in hints:
-            for hb in hints:
-                sig = f'*fp32:16, *{dtype}:16, *{dtype}:16, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}'
-                name = f"matmul_{dtype}x{dtype}_{BM}x{BN}x{BK}"
-                subprocess.run([sys.executable, compiler_path, "-n", "kernel", "--signature", sig, "--out-name", name, "-o", name, "-w", "1", kernel_path], check=True, cwd=tmp_dir)
-
-        # link all desired configs
-        h_files = glob.glob(os.path.join(tmp_dir, "*.h"))
-        subprocess.run([sys.executable, linker_path] + h_files + ["-o", "kernel"], check=True, cwd=tmp_dir)
+        generate_matmul_launcher(tmp_dir, dtype, BM, BN, BK, ha_hb_hints=["", ":16"])
 
         # compile test case
-        with open(os.path.join(tmp_dir, "test.c"), "w") as file:
-            file.write(test_src)
-        c_files = glob.glob(os.path.join(tmp_dir, "*.c"))
-        subprocess.run(["gcc"] + c_files + ["-I", cuda_include_dir(),
-                                            "-L", libcuda_dirs()[0],
-                                            "-l", "cuda",
-                                            "-o", "test"], check=True, cwd=tmp_dir)
+        M, N, K = 16, 16, 16
+        gen_test_bin(tmp_dir, M, N, K, BM, BN, BK)
 
         # initialize test data
-        a = np.random.randn(M * K).astype(np.float16).reshape((M, K))
-        b = np.random.randn(M * K).astype(np.float16).reshape((K, N))
-        a_path = os.path.join(tmp_dir, "a.csv")
-        b_path = os.path.join(tmp_dir, "b.csv")
-        c_path = os.path.join(tmp_dir, "c.csv")
-        for x, path in [(a, a_path), (b, b_path)]:
-            x.view(np.int16).ravel().tofile(path, sep=",")
+        a, b, a_path, b_path, c_path = generate_matmul_test_data(tmp_dir, M, N, K)
 
         # run test case
         subprocess.run(["./test", a_path, b_path, c_path], check=True, cwd=tmp_dir)
@@ -186,6 +202,30 @@ def test_compile_link_matmul():
     np.testing.assert_allclose(c_tri, c_ref * c_ref, atol=1e-4, rtol=0.)
 
 
+def test_launcher_has_no_available_kernel():
+    np.random.seed(3)
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        dtype = "fp16"
+        BM, BN, BK = 16, 16, 16
+
+        generate_matmul_launcher(tmp_dir, dtype, BM, BN, BK, ha_hb_hints=[":1"])
+
+        # compile test case
+        M, N, K = 16, 16, 16
+        gen_test_bin(tmp_dir, M, N, K, BM, BN, BK)
+
+        # initialize test data
+        a, b, a_path, b_path, c_path = generate_matmul_test_data(tmp_dir, M, N, K)
+
+        # run test case
+        result = subprocess.run(["./test", a_path, b_path, c_path], cwd=tmp_dir, capture_output=True, text=True)
+
+        # It should fail since the launcher requires all the strides be 1 while they are not.
+        assert result.returncode == -6
+        assert "kernel launch failed" in result.stderr
+
+
 def test_ttgir_to_ptx():
     src = """
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
```
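
A note on the `assert result.returncode == -6` in the new test: the failing
C-side `assert(ret == 0)` calls `abort()`, which kills the test binary with
`SIGABRT` (signal 6), and Python's `subprocess` reports a process killed by
signal N as returncode `-N`. A standalone demonstration, independent of this
PR:

```python
import signal
import subprocess

# A child process killed by SIGABRT is reported with a negative returncode.
proc = subprocess.run(
    ["python3", "-c", "import os, signal; os.kill(os.getpid(), signal.SIGABRT)"]
)
assert proc.returncode == -signal.SIGABRT  # i.e. -6 on POSIX
```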
2 changes: 2 additions & 0 deletions python/triton/tools/link.py
```diff
@@ -148,6 +148,8 @@ def make_kernel_dispatcher(name: str, metas: Sequence[KernelLinkerMeta]) -> str:
         src += f"  if ({conds})\n"
         arg_names = [arg for arg, hint in zip(meta.arg_names, meta.sizes) if hint != 1]
         src += f"    return {name}_{meta.sig_hash}_{meta.suffix}(stream, gX, gY, gZ, {', '.join(arg_names)});\n"
+    src += "\n"
+    src += "  return CUDA_ERROR_INVALID_VALUE;\n"
     src += "}\n"
 
     for mode in ["load", "unload"]:
```
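
Since a flattened diff can obscure where the new lines sit relative to the
loop, here is a sketch of the tail of `make_kernel_dispatcher` with the
structure restored (the loop and the `make_conds` helper are paraphrased for
illustration, not quoted from link.py):

```python
# Sketch only: `metas` fields follow the diff context; make_conds is a
# hypothetical stand-in for the code that builds "(C % 16 == 0) && ...".
def dispatcher_tail(name, metas):
    src = ""
    for meta in metas:
        conds = make_conds(meta)
        src += f"  if ({conds})\n"
        arg_names = [arg for arg, hint in zip(meta.arg_names, meta.sizes) if hint != 1]
        src += f"    return {name}_{meta.sig_hash}_{meta.suffix}(stream, gX, gY, gZ, {', '.join(arg_names)});\n"
    # The two added lines sit after the loop, so every generated dispatcher
    # now ends with an explicit error instead of falling through:
    src += "\n"
    src += "  return CUDA_ERROR_INVALID_VALUE;\n"
    src += "}\n"
    return src
```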
