[FRONTEND] fix the silent return issue in AOT launcher (#2013)
Currently, link.py produces launcher code like the following:

```c++
CUresult matmul_fp16xfp16_16x16x16(CUstream stream, unsigned int gX, unsigned int gY, unsigned int gZ, CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int32_t stride_cm, int32_t stride_am, int32_t stride_bk){
  if ((C % 16 == 0) && (A % 16 == 0) && (B % 16 == 0) && (stride_cm % 16 == 0))
    return matmul_fp16xfp16_16x16x16_688cc413_0d1d2d3d45d(stream, gX, gY, gZ, C, A, B, stride_cm, stride_am, stride_bk);
  // ...
  if ((C % 16 == 0) && (A % 16 == 0) && (B % 16 == 0))
    return matmul_fp16xfp16_16x16x16_7c0255bf_0d1d2d345(stream, gX, gY, gZ, C, A, B, stride_cm, stride_am, stride_bk);
}
```
Note that when the input does not match any of the `if` branches, the function
falls off the end without an explicit `return`. For a non-void function this is
undefined behavior in C; in practice the caller typically observes 0, which
equals `CUDA_SUCCESS`, so a failed dispatch silently looks like a successful
launch. This does not match the expectation.
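
As a self-contained illustration of the hazard (not from this PR; plain C,
compiled outside of Triton):

```c
#include <stdio.h>

// Illustration only: falling off the end of a non-void function is
// undefined behavior in C. Many builds happen to leave 0 in the return
// register, so the missed-dispatch path reads as 0 == CUDA_SUCCESS.
static int dispatch(int key) {
    if (key == 42)
        return 1;
    // no return here -- gcc/clang warn with -Wreturn-type
}

int main(void) {
    printf("%d\n", dispatch(0));  // unspecified value; often prints 0
    return 0;
}
```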

This PR adds a `return CUDA_ERROR_INVALID_VALUE;` at the tail of each
launcher, so it produces code like:

```c++
CUresult matmul_fp16xfp16_16x16x16(CUstream stream, unsigned int gX, unsigned int gY, unsigned int gZ, CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int32_t stride_cm, int32_t stride_cn, int32_t stride_am, int32_t stride_ak, int32_t stride_bk, int32_t stride_bn){
  if ((C % 16 == 0) && (A % 16 == 0) && (B % 16 == 0) && (stride_cm == 1) && (stride_cn == 1) && (stride_am == 1) && (stride_ak == 1) && (stride_bk % 16 == 0) && (stride_bn == 1))
    return matmul_fp16xfp16_16x16x16_1f18a6da_0d1d2d3c4c5c6c7d8c(stream, gX, gY, gZ, C, A, B, stride_bk);

  return CUDA_ERROR_INVALID_VALUE;
}
```

Users are then expected to check the result in their application, which I
think matches the original AOT design intent.
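
For example, a caller-side check could look like the following sketch (not
part of this PR; the argument list follows the first generated header above,
and `cuGetErrorString` is the standard CUDA driver-API helper for turning a
`CUresult` into a message):

```c
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include "kernel.h"  // generated by link.py; declares matmul_fp16xfp16_16x16x16

// Sketch: launch through the AOT dispatcher and surface a failed dispatch.
static int launch_checked(CUstream stream, unsigned int gX, unsigned int gY, unsigned int gZ,
                          CUdeviceptr C, CUdeviceptr A, CUdeviceptr B,
                          int32_t stride_cm, int32_t stride_am, int32_t stride_bk) {
    CUresult ret = matmul_fp16xfp16_16x16x16(stream, gX, gY, gZ, C, A, B,
                                             stride_cm, stride_am, stride_bk);
    if (ret != CUDA_SUCCESS) {
        const char *msg = NULL;
        cuGetErrorString(ret, &msg);  // CUDA_ERROR_INVALID_VALUE -> "invalid argument"
        fprintf(stderr, "AOT launcher matched no compiled kernel: %s\n", msg ? msg : "unknown");
        return -1;
    }
    return 0;
}
```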
Superjomn authored Jul 31, 2023
1 parent 52c146f commit 89b0b79
Showing 2 changed files with 90 additions and 48 deletions.
136 changes: 88 additions & 48 deletions python/test/unit/tools/test_aot.py
```diff
@@ -40,17 +40,20 @@ def kernel(C, A, B,
     tl.store(C + ms[:, None] * stride_cm + ns[None, :] * stride_cn, c)
 """
 
-test_src = """
+
+def gen_test_bin(dir, M, N, K, BM, BN, BK):
+    test_src = '''
 #include <cuda.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
+#include <assert.h>
 #include "kernel.h"
 
 static void write_buffer_to_csv(char *filename, int32_t *buffer, int size) {
     FILE *file = fopen(filename, "w");
     if (file == NULL) {
-        printf(\"Could not open file %s\\n\", filename);
+        printf("Could not open file %s\\n", filename);
         return;
     }
     for (int i = 0; i < size; i++) {
@@ -65,19 +68,20 @@ def kernel(C, A, B,
 static void read_csv_to_buffer(char *filename, int16_t *buffer, int size) {
     FILE *file = fopen(filename, "r");
     if (file == NULL) {
-        printf(\"Could not open file %s\\n\", filename);
+        printf("Could not open file %s\\n", filename);
         return;
     }
     int index = 0;
     while (fscanf(file, "%hd,", &buffer[index]) != EOF && index < size) {
         index++;
     }
     fclose(file);
-}
+}'''
 
-int main(int argc, char **argv) {
-    int M = 16, N = 16, K = 16;
-    int BM = 16, BN = 16, BK = 16;
+    test_src += f'''
+int main(int argc, char **argv) {{
+    int M = {M}, N = {N}, K = {K};
+    int BM = {M}, BN = {N}, BK = {K};
 
     // initialize CUDA handles
     CUdevice dev;
@@ -105,9 +109,11 @@ def kernel(C, A, B,
     cuMemcpyHtoD(B, hB, K*N*2);
 
     // launch kernel
     int gX = 1, gY = 1, gZ = 1;
     cuStreamSynchronize(stream);
-    matmul_fp16xfp16_16x16x16(stream, M/BM, N/BN, 1, C, A, B, N, 1, K, 1, N, 1);
+    CUresult ret = matmul_fp16xfp16_16x16x16(stream, M/BM, N/BN, 1, C, A, B, N, 1, K, 1, N, 1);
+    if (ret != 0) fprintf(stderr, "kernel launch failed\\n");
+    assert(ret == 0);
     cuStreamSynchronize(stream);
 
     // read data
@@ -116,65 +122,75 @@ def kernel(C, A, B,
     cuMemcpyDtoH(hC, C, M*N*4);
     write_buffer_to_csv(argv[3], hC, M*N);
 
     // free cuda handles
     unload_matmul_fp16xfp16_16x16x16();
     cuMemFree(A);
     cuMemFree(B);
     cuMemFree(C);
     cuCtxDestroy(ctx);
-}
-"""
+}}
+'''
+
+    with open(os.path.join(dir, "test.c"), "w") as file:
+        file.write(test_src)
+    c_files = glob.glob(os.path.join(dir, "*.c"))
+    subprocess.run(["gcc"] + c_files + ["-I", cuda_include_dir(),
+                                        "-L", libcuda_dirs()[0],
+                                        "-l", "cuda",
+                                        "-o", "test"], check=True, cwd=dir)
+
+
+def generate_matmul_launcher(dir, dtype, BM, BN, BK, ha_hb_hints):
+    kernel_path = os.path.join(dir, "kernel.py")
+    with open(kernel_path, "w") as file:
+        file.write(kernel_src)
+
+    kernel_utils_path = os.path.join(dir, "kernel_utils.py")
+    with open(kernel_utils_path, "w") as file:
+        file.write(kernel_utils_src)
+
+    compiler_path = os.path.join(triton.tools.__path__[0], "compile.py")
+    linker_path = os.path.join(triton.tools.__path__[0], "link.py")
+
+    # compile all desired configs
+    for ha in ha_hb_hints:
+        for hb in ha_hb_hints:
+            sig = f'*fp32:16, *{dtype}:16, *{dtype}:16, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}'
+            name = f"matmul_{dtype}x{dtype}_{BM}x{BN}x{BK}"
+            subprocess.run([sys.executable, compiler_path, "-n", "kernel", "--signature", sig, "--out-name", name, "-o", name, "-w", "1", kernel_path], check=True, cwd=dir)
+
+    # link all desired configs
+    h_files = glob.glob(os.path.join(dir, "*.h"))
+    subprocess.run([sys.executable, linker_path] + h_files + ["-o", "kernel"], check=True, cwd=dir)
+
+
+def generate_matmul_test_data(dir, M, N, K):
+    a = np.random.randn(M * K).astype(np.float16).reshape((M, K))
+    b = np.random.randn(M * K).astype(np.float16).reshape((K, N))
+    a_path = os.path.join(dir, "a.csv")
+    b_path = os.path.join(dir, "b.csv")
+    c_path = os.path.join(dir, "c.csv")
+    for x, path in [(a, a_path), (b, b_path)]:
+        x.view(np.int16).ravel().tofile(path, sep=",")
+    return a, b, a_path, b_path, c_path
+
+
 def test_compile_link_matmul():
     np.random.seed(3)
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        kernel_path = os.path.join(tmp_dir, "kernel.py")
-        with open(kernel_path, "w") as file:
-            file.write(kernel_src)
-
-        kernel_utils_path = os.path.join(tmp_dir, "kernel_utils.py")
-        with open(kernel_utils_path, "w") as file:
-            file.write(kernel_utils_src)
-
-        compiler_path = os.path.join(triton.tools.__path__[0], "compile.py")
-        linker_path = os.path.join(triton.tools.__path__[0], "link.py")
-
         dtype = "fp16"
-        M, N, K = 16, 16, 16
         BM, BN, BK = 16, 16, 16
 
-        # compile all desired configs
-        hints = [":16", ""]
-        for ha in hints:
-            for hb in hints:
-                sig = f'*fp32:16, *{dtype}:16, *{dtype}:16, i32{ha}, i32:1, i32{hb}, i32:1, i32:16, i32:1, {BM}, {BN}, {BK}'
-                name = f"matmul_{dtype}x{dtype}_{BM}x{BN}x{BK}"
-                subprocess.run([sys.executable, compiler_path, "-n", "kernel", "--signature", sig, "--out-name", name, "-o", name, "-w", "1", kernel_path], check=True, cwd=tmp_dir)
-
-        # link all desired configs
-        h_files = glob.glob(os.path.join(tmp_dir, "*.h"))
-        subprocess.run([sys.executable, linker_path] + h_files + ["-o", "kernel"], check=True, cwd=tmp_dir)
+        generate_matmul_launcher(tmp_dir, dtype, BM, BN, BK, ha_hb_hints=["", ":16"])
 
         # compile test case
-        with open(os.path.join(tmp_dir, "test.c"), "w") as file:
-            file.write(test_src)
-        c_files = glob.glob(os.path.join(tmp_dir, "*.c"))
-        subprocess.run(["gcc"] + c_files + ["-I", cuda_include_dir(),
-                                            "-L", libcuda_dirs()[0],
-                                            "-l", "cuda",
-                                            "-o", "test"], check=True, cwd=tmp_dir)
+        M, N, K = 16, 16, 16
+        gen_test_bin(tmp_dir, M, N, K, BM, BN, BK)
 
         # initialize test data
-        a = np.random.randn(M * K).astype(np.float16).reshape((M, K))
-        b = np.random.randn(M * K).astype(np.float16).reshape((K, N))
-        a_path = os.path.join(tmp_dir, "a.csv")
-        b_path = os.path.join(tmp_dir, "b.csv")
-        c_path = os.path.join(tmp_dir, "c.csv")
-        for x, path in [(a, a_path), (b, b_path)]:
-            x.view(np.int16).ravel().tofile(path, sep=",")
+        a, b, a_path, b_path, c_path = generate_matmul_test_data(tmp_dir, M, N, K)
 
         # run test case
         subprocess.run(["./test", a_path, b_path, c_path], check=True, cwd=tmp_dir)
@@ -186,6 +202,30 @@ def test_compile_link_matmul():
     np.testing.assert_allclose(c_tri, c_ref * c_ref, atol=1e-4, rtol=0.)
 
 
+def test_launcher_has_no_available_kernel():
+    np.random.seed(3)
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        dtype = "fp16"
+        BM, BN, BK = 16, 16, 16
+
+        generate_matmul_launcher(tmp_dir, dtype, BM, BN, BK, ha_hb_hints=[":1"])
+
+        # compile test case
+        M, N, K = 16, 16, 16
+        gen_test_bin(tmp_dir, M, N, K, BM, BN, BK)
+
+        # initialize test data
+        a, b, a_path, b_path, c_path = generate_matmul_test_data(tmp_dir, M, N, K)
+
+        # run test case
+        result = subprocess.run(["./test", a_path, b_path, c_path], cwd=tmp_dir, capture_output=True, text=True)
+
+        # It should fail since the launcher requires all the strides be 1 while they are not.
+        assert result.returncode == -6
+        assert "kernel launch failed" in result.stderr
+
+
 def test_ttgir_to_ptx():
     src = """
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
```
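
A note on the `assert result.returncode == -6` in the new test: the failing
C-side `assert(ret == 0)` calls `abort()`, which kills the test binary with
`SIGABRT` (signal 6), and Python's `subprocess` reports a process killed by
signal N as returncode `-N`. A standalone demonstration, independent of this
PR:

```python
import signal
import subprocess

# A child process killed by SIGABRT is reported with a negative returncode.
proc = subprocess.run(
    ["python3", "-c", "import os, signal; os.kill(os.getpid(), signal.SIGABRT)"]
)
assert proc.returncode == -signal.SIGABRT  # i.e. -6 on POSIX
```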
2 changes: 2 additions & 0 deletions python/triton/tools/link.py
```diff
@@ -148,6 +148,8 @@ def make_kernel_dispatcher(name: str, metas: Sequence[KernelLinkerMeta]) -> str:
         src += f"  if ({conds})\n"
         arg_names = [arg for arg, hint in zip(meta.arg_names, meta.sizes) if hint != 1]
         src += f"    return {name}_{meta.sig_hash}_{meta.suffix}(stream, gX, gY, gZ, {', '.join(arg_names)});\n"
+    src += "\n"
+    src += "  return CUDA_ERROR_INVALID_VALUE;\n"
     src += "}\n"
 
     for mode in ["load", "unload"]:
```
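
Since a flattened diff can obscure where the new lines sit relative to the
loop, here is a sketch of the tail of `make_kernel_dispatcher` with the
structure restored (the loop and the `make_conds` helper are paraphrased for
illustration, not quoted from link.py):

```python
# Sketch only: `metas` fields follow the diff context; make_conds is a
# hypothetical stand-in for the code that builds "(C % 16 == 0) && ...".
def dispatcher_tail(name, metas):
    src = ""
    for meta in metas:
        conds = make_conds(meta)
        src += f"  if ({conds})\n"
        arg_names = [arg for arg, hint in zip(meta.arg_names, meta.sizes) if hint != 1]
        src += f"    return {name}_{meta.sig_hash}_{meta.suffix}(stream, gX, gY, gZ, {', '.join(arg_names)});\n"
    # The two added lines sit after the loop, so every generated dispatcher
    # now ends with an explicit error instead of falling through:
    src += "\n"
    src += "  return CUDA_ERROR_INVALID_VALUE;\n"
    src += "}\n"
    return src
```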
