Revert "Revert "[AMD][Pipeliner] Improve clustering and add prefetch (#…
sjw36 committed Nov 15, 2024
1 parent 9aa114a commit 7578863
Showing 10 changed files with 548 additions and 646 deletions.
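
This re-applies the AMD stream-pipeliner change that reworks loop-body clustering and adds global-load prefetching. In the tests touched below, each GEMM loop iteration issues global tt.load ops for the next tile, reads the previously staged tiles from LDS with triton_gpu.local_load to feed tt.dot, and then stages the freshly loaded tiles with triton_gpu.local_store. For reference, one loop-body ordering exercised by amd-sched-2nd-load.mlir (the "sink 2nd load" shape, with the second global load placed after the first local_load) looks like the sketch below; %A_ptr, %B_ptr, %A_LDS, %B_LDS and the layout attributes are defined outside the shown body, as in the test:

%4 = tt.load %A_ptr : tensor<256x128x!tt.ptr<f16>, #blocked>          // global prefetch of the A tile for the next iteration
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0>
%5 = tt.load %B_ptr : tensor<128x256x!tt.ptr<f16>, #blocked1>         // second global load, sunk below the first local_load
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x256xf16, #dotOp1>
%3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x256xf16, #dotOp1> -> tensor<256x256xf32, #mma>
triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>    // stage the prefetched tiles for the next iteration
triton_gpu.local_store %5, %B_LDS : tensor<128x256xf16, #blocked1> -> !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable>
scf.yield %3 : tensor<256x256xf32, #mma>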
345 changes: 0 additions & 345 deletions test/TritonGPU/amd/amd-reorder-instructions.mlir

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions test/TritonGPU/amd/amd-sched-2nd-load.mlir
@@ -35,11 +35,11 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
%c1 = arith.constant 1 : i32
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
%0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 {
%4 = tt.load %A_ptr : tensor<256x128x!tt.ptr<f16>, #blocked>
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0>
%5 = tt.load %B_ptr : tensor<128x256x!tt.ptr<f16>, #blocked1>
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x256xf16, #dotOp1>
%3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x256xf16, #dotOp1> -> tensor<256x256xf32, #mma>
%4 = tt.load %A_ptr : tensor<256x128x!tt.ptr<f16>, #blocked>
%5 = tt.load %B_ptr : tensor<128x256x!tt.ptr<f16>, #blocked1>
triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>
triton_gpu.local_store %5, %B_LDS : tensor<128x256xf16, #blocked1> -> !tt.memdesc<128x256xf16, #shared1, #triton_gpu.shared_memory, mutable>
scf.yield %3 : tensor<256x256xf32, #mma>
@@ -74,11 +74,11 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
%c1 = arith.constant 1 : i32
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
%0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 {
%4 = tt.load %A_ptr : tensor<256x64x!tt.ptr<f16>, #blocked>
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x64xf16, #dotOp0>
%5 = tt.load %B_ptr : tensor<64x256x!tt.ptr<f16>, #blocked1>
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x256xf16, #dotOp1>
%3 = tt.dot %1, %2, %arg1 : tensor<256x64xf16, #dotOp0> * tensor<64x256xf16, #dotOp1> -> tensor<256x256xf32, #mma>
%4 = tt.load %A_ptr : tensor<256x64x!tt.ptr<f16>, #blocked>
%5 = tt.load %B_ptr : tensor<64x256x!tt.ptr<f16>, #blocked1>
triton_gpu.local_store %4, %A_LDS : tensor<256x64xf16, #blocked> -> !tt.memdesc<256x64xf16, #shared, #triton_gpu.shared_memory, mutable>
triton_gpu.local_store %5, %B_LDS : tensor<64x256xf16, #blocked1> -> !tt.memdesc<64x256xf16, #shared1, #triton_gpu.shared_memory, mutable>
scf.yield %3 : tensor<256x256xf32, #mma>
Expand All @@ -101,8 +101,8 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
// Should NOT apply: tile size 256x64x128 with single dot
// CHECK-LABEL: sink_2nd_load_256x64x128
// CHECK: %[[tileA:.*]] = tt.load
// CHECK-NEXT: %[[tileB:.*]] = tt.load
// CHECK-NEXT: local_load
// CHECK-NEXT: %[[tileB:.*]] = tt.load
// CHECK-NEXT: local_load
// CHECK-NEXT: tt.dot
// CHECK-NEXT: triton_gpu.local_store %[[tileA]]
Expand All @@ -113,11 +113,11 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
%c1 = arith.constant 1 : i32
%cst = arith.constant dense<0.000000e+00> : tensor<256x64xf32, #mma>
%0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x64xf32, #mma>) : i32 {
%4 = tt.load %A_ptr : tensor<256x128x!tt.ptr<f16>, #blocked>
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x128xf16, #dotOp0>
%5 = tt.load %B_ptr : tensor<128x64x!tt.ptr<f16>, #blocked1>
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x64xf16, #dotOp1>
%3 = tt.dot %1, %2, %arg1 : tensor<256x128xf16, #dotOp0> * tensor<128x64xf16, #dotOp1> -> tensor<256x64xf32, #mma>
%4 = tt.load %A_ptr : tensor<256x128x!tt.ptr<f16>, #blocked>
%5 = tt.load %B_ptr : tensor<128x64x!tt.ptr<f16>, #blocked1>
triton_gpu.local_store %4, %A_LDS : tensor<256x128xf16, #blocked> -> !tt.memdesc<256x128xf16, #shared, #triton_gpu.shared_memory, mutable>
triton_gpu.local_store %5, %B_LDS : tensor<128x64xf16, #blocked1> -> !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory, mutable>
scf.yield %3 : tensor<256x64xf32, #mma>
Expand All @@ -140,8 +140,8 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
// Should NOT apply: tile size 256x256x32 with single dot
// CHECK-LABEL: sink_2nd_load_256x256x32
// CHECK: %[[tileA:.*]] = tt.load
// CHECK-NEXT: %[[tileB:.*]] = tt.load
// CHECK-NEXT: local_load
// CHECK-NEXT: %[[tileB:.*]] = tt.load
// CHECK-NEXT: local_load
// CHECK-NEXT: tt.dot
// CHECK-NEXT: triton_gpu.local_store %[[tileA]]
Expand All @@ -152,11 +152,11 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
%c1 = arith.constant 1 : i32
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
%0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<256x256xf32, #mma>) : i32 {
%4 = tt.load %A_ptr : tensor<256x32x!tt.ptr<f16>, #blocked>
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<256x32xf16, #dotOp0>
%5 = tt.load %B_ptr : tensor<32x256x!tt.ptr<f16>, #blocked1>
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x256xf16, #dotOp1>
%3 = tt.dot %1, %2, %arg1 : tensor<256x32xf16, #dotOp0> * tensor<32x256xf16, #dotOp1> -> tensor<256x256xf32, #mma>
%4 = tt.load %A_ptr : tensor<256x32x!tt.ptr<f16>, #blocked>
%5 = tt.load %B_ptr : tensor<32x256x!tt.ptr<f16>, #blocked1>
triton_gpu.local_store %4, %A_LDS : tensor<256x32xf16, #blocked> -> !tt.memdesc<256x32xf16, #shared, #triton_gpu.shared_memory, mutable>
triton_gpu.local_store %5, %B_LDS : tensor<32x256xf16, #blocked1> -> !tt.memdesc<32x256xf16, #shared1, #triton_gpu.shared_memory, mutable>
scf.yield %3 : tensor<256x256xf32, #mma>
Expand All @@ -181,8 +181,8 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
// Should NOT apply: the 2nd load has a user before the dot
// CHECK-LABEL: sink_2nd_load_128x128x128_user_before_dot
// CHECK: %[[tileA:.*]] = tt.load
// CHECK-NEXT: %[[tileB:.*]] = tt.load
// CHECK-NEXT: local_load
// CHECK-NEXT: %[[tileB:.*]] = tt.load
// CHECK-NEXT: local_load
// CHECK-NEXT: tt.store
// CHECK-NEXT: tt.dot
Expand All @@ -193,10 +193,10 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
%c1 = arith.constant 1 : i32
%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
%0:1 = scf.for %arg0 = %c0 to %c1 step %c1 iter_args(%arg1 = %cst) -> (tensor<128x128xf32, #mma>) : i32 {
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp0>
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp1>
%4 = tt.load %A_ptr : tensor<128x128x!tt.ptr<f16>, #blocked>
%1 = triton_gpu.local_load %A_LDS : !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp0>
%5 = tt.load %B_ptr : tensor<128x128x!tt.ptr<i64>, #blocked>
%2 = triton_gpu.local_load %B_LDS : !tt.memdesc<128x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<128x128xf16, #dotOp1>
tt.store %B_ptr, %5 : tensor<128x128x!tt.ptr<i64>, #blocked>
%3 = tt.dot %1, %2, %arg1 : tensor<128x128xf16, #dotOp0> * tensor<128x128xf16, #dotOp1> -> tensor<128x128xf32, #mma>
triton_gpu.local_store %4, %A_LDS : tensor<128x128xf16, #blocked> -> !tt.memdesc<128x128xf16, #shared, #triton_gpu.shared_memory, mutable>
Expand All @@ -213,12 +213,12 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
// Category 3: two dots in the for loop. Make sure the optimization is not applied
// should NOT apply: two dots
// CHECK-LABEL: sink_2nd_load_256x256x64_two_dot
// CHECK: triton_gpu.local_load
// CHECK: tt.load
// CHECK-NEXT: tt.load
// CHECK-NEXT: triton_gpu.local_load
// CHECK-NEXT: triton_gpu.local_load
// CHECK-NEXT: tt.dot
// CHECK-NEXT: tt.dot
// CHECK-NEXT: tt.load
// CHECK-NEXT: tt.load
// CHECK-NEXT: triton_gpu.local_store
// CHECK-NEXT: triton_gpu.local_store
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
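
A note on the guarded cases above: besides the tile-size heuristics, the pass must not move a global load below one of its own users. In sink_2nd_load_128x128x128_user_before_dot the second load feeds a tt.store that sits before the tt.dot, so the load has to stay put and the schedule is left untouched; the relevant fragment, excerpted from the test above, is:

%5 = tt.load %B_ptr : tensor<128x128x!tt.ptr<i64>, #blocked>
tt.store %B_ptr, %5 : tensor<128x128x!tt.ptr<i64>, #blocked>    // user of %5 before the dot: %5 cannot be sunk past it
%3 = tt.dot %1, %2, %arg1 : tensor<128x128xf16, #dotOp0> * tensor<128x128xf16, #dotOp1> -> tensor<128x128xf32, #mma>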
4 changes: 2 additions & 2 deletions test/TritonGPU/loop-pipeline-hip.mlir
@@ -35,9 +35,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
%16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
// CHECK: triton_gpu.local_store
// CHECK: scf.for
// CHECK: tt.load
// CHECK: tt.dot
// CHECK: tt.dot
// CHECK: tt.load
// CHECK: triton_gpu.local_store
// CHECK: scf.yield
%17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 {
@@ -165,9 +165,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
// CHECK-LABEL: tt.func public @add_barrier_kernel
// CHECK: tt.load
// CHECK: scf.for
// CHECK: tt.load
// CHECK: gpu.barrier
// CHECK: tt.store
// CHECK: tt.load
// CHECK: scf.yield
// CHECK: gpu.barrier
// CHECK: tt.store
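
The loop-pipeline-hip.mlir checks above verify the overall shape of the pipelined loop rather than an exact schedule: a prologue stages the first tile into LDS, and each steady-state iteration loads the next tile from global memory, performs the dots, and re-stages the loaded tile with triton_gpu.local_store before yielding; the exact position of the in-loop tt.load relative to the dots is what the clustering change adjusts. The sketch below is illustrative only, with hypothetical value names, pointer advancement and the second operand's staging elided, and shapes loosely modeled on the 64x16 tiles in the test:

// Prologue: fetch tile 0 and stage it into LDS before entering the loop
// (%b_ptrs, %B_LDS, %a, %cst, and the constants are assumed defined above).
%t0 = tt.load %b_ptrs : tensor<64x16x!tt.ptr<f16>, #blocked>
triton_gpu.local_store %t0, %B_LDS : tensor<64x16xf16, #blocked> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable>
%res = scf.for %i = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%acc = %cst) -> (tensor<128x16xf32, #mma>) : i32 {
  %next = tt.load %b_ptrs : tensor<64x16x!tt.ptr<f16>, #blocked>    // prefetch the next tile (address update elided)
  %b = triton_gpu.local_load %B_LDS : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<64x16xf16, #dotOp1>
  %d = tt.dot %a, %b, %acc : tensor<128x64xf16, #dotOp0> * tensor<64x16xf16, #dotOp1> -> tensor<128x16xf32, #mma>    // %a is a loop-invariant operand defined above
  triton_gpu.local_store %next, %B_LDS : tensor<64x16xf16, #blocked> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable>    // stage the prefetched tile for the next iteration
  scf.yield %d : tensor<128x16xf32, #mma>
}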