From 85b9826c34bbefb0bdd42697edfda7d5192c5d1e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 May 2024 12:14:07 +0100 Subject: [PATCH] [AMDGPU] Regenerate sad.ll test checks Improve test checks for better codegen review of #92576 --- llvm/test/CodeGen/AMDGPU/sad.ll | 369 +++++++++++++++++++++++++++----- 1 file changed, 310 insertions(+), 59 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 1b03065592956f..0492c5663e6660 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,8 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}v_sad_u32_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -16,9 +27,18 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, ret void } -; GCN-LABEL: {{^}}v_sad_u32_constant_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20 define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) { +; GCN-LABEL: v_sad_u32_constant_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x5a +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, 90 %t0 = select i1 %icmp0, i32 %a, i32 90 @@ -32,9 +52,19 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ret void } -; GCN-LABEL: {{^}}v_sad_u32_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -46,12 +76,28 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat1: -; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_max_u32 s3, s0, s1 +; GCN-NEXT: s_min_u32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s3, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -66,9 +112,25 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_add_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -82,9 +144,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_max_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_max_u32 s3, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b store volatile i32 %t0, ptr addrspace(5) undef @@ -99,9 +179,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_min_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_min_u32 s3, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -117,9 +215,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s3, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b store volatile i32 %sub0, ptr addrspace(5) undef @@ -132,11 +248,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: -; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_select_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s3, s0, s1 +; GCN-NEXT: s_sub_i32 s6, s1, s0 +; GCN-NEXT: s_cmp_gt_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s3, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -149,12 +283,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_sad_u32_vector_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; GCN-LABEL: v_sad_u32_vector_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_sad_u32 v3, s11, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_sad_u32 v2, s10, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_sad_u32 v1, s9, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_sad_u32 v0, s8, v0, v4 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt <4 x i32> %a, %b %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b @@ -168,12 +319,29 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 ret void } -; GCN-LABEL: {{^}}v_sad_u32_vector_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; GCN-LABEL: v_sad_u32_vector_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_sad_u32 v3, s11, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_sad_u32 v2, s10, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_sad_u32 v1, s9, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_sad_u32 v0, s8, v0, v4 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt <4 x i32> %a, %b %sub0 = sub <4 x i32> %a, %b %sub1 = sub <4 x i32> %b, %a @@ -185,10 +353,22 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32 ret void } -; GCN-LABEL: {{^}}v_sad_u32_i16_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) { - +; GCN-LABEL: v_sad_u32_i16_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s4, s6, 0xffff +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_sad_u32 v2, s4, v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i16 %a, %b %t0 = select i1 %icmp0, i16 %a, i16 %b @@ -202,9 +382,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ret void } -; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { +; GCN-LABEL: v_sad_u32_i16_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: flat_load_ushort v1, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_load_ushort v2, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sad_u32 v2, v0, v1, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm %a = load volatile i16, ptr addrspace(1) undef %b = load volatile i16, ptr addrspace(1) undef %c = load volatile i16, ptr addrspace(1) undef @@ -219,9 +412,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ret void } -; GCN-LABEL: {{^}}v_sad_u32_i8_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) { +; GCN-LABEL: v_sad_u32_i8_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s3, s2, 0xff +; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s3, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i8 %a, %b %t0 = select i1 %icmp0, i8 %a, i8 %b @@ -235,9 +441,22 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b ret void } -; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { +; GCN-LABEL: v_sad_u32_i8_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sad_u32 v2, v0, v1, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm %a = load volatile i8, ptr addrspace(1) undef %b = load volatile i8, ptr addrspace(1) undef %c = load volatile i8, ptr addrspace(1) undef @@ -252,15 +471,26 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ret void } -; GCN-LABEL: {{^}}s_sad_u32_i8_pat2: -; GCN: s_load_dword -; GCN-DAG: s_bfe_u32 -; GCN-DAG: s_sub_i32 -; GCN-DAG: s_and_b32 -; GCN-DAG: s_sub_i32 -; GCN-DAG: s_lshr_b32 -; GCN: s_add_i32 define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { +; GCN-LABEL: s_sad_u32_i8_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s4, s2, 8 +; GCN-NEXT: s_and_b32 s3, s2, 0xff +; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GCN-NEXT: s_lshr_b32 s6, s2, 16 +; GCN-NEXT: s_sub_i32 s7, s2, s4 +; GCN-NEXT: s_sub_i32 s2, s4, s2 +; GCN-NEXT: s_cmp_gt_u32 s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s7, s2 +; GCN-NEXT: s_add_i32 s2, s2, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i8 %a, %b %sub0 = sub i8 %a, %b %sub1 = sub i8 %b, %a @@ -272,12 +502,22 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % ret void } -; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1: -; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { +; GCN-LABEL: v_sad_u32_mismatched_operands_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_max_u32 s6, s0, s1 +; GCN-NEXT: s_cmp_le_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s0, s3 +; GCN-NEXT: s_sub_i32 s0, s6, s0 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -291,11 +531,22 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2: -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { +; GCN-LABEL: v_sad_u32_mismatched_operands_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s3, s0, s3 +; GCN-NEXT: s_sub_i32 s6, s1, s0 +; GCN-NEXT: s_cmp_lt_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s0, s3, s6 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %d %sub1 = sub i32 %b, %a