Add fshr functionality and unit tests (#1217)

* Add fshr functionality and unit tests * Add space at EOF * Fix some format issues
google · Sep 5, 2023 · eda331e · eda331e
1 parent a0c7603
commit eda331e
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 0 deletions.
diff --git a/lib/ReplaceLLVMIntrinsicsPass.cpp b/lib/ReplaceLLVMIntrinsicsPass.cpp
@@ -75,6 +75,8 @@ bool clspv::ReplaceLLVMIntrinsicsPass::runOnFunction(Function &F) {
   switch (F.getIntrinsicID()) {
   case Intrinsic::bswap:
     return replaceBswap(F);
+  case Intrinsic::fshr:
+    return replaceFshr(F);
   case Intrinsic::fshl:
     return replaceFshl(F);
   case Intrinsic::copysign:
@@ -182,6 +184,46 @@ bool clspv::ReplaceLLVMIntrinsicsPass::replaceBswap(Function &F) {
   });
 }
 
+// Replaces calls to llvm.fshr(hi, lo, shift) with an equivalent shift/or
+// sequence computing the low <width> bits of ({hi:lo} >> (shift % width)).
+// The callback returns nullptr (no replacement value) for calls whose scalar
+// width is not 8, 16, 32 or 64 bits.
+bool clspv::ReplaceLLVMIntrinsicsPass::replaceFshr(Function &F) {
+  return replaceCallsWithValue(F, [](CallInst *call) {
+    auto arg_hi = call->getArgOperand(0);
+    auto arg_lo = call->getArgOperand(1);
+    auto arg_shift = call->getArgOperand(2);
+
+    // Validate argument types with correct sizes.
+    auto type = arg_hi->getType();
+    const auto width = type->getScalarSizeInBits();
+    if (width != 8 && width != 16 && width != 32 && width != 64) {
+      return static_cast<Value *>(nullptr);
+    }
+
+    IRBuilder<> builder(call);
+
+    // The shift amount is treated modulo the element size.
+    auto mod_mask = ConstantInt::get(type, width - 1);
+    auto lsb_shift = builder.CreateAnd(arg_shift, mod_mask);
+
+    // The high bits of the result are the low lsb_shift bits of the first
+    // argument: arg_hi << (width - lsb_shift). For lsb_shift == 0 that is a
+    // shift by the full width, which LLVM IR and SPIR-V leave undefined, and
+    // masking the amount down to 0 would wrongly OR all of arg_hi into the
+    // result. Shifting by 1 and then by (width - 1 - lsb_shift) keeps both
+    // amounts in range and gives zero high bits, so fshr(hi, lo, 0) == lo.
+    auto msb_shift = builder.CreateSub(mod_mask, lsb_shift);
+    auto hi_shl1 = builder.CreateShl(arg_hi, ConstantInt::get(type, 1));
+    auto hi_bits = builder.CreateShl(hi_shl1, msb_shift);
+
+    // The low bits of the result are the high bits of the second argument.
+    auto lo_bits = builder.CreateLShr(arg_lo, lsb_shift);
+
+    return builder.CreateOr(lo_bits, hi_bits);
+  });
+}
+
 bool clspv::ReplaceLLVMIntrinsicsPass::replaceFshl(Function &F) {
   return replaceCallsWithValue(F, [](CallInst *call) {
     auto arg_hi = call->getArgOperand(0);

diff --git a/lib/ReplaceLLVMIntrinsicsPass.h b/lib/ReplaceLLVMIntrinsicsPass.h
@@ -30,6 +30,7 @@ struct ReplaceLLVMIntrinsicsPass
   bool replaceMemcpy(llvm::Module &M);
   bool removeIntrinsicDeclaration(llvm::Function &F);
   bool replaceBswap(llvm::Function &F);
+  bool replaceFshr(llvm::Function &F);
   bool replaceFshl(llvm::Function &F);
   bool replaceCountZeroes(llvm::Function &F, bool leading);
   bool replaceCopysign(llvm::Function &F);

diff --git a/test/LLVMIntrinsics/fshr.ll b/test/LLVMIntrinsics/fshr.ll
@@ -0,0 +1,83 @@
+; RUN: clspv-opt %s -o %t.ll --passes=replace-llvm-intrinsics
+; RUN: FileCheck %s < %t.ll
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+define void @fshr_i8(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
+entry:
+  %result = call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c)
+  store i8 %result, ptr addrspace(1) %out
+  ret void
+}
+
+declare i8 @llvm.fshr.i8(i8, i8, i8)
+
+; CHECK-NOT: llvm.fshr
+; CHECK: [[and:%[0-9a-zA-Z_.]+]] = and i8 %c, 7
+; CHECK: [[sub:%[0-9a-zA-Z_.]+]] = sub i8 7, [[and]]
+; CHECK: [[shl1:%[0-9a-zA-Z_.]+]] = shl i8 %a, 1
+; CHECK: [[shl:%[0-9a-zA-Z_.]+]] = shl i8 [[shl1]], [[sub]]
+; CHECK: [[lshr:%[0-9a-zA-Z_.]+]] = lshr i8 %b, [[and]]
+; CHECK: [[or:%[0-9a-zA-Z_.]+]] = or i8 [[lshr]], [[shl]]
+; CHECK: store i8 [[or]], ptr addrspace(1) %out
+
+
+
+define void @fshr_i16(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
+entry:
+  %result = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %c)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
+
+declare i16 @llvm.fshr.i16(i16, i16, i16)
+
+; CHECK-NOT: llvm.fshr
+; CHECK: [[and:%[0-9a-zA-Z_.]+]] = and i16 %c, 15
+; CHECK: [[sub:%[0-9a-zA-Z_.]+]] = sub i16 15, [[and]]
+; CHECK: [[shl1:%[0-9a-zA-Z_.]+]] = shl i16 %a, 1
+; CHECK: [[shl:%[0-9a-zA-Z_.]+]] = shl i16 [[shl1]], [[sub]]
+; CHECK: [[lshr:%[0-9a-zA-Z_.]+]] = lshr i16 %b, [[and]]
+; CHECK: [[or:%[0-9a-zA-Z_.]+]] = or i16 [[lshr]], [[shl]]
+; CHECK: store i16 [[or]], ptr addrspace(1) %out
+
+
+
+define void @fshr_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+entry:
+  %result = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
+; CHECK-NOT: llvm.fshr
+; CHECK: [[and:%[0-9a-zA-Z_.]+]] = and i32 %c, 31
+; CHECK: [[sub:%[0-9a-zA-Z_.]+]] = sub i32 31, [[and]]
+; CHECK: [[shl1:%[0-9a-zA-Z_.]+]] = shl i32 %a, 1
+; CHECK: [[shl:%[0-9a-zA-Z_.]+]] = shl i32 [[shl1]], [[sub]]
+; CHECK: [[lshr:%[0-9a-zA-Z_.]+]] = lshr i32 %b, [[and]]
+; CHECK: [[or:%[0-9a-zA-Z_.]+]] = or i32 [[lshr]], [[shl]]
+; CHECK: store i32 [[or]], ptr addrspace(1) %out
+
+
+
+define void @fshr_i64(ptr addrspace(1) %out, i64 %a, i64 %b, i64 %c) {
+entry:
+  %result = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; CHECK-NOT: llvm.fshr
+; CHECK: [[and:%[0-9a-zA-Z_.]+]] = and i64 %c, 63
+; CHECK: [[sub:%[0-9a-zA-Z_.]+]] = sub i64 63, [[and]]
+; CHECK: [[shl1:%[0-9a-zA-Z_.]+]] = shl i64 %a, 1
+; CHECK: [[shl:%[0-9a-zA-Z_.]+]] = shl i64 [[shl1]], [[sub]]
+; CHECK: [[lshr:%[0-9a-zA-Z_.]+]] = lshr i64 %b, [[and]]
+; CHECK: [[or:%[0-9a-zA-Z_.]+]] = or i64 [[lshr]], [[shl]]
+; CHECK: store i64 [[or]], ptr addrspace(1) %out