-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[x86] bitReverse of a byte or half-word can probably be optimized #79794
Comments
define dso_local i64 @byteReverseDefault(i8 zeroext %0) local_unnamed_addr {
Entry:
%1 = tail call i8 @llvm.bitreverse.i8(i8 %0)
%2 = zext i8 %1 to i64
ret i64 %2
}
declare i8 @llvm.bitreverse.i8(i8) #1
define dso_local i64 @byteReversePext(i8 zeroext %0) local_unnamed_addr {
Entry:
%1 = zext i8 %0 to i64
%2 = mul nuw i64 %1, 72340172838076673
%3 = tail call i64 asm "pext ${2}, ${1}, ${0}", "=r,r,r,~{dirflag},~{fpsr},~{flags}"(i64 %2, i64 72624976668147840) #4
ret i64 %3
}
define dso_local zeroext i8 @byteReverseVector(i8 zeroext %0) local_unnamed_addr {
Entry:
%1 = insertelement <1 x i8> poison, i8 %0, i64 0
%2 = shufflevector <1 x i8> %1, <1 x i8> poison, <8 x i32> zeroinitializer
%3 = and <8 x i8> %2, <i8 -128, i8 64, i8 32, i8 16, i8 8, i8 4, i8 2, i8 1>
%4 = icmp ne <8 x i8> %3, zeroinitializer
%5 = bitcast <8 x i1> %4 to i8
ret i8 %5
} |
@llvm/issue-subscribers-backend-x86 Author: Niles Salter (Validark)
[Godbolt link](https://zig.godbolt.org/z/f6PTsr3nj)
const std = @<!-- -->import("std");
inline fn pext(src: u64, mask: u64) u64 {
return asm ("pext %[mask], %[src], %[ret]"
: [ret] "=r" (-> u64),
: [src] "r" (src),
[mask] "r" (mask),
);
}
const T = u8;
export fn byteReverseDefault(x: T) u64 {
return @<!-- -->bitReverse(x);
}
export fn byteReversePext(x: u8) u64 {
return pext(@<!-- -->as(u64, x) * 0x0101010101010101, @<!-- -->as(u64, @<!-- -->bitCast(@<!-- -->as(@<!-- -->Vector(8, u8), @<!-- -->splat(1)) << ~std.simd.iota(u3, 8))));
}
export fn byteReverseVector(x: T) T {
const vec = @<!-- -->as(@<!-- -->Vector(@<!-- -->bitSizeOf(T), T), @<!-- -->splat(x));
const mask = @<!-- -->as(@<!-- -->Vector(@<!-- -->bitSizeOf(T), T), @<!-- -->splat(1)) << ~std.simd.iota(std.math.Log2Int(T), @<!-- -->bitSizeOf(T));
return @<!-- -->bitCast((vec & mask) == mask);
} byteReverseDefault:
rol dil, 4
mov eax, edi
shr dil, 2
and al, 51
and dil, 51
shl al, 2
or dil, al
mov eax, edi
shr dil
and al, 85
and dil, 85
add al, al
or dil, al
movzx eax, dil
ret
byteReversePext:
movabs rcx, 72340172838076673
mov eax, edi
imul rcx, rax
movabs rax, 72624976668147840
pext rax, rcx, rax
ret
.LCPI2_0:
.byte 128
.byte 64
.byte 32
.byte 16
.byte 8
.byte 4
.byte 2
.byte 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
byteReverseVector: ; the compiler will not give you this emit in godbolt
vpbroadcastq xmm1, qword ptr [rip + .LCPI2_0]
vmovd xmm0, edi
vpbroadcastb xmm0, xmm0
vpand xmm0, xmm0, xmm1
vpcmpeqb xmm0, xmm0, xmm1
vpmovmskb eax, xmm0
ret |
…2 scalar types By splitting each byte of a scalar type into 8 copies, in reverse order, we can then test each one to see if the bit is set and then use PMOVMSKB to pack them back together to get the bit reversal. So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB). Fixes llvm#79794
Is this not a duplicate of #31158? Seems like quite a lot of measurements have been performed there. |
I don't see that anyone else realized it could be done with a multiply + pext? |
It doesn't discuss the PMOVMSKB/PTESTM trick either. |
Guess I'm just too cool for school 😎 |
…2 scalar types By splitting each byte of a scalar type into 8 copies, in reverse order, we can then test each one to see if the bit is set and then use PMOVMSKB to pack them back together to get the bit reversal. So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB). Fixes llvm#79794
…2 scalar types By splitting each byte of a scalar type into 8 copies, in reverse order, we can then test each one to see if the bit is set and then use PMOVMSKB to pack them back together to get the bit reversal. So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB). Fixes llvm#79794
…2 scalar types By splitting each byte of a scalar type into 8 copies, in reverse order, we can then test each one to see if the bit is set and then use PMOVMSKB to pack them back together to get the bit reversal. So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB). Fixes llvm#79794
Godbolt link
The text was updated successfully, but these errors were encountered: