[x86] bitReverse of a byte or half-word can probably be optimized #79794

Open · Validark opened this issue Jan 29, 2024 · 6 comments · May be fixed by #92236

@Validark commented Jan 29, 2024

Godbolt link: https://zig.godbolt.org/z/f6PTsr3nj

const std = @import("std");

inline fn pext(src: u64, mask: u64) u64 {
    return asm ("pext %[mask], %[src], %[ret]"
        : [ret] "=r" (-> u64),
        : [src] "r" (src),
          [mask] "r" (mask),
    );
}

const T = u8;

export fn byteReverseDefault(x: T) u64 {
    return @bitReverse(x);
}

export fn byteReversePext(x: u8) u64 {
    // Broadcast the byte to all 8 lanes, then extract one bit per lane in
    // reverse order (the mask bitcasts to 0x0102040810204080).
    return pext(@as(u64, x) * 0x0101010101010101, @as(u64, @bitCast(@as(@Vector(8, u8), @splat(1)) << ~std.simd.iota(u3, 8))));
}

export fn byteReverseVector(x: T) T {
    // Splat x across a vector, AND with one-bit-per-lane masks in reverse
    // order, and bitcast the per-lane equality results back to an integer.
    const vec = @as(@Vector(@bitSizeOf(T), T), @splat(x));
    const mask = @as(@Vector(@bitSizeOf(T), T), @splat(1)) << ~std.simd.iota(std.math.Log2Int(T), @bitSizeOf(T));
    return @bitCast((vec & mask) == mask);
}
byteReverseDefault:
        rol     dil, 4
        mov     eax, edi
        shr     dil, 2
        and     al, 51
        and     dil, 51
        shl     al, 2
        or      dil, al
        mov     eax, edi
        shr     dil
        and     al, 85
        and     dil, 85
        add     al, al
        or      dil, al
        movzx   eax, dil
        ret

byteReversePext:
        movabs  rcx, 72340172838076673
        mov     eax, edi
        imul    rcx, rax
        movabs  rax, 72624976668147840
        pext    rax, rcx, rax
        ret

.LCPI2_0:
        .byte   128
        .byte   64
        .byte   32
        .byte   16
        .byte   8
        .byte   4
        .byte   2
        .byte   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
byteReverseVector: ; the compiler will not give you this emit in godbolt
        vpbroadcastq    xmm1, qword ptr [rip + .LCPI2_0]
        vmovd   xmm0, edi
        vpbroadcastb    xmm0, xmm0
        vpand   xmm0, xmm0, xmm1
        vpcmpeqb        xmm0, xmm0, xmm1
        vpmovmskb       eax, xmm0
        ret
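
Why the multiply+PEXT version works: multiplying the byte by 0x0101010101010101 broadcasts it into all eight byte lanes, and PEXT then gathers one bit per lane, walking the mask 0x0102040810204080 from its lowest set bit upward, so lane 0 contributes bit 7, lane 1 contributes bit 6, and so on down to lane 7 contributing bit 0, which is exactly the reversed byte. A minimal C sketch using the BMI2 intrinsic instead of inline asm (the helper name is illustrative):

#include <immintrin.h> // BMI2: _pext_u64 (compile with -mbmi2)
#include <stdint.h>

static inline uint8_t bitreverse8_pext(uint8_t x) {
    // Broadcast the byte into all eight byte lanes.
    uint64_t spread = (uint64_t)x * 0x0101010101010101ULL;
    // Mask byte k holds the single bit (7 - k); PEXT packs the selected
    // bits from LSB to MSB, so result bit k = x bit (7 - k).
    return (uint8_t)_pext_u64(spread, 0x0102040810204080ULL);
}
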
@RKSimon (Collaborator) commented Jan 29, 2024

define dso_local i64 @byteReverseDefault(i8 zeroext %0) local_unnamed_addr {
Entry:
  %1 = tail call i8 @llvm.bitreverse.i8(i8 %0)
  %2 = zext i8 %1 to i64
  ret i64 %2
}
declare i8 @llvm.bitreverse.i8(i8) #1

define dso_local i64 @byteReversePext(i8 zeroext %0) local_unnamed_addr {
Entry:
  %1 = zext i8 %0 to i64
  %2 = mul nuw i64 %1, 72340172838076673
  %3 = tail call i64 asm "pext ${2}, ${1}, ${0}", "=r,r,r,~{dirflag},~{fpsr},~{flags}"(i64 %2, i64 72624976668147840) #4
  ret i64 %3
}

define dso_local zeroext i8 @byteReverseVector(i8 zeroext %0) local_unnamed_addr {
Entry:
  %1 = insertelement <1 x i8> poison, i8 %0, i64 0
  %2 = shufflevector <1 x i8> %1, <1 x i8> poison, <8 x i32> zeroinitializer
  %3 = and <8 x i8> %2, <i8 -128, i8 64, i8 32, i8 16, i8 8, i8 4, i8 2, i8 1>
  %4 = icmp ne <8 x i8> %3, zeroinitializer
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

@llvmbot (Collaborator) commented Jan 29, 2024

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

RKSimon self-assigned this May 14, 2024
RKSimon added a commit to RKSimon/llvm-project that referenced this issue May 15, 2024
…2 scalar types

By splitting each byte of a scalar type into 8 copies, in reverse order, we can test each copy for its bit being set and then use PMOVMSKB to pack the results back together to get the bit reversal.

So far I've only managed to make this worth it for i8/i16 with SSSE3 (PSHUFB), i32 with AVX2 (AVX1 would be worth it if we can force all 256-bit operations to be split), and i64 with AVX512BW (which can use VPTESTMB).

Fixes llvm#79794
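
For illustration, here is roughly what that lowering does, sketched with SSE intrinsics in C (illustrative helper names; the actual DAG lowering differs in detail):

#include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8; pulls in the SSE2 intrinsics too
#include <stdint.h>

// i8: splat the byte, test one bit per lane (MSB first, matching the
// .LCPI2_0 constant above), then PMOVMSKB packs the lane results.
static inline uint8_t bitreverse8_movmsk(uint8_t x) {
    __m128i v = _mm_set1_epi8((char)x);
    const __m128i bits = _mm_setr_epi8(
        (char)0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1,
        0, 0, 0, 0, 0, 0, 0, 0);
    __m128i eq = _mm_cmpeq_epi8(_mm_and_si128(v, bits), bits);
    return (uint8_t)_mm_movemask_epi8(eq); // result bit k = x bit (7 - k)
}

// i16: PSHUFB puts the high byte in lanes 0..7 and the low byte in lanes
// 8..15, so a single PMOVMSKB yields all 16 reversed bits.
static inline uint16_t bitreverse16_movmsk(uint16_t x) {
    __m128i v = _mm_shuffle_epi8(
        _mm_cvtsi32_si128(x),
        _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0));
    const __m128i bits = _mm_setr_epi8(
        (char)0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1,
        (char)0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1);
    __m128i eq = _mm_cmpeq_epi8(_mm_and_si128(v, bits), bits);
    return (uint16_t)_mm_movemask_epi8(eq);
}
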
@GnomedDev commented

Is this not a duplicate of #31158? Seems like quite a lot of measurements have been performed there.

@Validark (Author) commented

> Is this not a duplicate of #31158? Seems like quite a lot of measurements have been performed there.

I don't see that anyone else realized it could be done with a multiply + pext?

@RKSimon (Collaborator) commented May 17, 2024

It doesn't discuss the PMOVMSKB/PTESTM trick either.
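
With AVX512BW, VPTESTMB folds the AND/CMPEQ/PMOVMSKB sequence into a single test-into-mask instruction. A purely illustrative C sketch for i8 (the commit uses VPTESTMB for i64; AVX512VL is assumed here for the 128-bit form):

#include <immintrin.h> // AVX512BW + AVX512VL: _mm_test_epi8_mask
#include <stdint.h>

static inline uint8_t bitreverse8_vptestmb(uint8_t x) {
    __m128i v = _mm_set1_epi8((char)x);
    const __m128i bits = _mm_setr_epi8(
        (char)0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1,
        0, 0, 0, 0, 0, 0, 0, 0);
    // VPTESTMB: mask bit k = ((v & bits) != 0) for lane k; lanes 8..15 give 0.
    return (uint8_t)_mm_test_epi8_mask(v, bits);
}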

@Validark (Author) commented

> It doesn't discuss the PMOVMSKB/PTESTM trick either.

Guess I'm just too cool for school 😎

RKSimon added a commit to RKSimon/llvm-project that referenced this issue Jun 24, 2024
RKSimon added a commit to RKSimon/llvm-project that referenced this issue Jun 24, 2024
RKSimon added a commit to RKSimon/llvm-project that referenced this issue Jul 17, 2024