From ed0bcdaf3b262f51caa85a2596e6d7906f8070a7 Mon Sep 17 00:00:00 2001 From: RyuaNerin Date: Fri, 18 Aug 2023 18:02:20 +0900 Subject: [PATCH] func(dst, arg...) --- aria/avo/x86/main.go | 6 +- avoutil/mov.go | 6 +- avoutil/simd/simd_avx2.go | 36 +- avoutil/simd/simd_sse2.go | 319 +- avoutil/simd/simd_ssse3.go | 7 +- lsh/x86/avx2/avx2.go | 23 +- lsh/x86/sse2/sse2.go | 52 +- lsh/x86/ssse3/ssse3.go | 56 +- lsh256/avo/x86/lsh256avx2/lsh256_avx2.go | 64 +- lsh256/avo/x86/lsh256sse2/lsh256_sse2.go | 168 +- lsh256/avo/x86/lsh256ssse3/lsh256_ssse3.go | 150 +- lsh256/lsh256_amd64.s | 3896 ++++++------ lsh512/avo/x86/lsh512avx2/lsh512_avx2.go | 59 +- lsh512/avo/x86/lsh512sse2/lsh512_sse2.go | 170 +- lsh512/avo/x86/lsh512ssse3/lsh512_ssse3.go | 153 +- lsh512/lsh512_amd64.s | 6352 ++++++++++---------- 16 files changed, 5515 insertions(+), 6002 deletions(-) diff --git a/aria/avo/x86/main.go b/aria/avo/x86/main.go index df0e616..df70817 100644 --- a/aria/avo/x86/main.go +++ b/aria/avo/x86/main.go @@ -29,11 +29,13 @@ func processFinSSE2() { dst[j] = rk[j] ^ t[j] } */ + tmp := XMM() F_mm_storeu_si128( dst, F_mm_xor_si128( - A_mm_loadu_si128(rk), - A_mm_loadu_si128(t), + tmp, + F_mm_loadu_si128(tmp, rk), + F_mm_loadu_si128(XMM(), t), ), ) diff --git a/avoutil/mov.go b/avoutil/mov.go index 4dcb1b9..c7118ed 100644 --- a/avoutil/mov.go +++ b/avoutil/mov.go @@ -33,8 +33,9 @@ func isAligned(alignedByte int, args ...Op) bool { return true } -func VMOVDQ_autoAU2(dst, src Op) { +func VMOVDQ_autoAU2(dst, src Op) Op { VMOVDQ_autoAU(src, dst) + return dst } func VMOVDQ_autoAU(mxy, mxy1 Op) { if isAligned(YmmSize, mxy, mxy1) { @@ -68,8 +69,9 @@ func VMOVDQ_autoAU(mxy, mxy1 Op) { } } -func MOVO_autoAU2(dst, src Op) { +func MOVO_autoAU2(dst, src Op) Op { MOVO_autoAU(src, dst) + return dst } func MOVO_autoAU(mx, mx1 Op) { if isAligned(XmmSize, mx, mx1) { diff --git a/avoutil/simd/simd_avx2.go b/avoutil/simd/simd_avx2.go index b14a79b..57f1697 100644 --- a/avoutil/simd/simd_avx2.go +++ b/avoutil/simd/simd_avx2.go @@ -3,6 +3,7 @@ package simd import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" . 
"kryptosimd/avoutil" ) @@ -24,8 +25,8 @@ Operation dst[255:0] := MEM[mem_addr+255:mem_addr] dst[MAX:256] := 0 */ -func F_mm256_loadu_si256(dst, src Op) Op { - VMOVDQ_autoAU(src, dst) +func F_mm256_loadu_si256(dst VecVirtual, src Op) VecVirtual { + VMOVDQ_autoAU2(dst, src) return dst } @@ -45,9 +46,8 @@ Operation MEM[mem_addr+255:mem_addr] := a[255:0] */ -func F_mm256_storeu_si256(dst, src Op) Op { - VMOVDQ_autoAU(src, dst) - return dst +func F_mm256_storeu_si256(dst, src Op) { + VMOVDQ_autoAU2(dst, src) } /* @@ -67,7 +67,7 @@ Operation dst[255:0] := (a[255:0] XOR b[255:0]) dst[MAX:256] := 0 */ -func F_mm256_xor_si256(dst, a, b Op) Op { +func F_mm256_xor_si256(dst VecVirtual, a, b Op) VecVirtual { CheckType( ` // VPXOR m256 ymm ymm @@ -99,7 +99,7 @@ Operation dst[255:0] := (a[255:0] OR b[255:0]) dst[MAX:256] := 0 */ -func F_mm256_or_si256(dst, a, b Op) Op { +func F_mm256_or_si256(dst VecVirtual, a, b Op) VecVirtual { CheckType( ` // VPOR m256 ymm ymm @@ -131,7 +131,7 @@ Operation dst[255:0] := (a[255:0] AND b[255:0]) dst[MAX:256] := 0 */ -func F_mm256_and_si256(dst, a, b Op) Op { +func F_mm256_and_si256(dst VecVirtual, a, b Op) VecVirtual { CheckType( ` // VPAND m256 ymm ymm @@ -177,7 +177,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_shuffle_epi8(dst, x, y Op) Op { +func F_mm256_shuffle_epi8(dst VecVirtual, x, y Op) VecVirtual { CheckType( ` // VPSHUFB m256 ymm ymm @@ -220,7 +220,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_add_epi32(dst, a, b Op) Op { +func F_mm256_add_epi32(dst VecVirtual, a, b Op) VecVirtual { CheckType( ` // VPADDD m256 ymm ymm @@ -267,7 +267,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_slli_epi32(dst, a, imm8 Op) Op { +func F_mm256_slli_epi32(dst VecVirtual, a, imm8 Op) VecVirtual { CheckType( ` // VPSLLD imm8 ymm ymm @@ -326,7 +326,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_srli_epi32(dst, a, imm8 Op) Op { +func F_mm256_srli_epi32(dst VecVirtual, a, imm8 Op) VecVirtual { CheckType( ` // VPSRLD imm8 ymm ymm @@ -394,7 +394,7 @@ Operation dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[MAX:256] := 0 */ -func F_mm256_shuffle_epi32(dst, a, imm8 Op) Op { +func F_mm256_shuffle_epi32(dst VecVirtual, a, imm8 Op) VecVirtual { CheckType( ` // VPSHUFD imm8 m256 ymm @@ -447,7 +447,7 @@ Operation dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) dst[MAX:256] := 0 */ -func F_mm256_permute2x128_si256(dst, a, b, imm8 Op) Op { +func F_mm256_permute2x128_si256(dst VecVirtual, a, b, imm8 Op) VecVirtual { CheckType( ` // VPERM2I128 imm8 m256 ymm ymm @@ -481,7 +481,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_add_epi64(dst, x, y Op) Op { +func F_mm256_add_epi64(dst VecVirtual, x, y Op) VecVirtual { CheckType( ` // VPADDQ m256 ymm ymm @@ -529,7 +529,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_slli_epi64(dst, x, r Op) Op { +func F_mm256_slli_epi64(dst VecVirtual, x, r Op) VecVirtual { CheckType( ` // VPSLLQ imm8 ymm ymm @@ -589,7 +589,7 @@ Operation ENDFOR dst[MAX:256] := 0 */ -func F_mm256_srli_epi64(dst, x, r Op) Op { +func F_mm256_srli_epi64(dst VecVirtual, x, r Op) VecVirtual { CheckType( ` // VPSRLQ imm8 ymm ymm @@ -654,7 +654,7 @@ Operation dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0 */ -func F_mm256_permute4x64_epi64(dst, a, imm8 Op) Op { +func F_mm256_permute4x64_epi64(dst VecVirtual, a, imm8 Op) VecVirtual { CheckType( ` // VPERMQ imm8 m256 ymm diff --git a/avoutil/simd/simd_sse2.go b/avoutil/simd/simd_sse2.go index 01a9c90..38d3f0f 100644 --- a/avoutil/simd/simd_sse2.go +++ 
b/avoutil/simd/simd_sse2.go @@ -24,8 +24,8 @@ Operation dst[127:0] := MEM[mem_addr+127:mem_addr] */ -func F_mm_loadu_si128(dst Op, src Op) Op { - MOVO_autoAU(src, dst) +func F_mm_loadu_si128(dst VecVirtual, src Op) VecVirtual { + MOVO_autoAU2(dst, src) return dst } @@ -45,9 +45,8 @@ Operation MEM[mem_addr+127:mem_addr] := a[127:0] */ -func F_mm_storeu_si128(dst, src Op) Op { - MOVO_autoAU(src, dst) - return dst +func F_mm_storeu_si128(dst, src Op) { + MOVO_autoAU2(dst, src) } /* @@ -66,16 +65,39 @@ Operation dst[127:0] := (a[127:0] XOR b[127:0]) */ -func F_mm_xor_si128(dst Op, src Op) Op { - CheckType( - ` - // PXOR m128 xmm - // PXOR xmm xmm - `, - src, dst, - ) +func F_mm_xor_si128(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: + CheckType( + ` + // PXOR m128 xmm + // PXOR xmm xmm + `, + b, dst, + ) + PXOR(b, dst) + case dst == b: + CheckType( + ` + // PXOR m128 xmm + // PXOR xmm xmm + `, + a, dst, + ) + PXOR(a, dst) + default: + CheckType( + ` + // PXOR m128 xmm + // PXOR xmm xmm + `, + b, dst, + ) + + MOVO_autoAU2(dst, a) + PXOR(b, dst) + } - PXOR(src, dst) return dst } @@ -95,16 +117,38 @@ Operation dst[127:0] := (a[127:0] OR b[127:0]) */ -func F_mm_or_si128(dst Op, src Op) Op { - CheckType( - ` - // POR m128 xmm - // POR xmm xmm - `, - src, dst, - ) +func F_mm_or_si128(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: + CheckType( + ` + // POR m128 xmm + // POR xmm xmm + `, + b, dst, + ) + POR(b, dst) + case dst == b: + CheckType( + ` + // POR m128 xmm + // POR xmm xmm + `, + a, dst, + ) + POR(a, dst) + default: + CheckType( + ` + // POR m128 xmm + // POR xmm xmm + `, + b, dst, + ) - POR(src, dst) + MOVO_autoAU2(dst, a) + POR(b, dst) + } return dst } @@ -124,16 +168,38 @@ Operation dst[127:0] := (a[127:0] AND b[127:0]) */ -func F_mm_and_si128(dst Op, src Op) Op { - CheckType( - ` - // PAND m128 xmm - // PAND xmm xmm - `, - src, dst, - ) +func F_mm_and_si128(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: + CheckType( + ` + // PAND m128 xmm + // PAND xmm xmm + `, + b, dst, + ) + PAND(b, dst) + case dst == b: + CheckType( + ` + // PAND m128 xmm + // PAND xmm xmm + `, + a, dst, + ) + PAND(a, dst) + default: + CheckType( + ` + // PAND m128 xmm + // PAND xmm xmm + `, + b, dst, + ) - PAND(src, dst) + MOVO_autoAU2(dst, a) + PAND(b, dst) + } return dst } @@ -156,16 +222,38 @@ Operation dst[i+31:i] := a[i+31:i] + b[i+31:i] ENDFOR */ -func F_mm_add_epi32(dst VecVirtual, src Op) Op { - CheckType( - ` - // PADDD m128 xmm - // PADDD xmm xmm - `, - src, dst, - ) +func F_mm_add_epi32(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: + CheckType( + ` + // PADDD m128 xmm + // PADDD xmm xmm + `, + b, dst, + ) + PADDD(b, dst) + case dst == b: + CheckType( + ` + // PADDD m128 xmm + // PADDD xmm xmm + `, + a, dst, + ) + PADDD(a, dst) + default: + CheckType( + ` + // PADDD m128 xmm + // PADDD xmm xmm + `, + b, dst, + ) - PADDD(src, dst) + MOVO_autoAU2(dst, a) + PADDD(b, dst) + } return dst } @@ -192,17 +280,21 @@ Operation FI ENDFOR */ -func F_mm_slli_epi32(dst VecVirtual, r Op) VecVirtual { +func F_mm_slli_epi32(dst VecVirtual, a, imm8 Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) + } + CheckType( ` // PSLLL imm8 xmm // PSLLL m128 xmm // PSLLL xmm xmm `, - r, dst, + imm8, dst, ) - PSLLL(r, dst) + PSLLL(imm8, dst) return dst } @@ -229,17 +321,21 @@ Operation FI ENDFOR */ -func F_mm_srli_epi32(dst VecVirtual, r Op) VecVirtual { +func F_mm_srli_epi32(dst VecVirtual, a, imm8 Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) 
+ } + CheckType( ` // PSRLL imm8 xmm // PSRLL m128 xmm // PSRLL xmm xmm `, - r, dst, + imm8, dst, ) - PSRLL(r, dst) + PSRLL(imm8, dst) return dst } @@ -271,16 +367,20 @@ Operation dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) */ -func F_mm_shuffle_epi32(dst VecVirtual, x, i Op) VecVirtual { +func F_mm_shuffle_epi32(dst VecVirtual, a, imm8 Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) + } + CheckType( ` // PSHUFD imm8 m128 xmm // PSHUFD imm8 xmm xmm `, - i, x, dst, + imm8, a, dst, ) - PSHUFD(i, x, dst) + PSHUFD(imm8, a, dst) return dst } @@ -306,8 +406,9 @@ Operation } dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) */ -func F_mm_unpacklo_epi64(dst, a, b Op) Op { - if dst == a { +func F_mm_unpacklo_epi64(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: CheckType( ` // PUNPCKLQDQ m128 xmm @@ -317,7 +418,8 @@ func F_mm_unpacklo_epi64(dst, a, b Op) Op { ) PUNPCKLQDQ(b, dst) - } else if dst == b { + + case dst == b: CheckType( ` // PUNPCKLQDQ m128 xmm @@ -330,7 +432,8 @@ func F_mm_unpacklo_epi64(dst, a, b Op) Op { MOVO_autoAU2(tmp, b) MOVO_autoAU2(dst, a) PUNPCKLQDQ(tmp, dst) - } else { + + default: CheckType( ` // PUNPCKLQDQ m128 xmm @@ -368,8 +471,9 @@ Operation } dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) */ -func F_mm_unpackhi_epi64(dst, a, b Op) Op { - if dst == a { +func F_mm_unpackhi_epi64(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: CheckType( ` // PUNPCKHQDQ m128 xmm @@ -379,7 +483,8 @@ func F_mm_unpackhi_epi64(dst, a, b Op) Op { ) PUNPCKHQDQ(b, dst) - } else if dst == b { + + case dst == b: CheckType( ` // PUNPCKHQDQ m128 xmm @@ -392,7 +497,8 @@ func F_mm_unpackhi_epi64(dst, a, b Op) Op { MOVO_autoAU2(tmp, b) MOVO_autoAU2(dst, a) PUNPCKHQDQ(tmp, dst) - } else { + + default: CheckType( ` // PUNPCKHQDQ m128 xmm @@ -404,6 +510,7 @@ func F_mm_unpackhi_epi64(dst, a, b Op) Op { MOVO_autoAU2(dst, a) PUNPCKHQDQ(b, dst) } + return dst } @@ -427,16 +534,41 @@ Operation dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR */ -func F_mm_add_epi64(dst Op, b Op) Op { - CheckType( - ` - // PADDQ m128 xmm - // PADDQ xmm xmm - `, - b, dst, - ) +func F_mm_add_epi64(dst VecVirtual, a, b Op) VecVirtual { + switch { + case dst == a: + CheckType( + ` + // PADDQ m128 xmm + // PADDQ xmm xmm + `, + b, dst, + ) + + PADDQ(b, dst) + + case dst == b: + CheckType( + ` + // PADDQ m128 xmm + // PADDQ xmm xmm + `, + a, dst, + ) + PADDQ(a, dst) - PADDQ(b, dst) + default: + CheckType( + ` + // PADDQ m128 xmm + // PADDQ xmm xmm + `, + b, dst, + ) + + MOVO_autoAU2(dst, a) + PADDQ(b, dst) + } return dst } @@ -464,7 +596,11 @@ Operation FI ENDFOR */ -func F_mm_srli_epi64(dst, imm8 Op) Op { +func F_mm_srli_epi64(dst VecVirtual, a, imm8 Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) + } + CheckType( ` // PSRLQ imm8 xmm @@ -502,7 +638,11 @@ Operation FI ENDFOR */ -func F_mm_slli_epi64(dst, imm8 Op) Op { +func F_mm_slli_epi64(dst VecVirtual, a, imm8 Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) + } + CheckType( ` // PSLLQ imm8 xmm @@ -544,5 +684,42 @@ func F_mm_load_si128(dst VecVirtual, src Mem) VecVirtual { return dst } -func A_mm_loadu_si128(src Op) Op { return F_mm_loadu_si128(XMM(), src) } -func A_mm_load_si128(src Mem) Op { return F_mm_load_si128(XMM(), src) } +/* +* +Synopsis + + __m128i _mm_srli_epi16 (__m128i a, int imm8) + #include + Instruction: psrlw xmm, imm8 + CPUID Flags: SSE2 + +Description + + Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
+ +Operation + + FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ENDFOR +*/ +func F_mm_srli_epi16(dst VecVirtual, a, imm8 Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) + } + + CheckType( + ` + // PSRLW imm8 xmm + `, + imm8, dst, + ) + + PSRLW(imm8, dst) + return dst +} diff --git a/avoutil/simd/simd_ssse3.go b/avoutil/simd/simd_ssse3.go index 1c366ae..c8f32cb 100644 --- a/avoutil/simd/simd_ssse3.go +++ b/avoutil/simd/simd_ssse3.go @@ -5,6 +5,7 @@ import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" ) /** @@ -29,7 +30,11 @@ Operation dst = a */ -func F_mm_shuffle_epi8(dst, b Op) Op { +func F_mm_shuffle_epi8(dst VecVirtual, a, b Op) VecVirtual { + if dst != a { + MOVO_autoAU2(dst, a) + } + CheckType( ` // PSHUFB m128 xmm diff --git a/lsh/x86/avx2/avx2.go b/lsh/x86/avx2/avx2.go index c17775e..abefdc8 100644 --- a/lsh/x86/avx2/avx2.go +++ b/lsh/x86/avx2/avx2.go @@ -2,6 +2,7 @@ package avx2 import ( . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" . "kryptosimd/avoutil/simd" ) @@ -11,37 +12,37 @@ import ( * -------------------------------------------------------- */ // #define LOAD(x) _mm256_loadu_si256((__m256i*)x) -func LOAD(dst, src Op) Op { return F_mm256_loadu_si256(dst, src) } +func LOAD(dst VecVirtual, src Op) Op { return F_mm256_loadu_si256(dst, src) } // #define STORE(x,y) _mm256_storeu_si256((__m256i*)x, y) func STORE(dst, src Op) { F_mm256_storeu_si256(dst, src) } // #define XOR(x,y) _mm256_xor_si256(x,y) -func XOR(dst, x, y Op) Op { return F_mm256_xor_si256(dst, x, y) } +func XOR(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_xor_si256(dst, x, y) } // #define OR(x,y) _mm256_or_si256(x,y) -func OR(dst, x, y Op) Op { return F_mm256_or_si256(dst, x, y) } +func OR(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_or_si256(dst, x, y) } // #define AND(x,y) _mm256_and_si256(x,y) -func AND(dst, x, y Op) Op { return F_mm256_and_si256(dst, x, y) } +func AND(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_and_si256(dst, x, y) } // #define SHUFFLE8(x,y) _mm256_shuffle_epi8(x,y) -func SHUFFLE8(dst, x, y Op) Op { return F_mm256_shuffle_epi8(dst, x, y) } +func SHUFFLE8(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_shuffle_epi8(dst, x, y) } // #define ADD32(x,y) _mm256_add_epi32(x,y) -func ADD32(dst, x, y Op) Op { return F_mm256_add_epi32(dst, x, y) } +func ADD32(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_add_epi32(dst, x, y) } // #define SHIFT_L32(x,r) _mm256_slli_epi32(x,r) -func SHIFT_L32(dst, x, y Op) Op { return F_mm256_slli_epi32(dst, x, y) } +func SHIFT_L32(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_slli_epi32(dst, x, y) } // #define SHIFT_R32(x,r) _mm256_srli_epi32(x,r) -func SHIFT_R32(dst, x, y Op) Op { return F_mm256_srli_epi32(dst, x, y) } +func SHIFT_R32(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_srli_epi32(dst, x, y) } // #define ADD(x,y) F_mm256_add_epi64(x,y) -func ADD64(dst, x, y Op) Op { return F_mm256_add_epi64(dst, x, y) } +func ADD64(dst VecVirtual, x, y Op) VecVirtual { return F_mm256_add_epi64(dst, x, y) } // #define SHIFT_L(x,r) _mm256_slli_epi64(x,r) -func SHIFT_L64(dst, x, r Op) Op { return F_mm256_slli_epi64(dst, x, r) } +func SHIFT_L64(dst VecVirtual, x, r Op) VecVirtual { return F_mm256_slli_epi64(dst, x, r) } // #define SHIFT_R(x,r) _mm256_srli_epi64(x,r) -func SHIFT_R64(dst, x, r Op) Op { return F_mm256_srli_epi64(dst, x, r) } 
+func SHIFT_R64(dst VecVirtual, x, r Op) VecVirtual { return F_mm256_srli_epi64(dst, x, r) } diff --git a/lsh/x86/sse2/sse2.go b/lsh/x86/sse2/sse2.go index df2bc8b..a7e5626 100644 --- a/lsh/x86/sse2/sse2.go +++ b/lsh/x86/sse2/sse2.go @@ -4,7 +4,6 @@ import ( . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" - . "kryptosimd/avoutil" . "kryptosimd/avoutil/simd" ) @@ -16,56 +15,25 @@ import ( /* -------------------------------------------------------- */ // #define LOAD(x) _mm_loadu_si128((__m128i*)x) -func LOAD(dst, x Op) Op { return F_mm_loadu_si128(dst, x) } +func LOAD(dst VecVirtual, x Op) Op { return F_mm_loadu_si128(dst, x) } // #define STORE(x,y) _mm_storeu_si128((__m128i*)x, y) func STORE(dst, y Op) { F_mm_storeu_si128(dst, y) } // #define XOR(x,y) _mm_xor_si128(x,y) -func XOR(dst Op, y Op) Op { return F_mm_xor_si128(dst, y) } +func XOR(dst VecVirtual, x, y Op) VecVirtual { return F_mm_xor_si128(dst, x, y) } // #define OR(x,y) _mm_or_si128(x,y) -func OR(dst Op, y Op) Op { return F_mm_or_si128(dst, y) } +func OR(dst VecVirtual, x, y Op) VecVirtual { return F_mm_or_si128(dst, x, y) } // #define AND(x,y) _mm_and_si128(x,y) -func AND(dst Op, y Op) Op { return F_mm_and_si128(dst, y) } +func AND(dst VecVirtual, x, y Op) VecVirtual { return F_mm_and_si128(dst, x, y) } -// #define ADD(x,y) _mm_add_epi32(x,y) -func ADD32(dst VecVirtual, y Op) Op { return F_mm_add_epi32(dst, y) } -func ADD32_(dst VecVirtual, a, b Op) Op { - if dst == a { - ADD32(dst, b) - } else if dst == b { - ADD32(dst, a) - } else { - MOVO_autoAU2(dst, a) - ADD32(dst, b) - } - return dst -} +func ADD32(dst VecVirtual, x, y Op) VecVirtual { return F_mm_add_epi32(dst, x, y) } // #define ADD(x,y) _mm_add_epi32(x,y) +func ADD64(dst VecVirtual, x, y Op) VecVirtual { return F_mm_add_epi64(dst, x, y) } // #define ADD(x,y) _mm_add_epi64(x,y) -// #define SHIFT_L(x,r) _mm_slli_epi32(x,r) -func SHIFT_L32(dst VecVirtual, r Op) Op { return F_mm_slli_epi32(dst, r) } +func SHIFT_L32(dst VecVirtual, x, r Op) VecVirtual { return F_mm_slli_epi32(dst, x, r) } // #define SHIFT_L(x,r) _mm_slli_epi32(x,r) +func SHIFT_R32(dst VecVirtual, x, r Op) VecVirtual { return F_mm_srli_epi32(dst, x, r) } // #define SHIFT_R(x,r) _mm_srli_epi32(x,r) -// #define SHIFT_R(x,r) _mm_srli_epi32(x,r) -func SHIFT_R32(dst VecVirtual, r Op) Op { return F_mm_srli_epi32(dst, r) } - -// #define ADD(x,y) F_mm_add_epi64(x,y) -func ADD64(dst VecVirtual, y Op) Op { return F_mm_add_epi64(dst, y) } -func ADD64_(dst VecVirtual, a, b Op) Op { - if dst == a { - ADD64(dst, b) - } else if dst == b { - ADD64(dst, a) - } else { - MOVO_autoAU2(dst, a) - ADD64(dst, b) - } - return dst -} - -// #define SHIFT_L(x,r) _mm_slli_epi64(x,r) -func SHIFT_L64(dst VecVirtual, r Op) Op { return F_mm_slli_epi64(dst, r) } - -// #define SHIFT_R(x,r) _mm_srli_epi64(x,r) -func SHIFT_R64(dst VecVirtual, r Op) Op { return F_mm_srli_epi64(dst, r) } +func SHIFT_L64(dst VecVirtual, x, r Op) VecVirtual { return F_mm_slli_epi64(dst, x, r) } // #define SHIFT_L(x,r) _mm_slli_epi64(x,r) +func SHIFT_R64(dst VecVirtual, x, r Op) VecVirtual { return F_mm_srli_epi64(dst, x, r) } // #define SHIFT_R(x,r) _mm_srli_epi64(x,r) diff --git a/lsh/x86/ssse3/ssse3.go b/lsh/x86/ssse3/ssse3.go index 6f06018..1b6ba8f 100644 --- a/lsh/x86/ssse3/ssse3.go +++ b/lsh/x86/ssse3/ssse3.go @@ -4,7 +4,6 @@ import ( . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" - . "kryptosimd/avoutil" . 
"kryptosimd/avoutil/simd" ) @@ -16,61 +15,28 @@ import ( /* -------------------------------------------------------- */ // #define LOAD(x) _mm_loadu_si128((__m128i*)x) -func LOAD(dst, src Op) Op { return F_mm_loadu_si128(dst, src) } +func LOAD(dst VecVirtual, src Op) VecVirtual { return F_mm_loadu_si128(dst, src) } // #define STORE(x,y) _mm_storeu_si128((__m128i*)x, y) func STORE(dst, src Op) { F_mm_storeu_si128(dst, src) } // #define XOR(x,y) _mm_xor_si128(x,y) -func XOR(dst Op, src VecVirtual) Op { return F_mm_xor_si128(dst, src) } +func XOR(dst VecVirtual, x, y Op) VecVirtual { return F_mm_xor_si128(dst, x, y) } // #define OR(x,y) _mm_or_si128(x,y) -func OR(dst Op, src VecVirtual) Op { return F_mm_or_si128(dst, src) } +func OR(dst VecVirtual, x, y Op) VecVirtual { return F_mm_or_si128(dst, x, y) } // #define AND(x,y) _mm_and_si128(x,y) -func AND(dst Op, src VecVirtual) Op { return F_mm_and_si128(dst, src) } +func AND(dst VecVirtual, x, y Op) VecVirtual { return F_mm_and_si128(dst, x, y) } // #define SHUFFLE8(x,y) _mm_shuffle_epi8(x,y) -// -// >> dst == x -func SHUFFLE8(dst VecVirtual, y Op) Op { return F_mm_shuffle_epi8(dst, y) } +func SHUFFLE8(dst VecVirtual, x, y Op) VecVirtual { return F_mm_shuffle_epi8(dst, x, y) } -// #define ADD32(x,y) _mm_add_epi32(x,y) -func ADD32(dst VecVirtual, y Op) Op { return F_mm_add_epi32(dst, y) } -func ADD32_(dst VecVirtual, a, b Op) Op { - if dst == a { - ADD32(dst, b) - } else if dst == b { - ADD32(dst, a) - } else { - MOVO_autoAU2(dst, a) - ADD32(dst, b) - } - return dst -} +func ADD32(dst VecVirtual, x, y Op) VecVirtual { return F_mm_add_epi32(dst, x, y) } // #define ADD(x,y) _mm_add_epi32(x,y) +func ADD64(dst VecVirtual, x, y Op) VecVirtual { return F_mm_add_epi64(dst, x, y) } // #define ADD(x,y) _mm_add_epi64(x,y) -// #define SHIFT_L32(x,r) _mm_slli_epi32(x,r) -func SHIFT_L32(dst VecVirtual, r Op) Op { return F_mm_slli_epi32(dst, r) } +func SHIFT_L32(dst VecVirtual, x, r Op) VecVirtual { return F_mm_slli_epi32(dst, x, r) } // #define SHIFT_L(x,r) _mm_slli_epi32(x,r) +func SHIFT_R32(dst VecVirtual, x, r Op) VecVirtual { return F_mm_srli_epi32(dst, x, r) } // #define SHIFT_R(x,r) _mm_srli_epi32(x,r) -// #define SHIFT_R32(x,r) _mm_srli_epi32(x,r) -func SHIFT_R32(dst VecVirtual, r Op) Op { return F_mm_srli_epi32(dst, r) } - -// #define ADD(x,y) F_mm_add_epi64(x,y) -func ADD64(dst VecVirtual, y Op) Op { return F_mm_add_epi64(dst, y) } -func ADD64_(dst VecVirtual, a, b Op) Op { - if dst == a { - ADD64(dst, b) - } else if dst == b { - ADD64(dst, a) - } else { - MOVO_autoAU2(dst, a) - ADD64(dst, b) - } - return dst -} - -// #define SHIFT_L(x,r) _mm_slli_epi64(x,r) -func SHIFT_L64(dst VecVirtual, r Op) Op { return F_mm_slli_epi64(dst, r) } - -// #define SHIFT_R(x,r) _mm_srli_epi64(x,r) -func SHIFT_R64(dst VecVirtual, r Op) Op { return F_mm_srli_epi64(dst, r) } +func SHIFT_L64(dst VecVirtual, x, r Op) VecVirtual { return F_mm_slli_epi64(dst, x, r) } // #define SHIFT_L(x,r) _mm_slli_epi64(x,r) +func SHIFT_R64(dst VecVirtual, x, r Op) VecVirtual { return F_mm_srli_epi64(dst, x, r) } // #define SHIFT_R(x,r) _mm_srli_epi64(x,r) diff --git a/lsh256/avo/x86/lsh256avx2/lsh256_avx2.go b/lsh256/avo/x86/lsh256avx2/lsh256_avx2.go index 617a89e..92fe8ee 100644 --- a/lsh256/avo/x86/lsh256avx2/lsh256_avx2.go +++ b/lsh256/avo/x86/lsh256avx2/lsh256_avx2.go @@ -32,19 +32,11 @@ func load_blk_mem2vec(dest []VecVirtual, src Mem) { //dest[0] = LOAD(src); LOAD(dest[0], src) } -func load_blk_vec2mem(dest Mem, src []VecVirtual) { - Comment("load_blk_vec2mem") - - //dest[0] = 
LOAD(src); - LOAD(dest, src[0]) -} func load_blk_mem2mem(dest, src Mem) { Comment("load_blk_mem2mem") - tmp := YMM() //dest[0] = LOAD(src); - LOAD(tmp, src) - LOAD(dest, tmp) + MemcpyStatic(dest, src, YmmSize, true) } // static INLINE void store_blk(__m256i* dest, const __m256i* src){ @@ -60,12 +52,12 @@ func load_msg_blk(i_state LSH256AVX2_internal, msgblk Mem) { Comment("load_msg_blk") //load_blk(i_state->submsg_e_l, msgblk + 0); - load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*4)) //load_blk(i_state->submsg_e_r, msgblk + 8); - load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*4)) //load_blk(i_state->submsg_o_l, msgblk + 16); - load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*4)) //load_blk(i_state->submsg_o_r, msgblk + 24); + load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*4)) + load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*4)) + load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*4)) load_blk_mem2vec(i_state.submsg_o_r, msgblk.Offset(24*4)) } @@ -74,11 +66,9 @@ func msg_exp_even(i_state LSH256AVX2_internal, perm_step Op) { Comment("msg_exp_even") //i_state->submsg_e_l[0] = ADD(i_state->submsg_o_l[0], SHUFFLE8(i_state->submsg_e_l[0], perm_step)); - SHUFFLE8(i_state.submsg_e_l[0], i_state.submsg_e_l[0], perm_step) - ADD32(i_state.submsg_e_l[0], i_state.submsg_e_l[0], i_state.submsg_o_l[0]) //i_state->submsg_e_r[0] = ADD(i_state->submsg_o_r[0], SHUFFLE8(i_state->submsg_e_r[0], perm_step)); - SHUFFLE8(i_state.submsg_e_r[0], i_state.submsg_e_r[0], perm_step) - ADD32(i_state.submsg_e_r[0], i_state.submsg_e_r[0], i_state.submsg_o_r[0]) + ADD32(i_state.submsg_e_l[0], i_state.submsg_o_l[0], SHUFFLE8(i_state.submsg_e_l[0], i_state.submsg_e_l[0], perm_step)) + ADD32(i_state.submsg_e_r[0], i_state.submsg_o_r[0], SHUFFLE8(i_state.submsg_e_r[0], i_state.submsg_e_r[0], perm_step)) } // static INLINE void msg_exp_odd(LSH256AVX2_internal * i_state, const __m256i perm_step){ @@ -86,11 +76,9 @@ func msg_exp_odd(i_state LSH256AVX2_internal, perm_step Op) { Comment("msg_exp_odd") //i_state->submsg_o_l[0] = ADD(i_state->submsg_e_l[0], SHUFFLE8(i_state->submsg_o_l[0], perm_step)); - SHUFFLE8(i_state.submsg_o_l[0], i_state.submsg_o_l[0], perm_step) - ADD32(i_state.submsg_o_l[0], i_state.submsg_o_l[0], i_state.submsg_e_l[0]) //i_state->submsg_o_r[0] = ADD(i_state->submsg_e_r[0], SHUFFLE8(i_state->submsg_o_r[0], perm_step)); - SHUFFLE8(i_state.submsg_o_r[0], i_state.submsg_o_r[0], perm_step) - ADD32(i_state.submsg_o_r[0], i_state.submsg_o_r[0], i_state.submsg_e_r[0]) + ADD32(i_state.submsg_o_l[0], i_state.submsg_e_l[0], SHUFFLE8(i_state.submsg_o_l[0], i_state.submsg_o_l[0], perm_step)) + ADD32(i_state.submsg_o_r[0], i_state.submsg_e_r[0], SHUFFLE8(i_state.submsg_o_r[0], i_state.submsg_o_r[0], perm_step)) } // static INLINE void load_sc(__m256i* const_v, lsh_uint i){ @@ -107,8 +95,8 @@ func msg_add_even(cv_l, cv_r []VecVirtual, i_state LSH256AVX2_internal) { Comment("msg_add_even") //*cv_l = XOR(*cv_l, i_state->submsg_e_l[0]); - XOR(cv_l[0], cv_l[0], i_state.submsg_e_l[0]) //*cv_r = XOR(*cv_r, i_state->submsg_e_r[0]); + XOR(cv_l[0], cv_l[0], i_state.submsg_e_l[0]) XOR(cv_r[0], cv_r[0], i_state.submsg_e_r[0]) } @@ -117,8 +105,8 @@ func msg_add_odd(cv_l, cv_r []VecVirtual, i_state LSH256AVX2_internal) { Comment("msg_add_odd") //*cv_l = XOR(*cv_l, i_state->submsg_o_l[0]); - XOR(cv_l[0], cv_l[0], i_state.submsg_o_l[0]) //*cv_r = XOR(*cv_r, i_state->submsg_o_r[0]); + XOR(cv_l[0], cv_l[0], i_state.submsg_o_l[0]) XOR(cv_r[0], cv_r[0], i_state.submsg_o_r[0]) } @@ -134,44 +122,36 @@ func 
add_blk(cv_l, cv_r []VecVirtual) { func rotate_blk_even_alpha(cv []VecVirtual) { Comment("rotate_blk_even_alpha") - tmpYmm := YMM() + tmp := YMM() //*cv = OR(SHIFT_L(*cv, ROT_EVEN_ALPHA), SHIFT_R(*cv, WORD_BIT_LEN - ROT_EVEN_ALPHA)); - SHIFT_L32(tmpYmm, cv[0], U8(ROT_EVEN_ALPHA)) - SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA)) - OR(cv[0], cv[0], tmpYmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_EVEN_ALPHA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA))) } // static INLINE void rotate_blk_even_beta(__m256i* cv){ func rotate_blk_even_beta(cv []VecVirtual) { Comment("rotate_blk_even_beta") - tmpYmm := YMM() + tmp := YMM() //*cv = OR(SHIFT_L(*cv, ROT_EVEN_BETA), SHIFT_R(*cv, WORD_BIT_LEN - ROT_EVEN_BETA)); - SHIFT_L32(tmpYmm, cv[0], U8(ROT_EVEN_BETA)) - SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_BETA)) - OR(cv[0], cv[0], tmpYmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_EVEN_BETA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_BETA))) } // static INLINE void rotate_blk_odd_alpha(__m256i* cv){ func rotate_blk_odd_alpha(cv []VecVirtual) { Comment("rotate_blk_odd_alpha") - tmpYmm := YMM() + tmp := YMM() //*cv = OR(SHIFT_L(*cv, ROT_ODD_ALPHA), SHIFT_R(*cv, WORD_BIT_LEN - ROT_ODD_ALPHA)); - SHIFT_L32(tmpYmm, cv[0], U8(ROT_ODD_ALPHA)) - SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_ALPHA)) - OR(cv[0], cv[0], tmpYmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_ODD_ALPHA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_ALPHA))) } // static INLINE void rotate_blk_odd_beta(__m256i* cv){ func rotate_blk_odd_beta(cv []VecVirtual) { Comment("rotate_blk_odd_beta") - tmpYmm := YMM() + tmp := YMM() //*cv = OR(SHIFT_L(*cv, ROT_ODD_BETA), SHIFT_R(*cv, WORD_BIT_LEN - ROT_ODD_BETA)); - SHIFT_L32(tmpYmm, cv[0], U8(ROT_ODD_BETA)) - SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_BETA)) - OR(cv[0], cv[0], tmpYmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_ODD_BETA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_BETA))) } // static INLINE void xor_with_const(__m256i* cv_l, const __m256i* const_v){ @@ -197,12 +177,12 @@ func word_perm(cv_l, cv_r []VecVirtual) { //__m256i temp; temp := YMM() //temp = _mm256_shuffle_epi32(*cv_l, 0xd2); - F_mm256_shuffle_epi32(temp, cv_l[0], U8(0xd2)) //*cv_r = _mm256_shuffle_epi32(*cv_r, 0x6c); + F_mm256_shuffle_epi32(temp, cv_l[0], U8(0xd2)) F_mm256_shuffle_epi32(cv_r[0], cv_r[0], U8(0x6c)) //*cv_l = _mm256_permute2x128_si256(temp, *cv_r, 0x31); - F_mm256_permute2x128_si256(cv_l[0], temp, cv_r[0], U8(0x31)) //*cv_r = _mm256_permute2x128_si256(temp, *cv_r, 0x20); + F_mm256_permute2x128_si256(cv_l[0], temp, cv_r[0], U8(0x31)) F_mm256_permute2x128_si256(cv_r[0], temp, cv_r[0], U8(0x20)) } @@ -303,8 +283,8 @@ func init224(state *LSH256_Context) { Comment("init224") //load_blk(state->cv_l, g_IV224); - load_blk_mem2mem(state.Cv_l, G_IV224) //load_blk(state->cv_r, g_IV224 + 8); + load_blk_mem2mem(state.Cv_l, G_IV224) load_blk_mem2mem(state.Cv_r, G_IV224.Offset(8*4)) } @@ -313,8 +293,8 @@ func init256(state *LSH256_Context) { Comment("init256") //load_blk(state->cv_l, g_IV256); - load_blk_mem2mem(state.Cv_l, G_IV256) //load_blk(state->cv_r, g_IV256 + 8); + load_blk_mem2mem(state.Cv_l, G_IV256) load_blk_mem2mem(state.Cv_r, G_IV256.Offset(8*4)) } diff --git a/lsh256/avo/x86/lsh256sse2/lsh256_sse2.go b/lsh256/avo/x86/lsh256sse2/lsh256_sse2.go index faabca0..f928a77 100644 --- a/lsh256/avo/x86/lsh256sse2/lsh256_sse2.go +++ b/lsh256/avo/x86/lsh256sse2/lsh256_sse2.go @@ -34,28 +34,16 @@ func load_blk_mem2vec(dst []VecVirtual, src Mem) { Comment("load_blk_mem2vec") // 
dest[0] = LOAD((const __m128i*)src); - LOAD(dst[0], src) // dest[1] = LOAD((const __m128i*)src + 1); + LOAD(dst[0], src) LOAD(dst[1], src.Offset(XmmSize)) } -func load_blk_vec2mem(dst Mem, src []VecVirtual) { - Comment("load_blk_vec2mem") - - // dest[0] = LOAD((const __m128i*)src); - LOAD(dst, src[0]) - // dest[1] = LOAD((const __m128i*)src + 1); - LOAD(dst.Offset(XmmSize), src[1]) -} func load_blk_mem2mem(dst, src Mem) { Comment("load_blk_mem2mem") - tmp := XMM() // dest[0] = LOAD((const __m128i*)src); - LOAD(tmp, src) - LOAD(dst, tmp) // dest[1] = LOAD((const __m128i*)src + 1); - LOAD(tmp, src.Offset(XmmSize)) - LOAD(dst.Offset(XmmSize), tmp) + MemcpyStatic(dst, src, XmmSize*2, false) } // static INLINE void store_blk(__m128i* dest, const __m128i* src){ @@ -63,8 +51,8 @@ func store_blk(dst Mem, src []VecVirtual) { Comment("store_blk") //STORE(dest, src[0]); - STORE(dst, src[0]) //STORE(dest + 1, src[1]); + STORE(dst, src[0]) STORE(dst.Offset(XmmSize), src[1]) } @@ -73,12 +61,12 @@ func load_msg_blk(i_state LSH256SSE2_internal, msgblk Mem /* uint32 */) { Comment("load_msg_blk") //load_blk(i_state->submsg_e_l, msgblk + 0); - load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*4)) //load_blk(i_state->submsg_e_r, msgblk + 8); - load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*4)) //load_blk(i_state->submsg_o_l, msgblk + 16); - load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*4)) //load_blk(i_state->submsg_o_r, msgblk + 24); + load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*4)) + load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*4)) + load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*4)) load_blk_mem2vec(i_state.submsg_o_r, msgblk.Offset(24*4)) } @@ -87,17 +75,13 @@ func msg_exp_even(i_state LSH256SSE2_internal) { Comment("msg_exp_even") //i_state->submsg_e_l[0] = ADD(i_state->submsg_o_l[0], _mm_shuffle_epi32(i_state->submsg_e_l[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_e_l[0], i_state.submsg_e_l[0], U8(0x4b)) - ADD32(i_state.submsg_e_l[0], i_state.submsg_o_l[0]) //i_state->submsg_e_l[1] = ADD(i_state->submsg_o_l[1], _mm_shuffle_epi32(i_state->submsg_e_l[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_e_l[1], i_state.submsg_e_l[1], U8(0x93)) - ADD32(i_state.submsg_e_l[1], i_state.submsg_o_l[1]) //i_state->submsg_e_r[0] = ADD(i_state->submsg_o_r[0], _mm_shuffle_epi32(i_state->submsg_e_r[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_e_r[0], i_state.submsg_e_r[0], U8(0x4b)) - ADD32(i_state.submsg_e_r[0], i_state.submsg_o_r[0]) //i_state->submsg_e_r[1] = ADD(i_state->submsg_o_r[1], _mm_shuffle_epi32(i_state->submsg_e_r[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_e_r[1], i_state.submsg_e_r[1], U8(0x93)) - ADD32(i_state.submsg_e_r[1], i_state.submsg_o_r[1]) + ADD32(i_state.submsg_e_l[0], i_state.submsg_o_l[0], F_mm_shuffle_epi32(i_state.submsg_e_l[0], i_state.submsg_e_l[0], U8(0x4b))) + ADD32(i_state.submsg_e_l[1], i_state.submsg_o_l[1], F_mm_shuffle_epi32(i_state.submsg_e_l[1], i_state.submsg_e_l[1], U8(0x93))) + ADD32(i_state.submsg_e_r[0], i_state.submsg_o_r[0], F_mm_shuffle_epi32(i_state.submsg_e_r[0], i_state.submsg_e_r[0], U8(0x4b))) + ADD32(i_state.submsg_e_r[1], i_state.submsg_o_r[1], F_mm_shuffle_epi32(i_state.submsg_e_r[1], i_state.submsg_e_r[1], U8(0x93))) } // static INLINE void msg_exp_odd(LSH256SSE2_internal * i_state){ @@ -105,17 +89,13 @@ func msg_exp_odd(i_state LSH256SSE2_internal) { Comment("msg_exp_odd") //i_state->submsg_o_l[0] = ADD(i_state->submsg_e_l[0], _mm_shuffle_epi32(i_state->submsg_o_l[0], 0x4b)); - 
F_mm_shuffle_epi32(i_state.submsg_o_l[0], i_state.submsg_o_l[0], U8(0x4b)) - ADD32(i_state.submsg_o_l[0], i_state.submsg_e_l[0]) //i_state->submsg_o_l[1] = ADD(i_state->submsg_e_l[1], _mm_shuffle_epi32(i_state->submsg_o_l[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_o_l[1], i_state.submsg_o_l[1], U8(0x93)) - ADD32(i_state.submsg_o_l[1], i_state.submsg_e_l[1]) //i_state->submsg_o_r[0] = ADD(i_state->submsg_e_r[0], _mm_shuffle_epi32(i_state->submsg_o_r[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_o_r[0], i_state.submsg_o_r[0], U8(0x4b)) - ADD32(i_state.submsg_o_r[0], i_state.submsg_e_r[0]) //i_state->submsg_o_r[1] = ADD(i_state->submsg_e_r[1], _mm_shuffle_epi32(i_state->submsg_o_r[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_o_r[1], i_state.submsg_o_r[1], U8(0x93)) - ADD32(i_state.submsg_o_r[1], i_state.submsg_e_r[1]) + ADD32(i_state.submsg_o_l[0], i_state.submsg_e_l[0], F_mm_shuffle_epi32(i_state.submsg_o_l[0], i_state.submsg_o_l[0], U8(0x4b))) + ADD32(i_state.submsg_o_l[1], i_state.submsg_e_l[1], F_mm_shuffle_epi32(i_state.submsg_o_l[1], i_state.submsg_o_l[1], U8(0x93))) + ADD32(i_state.submsg_o_r[0], i_state.submsg_e_r[0], F_mm_shuffle_epi32(i_state.submsg_o_r[0], i_state.submsg_o_r[0], U8(0x4b))) + ADD32(i_state.submsg_o_r[1], i_state.submsg_e_r[1], F_mm_shuffle_epi32(i_state.submsg_o_r[1], i_state.submsg_o_r[1], U8(0x93))) } // static INLINE void load_sc(__m128i* const_v, lsh_uint i){ @@ -131,13 +111,13 @@ func msg_add_even(cv_l, cv_r []VecVirtual, i_state LSH256SSE2_internal) { Comment("msg_add_even") //cv_l[0] = XOR(cv_l[0], i_state->submsg_e_l[0]); - XOR(cv_l[0], i_state.submsg_e_l[0]) //cv_r[0] = XOR(cv_r[0], i_state->submsg_e_r[0]); - XOR(cv_r[0], i_state.submsg_e_r[0]) //cv_l[1] = XOR(cv_l[1], i_state->submsg_e_l[1]); - XOR(cv_l[1], i_state.submsg_e_l[1]) //cv_r[1] = XOR(cv_r[1], i_state->submsg_e_r[1]); - XOR(cv_r[1], i_state.submsg_e_r[1]) + XOR(cv_l[0], cv_l[0], i_state.submsg_e_l[0]) + XOR(cv_r[0], cv_r[0], i_state.submsg_e_r[0]) + XOR(cv_l[1], cv_l[1], i_state.submsg_e_l[1]) + XOR(cv_r[1], cv_r[1], i_state.submsg_e_r[1]) } // static INLINE void msg_add_odd(__m128i* cv_l, __m128i* cv_r, const LSH256SSE2_internal * i_state){ @@ -145,13 +125,13 @@ func msg_add_odd(cv_l, cv_r []VecVirtual, i_state LSH256SSE2_internal) { Comment("msg_add_odd") //cv_l[0] = XOR(cv_l[0], i_state->submsg_o_l[0]); - XOR(cv_l[0], i_state.submsg_o_l[0]) //cv_r[0] = XOR(cv_r[0], i_state->submsg_o_r[0]); - XOR(cv_r[0], i_state.submsg_o_r[0]) //cv_l[1] = XOR(cv_l[1], i_state->submsg_o_l[1]); - XOR(cv_l[1], i_state.submsg_o_l[1]) //cv_r[1] = XOR(cv_r[1], i_state->submsg_o_r[1]); - XOR(cv_r[1], i_state.submsg_o_r[1]) + XOR(cv_l[0], cv_l[0], i_state.submsg_o_l[0]) + XOR(cv_r[0], cv_r[0], i_state.submsg_o_r[0]) + XOR(cv_l[1], cv_l[1], i_state.submsg_o_l[1]) + XOR(cv_r[1], cv_r[1], i_state.submsg_o_r[1]) } // static INLINE void add_blk(__m128i* cv_l, const __m128i* cv_r){ @@ -159,77 +139,53 @@ func add_blk(cv_l, cv_r []VecVirtual) { Comment("add_blk") //cv_l[0] = ADD(cv_l[0], cv_r[0]); - ADD32(cv_l[0], cv_r[0]) //cv_l[1] = ADD(cv_l[1], cv_r[1]); - ADD32(cv_l[1], cv_r[1]) + ADD32(cv_l[0], cv_l[0], cv_r[0]) + ADD32(cv_l[1], cv_l[1], cv_r[1]) } // static INLINE void rotate_blk_even_alpha(__m128i* cv){ func rotate_blk_even_alpha(cv []VecVirtual) { Comment("rotate_blk_even_alpha") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_EVEN_ALPHA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_EVEN_ALPHA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_ALPHA)) - SHIFT_R32(cv[0], 
U8(WORD_BIT_LEN-ROT_EVEN_ALPHA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_EVEN_ALPHA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_EVEN_ALPHA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_ALPHA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_EVEN_ALPHA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_EVEN_ALPHA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA))) } // static INLINE void rotate_blk_even_beta(__m128i* cv){ func rotate_blk_even_beta(cv []VecVirtual) { Comment("rotate_blk_even_beta") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_EVEN_BETA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_EVEN_BETA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_BETA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_EVEN_BETA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_EVEN_BETA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_EVEN_BETA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_BETA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_EVEN_BETA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_EVEN_BETA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_BETA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_EVEN_BETA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_EVEN_BETA))) } // static INLINE void rotate_blk_odd_alpha(__m128i* cv){ func rotate_blk_odd_alpha(cv []VecVirtual) { Comment("rotate_blk_odd_alpha") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_ODD_ALPHA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_ODD_ALPHA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_ALPHA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_ODD_ALPHA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_ODD_ALPHA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_ODD_ALPHA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_ALPHA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_ODD_ALPHA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_ODD_ALPHA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_ALPHA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_ODD_ALPHA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_ODD_ALPHA))) } // static INLINE void rotate_blk_odd_beta(__m128i* cv){ func rotate_blk_odd_beta(cv []VecVirtual) { Comment("rotate_blk_odd_beta") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_ODD_BETA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_ODD_BETA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_BETA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_ODD_BETA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_ODD_BETA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_ODD_BETA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_BETA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_ODD_BETA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_ODD_BETA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_BETA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_ODD_BETA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_ODD_BETA))) } // static INLINE void xor_with_const(__m128i* cv_l, const __m128i* const_v){ @@ -237,9 +193,9 @@ func xor_with_const(cv_l []VecVirtual, const_v []VecVirtual) { Comment("xor_with_const") //cv_l[0] = XOR(cv_l[0], const_v[0]); - XOR(cv_l[0], const_v[0]) //cv_l[1] = XOR(cv_l[1], const_v[1]); - XOR(cv_l[1], const_v[1]) + XOR(cv_l[0], cv_l[0], const_v[0]) + XOR(cv_l[1], cv_l[1], const_v[1]) } // static INLINE void rotate_msg_gamma(__m128i* cv_r){ @@ -282,26 +238,16 @@ func rotate_msg_gamma(cv_r 
[]VecVirtual) { step := func(cv VecVirtual, idx int) { //temp = AND(cv, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0)); - // >> temp = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0) - // >> temp = AND(temp, cv) - MOVO_autoAU(g_BytePermInfo.Offset(16*idx), temp) - AND(temp, cv) + AND(temp, cv, g_BytePermInfo.Offset(16*idx)) + //cv = AND(cv, _mm_set_epi32(0x0, 0x0, 0x0, 0xffffffff)); - // >> __tmp = _mm_set_epi32(0x0, 0x0, 0x0, 0xffffffff) - // >> cv = AND(cv, __tmp) - MOVO_autoAU(g_BytePermInfo.Offset(16*(idx+1)), __tmp) - AND(cv, __tmp) + AND(cv, cv, g_BytePermInfo.Offset(16*(idx+1))) + //temp = XOR(SHIFT_L(temp, 8), SHIFT_R(temp, 24)); - // >> __tmp = temp - // >> __tmp = SHIFT_L(__tmp, 8) - // >> temp = SHIFT_R(temp, 24) - // >> temp = XOR(temp, __tmp) - MOVO_autoAU(temp, __tmp) - SHIFT_L32(__tmp, U8(8)) - SHIFT_R32(temp, U8(24)) - XOR(temp, __tmp) + XOR(temp, SHIFT_L32(__tmp, temp, U8(8)), SHIFT_R32(temp, temp, U8(24))) + //cv = XOR(cv, temp); - XOR(cv, temp) + XOR(cv, cv, temp) } step(cv_r[0], 0) @@ -320,23 +266,23 @@ func word_perm(cv_l, cv_r []VecVirtual) { //__m128i temp; temp := XMM() //cv_l[0] = _mm_shuffle_epi32(cv_l[0], 0xd2); - F_mm_shuffle_epi32(cv_l[0], cv_l[0], U8(0xd2)) //cv_l[1] = _mm_shuffle_epi32(cv_l[1], 0xd2);; - F_mm_shuffle_epi32(cv_l[1], cv_l[1], U8(0xd2)) //cv_r[0] = _mm_shuffle_epi32(cv_r[0], 0x6c); - F_mm_shuffle_epi32(cv_r[0], cv_r[0], U8(0x6c)) //cv_r[1] = _mm_shuffle_epi32(cv_r[1], 0x6c); + F_mm_shuffle_epi32(cv_l[0], cv_l[0], U8(0xd2)) + F_mm_shuffle_epi32(cv_l[1], cv_l[1], U8(0xd2)) + F_mm_shuffle_epi32(cv_r[0], cv_r[0], U8(0x6c)) F_mm_shuffle_epi32(cv_r[1], cv_r[1], U8(0x6c)) //temp = cv_l[0]; - MOVO_autoAU(cv_l[0], temp) //cv_l[0] = cv_l[1]; - MOVO_autoAU(cv_l[1], cv_l[0]) //cv_l[1] = cv_r[1]; - MOVO_autoAU(cv_r[1], cv_l[1]) //cv_r[1] = cv_r[0]; - MOVO_autoAU(cv_r[0], cv_r[1]) //cv_r[0] = temp; - MOVO_autoAU(temp, cv_r[0]) + MOVO_autoAU2(temp, cv_l[0]) + MOVO_autoAU2(cv_l[0], cv_l[1]) + MOVO_autoAU2(cv_l[1], cv_r[1]) + MOVO_autoAU2(cv_r[1], cv_r[0]) + MOVO_autoAU2(cv_r[0], temp) } /* -------------------------------------------------------- */ @@ -426,8 +372,8 @@ func init224(state *LSH256_Context) { Comment("init224") //load_blk(state->cv_l, g_IV224); - load_blk_mem2mem(state.Cv_l, G_IV224) //load_blk(state->cv_r, g_IV224 + 8); + load_blk_mem2mem(state.Cv_l, G_IV224) load_blk_mem2mem(state.Cv_r, G_IV224.Offset(8*4)) } @@ -436,8 +382,8 @@ func init256(state *LSH256_Context) { Comment("init256") //load_blk(state->cv_l, g_IV256); - load_blk_mem2mem(state.Cv_l, G_IV256) //load_blk(state->cv_r, g_IV256 + 8); + load_blk_mem2mem(state.Cv_l, G_IV256) load_blk_mem2mem(state.Cv_r, G_IV256.Offset(8*4)) } @@ -448,9 +394,9 @@ func fin(cv_l, cv_r []VecVirtual) { Comment("fin") //cv_l[0] = XOR(cv_l[0], cv_r[0]); - XOR(cv_l[0], cv_r[0]) //cv_l[1] = XOR(cv_l[1], cv_r[1]); - XOR(cv_l[1], cv_r[1]) + XOR(cv_l[0], cv_l[0], cv_r[0]) + XOR(cv_l[1], cv_l[1], cv_r[1]) } /* -------------------------------------------------------- */ diff --git a/lsh256/avo/x86/lsh256ssse3/lsh256_ssse3.go b/lsh256/avo/x86/lsh256ssse3/lsh256_ssse3.go index 875a092..c0be7ac 100644 --- a/lsh256/avo/x86/lsh256ssse3/lsh256_ssse3.go +++ b/lsh256/avo/x86/lsh256ssse3/lsh256_ssse3.go @@ -34,28 +34,16 @@ func load_blk_mem2vec(dst []VecVirtual, src Mem) { Comment("load_blk_mem2vec") // dest[0] = LOAD((const __m128i*)src); - LOAD(dst[0], src) // dest[1] = LOAD((const __m128i*)src + 1); + LOAD(dst[0], src) LOAD(dst[1], src.Offset(XmmSize)) } -func load_blk_vec2mem(dst Mem, src []VecVirtual) { - 
Comment("load_blk_vec2mem") - - // dest[0] = LOAD((const __m128i*)src); - LOAD(dst, src[0]) - // dest[1] = LOAD((const __m128i*)src + 1); - LOAD(dst.Offset(XmmSize), src[1]) -} func load_blk_mem2mem(dst, src Mem) { Comment("load_blk_mem2mem") - tmp := XMM() // dest[0] = LOAD((const __m128i*)src); - LOAD(tmp, src) - LOAD(dst, tmp) // dest[1] = LOAD((const __m128i*)src + 1); - LOAD(tmp, src.Offset(XmmSize)) - LOAD(dst.Offset(XmmSize), tmp) + MemcpyStatic(dst, src, XmmSize*2, false) } // static INLINE void store_blk(__m128i* dest, const __m128i* src){ @@ -63,8 +51,8 @@ func store_blk(dst Mem, src []VecVirtual) { Comment("store_blk") //STORE(dest, src[0]); - STORE(dst, src[0]) //STORE(dest + 1, src[1]); + STORE(dst, src[0]) STORE(dst.Offset(XmmSize), src[1]) } @@ -73,12 +61,12 @@ func load_msg_blk(i_state LSH256SSSE3_internal, msgblk Mem /* uint32 */) { Comment("load_msg_blk") //load_blk(i_state->submsg_e_l, msgblk + 0); - load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*4)) //load_blk(i_state->submsg_e_r, msgblk + 8); - load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*4)) //load_blk(i_state->submsg_o_l, msgblk + 16); - load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*4)) //load_blk(i_state->submsg_o_r, msgblk + 24); + load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*4)) + load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*4)) + load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*4)) load_blk_mem2vec(i_state.submsg_o_r, msgblk.Offset(24*4)) } @@ -87,17 +75,13 @@ func msg_exp_even(i_state LSH256SSSE3_internal) { Comment("msg_exp_even") //i_state->submsg_e_l[0] = ADD(i_state->submsg_o_l[0], _mm_shuffle_epi32(i_state->submsg_e_l[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_e_l[0], i_state.submsg_e_l[0], U8(0x4b)) - ADD32(i_state.submsg_e_l[0], i_state.submsg_o_l[0]) //i_state->submsg_e_l[1] = ADD(i_state->submsg_o_l[1], _mm_shuffle_epi32(i_state->submsg_e_l[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_e_l[1], i_state.submsg_e_l[1], U8(0x93)) - ADD32(i_state.submsg_e_l[1], i_state.submsg_o_l[1]) //i_state->submsg_e_r[0] = ADD(i_state->submsg_o_r[0], _mm_shuffle_epi32(i_state->submsg_e_r[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_e_r[0], i_state.submsg_e_r[0], U8(0x4b)) - ADD32(i_state.submsg_e_r[0], i_state.submsg_o_r[0]) //i_state->submsg_e_r[1] = ADD(i_state->submsg_o_r[1], _mm_shuffle_epi32(i_state->submsg_e_r[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_e_r[1], i_state.submsg_e_r[1], U8(0x93)) - ADD32(i_state.submsg_e_r[1], i_state.submsg_o_r[1]) + ADD32(i_state.submsg_e_l[0], i_state.submsg_o_l[0], F_mm_shuffle_epi32(i_state.submsg_e_l[0], i_state.submsg_e_l[0], U8(0x4b))) + ADD32(i_state.submsg_e_l[1], i_state.submsg_o_l[1], F_mm_shuffle_epi32(i_state.submsg_e_l[1], i_state.submsg_e_l[1], U8(0x93))) + ADD32(i_state.submsg_e_r[0], i_state.submsg_o_r[0], F_mm_shuffle_epi32(i_state.submsg_e_r[0], i_state.submsg_e_r[0], U8(0x4b))) + ADD32(i_state.submsg_e_r[1], i_state.submsg_o_r[1], F_mm_shuffle_epi32(i_state.submsg_e_r[1], i_state.submsg_e_r[1], U8(0x93))) } // static INLINE void msg_exp_odd(LSH256SSSE3_internal * i_state){ @@ -105,17 +89,13 @@ func msg_exp_odd(i_state LSH256SSSE3_internal) { Comment("msg_exp_odd") //i_state->submsg_o_l[0] = ADD(i_state->submsg_e_l[0], _mm_shuffle_epi32(i_state->submsg_o_l[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_o_l[0], i_state.submsg_o_l[0], U8(0x4b)) - ADD32(i_state.submsg_o_l[0], i_state.submsg_e_l[0]) //i_state->submsg_o_l[1] = ADD(i_state->submsg_e_l[1], _mm_shuffle_epi32(i_state->submsg_o_l[1], 0x93)); - 
F_mm_shuffle_epi32(i_state.submsg_o_l[1], i_state.submsg_o_l[1], U8(0x93)) - ADD32(i_state.submsg_o_l[1], i_state.submsg_e_l[1]) //i_state->submsg_o_r[0] = ADD(i_state->submsg_e_r[0], _mm_shuffle_epi32(i_state->submsg_o_r[0], 0x4b)); - F_mm_shuffle_epi32(i_state.submsg_o_r[0], i_state.submsg_o_r[0], U8(0x4b)) - ADD32(i_state.submsg_o_r[0], i_state.submsg_e_r[0]) //i_state->submsg_o_r[1] = ADD(i_state->submsg_e_r[1], _mm_shuffle_epi32(i_state->submsg_o_r[1], 0x93)); - F_mm_shuffle_epi32(i_state.submsg_o_r[1], i_state.submsg_o_r[1], U8(0x93)) - ADD32(i_state.submsg_o_r[1], i_state.submsg_e_r[1]) + ADD32(i_state.submsg_o_l[0], i_state.submsg_e_l[0], F_mm_shuffle_epi32(i_state.submsg_o_l[0], i_state.submsg_o_l[0], U8(0x4b))) + ADD32(i_state.submsg_o_l[1], i_state.submsg_e_l[1], F_mm_shuffle_epi32(i_state.submsg_o_l[1], i_state.submsg_o_l[1], U8(0x93))) + ADD32(i_state.submsg_o_r[0], i_state.submsg_e_r[0], F_mm_shuffle_epi32(i_state.submsg_o_r[0], i_state.submsg_o_r[0], U8(0x4b))) + ADD32(i_state.submsg_o_r[1], i_state.submsg_e_r[1], F_mm_shuffle_epi32(i_state.submsg_o_r[1], i_state.submsg_o_r[1], U8(0x93))) } // static INLINE void load_sc(__m128i* const_v, lsh_uint i){ @@ -131,13 +111,13 @@ func msg_add_even(cv_l, cv_r []VecVirtual, i_state LSH256SSSE3_internal) { Comment("msg_add_even") //cv_l[0] = XOR(cv_l[0], i_state->submsg_e_l[0]); - XOR(cv_l[0], i_state.submsg_e_l[0]) //cv_r[0] = XOR(cv_r[0], i_state->submsg_e_r[0]); - XOR(cv_r[0], i_state.submsg_e_r[0]) //cv_l[1] = XOR(cv_l[1], i_state->submsg_e_l[1]); - XOR(cv_l[1], i_state.submsg_e_l[1]) //cv_r[1] = XOR(cv_r[1], i_state->submsg_e_r[1]); - XOR(cv_r[1], i_state.submsg_e_r[1]) + XOR(cv_l[0], cv_l[0], i_state.submsg_e_l[0]) + XOR(cv_r[0], cv_r[0], i_state.submsg_e_r[0]) + XOR(cv_l[1], cv_l[1], i_state.submsg_e_l[1]) + XOR(cv_r[1], cv_r[1], i_state.submsg_e_r[1]) } // static INLINE void msg_add_odd(__m128i* cv_l, __m128i* cv_r, const LSH256SSSE3_internal * i_state){ @@ -145,13 +125,13 @@ func msg_add_odd(cv_l, cv_r []VecVirtual, i_state LSH256SSSE3_internal) { Comment("msg_add_odd") //cv_l[0] = XOR(cv_l[0], i_state->submsg_o_l[0]); - XOR(cv_l[0], i_state.submsg_o_l[0]) //cv_r[0] = XOR(cv_r[0], i_state->submsg_o_r[0]); - XOR(cv_r[0], i_state.submsg_o_r[0]) //cv_l[1] = XOR(cv_l[1], i_state->submsg_o_l[1]); - XOR(cv_l[1], i_state.submsg_o_l[1]) //cv_r[1] = XOR(cv_r[1], i_state->submsg_o_r[1]); - XOR(cv_r[1], i_state.submsg_o_r[1]) + XOR(cv_l[0], cv_l[0], i_state.submsg_o_l[0]) + XOR(cv_r[0], cv_r[0], i_state.submsg_o_r[0]) + XOR(cv_l[1], cv_l[1], i_state.submsg_o_l[1]) + XOR(cv_r[1], cv_r[1], i_state.submsg_o_r[1]) } // static INLINE void add_blk(__m128i* cv_l, const __m128i* cv_r){ @@ -159,77 +139,53 @@ func add_blk(cv_l, cv_r []VecVirtual) { Comment("add_blk") //cv_l[0] = ADD(cv_l[0], cv_r[0]); - ADD32(cv_l[0], cv_r[0]) //cv_l[1] = ADD(cv_l[1], cv_r[1]); - ADD32(cv_l[1], cv_r[1]) + ADD32(cv_l[0], cv_l[0], cv_r[0]) + ADD32(cv_l[1], cv_l[1], cv_r[1]) } // static INLINE void rotate_blk_even_alpha(__m128i* cv){ func rotate_blk_even_alpha(cv []VecVirtual) { Comment("rotate_blk_even_alpha") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_EVEN_ALPHA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_EVEN_ALPHA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_ALPHA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_EVEN_ALPHA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_EVEN_ALPHA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_ALPHA)) - SHIFT_R32(cv[1], 
U8(WORD_BIT_LEN-ROT_EVEN_ALPHA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_EVEN_ALPHA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_EVEN_ALPHA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_EVEN_ALPHA))) } // static INLINE void rotate_blk_even_beta(__m128i* cv){ func rotate_blk_even_beta(cv []VecVirtual) { Comment("rotate_blk_even_beta") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_EVEN_BETA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_EVEN_BETA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_BETA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_EVEN_BETA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_EVEN_BETA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_EVEN_BETA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_EVEN_BETA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_EVEN_BETA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_EVEN_BETA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_EVEN_BETA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_EVEN_BETA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_EVEN_BETA))) } // static INLINE void rotate_blk_odd_alpha(__m128i* cv){ func rotate_blk_odd_alpha(cv []VecVirtual) { Comment("rotate_blk_odd_alpha") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_ODD_ALPHA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_ODD_ALPHA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_ALPHA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_ODD_ALPHA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_ODD_ALPHA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_ODD_ALPHA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_ALPHA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_ODD_ALPHA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_ODD_ALPHA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_ALPHA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_ODD_ALPHA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_ODD_ALPHA))) } // static INLINE void rotate_blk_odd_beta(__m128i* cv){ func rotate_blk_odd_beta(cv []VecVirtual) { Comment("rotate_blk_odd_beta") - tmpXmm := XMM() + tmp := XMM() //cv[0] = OR(SHIFT_L(cv[0], ROT_ODD_BETA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_ODD_BETA)); - MOVO_autoAU(cv[0], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_BETA)) - SHIFT_R32(cv[0], U8(WORD_BIT_LEN-ROT_ODD_BETA)) - OR(cv[0], tmpXmm) //cv[1] = OR(SHIFT_L(cv[1], ROT_ODD_BETA), SHIFT_R(cv[1], WORD_BIT_LEN - ROT_ODD_BETA)); - MOVO_autoAU(cv[1], tmpXmm) - SHIFT_L32(tmpXmm, U8(ROT_ODD_BETA)) - SHIFT_R32(cv[1], U8(WORD_BIT_LEN-ROT_ODD_BETA)) - OR(cv[1], tmpXmm) + OR(cv[0], SHIFT_L32(tmp, cv[0], U8(ROT_ODD_BETA)), SHIFT_R32(cv[0], cv[0], U8(WORD_BIT_LEN-ROT_ODD_BETA))) + OR(cv[1], SHIFT_L32(tmp, cv[1], U8(ROT_ODD_BETA)), SHIFT_R32(cv[1], cv[1], U8(WORD_BIT_LEN-ROT_ODD_BETA))) } // static INLINE void xor_with_const(__m128i* cv_l, const __m128i* const_v){ @@ -237,17 +193,17 @@ func xor_with_const(cv_l []VecVirtual, const_v []VecVirtual) { Comment("xor_with_const") //cv_l[0] = XOR(cv_l[0], const_v[0]); - XOR(cv_l[0], const_v[0]) //cv_l[1] = XOR(cv_l[1], const_v[1]); - XOR(cv_l[1], const_v[1]) + XOR(cv_l[0], cv_l[0], const_v[0]) + XOR(cv_l[1], cv_l[1], const_v[1]) } // static INLINE void rotate_msg_gamma(__m128i* cv_r, const __m128i * perm_step){\ func rotate_msg_gamma(cv_r []VecVirtual, perm_step []Mem) { //cv_r[0] = SHUFFLE8(cv_r[0], perm_step[0]); - SHUFFLE8(cv_r[0], perm_step[0]) //cv_r[1] = SHUFFLE8(cv_r[1], perm_step[1]); - SHUFFLE8(cv_r[1], perm_step[1]) + SHUFFLE8(cv_r[0], cv_r[0], perm_step[0]) 
+ SHUFFLE8(cv_r[1], cv_r[1], perm_step[1]) } // static INLINE void word_perm(__m128i* cv_l, __m128i* cv_r){ @@ -257,23 +213,23 @@ func word_perm(cv_l, cv_r []VecVirtual) { //__m128i temp; temp := XMM() //cv_l[0] = _mm_shuffle_epi32(cv_l[0], 0xd2); - F_mm_shuffle_epi32(cv_l[0], cv_l[0], U8(0xd2)) //cv_l[1] = _mm_shuffle_epi32(cv_l[1], 0xd2);; - F_mm_shuffle_epi32(cv_l[1], cv_l[1], U8(0xd2)) //cv_r[0] = _mm_shuffle_epi32(cv_r[0], 0x6c); - F_mm_shuffle_epi32(cv_r[0], cv_r[0], U8(0x6c)) //cv_r[1] = _mm_shuffle_epi32(cv_r[1], 0x6c); + F_mm_shuffle_epi32(cv_l[0], cv_l[0], U8(0xd2)) + F_mm_shuffle_epi32(cv_l[1], cv_l[1], U8(0xd2)) + F_mm_shuffle_epi32(cv_r[0], cv_r[0], U8(0x6c)) F_mm_shuffle_epi32(cv_r[1], cv_r[1], U8(0x6c)) //temp = cv_l[0]; - MOVO_autoAU(cv_l[0], temp) //cv_l[0] = cv_l[1]; - MOVO_autoAU(cv_l[1], cv_l[0]) //cv_l[1] = cv_r[1]; - MOVO_autoAU(cv_r[1], cv_l[1]) //cv_r[1] = cv_r[0]; - MOVO_autoAU(cv_r[0], cv_r[1]) //cv_r[0] = temp; - MOVO_autoAU(temp, cv_r[0]) + MOVO_autoAU2(temp, cv_l[0]) + MOVO_autoAU2(cv_l[0], cv_l[1]) + MOVO_autoAU2(cv_l[1], cv_r[1]) + MOVO_autoAU2(cv_r[1], cv_r[0]) + MOVO_autoAU2(cv_r[0], temp) } // static INLINE void mix_even(__m128i* cv_l, __m128i* cv_r, const __m128i* const_v, const __m128i * perm_step){ @@ -361,8 +317,8 @@ func init224(state *LSH256_Context) { Comment("init224") //load_blk(state->cv_l, g_IV224); - load_blk_mem2mem(state.Cv_l, G_IV224) //load_blk(state->cv_r, g_IV224 + 8); + load_blk_mem2mem(state.Cv_l, G_IV224) load_blk_mem2mem(state.Cv_r, G_IV224.Offset(8*4)) } @@ -371,8 +327,8 @@ func init256(state *LSH256_Context) { Comment("init256") //load_blk(state->cv_l, g_IV256); - load_blk_mem2mem(state.Cv_l, G_IV256) //load_blk(state->cv_r, g_IV256 + 8); + load_blk_mem2mem(state.Cv_l, G_IV256) load_blk_mem2mem(state.Cv_r, G_IV256.Offset(8*4)) } @@ -383,9 +339,9 @@ func fin(cv_l, cv_r []VecVirtual) { Comment("fin") //cv_l[0] = XOR(cv_l[0], cv_r[0]); - XOR(cv_l[0], cv_r[0]) //cv_l[1] = XOR(cv_l[1], cv_r[1]); - XOR(cv_l[1], cv_r[1]) + XOR(cv_l[0], cv_l[0], cv_r[0]) + XOR(cv_l[1], cv_l[1], cv_r[1]) } /* -------------------------------------------------------- */ @@ -401,8 +357,8 @@ func get_hash(cv_l []VecVirtual, pbHashVal Mem, algtype Op) { //lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(algtype); //STORE(hash_val, cv_l[0]); - STORE(hash_val, cv_l[0]) //STORE((hash_val + 16), cv_l[1]); + STORE(hash_val, cv_l[0]) STORE((hash_val.Offset(16)), cv_l[1]) //memcpy(pbHashVal, hash_val, sizeof(lsh_u8) * hash_val_byte_len); //if (hash_val_bit_len){ diff --git a/lsh256/lsh256_amd64.s b/lsh256/lsh256_amd64.s index 97fc6ae..98c9b0e 100644 --- a/lsh256/lsh256_amd64.s +++ b/lsh256/lsh256_amd64.s @@ -343,12 +343,14 @@ TEXT ·lsh256InitSSE2(SB), NOSPLIT, $0-8 // init256 // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV256<>+0(SB), X0 MOVOU X0, 16(AX) MOVOA g_IV256<>+16(SB), X0 MOVOU X0, 32(AX) // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV256<>+32(SB), X0 MOVOU X0, 48(AX) MOVOA g_IV256<>+48(SB), X0 @@ -358,12 +360,14 @@ TEXT ·lsh256InitSSE2(SB), NOSPLIT, $0-8 lsh256_sse2_init_if0_end: // init224 // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV224<>+0(SB), X0 MOVOU X0, 16(AX) MOVOA g_IV224<>+16(SB), X0 MOVOU X0, 32(AX) // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV224<>+32(SB), X0 MOVOU X0, 48(AX) MOVOA g_IV224<>+48(SB), X0 @@ -593,55 +597,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + 
PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -708,55 +706,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -833,55 +825,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - 
PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -958,55 +944,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1083,55 +1063,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND 
g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1208,55 +1182,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1333,55 +1301,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - 
PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1458,55 +1420,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1583,55 +1539,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND 
g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1708,55 +1658,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1833,55 +1777,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - 
PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -1958,55 +1896,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2083,55 +2015,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND 
g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2208,55 +2134,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2333,55 +2253,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - 
PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2458,55 +2372,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2583,55 +2491,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND 
g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2708,55 +2610,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2833,55 +2729,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - 
PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -2958,55 +2848,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3083,55 +2967,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND 
g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3208,55 +3086,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3333,55 +3205,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3458,55 +3324,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // 
rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3583,55 +3443,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3708,55 +3562,49 @@ memcpy_2_sz1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND 
g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3866,55 +3714,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -3981,55 +3823,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 
PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4106,55 +3942,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4231,55 +4061,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 
- PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4356,55 +4180,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4481,55 +4299,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, 
X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4606,55 +4418,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4731,55 +4537,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - 
MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4856,55 +4656,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -4981,55 +4775,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND 
g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5106,55 +4894,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5231,55 +5013,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA 
g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5356,55 +5132,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5481,55 +5251,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + 
PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5606,55 +5370,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5731,55 +5489,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, 
X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5856,55 +5608,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -5981,55 +5727,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA 
g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6106,55 +5846,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6231,55 +5965,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND 
g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6356,55 +6084,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6481,55 +6203,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA 
g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6606,55 +6322,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6731,55 +6441,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + 
PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6856,55 +6560,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -6981,55 +6679,49 @@ lsh256_sse2_update_while_start: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL 
$0x18, X4 @@ -7297,55 +6989,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -7412,55 +7098,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -7537,55 +7217,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA 
g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -7662,55 +7336,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -7787,55 +7455,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, 
X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -7912,55 +7574,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8037,55 +7693,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), 
X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8162,55 +7812,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8287,55 +7931,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL 
$0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8412,55 +8050,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8537,55 +8169,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA 
X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8662,55 +8288,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8787,55 +8407,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, 
X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -8912,55 +8526,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9037,55 +8645,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND 
g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9162,55 +8764,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9287,55 +8883,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA 
g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9412,55 +9002,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9537,55 +9121,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND 
g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9662,55 +9240,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9787,55 +9359,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA 
g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -9912,55 +9478,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -10037,55 +9597,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND 
g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -10162,55 +9716,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -10287,55 +9835,49 @@ memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -10412,55 +9954,49 @@ 
memset_1_1_end: PADDD X3, X1 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+16(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+0(SB), X4 + PAND g_BytePermInfo_sse2<>+16(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+32(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+48(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+32(SB), X4 + PAND g_BytePermInfo_sse2<>+48(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+64(SB), X4 - PAND X2, X4 - MOVOA g_BytePermInfo_sse2<>+80(SB), X5 - PAND X5, X2 + MOVOA X2, X4 + PAND g_BytePermInfo_sse2<>+64(SB), X4 + PAND g_BytePermInfo_sse2<>+80(SB), X2 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X2 - MOVOA g_BytePermInfo_sse2<>+96(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+112(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+96(SB), X4 + PAND g_BytePermInfo_sse2<>+112(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+128(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+144(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+128(SB), X4 + PAND g_BytePermInfo_sse2<>+144(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 PXOR X5, X4 PXOR X4, X3 - MOVOA g_BytePermInfo_sse2<>+160(SB), X4 - PAND X3, X4 - MOVOA g_BytePermInfo_sse2<>+176(SB), X5 - PAND X5, X3 + MOVOA X3, X4 + PAND g_BytePermInfo_sse2<>+160(SB), X4 + PAND g_BytePermInfo_sse2<>+176(SB), X3 MOVOA X4, X5 PSLLL $0x08, X5 PSRLL $0x18, X4 @@ -16443,10 +15979,12 @@ TEXT ·lsh256InitAVX2(SB), NOSPLIT, $0-8 // init256 // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV256<>+0(SB), Y0 VMOVDQU Y0, 16(AX) // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV256<>+32(SB), Y0 VMOVDQU Y0, 48(AX) JMP lsh256_avx2_init_ret @@ -16454,10 +15992,12 @@ TEXT ·lsh256InitAVX2(SB), NOSPLIT, $0-8 lsh256_avx2_init_if0_end: // init224 // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV224<>+0(SB), Y0 VMOVDQU Y0, 16(AX) // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV224<>+32(SB), Y0 VMOVDQU Y0, 48(AX) @@ -16670,7 +16210,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16681,7 +16221,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16710,7 +16250,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16721,7 +16261,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16737,9 +16277,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -16756,7 +16296,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16767,7 +16307,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16783,9 +16323,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB 
Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -16802,7 +16342,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16813,7 +16353,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16829,9 +16369,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -16848,7 +16388,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16859,7 +16399,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16875,9 +16415,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -16894,7 +16434,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16905,7 +16445,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16921,9 +16461,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -16940,7 +16480,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16951,7 +16491,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -16967,9 +16507,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -16986,7 +16526,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -16997,7 +16537,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17013,9 +16553,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17032,7 +16572,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17043,7 +16583,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17059,9 +16599,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17078,7 +16618,7 @@ memcpy_8_sz1_end: // 
rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17089,7 +16629,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17105,9 +16645,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17124,7 +16664,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17135,7 +16675,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17151,9 +16691,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17170,7 +16710,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17181,7 +16721,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17197,9 +16737,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17216,7 +16756,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17227,7 +16767,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17243,9 +16783,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17262,7 +16802,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17273,7 +16813,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17289,9 +16829,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17308,7 +16848,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17319,7 +16859,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17335,9 +16875,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17354,7 +16894,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17365,7 +16905,7 @@ memcpy_8_sz1_end: // 
rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17381,9 +16921,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17400,7 +16940,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17411,7 +16951,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17427,9 +16967,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17446,7 +16986,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17457,7 +16997,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17473,9 +17013,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17492,7 +17032,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17503,7 +17043,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17519,9 +17059,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17538,7 +17078,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17549,7 +17089,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17565,9 +17105,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17584,7 +17124,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17595,7 +17135,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17611,9 +17151,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17630,7 +17170,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17641,7 +17181,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17657,9 +17197,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB 
Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17676,7 +17216,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17687,7 +17227,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17703,9 +17243,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17722,7 +17262,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17733,7 +17273,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17749,9 +17289,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17768,7 +17308,7 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17779,7 +17319,7 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17795,9 +17335,9 @@ memcpy_8_sz1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -17814,7 +17354,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17825,7 +17365,7 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17841,9 +17381,9 @@ memcpy_8_sz1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17890,7 +17430,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17901,7 +17441,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17930,7 +17470,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -17941,7 +17481,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -17957,9 +17497,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -17976,7 +17516,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, 
Y0, Y0 @@ -17987,7 +17527,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18003,9 +17543,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18022,7 +17562,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18033,7 +17573,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18049,9 +17589,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18068,7 +17608,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18079,7 +17619,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18095,9 +17635,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18114,7 +17654,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18125,7 +17665,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18141,9 +17681,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18160,7 +17700,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18171,7 +17711,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18187,9 +17727,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18206,7 +17746,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18217,7 +17757,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18233,9 +17773,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18252,7 +17792,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR 
Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18263,7 +17803,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18279,9 +17819,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18298,7 +17838,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18309,7 +17849,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18325,9 +17865,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18344,7 +17884,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18355,7 +17895,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18371,9 +17911,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18390,7 +17930,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18401,7 +17941,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18417,9 +17957,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18436,7 +17976,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18447,7 +17987,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18463,9 +18003,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18482,7 +18022,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18493,7 +18033,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18509,9 +18049,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18528,7 +18068,7 @@ lsh256_avx2_update_while_start: // 
rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18539,7 +18079,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18555,9 +18095,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18574,7 +18114,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18585,7 +18125,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18601,9 +18141,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18620,7 +18160,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18631,7 +18171,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18647,9 +18187,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18666,7 +18206,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18677,7 +18217,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18693,9 +18233,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18712,7 +18252,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18723,7 +18263,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18739,9 +18279,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18758,7 +18298,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18769,7 +18309,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18785,9 +18325,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 
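The VPOR and VPADDD hunks above and below swap only the two source operands of commutative instructions, so the emitted code computes identical results; the reordering falls out mechanically from regenerating the assembly with the new func(dst, arg...) helper convention, whose Go-assembly operand order is (src2, src1, dst). A minimal sketch of that convention, assuming avo's build/operand/reg packages; the helper names here are hypothetical stand-ins for the avoutil/simd wrappers, which additionally carry CheckType guards:

    //go:build ignore

    package main

    import (
    	. "github.com/mmcloughlin/avo/build"
    	. "github.com/mmcloughlin/avo/operand"
    	. "github.com/mmcloughlin/avo/reg"
    )

    // or emits VPOR and returns dst so calls can nest like C intrinsics.
    // The (b, a, dst) emission order is why regenerated hunks read
    // "VPOR Y0, Y9, Y0" where they previously read "VPOR Y9, Y0, Y0".
    func or(dst VecVirtual, a, b Op) VecVirtual { VPOR(b, a, dst); return dst }

    // shiftL32/shiftR32 emit VPSLLD/VPSRLD and likewise return dst.
    func shiftL32(dst VecVirtual, a, imm Op) VecVirtual { VPSLLD(imm, a, dst); return dst }
    func shiftR32(dst VecVirtual, a, imm Op) VecVirtual { VPSRLD(imm, a, dst); return dst }

    // rotateBlk32 mirrors the 32-bit rotate pattern in these hunks:
    // one nested expression replaces three statements, and the emitted
    // sequence (VPSLLD, VPSRLD, VPOR) is unchanged.
    func rotateBlk32(dst VecVirtual, r int) {
    	tmp := YMM()
    	or(dst, shiftL32(tmp, dst, U8(r)), shiftR32(dst, dst, U8(32-r)))
    }

    func main() {
    	TEXT("rotateDemo", NOSPLIT, "func()")
    	x := YMM()
    	rotateBlk32(x, 29) // emits VPSLLD $0x1d / VPSRLD $0x03 / VPOR
    	RET()
    	Generate()
    }

Returning dst is also what lets the 64-bit rotate_blk and msg_exp helpers later in this patch collapse into single nested OR/ADD64 expressions.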
@@ -18804,7 +18344,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18815,7 +18355,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18831,9 +18371,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18850,7 +18390,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18861,7 +18401,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18877,9 +18417,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18896,7 +18436,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18907,7 +18447,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18923,9 +18463,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -18942,7 +18482,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18953,7 +18493,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -18969,9 +18509,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -18988,7 +18528,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -18999,7 +18539,7 @@ lsh256_avx2_update_while_start: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19015,9 +18555,9 @@ lsh256_avx2_update_while_start: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19034,7 +18574,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19045,7 +18585,7 @@ lsh256_avx2_update_while_start: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19061,9 +18601,9 @@ lsh256_avx2_update_while_start: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, 
Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19275,7 +18815,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19286,7 +18826,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19315,7 +18855,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19326,7 +18866,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19342,9 +18882,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19361,7 +18901,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19372,7 +18912,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19388,9 +18928,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19407,7 +18947,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19418,7 +18958,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19434,9 +18974,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19453,7 +18993,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19464,7 +19004,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19480,9 +19020,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19499,7 +19039,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19510,7 +19050,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19526,9 +19066,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19545,7 +19085,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19556,7 +19096,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19572,9 +19112,9 @@ 
memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19591,7 +19131,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19602,7 +19142,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19618,9 +19158,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19637,7 +19177,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19648,7 +19188,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19664,9 +19204,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19683,7 +19223,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19694,7 +19234,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19710,9 +19250,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19729,7 +19269,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19740,7 +19280,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19756,9 +19296,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19775,7 +19315,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19786,7 +19326,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19802,9 +19342,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19821,7 +19361,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19832,7 +19372,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19848,9 +19388,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19867,7 +19407,7 @@ memset_3_1_end: // 
rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19878,7 +19418,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19894,9 +19434,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -19913,7 +19453,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19924,7 +19464,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19940,9 +19480,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -19959,7 +19499,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -19970,7 +19510,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -19986,9 +19526,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -20005,7 +19545,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20016,7 +19556,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20032,9 +19572,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -20051,7 +19591,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20062,7 +19602,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20078,9 +19618,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -20097,7 +19637,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20108,7 +19648,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20124,9 +19664,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -20143,7 +19683,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20154,7 +19694,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD 
$0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20170,9 +19710,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -20189,7 +19729,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20200,7 +19740,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20216,9 +19756,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -20235,7 +19775,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20246,7 +19786,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20262,9 +19802,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -20281,7 +19821,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20292,7 +19832,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20308,9 +19848,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -20327,7 +19867,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20338,7 +19878,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20354,9 +19894,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 @@ -20373,7 +19913,7 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLD $0x1d, Y0, Y9 VPSRLD $0x03, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20384,7 +19924,7 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLD $0x01, Y1, Y2 VPSRLD $0x1f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20400,9 +19940,9 @@ memset_3_1_end: // msg_exp_odd VPSHUFB Y4, Y7, Y7 - VPADDD Y5, Y7, Y7 + VPADDD Y7, Y5, Y7 VPSHUFB Y4, Y8, Y8 - VPADDD Y6, Y8, Y8 + VPADDD Y8, Y6, Y8 // msg_add_odd VPXOR Y7, Y0, Y0 @@ -20419,7 +19959,7 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLD $0x05, Y0, Y9 VPSRLD $0x1b, Y0, Y0 - VPOR Y9, Y0, Y0 + VPOR Y0, Y9, Y0 // xor_with_const VPXOR Y2, Y0, Y0 @@ -20430,7 +19970,7 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLD $0x11, Y1, Y2 VPSRLD $0x0f, Y1, Y1 - VPOR Y2, Y1, Y1 + VPOR Y1, Y2, Y1 // add_blk VPADDD Y1, Y0, Y0 @@ -20446,9 +19986,9 @@ memset_3_1_end: // msg_exp_even VPSHUFB Y4, Y5, Y5 - VPADDD Y7, Y5, Y5 + VPADDD Y5, Y7, Y5 VPSHUFB Y4, Y6, Y6 - VPADDD Y8, Y6, 
Y6 + VPADDD Y6, Y8, Y6 // msg_add_even VPXOR Y5, Y0, Y0 diff --git a/lsh512/avo/x86/lsh512avx2/lsh512_avx2.go b/lsh512/avo/x86/lsh512avx2/lsh512_avx2.go index d298b83..9fb0580 100644 --- a/lsh512/avo/x86/lsh512avx2/lsh512_avx2.go +++ b/lsh512/avo/x86/lsh512avx2/lsh512_avx2.go @@ -36,7 +36,7 @@ func (ctx *LSH512AVX2_internal) load(v []VecVirtual, m Mem) { } func (ctx *LSH512AVX2_internal) save(v []VecVirtual, m Mem) { Comment("i_state_save___start") - load_blk_vec2mem(m, v) + store_blk(m, v) Comment("i_state_save___end") } @@ -53,24 +53,12 @@ func load_blk_mem2vec(dst []VecVirtual, src Mem) { //dest[1] = LOAD((const __m256i*)src + 1); LOAD(dst[1], src.Offset(YmmSize*1)) } -func load_blk_vec2mem(dst Mem, src []VecVirtual) { - Comment("load_blk_vec2mem") - - //dest[0] = LOAD((const __m256i*)src); - LOAD(dst.Offset(YmmSize*0), src[0]) - //dest[1] = LOAD((const __m256i*)src + 1); - LOAD(dst.Offset(YmmSize*1), src[1]) -} func load_blk_mem2mem(dst Mem, src Mem) { Comment("load_blk_mem2mem") - tmp := YMM() //dest[0] = LOAD((const __m256i*)src); - LOAD(tmp, src) - LOAD(dst, tmp) //dest[1] = LOAD((const __m256i*)src + 1); - LOAD(tmp, src.Offset(YmmSize*1)) - LOAD(dst.Offset(YmmSize*1), tmp) + MemcpyStatic(dst, src, YmmSize*2, true) } // static INLINE void store_blk(__m256i* dest, const __m256i* src){ @@ -86,12 +74,12 @@ func store_blk(dst Mem, src []VecVirtual) { // static INLINE void load_msg_blk(LSH512AVX2_internal * i_state, const lsh_u64* msgblk){ func load_msg_blk(i_state LSH512AVX2_internal, msgblk Mem /* uint32 */) { //load_blk(i_state->submsg_e_l, msgblk + 0); - load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*8)) //load_blk(i_state->submsg_e_r, msgblk + 8); - load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*8)) //load_blk(i_state->submsg_o_l, msgblk + 16); - load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*8)) //load_blk(i_state->submsg_o_r, msgblk + 24); + load_blk_mem2vec(i_state.submsg_e_l, msgblk.Offset(0*8)) + load_blk_mem2vec(i_state.submsg_e_r, msgblk.Offset(8*8)) + load_blk_mem2vec(i_state.submsg_o_l, msgblk.Offset(16*8)) load_blk_mem2vec(i_state.submsg_o_r, msgblk.Offset(24*8)) } @@ -99,8 +87,7 @@ func msg_exp(dst, a VecVirtual, v int) { //i_state->submsg_e_l[0] = ADD(i_state->submsg_o_l[0], _mm256_permute4x64_epi64(i_state->submsg_e_l[0], 0x4b)); //dst = ADD(a , _mm256_permute4x64_epi64(dst , v )); - F_mm256_permute4x64_epi64(dst, dst, U8(v)) // dst = _mm256_permute4x64_epi64(dst, v) - ADD64(dst, dst, a) // dst = ADD(a, _mm256_permute4x64_epi64(dst, v)); + ADD64(dst, a, F_mm256_permute4x64_epi64(dst, dst, U8(v))) } // static INLINE void msg_exp_even(LSH512AVX2_internal * i_state, const __m256i perm_step){ @@ -182,9 +169,7 @@ func rotate_blk(dst VecVirtual, v int) { tmp := YMM() // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) - SHIFT_L64(tmp, dst, U8(v)) // tmp = SHIFT_L(dst, ROT_EVEN_ALPHA) - SHIFT_R64(dst, dst, U8(WORD_BIT_LEN-v)) // dst = SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA) - OR(dst, dst, tmp) // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) + OR(dst, SHIFT_L64(tmp, dst, U8(v)), SHIFT_R64(dst, dst, U8(WORD_BIT_LEN-v))) } // static INLINE void rotate_blk_even_alpha(__m256i* cv){ @@ -238,8 +223,8 @@ func rotate_blk_odd_beta(cv []VecVirtual) { // static INLINE void xor_with_const(__m256i* cv_l, const __m256i* const_v){ func xor_with_const(cv_l []VecVirtual, const_v []VecVirtual) { //cv_l[0] = XOR(cv_l[0], const_v[0]); - XOR(cv_l[0], cv_l[0], const_v[0]) //cv_l[1] = XOR(cv_l[1], 
const_v[1]); + XOR(cv_l[0], cv_l[0], const_v[0]) XOR(cv_l[1], cv_l[1], const_v[1]) } @@ -248,8 +233,8 @@ func rotate_msg_gamma(cv_r []VecVirtual, byte_perm_step []Mem) { Comment("rotate_msg_gamma") //cv_r[0] = SHUFFLE8(cv_r[0], byte_perm_step[0]); - SHUFFLE8(cv_r[0], cv_r[0], byte_perm_step[0]) //cv_r[1] = SHUFFLE8(cv_r[1], byte_perm_step[1]); + SHUFFLE8(cv_r[0], cv_r[0], byte_perm_step[0]) SHUFFLE8(cv_r[1], cv_r[1], byte_perm_step[1]) } @@ -259,24 +244,24 @@ func word_perm(cv_l, cv_r []VecVirtual) { //__m256i temp[2]; temp := []VecVirtual{YMM(), YMM()} //cv_l[0] = _mm256_permute4x64_epi64(cv_l[0], 0xd2); - F_mm256_permute4x64_epi64(cv_l[0], cv_l[0], U8(0xd2)) //cv_l[1] = _mm256_permute4x64_epi64(cv_l[1], 0xd2); - F_mm256_permute4x64_epi64(cv_l[1], cv_l[1], U8(0xd2)) //cv_r[0] = _mm256_permute4x64_epi64(cv_r[0], 0x6c); - F_mm256_permute4x64_epi64(cv_r[0], cv_r[0], U8(0x6c)) //cv_r[1] = _mm256_permute4x64_epi64(cv_r[1], 0x6c); + F_mm256_permute4x64_epi64(cv_l[0], cv_l[0], U8(0xd2)) + F_mm256_permute4x64_epi64(cv_l[1], cv_l[1], U8(0xd2)) + F_mm256_permute4x64_epi64(cv_r[0], cv_r[0], U8(0x6c)) F_mm256_permute4x64_epi64(cv_r[1], cv_r[1], U8(0x6c)) //temp[0] = cv_l[0]; - VMOVDQ_autoAU2(temp[0], cv_l[0]) //temp[1] = cv_r[0]; - VMOVDQ_autoAU2(temp[1], cv_r[0]) //cv_l[0] = cv_l[1]; - VMOVDQ_autoAU2(cv_l[0], cv_l[1]) //cv_l[1] = cv_r[1]; - VMOVDQ_autoAU2(cv_l[1], cv_r[1]) //cv_r[0] = temp[0]; - VMOVDQ_autoAU2(cv_r[0], temp[0]) //cv_r[1] = temp[1]; + VMOVDQ_autoAU2(temp[0], cv_l[0]) + VMOVDQ_autoAU2(temp[1], cv_r[0]) + VMOVDQ_autoAU2(cv_l[0], cv_l[1]) + VMOVDQ_autoAU2(cv_l[1], cv_r[1]) + VMOVDQ_autoAU2(cv_r[0], temp[0]) VMOVDQ_autoAU2(cv_r[1], temp[1]) } @@ -375,8 +360,8 @@ func init224(state *LSH512_Context) { Comment("init224") //load_blk(state->cv_l, g_IV224); - load_blk_mem2mem(state.Cv_l, G_IV224) //load_blk(state->cv_r, g_IV224 + 8); + load_blk_mem2mem(state.Cv_l, G_IV224) load_blk_mem2mem(state.Cv_r, G_IV224.Offset(8*8)) } @@ -385,8 +370,8 @@ func init256(state *LSH512_Context) { Comment("init256") //load_blk(state->cv_l, g_IV256); - load_blk_mem2mem(state.Cv_l, G_IV256) //load_blk(state->cv_r, g_IV256 + 8); + load_blk_mem2mem(state.Cv_l, G_IV256) load_blk_mem2mem(state.Cv_r, G_IV256.Offset(8*8)) } @@ -395,8 +380,8 @@ func init384(state *LSH512_Context) { Comment("init384") //load_blk(state->cv_l, g_IV384); - load_blk_mem2mem(state.Cv_l, G_IV384) //load_blk(state->cv_r, g_IV384 + 8); + load_blk_mem2mem(state.Cv_l, G_IV384) load_blk_mem2mem(state.Cv_r, G_IV384.Offset(8*8)) } @@ -405,8 +390,8 @@ func init512(state *LSH512_Context) { Comment("init512") //load_blk(state->cv_l, g_IV512); - load_blk_mem2mem(state.Cv_l, G_IV512) //load_blk(state->cv_r, g_IV512 + 8); + load_blk_mem2mem(state.Cv_l, G_IV512) load_blk_mem2mem(state.Cv_r, G_IV512.Offset(8*8)) } @@ -417,8 +402,8 @@ func fin(cv_l, cv_r []VecVirtual) { Comment("fin") //cv_l[0] = XOR(cv_l[0], cv_r[0]); - XOR(cv_l[0], cv_l[0], cv_r[0]) //cv_l[1] = XOR(cv_l[1], cv_r[1]); + XOR(cv_l[0], cv_l[0], cv_r[0]) XOR(cv_l[1], cv_l[1], cv_r[1]) } diff --git a/lsh512/avo/x86/lsh512sse2/lsh512_sse2.go b/lsh512/avo/x86/lsh512sse2/lsh512_sse2.go index 24b5837..663f5d9 100644 --- a/lsh512/avo/x86/lsh512sse2/lsh512_sse2.go +++ b/lsh512/avo/x86/lsh512sse2/lsh512_sse2.go @@ -44,7 +44,7 @@ func (ctx *LSH512SSE2_internal) load(v []VecVirtual, m Mem) { } func (ctx *LSH512SSE2_internal) save(v []VecVirtual, m Mem) { Comment("i_state_save___start") - load_blk_vec2mem(m, v) + store_blk(m, v) Comment("i_state_save___end") } @@ -57,42 +57,22 @@ func load_blk_mem2vec(dst 
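load_blk_mem2mem in the AVX2 file above and in the SSE2/SSSE3 files below now delegates to a shared MemcpyStatic helper instead of open-coding a bounce through a scratch register. MemcpyStatic's definition is not part of these hunks; assuming it reproduces exactly the copy it replaces, a hypothetical sketch (the final argument selecting YMM versus XMM scratch registers is inferred from the two call sites, MemcpyStatic(dst, src, YmmSize*2, true) and MemcpyStatic(dst, src, XmmSize*4, false)):

    // Hypothetical avoutil.MemcpyStatic: unroll a fixed-size memory-to-memory
    // copy through a scratch vector register, emitting the same MOVOU/VMOVDQU
    // pairs the inlined version used to produce.
    func MemcpyStatic(dst, src Mem, size int, useAVX bool) {
        if useAVX {
            for off := 0; off < size; off += YmmSize {
                tmp := YMM()
                VMOVDQ_autoAU2(tmp, src.Offset(off)) // tmp = 32 bytes of src
                VMOVDQ_autoAU2(dst.Offset(off), tmp) // store them to dst
            }
            return
        }
        for off := 0; off < size; off += XmmSize {
            tmp := XMM()
            MOVO_autoAU2(tmp, src.Offset(off)) // tmp = 16 bytes of src
            MOVO_autoAU2(dst.Offset(off), tmp) // store them to dst
        }
    }

Allocating a fresh virtual register per iteration, rather than reusing one tmp as the old inline code did, leaves avo's register allocator free to schedule the copies independently.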
[]VecVirtual, src Mem) { Comment("load_blk_mem2vec") //dest[0] = LOAD((const __m128i*)src); - LOAD(dst[0], src.Offset(XmmSize*0)) //dest[1] = LOAD((const __m128i*)src + 1); - LOAD(dst[1], src.Offset(XmmSize*1)) //dest[2] = LOAD((const __m128i*)src + 2); - LOAD(dst[2], src.Offset(XmmSize*2)) //dest[3] = LOAD((const __m128i*)src + 3); + LOAD(dst[0], src.Offset(XmmSize*0)) + LOAD(dst[1], src.Offset(XmmSize*1)) + LOAD(dst[2], src.Offset(XmmSize*2)) LOAD(dst[3], src.Offset(XmmSize*3)) } -func load_blk_vec2mem(dst Mem, src []VecVirtual) { - Comment("load_blk_vec2mem") - - //dest[0] = LOAD((const __m128i*)src); - LOAD(dst.Offset(XmmSize*0), src[0]) - //dest[1] = LOAD((const __m128i*)src + 1); - LOAD(dst.Offset(XmmSize*1), src[1]) - //dest[2] = LOAD((const __m128i*)src + 2); - LOAD(dst.Offset(XmmSize*2), src[2]) - //dest[3] = LOAD((const __m128i*)src + 3); - LOAD(dst.Offset(XmmSize*3), src[3]) -} func load_blk_mem2mem(dst Mem, src Mem) { Comment("load_blk_mem2mem") - tmp := XMM() //dest[0] = LOAD((const __m128i*)src); - LOAD(tmp, src) - LOAD(dst, tmp) //dest[1] = LOAD((const __m128i*)src + 1); - LOAD(tmp, src.Offset(XmmSize*1)) - LOAD(dst.Offset(XmmSize*1), tmp) //dest[2] = LOAD((const __m128i*)src + 2); - LOAD(tmp, src.Offset(XmmSize*2)) - LOAD(dst.Offset(XmmSize*2), tmp) //dest[3] = LOAD((const __m128i*)src + 3); - LOAD(tmp, src.Offset(XmmSize*3)) - LOAD(dst.Offset(XmmSize*3), tmp) + MemcpyStatic(dst, src, XmmSize*4, false) } // static INLINE void store_blk(__m128i* dest, const __m128i* src){ @@ -100,24 +80,24 @@ func store_blk(dst Mem, src []VecVirtual) { Comment("store_blk") //STORE(dest, src[0]); - STORE(dst, src[0]) //STORE(dest + 1, src[1]); - STORE(dst.Offset(XmmSize), src[1]) //STORE(dest + 2, src[2]); - STORE(dst.Offset(XmmSize*2), src[2]) //STORE(dest + 3, src[3]); + STORE(dst, src[0]) + STORE(dst.Offset(XmmSize), src[1]) + STORE(dst.Offset(XmmSize*2), src[2]) STORE(dst.Offset(XmmSize*3), src[3]) } // static INLINE void load_msg_blk(LSH512SSE2_internal * i_state, const lsh_u64* msgblk){ func load_msg_blk(i_state LSH512SSE2_internal, msgblk Mem /* uint32 */) { //load_blk(i_state->submsg_e_l, msgblk + 0); - load_blk_mem2mem(i_state.submsg_e_l_Mem, msgblk.Offset(0*8)) //load_blk(i_state->submsg_e_r, msgblk + 8); - load_blk_mem2mem(i_state.submsg_e_r_Mem, msgblk.Offset(8*8)) //load_blk(i_state->submsg_o_l, msgblk + 16); - load_blk_mem2mem(i_state.submsg_o_l_Mem, msgblk.Offset(16*8)) //load_blk(i_state->submsg_o_r, msgblk + 24); + load_blk_mem2mem(i_state.submsg_e_l_Mem, msgblk.Offset(0*8)) + load_blk_mem2mem(i_state.submsg_e_r_Mem, msgblk.Offset(8*8)) + load_blk_mem2mem(i_state.submsg_o_l_Mem, msgblk.Offset(16*8)) load_blk_mem2mem(i_state.submsg_o_r_Mem, msgblk.Offset(24*8)) } @@ -128,8 +108,6 @@ func msg_exp_even(i_state LSH512SSE2_internal) { } Comment("msg_exp_even") - ADD := ADD64_ - //__m128i temp; temp := XMM() @@ -159,13 +137,13 @@ func msg_exp_even(i_state LSH512SSE2_internal) { F_mm_unpackhi_epi64(i_state.submsg_e_l[3], temp, i_state.submsg_e_l[3]) //i_state->submsg_e_l[0] = ADD(i_state->submsg_o_l[0], i_state->submsg_e_l[0]); - ADD(i_state.submsg_e_l[0], i_state.submsg_o_l[0], i_state.submsg_e_l[0]) //i_state->submsg_e_l[1] = ADD(i_state->submsg_o_l[1], i_state->submsg_e_l[1]); - ADD(i_state.submsg_e_l[1], i_state.submsg_o_l[1], i_state.submsg_e_l[1]) //i_state->submsg_e_l[2] = ADD(i_state->submsg_o_l[2], i_state->submsg_e_l[2]); - ADD(i_state.submsg_e_l[2], i_state.submsg_o_l[2], i_state.submsg_e_l[2]) //i_state->submsg_e_l[3] = ADD(i_state->submsg_o_l[3], 
i_state->submsg_e_l[3]); - ADD(i_state.submsg_e_l[3], i_state.submsg_o_l[3], i_state.submsg_e_l[3]) + ADD64(i_state.submsg_e_l[0], i_state.submsg_o_l[0], i_state.submsg_e_l[0]) + ADD64(i_state.submsg_e_l[1], i_state.submsg_o_l[1], i_state.submsg_e_l[1]) + ADD64(i_state.submsg_e_l[2], i_state.submsg_o_l[2], i_state.submsg_e_l[2]) + ADD64(i_state.submsg_e_l[3], i_state.submsg_o_l[3], i_state.submsg_e_l[3]) i_state.save(i_state.submsg_e_l, i_state.submsg_e_l_Mem) @@ -195,13 +173,13 @@ func msg_exp_even(i_state LSH512SSE2_internal) { F_mm_unpackhi_epi64(i_state.submsg_e_r[3], temp, i_state.submsg_e_r[3]) //i_state->submsg_e_r[0] = ADD(i_state->submsg_o_r[0], i_state->submsg_e_r[0]); - ADD(i_state.submsg_e_r[0], i_state.submsg_o_r[0], i_state.submsg_e_r[0]) //i_state->submsg_e_r[1] = ADD(i_state->submsg_o_r[1], i_state->submsg_e_r[1]); - ADD(i_state.submsg_e_r[1], i_state.submsg_o_r[1], i_state.submsg_e_r[1]) //i_state->submsg_e_r[2] = ADD(i_state->submsg_o_r[2], i_state->submsg_e_r[2]); - ADD(i_state.submsg_e_r[2], i_state.submsg_o_r[2], i_state.submsg_e_r[2]) //i_state->submsg_e_r[3] = ADD(i_state->submsg_o_r[3], i_state->submsg_e_r[3]); - ADD(i_state.submsg_e_r[3], i_state.submsg_o_r[3], i_state.submsg_e_r[3]) + ADD64(i_state.submsg_e_r[0], i_state.submsg_o_r[0], i_state.submsg_e_r[0]) + ADD64(i_state.submsg_e_r[1], i_state.submsg_o_r[1], i_state.submsg_e_r[1]) + ADD64(i_state.submsg_e_r[2], i_state.submsg_o_r[2], i_state.submsg_e_r[2]) + ADD64(i_state.submsg_e_r[3], i_state.submsg_o_r[3], i_state.submsg_e_r[3]) i_state.save(i_state.submsg_e_r, i_state.submsg_e_r_Mem) } @@ -213,8 +191,6 @@ func msg_exp_odd(i_state LSH512SSE2_internal) { } Comment("msg_exp_odd") - ADD := ADD64_ - //__m128i temp; temp := XMM() @@ -247,10 +223,10 @@ func msg_exp_odd(i_state LSH512SSE2_internal) { //i_state->submsg_o_l[1] = ADD(i_state->submsg_e_l[1], i_state->submsg_o_l[1]); //i_state->submsg_o_l[2] = ADD(i_state->submsg_e_l[2], i_state->submsg_o_l[2]); //i_state->submsg_o_l[3] = ADD(i_state->submsg_e_l[3], i_state->submsg_o_l[3]); - ADD(i_state.submsg_o_l[0], i_state.submsg_e_l[0], i_state.submsg_o_l[0]) - ADD(i_state.submsg_o_l[1], i_state.submsg_e_l[1], i_state.submsg_o_l[1]) - ADD(i_state.submsg_o_l[2], i_state.submsg_e_l[2], i_state.submsg_o_l[2]) - ADD(i_state.submsg_o_l[3], i_state.submsg_e_l[3], i_state.submsg_o_l[3]) + ADD64(i_state.submsg_o_l[0], i_state.submsg_e_l[0], i_state.submsg_o_l[0]) + ADD64(i_state.submsg_o_l[1], i_state.submsg_e_l[1], i_state.submsg_o_l[1]) + ADD64(i_state.submsg_o_l[2], i_state.submsg_e_l[2], i_state.submsg_o_l[2]) + ADD64(i_state.submsg_o_l[3], i_state.submsg_e_l[3], i_state.submsg_o_l[3]) i_state.save(i_state.submsg_o_l, i_state.submsg_o_l_Mem) @@ -283,10 +259,10 @@ func msg_exp_odd(i_state LSH512SSE2_internal) { //i_state->submsg_o_r[1] = ADD(i_state->submsg_e_r[1], i_state->submsg_o_r[1]); //i_state->submsg_o_r[2] = ADD(i_state->submsg_e_r[2], i_state->submsg_o_r[2]); //i_state->submsg_o_r[3] = ADD(i_state->submsg_e_r[3], i_state->submsg_o_r[3]); - ADD(i_state.submsg_o_r[0], i_state.submsg_e_r[0], i_state.submsg_o_r[0]) - ADD(i_state.submsg_o_r[1], i_state.submsg_e_r[1], i_state.submsg_o_r[1]) - ADD(i_state.submsg_o_r[2], i_state.submsg_e_r[2], i_state.submsg_o_r[2]) - ADD(i_state.submsg_o_r[3], i_state.submsg_e_r[3], i_state.submsg_o_r[3]) + ADD64(i_state.submsg_o_r[0], i_state.submsg_e_r[0], i_state.submsg_o_r[0]) + ADD64(i_state.submsg_o_r[1], i_state.submsg_e_r[1], i_state.submsg_o_r[1]) + ADD64(i_state.submsg_o_r[2], i_state.submsg_e_r[2], i_state.submsg_o_r[2]) + 
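Dropping the local ADD := ADD64_ alias means the SSE2/SSSE3 files now call a three-operand ADD64(dst, a, b) with the same shape as the AVX2 one. SSE2's PADDQ is two-operand, so when dst does not already alias a the wrapper has to materialize a into dst first; a sketch, where the copy branch is an assumption about avoutil since every call site in these hunks passes dst == a:

    // Hypothetical three-operand ADD64 built on two-operand PADDQ.
    func ADD64(dst VecVirtual, a, b Op) Op {
        if dst != a {
            MOVO_autoAU2(dst, a) // dst = a (never taken at these call sites)
        }
        PADDQ(b, dst) // Plan 9 operand order: dst = dst + b
        return dst
    }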
ADD64(i_state.submsg_o_r[3], i_state.submsg_e_r[3], i_state.submsg_o_r[3]) i_state.save(i_state.submsg_o_r, i_state.submsg_o_r_Mem) } @@ -314,16 +290,16 @@ func msg_add_even(cv_l, cv_r []VecVirtual, i_state LSH512SSE2_internal, tempVec //cv_r[3] = XOR(cv_r[3], i_state->submsg_e_r[3]); load_blk_mem2vec(tempVec, i_state.submsg_e_l_Mem) - XOR(cv_l[0], tempVec[0]) - XOR(cv_l[1], tempVec[1]) - XOR(cv_l[2], tempVec[2]) - XOR(cv_l[3], tempVec[3]) + XOR(cv_l[0], cv_l[0], tempVec[0]) + XOR(cv_l[1], cv_l[1], tempVec[1]) + XOR(cv_l[2], cv_l[2], tempVec[2]) + XOR(cv_l[3], cv_l[3], tempVec[3]) load_blk_mem2vec(tempVec, i_state.submsg_e_r_Mem) - XOR(cv_r[0], tempVec[0]) - XOR(cv_r[1], tempVec[1]) - XOR(cv_r[2], tempVec[2]) - XOR(cv_r[3], tempVec[3]) + XOR(cv_r[0], cv_r[0], tempVec[0]) + XOR(cv_r[1], cv_r[1], tempVec[1]) + XOR(cv_r[2], cv_r[2], tempVec[2]) + XOR(cv_r[3], cv_r[3], tempVec[3]) } // static INLINE void msg_add_odd(__m128i* cv_l, __m128i* cv_r, const LSH512SSE2_internal * i_state){ @@ -340,16 +316,16 @@ func msg_add_odd(cv_l, cv_r []VecVirtual, i_state LSH512SSE2_internal, tempVec [ //cv_r[3] = XOR(cv_r[3], i_state->submsg_o_r[3]); load_blk_mem2vec(tempVec, i_state.submsg_o_l_Mem) - XOR(cv_l[0], tempVec[0]) - XOR(cv_l[1], tempVec[1]) - XOR(cv_l[2], tempVec[2]) - XOR(cv_l[3], tempVec[3]) + XOR(cv_l[0], cv_l[0], tempVec[0]) + XOR(cv_l[1], cv_l[1], tempVec[1]) + XOR(cv_l[2], cv_l[2], tempVec[2]) + XOR(cv_l[3], cv_l[3], tempVec[3]) load_blk_mem2vec(tempVec, i_state.submsg_o_r_Mem) - XOR(cv_r[0], tempVec[0]) - XOR(cv_r[1], tempVec[1]) - XOR(cv_r[2], tempVec[2]) - XOR(cv_r[3], tempVec[3]) + XOR(cv_r[0], cv_r[0], tempVec[0]) + XOR(cv_r[1], cv_r[1], tempVec[1]) + XOR(cv_r[2], cv_r[2], tempVec[2]) + XOR(cv_r[3], cv_r[3], tempVec[3]) } // static INLINE void add_blk(__m128i* cv_l, const __m128i* cv_r){ @@ -357,23 +333,21 @@ func add_blk(cv_l, cv_r []VecVirtual) { Comment("add_blk") //cv_l[0] = ADD(cv_l[0], cv_r[0]); - ADD64(cv_l[0], cv_r[0]) //cv_l[1] = ADD(cv_l[1], cv_r[1]); - ADD64(cv_l[1], cv_r[1]) //cv_l[2] = ADD(cv_l[2], cv_r[2]); - ADD64(cv_l[2], cv_r[2]) //cv_l[3] = ADD(cv_l[3], cv_r[3]); - ADD64(cv_l[3], cv_r[3]) + ADD64(cv_l[0], cv_l[0], cv_r[0]) + ADD64(cv_l[1], cv_l[1], cv_r[1]) + ADD64(cv_l[2], cv_l[2], cv_r[2]) + ADD64(cv_l[3], cv_l[3], cv_r[3]) } func rotate_blk(dst VecVirtual, v int) { - tmpXmm := XMM() + tmp := XMM() - // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) - MOVO_autoAU2(tmpXmm, dst) // tmpXmm = dst - SHIFT_L64(tmpXmm, U8(v)) // tmpXmm = SHIFT_L(dst, ROT_EVEN_ALPHA) - SHIFT_R64(dst, U8(WORD_BIT_LEN-v)) // dst = SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA) - OR(dst, tmpXmm) // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) + // cv[0] = OR(SHIFT_L(cv[0], ROT_EVEN_ALPHA), SHIFT_R(cv[0], WORD_BIT_LEN - ROT_EVEN_ALPHA)); + // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) + OR(dst, SHIFT_L64(tmp, dst, U8(v)), SHIFT_R64(dst, dst, U8(WORD_BIT_LEN-v))) } // static INLINE void rotate_blk_even_alpha(__m128i* cv){ @@ -435,13 +409,13 @@ func rotate_blk_odd_beta(cv []VecVirtual) { // static INLINE void xor_with_const(__m128i* cv_l, const __m128i* const_v){ func xor_with_const(cv_l []VecVirtual, const_v []VecVirtual) { //cv_l[0] = XOR(cv_l[0], const_v[0]); - XOR(cv_l[0], const_v[0]) //cv_l[1] = XOR(cv_l[1], const_v[1]); - XOR(cv_l[1], const_v[1]) //cv_l[2] = XOR(cv_l[2], const_v[2]); - XOR(cv_l[2], const_v[2]) //cv_l[3] = XOR(cv_l[3], const_v[3]); - XOR(cv_l[3], const_v[3]) + 
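SSE2 has no vector rotate instruction, so the rewritten rotate_blk above synthesizes one: the left shift lands in a scratch register, the right shift clobbers dst in place, and OR merges them. Per 64-bit lane this is the standard rotate identity (WORD_BIT_LEN is 64 here):

    // Scalar reference for what the one-line rotate_blk computes per lane:
    // a left rotate of x by v bits.
    func rotl64(x uint64, v uint) uint64 {
        return x<<v | x>>(64-v)
    }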
XOR(cv_l[0], cv_l[0], const_v[0]) + XOR(cv_l[1], cv_l[1], const_v[1]) + XOR(cv_l[2], cv_l[2], const_v[2]) + XOR(cv_l[3], cv_l[3], const_v[3]) } // static INLINE void rotate_msg_gamma(__m128i* cv_r){ @@ -459,23 +433,21 @@ func rotate_msg_gamma(cv_r []VecVirtual) { //temp = _mm_xor_si128(_mm_slli_epi64(temp, 16), _mm_srli_epi64(temp, 48));\ //cv_r[0] = _mm_xor_si128(cv_r[0], temp);\ - MOVO_autoAU2(temp, g_BytePermInfo) //temp = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0, 0x0) - F_mm_and_si128(temp, cv_r[idx]) //temp = _mm_and_si128(cv_r[0], _mm_set_epi32(0xffffffff, 0xffffffff, 0x0, 0x0)) + //temp = _mm_and_si128(cv_r[0], _mm_set_epi32(0xffffffff, 0xffffffff, 0x0, 0x0)) + F_mm_and_si128(temp, cv_r[idx], g_BytePermInfo) - F_mm_and_si128(cv_r[idx], g_BytePermInfo.Offset(XmmSize)) //cv_r[0] = _mm_and_si128(cv_r[0], _mm_set_epi32(0x0, 0x0, 0xffffffff, 0xffffffff)) + //cv_r[0] = _mm_and_si128(cv_r[0], _mm_set_epi32(0x0, 0x0, 0xffffffff, 0xffffffff)) + F_mm_and_si128(cv_r[idx], cv_r[idx], g_BytePermInfo.Offset(XmmSize)) - MOVO_autoAU2(__tmp, temp) // __tmp = temp - F_mm_srli_epi64(__tmp, I8(48)) // (__tmp =) _mm_srli_epi64(temp, 48)) - F_mm_slli_epi64(temp, I8(16)) // temp = _mm_slli_epi64(temp, 16) - F_mm_xor_si128(temp, __tmp) // temp = _mm_xor_si128(_mm_slli_epi64(temp, 16), _mm_srli_epi64(temp, 48)) + // temp = _mm_xor_si128(_mm_slli_epi64(temp, 16), _mm_srli_epi64(temp, 48)) + F_mm_xor_si128(temp, F_mm_slli_epi64(__tmp, temp, I8(16)), F_mm_srli_epi64(temp, temp, I8(48))) - F_mm_xor_si128(cv_r[idx], temp) //cv_r[0] = _mm_xor_si128(cv_r[0], temp); + //cv_r[0] = _mm_xor_si128(cv_r[0], temp); + F_mm_xor_si128(cv_r[idx], cv_r[idx], temp) } step2 := func(idx, slli, srli int) { - MOVO_autoAU2(temp, cv_r[idx]) // temp = cv_r[1] - F_mm_srli_epi64(temp, I8(srli)) // temp = _mm_srli_epi64(cv_r[1], 32)) - F_mm_slli_epi64(cv_r[idx], I8(slli)) // cv_r[1] = _mm_slli_epi64(cv_r[1], 32) - F_mm_xor_si128(cv_r[idx], temp) // cv_r[1] = _mm_xor_si128(_mm_slli_epi64(cv_r[1], 32), _mm_srli_epi64(cv_r[1], 32)) + // cv_r[1] = _mm_xor_si128(_mm_slli_epi64(cv_r[1], 32), _mm_srli_epi64(cv_r[1], 32)) + F_mm_xor_si128(cv_r[idx], F_mm_slli_epi64(temp, cv_r[idx], I8(slli)), F_mm_srli_epi64(cv_r[idx], cv_r[idx], I8(srli))) } //temp = _mm_and_si128(cv_r[0], _mm_set_epi32(0xffffffff, 0xffffffff, 0x0, 0x0));\ @@ -621,8 +593,8 @@ func compress(cv_l, cv_r []VecVirtual, pdMsgBlk Mem, i_state_alloc, cv_l_mem, cv save := func() { Comment("save___start") - load_blk_vec2mem(cv_l_mem, cv_l) - load_blk_vec2mem(cv_r_mem, cv_r) + store_blk(cv_l_mem, cv_l) + store_blk(cv_r_mem, cv_r) Comment("save___end") } load := func() { @@ -717,13 +689,13 @@ func fin(cv_l, cv_r []VecVirtual) { Comment("fin") //cv_l[0] = XOR(cv_l[0], cv_r[0]); - XOR(cv_l[0], cv_r[0]) //cv_l[1] = XOR(cv_l[1], cv_r[1]); - XOR(cv_l[1], cv_r[1]) //cv_l[2] = XOR(cv_l[2], cv_r[2]); - XOR(cv_l[2], cv_r[2]) //cv_l[3] = XOR(cv_l[3], cv_r[3]); - XOR(cv_l[3], cv_r[3]) + XOR(cv_l[0], cv_l[0], cv_r[0]) + XOR(cv_l[1], cv_l[1], cv_r[1]) + XOR(cv_l[2], cv_l[2], cv_r[2]) + XOR(cv_l[3], cv_l[3], cv_r[3]) } /* -------------------------------------------------------- */ diff --git a/lsh512/avo/x86/lsh512ssse3/lsh512_ssse3.go b/lsh512/avo/x86/lsh512ssse3/lsh512_ssse3.go index a340a4d..8bce183 100644 --- a/lsh512/avo/x86/lsh512ssse3/lsh512_ssse3.go +++ b/lsh512/avo/x86/lsh512ssse3/lsh512_ssse3.go @@ -44,7 +44,7 @@ func (ctx *LSH512SSSE3_internal) load(v []VecVirtual, m Mem) { } func (ctx *LSH512SSSE3_internal) save(v []VecVirtual, m Mem) { Comment("i_state_save___start") - load_blk_vec2mem(m, 
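The reworked rotate_msg_gamma above still has to emulate SSSE3's byte shuffle on plain SSE2. Reading the two halves of g_BytePermInfo_sse2 as high-lane and low-lane masks, step2 rotates both 64-bit lanes of a register by the same bit count, and the masked step then gives the high lane an extra 16-bit rotate. A scalar sketch of the combined effect on one cv_r register, under that reading:

    // One cv_r register holds two 64-bit words whose gamma rotations differ
    // by 16 bits (for example 32 and 48). Both lanes rotate by slli, then
    // the mask trick rotates only the high lane by a further 16 bits.
    func gammaPair(lo, hi uint64, slli uint) (uint64, uint64) {
        if slli != 0 { // step2: whole-register rotate
            lo = lo<<slli | lo>>(64-slli)
            hi = hi<<slli | hi>>(64-slli)
        }
        hi = hi<<16 | hi>>48 // step: masked extra rotate on the high lane
        return lo, hi
    }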
v) + store_blk(m, v) Comment("i_state_save___end") } @@ -57,42 +57,22 @@ func load_blk_mem2vec(dst []VecVirtual, src Mem) { Comment("load_blk_mem2vec") //dest[0] = LOAD((const __m128i*)src); - LOAD(dst[0], src.Offset(XmmSize*0)) //dest[1] = LOAD((const __m128i*)src + 1); - LOAD(dst[1], src.Offset(XmmSize*1)) //dest[2] = LOAD((const __m128i*)src + 2); - LOAD(dst[2], src.Offset(XmmSize*2)) //dest[3] = LOAD((const __m128i*)src + 3); + LOAD(dst[0], src.Offset(XmmSize*0)) + LOAD(dst[1], src.Offset(XmmSize*1)) + LOAD(dst[2], src.Offset(XmmSize*2)) LOAD(dst[3], src.Offset(XmmSize*3)) } -func load_blk_vec2mem(dst Mem, src []VecVirtual) { - Comment("load_blk_vec2mem") - - //dest[0] = LOAD((const __m128i*)src); - LOAD(dst.Offset(XmmSize*0), src[0]) - //dest[1] = LOAD((const __m128i*)src + 1); - LOAD(dst.Offset(XmmSize*1), src[1]) - //dest[2] = LOAD((const __m128i*)src + 2); - LOAD(dst.Offset(XmmSize*2), src[2]) - //dest[3] = LOAD((const __m128i*)src + 3); - LOAD(dst.Offset(XmmSize*3), src[3]) -} func load_blk_mem2mem(dst Mem, src Mem) { Comment("load_blk_mem2mem") - tmp := XMM() //dest[0] = LOAD((const __m128i*)src); - LOAD(tmp, src) - LOAD(dst, tmp) //dest[1] = LOAD((const __m128i*)src + 1); - LOAD(tmp, src.Offset(XmmSize*1)) - LOAD(dst.Offset(XmmSize*1), tmp) //dest[2] = LOAD((const __m128i*)src + 2); - LOAD(tmp, src.Offset(XmmSize*2)) - LOAD(dst.Offset(XmmSize*2), tmp) //dest[3] = LOAD((const __m128i*)src + 3); - LOAD(tmp, src.Offset(XmmSize*3)) - LOAD(dst.Offset(XmmSize*3), tmp) + MemcpyStatic(dst, src, XmmSize*4, false) } // static INLINE void store_blk(__m128i* dest, const __m128i* src){ @@ -100,24 +80,24 @@ func store_blk(dst Mem, src []VecVirtual) { Comment("store_blk") //STORE(dest, src[0]); - STORE(dst, src[0]) //STORE(dest + 1, src[1]); - STORE(dst.Offset(XmmSize), src[1]) //STORE(dest + 2, src[2]); - STORE(dst.Offset(XmmSize*2), src[2]) //STORE(dest + 3, src[3]); + STORE(dst, src[0]) + STORE(dst.Offset(XmmSize), src[1]) + STORE(dst.Offset(XmmSize*2), src[2]) STORE(dst.Offset(XmmSize*3), src[3]) } // static INLINE void load_msg_blk(LSH512SSSE3_internal * i_state, const lsh_u64* msgblk){ func load_msg_blk(i_state LSH512SSSE3_internal, msgblk Mem /* uint32 */) { //load_blk(i_state->submsg_e_l, msgblk + 0); - load_blk_mem2mem(i_state.submsg_e_l_Mem, msgblk.Offset(0*8)) //load_blk(i_state->submsg_e_r, msgblk + 8); - load_blk_mem2mem(i_state.submsg_e_r_Mem, msgblk.Offset(8*8)) //load_blk(i_state->submsg_o_l, msgblk + 16); - load_blk_mem2mem(i_state.submsg_o_l_Mem, msgblk.Offset(16*8)) //load_blk(i_state->submsg_o_r, msgblk + 24); + load_blk_mem2mem(i_state.submsg_e_l_Mem, msgblk.Offset(0*8)) + load_blk_mem2mem(i_state.submsg_e_r_Mem, msgblk.Offset(8*8)) + load_blk_mem2mem(i_state.submsg_o_l_Mem, msgblk.Offset(16*8)) load_blk_mem2mem(i_state.submsg_o_r_Mem, msgblk.Offset(24*8)) } @@ -128,8 +108,6 @@ func msg_exp_even(i_state LSH512SSSE3_internal) { } Comment("msg_exp_even") - ADD := ADD64_ - //__m128i temp; temp := XMM() @@ -159,13 +137,13 @@ func msg_exp_even(i_state LSH512SSSE3_internal) { F_mm_unpackhi_epi64(i_state.submsg_e_l[3], temp, i_state.submsg_e_l[3]) //i_state->submsg_e_l[0] = ADD(i_state->submsg_o_l[0], i_state->submsg_e_l[0]); - ADD(i_state.submsg_e_l[0], i_state.submsg_o_l[0], i_state.submsg_e_l[0]) //i_state->submsg_e_l[1] = ADD(i_state->submsg_o_l[1], i_state->submsg_e_l[1]); - ADD(i_state.submsg_e_l[1], i_state.submsg_o_l[1], i_state.submsg_e_l[1]) //i_state->submsg_e_l[2] = ADD(i_state->submsg_o_l[2], i_state->submsg_e_l[2]); - ADD(i_state.submsg_e_l[2], 
i_state.submsg_o_l[2], i_state.submsg_e_l[2]) //i_state->submsg_e_l[3] = ADD(i_state->submsg_o_l[3], i_state->submsg_e_l[3]); - ADD(i_state.submsg_e_l[3], i_state.submsg_o_l[3], i_state.submsg_e_l[3]) + ADD64(i_state.submsg_e_l[0], i_state.submsg_o_l[0], i_state.submsg_e_l[0]) + ADD64(i_state.submsg_e_l[1], i_state.submsg_o_l[1], i_state.submsg_e_l[1]) + ADD64(i_state.submsg_e_l[2], i_state.submsg_o_l[2], i_state.submsg_e_l[2]) + ADD64(i_state.submsg_e_l[3], i_state.submsg_o_l[3], i_state.submsg_e_l[3]) i_state.save(i_state.submsg_e_l, i_state.submsg_e_l_Mem) @@ -195,13 +173,13 @@ func msg_exp_even(i_state LSH512SSSE3_internal) { F_mm_unpackhi_epi64(i_state.submsg_e_r[3], temp, i_state.submsg_e_r[3]) //i_state->submsg_e_r[0] = ADD(i_state->submsg_o_r[0], i_state->submsg_e_r[0]); - ADD(i_state.submsg_e_r[0], i_state.submsg_o_r[0], i_state.submsg_e_r[0]) //i_state->submsg_e_r[1] = ADD(i_state->submsg_o_r[1], i_state->submsg_e_r[1]); - ADD(i_state.submsg_e_r[1], i_state.submsg_o_r[1], i_state.submsg_e_r[1]) //i_state->submsg_e_r[2] = ADD(i_state->submsg_o_r[2], i_state->submsg_e_r[2]); - ADD(i_state.submsg_e_r[2], i_state.submsg_o_r[2], i_state.submsg_e_r[2]) //i_state->submsg_e_r[3] = ADD(i_state->submsg_o_r[3], i_state->submsg_e_r[3]); - ADD(i_state.submsg_e_r[3], i_state.submsg_o_r[3], i_state.submsg_e_r[3]) + ADD64(i_state.submsg_e_r[0], i_state.submsg_o_r[0], i_state.submsg_e_r[0]) + ADD64(i_state.submsg_e_r[1], i_state.submsg_o_r[1], i_state.submsg_e_r[1]) + ADD64(i_state.submsg_e_r[2], i_state.submsg_o_r[2], i_state.submsg_e_r[2]) + ADD64(i_state.submsg_e_r[3], i_state.submsg_o_r[3], i_state.submsg_e_r[3]) i_state.save(i_state.submsg_e_r, i_state.submsg_e_r_Mem) } @@ -213,8 +191,6 @@ func msg_exp_odd(i_state LSH512SSSE3_internal) { } Comment("msg_exp_odd") - ADD := ADD64_ - //__m128i temp; temp := XMM() @@ -247,10 +223,10 @@ func msg_exp_odd(i_state LSH512SSSE3_internal) { //i_state->submsg_o_l[1] = ADD(i_state->submsg_e_l[1], i_state->submsg_o_l[1]); //i_state->submsg_o_l[2] = ADD(i_state->submsg_e_l[2], i_state->submsg_o_l[2]); //i_state->submsg_o_l[3] = ADD(i_state->submsg_e_l[3], i_state->submsg_o_l[3]); - ADD(i_state.submsg_o_l[0], i_state.submsg_e_l[0], i_state.submsg_o_l[0]) - ADD(i_state.submsg_o_l[1], i_state.submsg_e_l[1], i_state.submsg_o_l[1]) - ADD(i_state.submsg_o_l[2], i_state.submsg_e_l[2], i_state.submsg_o_l[2]) - ADD(i_state.submsg_o_l[3], i_state.submsg_e_l[3], i_state.submsg_o_l[3]) + ADD64(i_state.submsg_o_l[0], i_state.submsg_e_l[0], i_state.submsg_o_l[0]) + ADD64(i_state.submsg_o_l[1], i_state.submsg_e_l[1], i_state.submsg_o_l[1]) + ADD64(i_state.submsg_o_l[2], i_state.submsg_e_l[2], i_state.submsg_o_l[2]) + ADD64(i_state.submsg_o_l[3], i_state.submsg_e_l[3], i_state.submsg_o_l[3]) i_state.save(i_state.submsg_o_l, i_state.submsg_o_l_Mem) @@ -283,10 +259,10 @@ func msg_exp_odd(i_state LSH512SSSE3_internal) { //i_state->submsg_o_r[1] = ADD(i_state->submsg_e_r[1], i_state->submsg_o_r[1]); //i_state->submsg_o_r[2] = ADD(i_state->submsg_e_r[2], i_state->submsg_o_r[2]); //i_state->submsg_o_r[3] = ADD(i_state->submsg_e_r[3], i_state->submsg_o_r[3]); - ADD(i_state.submsg_o_r[0], i_state.submsg_e_r[0], i_state.submsg_o_r[0]) - ADD(i_state.submsg_o_r[1], i_state.submsg_e_r[1], i_state.submsg_o_r[1]) - ADD(i_state.submsg_o_r[2], i_state.submsg_e_r[2], i_state.submsg_o_r[2]) - ADD(i_state.submsg_o_r[3], i_state.submsg_e_r[3], i_state.submsg_o_r[3]) + ADD64(i_state.submsg_o_r[0], i_state.submsg_e_r[0], i_state.submsg_o_r[0]) + ADD64(i_state.submsg_o_r[1], 
i_state.submsg_e_r[1], i_state.submsg_o_r[1]) + ADD64(i_state.submsg_o_r[2], i_state.submsg_e_r[2], i_state.submsg_o_r[2]) + ADD64(i_state.submsg_o_r[3], i_state.submsg_e_r[3], i_state.submsg_o_r[3]) i_state.save(i_state.submsg_o_r, i_state.submsg_o_r_Mem) } @@ -314,16 +290,16 @@ func msg_add_even(cv_l, cv_r []VecVirtual, i_state LSH512SSSE3_internal, tempVec //cv_r[3] = XOR(cv_r[3], i_state->submsg_e_r[3]); load_blk_mem2vec(tempVec, i_state.submsg_e_l_Mem) - XOR(cv_l[0], tempVec[0]) - XOR(cv_l[1], tempVec[1]) - XOR(cv_l[2], tempVec[2]) - XOR(cv_l[3], tempVec[3]) + XOR(cv_l[0], cv_l[0], tempVec[0]) + XOR(cv_l[1], cv_l[1], tempVec[1]) + XOR(cv_l[2], cv_l[2], tempVec[2]) + XOR(cv_l[3], cv_l[3], tempVec[3]) load_blk_mem2vec(tempVec, i_state.submsg_e_r_Mem) - XOR(cv_r[0], tempVec[0]) - XOR(cv_r[1], tempVec[1]) - XOR(cv_r[2], tempVec[2]) - XOR(cv_r[3], tempVec[3]) + XOR(cv_r[0], cv_r[0], tempVec[0]) + XOR(cv_r[1], cv_r[1], tempVec[1]) + XOR(cv_r[2], cv_r[2], tempVec[2]) + XOR(cv_r[3], cv_r[3], tempVec[3]) } // static INLINE void msg_add_odd(__m128i* cv_l, __m128i* cv_r, const LSH512SSSE3_internal * i_state){ @@ -340,16 +316,16 @@ func msg_add_odd(cv_l, cv_r []VecVirtual, i_state LSH512SSSE3_internal, tempVec //cv_r[3] = XOR(cv_r[3], i_state->submsg_o_r[3]); load_blk_mem2vec(tempVec, i_state.submsg_o_l_Mem) - XOR(cv_l[0], tempVec[0]) - XOR(cv_l[1], tempVec[1]) - XOR(cv_l[2], tempVec[2]) - XOR(cv_l[3], tempVec[3]) + XOR(cv_l[0], cv_l[0], tempVec[0]) + XOR(cv_l[1], cv_l[1], tempVec[1]) + XOR(cv_l[2], cv_l[2], tempVec[2]) + XOR(cv_l[3], cv_l[3], tempVec[3]) load_blk_mem2vec(tempVec, i_state.submsg_o_r_Mem) - XOR(cv_r[0], tempVec[0]) - XOR(cv_r[1], tempVec[1]) - XOR(cv_r[2], tempVec[2]) - XOR(cv_r[3], tempVec[3]) + XOR(cv_r[0], cv_r[0], tempVec[0]) + XOR(cv_r[1], cv_r[1], tempVec[1]) + XOR(cv_r[2], cv_r[2], tempVec[2]) + XOR(cv_r[3], cv_r[3], tempVec[3]) } // static INLINE void add_blk(__m128i* cv_l, const __m128i* cv_r){ @@ -357,23 +333,20 @@ func add_blk(cv_l, cv_r []VecVirtual) { Comment("add_blk") //cv_l[0] = ADD(cv_l[0], cv_r[0]); - ADD64(cv_l[0], cv_r[0]) //cv_l[1] = ADD(cv_l[1], cv_r[1]); - ADD64(cv_l[1], cv_r[1]) //cv_l[2] = ADD(cv_l[2], cv_r[2]); - ADD64(cv_l[2], cv_r[2]) //cv_l[3] = ADD(cv_l[3], cv_r[3]); - ADD64(cv_l[3], cv_r[3]) + ADD64(cv_l[0], cv_l[0], cv_r[0]) + ADD64(cv_l[1], cv_l[1], cv_r[1]) + ADD64(cv_l[2], cv_l[2], cv_r[2]) + ADD64(cv_l[3], cv_l[3], cv_r[3]) } func rotate_blk(dst VecVirtual, v int) { - tmpXmm := XMM() + tmp := XMM() // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) - MOVO_autoAU2(tmpXmm, dst) // tmpXmm = dst - SHIFT_L64(tmpXmm, U8(v)) // tmpXmm = SHIFT_L(dst, ROT_EVEN_ALPHA) - SHIFT_R64(dst, U8(WORD_BIT_LEN-v)) // dst = SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA) - OR(dst, tmpXmm) // dst = OR(SHIFT_L(dst, ROT_EVEN_ALPHA), SHIFT_R(dst, WORD_BIT_LEN - ROT_EVEN_ALPHA)) + OR(dst, SHIFT_L64(tmp, dst, U8(v)), SHIFT_R64(dst, dst, U8(WORD_BIT_LEN-v))) } // static INLINE void rotate_blk_even_alpha(__m128i* cv){ @@ -435,13 +408,13 @@ func rotate_blk_odd_beta(cv []VecVirtual) { // static INLINE void xor_with_const(__m128i* cv_l, const __m128i* const_v){ func xor_with_const(cv_l []VecVirtual, const_v []VecVirtual) { //cv_l[0] = XOR(cv_l[0], const_v[0]); - XOR(cv_l[0], const_v[0]) //cv_l[1] = XOR(cv_l[1], const_v[1]); - XOR(cv_l[1], const_v[1]) //cv_l[2] = XOR(cv_l[2], const_v[2]); - XOR(cv_l[2], const_v[2]) //cv_l[3] = XOR(cv_l[3], const_v[3]); - XOR(cv_l[3], const_v[3]) + XOR(cv_l[0], cv_l[0], const_v[0]) + XOR(cv_l[1], cv_l[1], 
const_v[1]) + XOR(cv_l[2], cv_l[2], const_v[2]) + XOR(cv_l[3], cv_l[3], const_v[3]) } // static INLINE void rotate_msg_gamma(__m128i* cv_r, const __m128i * perm_step){ @@ -449,13 +422,13 @@ func rotate_msg_gamma(cv_r []VecVirtual, perm_step []Mem) { Comment("rotate_msg_gamma") //cv_r[0] = SHUFFLE8(cv_r[0], perm_step[0]); - SHUFFLE8(cv_r[0], perm_step[0]) //cv_r[1] = SHUFFLE8(cv_r[1], perm_step[1]); - SHUFFLE8(cv_r[1], perm_step[1]) //cv_r[2] = SHUFFLE8(cv_r[2], perm_step[2]); - SHUFFLE8(cv_r[2], perm_step[2]) //cv_r[3] = SHUFFLE8(cv_r[3], perm_step[3]); - SHUFFLE8(cv_r[3], perm_step[3]) + SHUFFLE8(cv_r[0], cv_r[0], perm_step[0]) + SHUFFLE8(cv_r[1], cv_r[1], perm_step[1]) + SHUFFLE8(cv_r[2], cv_r[2], perm_step[2]) + SHUFFLE8(cv_r[3], cv_r[3], perm_step[3]) } // static INLINE void word_perm(__m128i* cv_l, __m128i* cv_r){ @@ -578,8 +551,8 @@ func compress(cv_l, cv_r []VecVirtual, pdMsgBlk Mem, i_state_alloc, cv_l_mem, cv save := func() { Comment("save___start") - load_blk_vec2mem(cv_l_mem, cv_l) - load_blk_vec2mem(cv_r_mem, cv_r) + store_blk(cv_l_mem, cv_l) + store_blk(cv_r_mem, cv_r) Comment("save___end") } load := func() { @@ -674,13 +647,13 @@ func fin(cv_l, cv_r []VecVirtual) { Comment("fin") //cv_l[0] = XOR(cv_l[0], cv_r[0]); - XOR(cv_l[0], cv_r[0]) //cv_l[1] = XOR(cv_l[1], cv_r[1]); - XOR(cv_l[1], cv_r[1]) //cv_l[2] = XOR(cv_l[2], cv_r[2]); - XOR(cv_l[2], cv_r[2]) //cv_l[3] = XOR(cv_l[3], cv_r[3]); - XOR(cv_l[3], cv_r[3]) + XOR(cv_l[0], cv_l[0], cv_r[0]) + XOR(cv_l[1], cv_l[1], cv_r[1]) + XOR(cv_l[2], cv_l[2], cv_r[2]) + XOR(cv_l[3], cv_l[3], cv_r[3]) } /* -------------------------------------------------------- */ diff --git a/lsh512/lsh512_amd64.s b/lsh512/lsh512_amd64.s index 9975814..08ed02c 100644 --- a/lsh512/lsh512_amd64.s +++ b/lsh512/lsh512_amd64.s @@ -351,6 +351,7 @@ TEXT ·lsh512InitSSE2(SB), NOSPLIT, $0-8 // init512 // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV512<>+0(SB), X0 MOVOU X0, 16(AX) MOVOA g_IV512<>+16(SB), X0 @@ -361,6 +362,7 @@ TEXT ·lsh512InitSSE2(SB), NOSPLIT, $0-8 MOVOU X0, 64(AX) // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV512<>+64(SB), X0 MOVOU X0, 80(AX) MOVOA g_IV512<>+80(SB), X0 @@ -377,6 +379,7 @@ lsh512_sse2_init_if0_end: // init384 // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV384<>+0(SB), X0 MOVOU X0, 16(AX) MOVOA g_IV384<>+16(SB), X0 @@ -387,6 +390,7 @@ lsh512_sse2_init_if0_end: MOVOU X0, 64(AX) // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV384<>+64(SB), X0 MOVOU X0, 80(AX) MOVOA g_IV384<>+80(SB), X0 @@ -403,6 +407,7 @@ lsh512_sse2_init_if1_end: // init256 // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV256<>+0(SB), X0 MOVOU X0, 16(AX) MOVOA g_IV256<>+16(SB), X0 @@ -413,6 +418,7 @@ lsh512_sse2_init_if1_end: MOVOU X0, 64(AX) // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV256<>+64(SB), X0 MOVOU X0, 80(AX) MOVOA g_IV256<>+80(SB), X0 @@ -426,6 +432,7 @@ lsh512_sse2_init_if1_end: lsh512_sse2_init_if2_end: // init224 // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV224<>+0(SB), X0 MOVOU X0, 16(AX) MOVOA g_IV224<>+16(SB), X0 @@ -436,6 +443,7 @@ lsh512_sse2_init_if2_end: MOVOU X0, 64(AX) // load_blk_mem2mem + // MemcpyStatic MOVOA g_IV224<>+64(SB), X0 MOVOU X0, 80(AX) MOVOA g_IV224<>+80(SB), X0 @@ -609,6 +617,7 @@ memcpy_2_sz1_start: memcpy_2_sz1_end: // compress // load_blk_mem2mem + // MemcpyStatic MOVOU 144(AX), X8 MOVOU X8, (SP) MOVOU 160(AX), X8 @@ -619,6 +628,7 @@ memcpy_2_sz1_end: MOVOU X8, 48(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 208(AX), X8 MOVOU X8, 64(SP) MOVOU 224(AX), X8 @@ -629,6 +639,7 @@ memcpy_2_sz1_end: MOVOU X8, 112(SP) 
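	// The regenerated lsh512_amd64.s hunks below contain no semantic change:
	// comment markers are renamed (load_blk_vec2mem -> store_blk, plus the
	// new MemcpyStatic tag on unrolled copies), and rotate_msg_gamma swaps
	// which scratch register carries which shift. Both forms compute the
	// same bits (mask abbreviates g_BytePermInfo_sse2<>+0(SB)):
	//
	//   old: MOVOA mask, X9         new: MOVOA X4, X9
	//        PAND  X4, X9                PAND  mask, X9   // X9 = X4 & mask
	//
	//   old: MOVOA X9, X8           new: MOVOA X9, X8
	//        PSRLQ $48, X8               PSLLQ $16, X8
	//        PSLLQ $16, X9               PSRLQ $48, X9
	//        PXOR  X8, X9                PXOR  X8, X9
	//
	// PAND and PXOR are commutative, so X9 ends up (t<<16) ^ (t>>48)
	// either way.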
// load_blk_mem2mem + // MemcpyStatic MOVOU 272(AX), X8 MOVOU X8, 128(SP) MOVOU 288(AX), X8 @@ -639,6 +650,7 @@ memcpy_2_sz1_end: MOVOU X8, 176(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 336(AX), X8 MOVOU X8, 192(SP) MOVOU 352(AX), X8 @@ -736,48 +748,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -903,48 +915,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -983,13 +995,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -1031,7 +1043,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -1072,7 +1084,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -1181,48 
+1193,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -1261,13 +1273,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -1309,7 +1321,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -1350,7 +1362,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -1459,48 +1471,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -1539,13 +1551,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU 
X6, 112(AX) @@ -1587,7 +1599,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -1628,7 +1640,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -1737,48 +1749,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -1817,13 +1829,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -1865,7 +1877,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -1906,7 +1918,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -2015,48 +2027,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA 
X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -2095,13 +2107,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -2143,7 +2155,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -2184,7 +2196,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -2293,48 +2305,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -2373,13 +2385,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -2421,7 +2433,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -2462,7 +2474,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -2571,48 +2583,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), 
X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -2651,13 +2663,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -2699,7 +2711,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -2740,7 +2752,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -2849,48 +2861,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -2929,13 +2941,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -2977,7 +2989,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -3018,7 +3030,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -3127,48 +3139,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND 
g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -3207,13 +3219,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -3255,7 +3267,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -3296,7 +3308,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -3405,48 +3417,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -3485,13 +3497,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -3533,7 +3545,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -3574,7 +3586,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -3683,48 +3695,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), 
X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -3763,13 +3775,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -3811,7 +3823,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -3852,7 +3864,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -3961,48 +3973,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -4041,13 +4053,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -4089,7 +4101,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 
160(SP) @@ -4130,7 +4142,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -4239,48 +4251,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -4319,13 +4331,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -4367,7 +4379,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -4408,7 +4420,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -4517,48 +4529,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -4597,13 +4609,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - 
// load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -4645,7 +4657,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -4686,7 +4698,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -4795,48 +4807,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -4875,13 +4887,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -4923,7 +4935,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -4964,7 +4976,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -5073,48 +5085,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + 
PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -5153,13 +5165,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -5201,7 +5213,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -5242,7 +5254,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -5351,48 +5363,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -5431,13 +5443,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -5479,7 +5491,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -5520,7 +5532,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -5629,48 +5641,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ 
$+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -5709,13 +5721,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -5757,7 +5769,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -5798,7 +5810,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -5907,48 +5919,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -5987,13 +5999,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -6035,7 +6047,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -6076,7 +6088,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -6185,48 +6197,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 
+ PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -6265,13 +6277,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -6313,7 +6325,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -6354,7 +6366,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -6463,48 +6475,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -6543,13 +6555,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -6591,7 +6603,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -6632,7 +6644,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -6741,48 +6753,48 
@@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -6821,13 +6833,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -6869,7 +6881,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -6910,7 +6922,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -7019,48 +7031,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -7099,13 +7111,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 
112(AX) @@ -7147,7 +7159,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -7188,7 +7200,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -7297,48 +7309,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -7377,13 +7389,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -7425,7 +7437,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -7466,7 +7478,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -7575,48 +7587,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, 
X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -7655,13 +7667,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -7703,7 +7715,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -7744,7 +7756,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -7853,48 +7865,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -7933,13 +7945,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -7981,7 +7993,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -8022,7 +8034,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -8131,48 +8143,48 @@ memcpy_2_sz1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 
MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -8211,13 +8223,13 @@ memcpy_2_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -8259,7 +8271,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -8300,7 +8312,7 @@ memcpy_2_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -8353,6 +8365,7 @@ lsh512_sse2_update_while_start: // compress // load_blk_mem2mem + // MemcpyStatic MOVOU (DX), X8 MOVOU X8, (SP) MOVOU 16(DX), X8 @@ -8363,6 +8376,7 @@ lsh512_sse2_update_while_start: MOVOU X8, 48(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 64(DX), X8 MOVOU X8, 64(SP) MOVOU 80(DX), X8 @@ -8373,6 +8387,7 @@ lsh512_sse2_update_while_start: MOVOU X8, 112(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 128(DX), X8 MOVOU X8, 128(SP) MOVOU 144(DX), X8 @@ -8383,6 +8398,7 @@ lsh512_sse2_update_while_start: MOVOU X8, 176(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 192(DX), X8 MOVOU X8, 192(SP) MOVOU 208(DX), X8 @@ -8480,48 +8496,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -8647,48 +8663,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + 
PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -8727,13 +8743,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -8775,7 +8791,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -8816,7 +8832,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -8925,48 +8941,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -9005,13 +9021,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -9053,7 +9069,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -9094,7 +9110,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -9203,48 +9219,48 @@ 
lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -9283,13 +9299,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -9331,7 +9347,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -9372,7 +9388,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -9481,48 +9497,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -9561,13 +9577,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // 
load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -9609,7 +9625,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -9650,7 +9666,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -9759,48 +9775,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -9839,13 +9855,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -9887,7 +9903,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -9928,7 +9944,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -10037,48 +10053,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ 
$+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -10117,13 +10133,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -10165,7 +10181,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -10206,7 +10222,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -10315,48 +10331,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -10395,13 +10411,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -10443,7 +10459,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -10484,7 +10500,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -10593,48 +10609,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ 
$+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -10673,13 +10689,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -10721,7 +10737,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -10762,7 +10778,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -10871,48 +10887,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -10951,13 +10967,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -10999,7 +11015,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -11040,7 +11056,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -11149,48 +11165,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND 
X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -11229,13 +11245,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -11277,7 +11293,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -11318,7 +11334,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -11427,48 +11443,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -11507,13 +11523,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -11555,7 
+11571,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -11596,7 +11612,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -11705,48 +11721,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -11785,13 +11801,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -11833,7 +11849,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -11874,7 +11890,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -11983,48 +11999,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA 
X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -12063,13 +12079,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -12111,7 +12127,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -12152,7 +12168,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -12261,48 +12277,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -12341,13 +12357,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -12389,7 +12405,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -12430,7 +12446,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -12539,48 +12555,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - 
PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -12619,13 +12635,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -12667,7 +12683,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -12708,7 +12724,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -12817,48 +12833,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -12897,13 +12913,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -12945,7 +12961,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -12986,7 +13002,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -13095,48 +13111,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND 
g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -13175,13 +13191,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -13223,7 +13239,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -13264,7 +13280,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -13373,48 +13389,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -13453,13 +13469,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -13501,7 +13517,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start 
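	// store_blk: the MOVOU lines below move vector registers out to memory
	// (X0..X3 into the stack frame), i.e. a store, not a load.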
- // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -13542,7 +13558,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -13651,48 +13667,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -13731,13 +13747,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -13779,7 +13795,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -13820,7 +13836,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -13929,48 +13945,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 
MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -14009,13 +14025,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -14057,7 +14073,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -14098,7 +14114,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -14207,48 +14223,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -14287,13 +14303,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -14335,7 +14351,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -14376,7 +14392,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -14485,48 +14501,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA 
g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -14565,13 +14581,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -14613,7 +14629,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -14654,7 +14670,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -14763,48 +14779,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -14843,13 +14859,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -14891,7 +14907,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -14932,7 +14948,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -15041,48 +15057,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + 
PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -15121,13 +15137,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -15169,7 +15185,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -15210,7 +15226,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -15319,48 +15335,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -15399,13 +15415,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -15447,7 +15463,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ 
-15488,7 +15504,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -15597,48 +15613,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -15677,13 +15693,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -15725,7 +15741,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -15766,7 +15782,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -15875,48 +15891,48 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR 
X8, X9 PXOR X9, X7 @@ -15955,13 +15971,13 @@ lsh512_sse2_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -16003,7 +16019,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -16044,7 +16060,7 @@ lsh512_sse2_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -16263,6 +16279,7 @@ memset_1_1_end: // compress // load_blk_mem2mem + // MemcpyStatic MOVOU 144(AX), X8 MOVOU X8, (SP) MOVOU 160(AX), X8 @@ -16273,6 +16290,7 @@ memset_1_1_end: MOVOU X8, 48(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 208(AX), X8 MOVOU X8, 64(SP) MOVOU 224(AX), X8 @@ -16283,6 +16301,7 @@ memset_1_1_end: MOVOU X8, 112(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 272(AX), X8 MOVOU X8, 128(SP) MOVOU 288(AX), X8 @@ -16293,6 +16312,7 @@ memset_1_1_end: MOVOU X8, 176(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 336(AX), X8 MOVOU X8, 192(SP) MOVOU 352(AX), X8 @@ -16390,48 +16410,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -16557,48 +16577,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ 
$+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -16637,13 +16657,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -16685,7 +16705,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -16726,7 +16746,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -16835,48 +16855,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -16915,13 +16935,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -16963,7 +16983,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -17004,7 +17024,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -17113,48 +17133,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, 
X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -17193,13 +17213,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -17241,7 +17261,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -17282,7 +17302,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -17391,48 +17411,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -17471,13 +17491,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -17519,7 +17539,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -17560,7 +17580,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -17669,48 +17689,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - 
PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -17749,13 +17769,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -17797,7 +17817,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -17838,7 +17858,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -17947,48 +17967,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -18027,13 +18047,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -18075,7 +18095,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -18116,7 +18136,7 @@ memset_1_1_end: PADDQ X7, X3 
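	// i_state_save: store_blk snapshots the X0..X3 state words to the stack
	// scratch area at 192(SP) onward.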
// i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -18225,48 +18245,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -18305,13 +18325,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -18353,7 +18373,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -18394,7 +18414,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -18503,48 +18523,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -18583,13 +18603,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 
32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -18631,7 +18651,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -18672,7 +18692,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -18781,48 +18801,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -18861,13 +18881,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -18909,7 +18929,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -18950,7 +18970,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -19059,48 +19079,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA 
g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -19139,13 +19159,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -19187,7 +19207,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -19228,7 +19248,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -19337,48 +19357,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -19417,13 +19437,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -19465,7 +19485,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -19506,7 +19526,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -19615,48 +19635,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 
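	// X6 & mask0: copying X6 into X9 and using the mask as PAND's memory operand
	// computes the same value as loading the mask into X9 first; the reg-reg MOVOA
	// is typically cheaper than the MOVOA load it replaces.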
- MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -19695,13 +19715,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -19743,7 +19763,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -19784,7 +19804,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -19893,48 +19913,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -19973,13 +19993,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -20021,7 +20041,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -20062,7 +20082,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -20171,48 +20191,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 
PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -20251,13 +20271,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -20299,7 +20319,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -20340,7 +20360,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -20449,48 +20469,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -20529,13 +20549,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -20577,7 +20597,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -20618,7 +20638,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -20727,48 +20747,48 @@ memset_1_1_end: PADDQ X7, X3 
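	// rotate_msg_gamma: each of X4..X7 is rotated inside its 64-bit lanes.
	// X5/X6/X7 first get a lane rotate-left by 32/8/40 bits via paired
	// PSLLQ/PSRLQ, X4 goes straight to the masked step; every register then
	// computes, per lane,
	//   dst = (x & mask1) ^ rotl64(x & mask0, 16)
	// with mask0 = g_BytePermInfo_sse2<>+0(SB), mask1 = g_BytePermInfo_sse2<>+16(SB),
	// where rotl64(m, 16) is assembled as (m << 16) ^ (m >> 48) in X8/X9.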
// rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -20807,13 +20827,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -20855,7 +20875,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -20896,7 +20916,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -21005,48 +21025,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -21085,13 +21105,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -21133,7 +21153,7 @@ 
memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -21174,7 +21194,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -21283,48 +21303,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -21363,13 +21383,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -21411,7 +21431,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -21452,7 +21472,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -21561,48 +21581,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ 
$+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -21641,13 +21661,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -21689,7 +21709,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -21730,7 +21750,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -21839,48 +21859,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -21919,13 +21939,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -21967,7 +21987,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -22008,7 +22028,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -22117,48 +22137,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 
- PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -22197,13 +22217,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -22245,7 +22265,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -22286,7 +22306,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -22395,48 +22415,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -22475,13 +22495,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -22523,7 +22543,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -22564,7 +22584,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -22673,48 +22693,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ 
$+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -22753,13 +22773,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -22801,7 +22821,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -22842,7 +22862,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -22951,48 +22971,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -23031,13 +23051,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -23079,7 +23099,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -23120,7 +23140,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -23229,48 +23249,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 
- PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -23309,13 +23329,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -23357,7 +23377,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -23398,7 +23418,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -23507,48 +23527,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -23587,13 +23607,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -23635,7 +23655,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -23676,7 +23696,7 @@ memset_1_1_end: PADDQ X7, 
X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -23785,48 +23805,48 @@ memset_1_1_end: PADDQ X7, X3 // rotate_msg_gamma - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X4, X9 + MOVOA X4, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X4 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X4 MOVOA X5, X9 - PSRLQ $+32, X9 - PSLLQ $+32, X5 + PSLLQ $+32, X9 + PSRLQ $+32, X5 PXOR X9, X5 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X5, X9 + MOVOA X5, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X5 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X5 MOVOA X6, X9 - PSRLQ $+56, X9 - PSLLQ $+8, X6 + PSLLQ $+8, X9 + PSRLQ $+56, X6 PXOR X9, X6 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X6, X9 + MOVOA X6, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X6 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X6 MOVOA X7, X9 - PSRLQ $+24, X9 - PSLLQ $+40, X7 + PSLLQ $+40, X9 + PSRLQ $+24, X7 PXOR X9, X7 - MOVOA g_BytePermInfo_sse2<>+0(SB), X9 - PAND X7, X9 + MOVOA X7, X9 + PAND g_BytePermInfo_sse2<>+0(SB), X9 PAND g_BytePermInfo_sse2<>+16(SB), X7 MOVOA X9, X8 - PSRLQ $+48, X8 - PSLLQ $+16, X9 + PSLLQ $+16, X8 + PSRLQ $+48, X9 PXOR X8, X9 PXOR X9, X7 @@ -23865,13 +23885,13 @@ memset_1_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -23913,7 +23933,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -23954,7 +23974,7 @@ memset_1_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -24177,6 +24197,7 @@ memcpy_5_sz1_start: memcpy_5_sz1_end: // compress // load_blk_mem2mem + // MemcpyStatic MOVOU 144(AX), X8 MOVOU X8, (SP) MOVOU 160(AX), X8 @@ -24187,6 +24208,7 @@ memcpy_5_sz1_end: MOVOU X8, 48(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 208(AX), X8 MOVOU X8, 64(SP) MOVOU 224(AX), X8 @@ -24197,6 +24219,7 @@ memcpy_5_sz1_end: MOVOU X8, 112(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 272(AX), X8 MOVOU X8, 128(SP) MOVOU 288(AX), X8 @@ -24207,6 +24230,7 @@ memcpy_5_sz1_end: MOVOU X8, 176(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 336(AX), X8 MOVOU X8, 192(SP) MOVOU 352(AX), X8 @@ -24471,13 +24495,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -24519,7 +24543,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -24560,7 +24584,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -24709,13 +24733,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ 
-24757,7 +24781,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -24798,7 +24822,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -24947,13 +24971,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -24995,7 +25019,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -25036,7 +25060,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -25185,13 +25209,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -25233,7 +25257,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -25274,7 +25298,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -25423,13 +25447,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -25471,7 +25495,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -25512,7 +25536,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -25661,13 +25685,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -25709,7 +25733,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -25750,7 +25774,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -25899,13 +25923,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -25947,7 +25971,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -25988,7 +26012,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -26137,13 +26161,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // 
store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -26185,7 +26209,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -26226,7 +26250,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -26375,13 +26399,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -26423,7 +26447,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -26464,7 +26488,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -26613,13 +26637,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -26661,7 +26685,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -26702,7 +26726,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -26851,13 +26875,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -26899,7 +26923,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -26940,7 +26964,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -27089,13 +27113,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -27137,7 +27161,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -27178,7 +27202,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -27327,13 +27351,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -27375,7 +27399,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -27416,7 +27440,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -27565,13 +27589,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 
32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -27613,7 +27637,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -27654,7 +27678,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -27803,13 +27827,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -27851,7 +27875,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -27892,7 +27916,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -28041,13 +28065,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -28089,7 +28113,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -28130,7 +28154,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -28279,13 +28303,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -28327,7 +28351,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -28368,7 +28392,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -28517,13 +28541,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -28565,7 +28589,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -28606,7 +28630,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -28755,13 +28779,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -28803,7 +28827,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -28844,7 +28868,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -28993,13 +29017,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start 
- // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -29041,7 +29065,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -29082,7 +29106,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -29231,13 +29255,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -29279,7 +29303,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -29320,7 +29344,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -29469,13 +29493,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -29517,7 +29541,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -29558,7 +29582,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -29707,13 +29731,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -29755,7 +29779,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -29796,7 +29820,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -29945,13 +29969,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -29993,7 +30017,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -30034,7 +30058,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -30183,13 +30207,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -30231,7 +30255,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -30272,7 +30296,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ 
-30421,13 +30445,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -30469,7 +30493,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -30510,7 +30534,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -30659,13 +30683,13 @@ memcpy_5_sz1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -30707,7 +30731,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -30748,7 +30772,7 @@ memcpy_5_sz1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -30801,6 +30825,7 @@ lsh512_ssse3_update_while_start: // compress // load_blk_mem2mem + // MemcpyStatic MOVOU (DX), X8 MOVOU X8, (SP) MOVOU 16(DX), X8 @@ -30811,6 +30836,7 @@ lsh512_ssse3_update_while_start: MOVOU X8, 48(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 64(DX), X8 MOVOU X8, 64(SP) MOVOU 80(DX), X8 @@ -30821,6 +30847,7 @@ lsh512_ssse3_update_while_start: MOVOU X8, 112(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 128(DX), X8 MOVOU X8, 128(SP) MOVOU 144(DX), X8 @@ -30831,6 +30858,7 @@ lsh512_ssse3_update_while_start: MOVOU X8, 176(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 192(DX), X8 MOVOU X8, 192(SP) MOVOU 208(DX), X8 @@ -31095,13 +31123,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -31143,7 +31171,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -31184,7 +31212,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -31333,13 +31361,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -31381,7 +31409,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -31422,7 +31450,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -31571,13 +31599,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -31619,7 +31647,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // 
store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -31660,7 +31688,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -31809,13 +31837,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -31857,7 +31885,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -31898,7 +31926,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -32047,13 +32075,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -32095,7 +32123,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -32136,7 +32164,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -32285,13 +32313,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -32333,7 +32361,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -32374,7 +32402,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -32523,13 +32551,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -32571,7 +32599,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -32612,7 +32640,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -32761,13 +32789,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -32809,7 +32837,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -32850,7 +32878,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -32999,13 +33027,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, 
X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -33047,7 +33075,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -33088,7 +33116,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -33237,13 +33265,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -33285,7 +33313,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -33326,7 +33354,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -33475,13 +33503,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -33523,7 +33551,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -33564,7 +33592,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -33713,13 +33741,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -33761,7 +33789,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -33802,7 +33830,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -33951,13 +33979,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -33999,7 +34027,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -34040,7 +34068,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -34189,13 +34217,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -34237,7 +34265,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // 
load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -34278,7 +34306,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -34427,13 +34455,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -34475,7 +34503,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -34516,7 +34544,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -34665,13 +34693,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -34713,7 +34741,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -34754,7 +34782,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -34903,13 +34931,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -34951,7 +34979,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -34992,7 +35020,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -35141,13 +35169,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -35189,7 +35217,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -35230,7 +35258,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -35379,13 +35407,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -35427,7 +35455,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -35468,7 +35496,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -35617,13 +35645,13 @@ 
lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -35665,7 +35693,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -35706,7 +35734,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -35855,13 +35883,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -35903,7 +35931,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -35944,7 +35972,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -36093,13 +36121,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -36141,7 +36169,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -36182,7 +36210,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -36331,13 +36359,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -36379,7 +36407,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -36420,7 +36448,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -36569,13 +36597,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -36617,7 +36645,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -36658,7 +36686,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -36807,13 +36835,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -36855,7 +36883,7 @@ 
lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -36896,7 +36924,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -37045,13 +37073,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -37093,7 +37121,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -37134,7 +37162,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -37283,13 +37311,13 @@ lsh512_ssse3_update_while_start: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -37331,7 +37359,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -37372,7 +37400,7 @@ lsh512_ssse3_update_while_start: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -37591,6 +37619,7 @@ memset_2_1_end: // compress // load_blk_mem2mem + // MemcpyStatic MOVOU 144(AX), X8 MOVOU X8, (SP) MOVOU 160(AX), X8 @@ -37601,6 +37630,7 @@ memset_2_1_end: MOVOU X8, 48(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 208(AX), X8 MOVOU X8, 64(SP) MOVOU 224(AX), X8 @@ -37611,6 +37641,7 @@ memset_2_1_end: MOVOU X8, 112(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 272(AX), X8 MOVOU X8, 128(SP) MOVOU 288(AX), X8 @@ -37621,6 +37652,7 @@ memset_2_1_end: MOVOU X8, 176(SP) // load_blk_mem2mem + // MemcpyStatic MOVOU 336(AX), X8 MOVOU X8, 192(SP) MOVOU 352(AX), X8 @@ -37885,13 +37917,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -37933,7 +37965,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -37974,7 +38006,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -38123,13 +38155,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -38171,7 +38203,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -38212,7 +38244,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -38361,13 +38393,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 
32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -38409,7 +38441,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -38450,7 +38482,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -38599,13 +38631,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -38647,7 +38679,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -38688,7 +38720,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -38837,13 +38869,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -38885,7 +38917,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -38926,7 +38958,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -39075,13 +39107,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -39123,7 +39155,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -39164,7 +39196,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -39313,13 +39345,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -39361,7 +39393,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -39402,7 +39434,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -39551,13 +39583,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -39599,7 +39631,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -39640,7 +39672,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -39789,13 +39821,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk 
MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -39837,7 +39869,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -39878,7 +39910,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -40027,13 +40059,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -40075,7 +40107,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -40116,7 +40148,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -40265,13 +40297,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -40313,7 +40345,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -40354,7 +40386,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -40503,13 +40535,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -40551,7 +40583,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -40592,7 +40624,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -40741,13 +40773,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -40789,7 +40821,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -40830,7 +40862,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -40979,13 +41011,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -41027,7 +41059,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -41068,7 +41100,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -41217,13 +41249,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // 
load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -41265,7 +41297,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -41306,7 +41338,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -41455,13 +41487,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -41503,7 +41535,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -41544,7 +41576,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -41693,13 +41725,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -41741,7 +41773,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -41782,7 +41814,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -41931,13 +41963,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -41979,7 +42011,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -42020,7 +42052,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -42169,13 +42201,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -42217,7 +42249,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -42258,7 +42290,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -42407,13 +42439,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -42455,7 +42487,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -42496,7 +42528,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -42645,13 +42677,13 @@ memset_2_1_end: MOVOA 
X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -42693,7 +42725,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -42734,7 +42766,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -42883,13 +42915,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -42931,7 +42963,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -42972,7 +43004,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -43121,13 +43153,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -43169,7 +43201,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -43210,7 +43242,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -43359,13 +43391,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -43407,7 +43439,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -43448,7 +43480,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -43597,13 +43629,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -43645,7 +43677,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -43686,7 +43718,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -43835,13 +43867,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -43883,7 +43915,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 128(SP) MOVOU X1, 144(SP) MOVOU X2, 160(SP) @@ -43924,7 +43956,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 192(SP) MOVOU X1, 208(SP) MOVOU X2, 224(SP) @@ -44073,13 
+44105,13 @@ memset_2_1_end: MOVOA X9, X5 // save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 16(AX) MOVOU X1, 32(AX) MOVOU X2, 48(AX) MOVOU X3, 64(AX) - // load_blk_vec2mem + // store_blk MOVOU X4, 80(AX) MOVOU X5, 96(AX) MOVOU X6, 112(AX) @@ -44121,7 +44153,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, (SP) MOVOU X1, 16(SP) MOVOU X2, 32(SP) @@ -44162,7 +44194,7 @@ memset_2_1_end: PADDQ X7, X3 // i_state_save___start - // load_blk_vec2mem + // store_blk MOVOU X0, 64(SP) MOVOU X1, 80(SP) MOVOU X2, 96(SP) @@ -44233,12 +44265,14 @@ TEXT ·lsh512InitAVX2(SB), NOSPLIT, $0-8 // init512 // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV512<>+0(SB), Y0 VMOVDQU Y0, 16(AX) VMOVDQA g_IV512<>+32(SB), Y0 VMOVDQU Y0, 48(AX) // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV512<>+64(SB), Y0 VMOVDQU Y0, 80(AX) VMOVDQA g_IV512<>+96(SB), Y0 @@ -44251,12 +44285,14 @@ lsh512_avx2_init_if0_end: // init384 // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV384<>+0(SB), Y0 VMOVDQU Y0, 16(AX) VMOVDQA g_IV384<>+32(SB), Y0 VMOVDQU Y0, 48(AX) // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV384<>+64(SB), Y0 VMOVDQU Y0, 80(AX) VMOVDQA g_IV384<>+96(SB), Y0 @@ -44269,12 +44305,14 @@ lsh512_avx2_init_if1_end: // init256 // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV256<>+0(SB), Y0 VMOVDQU Y0, 16(AX) VMOVDQA g_IV256<>+32(SB), Y0 VMOVDQU Y0, 48(AX) // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV256<>+64(SB), Y0 VMOVDQU Y0, 80(AX) VMOVDQA g_IV256<>+96(SB), Y0 @@ -44284,12 +44322,14 @@ lsh512_avx2_init_if1_end: lsh512_avx2_init_if2_end: // init224 // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV224<>+0(SB), Y0 VMOVDQU Y0, 16(AX) VMOVDQA g_IV224<>+32(SB), Y0 VMOVDQU Y0, 48(AX) // load_blk_mem2mem + // MemcpyStatic VMOVDQA g_IV224<>+64(SB), Y0 VMOVDQU Y0, 80(AX) VMOVDQA g_IV224<>+96(SB), Y0 @@ -44489,10 +44529,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44503,10 +44543,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44547,10 +44587,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44561,10 +44601,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44586,13 +44626,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -44613,10 +44653,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44627,10 +44667,10 @@ 
memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44652,13 +44692,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -44679,10 +44719,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44693,10 +44733,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44718,13 +44758,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -44745,10 +44785,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44759,10 +44799,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44784,13 +44824,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -44811,10 +44851,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44825,10 +44865,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44850,13 +44890,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -44877,10 +44917,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44891,10 +44931,10 @@ memcpy_8_sz1_end: 
// rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44916,13 +44956,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -44943,10 +44983,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -44957,10 +44997,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -44982,13 +45022,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45009,10 +45049,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45023,10 +45063,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45048,13 +45088,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45075,10 +45115,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45089,10 +45129,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45114,13 +45154,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45141,10 +45181,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45155,10 +45195,10 @@ memcpy_8_sz1_end: // 
rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45180,13 +45220,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45207,10 +45247,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45221,10 +45261,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45246,13 +45286,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45273,10 +45313,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45287,10 +45327,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45312,13 +45352,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45339,10 +45379,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45353,10 +45393,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45378,13 +45418,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45405,10 +45445,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45419,10 +45459,10 @@ memcpy_8_sz1_end: // 
rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45444,13 +45484,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45471,10 +45511,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45485,10 +45525,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45510,13 +45550,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45537,10 +45577,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45551,10 +45591,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45576,13 +45616,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45603,10 +45643,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45617,10 +45657,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45642,13 +45682,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45669,10 +45709,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45683,10 +45723,10 @@ memcpy_8_sz1_end: // 
rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45708,13 +45748,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45735,10 +45775,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45749,10 +45789,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45774,13 +45814,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45801,10 +45841,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45815,10 +45855,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45840,13 +45880,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45867,10 +45907,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45881,10 +45921,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45906,13 +45946,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -45933,10 +45973,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -45947,10 +45987,10 @@ memcpy_8_sz1_end: // 
rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -45972,13 +46012,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -45999,10 +46039,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46013,10 +46053,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46038,13 +46078,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46065,10 +46105,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46079,10 +46119,10 @@ memcpy_8_sz1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46104,13 +46144,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -46131,10 +46171,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46145,10 +46185,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46170,13 +46210,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46197,10 +46237,10 @@ memcpy_8_sz1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46211,10 +46251,10 @@ memcpy_8_sz1_end: // 
rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46236,13 +46276,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -46263,10 +46303,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46277,10 +46317,10 @@ memcpy_8_sz1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46302,13 +46342,13 @@ memcpy_8_sz1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46361,10 +46401,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46375,10 +46415,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46419,10 +46459,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46433,10 +46473,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46458,13 +46498,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46485,10 +46525,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46499,10 +46539,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46524,13 +46564,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 
VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -46551,10 +46591,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46565,10 +46605,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46590,13 +46630,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46617,10 +46657,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46631,10 +46671,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46656,13 +46696,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -46683,10 +46723,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46697,10 +46737,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46722,13 +46762,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46749,10 +46789,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46763,10 +46803,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - 
VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46788,13 +46828,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -46815,10 +46855,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46829,10 +46869,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46854,13 +46894,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -46881,10 +46921,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46895,10 +46935,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46920,13 +46960,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -46947,10 +46987,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -46961,10 +47001,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -46986,13 +47026,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47013,10 +47053,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47027,10 +47067,10 @@ 
lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47052,13 +47092,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47079,10 +47119,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47093,10 +47133,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47118,13 +47158,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47145,10 +47185,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47159,10 +47199,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47184,13 +47224,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47211,10 +47251,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47225,10 +47265,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47250,13 +47290,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47277,10 +47317,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, 
Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47291,10 +47331,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47316,13 +47356,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47343,10 +47383,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47357,10 +47397,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47382,13 +47422,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47409,10 +47449,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47423,10 +47463,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47448,13 +47488,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47475,10 +47515,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47489,10 +47529,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47514,13 +47554,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ 
-47541,10 +47581,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47555,10 +47595,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47580,13 +47620,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47607,10 +47647,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47621,10 +47661,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47646,13 +47686,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47673,10 +47713,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47687,10 +47727,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47712,13 +47752,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47739,10 +47779,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47753,10 +47793,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47778,13 +47818,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ 
$0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47805,10 +47845,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47819,10 +47859,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47844,13 +47884,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -47871,10 +47911,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47885,10 +47925,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47910,13 +47950,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -47937,10 +47977,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -47951,10 +47991,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -47976,13 +48016,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -48003,10 +48043,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48017,10 +48057,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48042,13 +48082,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, 
Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -48069,10 +48109,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48083,10 +48123,10 @@ lsh512_avx2_update_while_start: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48108,13 +48148,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -48135,10 +48175,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48149,10 +48189,10 @@ lsh512_avx2_update_while_start: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48174,13 +48214,13 @@ lsh512_avx2_update_while_start: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -48391,10 +48431,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48405,10 +48445,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48449,10 +48489,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48463,10 +48503,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48488,13 +48528,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -48515,10 
+48555,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48529,10 +48569,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48554,13 +48594,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -48581,10 +48621,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48595,10 +48635,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48620,13 +48660,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -48647,10 +48687,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48661,10 +48701,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48686,13 +48726,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -48713,10 +48753,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48727,10 +48767,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48752,13 +48792,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -48779,10 +48819,10 @@ memset_3_1_end: // 
rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48793,10 +48833,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48818,13 +48858,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -48845,10 +48885,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48859,10 +48899,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48884,13 +48924,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -48911,10 +48951,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48925,10 +48965,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -48950,13 +48990,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -48977,10 +49017,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -48991,10 +49031,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49016,13 +49056,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49043,10 +49083,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, 
Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49057,10 +49097,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49082,13 +49122,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49109,10 +49149,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49123,10 +49163,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49148,13 +49188,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49175,10 +49215,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49189,10 +49229,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49214,13 +49254,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49241,10 +49281,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49255,10 +49295,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49280,13 +49320,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49307,10 +49347,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, 
Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49321,10 +49361,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49346,13 +49386,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49373,10 +49413,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49387,10 +49427,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49412,13 +49452,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49439,10 +49479,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49453,10 +49493,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49478,13 +49518,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49505,10 +49545,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49519,10 +49559,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49544,13 +49584,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49571,10 +49611,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, 
Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49585,10 +49625,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49610,13 +49650,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49637,10 +49677,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49651,10 +49691,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49676,13 +49716,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49703,10 +49743,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49717,10 +49757,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49742,13 +49782,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49769,10 +49809,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49783,10 +49823,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49808,13 +49848,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49835,10 +49875,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, 
Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49849,10 +49889,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49874,13 +49914,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -49901,10 +49941,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49915,10 +49955,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -49940,13 +49980,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -49967,10 +50007,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -49981,10 +50021,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -50006,13 +50046,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -50033,10 +50073,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -50047,10 +50087,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -50072,13 +50112,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0 @@ -50099,10 +50139,10 @@ memset_3_1_end: // rotate_blk_even_alpha VPSLLQ $0x17, Y0, Y14 VPSRLQ $0x29, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x17, Y1, Y14 VPSRLQ $0x29, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 
VPXOR Y5, Y1, Y1 @@ -50113,10 +50153,10 @@ memset_3_1_end: // rotate_blk_even_beta VPSLLQ $0x3b, Y2, Y4 VPSRLQ $0x05, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x3b, Y3, Y4 VPSRLQ $0x05, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -50138,13 +50178,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y10, Y10 - VPADDQ Y6, Y10, Y10 + VPADDQ Y10, Y6, Y10 VPERMQ $0x93, Y11, Y11 - VPADDQ Y7, Y11, Y11 + VPADDQ Y11, Y7, Y11 VPERMQ $0x4b, Y12, Y12 - VPADDQ Y8, Y12, Y12 + VPADDQ Y12, Y8, Y12 VPERMQ $0x93, Y13, Y13 - VPADDQ Y9, Y13, Y13 + VPADDQ Y13, Y9, Y13 // msg_add_odd VPXOR Y10, Y0, Y0 @@ -50165,10 +50205,10 @@ memset_3_1_end: // rotate_blk_odd_alpha VPSLLQ $0x07, Y0, Y14 VPSRLQ $0x39, Y0, Y0 - VPOR Y14, Y0, Y0 + VPOR Y0, Y14, Y0 VPSLLQ $0x07, Y1, Y14 VPSRLQ $0x39, Y1, Y1 - VPOR Y14, Y1, Y1 + VPOR Y1, Y14, Y1 VPXOR Y4, Y0, Y0 VPXOR Y5, Y1, Y1 @@ -50179,10 +50219,10 @@ memset_3_1_end: // rotate_blk_odd_beta VPSLLQ $0x03, Y2, Y4 VPSRLQ $0x3d, Y2, Y2 - VPOR Y4, Y2, Y2 + VPOR Y2, Y4, Y2 VPSLLQ $0x03, Y3, Y4 VPSRLQ $0x3d, Y3, Y3 - VPOR Y4, Y3, Y3 + VPOR Y3, Y4, Y3 // add_blk VPADDQ Y2, Y0, Y0 @@ -50204,13 +50244,13 @@ memset_3_1_end: VMOVDQA Y4, Y2 VMOVDQA Y5, Y3 VPERMQ $0x4b, Y6, Y6 - VPADDQ Y10, Y6, Y6 + VPADDQ Y6, Y10, Y6 VPERMQ $0x93, Y7, Y7 - VPADDQ Y11, Y7, Y7 + VPADDQ Y7, Y11, Y7 VPERMQ $0x4b, Y8, Y8 - VPADDQ Y12, Y8, Y8 + VPADDQ Y8, Y12, Y8 VPERMQ $0x93, Y9, Y9 - VPADDQ Y13, Y9, Y9 + VPADDQ Y9, Y13, Y9 // msg_add_even VPXOR Y6, Y0, Y0
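Every hunk in this stretch of lsh512_amd64.s makes the same semantics-preserving change: the two source operands of a commutative instruction trade places (VPOR in the rotate_blk_* steps, VPADDQ in the VPERMQ message-schedule steps), so the value written to the destination register is unchanged. The rotate_blk_* triples are the usual shift-or rotation: shift left by k (VPSLLQ), shift right by 64-k (VPSRLQ), then OR the two halves. The sketch below is a plain-Go illustration of that identity using the rotation amounts visible in the hunks above (0x17/0x3b in the even rounds, 0x07/0x03 in the odd ones); it is not part of the patch and assumes nothing beyond the standard library.

package main

import (
	"fmt"
	"math/bits"
)

// rotl64 mirrors the VPSLLQ/VPSRLQ/VPOR sequence in each rotate_blk_* hunk:
// (x << k) | (x >> (64-k)) is a 64-bit rotate left by k.
func rotl64(x uint64, k uint) uint64 {
	return (x << k) | (x >> (64 - k))
}

func main() {
	x := uint64(0x0123456789abcdef)
	// 0x17/0x3b are the even-round alpha/beta rotations, 0x07/0x03 the
	// odd-round ones; the matching VPSRLQ counts in the diff are 64 minus
	// these values (0x29, 0x05, 0x39, 0x3d).
	for _, k := range []uint{0x17, 0x3b, 0x07, 0x03} {
		hi := x << k        // VPSLLQ $k, x, tmp
		lo := x >> (64 - k) // VPSRLQ $(64-k), x, x
		// OR is commutative, so "VPOR tmp, x, x" and "VPOR x, tmp, x"
		// write the same result -- the only change these hunks make.
		fmt.Println(hi|lo == lo|hi, rotl64(x, k) == bits.RotateLeft64(x, int(k)))
	}
}

The same argument covers the VPADDQ swaps: lane-wise 64-bit addition is commutative, so regenerating the assembly with the new argument order rewrites only the operand text and leaves the LSH-512 state transitions intact.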