diff --git a/lea/arm64_goat/src/READMD.md b/lea/READMD.md similarity index 100% rename from lea/arm64_goat/src/READMD.md rename to lea/READMD.md diff --git a/lea/amd64_avo/README.md b/lea/amd64_avo/README.md deleted file mode 100644 index dd52312..0000000 --- a/lea/amd64_avo/README.md +++ /dev/null @@ -1 +0,0 @@ -# [Based on the source code provided by KISA.](https://seed.kisa.or.kr/kisa/Board/20/detailView.do) diff --git a/lea/amd64_avo/lea.go b/lea/amd64_avo/lea.go deleted file mode 100644 index 47d35ab..0000000 --- a/lea/amd64_avo/lea.go +++ /dev/null @@ -1,19 +0,0 @@ -package main - -import ( - . "github.com/mmcloughlin/avo/build" -) - -func main() { - Package("kryptosimd/lea/amd64-avo") - ConstraintExpr("amd64,gc,!purego") - - leaEnc4SSE2() - leaDec4SSE2() - - leaEnc8AVX2() - leaDec8AVX2() - - Generate() - print("done") -} diff --git a/lea/amd64_avo/lea_avx2.go b/lea/amd64_avo/lea_avx2.go deleted file mode 100644 index 5cd4097..0000000 --- a/lea/amd64_avo/lea_avx2.go +++ /dev/null @@ -1,556 +0,0 @@ -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" -) - -func leaEnc8AVX2() { - TEXT("leaEnc8AVX2", NOSPLIT, "func(ctx *leaContext, dst []byte, src []byte)") - - ctx := GetCtx() - dst := Mem{Base: Load(Param("dst").Base(), GP64())} - src := Mem{Base: Load(Param("src").Base(), GP64())} - - /** - __m256i x0, x1, x2, x3, tmp; - __m128i tmp128; - */ - x0 := YMM() - x1 := YMM() - x2 := YMM() - x3 := YMM() - - /** - x0 = _mm256_setr_epi32( - *((unsigned int *)pt + 0x00), *((unsigned int *)pt + 0x04), - *((unsigned int *)pt + 0x08), *((unsigned int *)pt + 0x0c), - *((unsigned int *)pt + 0x10), *((unsigned int *)pt + 0x14), - *((unsigned int *)pt + 0x18), *((unsigned int *)pt + 0x1c) - ); - x1 = _mm256_setr_epi32( - *((unsigned int *)pt + 0x01), *((unsigned int *)pt + 0x05), - *((unsigned int *)pt + 0x09), *((unsigned int *)pt + 0x0d), - *((unsigned int *)pt + 0x11), *((unsigned int *)pt + 0x15), - *((unsigned int *)pt + 0x19), *((unsigned int *)pt + 0x1d) - ); - x2 = _mm256_setr_epi32( - *((unsigned int *)pt + 0x02), *((unsigned int *)pt + 0x06), - *((unsigned int *)pt + 0x0a), *((unsigned int *)pt + 0x0e), - *((unsigned int *)pt + 0x12), *((unsigned int *)pt + 0x16), - *((unsigned int *)pt + 0x1a), *((unsigned int *)pt + 0x1e) - ); - x3 = _mm256_setr_epi32( - *((unsigned int *)pt + 0x03), *((unsigned int *)pt + 0x07), - *((unsigned int *)pt + 0x0b), *((unsigned int *)pt + 0x0f), - *((unsigned int *)pt + 0x13), *((unsigned int *)pt + 0x17), - *((unsigned int *)pt + 0x1b), *((unsigned int *)pt + 0x1f) - ); - */ - leaAVX2Int2Ymm(x0, src, 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c) - leaAVX2Int2Ymm(x1, src, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d) - leaAVX2Int2Ymm(x2, src, 0x02, 0x06, 0x0a, 0x0e, 0x12, 0x16, 0x1a, 0x1e) - leaAVX2Int2Ymm(x3, src, 0x03, 0x07, 0x0b, 0x0f, 0x13, 0x17, 0x1b, 0x1f) - - XAR3_AVX2(ctx.Rk, x3, x2, 4, 5) - XAR5_AVX2(ctx.Rk, x2, x1, 2, 3) - XAR9_AVX2(ctx.Rk, x1, x0, 0, 1) - XAR3_AVX2(ctx.Rk, x0, x3, 10, 11) - XAR5_AVX2(ctx.Rk, x3, x2, 8, 9) - XAR9_AVX2(ctx.Rk, x2, x1, 6, 7) - XAR3_AVX2(ctx.Rk, x1, x0, 16, 17) - XAR5_AVX2(ctx.Rk, x0, x3, 14, 15) - XAR9_AVX2(ctx.Rk, x3, x2, 12, 13) - XAR3_AVX2(ctx.Rk, x2, x1, 22, 23) - XAR5_AVX2(ctx.Rk, x1, x0, 20, 21) - XAR9_AVX2(ctx.Rk, x0, x3, 18, 19) - - XAR3_AVX2(ctx.Rk, x3, x2, 28, 29) - XAR5_AVX2(ctx.Rk, x2, x1, 26, 27) - XAR9_AVX2(ctx.Rk, x1, x0, 24, 25) - XAR3_AVX2(ctx.Rk, x0, x3, 34, 35) - XAR5_AVX2(ctx.Rk, x3, x2, 32, 33) - 
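Each XAR3/XAR5/XAR9 triple above computes one full LEA round; the register roles rotate from round to round, so the fully unrolled stream needs no explicit state permutation. As a hedged scalar reference (`leaRound` and its signature are illustrative, not part of this diff; assumes `math/bits`), one round over a four-word state looks like this:

```go
import "math/bits"

// One LEA encryption round over state x with six round-key words rk[0:6]:
// XAR9 produces the ROL9 word, XAR5 the ROR5 word, XAR3 the ROR3 word,
// and the old x[0] becomes the new x[3].
func leaRound(x *[4]uint32, rk []uint32) {
	t0 := bits.RotateLeft32((x[0]^rk[0])+(x[1]^rk[1]), 9)
	t1 := bits.RotateLeft32((x[1]^rk[2])+(x[2]^rk[3]), -5)
	t2 := bits.RotateLeft32((x[2]^rk[4])+(x[3]^rk[5]), -3)
	x[0], x[1], x[2], x[3] = t0, t1, t2, x[0]
}
```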
XAR9_AVX2(ctx.Rk, x2, x1, 30, 31) - XAR3_AVX2(ctx.Rk, x1, x0, 40, 41) - XAR5_AVX2(ctx.Rk, x0, x3, 38, 39) - XAR9_AVX2(ctx.Rk, x3, x2, 36, 37) - XAR3_AVX2(ctx.Rk, x2, x1, 46, 47) - XAR5_AVX2(ctx.Rk, x1, x0, 44, 45) - XAR9_AVX2(ctx.Rk, x0, x3, 42, 43) - - XAR3_AVX2(ctx.Rk, x3, x2, 52, 53) - XAR5_AVX2(ctx.Rk, x2, x1, 50, 51) - XAR9_AVX2(ctx.Rk, x1, x0, 48, 49) - XAR3_AVX2(ctx.Rk, x0, x3, 58, 59) - XAR5_AVX2(ctx.Rk, x3, x2, 56, 57) - XAR9_AVX2(ctx.Rk, x2, x1, 54, 55) - XAR3_AVX2(ctx.Rk, x1, x0, 64, 65) - XAR5_AVX2(ctx.Rk, x0, x3, 62, 63) - XAR9_AVX2(ctx.Rk, x3, x2, 60, 61) - XAR3_AVX2(ctx.Rk, x2, x1, 70, 71) - XAR5_AVX2(ctx.Rk, x1, x0, 68, 69) - XAR9_AVX2(ctx.Rk, x0, x3, 66, 67) - - XAR3_AVX2(ctx.Rk, x3, x2, 76, 77) - XAR5_AVX2(ctx.Rk, x2, x1, 74, 75) - XAR9_AVX2(ctx.Rk, x1, x0, 72, 73) - XAR3_AVX2(ctx.Rk, x0, x3, 82, 83) - XAR5_AVX2(ctx.Rk, x3, x2, 80, 81) - XAR9_AVX2(ctx.Rk, x2, x1, 78, 79) - XAR3_AVX2(ctx.Rk, x1, x0, 88, 89) - XAR5_AVX2(ctx.Rk, x0, x3, 86, 87) - XAR9_AVX2(ctx.Rk, x3, x2, 84, 85) - XAR3_AVX2(ctx.Rk, x2, x1, 94, 95) - XAR5_AVX2(ctx.Rk, x1, x0, 92, 93) - XAR9_AVX2(ctx.Rk, x0, x3, 90, 91) - - XAR3_AVX2(ctx.Rk, x3, x2, 100, 101) - XAR5_AVX2(ctx.Rk, x2, x1, 98, 99) - XAR9_AVX2(ctx.Rk, x1, x0, 96, 97) - XAR3_AVX2(ctx.Rk, x0, x3, 106, 107) - XAR5_AVX2(ctx.Rk, x3, x2, 104, 105) - XAR9_AVX2(ctx.Rk, x2, x1, 102, 103) - XAR3_AVX2(ctx.Rk, x1, x0, 112, 113) - XAR5_AVX2(ctx.Rk, x0, x3, 110, 111) - XAR9_AVX2(ctx.Rk, x3, x2, 108, 109) - XAR3_AVX2(ctx.Rk, x2, x1, 118, 119) - XAR5_AVX2(ctx.Rk, x1, x0, 116, 117) - XAR9_AVX2(ctx.Rk, x0, x3, 114, 115) - - XAR3_AVX2(ctx.Rk, x3, x2, 124, 125) - XAR5_AVX2(ctx.Rk, x2, x1, 122, 123) - XAR9_AVX2(ctx.Rk, x1, x0, 120, 121) - XAR3_AVX2(ctx.Rk, x0, x3, 130, 131) - XAR5_AVX2(ctx.Rk, x3, x2, 128, 129) - XAR9_AVX2(ctx.Rk, x2, x1, 126, 127) - XAR3_AVX2(ctx.Rk, x1, x0, 136, 137) - XAR5_AVX2(ctx.Rk, x0, x3, 134, 135) - XAR9_AVX2(ctx.Rk, x3, x2, 132, 133) - XAR3_AVX2(ctx.Rk, x2, x1, 142, 143) - XAR5_AVX2(ctx.Rk, x1, x0, 140, 141) - XAR9_AVX2(ctx.Rk, x0, x3, 138, 139) - - CMPB(ctx.Round, U8(24)) - JBE(LabelRef("OVER24_END")) - XAR3_AVX2(ctx.Rk, x3, x2, 148, 149) - XAR5_AVX2(ctx.Rk, x2, x1, 146, 147) - XAR9_AVX2(ctx.Rk, x1, x0, 144, 145) - XAR3_AVX2(ctx.Rk, x0, x3, 154, 155) - XAR5_AVX2(ctx.Rk, x3, x2, 152, 153) - XAR9_AVX2(ctx.Rk, x2, x1, 150, 151) - XAR3_AVX2(ctx.Rk, x1, x0, 160, 161) - XAR5_AVX2(ctx.Rk, x0, x3, 158, 159) - XAR9_AVX2(ctx.Rk, x3, x2, 156, 157) - XAR3_AVX2(ctx.Rk, x2, x1, 166, 167) - XAR5_AVX2(ctx.Rk, x1, x0, 164, 165) - XAR9_AVX2(ctx.Rk, x0, x3, 162, 163) - Label("OVER24_END") - - CMPB(ctx.Round, U8(28)) - JBE(LabelRef("OVER28_END")) - XAR3_AVX2(ctx.Rk, x3, x2, 172, 173) - XAR5_AVX2(ctx.Rk, x2, x1, 170, 171) - XAR9_AVX2(ctx.Rk, x1, x0, 168, 169) - XAR3_AVX2(ctx.Rk, x0, x3, 178, 179) - XAR5_AVX2(ctx.Rk, x3, x2, 176, 177) - XAR9_AVX2(ctx.Rk, x2, x1, 174, 175) - XAR3_AVX2(ctx.Rk, x1, x0, 184, 185) - XAR5_AVX2(ctx.Rk, x0, x3, 182, 183) - XAR9_AVX2(ctx.Rk, x3, x2, 180, 181) - XAR3_AVX2(ctx.Rk, x2, x1, 190, 191) - XAR5_AVX2(ctx.Rk, x1, x0, 188, 189) - XAR9_AVX2(ctx.Rk, x0, x3, 186, 187) - Label("OVER28_END") - - /** - tmp128 = _mm256_extractf128_si256(x0, 0); - *((unsigned int *)ct + 0x00) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x04) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x08) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x0c) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x0, 1); - *((unsigned int *)ct + 0x10) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x14) = 
_mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x18) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x1c) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x1, 0); - *((unsigned int *)ct + 0x01) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x05) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x09) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x0d) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x1, 1); - *((unsigned int *)ct + 0x11) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x15) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x19) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x1d) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x2, 0); - *((unsigned int *)ct + 0x02) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x06) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x0a) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x0e) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x2, 1); - *((unsigned int *)ct + 0x12) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x16) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x1a) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x1e) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x3, 0); - *((unsigned int *)ct + 0x03) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x07) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x0b) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x0f) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x3, 1); - *((unsigned int *)ct + 0x13) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)ct + 0x17) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x1b) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)ct + 0x1f) = _mm_extract_epi32(tmp128, 3); - */ - leaAVX2Ymm2Int(x0, dst, 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c) - leaAVX2Ymm2Int(x1, dst, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d) - leaAVX2Ymm2Int(x2, dst, 0x02, 0x06, 0x0a, 0x0e, 0x12, 0x16, 0x1a, 0x1e) - leaAVX2Ymm2Int(x3, dst, 0x03, 0x07, 0x0b, 0x0f, 0x13, 0x17, 0x1b, 0x1f) - - /** - return - */ - RET() -} - -func leaDec8AVX2() { - TEXT("leaDec8AVX2", NOSPLIT, "func(ctx *leaContext, dst []byte, src []byte)") - - ctx := GetCtx() - dst := Mem{Base: Load(Param("dst").Base(), GP64())} - src := Mem{Base: Load(Param("src").Base(), GP64())} - - /** - __m256i x0, x1, x2, x3; - __m128i tmp128; - */ - x0 := YMM() - x1 := YMM() - x2 := YMM() - x3 := YMM() - - /** - x0 = _mm256_setr_epi32( - *((unsigned int *)ct + 0x00), *((unsigned int *)ct + 0x04), - *((unsigned int *)ct + 0x08), *((unsigned int *)ct + 0x0c), - *((unsigned int *)ct + 0x10), *((unsigned int *)ct + 0x14), - *((unsigned int *)ct + 0x18), *((unsigned int *)ct + 0x1c) - ); - x1 = _mm256_setr_epi32( - *((unsigned int *)ct + 0x01), *((unsigned int *)ct + 0x05), - *((unsigned int *)ct + 0x09), *((unsigned int *)ct + 0x0d), - *((unsigned int *)ct + 0x11), *((unsigned int *)ct + 0x15), - *((unsigned int *)ct + 0x19), *((unsigned int *)ct + 0x1d) - ); - x2 = _mm256_setr_epi32( - *((unsigned int *)ct + 0x02), *((unsigned int *)ct + 0x06), - *((unsigned int *)ct + 0x0a), *((unsigned int *)ct + 0x0e), - *((unsigned int *)ct + 0x12), *((unsigned int *)ct + 0x16), - *((unsigned int *)ct + 0x1a), *((unsigned int *)ct + 0x1e) - ); - x3 = _mm256_setr_epi32( - *((unsigned int *)ct + 0x03), *((unsigned int *)ct + 0x07), - *((unsigned int *)ct + 0x0b), 
*((unsigned int *)ct + 0x0f), - *((unsigned int *)ct + 0x13), *((unsigned int *)ct + 0x17), - *((unsigned int *)ct + 0x1b), *((unsigned int *)ct + 0x1f) - ); - */ - leaAVX2Int2Ymm(x0, src, 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c) - leaAVX2Int2Ymm(x1, src, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d) - leaAVX2Int2Ymm(x2, src, 0x02, 0x06, 0x0a, 0x0e, 0x12, 0x16, 0x1a, 0x1e) - leaAVX2Int2Ymm(x3, src, 0x03, 0x07, 0x0b, 0x0f, 0x13, 0x17, 0x1b, 0x1f) - - CMPB(ctx.Round, U8(28)) - JBE(LabelRef("OVER28_END")) - XSR9_AVX2(ctx.Rk, x0, x3, 186, 187) - XSR5_AVX2(ctx.Rk, x1, x0, 188, 189) - XSR3_AVX2(ctx.Rk, x2, x1, 190, 191) - XSR9_AVX2(ctx.Rk, x3, x2, 180, 181) - XSR5_AVX2(ctx.Rk, x0, x3, 182, 183) - XSR3_AVX2(ctx.Rk, x1, x0, 184, 185) - XSR9_AVX2(ctx.Rk, x2, x1, 174, 175) - XSR5_AVX2(ctx.Rk, x3, x2, 176, 177) - XSR3_AVX2(ctx.Rk, x0, x3, 178, 179) - XSR9_AVX2(ctx.Rk, x1, x0, 168, 169) - XSR5_AVX2(ctx.Rk, x2, x1, 170, 171) - XSR3_AVX2(ctx.Rk, x3, x2, 172, 173) - Label("OVER28_END") - - CMPB(ctx.Round, U8(24)) - JBE(LabelRef("OVER24_END")) - XSR9_AVX2(ctx.Rk, x0, x3, 162, 163) - XSR5_AVX2(ctx.Rk, x1, x0, 164, 165) - XSR3_AVX2(ctx.Rk, x2, x1, 166, 167) - XSR9_AVX2(ctx.Rk, x3, x2, 156, 157) - XSR5_AVX2(ctx.Rk, x0, x3, 158, 159) - XSR3_AVX2(ctx.Rk, x1, x0, 160, 161) - XSR9_AVX2(ctx.Rk, x2, x1, 150, 151) - XSR5_AVX2(ctx.Rk, x3, x2, 152, 153) - XSR3_AVX2(ctx.Rk, x0, x3, 154, 155) - XSR9_AVX2(ctx.Rk, x1, x0, 144, 145) - XSR5_AVX2(ctx.Rk, x2, x1, 146, 147) - XSR3_AVX2(ctx.Rk, x3, x2, 148, 149) - Label("OVER24_END") - - XSR9_AVX2(ctx.Rk, x0, x3, 138, 139) - XSR5_AVX2(ctx.Rk, x1, x0, 140, 141) - XSR3_AVX2(ctx.Rk, x2, x1, 142, 143) - XSR9_AVX2(ctx.Rk, x3, x2, 132, 133) - XSR5_AVX2(ctx.Rk, x0, x3, 134, 135) - XSR3_AVX2(ctx.Rk, x1, x0, 136, 137) - XSR9_AVX2(ctx.Rk, x2, x1, 126, 127) - XSR5_AVX2(ctx.Rk, x3, x2, 128, 129) - XSR3_AVX2(ctx.Rk, x0, x3, 130, 131) - XSR9_AVX2(ctx.Rk, x1, x0, 120, 121) - XSR5_AVX2(ctx.Rk, x2, x1, 122, 123) - XSR3_AVX2(ctx.Rk, x3, x2, 124, 125) - - XSR9_AVX2(ctx.Rk, x0, x3, 114, 115) - XSR5_AVX2(ctx.Rk, x1, x0, 116, 117) - XSR3_AVX2(ctx.Rk, x2, x1, 118, 119) - XSR9_AVX2(ctx.Rk, x3, x2, 108, 109) - XSR5_AVX2(ctx.Rk, x0, x3, 110, 111) - XSR3_AVX2(ctx.Rk, x1, x0, 112, 113) - XSR9_AVX2(ctx.Rk, x2, x1, 102, 103) - XSR5_AVX2(ctx.Rk, x3, x2, 104, 105) - XSR3_AVX2(ctx.Rk, x0, x3, 106, 107) - XSR9_AVX2(ctx.Rk, x1, x0, 96, 97) - XSR5_AVX2(ctx.Rk, x2, x1, 98, 99) - XSR3_AVX2(ctx.Rk, x3, x2, 100, 101) - - XSR9_AVX2(ctx.Rk, x0, x3, 90, 91) - XSR5_AVX2(ctx.Rk, x1, x0, 92, 93) - XSR3_AVX2(ctx.Rk, x2, x1, 94, 95) - XSR9_AVX2(ctx.Rk, x3, x2, 84, 85) - XSR5_AVX2(ctx.Rk, x0, x3, 86, 87) - XSR3_AVX2(ctx.Rk, x1, x0, 88, 89) - XSR9_AVX2(ctx.Rk, x2, x1, 78, 79) - XSR5_AVX2(ctx.Rk, x3, x2, 80, 81) - XSR3_AVX2(ctx.Rk, x0, x3, 82, 83) - XSR9_AVX2(ctx.Rk, x1, x0, 72, 73) - XSR5_AVX2(ctx.Rk, x2, x1, 74, 75) - XSR3_AVX2(ctx.Rk, x3, x2, 76, 77) - - XSR9_AVX2(ctx.Rk, x0, x3, 66, 67) - XSR5_AVX2(ctx.Rk, x1, x0, 68, 69) - XSR3_AVX2(ctx.Rk, x2, x1, 70, 71) - XSR9_AVX2(ctx.Rk, x3, x2, 60, 61) - XSR5_AVX2(ctx.Rk, x0, x3, 62, 63) - XSR3_AVX2(ctx.Rk, x1, x0, 64, 65) - XSR9_AVX2(ctx.Rk, x2, x1, 54, 55) - XSR5_AVX2(ctx.Rk, x3, x2, 56, 57) - XSR3_AVX2(ctx.Rk, x0, x3, 58, 59) - XSR9_AVX2(ctx.Rk, x1, x0, 48, 49) - XSR5_AVX2(ctx.Rk, x2, x1, 50, 51) - XSR3_AVX2(ctx.Rk, x3, x2, 52, 53) - - XSR9_AVX2(ctx.Rk, x0, x3, 42, 43) - XSR5_AVX2(ctx.Rk, x1, x0, 44, 45) - XSR3_AVX2(ctx.Rk, x2, x1, 46, 47) - XSR9_AVX2(ctx.Rk, x3, x2, 36, 37) - XSR5_AVX2(ctx.Rk, x0, x3, 38, 39) - XSR3_AVX2(ctx.Rk, x1, x0, 40, 41) - XSR9_AVX2(ctx.Rk, x2, 
x1, 30, 31) - XSR5_AVX2(ctx.Rk, x3, x2, 32, 33) - XSR3_AVX2(ctx.Rk, x0, x3, 34, 35) - XSR9_AVX2(ctx.Rk, x1, x0, 24, 25) - XSR5_AVX2(ctx.Rk, x2, x1, 26, 27) - XSR3_AVX2(ctx.Rk, x3, x2, 28, 29) - - XSR9_AVX2(ctx.Rk, x0, x3, 18, 19) - XSR5_AVX2(ctx.Rk, x1, x0, 20, 21) - XSR3_AVX2(ctx.Rk, x2, x1, 22, 23) - XSR9_AVX2(ctx.Rk, x3, x2, 12, 13) - XSR5_AVX2(ctx.Rk, x0, x3, 14, 15) - XSR3_AVX2(ctx.Rk, x1, x0, 16, 17) - XSR9_AVX2(ctx.Rk, x2, x1, 6, 7) - XSR5_AVX2(ctx.Rk, x3, x2, 8, 9) - XSR3_AVX2(ctx.Rk, x0, x3, 10, 11) - XSR9_AVX2(ctx.Rk, x1, x0, 0, 1) - XSR5_AVX2(ctx.Rk, x2, x1, 2, 3) - XSR3_AVX2(ctx.Rk, x3, x2, 4, 5) - - /** - tmp128 = _mm256_extractf128_si256(x0, 0); - *((unsigned int *)pt + 0x00) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x04) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x08) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x0c) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x0, 1); - *((unsigned int *)pt + 0x10) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x14) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x18) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x1c) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x1, 0); - *((unsigned int *)pt + 0x01) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x05) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x09) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x0d) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x1, 1); - *((unsigned int *)pt + 0x11) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x15) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x19) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x1d) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x2, 0); - *((unsigned int *)pt + 0x02) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x06) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x0a) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x0e) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x2, 1); - *((unsigned int *)pt + 0x12) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x16) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x1a) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x1e) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x3, 0); - *((unsigned int *)pt + 0x03) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x07) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x0b) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x0f) = _mm_extract_epi32(tmp128, 3); - tmp128 = _mm256_extractf128_si256(x3, 1); - *((unsigned int *)pt + 0x13) = _mm_extract_epi32(tmp128, 0); *((unsigned int *)pt + 0x17) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)pt + 0x1b) = _mm_extract_epi32(tmp128, 2); *((unsigned int *)pt + 0x1f) = _mm_extract_epi32(tmp128, 3); - */ - leaAVX2Ymm2Int(x0, dst, 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c) - leaAVX2Ymm2Int(x1, dst, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d) - leaAVX2Ymm2Int(x2, dst, 0x02, 0x06, 0x0a, 0x0e, 0x12, 0x16, 0x1a, 0x1e) - leaAVX2Ymm2Int(x3, dst, 0x03, 0x07, 0x0b, 0x0f, 0x13, 0x17, 0x1b, 0x1f) - - /** - return - */ - RET() -} - -func leaAVX2Int2Ymm(dst VecVirtual, src Mem, r0, r1, r2, r3, r4, r5, r6, r7 int) { - /** - vmovd xmm1, dword ptr [rsi ] # xmm1 = mem[0],zero,zero,zero - vpinsrd xmm1, xmm1, dword ptr [rsi + 16], 1 - vpinsrd xmm1, xmm1, dword ptr [rsi + 32], 2 - 
vpinsrd xmm1, xmm1, dword ptr [rsi + 48], 3 - - vmovd xmm0, dword ptr [rsi + 64] # xmm0 = mem[0],zero,zero,zero - vpinsrd xmm0, xmm0, dword ptr [rsi + 80], 1 - vpinsrd xmm0, xmm0, dword ptr [rsi + 96], 2 - vpinsrd xmm0, xmm0, dword ptr [rsi + 112], 3 - - vinserti128 ymm0, ymm1, xmm0, 1 - */ - - ymm0 := dst - xmm0 := dst.AsX() - - ymm1 := YMM() - xmm1 := ymm1.AsX() - - VMOVD(src.Offset(4*r0), xmm0) - VPINSRD(U8(1), src.Offset(4*r1), xmm0, xmm0) - VPINSRD(U8(2), src.Offset(4*r2), xmm0, xmm0) - VPINSRD(U8(3), src.Offset(4*r3), xmm0, xmm0) - - VMOVD(src.Offset(4*r4), xmm1) - VPINSRD(U8(1), src.Offset(4*r5), xmm1, xmm1) - VPINSRD(U8(2), src.Offset(4*r6), xmm1, xmm1) - VPINSRD(U8(3), src.Offset(4*r7), xmm1, xmm1) - - VINSERTI128(U8(1), xmm1, ymm0, ymm0) -} - -func leaAVX2Ymm2Int(y VecVirtual, dst Mem, d0, d1, d2, d3, d4, d5, d6, d7 int) { - /** - tmp128 = _mm256_extractf128_si256(x0, 0); - *((unsigned int *)ct) = _mm_extract_epi32(tmp128, 0); - *((unsigned int *)ct + 0x04) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x08) = _mm_extract_epi32(tmp128, 2); - *((unsigned int *)ct + 0x0c) = _mm_extract_epi32(tmp128, 3); - - tmp128 = _mm256_extractf128_si256(x0, 1); - *((unsigned int *)ct + 0x10) = _mm_extract_epi32(tmp128, 0); - *((unsigned int *)ct + 0x14) = _mm_extract_epi32(tmp128, 1); - *((unsigned int *)ct + 0x18) = _mm_extract_epi32(tmp128, 2); - *((unsigned int *)ct + 0x1c) = _mm_extract_epi32(tmp128, 3); - - 0 128 256 - xmm | 00000000 | 00000000 | 00000000 | 00000000 | | - ymm | 00000000 | 00000000 | 00000000 | 00000000 | 00000000 | 00000000 | 00000000 | 00000000 | - | x0 x1 x2 x3 | x4 x5 x6 x7 | - */ - - ymm0 := y - xmm0 := y.AsX() - - VEXTRACTPS(U8(0), xmm0, dst.Offset(4*d0)) - VEXTRACTPS(U8(1), xmm0, dst.Offset(4*d1)) - VEXTRACTPS(U8(2), xmm0, dst.Offset(4*d2)) - VEXTRACTPS(U8(3), xmm0, dst.Offset(4*d3)) - - VEXTRACTF128(U8(1), ymm0, xmm0) - VEXTRACTPS(U8(0), xmm0, dst.Offset(4*d4)) - VEXTRACTPS(U8(1), xmm0, dst.Offset(4*d5)) - VEXTRACTPS(U8(2), xmm0, dst.Offset(4*d6)) - VEXTRACTPS(U8(3), xmm0, dst.Offset(4*d7)) -} - -func XAR_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2, a, b int) { - /** - tmp = _mm256_add_epi32( ----> tmp0 - _mm256_xor_si256( ------> tmp0 - pre, - _mm256_set1_epi32(ctx.Rk1) --------> tmp0 - ), - _mm256_xor_si256( ------> tmp1 - cur, - _mm256_set1_epi32(ctx.Rk2) --------> tmp1 - ) - ); - cur = _mm256_xor_si256( ----> cur - _mm256_srli_epi32(tmp, a), ------> cur - _mm256_slli_epi32(tmp, b) ------> tmp0 - ); - */ - - tmp0 := YMM() - tmp1 := YMM() - - VPBROADCASTD(rk.Offset(4*rk1), tmp0) - VPBROADCASTD(rk.Offset(4*rk2), tmp1) - - VPXOR(pre, tmp0, tmp0) - VPXOR(cur, tmp1, tmp1) - VPADDD(tmp1, tmp0, tmp0) - - VPSRLD(U8(a), tmp0, cur) - VPSLLD(U8(b), tmp0, tmp0) - VPXOR(tmp0, cur, cur) -} -func XAR3_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XAR_AVX2(rk, cur, pre, rk1, rk2, 3, 29) -} -func XAR5_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XAR_AVX2(rk, cur, pre, rk1, rk2, 5, 27) -} -func XAR9_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XAR_AVX2(rk, cur, pre, rk1, rk2, 23, 9) -} - -func XSR_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2, a, b int) { - /** - cur = _mm256_xor_si256( ----> cur - _mm256_sub_epi32( ------> cur - _mm256_xor_si256( --------> cur - _mm256_srli_epi32(cur, a), ----------> tmp - _mm256_slli_epi32(cur, b) ----------> cur - ), - _mm256_xor_si256( --------> tmp - pre, - _mm256_set1_epi32(ctx.Rk1) ----------> tmp - ) - ), - _mm256_set1_epi32(ctx.Rk2) ------> tmp - ); - */ - - tmp := YMM() - - VPSRLD(U8(a), cur, tmp) - - 
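The VPSRLD/VPSLLD/VPXOR triple being assembled here is a 32-bit rotate, and the whole XSR_AVX2 sequence is the exact inverse of XAR_AVX2. A hedged scalar model (`xsrScalar` is an illustrative name, assuming `math/bits`):

```go
// Scalar model of one XSR step: rotate cur left (left = 23 undoes XAR9's
// ROL9, 5 undoes XAR5's ROR5, 3 undoes XAR3's ROR3), subtract (pre ^ rk1),
// then xor rk2, inverting tmp = (pre^rk1)+(cur^rk2); cur = rot(tmp).
func xsrScalar(cur, pre, rk1, rk2 uint32, left int) uint32 {
	t := bits.RotateLeft32(cur, left)
	return (t - (pre ^ rk1)) ^ rk2
}
```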
VPSLLD(U8(b), cur, cur) - VPXOR(tmp, cur, cur) - - VPBROADCASTD(rk.Offset(4*rk1), tmp) - VPXOR(pre, tmp, tmp) - - VPSUBD(tmp, cur, cur) - - VPBROADCASTD(rk.Offset(4*rk2), tmp) - - VPXOR(tmp, cur, cur) -} - -func XSR9_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XSR_AVX2(rk, cur, pre, rk1, rk2, 9, 23) -} -func XSR5_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XSR_AVX2(rk, cur, pre, rk1, rk2, 27, 5) -} -func XSR3_AVX2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XSR_AVX2(rk, cur, pre, rk1, rk2, 29, 3) -} diff --git a/lea/amd64_avo/lea_sse2.go b/lea/amd64_avo/lea_sse2.go deleted file mode 100644 index 9b3f230..0000000 --- a/lea/amd64_avo/lea_sse2.go +++ /dev/null @@ -1,475 +0,0 @@ -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . "github.com/mmcloughlin/avo/reg" -) - -func leaEnc4SSE2() { - TEXT("leaEnc4SSE2", NOSPLIT, "func(ctx *leaContext, dst []byte, src []byte)") - - ctx := GetCtx() - dst := Mem{Base: Load(Param("dst").Base(), GP64())} - src := Mem{Base: Load(Param("src").Base(), GP64())} - - /** - __m128i x0, x1, x2, x3, tmp; - __m128i tmp0, tmp1, tmp2, tmp3; - */ - x0 := XMM() - x1 := XMM() - x2 := XMM() - x3 := XMM() - - MOVUPS(src.Offset(0x00), x0) - MOVUPS(src.Offset(0x10), x1) - MOVUPS(src.Offset(0x20), x2) - MOVUPS(src.Offset(0x30), x3) - x0, x1, x2, x3 = leaSSE2Swap(x0, x1, x2, x3) - - XAR3_SSE2(ctx.Rk, x3, x2, 4, 5) - XAR5_SSE2(ctx.Rk, x2, x1, 2, 3) - XAR9_SSE2(ctx.Rk, x1, x0, 0, 1) - XAR3_SSE2(ctx.Rk, x0, x3, 10, 11) - XAR5_SSE2(ctx.Rk, x3, x2, 8, 9) - XAR9_SSE2(ctx.Rk, x2, x1, 6, 7) - XAR3_SSE2(ctx.Rk, x1, x0, 16, 17) - XAR5_SSE2(ctx.Rk, x0, x3, 14, 15) - XAR9_SSE2(ctx.Rk, x3, x2, 12, 13) - XAR3_SSE2(ctx.Rk, x2, x1, 22, 23) - XAR5_SSE2(ctx.Rk, x1, x0, 20, 21) - XAR9_SSE2(ctx.Rk, x0, x3, 18, 19) - - XAR3_SSE2(ctx.Rk, x3, x2, 28, 29) - XAR5_SSE2(ctx.Rk, x2, x1, 26, 27) - XAR9_SSE2(ctx.Rk, x1, x0, 24, 25) - XAR3_SSE2(ctx.Rk, x0, x3, 34, 35) - XAR5_SSE2(ctx.Rk, x3, x2, 32, 33) - XAR9_SSE2(ctx.Rk, x2, x1, 30, 31) - XAR3_SSE2(ctx.Rk, x1, x0, 40, 41) - XAR5_SSE2(ctx.Rk, x0, x3, 38, 39) - XAR9_SSE2(ctx.Rk, x3, x2, 36, 37) - XAR3_SSE2(ctx.Rk, x2, x1, 46, 47) - XAR5_SSE2(ctx.Rk, x1, x0, 44, 45) - XAR9_SSE2(ctx.Rk, x0, x3, 42, 43) - - XAR3_SSE2(ctx.Rk, x3, x2, 52, 53) - XAR5_SSE2(ctx.Rk, x2, x1, 50, 51) - XAR9_SSE2(ctx.Rk, x1, x0, 48, 49) - XAR3_SSE2(ctx.Rk, x0, x3, 58, 59) - XAR5_SSE2(ctx.Rk, x3, x2, 56, 57) - XAR9_SSE2(ctx.Rk, x2, x1, 54, 55) - XAR3_SSE2(ctx.Rk, x1, x0, 64, 65) - XAR5_SSE2(ctx.Rk, x0, x3, 62, 63) - XAR9_SSE2(ctx.Rk, x3, x2, 60, 61) - XAR3_SSE2(ctx.Rk, x2, x1, 70, 71) - XAR5_SSE2(ctx.Rk, x1, x0, 68, 69) - XAR9_SSE2(ctx.Rk, x0, x3, 66, 67) - - XAR3_SSE2(ctx.Rk, x3, x2, 76, 77) - XAR5_SSE2(ctx.Rk, x2, x1, 74, 75) - XAR9_SSE2(ctx.Rk, x1, x0, 72, 73) - XAR3_SSE2(ctx.Rk, x0, x3, 82, 83) - XAR5_SSE2(ctx.Rk, x3, x2, 80, 81) - XAR9_SSE2(ctx.Rk, x2, x1, 78, 79) - XAR3_SSE2(ctx.Rk, x1, x0, 88, 89) - XAR5_SSE2(ctx.Rk, x0, x3, 86, 87) - XAR9_SSE2(ctx.Rk, x3, x2, 84, 85) - XAR3_SSE2(ctx.Rk, x2, x1, 94, 95) - XAR5_SSE2(ctx.Rk, x1, x0, 92, 93) - XAR9_SSE2(ctx.Rk, x0, x3, 90, 91) - - XAR3_SSE2(ctx.Rk, x3, x2, 100, 101) - XAR5_SSE2(ctx.Rk, x2, x1, 98, 99) - XAR9_SSE2(ctx.Rk, x1, x0, 96, 97) - XAR3_SSE2(ctx.Rk, x0, x3, 106, 107) - XAR5_SSE2(ctx.Rk, x3, x2, 104, 105) - XAR9_SSE2(ctx.Rk, x2, x1, 102, 103) - XAR3_SSE2(ctx.Rk, x1, x0, 112, 113) - XAR5_SSE2(ctx.Rk, x0, x3, 110, 111) - XAR9_SSE2(ctx.Rk, x3, x2, 108, 109) - XAR3_SSE2(ctx.Rk, x2, x1, 118, 119) - XAR5_SSE2(ctx.Rk, x1, x0, 116, 117) - XAR9_SSE2(ctx.Rk, x0, 
x3, 114, 115) - - XAR3_SSE2(ctx.Rk, x3, x2, 124, 125) - XAR5_SSE2(ctx.Rk, x2, x1, 122, 123) - XAR9_SSE2(ctx.Rk, x1, x0, 120, 121) - XAR3_SSE2(ctx.Rk, x0, x3, 130, 131) - XAR5_SSE2(ctx.Rk, x3, x2, 128, 129) - XAR9_SSE2(ctx.Rk, x2, x1, 126, 127) - XAR3_SSE2(ctx.Rk, x1, x0, 136, 137) - XAR5_SSE2(ctx.Rk, x0, x3, 134, 135) - XAR9_SSE2(ctx.Rk, x3, x2, 132, 133) - XAR3_SSE2(ctx.Rk, x2, x1, 142, 143) - XAR5_SSE2(ctx.Rk, x1, x0, 140, 141) - XAR9_SSE2(ctx.Rk, x0, x3, 138, 139) - - CMPB(ctx.Round, U8(24)) - JBE(LabelRef("OVER24_END")) - XAR3_SSE2(ctx.Rk, x3, x2, 148, 149) - XAR5_SSE2(ctx.Rk, x2, x1, 146, 147) - XAR9_SSE2(ctx.Rk, x1, x0, 144, 145) - XAR3_SSE2(ctx.Rk, x0, x3, 154, 155) - XAR5_SSE2(ctx.Rk, x3, x2, 152, 153) - XAR9_SSE2(ctx.Rk, x2, x1, 150, 151) - XAR3_SSE2(ctx.Rk, x1, x0, 160, 161) - XAR5_SSE2(ctx.Rk, x0, x3, 158, 159) - XAR9_SSE2(ctx.Rk, x3, x2, 156, 157) - XAR3_SSE2(ctx.Rk, x2, x1, 166, 167) - XAR5_SSE2(ctx.Rk, x1, x0, 164, 165) - XAR9_SSE2(ctx.Rk, x0, x3, 162, 163) - Label("OVER24_END") - - CMPB(ctx.Round, U8(28)) - JBE(LabelRef("OVER28_END")) - XAR3_SSE2(ctx.Rk, x3, x2, 172, 173) - XAR5_SSE2(ctx.Rk, x2, x1, 170, 171) - XAR9_SSE2(ctx.Rk, x1, x0, 168, 169) - XAR3_SSE2(ctx.Rk, x0, x3, 178, 179) - XAR5_SSE2(ctx.Rk, x3, x2, 176, 177) - XAR9_SSE2(ctx.Rk, x2, x1, 174, 175) - XAR3_SSE2(ctx.Rk, x1, x0, 184, 185) - XAR5_SSE2(ctx.Rk, x0, x3, 182, 183) - XAR9_SSE2(ctx.Rk, x3, x2, 180, 181) - XAR3_SSE2(ctx.Rk, x2, x1, 190, 191) - XAR5_SSE2(ctx.Rk, x1, x0, 188, 189) - XAR9_SSE2(ctx.Rk, x0, x3, 186, 187) - Label("OVER28_END") - - x0, x1, x2, x3 = leaSSE2Swap(x0, x1, x2, x3) - MOVUPS(x0, dst.Offset(0x00)) - MOVUPS(x1, dst.Offset(0x10)) - MOVUPS(x2, dst.Offset(0x20)) - MOVUPS(x3, dst.Offset(0x30)) - - /** - return; - */ - RET() -} - -func leaDec4SSE2() { - TEXT("leaDec4SSE2", NOSPLIT, "func(ctx *leaContext, dst []byte, src []byte)") - - ctx := GetCtx() - dst := Mem{Base: Load(Param("dst").Base(), GP64())} - src := Mem{Base: Load(Param("src").Base(), GP64())} - - /** - __m128i x0, x1, x2, x3; - */ - x0 := XMM() - x1 := XMM() - x2 := XMM() - x3 := XMM() - - MOVUPS(src.Offset(0x00), x0) - MOVUPS(src.Offset(0x10), x1) - MOVUPS(src.Offset(0x20), x2) - MOVUPS(src.Offset(0x30), x3) - x0, x1, x2, x3 = leaSSE2Swap(x0, x1, x2, x3) - - CMPB(ctx.Round, U8(28)) - JBE(LabelRef("OVER28_END")) - XSR9_SSE2(ctx.Rk, x0, x3, 186, 187) - XSR5_SSE2(ctx.Rk, x1, x0, 188, 189) - XSR3_SSE2(ctx.Rk, x2, x1, 190, 191) - XSR9_SSE2(ctx.Rk, x3, x2, 180, 181) - XSR5_SSE2(ctx.Rk, x0, x3, 182, 183) - XSR3_SSE2(ctx.Rk, x1, x0, 184, 185) - XSR9_SSE2(ctx.Rk, x2, x1, 174, 175) - XSR5_SSE2(ctx.Rk, x3, x2, 176, 177) - XSR3_SSE2(ctx.Rk, x0, x3, 178, 179) - XSR9_SSE2(ctx.Rk, x1, x0, 168, 169) - XSR5_SSE2(ctx.Rk, x2, x1, 170, 171) - XSR3_SSE2(ctx.Rk, x3, x2, 172, 173) - Label("OVER28_END") - - CMPB(ctx.Round, U8(24)) - JBE(LabelRef("OVER24_END")) - XSR9_SSE2(ctx.Rk, x0, x3, 162, 163) - XSR5_SSE2(ctx.Rk, x1, x0, 164, 165) - XSR3_SSE2(ctx.Rk, x2, x1, 166, 167) - XSR9_SSE2(ctx.Rk, x3, x2, 156, 157) - XSR5_SSE2(ctx.Rk, x0, x3, 158, 159) - XSR3_SSE2(ctx.Rk, x1, x0, 160, 161) - XSR9_SSE2(ctx.Rk, x2, x1, 150, 151) - XSR5_SSE2(ctx.Rk, x3, x2, 152, 153) - XSR3_SSE2(ctx.Rk, x0, x3, 154, 155) - XSR9_SSE2(ctx.Rk, x1, x0, 144, 145) - XSR5_SSE2(ctx.Rk, x2, x1, 146, 147) - XSR3_SSE2(ctx.Rk, x3, x2, 148, 149) - Label("OVER24_END") - - XSR9_SSE2(ctx.Rk, x0, x3, 138, 139) - XSR5_SSE2(ctx.Rk, x1, x0, 140, 141) - XSR3_SSE2(ctx.Rk, x2, x1, 142, 143) - XSR9_SSE2(ctx.Rk, x3, x2, 132, 133) - XSR5_SSE2(ctx.Rk, x0, x3, 134, 135) - XSR3_SSE2(ctx.Rk, x1, x0, 136, 137) - 
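The CMPB/JBE guards here (like the `if (round > 24)` / `if (round > 28)` blocks in the C sources later in this diff) select among LEA's three round counts. A sketch of the key-size-to-rounds mapping (`leaRounds` is an illustrative helper, not in this diff):

```go
import "fmt"

// LEA uses 24 rounds for 128-bit keys, 28 for 192-bit, and 32 for 256-bit,
// which is why the 24- and 28-round tails are conditionally executed.
func leaRounds(keyBytes int) (int, error) {
	switch keyBytes {
	case 16:
		return 24, nil
	case 24:
		return 28, nil
	case 32:
		return 32, nil
	default:
		return 0, fmt.Errorf("lea: invalid key size %d bytes", keyBytes)
	}
}
```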
XSR9_SSE2(ctx.Rk, x2, x1, 126, 127) - XSR5_SSE2(ctx.Rk, x3, x2, 128, 129) - XSR3_SSE2(ctx.Rk, x0, x3, 130, 131) - XSR9_SSE2(ctx.Rk, x1, x0, 120, 121) - XSR5_SSE2(ctx.Rk, x2, x1, 122, 123) - XSR3_SSE2(ctx.Rk, x3, x2, 124, 125) - - XSR9_SSE2(ctx.Rk, x0, x3, 114, 115) - XSR5_SSE2(ctx.Rk, x1, x0, 116, 117) - XSR3_SSE2(ctx.Rk, x2, x1, 118, 119) - XSR9_SSE2(ctx.Rk, x3, x2, 108, 109) - XSR5_SSE2(ctx.Rk, x0, x3, 110, 111) - XSR3_SSE2(ctx.Rk, x1, x0, 112, 113) - XSR9_SSE2(ctx.Rk, x2, x1, 102, 103) - XSR5_SSE2(ctx.Rk, x3, x2, 104, 105) - XSR3_SSE2(ctx.Rk, x0, x3, 106, 107) - XSR9_SSE2(ctx.Rk, x1, x0, 96, 97) - XSR5_SSE2(ctx.Rk, x2, x1, 98, 99) - XSR3_SSE2(ctx.Rk, x3, x2, 100, 101) - - XSR9_SSE2(ctx.Rk, x0, x3, 90, 91) - XSR5_SSE2(ctx.Rk, x1, x0, 92, 93) - XSR3_SSE2(ctx.Rk, x2, x1, 94, 95) - XSR9_SSE2(ctx.Rk, x3, x2, 84, 85) - XSR5_SSE2(ctx.Rk, x0, x3, 86, 87) - XSR3_SSE2(ctx.Rk, x1, x0, 88, 89) - XSR9_SSE2(ctx.Rk, x2, x1, 78, 79) - XSR5_SSE2(ctx.Rk, x3, x2, 80, 81) - XSR3_SSE2(ctx.Rk, x0, x3, 82, 83) - XSR9_SSE2(ctx.Rk, x1, x0, 72, 73) - XSR5_SSE2(ctx.Rk, x2, x1, 74, 75) - XSR3_SSE2(ctx.Rk, x3, x2, 76, 77) - - XSR9_SSE2(ctx.Rk, x0, x3, 66, 67) - XSR5_SSE2(ctx.Rk, x1, x0, 68, 69) - XSR3_SSE2(ctx.Rk, x2, x1, 70, 71) - XSR9_SSE2(ctx.Rk, x3, x2, 60, 61) - XSR5_SSE2(ctx.Rk, x0, x3, 62, 63) - XSR3_SSE2(ctx.Rk, x1, x0, 64, 65) - XSR9_SSE2(ctx.Rk, x2, x1, 54, 55) - XSR5_SSE2(ctx.Rk, x3, x2, 56, 57) - XSR3_SSE2(ctx.Rk, x0, x3, 58, 59) - XSR9_SSE2(ctx.Rk, x1, x0, 48, 49) - XSR5_SSE2(ctx.Rk, x2, x1, 50, 51) - XSR3_SSE2(ctx.Rk, x3, x2, 52, 53) - - XSR9_SSE2(ctx.Rk, x0, x3, 42, 43) - XSR5_SSE2(ctx.Rk, x1, x0, 44, 45) - XSR3_SSE2(ctx.Rk, x2, x1, 46, 47) - XSR9_SSE2(ctx.Rk, x3, x2, 36, 37) - XSR5_SSE2(ctx.Rk, x0, x3, 38, 39) - XSR3_SSE2(ctx.Rk, x1, x0, 40, 41) - XSR9_SSE2(ctx.Rk, x2, x1, 30, 31) - XSR5_SSE2(ctx.Rk, x3, x2, 32, 33) - XSR3_SSE2(ctx.Rk, x0, x3, 34, 35) - XSR9_SSE2(ctx.Rk, x1, x0, 24, 25) - XSR5_SSE2(ctx.Rk, x2, x1, 26, 27) - XSR3_SSE2(ctx.Rk, x3, x2, 28, 29) - - XSR9_SSE2(ctx.Rk, x0, x3, 18, 19) - XSR5_SSE2(ctx.Rk, x1, x0, 20, 21) - XSR3_SSE2(ctx.Rk, x2, x1, 22, 23) - XSR9_SSE2(ctx.Rk, x3, x2, 12, 13) - XSR5_SSE2(ctx.Rk, x0, x3, 14, 15) - XSR3_SSE2(ctx.Rk, x1, x0, 16, 17) - XSR9_SSE2(ctx.Rk, x2, x1, 6, 7) - XSR5_SSE2(ctx.Rk, x3, x2, 8, 9) - XSR3_SSE2(ctx.Rk, x0, x3, 10, 11) - XSR9_SSE2(ctx.Rk, x1, x0, 0, 1) - XSR5_SSE2(ctx.Rk, x2, x1, 2, 3) - XSR3_SSE2(ctx.Rk, x3, x2, 4, 5) - - x0, x1, x2, x3 = leaSSE2Swap(x0, x1, x2, x3) - MOVUPS(x0, dst.Offset(0x00)) - MOVUPS(x1, dst.Offset(0x10)) - MOVUPS(x2, dst.Offset(0x20)) - MOVUPS(x3, dst.Offset(0x30)) - - /** - return; - */ - RET() -} - -func leaSSE2Swap(x0, x1, x2, x3 VecVirtual) (VecVirtual, VecVirtual, VecVirtual, VecVirtual) { - /** - tmp0 = _mm_unpacklo_epi32(x0, x1); - tmp1 = _mm_unpacklo_epi32(x2, x3); - tmp2 = _mm_unpackhi_epi32(x0, x1); - tmp3 = _mm_unpackhi_epi32(x2, x3); - - x0 = _mm_unpacklo_epi64(tmp0, tmp1); - x1 = _mm_unpackhi_epi64(tmp0, tmp1); - x2 = _mm_unpacklo_epi64(tmp2, tmp3); - x3 = _mm_unpackhi_epi64(tmp2, tmp3); - - x86-64 clang 14.0.0 - -O3 -msse2 - - movups xmm0, xmmword ptr [rdi + 2] - movups xmm1, xmmword ptr [rdi + 20] - movups xmm2, xmmword ptr [rdi + 200] - movups xmm3, xmmword ptr [rdi + 2000] - - movaps xmm4, xmm0 - unpcklps xmm4, xmm1 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] - movaps xmm5, xmm2 - unpcklps xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] - unpckhps xmm0, xmm1 # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] - unpckhps xmm2, xmm3 # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] - - movaps xmm1, xmm4 - movlhps xmm1, xmm5 # 
xmm1 = xmm1[0],xmm5[0] - unpckhpd xmm4, xmm5 # xmm4 = xmm4[1],xmm5[1] - movaps xmm3, xmm0 - movlhps xmm3, xmm2 # xmm3 = xmm3[0],xmm2[0] - unpckhpd xmm0, xmm2 # xmm0 = xmm0[1],xmm2[1] - - movups xmmword ptr [rdi + 2], xmm1 - movups xmmword ptr [rdi + 20], xmm4 - movups xmmword ptr [rdi + 200], xmm3 - movups xmmword ptr [rdi + 2000], xmm0 - */ - xmm0 := x0 - xmm1 := x1 - xmm2 := x2 - xmm3 := x3 - xmm4 := XMM() - xmm5 := XMM() - - MOVAPS(xmm0, xmm4) - UNPCKLPS(xmm1, xmm4) - - MOVAPS(xmm2, xmm5) - UNPCKLPS(xmm3, xmm5) - - UNPCKHPS(xmm1, xmm0) - - UNPCKHPS(xmm3, xmm2) - - MOVAPS(xmm4, xmm1) - MOVLHPS(xmm5, xmm1) - - UNPCKHPD(xmm5, xmm4) - - MOVAPS(xmm0, xmm3) - MOVLHPS(xmm2, xmm3) - - UNPCKHPD(xmm2, xmm0) - - return xmm1, xmm4, xmm3, xmm0 -} - -func XAR_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2, a, b int) { - /** - tmp = _mm_add_epi32( - _mm_xor_si128( - pre, - _mm_set1_epi32(rk1) - ), - _mm_xor_si128( - cur, - _mm_set1_epi32(rk2) - ) - ); - cur = _mm_xor_si128( - _mm_srli_epi32(tmp, 3), - _mm_slli_epi32(tmp, 29) - ) - */ - - tmp0 := XMM() - tmp1 := XMM() - - { - { - MOVD(rk.Offset(4*rk1), tmp0) - PSHUFD(U8(0), tmp0, tmp0) - } - PXOR(pre, tmp0) - } - { - { - MOVD(rk.Offset(4*rk2), tmp1) - PSHUFD(U8(0), tmp1, tmp1) - } - PXOR(cur, tmp1) - } - PADDL(tmp1, tmp0) // paddd - - { - MOVO(tmp0, cur) // movdqa - PSRLL(U8(a), cur) // psrld - } - { - PSLLL(U8(b), tmp0) // pslld - } - PXOR(tmp0, cur) -} -func XAR3_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XAR_SSE2(rk, cur, pre, rk1, rk2, 3, 29) -} -func XAR5_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XAR_SSE2(rk, cur, pre, rk1, rk2, 5, 27) -} -func XAR9_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XAR_SSE2(rk, cur, pre, rk1, rk2, 23, 9) -} - -func XSR_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int, a, b int) { - /** - cur = _mm_xor_si128( - _mm_sub_epi32( - _mm_xor_si128( - _mm_srli_epi32(cur, a), - _mm_slli_epi32(cur, b) - ), - _mm_xor_si128( - pre, - _mm_set1_epi32(rk1) - ) - ), - _mm_set1_epi32(rk2) - ); - */ - - tmp := XMM() - - { - - { - { - MOVO(cur, tmp) - PSRLL(U8(a), tmp) // psrld - } - { - PSLLL(U8(b), cur) // pslld - } - PXOR(tmp, cur) - } - { - { - MOVD(rk.Offset(4*rk1), tmp) - PSHUFD(U8(0), tmp, tmp) - } - PXOR(pre, tmp) - } - PSUBL(tmp, cur) // psubd - } - { - MOVD(rk.Offset(4*rk2), tmp) - PSHUFD(U8(0), tmp, tmp) // pshufd - } - PXOR(tmp, cur) -} -func XSR9_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XSR_SSE2(rk, cur, pre, rk1, rk2, 9, 23) -} -func XSR5_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XSR_SSE2(rk, cur, pre, rk1, rk2, 27, 5) -} -func XSR3_SSE2(rk Mem, cur, pre VecVirtual, rk1, rk2 int) { - XSR_SSE2(rk, cur, pre, rk1, rk2, 29, 3) -} diff --git a/lea/amd64_avo/structure.go b/lea/amd64_avo/structure.go deleted file mode 100644 index e5cbaef..0000000 --- a/lea/amd64_avo/structure.go +++ /dev/null @@ -1,33 +0,0 @@ -package main - -import ( - . "github.com/mmcloughlin/avo/build" - . "github.com/mmcloughlin/avo/operand" - . 
"github.com/mmcloughlin/avo/reg" -) - -type leaContext struct { - round uint8 - rk [192]uint32 - ecb bool -} - -type LeaContext struct { - Round Register // U8 - Rk Mem -} - -func GetCtx() LeaContext { - ctx := Dereference(Param("ctx")) - - round := Load(ctx.Field("round"), GP8()) - rk, err := ctx.Field("rk").Index(0).Resolve() - if err != nil { - panic(err) - } - - return LeaContext{ - Round: round, - Rk: rk.Addr, - } -} diff --git a/lea/amd64_c2goasm/.gitignore b/lea/amd64_c2goasm/.gitignore new file mode 100644 index 0000000..b72b9e3 --- /dev/null +++ b/lea/amd64_c2goasm/.gitignore @@ -0,0 +1,2 @@ +*.s +*.o \ No newline at end of file diff --git a/lea/amd64_c2goasm/Makefile b/lea/amd64_c2goasm/Makefile new file mode 100644 index 0000000..1fabbf4 --- /dev/null +++ b/lea/amd64_c2goasm/Makefile @@ -0,0 +1,24 @@ +EXECUTABLES=clang c2goasm asm2plan9s asmfmt as yasm +K := $(foreach exec,$(EXECUTABLES),\ + $(if $(shell which $(exec)),some string,$(error "No $(exec) in PATH"))) + +C2GOASM=c2goasm + +CC=clang +CFLAGS=-O3 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 +CFLAGS_ARCH=-arch x86_64 -masm=intel + +CFLAGS_SSE2=-msse -msse2 +CFLAGS_SSSE3=${CFLAGS_SSE2} -mssse3 +CFLAGS_AVX2=${CFLAGS_SSSE3} -mavx -mavx2 + + +all: sse2 avx2 + +sse2: + ${CC} ${CFLAGS} ${CFLAGS_ARCH} ${CFLAGS_SSE2} -S src/lea_sse2.c -o src/lea_sse2.s + ${C2GOASM} -a -f src/lea_sse2.s lea_amd64_sse2.s + +avx2: + ${CC} ${CFLAGS} ${CFLAGS_ARCH} ${CFLAGS_AVX2} -S src/lea_avx2.c -o src/lea_avx2.s + ${C2GOASM} -a -f src/lea_avx2.s lea_amd64_avx2.s diff --git a/lea/amd64_c2goasm/lea_amd64_avx2.go b/lea/amd64_c2goasm/lea_amd64_avx2.go new file mode 100644 index 0000000..e5a0ecf --- /dev/null +++ b/lea/amd64_c2goasm/lea_amd64_avx2.go @@ -0,0 +1,11 @@ +//go:build amd64 && gc && !purego + +package lea + +import "unsafe" + +//go:noescape +func __lea_encrypt_8block(ct, pt, rk unsafe.Pointer, round uint64) + +//go:noescape +func __lea_decrypt_8block(pt, ct, rk unsafe.Pointer, round uint64) diff --git a/lea/amd64_c2goasm/lea_amd64_sse2.go b/lea/amd64_c2goasm/lea_amd64_sse2.go new file mode 100644 index 0000000..f809fdc --- /dev/null +++ b/lea/amd64_c2goasm/lea_amd64_sse2.go @@ -0,0 +1,11 @@ +//go:build amd64 && gc && !purego + +package lea + +import "unsafe" + +//go:noescape +func __lea_encrypt_4block(ct, pt, rk unsafe.Pointer, round uint64) + +//go:noescape +func __lea_decrypt_4block(pt, ct, rk unsafe.Pointer, round uint64) diff --git a/lea/amd64_c2goasm/src/lea_avx2.c b/lea/amd64_c2goasm/src/lea_avx2.c new file mode 100644 index 0000000..3065032 --- /dev/null +++ b/lea/amd64_c2goasm/src/lea_avx2.c @@ -0,0 +1,361 @@ +#include +#include + +#define XAR3_AVX2(cur, pre, tmp, rk1, rk2) \ + tmp = _mm256_add_epi32(_mm256_xor_si256(pre, _mm256_set1_epi32(rk1)), _mm256_xor_si256(cur, _mm256_set1_epi32(rk2))); \ + cur = _mm256_xor_si256(_mm256_srli_epi32(tmp, 3), _mm256_slli_epi32(tmp, 29)); +#define XAR5_AVX2(cur, pre, tmp, rk1, rk2) \ + tmp = _mm256_add_epi32(_mm256_xor_si256(pre, _mm256_set1_epi32(rk1)), _mm256_xor_si256(cur, _mm256_set1_epi32(rk2))); \ + cur = _mm256_xor_si256(_mm256_srli_epi32(tmp, 5), _mm256_slli_epi32(tmp, 27)); +#define XAR9_AVX2(cur, pre, tmp, rk1, rk2) \ + tmp = _mm256_add_epi32(_mm256_xor_si256(pre, _mm256_set1_epi32(rk1)), _mm256_xor_si256(cur, _mm256_set1_epi32(rk2))); \ + cur = _mm256_xor_si256(_mm256_srli_epi32(tmp, 23), _mm256_slli_epi32(tmp, 9)); + +#define XSR9_AVX2(cur, pre, rk1, rk2) \ + cur = 
_mm256_xor_si256(_mm256_sub_epi32(_mm256_xor_si256(_mm256_srli_epi32(cur, 9), _mm256_slli_epi32(cur, 23)), _mm256_xor_si256(pre, _mm256_set1_epi32(rk1))), \ + _mm256_set1_epi32(rk2)); +#define XSR5_AVX2(cur, pre, rk1, rk2) \ + cur = _mm256_xor_si256(_mm256_sub_epi32(_mm256_xor_si256(_mm256_srli_epi32(cur, 27), _mm256_slli_epi32(cur, 5)), _mm256_xor_si256(pre, _mm256_set1_epi32(rk1))), \ + _mm256_set1_epi32(rk2)); +#define XSR3_AVX2(cur, pre, rk1, rk2) \ + cur = _mm256_xor_si256(_mm256_sub_epi32(_mm256_xor_si256(_mm256_srli_epi32(cur, 29), _mm256_slli_epi32(cur, 3)), _mm256_xor_si256(pre, _mm256_set1_epi32(rk1))), \ + _mm256_set1_epi32(rk2)); + +void lea_encrypt_8block(char *ct, const char *pt, const unsigned int *rk, const unsigned long round) +{ + __m256i x0, x1, x2, x3, tmp; + __m128i tmp128; + + x0 = _mm256_setr_epi32(*((unsigned int *)pt), *((unsigned int *)pt + 0x04), *((unsigned int *)pt + 0x08), *((unsigned int *)pt + 0x0c), + *((unsigned int *)pt + 0x10), *((unsigned int *)pt + 0x14), *((unsigned int *)pt + 0x18), *((unsigned int *)pt + 0x1c)); + x1 = _mm256_setr_epi32(*((unsigned int *)pt + 0x01), *((unsigned int *)pt + 0x05), *((unsigned int *)pt + 0x09), *((unsigned int *)pt + 0x0d), + *((unsigned int *)pt + 0x11), *((unsigned int *)pt + 0x15), *((unsigned int *)pt + 0x19), *((unsigned int *)pt + 0x1d)); + x2 = _mm256_setr_epi32(*((unsigned int *)pt + 0x02), *((unsigned int *)pt + 0x06), *((unsigned int *)pt + 0x0a), *((unsigned int *)pt + 0x0e), + *((unsigned int *)pt + 0x12), *((unsigned int *)pt + 0x16), *((unsigned int *)pt + 0x1a), *((unsigned int *)pt + 0x1e)); + x3 = _mm256_setr_epi32(*((unsigned int *)pt + 0x03), *((unsigned int *)pt + 0x07), *((unsigned int *)pt + 0x0b), *((unsigned int *)pt + 0x0f), + *((unsigned int *)pt + 0x13), *((unsigned int *)pt + 0x17), *((unsigned int *)pt + 0x1b), *((unsigned int *)pt + 0x1f)); + + XAR3_AVX2(x3, x2, tmp, rk[4], rk[5]); + XAR5_AVX2(x2, x1, tmp, rk[2], rk[3]); + XAR9_AVX2(x1, x0, tmp, rk[0], rk[1]); + XAR3_AVX2(x0, x3, tmp, rk[10], rk[11]); + XAR5_AVX2(x3, x2, tmp, rk[8], rk[9]); + XAR9_AVX2(x2, x1, tmp, rk[6], rk[7]); + XAR3_AVX2(x1, x0, tmp, rk[16], rk[17]); + XAR5_AVX2(x0, x3, tmp, rk[14], rk[15]); + XAR9_AVX2(x3, x2, tmp, rk[12], rk[13]); + XAR3_AVX2(x2, x1, tmp, rk[22], rk[23]); + XAR5_AVX2(x1, x0, tmp, rk[20], rk[21]); + XAR9_AVX2(x0, x3, tmp, rk[18], rk[19]); + + XAR3_AVX2(x3, x2, tmp, rk[28], rk[29]); + XAR5_AVX2(x2, x1, tmp, rk[26], rk[27]); + XAR9_AVX2(x1, x0, tmp, rk[24], rk[25]); + XAR3_AVX2(x0, x3, tmp, rk[34], rk[35]); + XAR5_AVX2(x3, x2, tmp, rk[32], rk[33]); + XAR9_AVX2(x2, x1, tmp, rk[30], rk[31]); + XAR3_AVX2(x1, x0, tmp, rk[40], rk[41]); + XAR5_AVX2(x0, x3, tmp, rk[38], rk[39]); + XAR9_AVX2(x3, x2, tmp, rk[36], rk[37]); + XAR3_AVX2(x2, x1, tmp, rk[46], rk[47]); + XAR5_AVX2(x1, x0, tmp, rk[44], rk[45]); + XAR9_AVX2(x0, x3, tmp, rk[42], rk[43]); + + XAR3_AVX2(x3, x2, tmp, rk[52], rk[53]); + XAR5_AVX2(x2, x1, tmp, rk[50], rk[51]); + XAR9_AVX2(x1, x0, tmp, rk[48], rk[49]); + XAR3_AVX2(x0, x3, tmp, rk[58], rk[59]); + XAR5_AVX2(x3, x2, tmp, rk[56], rk[57]); + XAR9_AVX2(x2, x1, tmp, rk[54], rk[55]); + XAR3_AVX2(x1, x0, tmp, rk[64], rk[65]); + XAR5_AVX2(x0, x3, tmp, rk[62], rk[63]); + XAR9_AVX2(x3, x2, tmp, rk[60], rk[61]); + XAR3_AVX2(x2, x1, tmp, rk[70], rk[71]); + XAR5_AVX2(x1, x0, tmp, rk[68], rk[69]); + XAR9_AVX2(x0, x3, tmp, rk[66], rk[67]); + + XAR3_AVX2(x3, x2, tmp, rk[76], rk[77]); + XAR5_AVX2(x2, x1, tmp, rk[74], rk[75]); + XAR9_AVX2(x1, x0, tmp, rk[72], rk[73]); + XAR3_AVX2(x0, x3, tmp, rk[82], rk[83]); + 
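These C routines are compiled by the Makefile above and surfaced to Go through the //go:noescape declarations in lea_amd64_avx2.go. A minimal caller sketch (`encryptBlocks8` is an illustrative wrapper, not part of this diff; it assumes eight contiguous 16-byte blocks in each slice):

```go
import "unsafe"

// encryptBlocks8 feeds eight blocks to the c2goasm-generated routine
// declared in lea_amd64_avx2.go.
func encryptBlocks8(dst, src []byte, rk *[192]uint32, rounds int) {
	_, _ = dst[127], src[127] // bounds check: eight 16-byte blocks each
	__lea_encrypt_8block(
		unsafe.Pointer(&dst[0]),
		unsafe.Pointer(&src[0]),
		unsafe.Pointer(&rk[0]),
		uint64(rounds),
	)
}
```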
XAR5_AVX2(x3, x2, tmp, rk[80], rk[81]); + XAR9_AVX2(x2, x1, tmp, rk[78], rk[79]); + XAR3_AVX2(x1, x0, tmp, rk[88], rk[89]); + XAR5_AVX2(x0, x3, tmp, rk[86], rk[87]); + XAR9_AVX2(x3, x2, tmp, rk[84], rk[85]); + XAR3_AVX2(x2, x1, tmp, rk[94], rk[95]); + XAR5_AVX2(x1, x0, tmp, rk[92], rk[93]); + XAR9_AVX2(x0, x3, tmp, rk[90], rk[91]); + + XAR3_AVX2(x3, x2, tmp, rk[100], rk[101]); + XAR5_AVX2(x2, x1, tmp, rk[98], rk[99]); + XAR9_AVX2(x1, x0, tmp, rk[96], rk[97]); + XAR3_AVX2(x0, x3, tmp, rk[106], rk[107]); + XAR5_AVX2(x3, x2, tmp, rk[104], rk[105]); + XAR9_AVX2(x2, x1, tmp, rk[102], rk[103]); + XAR3_AVX2(x1, x0, tmp, rk[112], rk[113]); + XAR5_AVX2(x0, x3, tmp, rk[110], rk[111]); + XAR9_AVX2(x3, x2, tmp, rk[108], rk[109]); + XAR3_AVX2(x2, x1, tmp, rk[118], rk[119]); + XAR5_AVX2(x1, x0, tmp, rk[116], rk[117]); + XAR9_AVX2(x0, x3, tmp, rk[114], rk[115]); + + XAR3_AVX2(x3, x2, tmp, rk[124], rk[125]); + XAR5_AVX2(x2, x1, tmp, rk[122], rk[123]); + XAR9_AVX2(x1, x0, tmp, rk[120], rk[121]); + XAR3_AVX2(x0, x3, tmp, rk[130], rk[131]); + XAR5_AVX2(x3, x2, tmp, rk[128], rk[129]); + XAR9_AVX2(x2, x1, tmp, rk[126], rk[127]); + XAR3_AVX2(x1, x0, tmp, rk[136], rk[137]); + XAR5_AVX2(x0, x3, tmp, rk[134], rk[135]); + XAR9_AVX2(x3, x2, tmp, rk[132], rk[133]); + XAR3_AVX2(x2, x1, tmp, rk[142], rk[143]); + XAR5_AVX2(x1, x0, tmp, rk[140], rk[141]); + XAR9_AVX2(x0, x3, tmp, rk[138], rk[139]); + + if (round > 24) + { + XAR3_AVX2(x3, x2, tmp, rk[148], rk[149]); + XAR5_AVX2(x2, x1, tmp, rk[146], rk[147]); + XAR9_AVX2(x1, x0, tmp, rk[144], rk[145]); + XAR3_AVX2(x0, x3, tmp, rk[154], rk[155]); + XAR5_AVX2(x3, x2, tmp, rk[152], rk[153]); + XAR9_AVX2(x2, x1, tmp, rk[150], rk[151]); + XAR3_AVX2(x1, x0, tmp, rk[160], rk[161]); + XAR5_AVX2(x0, x3, tmp, rk[158], rk[159]); + XAR9_AVX2(x3, x2, tmp, rk[156], rk[157]); + XAR3_AVX2(x2, x1, tmp, rk[166], rk[167]); + XAR5_AVX2(x1, x0, tmp, rk[164], rk[165]); + XAR9_AVX2(x0, x3, tmp, rk[162], rk[163]); + } + + if (round > 28) + { + XAR3_AVX2(x3, x2, tmp, rk[172], rk[173]); + XAR5_AVX2(x2, x1, tmp, rk[170], rk[171]); + XAR9_AVX2(x1, x0, tmp, rk[168], rk[169]); + XAR3_AVX2(x0, x3, tmp, rk[178], rk[179]); + XAR5_AVX2(x3, x2, tmp, rk[176], rk[177]); + XAR9_AVX2(x2, x1, tmp, rk[174], rk[175]); + XAR3_AVX2(x1, x0, tmp, rk[184], rk[185]); + XAR5_AVX2(x0, x3, tmp, rk[182], rk[183]); + XAR9_AVX2(x3, x2, tmp, rk[180], rk[181]); + XAR3_AVX2(x2, x1, tmp, rk[190], rk[191]); + XAR5_AVX2(x1, x0, tmp, rk[188], rk[189]); + XAR9_AVX2(x0, x3, tmp, rk[186], rk[187]); + } + + tmp128 = _mm256_extractf128_si256(x0, 0); + *((unsigned int *)ct) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x04) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x08) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x0c) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x0, 1); + *((unsigned int *)ct + 0x10) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x14) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x18) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x1c) = _mm_extract_epi32(tmp128, 3); + + tmp128 = _mm256_extractf128_si256(x1, 0); + *((unsigned int *)ct + 0x01) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x05) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x09) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x0d) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x1, 1); + *((unsigned int *)ct + 0x11) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x15) = _mm_extract_epi32(tmp128, 
1); + *((unsigned int *)ct + 0x19) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x1d) = _mm_extract_epi32(tmp128, 3); + + tmp128 = _mm256_extractf128_si256(x2, 0); + *((unsigned int *)ct + 0x02) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x06) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x0a) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x0e) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x2, 1); + *((unsigned int *)ct + 0x12) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x16) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x1a) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x1e) = _mm_extract_epi32(tmp128, 3); + + tmp128 = _mm256_extractf128_si256(x3, 0); + *((unsigned int *)ct + 0x03) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x07) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x0b) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x0f) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x3, 1); + *((unsigned int *)ct + 0x13) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)ct + 0x17) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)ct + 0x1b) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)ct + 0x1f) = _mm_extract_epi32(tmp128, 3); +} + +void lea_decrypt_8block(char *pt, const char *ct, const unsigned int *rk, const unsigned long round) +{ + __m256i x0, x1, x2, x3; + __m128i tmp128; + + x0 = _mm256_setr_epi32(*((unsigned int *)ct), *((unsigned int *)ct + 0x04), *((unsigned int *)ct + 0x08), *((unsigned int *)ct + 0x0c), + *((unsigned int *)ct + 0x10), *((unsigned int *)ct + 0x14), *((unsigned int *)ct + 0x18), *((unsigned int *)ct + 0x1c)); + x1 = _mm256_setr_epi32(*((unsigned int *)ct + 0x01), *((unsigned int *)ct + 0x05), *((unsigned int *)ct + 0x09), *((unsigned int *)ct + 0x0d), + *((unsigned int *)ct + 0x11), *((unsigned int *)ct + 0x15), *((unsigned int *)ct + 0x19), *((unsigned int *)ct + 0x1d)); + x2 = _mm256_setr_epi32(*((unsigned int *)ct + 0x02), *((unsigned int *)ct + 0x06), *((unsigned int *)ct + 0x0a), *((unsigned int *)ct + 0x0e), + *((unsigned int *)ct + 0x12), *((unsigned int *)ct + 0x16), *((unsigned int *)ct + 0x1a), *((unsigned int *)ct + 0x1e)); + x3 = _mm256_setr_epi32(*((unsigned int *)ct + 0x03), *((unsigned int *)ct + 0x07), *((unsigned int *)ct + 0x0b), *((unsigned int *)ct + 0x0f), + *((unsigned int *)ct + 0x13), *((unsigned int *)ct + 0x17), *((unsigned int *)ct + 0x1b), *((unsigned int *)ct + 0x1f)); + + if (round > 28) + { + XSR9_AVX2(x0, x3, rk[186], rk[187]); + XSR5_AVX2(x1, x0, rk[188], rk[189]); + XSR3_AVX2(x2, x1, rk[190], rk[191]); + XSR9_AVX2(x3, x2, rk[180], rk[181]); + XSR5_AVX2(x0, x3, rk[182], rk[183]); + XSR3_AVX2(x1, x0, rk[184], rk[185]); + XSR9_AVX2(x2, x1, rk[174], rk[175]); + XSR5_AVX2(x3, x2, rk[176], rk[177]); + XSR3_AVX2(x0, x3, rk[178], rk[179]); + XSR9_AVX2(x1, x0, rk[168], rk[169]); + XSR5_AVX2(x2, x1, rk[170], rk[171]); + XSR3_AVX2(x3, x2, rk[172], rk[173]); + } + + if (round > 24) + { + XSR9_AVX2(x0, x3, rk[162], rk[163]); + XSR5_AVX2(x1, x0, rk[164], rk[165]); + XSR3_AVX2(x2, x1, rk[166], rk[167]); + XSR9_AVX2(x3, x2, rk[156], rk[157]); + XSR5_AVX2(x0, x3, rk[158], rk[159]); + XSR3_AVX2(x1, x0, rk[160], rk[161]); + XSR9_AVX2(x2, x1, rk[150], rk[151]); + XSR5_AVX2(x3, x2, rk[152], rk[153]); + XSR3_AVX2(x0, x3, rk[154], rk[155]); + XSR9_AVX2(x1, x0, rk[144], rk[145]); + XSR5_AVX2(x2, x1, rk[146], rk[147]); + XSR3_AVX2(x3, x2, rk[148], rk[149]); + } + + XSR9_AVX2(x0, x3, 
rk[138], rk[139]); + XSR5_AVX2(x1, x0, rk[140], rk[141]); + XSR3_AVX2(x2, x1, rk[142], rk[143]); + XSR9_AVX2(x3, x2, rk[132], rk[133]); + XSR5_AVX2(x0, x3, rk[134], rk[135]); + XSR3_AVX2(x1, x0, rk[136], rk[137]); + XSR9_AVX2(x2, x1, rk[126], rk[127]); + XSR5_AVX2(x3, x2, rk[128], rk[129]); + XSR3_AVX2(x0, x3, rk[130], rk[131]); + XSR9_AVX2(x1, x0, rk[120], rk[121]); + XSR5_AVX2(x2, x1, rk[122], rk[123]); + XSR3_AVX2(x3, x2, rk[124], rk[125]); + + XSR9_AVX2(x0, x3, rk[114], rk[115]); + XSR5_AVX2(x1, x0, rk[116], rk[117]); + XSR3_AVX2(x2, x1, rk[118], rk[119]); + XSR9_AVX2(x3, x2, rk[108], rk[109]); + XSR5_AVX2(x0, x3, rk[110], rk[111]); + XSR3_AVX2(x1, x0, rk[112], rk[113]); + XSR9_AVX2(x2, x1, rk[102], rk[103]); + XSR5_AVX2(x3, x2, rk[104], rk[105]); + XSR3_AVX2(x0, x3, rk[106], rk[107]); + XSR9_AVX2(x1, x0, rk[96], rk[97]); + XSR5_AVX2(x2, x1, rk[98], rk[99]); + XSR3_AVX2(x3, x2, rk[100], rk[101]); + + XSR9_AVX2(x0, x3, rk[90], rk[91]); + XSR5_AVX2(x1, x0, rk[92], rk[93]); + XSR3_AVX2(x2, x1, rk[94], rk[95]); + XSR9_AVX2(x3, x2, rk[84], rk[85]); + XSR5_AVX2(x0, x3, rk[86], rk[87]); + XSR3_AVX2(x1, x0, rk[88], rk[89]); + XSR9_AVX2(x2, x1, rk[78], rk[79]); + XSR5_AVX2(x3, x2, rk[80], rk[81]); + XSR3_AVX2(x0, x3, rk[82], rk[83]); + XSR9_AVX2(x1, x0, rk[72], rk[73]); + XSR5_AVX2(x2, x1, rk[74], rk[75]); + XSR3_AVX2(x3, x2, rk[76], rk[77]); + + XSR9_AVX2(x0, x3, rk[66], rk[67]); + XSR5_AVX2(x1, x0, rk[68], rk[69]); + XSR3_AVX2(x2, x1, rk[70], rk[71]); + XSR9_AVX2(x3, x2, rk[60], rk[61]); + XSR5_AVX2(x0, x3, rk[62], rk[63]); + XSR3_AVX2(x1, x0, rk[64], rk[65]); + XSR9_AVX2(x2, x1, rk[54], rk[55]); + XSR5_AVX2(x3, x2, rk[56], rk[57]); + XSR3_AVX2(x0, x3, rk[58], rk[59]); + XSR9_AVX2(x1, x0, rk[48], rk[49]); + XSR5_AVX2(x2, x1, rk[50], rk[51]); + XSR3_AVX2(x3, x2, rk[52], rk[53]); + + XSR9_AVX2(x0, x3, rk[42], rk[43]); + XSR5_AVX2(x1, x0, rk[44], rk[45]); + XSR3_AVX2(x2, x1, rk[46], rk[47]); + XSR9_AVX2(x3, x2, rk[36], rk[37]); + XSR5_AVX2(x0, x3, rk[38], rk[39]); + XSR3_AVX2(x1, x0, rk[40], rk[41]); + XSR9_AVX2(x2, x1, rk[30], rk[31]); + XSR5_AVX2(x3, x2, rk[32], rk[33]); + XSR3_AVX2(x0, x3, rk[34], rk[35]); + XSR9_AVX2(x1, x0, rk[24], rk[25]); + XSR5_AVX2(x2, x1, rk[26], rk[27]); + XSR3_AVX2(x3, x2, rk[28], rk[29]); + + XSR9_AVX2(x0, x3, rk[18], rk[19]); + XSR5_AVX2(x1, x0, rk[20], rk[21]); + XSR3_AVX2(x2, x1, rk[22], rk[23]); + XSR9_AVX2(x3, x2, rk[12], rk[13]); + XSR5_AVX2(x0, x3, rk[14], rk[15]); + XSR3_AVX2(x1, x0, rk[16], rk[17]); + XSR9_AVX2(x2, x1, rk[6], rk[7]); + XSR5_AVX2(x3, x2, rk[8], rk[9]); + XSR3_AVX2(x0, x3, rk[10], rk[11]); + XSR9_AVX2(x1, x0, rk[0], rk[1]); + XSR5_AVX2(x2, x1, rk[2], rk[3]); + XSR3_AVX2(x3, x2, rk[4], rk[5]); + + + tmp128 = _mm256_extractf128_si256(x0, 0); + *((unsigned int *)pt) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x04) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x08) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x0c) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x0, 1); + *((unsigned int *)pt + 0x10) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x14) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x18) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x1c) = _mm_extract_epi32(tmp128, 3); + + tmp128 = _mm256_extractf128_si256(x1, 0); + *((unsigned int *)pt + 0x01) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x05) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x09) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x0d) = 
_mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x1, 1); + *((unsigned int *)pt + 0x11) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x15) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x19) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x1d) = _mm_extract_epi32(tmp128, 3); + + tmp128 = _mm256_extractf128_si256(x2, 0); + *((unsigned int *)pt + 0x02) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x06) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x0a) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x0e) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x2, 1); + *((unsigned int *)pt + 0x12) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x16) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x1a) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x1e) = _mm_extract_epi32(tmp128, 3); + + tmp128 = _mm256_extractf128_si256(x3, 0); + *((unsigned int *)pt + 0x03) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x07) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x0b) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x0f) = _mm_extract_epi32(tmp128, 3); + tmp128 = _mm256_extractf128_si256(x3, 1); + *((unsigned int *)pt + 0x13) = _mm_extract_epi32(tmp128, 0); + *((unsigned int *)pt + 0x17) = _mm_extract_epi32(tmp128, 1); + *((unsigned int *)pt + 0x1b) = _mm_extract_epi32(tmp128, 2); + *((unsigned int *)pt + 0x1f) = _mm_extract_epi32(tmp128, 3); +} diff --git a/lea/amd64_c2goasm/src/lea_sse2.c b/lea/amd64_c2goasm/src/lea_sse2.c new file mode 100644 index 0000000..b8a9bba --- /dev/null +++ b/lea/amd64_c2goasm/src/lea_sse2.c @@ -0,0 +1,311 @@ +#include <emmintrin.h> + + +#define XAR3_SSE2(cur, pre, tmp, rk1, rk2) \ + tmp = _mm_add_epi32(_mm_xor_si128(pre, _mm_set1_epi32(rk1)), _mm_xor_si128(cur, _mm_set1_epi32(rk2))); \ + cur = _mm_xor_si128(_mm_srli_epi32(tmp, 3), _mm_slli_epi32(tmp, 29)); +#define XAR5_SSE2(cur, pre, tmp, rk1, rk2) \ + tmp = _mm_add_epi32(_mm_xor_si128(pre, _mm_set1_epi32(rk1)), _mm_xor_si128(cur, _mm_set1_epi32(rk2))); \ + cur = _mm_xor_si128(_mm_srli_epi32(tmp, 5), _mm_slli_epi32(tmp, 27)); +#define XAR9_SSE2(cur, pre, tmp, rk1, rk2) \ + tmp = _mm_add_epi32(_mm_xor_si128(pre, _mm_set1_epi32(rk1)), _mm_xor_si128(cur, _mm_set1_epi32(rk2))); \ + cur = _mm_xor_si128(_mm_srli_epi32(tmp, 23), _mm_slli_epi32(tmp, 9)); + +#define XSR9_SSE2(cur, pre, rk1, rk2) \ + cur = _mm_xor_si128(_mm_sub_epi32(_mm_xor_si128(_mm_srli_epi32(cur, 9), _mm_slli_epi32(cur, 23)), _mm_xor_si128(pre, _mm_set1_epi32(rk1))), _mm_set1_epi32(rk2)); +#define XSR5_SSE2(cur, pre, rk1, rk2) \ + cur = _mm_xor_si128(_mm_sub_epi32(_mm_xor_si128(_mm_srli_epi32(cur, 27), _mm_slli_epi32(cur, 5)), _mm_xor_si128(pre, _mm_set1_epi32(rk1))), _mm_set1_epi32(rk2)); +#define XSR3_SSE2(cur, pre, rk1, rk2) \ + cur = _mm_xor_si128(_mm_sub_epi32(_mm_xor_si128(_mm_srli_epi32(cur, 29), _mm_slli_epi32(cur, 3)), _mm_xor_si128(pre, _mm_set1_epi32(rk1))), _mm_set1_epi32(rk2)); + +void lea_encrypt_4block(char *ct, const char *pt, const unsigned int *rk, const unsigned long round) +{ + __m128i x0, x1, x2, x3, tmp; + __m128i tmp0, tmp1, tmp2, tmp3; + + x0 = _mm_loadu_si128((__m128i *)(pt )); + x1 = _mm_loadu_si128((__m128i *)(pt + 16)); + x2 = _mm_loadu_si128((__m128i *)(pt + 32)); + x3 = _mm_loadu_si128((__m128i *)(pt + 48)); + + tmp0 = _mm_unpacklo_epi32(x0, x1); + tmp1 = _mm_unpacklo_epi32(x2, x3); + tmp2 = _mm_unpackhi_epi32(x0, x1); + tmp3 = _mm_unpackhi_epi32(x2, x3); + + x0 = 
_mm_unpacklo_epi64(tmp0, tmp1); + x1 = _mm_unpackhi_epi64(tmp0, tmp1); + x2 = _mm_unpacklo_epi64(tmp2, tmp3); + x3 = _mm_unpackhi_epi64(tmp2, tmp3); + + XAR3_SSE2(x3, x2, tmp, rk[ 4], rk[ 5]); + XAR5_SSE2(x2, x1, tmp, rk[ 2], rk[ 3]); + XAR9_SSE2(x1, x0, tmp, rk[ 0], rk[ 1]); + XAR3_SSE2(x0, x3, tmp, rk[ 10], rk[ 11]); + XAR5_SSE2(x3, x2, tmp, rk[ 8], rk[ 9]); + XAR9_SSE2(x2, x1, tmp, rk[ 6], rk[ 7]); + XAR3_SSE2(x1, x0, tmp, rk[ 16], rk[ 17]); + XAR5_SSE2(x0, x3, tmp, rk[ 14], rk[ 15]); + XAR9_SSE2(x3, x2, tmp, rk[ 12], rk[ 13]); + XAR3_SSE2(x2, x1, tmp, rk[ 22], rk[ 23]); + XAR5_SSE2(x1, x0, tmp, rk[ 20], rk[ 21]); + XAR9_SSE2(x0, x3, tmp, rk[ 18], rk[ 19]); + + XAR3_SSE2(x3, x2, tmp, rk[ 28], rk[ 29]); + XAR5_SSE2(x2, x1, tmp, rk[ 26], rk[ 27]); + XAR9_SSE2(x1, x0, tmp, rk[ 24], rk[ 25]); + XAR3_SSE2(x0, x3, tmp, rk[ 34], rk[ 35]); + XAR5_SSE2(x3, x2, tmp, rk[ 32], rk[ 33]); + XAR9_SSE2(x2, x1, tmp, rk[ 30], rk[ 31]); + XAR3_SSE2(x1, x0, tmp, rk[ 40], rk[ 41]); + XAR5_SSE2(x0, x3, tmp, rk[ 38], rk[ 39]); + XAR9_SSE2(x3, x2, tmp, rk[ 36], rk[ 37]); + XAR3_SSE2(x2, x1, tmp, rk[ 46], rk[ 47]); + XAR5_SSE2(x1, x0, tmp, rk[ 44], rk[ 45]); + XAR9_SSE2(x0, x3, tmp, rk[ 42], rk[ 43]); + + XAR3_SSE2(x3, x2, tmp, rk[ 52], rk[ 53]); + XAR5_SSE2(x2, x1, tmp, rk[ 50], rk[ 51]); + XAR9_SSE2(x1, x0, tmp, rk[ 48], rk[ 49]); + XAR3_SSE2(x0, x3, tmp, rk[ 58], rk[ 59]); + XAR5_SSE2(x3, x2, tmp, rk[ 56], rk[ 57]); + XAR9_SSE2(x2, x1, tmp, rk[ 54], rk[ 55]); + XAR3_SSE2(x1, x0, tmp, rk[ 64], rk[ 65]); + XAR5_SSE2(x0, x3, tmp, rk[ 62], rk[ 63]); + XAR9_SSE2(x3, x2, tmp, rk[ 60], rk[ 61]); + XAR3_SSE2(x2, x1, tmp, rk[ 70], rk[ 71]); + XAR5_SSE2(x1, x0, tmp, rk[ 68], rk[ 69]); + XAR9_SSE2(x0, x3, tmp, rk[ 66], rk[ 67]); + + XAR3_SSE2(x3, x2, tmp, rk[ 76], rk[ 77]); + XAR5_SSE2(x2, x1, tmp, rk[ 74], rk[ 75]); + XAR9_SSE2(x1, x0, tmp, rk[ 72], rk[ 73]); + XAR3_SSE2(x0, x3, tmp, rk[ 82], rk[ 83]); + XAR5_SSE2(x3, x2, tmp, rk[ 80], rk[ 81]); + XAR9_SSE2(x2, x1, tmp, rk[ 78], rk[ 79]); + XAR3_SSE2(x1, x0, tmp, rk[ 88], rk[ 89]); + XAR5_SSE2(x0, x3, tmp, rk[ 86], rk[ 87]); + XAR9_SSE2(x3, x2, tmp, rk[ 84], rk[ 85]); + XAR3_SSE2(x2, x1, tmp, rk[ 94], rk[ 95]); + XAR5_SSE2(x1, x0, tmp, rk[ 92], rk[ 93]); + XAR9_SSE2(x0, x3, tmp, rk[ 90], rk[ 91]); + + XAR3_SSE2(x3, x2, tmp, rk[100], rk[101]); + XAR5_SSE2(x2, x1, tmp, rk[ 98], rk[ 99]); + XAR9_SSE2(x1, x0, tmp, rk[ 96], rk[ 97]); + XAR3_SSE2(x0, x3, tmp, rk[106], rk[107]); + XAR5_SSE2(x3, x2, tmp, rk[104], rk[105]); + XAR9_SSE2(x2, x1, tmp, rk[102], rk[103]); + XAR3_SSE2(x1, x0, tmp, rk[112], rk[113]); + XAR5_SSE2(x0, x3, tmp, rk[110], rk[111]); + XAR9_SSE2(x3, x2, tmp, rk[108], rk[109]); + XAR3_SSE2(x2, x1, tmp, rk[118], rk[119]); + XAR5_SSE2(x1, x0, tmp, rk[116], rk[117]); + XAR9_SSE2(x0, x3, tmp, rk[114], rk[115]); + + XAR3_SSE2(x3, x2, tmp, rk[124], rk[125]); + XAR5_SSE2(x2, x1, tmp, rk[122], rk[123]); + XAR9_SSE2(x1, x0, tmp, rk[120], rk[121]); + XAR3_SSE2(x0, x3, tmp, rk[130], rk[131]); + XAR5_SSE2(x3, x2, tmp, rk[128], rk[129]); + XAR9_SSE2(x2, x1, tmp, rk[126], rk[127]); + XAR3_SSE2(x1, x0, tmp, rk[136], rk[137]); + XAR5_SSE2(x0, x3, tmp, rk[134], rk[135]); + XAR9_SSE2(x3, x2, tmp, rk[132], rk[133]); + XAR3_SSE2(x2, x1, tmp, rk[142], rk[143]); + XAR5_SSE2(x1, x0, tmp, rk[140], rk[141]); + XAR9_SSE2(x0, x3, tmp, rk[138], rk[139]); + + if(round > 24) + { + XAR3_SSE2(x3, x2, tmp, rk[148], rk[149]); + XAR5_SSE2(x2, x1, tmp, rk[146], rk[147]); + XAR9_SSE2(x1, x0, tmp, rk[144], rk[145]); + XAR3_SSE2(x0, x3, tmp, rk[154], rk[155]); + XAR5_SSE2(x3, x2, tmp, rk[152], 
rk[153]); + XAR9_SSE2(x2, x1, tmp, rk[150], rk[151]); + XAR3_SSE2(x1, x0, tmp, rk[160], rk[161]); + XAR5_SSE2(x0, x3, tmp, rk[158], rk[159]); + XAR9_SSE2(x3, x2, tmp, rk[156], rk[157]); + XAR3_SSE2(x2, x1, tmp, rk[166], rk[167]); + XAR5_SSE2(x1, x0, tmp, rk[164], rk[165]); + XAR9_SSE2(x0, x3, tmp, rk[162], rk[163]); + } + + if(round > 28) + { + XAR3_SSE2(x3, x2, tmp, rk[172], rk[173]); + XAR5_SSE2(x2, x1, tmp, rk[170], rk[171]); + XAR9_SSE2(x1, x0, tmp, rk[168], rk[169]); + XAR3_SSE2(x0, x3, tmp, rk[178], rk[179]); + XAR5_SSE2(x3, x2, tmp, rk[176], rk[177]); + XAR9_SSE2(x2, x1, tmp, rk[174], rk[175]); + XAR3_SSE2(x1, x0, tmp, rk[184], rk[185]); + XAR5_SSE2(x0, x3, tmp, rk[182], rk[183]); + XAR9_SSE2(x3, x2, tmp, rk[180], rk[181]); + XAR3_SSE2(x2, x1, tmp, rk[190], rk[191]); + XAR5_SSE2(x1, x0, tmp, rk[188], rk[189]); + XAR9_SSE2(x0, x3, tmp, rk[186], rk[187]); + } + + tmp0 = _mm_unpacklo_epi32(x0, x1); + tmp1 = _mm_unpacklo_epi32(x2, x3); + tmp2 = _mm_unpackhi_epi32(x0, x1); + tmp3 = _mm_unpackhi_epi32(x2, x3); + + x0 = _mm_unpacklo_epi64(tmp0, tmp1); + x1 = _mm_unpackhi_epi64(tmp0, tmp1); + x2 = _mm_unpacklo_epi64(tmp2, tmp3); + x3 = _mm_unpackhi_epi64(tmp2, tmp3); + + _mm_storeu_si128((__m128i *)(ct ), x0); + _mm_storeu_si128((__m128i *)(ct + 16), x1); + _mm_storeu_si128((__m128i *)(ct + 32), x2); + _mm_storeu_si128((__m128i *)(ct + 48), x3); +} + +void lea_decrypt_4block(char *pt, const char *ct, const unsigned int *rk, const unsigned long round) +{ + __m128i x0, x1, x2, x3; + __m128i tmp0, tmp1, tmp2, tmp3; + + x0 = _mm_loadu_si128((__m128i *)(ct )); + x1 = _mm_loadu_si128((__m128i *)(ct + 16)); + x2 = _mm_loadu_si128((__m128i *)(ct + 32)); + x3 = _mm_loadu_si128((__m128i *)(ct + 48)); + + tmp0 = _mm_unpacklo_epi32(x0, x1); + tmp1 = _mm_unpacklo_epi32(x2, x3); + tmp2 = _mm_unpackhi_epi32(x0, x1); + tmp3 = _mm_unpackhi_epi32(x2, x3); + + x0 = _mm_unpacklo_epi64(tmp0, tmp1); + x1 = _mm_unpackhi_epi64(tmp0, tmp1); + x2 = _mm_unpacklo_epi64(tmp2, tmp3); + x3 = _mm_unpackhi_epi64(tmp2, tmp3); + + if(round > 28) + { + XSR9_SSE2(x0, x3, rk[186], rk[187]); + XSR5_SSE2(x1, x0, rk[188], rk[189]); + XSR3_SSE2(x2, x1, rk[190], rk[191]); + XSR9_SSE2(x3, x2, rk[180], rk[181]); + XSR5_SSE2(x0, x3, rk[182], rk[183]); + XSR3_SSE2(x1, x0, rk[184], rk[185]); + XSR9_SSE2(x2, x1, rk[174], rk[175]); + XSR5_SSE2(x3, x2, rk[176], rk[177]); + XSR3_SSE2(x0, x3, rk[178], rk[179]); + XSR9_SSE2(x1, x0, rk[168], rk[169]); + XSR5_SSE2(x2, x1, rk[170], rk[171]); + XSR3_SSE2(x3, x2, rk[172], rk[173]); + } + + if(round > 24) + { + XSR9_SSE2(x0, x3, rk[162], rk[163]); + XSR5_SSE2(x1, x0, rk[164], rk[165]); + XSR3_SSE2(x2, x1, rk[166], rk[167]); + XSR9_SSE2(x3, x2, rk[156], rk[157]); + XSR5_SSE2(x0, x3, rk[158], rk[159]); + XSR3_SSE2(x1, x0, rk[160], rk[161]); + XSR9_SSE2(x2, x1, rk[150], rk[151]); + XSR5_SSE2(x3, x2, rk[152], rk[153]); + XSR3_SSE2(x0, x3, rk[154], rk[155]); + XSR9_SSE2(x1, x0, rk[144], rk[145]); + XSR5_SSE2(x2, x1, rk[146], rk[147]); + XSR3_SSE2(x3, x2, rk[148], rk[149]); + } + + XSR9_SSE2(x0, x3, rk[138], rk[139]); + XSR5_SSE2(x1, x0, rk[140], rk[141]); + XSR3_SSE2(x2, x1, rk[142], rk[143]); + XSR9_SSE2(x3, x2, rk[132], rk[133]); + XSR5_SSE2(x0, x3, rk[134], rk[135]); + XSR3_SSE2(x1, x0, rk[136], rk[137]); + XSR9_SSE2(x2, x1, rk[126], rk[127]); + XSR5_SSE2(x3, x2, rk[128], rk[129]); + XSR3_SSE2(x0, x3, rk[130], rk[131]); + XSR9_SSE2(x1, x0, rk[120], rk[121]); + XSR5_SSE2(x2, x1, rk[122], rk[123]); + XSR3_SSE2(x3, x2, rk[124], rk[125]); + + XSR9_SSE2(x0, x3, rk[114], rk[115]); + XSR5_SSE2(x1, x0, 
rk[116], rk[117]); + XSR3_SSE2(x2, x1, rk[118], rk[119]); + XSR9_SSE2(x3, x2, rk[108], rk[109]); + XSR5_SSE2(x0, x3, rk[110], rk[111]); + XSR3_SSE2(x1, x0, rk[112], rk[113]); + XSR9_SSE2(x2, x1, rk[102], rk[103]); + XSR5_SSE2(x3, x2, rk[104], rk[105]); + XSR3_SSE2(x0, x3, rk[106], rk[107]); + XSR9_SSE2(x1, x0, rk[ 96], rk[ 97]); + XSR5_SSE2(x2, x1, rk[ 98], rk[ 99]); + XSR3_SSE2(x3, x2, rk[100], rk[101]); + + XSR9_SSE2(x0, x3, rk[ 90], rk[ 91]); + XSR5_SSE2(x1, x0, rk[ 92], rk[ 93]); + XSR3_SSE2(x2, x1, rk[ 94], rk[ 95]); + XSR9_SSE2(x3, x2, rk[ 84], rk[ 85]); + XSR5_SSE2(x0, x3, rk[ 86], rk[ 87]); + XSR3_SSE2(x1, x0, rk[ 88], rk[ 89]); + XSR9_SSE2(x2, x1, rk[ 78], rk[ 79]); + XSR5_SSE2(x3, x2, rk[ 80], rk[ 81]); + XSR3_SSE2(x0, x3, rk[ 82], rk[ 83]); + XSR9_SSE2(x1, x0, rk[ 72], rk[ 73]); + XSR5_SSE2(x2, x1, rk[ 74], rk[ 75]); + XSR3_SSE2(x3, x2, rk[ 76], rk[ 77]); + + XSR9_SSE2(x0, x3, rk[ 66], rk[ 67]); + XSR5_SSE2(x1, x0, rk[ 68], rk[ 69]); + XSR3_SSE2(x2, x1, rk[ 70], rk[ 71]); + XSR9_SSE2(x3, x2, rk[ 60], rk[ 61]); + XSR5_SSE2(x0, x3, rk[ 62], rk[ 63]); + XSR3_SSE2(x1, x0, rk[ 64], rk[ 65]); + XSR9_SSE2(x2, x1, rk[ 54], rk[ 55]); + XSR5_SSE2(x3, x2, rk[ 56], rk[ 57]); + XSR3_SSE2(x0, x3, rk[ 58], rk[ 59]); + XSR9_SSE2(x1, x0, rk[ 48], rk[ 49]); + XSR5_SSE2(x2, x1, rk[ 50], rk[ 51]); + XSR3_SSE2(x3, x2, rk[ 52], rk[ 53]); + + XSR9_SSE2(x0, x3, rk[ 42], rk[ 43]); + XSR5_SSE2(x1, x0, rk[ 44], rk[ 45]); + XSR3_SSE2(x2, x1, rk[ 46], rk[ 47]); + XSR9_SSE2(x3, x2, rk[ 36], rk[ 37]); + XSR5_SSE2(x0, x3, rk[ 38], rk[ 39]); + XSR3_SSE2(x1, x0, rk[ 40], rk[ 41]); + XSR9_SSE2(x2, x1, rk[ 30], rk[ 31]); + XSR5_SSE2(x3, x2, rk[ 32], rk[ 33]); + XSR3_SSE2(x0, x3, rk[ 34], rk[ 35]); + XSR9_SSE2(x1, x0, rk[ 24], rk[ 25]); + XSR5_SSE2(x2, x1, rk[ 26], rk[ 27]); + XSR3_SSE2(x3, x2, rk[ 28], rk[ 29]); + + XSR9_SSE2(x0, x3, rk[ 18], rk[ 19]); + XSR5_SSE2(x1, x0, rk[ 20], rk[ 21]); + XSR3_SSE2(x2, x1, rk[ 22], rk[ 23]); + XSR9_SSE2(x3, x2, rk[ 12], rk[ 13]); + XSR5_SSE2(x0, x3, rk[ 14], rk[ 15]); + XSR3_SSE2(x1, x0, rk[ 16], rk[ 17]); + XSR9_SSE2(x2, x1, rk[ 6], rk[ 7]); + XSR5_SSE2(x3, x2, rk[ 8], rk[ 9]); + XSR3_SSE2(x0, x3, rk[ 10], rk[ 11]); + XSR9_SSE2(x1, x0, rk[ 0], rk[ 1]); + XSR5_SSE2(x2, x1, rk[ 2], rk[ 3]); + XSR3_SSE2(x3, x2, rk[ 4], rk[ 5]); + + tmp0 = _mm_unpacklo_epi32(x0, x1); + tmp1 = _mm_unpacklo_epi32(x2, x3); + tmp2 = _mm_unpackhi_epi32(x0, x1); + tmp3 = _mm_unpackhi_epi32(x2, x3); + + x0 = _mm_unpacklo_epi64(tmp0, tmp1); + x1 = _mm_unpackhi_epi64(tmp0, tmp1); + x2 = _mm_unpacklo_epi64(tmp2, tmp3); + x3 = _mm_unpackhi_epi64(tmp2, tmp3); + + _mm_storeu_si128((__m128i *)(pt ), x0); + _mm_storeu_si128((__m128i *)(pt + 16), x1); + _mm_storeu_si128((__m128i *)(pt + 32), x2); + _mm_storeu_si128((__m128i *)(pt + 48), x3); +} diff --git a/lea/arm64_goat/Makefile b/lea/arm64_goat/Makefile new file mode 100644 index 0000000..9bd6c3f --- /dev/null +++ b/lea/arm64_goat/Makefile @@ -0,0 +1,15 @@ +EXECUTABLES=goat clang ar objdump +K := $(foreach exec,$(EXECUTABLES),\ + $(if $(shell which $(exec)),some string,$(error "No $(exec) in PATH"))) + +GOAT=goat +GOAT_FLAGS=-O3 + +export PATH := /opt/homebrew/opt/binutils/bin/:/opt/homebrew/opt/llvm/bin:$(PATH) +export LDFLAGS := -L/opt/homebrew/opt/llvm/lib +export CPPFLAGS := -I/opt/homebrew/opt/llvm/include + +all: arm64 + +arm64: + ${GOAT} ${GOAT_FLAGS} src/lea_arm64_neon.c diff --git a/lea/arm64_goat/build.sh b/lea/arm64_goat/build.sh deleted file mode 100644 index 1fd042f..0000000 --- a/lea/arm64_goat/build.sh +++ /dev/null @@ -1 +0,0 @@ -goat 
src/lea_arm64.c -O3 diff --git a/lea/arm64_goat/src/lea_arm64.c b/lea/arm64_goat/src/lea_arm64_neon.c similarity index 100% rename from lea/arm64_goat/src/lea_arm64.c rename to lea/arm64_goat/src/lea_arm64_neon.c diff --git a/lea/lea.go b/lea/lea.go index cc58325..1d6a570 100644 --- a/lea/lea.go +++ b/lea/lea.go @@ -7,7 +7,6 @@ import ( ) type funcNew func(key []byte) (cipher.Block, error) -type funcBlock func(ctx *leaContext, dst, src []byte) type leaContext struct { round uint8 @@ -16,14 +15,6 @@ type leaContext struct { } var ( - leaEnc1 funcBlock = leaEnc1Go - leaEnc4 funcBlock = leaEnc4Go - leaEnc8 funcBlock = leaEnc8Go - - leaDec1 funcBlock = leaDec1Go - leaDec4 funcBlock = leaDec4Go - leaDec8 funcBlock = leaDec8Go - leaNew funcNew = newCipherGo leaNewECB funcNew = newCipherECBGo ) diff --git a/lea/lea_amd64.go b/lea/lea_amd64.go index 361c1be..5912dee 100644 --- a/lea/lea_amd64.go +++ b/lea/lea_amd64.go @@ -3,13 +3,13 @@ package lea import ( + "unsafe" + "golang.org/x/sys/cpu" ) var ( hasAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasAVX - - useAVX2 = false ) func init() { @@ -19,12 +19,46 @@ func init() { leaEnc8 = leaEnc8SSE2 leaDec8 = leaDec8SSE2 - if hasAVX2 && useAVX2 { + if hasAVX2 { leaEnc8 = leaEnc8AVX2 leaDec8 = leaDec8AVX2 } } +func leaEnc4SSE2(ctx *leaContext, dst, src []byte) { + __lea_encrypt_4block( + unsafe.Pointer(&dst[0]), + unsafe.Pointer(&src[0]), + unsafe.Pointer(&ctx.rk[0]), + uint64(ctx.round), + ) +} +func leaDec4SSE2(ctx *leaContext, dst, src []byte) { + __lea_decrypt_4block( + unsafe.Pointer(&dst[0]), + unsafe.Pointer(&src[0]), + unsafe.Pointer(&ctx.rk[0]), + uint64(ctx.round), + ) +} + +func leaEnc8AVX2(ctx *leaContext, dst, src []byte) { + __lea_encrypt_8block( + unsafe.Pointer(&dst[0]), + unsafe.Pointer(&src[0]), + unsafe.Pointer(&ctx.rk[0]), + uint64(ctx.round), + ) +} +func leaDec8AVX2(ctx *leaContext, dst, src []byte) { + __lea_decrypt_8block( + unsafe.Pointer(&dst[0]), + unsafe.Pointer(&src[0]), + unsafe.Pointer(&ctx.rk[0]), + uint64(ctx.round), + ) +} + func leaEnc8SSE2(ctx *leaContext, dst, src []byte) { leaEnc4SSE2(ctx, dst[0x00:], src[0x00:]) leaEnc4SSE2(ctx, dst[0x40:], src[0x40:]) diff --git a/lea/lea_amd64.s b/lea/lea_amd64.s deleted file mode 100644 index 830e951..0000000 --- a/lea/lea_amd64.s +++ /dev/null @@ -1,3929 +0,0 @@ -// Code generated by command: go run lea.go -out ../lea_amd64.s -stubs ../lea_amd64_stubs.go -pkg lea. DO NOT EDIT. 
- -//go:build amd64 && gc && !purego - -#include "textflag.h" - -// func leaEnc4SSE2(ctx *leaContext, dst []byte, src []byte) -// Requires: SSE, SSE2 -TEXT ·leaEnc4SSE2(SB), NOSPLIT, $0-56 - MOVQ ctx+0(FP), AX - MOVB (AX), CL - MOVQ dst_base+8(FP), DX - MOVQ src_base+32(FP), BX - MOVUPS (BX), X0 - MOVUPS 16(BX), X1 - MOVUPS 32(BX), X2 - MOVUPS 48(BX), X3 - MOVAPS X0, X4 - UNPCKLPS X1, X4 - MOVAPS X2, X5 - UNPCKLPS X3, X5 - UNPCKHPS X1, X0 - UNPCKHPS X3, X2 - MOVAPS X4, X1 - MOVLHPS X5, X1 - UNPCKHPD X5, X4 - MOVAPS X0, X3 - MOVLHPS X2, X3 - UNPCKHPD X2, X0 - MOVD 20(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 24(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 12(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 16(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 4(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 8(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 44(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 48(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 36(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 40(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 28(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 32(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 68(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 72(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 60(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 64(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 52(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 56(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 92(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 96(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 84(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 88(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 76(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 80(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - MOVD 116(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 120(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 108(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 112(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 100(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 104(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 140(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 144(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL 
$0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 132(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 136(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 124(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 128(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 164(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 168(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 156(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 160(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 148(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 152(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 188(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 192(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 180(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 184(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 172(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 176(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - MOVD 212(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 216(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 204(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 208(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 196(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 200(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 236(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 240(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 228(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 232(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 220(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 224(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 260(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 264(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 252(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 256(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 244(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 248(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 284(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 288(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 276(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 280(AX), X5 
- PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 268(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 272(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - MOVD 308(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 312(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 300(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 304(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 292(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 296(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 332(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 336(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 324(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 328(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 316(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 320(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 356(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 360(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 348(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 352(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 340(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 344(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 380(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 384(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 372(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 376(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 364(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 368(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - MOVD 404(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 408(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 396(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 400(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 388(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 392(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 428(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 432(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 420(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 424(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 
- MOVD 412(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 416(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 452(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 456(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 444(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 448(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 436(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 440(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 476(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 480(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 468(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 472(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 460(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 464(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - MOVD 500(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 504(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 492(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 496(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 484(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 488(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 524(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 528(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 516(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 520(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 508(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 512(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 548(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 552(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 540(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 544(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 532(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 536(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 572(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 576(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 564(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 568(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 556(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 560(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - 
PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - CMPB CL, $0x18 - JBE OVER24_END - MOVD 596(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 600(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 588(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 592(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 580(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 584(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 620(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 624(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 612(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 616(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 604(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 608(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 644(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 648(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 636(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 640(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 628(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 632(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 668(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 672(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 660(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 664(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 652(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 656(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - -OVER24_END: - CMPB CL, $0x1c - JBE OVER28_END - MOVD 692(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 696(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x03, X0 - PSLLL $0x1d, X2 - PXOR X2, X0 - MOVD 684(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 688(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x05, X3 - PSLLL $0x1b, X2 - PXOR X2, X3 - MOVD 676(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 680(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x17, X4 - PSLLL $0x09, X2 - PXOR X2, X4 - MOVD 716(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 720(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x03, X1 - PSLLL $0x1d, X2 - PXOR X2, X1 - MOVD 708(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 712(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x05, X0 - PSLLL $0x1b, X2 - PXOR X2, X0 - MOVD 700(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 704(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - 
PSRLL $0x17, X3 - PSLLL $0x09, X2 - PXOR X2, X3 - MOVD 740(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 744(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x03, X4 - PSLLL $0x1d, X2 - PXOR X2, X4 - MOVD 732(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 736(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x05, X1 - PSLLL $0x1b, X2 - PXOR X2, X1 - MOVD 724(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - MOVD 728(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X0, X5 - PADDL X5, X2 - MOVO X2, X0 - PSRLL $0x17, X0 - PSLLL $0x09, X2 - PXOR X2, X0 - MOVD 764(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - MOVD 768(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X3, X5 - PADDL X5, X2 - MOVO X2, X3 - PSRLL $0x03, X3 - PSLLL $0x1d, X2 - PXOR X2, X3 - MOVD 756(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - MOVD 760(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X4, X5 - PADDL X5, X2 - MOVO X2, X4 - PSRLL $0x05, X4 - PSLLL $0x1b, X2 - PXOR X2, X4 - MOVD 748(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - MOVD 752(AX), X5 - PSHUFD $0x00, X5, X5 - PXOR X1, X5 - PADDL X5, X2 - MOVO X2, X1 - PSRLL $0x17, X1 - PSLLL $0x09, X2 - PXOR X2, X1 - -OVER28_END: - MOVAPS X1, X2 - UNPCKLPS X4, X2 - MOVAPS X3, X5 - UNPCKLPS X0, X5 - UNPCKHPS X4, X1 - UNPCKHPS X0, X3 - MOVAPS X2, X4 - MOVLHPS X5, X4 - UNPCKHPD X5, X2 - MOVAPS X1, X0 - MOVLHPS X3, X0 - UNPCKHPD X3, X1 - MOVUPS X4, (DX) - MOVUPS X2, 16(DX) - MOVUPS X0, 32(DX) - MOVUPS X1, 48(DX) - RET - -// func leaDec4SSE2(ctx *leaContext, dst []byte, src []byte) -// Requires: SSE, SSE2 -TEXT ·leaDec4SSE2(SB), NOSPLIT, $0-56 - MOVQ ctx+0(FP), AX - MOVB (AX), CL - MOVQ dst_base+8(FP), DX - MOVQ src_base+32(FP), BX - MOVUPS (BX), X0 - MOVUPS 16(BX), X1 - MOVUPS 32(BX), X2 - MOVUPS 48(BX), X3 - MOVAPS X0, X4 - UNPCKLPS X1, X4 - MOVAPS X2, X5 - UNPCKLPS X3, X5 - UNPCKHPS X1, X0 - UNPCKHPS X3, X2 - MOVAPS X4, X1 - MOVLHPS X5, X1 - UNPCKHPD X5, X4 - MOVAPS X0, X3 - MOVLHPS X2, X3 - UNPCKHPD X2, X0 - CMPB CL, $0x1c - JBE OVER28_END - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 748(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 752(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 756(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 760(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 764(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 768(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 724(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 728(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 732(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 736(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 740(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 744(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 700(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 704(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 708(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 712(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR 
X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 716(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 720(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 676(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 680(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 684(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 688(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 692(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 696(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - -OVER28_END: - CMPB CL, $0x18 - JBE OVER24_END - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 652(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 656(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 660(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 664(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 668(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 672(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 628(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 632(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 636(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 640(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 644(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 648(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 604(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 608(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 612(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 616(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 620(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 624(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 580(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 584(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 588(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 592(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 596(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 600(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - -OVER24_END: - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 556(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 560(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 564(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 568(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL 
$0x03, X3 - PXOR X2, X3 - MOVD 572(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 576(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 532(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 536(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 540(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 544(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 548(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 552(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 508(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 512(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 516(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 520(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 524(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 528(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 484(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 488(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 492(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 496(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 500(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 504(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 460(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 464(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 468(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 472(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 476(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 480(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 436(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 440(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 444(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 448(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 452(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 456(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 412(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 416(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 420(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 424(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 428(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 432(AX), X2 - 
PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 388(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 392(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 396(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 400(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 404(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 408(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 364(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 368(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 372(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 376(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 380(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 384(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 340(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 344(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 348(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 352(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 356(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 360(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 316(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 320(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 324(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 328(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 332(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 336(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 292(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 296(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 300(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 304(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 308(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 312(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 268(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 272(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 276(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 280(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 284(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 288(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 244(AX), 
X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 248(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 252(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 256(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 260(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 264(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 220(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 224(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 228(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 232(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 236(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 240(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 196(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 200(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 204(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 208(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 212(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 216(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 172(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 176(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 180(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 184(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 188(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 192(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 148(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 152(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 156(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 160(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 164(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 168(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 124(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 128(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 132(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 136(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 140(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 144(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 100(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 104(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - 
MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 108(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 112(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 116(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 120(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X1 - PXOR X2, X1 - MOVD 76(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 80(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X4 - PXOR X2, X4 - MOVD 84(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 88(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X3 - PXOR X2, X3 - MOVD 92(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 96(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X0 - PXOR X2, X0 - MOVD 52(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 56(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X1 - PXOR X2, X1 - MOVD 60(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 64(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X4 - PXOR X2, X4 - MOVD 68(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 72(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X3 - PXOR X2, X3 - MOVD 28(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 32(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X0 - PXOR X2, X0 - MOVD 36(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 40(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVO X1, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X1 - PXOR X2, X1 - MOVD 44(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X0, X2 - PSUBL X2, X1 - MOVD 48(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X1 - MOVO X4, X2 - PSRLL $0x09, X2 - PSLLL $0x17, X4 - PXOR X2, X4 - MOVD 4(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X1, X2 - PSUBL X2, X4 - MOVD 8(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X4 - MOVO X3, X2 - PSRLL $0x1b, X2 - PSLLL $0x05, X3 - PXOR X2, X3 - MOVD 12(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X4, X2 - PSUBL X2, X3 - MOVD 16(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X3 - MOVO X0, X2 - PSRLL $0x1d, X2 - PSLLL $0x03, X0 - PXOR X2, X0 - MOVD 20(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X3, X2 - PSUBL X2, X0 - MOVD 24(AX), X2 - PSHUFD $0x00, X2, X2 - PXOR X2, X0 - MOVAPS X1, X2 - UNPCKLPS X4, X2 - MOVAPS X3, X5 - UNPCKLPS X0, X5 - UNPCKHPS X4, X1 - UNPCKHPS X0, X3 - MOVAPS X2, X4 - MOVLHPS X5, X4 - UNPCKHPD X5, X2 - MOVAPS X1, X0 - MOVLHPS X3, X0 - UNPCKHPD X3, X1 - MOVUPS X4, (DX) - MOVUPS X2, 16(DX) - MOVUPS X0, 32(DX) - MOVUPS X1, 48(DX) - RET - -// func leaEnc8AVX2(ctx *leaContext, dst []byte, src []byte) -// Requires: AVX, AVX2 -TEXT ·leaEnc8AVX2(SB), NOSPLIT, $0-56 - MOVQ ctx+0(FP), AX - MOVB (AX), CL - MOVQ dst_base+8(FP), DX - MOVQ src_base+32(FP), BX - VMOVD (BX), X0 - VPINSRD $0x01, 16(BX), X0, X0 - VPINSRD $0x02, 32(BX), X0, X0 - VPINSRD $0x03, 48(BX), X0, X0 - VMOVD 64(BX), X1 - VPINSRD $0x01, 80(BX), X1, X1 - VPINSRD $0x02, 96(BX), X1, X1 - VPINSRD $0x03, 112(BX), X1, X1 - VINSERTI128 $0x01, X1, Y0, Y0 - VMOVD 4(BX), X1 - VPINSRD $0x01, 20(BX), X1, X1 - VPINSRD $0x02, 36(BX), X1, X1 - VPINSRD $0x03, 52(BX), X1, X1 - VMOVD 68(BX), X2 - 
VPINSRD $0x01, 84(BX), X2, X2 - VPINSRD $0x02, 100(BX), X2, X2 - VPINSRD $0x03, 116(BX), X2, X2 - VINSERTI128 $0x01, X2, Y1, Y1 - VMOVD 8(BX), X2 - VPINSRD $0x01, 24(BX), X2, X2 - VPINSRD $0x02, 40(BX), X2, X2 - VPINSRD $0x03, 56(BX), X2, X2 - VMOVD 72(BX), X3 - VPINSRD $0x01, 88(BX), X3, X3 - VPINSRD $0x02, 104(BX), X3, X3 - VPINSRD $0x03, 120(BX), X3, X3 - VINSERTI128 $0x01, X3, Y2, Y2 - VMOVD 12(BX), X3 - VPINSRD $0x01, 28(BX), X3, X3 - VPINSRD $0x02, 44(BX), X3, X3 - VPINSRD $0x03, 60(BX), X3, X3 - VMOVD 76(BX), X4 - VPINSRD $0x01, 92(BX), X4, X4 - VPINSRD $0x02, 108(BX), X4, X4 - VPINSRD $0x03, 124(BX), X4, X4 - VINSERTI128 $0x01, X4, Y3, Y3 - VPBROADCASTD 20(AX), Y4 - VPBROADCASTD 24(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 12(AX), Y4 - VPBROADCASTD 16(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 4(AX), Y4 - VPBROADCASTD 8(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 44(AX), Y4 - VPBROADCASTD 48(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 36(AX), Y4 - VPBROADCASTD 40(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 28(AX), Y4 - VPBROADCASTD 32(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 68(AX), Y4 - VPBROADCASTD 72(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 60(AX), Y4 - VPBROADCASTD 64(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 52(AX), Y4 - VPBROADCASTD 56(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 92(AX), Y4 - VPBROADCASTD 96(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 84(AX), Y4 - VPBROADCASTD 88(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 76(AX), Y4 - VPBROADCASTD 80(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 116(AX), Y4 - VPBROADCASTD 120(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 108(AX), Y4 - VPBROADCASTD 112(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 100(AX), Y4 - VPBROADCASTD 104(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 140(AX), Y4 - VPBROADCASTD 144(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 132(AX), Y4 - VPBROADCASTD 136(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - 
VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 124(AX), Y4 - VPBROADCASTD 128(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 164(AX), Y4 - VPBROADCASTD 168(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 156(AX), Y4 - VPBROADCASTD 160(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 148(AX), Y4 - VPBROADCASTD 152(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 188(AX), Y4 - VPBROADCASTD 192(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 180(AX), Y4 - VPBROADCASTD 184(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 172(AX), Y4 - VPBROADCASTD 176(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 212(AX), Y4 - VPBROADCASTD 216(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 204(AX), Y4 - VPBROADCASTD 208(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 196(AX), Y4 - VPBROADCASTD 200(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 236(AX), Y4 - VPBROADCASTD 240(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 228(AX), Y4 - VPBROADCASTD 232(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 220(AX), Y4 - VPBROADCASTD 224(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 260(AX), Y4 - VPBROADCASTD 264(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 252(AX), Y4 - VPBROADCASTD 256(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 244(AX), Y4 - VPBROADCASTD 248(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 284(AX), Y4 - VPBROADCASTD 288(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 276(AX), Y4 - VPBROADCASTD 280(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 268(AX), Y4 - VPBROADCASTD 272(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 308(AX), Y4 - VPBROADCASTD 312(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, 
Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 300(AX), Y4 - VPBROADCASTD 304(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 292(AX), Y4 - VPBROADCASTD 296(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 332(AX), Y4 - VPBROADCASTD 336(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 324(AX), Y4 - VPBROADCASTD 328(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 316(AX), Y4 - VPBROADCASTD 320(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 356(AX), Y4 - VPBROADCASTD 360(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 348(AX), Y4 - VPBROADCASTD 352(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 340(AX), Y4 - VPBROADCASTD 344(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 380(AX), Y4 - VPBROADCASTD 384(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 372(AX), Y4 - VPBROADCASTD 376(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 364(AX), Y4 - VPBROADCASTD 368(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 404(AX), Y4 - VPBROADCASTD 408(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 396(AX), Y4 - VPBROADCASTD 400(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 388(AX), Y4 - VPBROADCASTD 392(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 428(AX), Y4 - VPBROADCASTD 432(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 420(AX), Y4 - VPBROADCASTD 424(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 412(AX), Y4 - VPBROADCASTD 416(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 452(AX), Y4 - VPBROADCASTD 456(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 444(AX), Y4 - VPBROADCASTD 448(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 436(AX), Y4 - VPBROADCASTD 440(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD 
$0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 476(AX), Y4 - VPBROADCASTD 480(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 468(AX), Y4 - VPBROADCASTD 472(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 460(AX), Y4 - VPBROADCASTD 464(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 500(AX), Y4 - VPBROADCASTD 504(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 492(AX), Y4 - VPBROADCASTD 496(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 484(AX), Y4 - VPBROADCASTD 488(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 524(AX), Y4 - VPBROADCASTD 528(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 516(AX), Y4 - VPBROADCASTD 520(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 508(AX), Y4 - VPBROADCASTD 512(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 548(AX), Y4 - VPBROADCASTD 552(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 540(AX), Y4 - VPBROADCASTD 544(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 532(AX), Y4 - VPBROADCASTD 536(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 572(AX), Y4 - VPBROADCASTD 576(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 564(AX), Y4 - VPBROADCASTD 568(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 556(AX), Y4 - VPBROADCASTD 560(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - CMPB CL, $0x18 - JBE OVER24_END - VPBROADCASTD 596(AX), Y4 - VPBROADCASTD 600(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 588(AX), Y4 - VPBROADCASTD 592(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 580(AX), Y4 - VPBROADCASTD 584(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 620(AX), Y4 - VPBROADCASTD 624(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 612(AX), Y4 - VPBROADCASTD 616(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD 
Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 604(AX), Y4 - VPBROADCASTD 608(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 644(AX), Y4 - VPBROADCASTD 648(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 636(AX), Y4 - VPBROADCASTD 640(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 628(AX), Y4 - VPBROADCASTD 632(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 668(AX), Y4 - VPBROADCASTD 672(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 660(AX), Y4 - VPBROADCASTD 664(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 652(AX), Y4 - VPBROADCASTD 656(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - -OVER24_END: - CMPB CL, $0x1c - JBE OVER28_END - VPBROADCASTD 692(AX), Y4 - VPBROADCASTD 696(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y3 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 684(AX), Y4 - VPBROADCASTD 688(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y2 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 676(AX), Y4 - VPBROADCASTD 680(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y1 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 716(AX), Y4 - VPBROADCASTD 720(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y0 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 708(AX), Y4 - VPBROADCASTD 712(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y3 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 700(AX), Y4 - VPBROADCASTD 704(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y2 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 740(AX), Y4 - VPBROADCASTD 744(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y1 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 732(AX), Y4 - VPBROADCASTD 736(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y0 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 724(AX), Y4 - VPBROADCASTD 728(AX), Y5 - VPXOR Y2, Y4, Y4 - VPXOR Y3, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y3 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 764(AX), Y4 - VPBROADCASTD 768(AX), Y5 - VPXOR Y1, Y4, Y4 - VPXOR Y2, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x03, Y4, Y2 - VPSLLD $0x1d, Y4, Y4 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 756(AX), Y4 - VPBROADCASTD 760(AX), Y5 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x05, Y4, Y1 - VPSLLD $0x1b, Y4, Y4 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 748(AX), Y4 - VPBROADCASTD 752(AX), Y5 - VPXOR Y3, Y4, Y4 - VPXOR Y0, Y5, Y5 - VPADDD Y5, Y4, Y4 - VPSRLD $0x17, Y4, Y0 - VPSLLD $0x09, Y4, Y4 - VPXOR Y4, Y0, Y0 - -OVER28_END: - VEXTRACTPS $0x00, X0, (DX) - VEXTRACTPS $0x01, 
X0, 16(DX) - VEXTRACTPS $0x02, X0, 32(DX) - VEXTRACTPS $0x03, X0, 48(DX) - VEXTRACTF128 $0x01, Y0, X0 - VEXTRACTPS $0x00, X0, 64(DX) - VEXTRACTPS $0x01, X0, 80(DX) - VEXTRACTPS $0x02, X0, 96(DX) - VEXTRACTPS $0x03, X0, 112(DX) - VEXTRACTPS $0x00, X1, 4(DX) - VEXTRACTPS $0x01, X1, 20(DX) - VEXTRACTPS $0x02, X1, 36(DX) - VEXTRACTPS $0x03, X1, 52(DX) - VEXTRACTF128 $0x01, Y1, X1 - VEXTRACTPS $0x00, X1, 68(DX) - VEXTRACTPS $0x01, X1, 84(DX) - VEXTRACTPS $0x02, X1, 100(DX) - VEXTRACTPS $0x03, X1, 116(DX) - VEXTRACTPS $0x00, X2, 8(DX) - VEXTRACTPS $0x01, X2, 24(DX) - VEXTRACTPS $0x02, X2, 40(DX) - VEXTRACTPS $0x03, X2, 56(DX) - VEXTRACTF128 $0x01, Y2, X2 - VEXTRACTPS $0x00, X2, 72(DX) - VEXTRACTPS $0x01, X2, 88(DX) - VEXTRACTPS $0x02, X2, 104(DX) - VEXTRACTPS $0x03, X2, 120(DX) - VEXTRACTPS $0x00, X3, 12(DX) - VEXTRACTPS $0x01, X3, 28(DX) - VEXTRACTPS $0x02, X3, 44(DX) - VEXTRACTPS $0x03, X3, 60(DX) - VEXTRACTF128 $0x01, Y3, X3 - VEXTRACTPS $0x00, X3, 76(DX) - VEXTRACTPS $0x01, X3, 92(DX) - VEXTRACTPS $0x02, X3, 108(DX) - VEXTRACTPS $0x03, X3, 124(DX) - RET - -// func leaDec8AVX2(ctx *leaContext, dst []byte, src []byte) -// Requires: AVX, AVX2 -TEXT ·leaDec8AVX2(SB), NOSPLIT, $0-56 - MOVQ ctx+0(FP), AX - MOVB (AX), CL - MOVQ dst_base+8(FP), DX - MOVQ src_base+32(FP), BX - VMOVD (BX), X0 - VPINSRD $0x01, 16(BX), X0, X0 - VPINSRD $0x02, 32(BX), X0, X0 - VPINSRD $0x03, 48(BX), X0, X0 - VMOVD 64(BX), X1 - VPINSRD $0x01, 80(BX), X1, X1 - VPINSRD $0x02, 96(BX), X1, X1 - VPINSRD $0x03, 112(BX), X1, X1 - VINSERTI128 $0x01, X1, Y0, Y0 - VMOVD 4(BX), X1 - VPINSRD $0x01, 20(BX), X1, X1 - VPINSRD $0x02, 36(BX), X1, X1 - VPINSRD $0x03, 52(BX), X1, X1 - VMOVD 68(BX), X2 - VPINSRD $0x01, 84(BX), X2, X2 - VPINSRD $0x02, 100(BX), X2, X2 - VPINSRD $0x03, 116(BX), X2, X2 - VINSERTI128 $0x01, X2, Y1, Y1 - VMOVD 8(BX), X2 - VPINSRD $0x01, 24(BX), X2, X2 - VPINSRD $0x02, 40(BX), X2, X2 - VPINSRD $0x03, 56(BX), X2, X2 - VMOVD 72(BX), X3 - VPINSRD $0x01, 88(BX), X3, X3 - VPINSRD $0x02, 104(BX), X3, X3 - VPINSRD $0x03, 120(BX), X3, X3 - VINSERTI128 $0x01, X3, Y2, Y2 - VMOVD 12(BX), X3 - VPINSRD $0x01, 28(BX), X3, X3 - VPINSRD $0x02, 44(BX), X3, X3 - VPINSRD $0x03, 60(BX), X3, X3 - VMOVD 76(BX), X4 - VPINSRD $0x01, 92(BX), X4, X4 - VPINSRD $0x02, 108(BX), X4, X4 - VPINSRD $0x03, 124(BX), X4, X4 - VINSERTI128 $0x01, X4, Y3, Y3 - CMPB CL, $0x1c - JBE OVER28_END - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 748(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 752(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 756(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 760(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 764(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 768(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 724(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 728(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 732(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 736(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 740(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 744(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 700(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD 
Y4, Y2, Y2 - VPBROADCASTD 704(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 708(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 712(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 716(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 720(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 676(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 680(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 684(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 688(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 692(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 696(AX), Y4 - VPXOR Y4, Y3, Y3 - -OVER28_END: - CMPB CL, $0x18 - JBE OVER24_END - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 652(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 656(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 660(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 664(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 668(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 672(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 628(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 632(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 636(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 640(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 644(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 648(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 604(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 608(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 612(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 616(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 620(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 624(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 580(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 584(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 588(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 592(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 596(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 600(AX), Y4 - VPXOR Y4, Y3, Y3 - -OVER24_END: - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 556(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 560(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 564(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 568(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - 
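
The CMPB CL / JBE pairs above implement LEA's key-size-dependent round counts: 24 rounds for 128-bit keys, 28 for 192-bit, 32 for 256-bit (CL holds ctx.Round). On the decrypt side the extra round groups are undone first, so the checks sit at the top and execution falls through the OVER28_END / OVER24_END labels. A minimal scalar sketch of that dispatch, assuming a round count of 24, 28, or 32; invRounds is an illustrative stand-in for the inlined inverse rounds, not the repo's API:

package main

import "fmt"

// decryptDispatch sketches the label structure above: the highest round
// groups are undone first, then execution falls through to the 24 rounds
// common to every key size.
func decryptDispatch(rounds int, invRounds func(from, to int)) {
	if rounds > 28 { // LEA-256 only: undo rounds 32..29
		invRounds(32, 29)
	}
	if rounds > 24 { // LEA-192 and LEA-256: undo rounds 28..25
		invRounds(28, 25)
	}
	invRounds(24, 1) // all key sizes: undo rounds 24..1
}

func main() {
	decryptDispatch(28, func(from, to int) {
		fmt.Printf("undo rounds %d..%d\n", from, to)
	})
}
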
VPBROADCASTD 572(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 576(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 532(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 536(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 540(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 544(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 548(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 552(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 508(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 512(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 516(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 520(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 524(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 528(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 484(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 488(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 492(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 496(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 500(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 504(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 460(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 464(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 468(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 472(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 476(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 480(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 436(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 440(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 444(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 448(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 452(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 456(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 412(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 416(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 420(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 424(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 428(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 432(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 388(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 392(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 
396(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 400(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 404(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 408(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 364(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 368(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 372(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 376(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 380(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 384(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 340(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 344(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 348(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 352(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 356(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 360(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 316(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 320(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 324(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 328(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 332(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 336(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 292(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 296(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 300(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 304(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 308(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 312(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 268(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 272(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 276(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 280(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 284(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 288(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 244(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 248(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 252(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 256(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 260(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 264(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 220(AX), Y4 - 
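
Each VPSRLD/VPSLLD/VPXOR triple followed by a VPBROADCASTD/VPXOR/VPSUBD/VPXOR group in this function is one inverse LEA word update: rotate the word back, subtract the key-masked neighbour, then strip the second key word. A scalar sketch of the forward/inverse pair, shown with the ROL9 lane (r = 9); the function names are illustrative only:

package main

import (
	"fmt"
	"math/bits"
)

// encStep mirrors the encrypt-side pattern (two VPXORs, a VPADDD, then a
// shift pair that rotates by r): y = ROL_r((prev ^ rkA) + (x ^ rkB)).
func encStep(x, prev, rkA, rkB uint32, r int) uint32 {
	return bits.RotateLeft32((prev^rkA)+(x^rkB), r)
}

// invStep mirrors the decrypt-side pattern above: rotate back (VPSRLD +
// VPSLLD + VPXOR), subtract the key-masked neighbour (VPBROADCASTD +
// VPXOR + VPSUBD), then strip the second key word (VPBROADCASTD + VPXOR).
func invStep(y, prev, rkA, rkB uint32, r int) uint32 {
	return (bits.RotateLeft32(y, -r) - (prev ^ rkA)) ^ rkB
}

func main() {
	x, prev := uint32(0x01234567), uint32(0x89abcdef)
	rkA, rkB := uint32(0xdeadbeef), uint32(0xcafebabe)
	y := encStep(x, prev, rkA, rkB, 9)
	fmt.Println(invStep(y, prev, rkA, rkB, 9) == x) // true: the steps invert
}
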
VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 224(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 228(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 232(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 236(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 240(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 196(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 200(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 204(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 208(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 212(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 216(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 172(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 176(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 180(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 184(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 188(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 192(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 148(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 152(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 156(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 160(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 164(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 168(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 124(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 128(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 132(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 136(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 140(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 144(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 100(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 104(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 108(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 112(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 116(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 120(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x09, Y0, Y4 - VPSLLD $0x17, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 76(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 80(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1b, Y1, Y4 - VPSLLD $0x05, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 84(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 88(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1d, Y2, Y4 - VPSLLD $0x03, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 92(AX), Y4 - VPXOR Y1, Y4, Y4 - 
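
The VMOVD/VPINSRD gathers at the top of leaDec8AVX2 (and the VEXTRACTPS scatters before each RET) transpose the data: eight consecutive 16-byte blocks are loaded column-wise, so lane k of Y0..Y3 holds words 0..3 of block k and a single YMM instruction advances all eight blocks at once. A scalar sketch of that index mapping, assuming little-endian input; transpose8 is a hypothetical helper, not the repo's loader:

package main

import (
	"encoding/binary"
	"fmt"
)

// transpose8 sketches the gather pattern above: src holds eight 16-byte
// LEA blocks back to back, and state[j][k] receives word j of block k,
// i.e. the dword at src[16*k+4*j] (offsets 0, 16, 32, 48, ... for word 0,
// matching the VMOVD (BX) / VPINSRD 16(BX), 32(BX), 48(BX) sequence).
func transpose8(src []byte) (state [4][8]uint32) {
	for j := 0; j < 4; j++ {
		for k := 0; k < 8; k++ {
			state[j][k] = binary.LittleEndian.Uint32(src[16*k+4*j:])
		}
	}
	return
}

func main() {
	src := make([]byte, 128)
	for i := range src {
		src[i] = byte(i)
	}
	st := transpose8(src)
	fmt.Printf("word 1 of block 2: %08x\n", st[1][2]) // bytes 36..39
}
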
VPSUBD Y4, Y2, Y2 - VPBROADCASTD 96(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x09, Y3, Y4 - VPSLLD $0x17, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 52(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 56(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1b, Y0, Y4 - VPSLLD $0x05, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 60(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 64(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x1d, Y1, Y4 - VPSLLD $0x03, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 68(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 72(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x09, Y2, Y4 - VPSLLD $0x17, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 28(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 32(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1b, Y3, Y4 - VPSLLD $0x05, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 36(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 40(AX), Y4 - VPXOR Y4, Y3, Y3 - VPSRLD $0x1d, Y0, Y4 - VPSLLD $0x03, Y0, Y0 - VPXOR Y4, Y0, Y0 - VPBROADCASTD 44(AX), Y4 - VPXOR Y3, Y4, Y4 - VPSUBD Y4, Y0, Y0 - VPBROADCASTD 48(AX), Y4 - VPXOR Y4, Y0, Y0 - VPSRLD $0x09, Y1, Y4 - VPSLLD $0x17, Y1, Y1 - VPXOR Y4, Y1, Y1 - VPBROADCASTD 4(AX), Y4 - VPXOR Y0, Y4, Y4 - VPSUBD Y4, Y1, Y1 - VPBROADCASTD 8(AX), Y4 - VPXOR Y4, Y1, Y1 - VPSRLD $0x1b, Y2, Y4 - VPSLLD $0x05, Y2, Y2 - VPXOR Y4, Y2, Y2 - VPBROADCASTD 12(AX), Y4 - VPXOR Y1, Y4, Y4 - VPSUBD Y4, Y2, Y2 - VPBROADCASTD 16(AX), Y4 - VPXOR Y4, Y2, Y2 - VPSRLD $0x1d, Y3, Y4 - VPSLLD $0x03, Y3, Y3 - VPXOR Y4, Y3, Y3 - VPBROADCASTD 20(AX), Y4 - VPXOR Y2, Y4, Y4 - VPSUBD Y4, Y3, Y3 - VPBROADCASTD 24(AX), Y4 - VPXOR Y4, Y3, Y3 - VEXTRACTPS $0x00, X0, (DX) - VEXTRACTPS $0x01, X0, 16(DX) - VEXTRACTPS $0x02, X0, 32(DX) - VEXTRACTPS $0x03, X0, 48(DX) - VEXTRACTF128 $0x01, Y0, X0 - VEXTRACTPS $0x00, X0, 64(DX) - VEXTRACTPS $0x01, X0, 80(DX) - VEXTRACTPS $0x02, X0, 96(DX) - VEXTRACTPS $0x03, X0, 112(DX) - VEXTRACTPS $0x00, X1, 4(DX) - VEXTRACTPS $0x01, X1, 20(DX) - VEXTRACTPS $0x02, X1, 36(DX) - VEXTRACTPS $0x03, X1, 52(DX) - VEXTRACTF128 $0x01, Y1, X1 - VEXTRACTPS $0x00, X1, 68(DX) - VEXTRACTPS $0x01, X1, 84(DX) - VEXTRACTPS $0x02, X1, 100(DX) - VEXTRACTPS $0x03, X1, 116(DX) - VEXTRACTPS $0x00, X2, 8(DX) - VEXTRACTPS $0x01, X2, 24(DX) - VEXTRACTPS $0x02, X2, 40(DX) - VEXTRACTPS $0x03, X2, 56(DX) - VEXTRACTF128 $0x01, Y2, X2 - VEXTRACTPS $0x00, X2, 72(DX) - VEXTRACTPS $0x01, X2, 88(DX) - VEXTRACTPS $0x02, X2, 104(DX) - VEXTRACTPS $0x03, X2, 120(DX) - VEXTRACTPS $0x00, X3, 12(DX) - VEXTRACTPS $0x01, X3, 28(DX) - VEXTRACTPS $0x02, X3, 44(DX) - VEXTRACTPS $0x03, X3, 60(DX) - VEXTRACTF128 $0x01, Y3, X3 - VEXTRACTPS $0x00, X3, 76(DX) - VEXTRACTPS $0x01, X3, 92(DX) - VEXTRACTPS $0x02, X3, 108(DX) - VEXTRACTPS $0x03, X3, 124(DX) - RET diff --git a/lea/lea_amd64_avx2.s b/lea/lea_amd64_avx2.s new file mode 100644 index 0000000..5d906e7 --- /dev/null +++ b/lea/lea_amd64_avx2.s @@ -0,0 +1,1751 @@ +//go:build amd64 && gc && !purego +// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT + +TEXT ·__lea_encrypt_8block(SB), $0-32 + + MOVQ ct+0(FP), DI + MOVQ pt+8(FP), SI + MOVQ rk+16(FP), DX + MOVQ round+24(FP), CX + + LONG $0x466ef9c5; BYTE $0x40 // vmovd xmm0, dword [rsi + 64] + LONG $0x2279e3c4; WORD $0x5046; BYTE $0x01 // vpinsrd xmm0, xmm0, dword [rsi + 80], 1 + LONG $0x2279e3c4; WORD $0x6046; BYTE $0x02 // vpinsrd xmm0, xmm0, dword [rsi + 96], 2 + LONG $0x2279e3c4; WORD $0x7046; BYTE $0x03 // vpinsrd xmm0, xmm0, dword [rsi + 112], 3 + LONG $0x0e6ef9c5 // vmovd xmm1, dword [rsi] + LONG $0x2271e3c4; WORD $0x104e; BYTE $0x01 // vpinsrd 
xmm1, xmm1, dword [rsi + 16], 1 + LONG $0x2271e3c4; WORD $0x204e; BYTE $0x02 // vpinsrd xmm1, xmm1, dword [rsi + 32], 2 + LONG $0x2271e3c4; WORD $0x304e; BYTE $0x03 // vpinsrd xmm1, xmm1, dword [rsi + 48], 3 + LONG $0x566ef9c5; BYTE $0x44 // vmovd xmm2, dword [rsi + 68] + LONG $0x2269e3c4; WORD $0x5456; BYTE $0x01 // vpinsrd xmm2, xmm2, dword [rsi + 84], 1 + LONG $0x2269e3c4; WORD $0x6456; BYTE $0x02 // vpinsrd xmm2, xmm2, dword [rsi + 100], 2 + LONG $0x2269e3c4; WORD $0x7456; BYTE $0x03 // vpinsrd xmm2, xmm2, dword [rsi + 116], 3 + LONG $0x5e6ef9c5; BYTE $0x04 // vmovd xmm3, dword [rsi + 4] + LONG $0x2261e3c4; WORD $0x145e; BYTE $0x01 // vpinsrd xmm3, xmm3, dword [rsi + 20], 1 + LONG $0x2261e3c4; WORD $0x245e; BYTE $0x02 // vpinsrd xmm3, xmm3, dword [rsi + 36], 2 + LONG $0x2261e3c4; WORD $0x345e; BYTE $0x03 // vpinsrd xmm3, xmm3, dword [rsi + 52], 3 + LONG $0x666ef9c5; BYTE $0x48 // vmovd xmm4, dword [rsi + 72] + LONG $0x2259e3c4; WORD $0x5866; BYTE $0x01 // vpinsrd xmm4, xmm4, dword [rsi + 88], 1 + LONG $0x2259e3c4; WORD $0x6866; BYTE $0x02 // vpinsrd xmm4, xmm4, dword [rsi + 104], 2 + LONG $0x3875e3c4; WORD $0x01c0 // vinserti128 ymm0, ymm1, xmm0, 1 + LONG $0x2259e3c4; WORD $0x784e; BYTE $0x03 // vpinsrd xmm1, xmm4, dword [rsi + 120], 3 + LONG $0x666ef9c5; BYTE $0x08 // vmovd xmm4, dword [rsi + 8] + LONG $0x2259e3c4; WORD $0x1866; BYTE $0x01 // vpinsrd xmm4, xmm4, dword [rsi + 24], 1 + LONG $0x2259e3c4; WORD $0x2866; BYTE $0x02 // vpinsrd xmm4, xmm4, dword [rsi + 40], 2 + LONG $0x3865e3c4; WORD $0x01d2 // vinserti128 ymm2, ymm3, xmm2, 1 + LONG $0x2259e3c4; WORD $0x385e; BYTE $0x03 // vpinsrd xmm3, xmm4, dword [rsi + 56], 3 + LONG $0x666ef9c5; BYTE $0x4c // vmovd xmm4, dword [rsi + 76] + LONG $0x2259e3c4; WORD $0x5c66; BYTE $0x01 // vpinsrd xmm4, xmm4, dword [rsi + 92], 1 + LONG $0x2259e3c4; WORD $0x6c66; BYTE $0x02 // vpinsrd xmm4, xmm4, dword [rsi + 108], 2 + LONG $0x3865e3c4; WORD $0x01c9 // vinserti128 ymm1, ymm3, xmm1, 1 + LONG $0x2259e3c4; WORD $0x7c5e; BYTE $0x03 // vpinsrd xmm3, xmm4, dword [rsi + 124], 3 + LONG $0x666ef9c5; BYTE $0x0c // vmovd xmm4, dword [rsi + 12] + LONG $0x2259e3c4; WORD $0x1c66; BYTE $0x01 // vpinsrd xmm4, xmm4, dword [rsi + 28], 1 + LONG $0x2259e3c4; WORD $0x2c66; BYTE $0x02 // vpinsrd xmm4, xmm4, dword [rsi + 44], 2 + LONG $0x2259e3c4; WORD $0x3c66; BYTE $0x03 // vpinsrd xmm4, xmm4, dword [rsi + 60], 3 + LONG $0x587de2c4; WORD $0x106a // vpbroadcastd ymm5, dword [rdx + 16] + LONG $0x385de3c4; WORD $0x01db // vinserti128 ymm3, ymm4, xmm3, 1 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + LONG $0x587de2c4; WORD $0x146a // vpbroadcastd ymm5, dword [rdx + 20] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x587de2c4; WORD $0x0862 // vpbroadcastd ymm4, dword [rdx + 8] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + LONG $0x587de2c4; WORD $0x0c6a // vpbroadcastd ymm5, dword [rdx + 12] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + LONG $0x587de2c4; BYTE $0x2a // vpbroadcastd ymm5, dword [rdx] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + LONG $0x587de2c4; WORD $0x046a // vpbroadcastd ymm5, dword [rdx + 4] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG 
$0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x2862 // vpbroadcastd ymm4, dword [rdx + 40] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + LONG $0x587de2c4; WORD $0x2c6a // vpbroadcastd ymm5, dword [rdx + 44] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + LONG $0x587de2c4; WORD $0x206a // vpbroadcastd ymm5, dword [rdx + 32] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + LONG $0x587de2c4; WORD $0x246a // vpbroadcastd ymm5, dword [rdx + 36] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x587de2c4; WORD $0x1862 // vpbroadcastd ymm4, dword [rdx + 24] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + LONG $0x587de2c4; WORD $0x1c6a // vpbroadcastd ymm5, dword [rdx + 28] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + LONG $0x587de2c4; WORD $0x406a // vpbroadcastd ymm5, dword [rdx + 64] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + LONG $0x587de2c4; WORD $0x446a // vpbroadcastd ymm5, dword [rdx + 68] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x3862 // vpbroadcastd ymm4, dword [rdx + 56] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + LONG $0x587de2c4; WORD $0x3c6a // vpbroadcastd ymm5, dword [rdx + 60] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + LONG $0x587de2c4; WORD $0x306a // vpbroadcastd ymm5, dword [rdx + 48] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + LONG $0x587de2c4; WORD $0x346a // vpbroadcastd ymm5, dword [rdx + 52] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x587de2c4; WORD $0x5862 // vpbroadcastd ymm4, dword [rdx + 88] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + LONG $0x587de2c4; WORD $0x5c6a // vpbroadcastd ymm5, dword [rdx + 92] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + LONG $0x587de2c4; WORD $0x506a // vpbroadcastd ymm5, dword [rdx + 80] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + LONG $0x587de2c4; WORD $0x546a // vpbroadcastd ymm5, dword [rdx + 84] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // 
vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x4862 // vpbroadcastd ymm4, dword [rdx + 72] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + LONG $0x587de2c4; WORD $0x4c6a // vpbroadcastd ymm5, dword [rdx + 76] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + LONG $0x587de2c4; WORD $0x706a // vpbroadcastd ymm5, dword [rdx + 112] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + LONG $0x587de2c4; WORD $0x746a // vpbroadcastd ymm5, dword [rdx + 116] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x587de2c4; WORD $0x6862 // vpbroadcastd ymm4, dword [rdx + 104] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + LONG $0x587de2c4; WORD $0x6c6a // vpbroadcastd ymm5, dword [rdx + 108] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + LONG $0x587de2c4; WORD $0x606a // vpbroadcastd ymm5, dword [rdx + 96] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + LONG $0x587de2c4; WORD $0x646a // vpbroadcastd ymm5, dword [rdx + 100] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000088a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 136] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00008caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 140] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + QUAD $0x000080aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 128] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000084aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 132] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x587de2c4; WORD $0x7862 // vpbroadcastd ymm4, dword [rdx + 120] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + LONG $0x587de2c4; WORD $0x7c6a // vpbroadcastd ymm5, dword [rdx + 124] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + QUAD $0x0000a0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 160] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0000a4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 164] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + 
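
One full LEA round in this generated encryption code is three of the broadcast/xor/add/shift groups above, with rotations of +9, -5 and -3 bits; instead of moving the fourth word, the code rotates the roles of ymm0..ymm3 from round to round. A scalar reference for a single round, assuming the flat six-words-per-round key layout behind the dword [rdx + off] broadcasts (names are illustrative):

package main

import (
	"fmt"
	"math/bits"
)

// encRound is a scalar sketch of one round of the vector code above:
// rk holds the six round-key words broadcast from [rdx + off], and
// x[0..3] are the four state words held in ymm lanes.
func encRound(x [4]uint32, rk []uint32) [4]uint32 {
	var y [4]uint32
	y[0] = bits.RotateLeft32((x[0]^rk[0])+(x[1]^rk[1]), 9)  // sll 9 / srl 23
	y[1] = bits.RotateLeft32((x[1]^rk[2])+(x[2]^rk[3]), -5) // srl 5 / sll 27
	y[2] = bits.RotateLeft32((x[2]^rk[4])+(x[3]^rk[5]), -3) // srl 3 / sll 29
	y[3] = x[0] // the old first word becomes the new last word
	return y
}

func main() {
	x := [4]uint32{1, 2, 3, 4}
	rk := []uint32{10, 11, 12, 13, 14, 15}
	fmt.Printf("%08x\n", encRound(x, rk))
}
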
LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000098a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 152] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00009caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 156] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + QUAD $0x000090aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 144] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000094aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 148] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0000b8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 184] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0000bcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 188] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + QUAD $0x0000b0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 176] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0000b4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 180] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0000a8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 168] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x0000acaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 172] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + QUAD $0x0000d0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 208] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x0000d4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 212] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0000c8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 200] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0000ccaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 204] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + QUAD $0x0000c0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 192] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0000c4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 196] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; 
BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0000e8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 232] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x0000ecaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 236] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + QUAD $0x0000e0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 224] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x0000e4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 228] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0000d8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 216] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0000dcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 220] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + QUAD $0x000100aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 256] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x000104aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 260] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0000f8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 248] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x0000fcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 252] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + QUAD $0x0000f0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 240] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x0000f4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 244] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x000118a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 280] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00011caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 284] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + QUAD $0x000110aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 272] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x000114aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 276] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd 
ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000108a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 264] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00010caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 268] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + QUAD $0x000130aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 304] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000134aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 308] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x000128a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 296] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00012caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 300] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + QUAD $0x000120aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 288] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x000124aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 292] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000148a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 328] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00014caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 332] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + QUAD $0x000140aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 320] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000144aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 324] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x000138a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 312] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00013caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 316] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + QUAD $0x000160aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 352] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x000164aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 356] + LONG $0xd2efd5c5 // vpxor ymm2, 
ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000158a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 344] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00015caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 348] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + QUAD $0x000150aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 336] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000154aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 340] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x000178a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 376] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00017caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 380] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + QUAD $0x000170aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 368] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x000174aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 372] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000168a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 360] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00016caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 364] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + QUAD $0x000190aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 400] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000194aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 404] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x000188a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 392] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00018caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 396] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + QUAD $0x000180aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 384] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x000184aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 
388] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0001a8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 424] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x0001acaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 428] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + QUAD $0x0001a0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 416] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x0001a4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 420] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x000198a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 408] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00019caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 412] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + QUAD $0x0001c0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 448] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0001c4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 452] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0001b8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 440] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x0001bcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 444] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + QUAD $0x0001b0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 432] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x0001b4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 436] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0001d8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 472] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0001dcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 476] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + QUAD $0x0001d0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 464] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0001d4aa587de2c4; BYTE 
$0x00 // vpbroadcastd ymm5, dword [rdx + 468] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0001c8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 456] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x0001ccaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 460] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + QUAD $0x0001f0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 496] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x0001f4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 500] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0001e8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 488] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0001ecaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 492] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + QUAD $0x0001e0aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 480] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0001e4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 484] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000208a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 520] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00020caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 524] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + QUAD $0x000200aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 512] + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000204aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 516] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0001f8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 504] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0001fcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 508] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + QUAD $0x000220aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 544] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, 
ymm0 + QUAD $0x000224aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 548] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000218a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 536] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + QUAD $0x00021caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 540] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + QUAD $0x000210aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 528] + LONG $0xe4ebfdc5 // vpor ymm4, ymm0, ymm4 + LONG $0xc1efd5c5 // vpxor ymm0, ymm5, ymm1 + QUAD $0x000214aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 532] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xc0fee5c5 // vpaddd ymm0, ymm3, ymm0 + LONG $0xd072e5c5; BYTE $0x17 // vpsrld ymm3, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + LONG $0xc3ebfdc5 // vpor ymm0, ymm0, ymm3 + QUAD $0x0002389a587de2c4; BYTE $0x00 // vpbroadcastd ymm3, dword [rdx + 568] + LONG $0xdaefe5c5 // vpxor ymm3, ymm3, ymm2 + QUAD $0x00023caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 572] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xcbfef5c5 // vpaddd ymm1, ymm1, ymm3 + LONG $0xd172e5c5; BYTE $0x03 // vpsrld ymm3, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + QUAD $0x000230aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 560] + LONG $0xcbebf5c5 // vpor ymm1, ymm1, ymm3 + LONG $0xdcefd5c5 // vpxor ymm3, ymm5, ymm4 + QUAD $0x000234aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 564] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd3feedc5 // vpaddd ymm2, ymm2, ymm3 + LONG $0xd272e5c5; BYTE $0x05 // vpsrld ymm3, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd3ebedc5 // vpor ymm2, ymm2, ymm3 + QUAD $0x0002289a587de2c4; BYTE $0x00 // vpbroadcastd ymm3, dword [rdx + 552] + LONG $0xd8efe5c5 // vpxor ymm3, ymm3, ymm0 + QUAD $0x00022caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 556] + LONG $0xe4efd5c5 // vpxor ymm4, ymm5, ymm4 + LONG $0xdbfeddc5 // vpaddd ymm3, ymm4, ymm3 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x19f98348 // cmp rcx, 25 + JB LBB0_3 + QUAD $0x000250a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 592] + LONG $0xe1efddc5 // vpxor ymm4, ymm4, ymm1 + QUAD $0x000254aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 596] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000248a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 584] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00024caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 588] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + QUAD $0x000240aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 576] 
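+ // NOTE: each vpbroadcastd / vpxor / vpaddd / vpsrld+vpslld / vpor group here
+ // is one XAR step of the LEA round function, applied to eight blocks at once.
+ // AVX2 has no 32-bit vector rotate, so each rotate is emulated with a shift
+ // pair joined by vpor: srl 23 + sll 9 is the rotl 9 of XAR9, srl 5 + sll 27
+ // the rotr 5 of XAR5, and srl 3 + sll 29 the rotr 3 of XAR3. A scalar sketch
+ // of one XAR9 step (illustrative names only, not part of this patch; x0/x1
+ // are state words, rk the round-key array):
+ //
+ //	x0 = bits.RotateLeft32((x0^rk[i]) + (x1^rk[i+1]), 9)
+ //
+ // The cmp rcx, 25 guard above and the cmp rcx, 29 guard further down skip the
+ // tail rounds that only the longer keys use: LEA runs 24, 28, or 32 rounds
+ // for 128-, 192-, and 256-bit keys.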
+ LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe3efd5c5 // vpxor ymm4, ymm5, ymm3 + QUAD $0x000244aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 580] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000268a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 616] + LONG $0xe0efddc5 // vpxor ymm4, ymm4, ymm0 + QUAD $0x00026caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 620] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + QUAD $0x000260aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 608] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000264aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 612] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000258a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 600] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00025caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 604] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG $0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + QUAD $0x000280aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 640] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe3efd5c5 // vpxor ymm4, ymm5, ymm3 + QUAD $0x000284aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 644] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000278a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 632] + LONG $0xe0efddc5 // vpxor ymm4, ymm4, ymm0 + QUAD $0x00027caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 636] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + QUAD $0x000270aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 624] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe1efd5c5 // vpxor ymm4, ymm5, ymm1 + QUAD $0x000274aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 628] + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000298a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 664] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x00029caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 668] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + QUAD $0x000290aa587de2c4; BYTE $0x00 // 
vpbroadcastd ymm5, dword [rdx + 656] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe3efd5c5 // vpxor ymm4, ymm5, ymm3 + QUAD $0x000294aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 660] + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000288a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 648] + LONG $0xe0efddc5 // vpxor ymm4, ymm4, ymm0 + QUAD $0x00028caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 652] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0x1df98348 // cmp rcx, 29 + JB LBB0_3 + QUAD $0x0002b0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 688] + QUAD $0x0002b4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 692] + LONG $0xe1efddc5 // vpxor ymm4, ymm4, ymm1 + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x03 // vpsrld ymm4, ymm0, 3 + LONG $0xf072fdc5; BYTE $0x1d // vpslld ymm0, ymm0, 29 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0002a8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 680] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0002acaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 684] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x05 // vpsrld ymm4, ymm1, 5 + LONG $0xf172f5c5; BYTE $0x1b // vpslld ymm1, ymm1, 27 + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + QUAD $0x0002a0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 672] + QUAD $0x0002a4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 676] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x17 // vpsrld ymm4, ymm2, 23 + LONG $0xf272edc5; BYTE $0x09 // vpslld ymm2, ymm2, 9 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0002c8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 712] + LONG $0xe0efddc5 // vpxor ymm4, ymm4, ymm0 + QUAD $0x0002ccaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 716] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x03 // vpsrld ymm4, ymm3, 3 + LONG $0xf372e5c5; BYTE $0x1d // vpslld ymm3, ymm3, 29 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0002c0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 704] + QUAD $0x0002c4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 708] + LONG $0xe1efddc5 // vpxor ymm4, ymm4, ymm1 + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x05 // vpsrld ymm4, ymm0, 5 + LONG $0xf072fdc5; BYTE $0x1b // vpslld ymm0, ymm0, 27 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0002b8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 696] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0002bcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 700] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x17 // vpsrld ymm4, ymm1, 23 + LONG 
$0xf172f5c5; BYTE $0x09 // vpslld ymm1, ymm1, 9 + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + QUAD $0x0002e0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 736] + QUAD $0x0002e4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 740] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x03 // vpsrld ymm4, ymm2, 3 + LONG $0xf272edc5; BYTE $0x1d // vpslld ymm2, ymm2, 29 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0002d8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 728] + LONG $0xe0efddc5 // vpxor ymm4, ymm4, ymm0 + QUAD $0x0002dcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 732] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x05 // vpsrld ymm4, ymm3, 5 + LONG $0xf372e5c5; BYTE $0x1b // vpslld ymm3, ymm3, 27 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + QUAD $0x0002d0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 720] + QUAD $0x0002d4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 724] + LONG $0xe1efddc5 // vpxor ymm4, ymm4, ymm1 + LONG $0xc0efd5c5 // vpxor ymm0, ymm5, ymm0 + LONG $0xc4fefdc5 // vpaddd ymm0, ymm0, ymm4 + LONG $0xd072ddc5; BYTE $0x17 // vpsrld ymm4, ymm0, 23 + LONG $0xf072fdc5; BYTE $0x09 // vpslld ymm0, ymm0, 9 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0002f8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 760] + LONG $0xe2efddc5 // vpxor ymm4, ymm4, ymm2 + QUAD $0x0002fcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 764] + LONG $0xc9efd5c5 // vpxor ymm1, ymm5, ymm1 + LONG $0xccfef5c5 // vpaddd ymm1, ymm1, ymm4 + LONG $0xd172ddc5; BYTE $0x03 // vpsrld ymm4, ymm1, 3 + LONG $0xf172f5c5; BYTE $0x1d // vpslld ymm1, ymm1, 29 + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + QUAD $0x0002f0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 752] + QUAD $0x0002f4aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 756] + LONG $0xe3efddc5 // vpxor ymm4, ymm4, ymm3 + LONG $0xd2efd5c5 // vpxor ymm2, ymm5, ymm2 + LONG $0xd4feedc5 // vpaddd ymm2, ymm2, ymm4 + LONG $0xd272ddc5; BYTE $0x05 // vpsrld ymm4, ymm2, 5 + LONG $0xf272edc5; BYTE $0x1b // vpslld ymm2, ymm2, 27 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0002e8a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 744] + LONG $0xe0efddc5 // vpxor ymm4, ymm4, ymm0 + QUAD $0x0002ecaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 748] + LONG $0xdbefd5c5 // vpxor ymm3, ymm5, ymm3 + LONG $0xdcfee5c5 // vpaddd ymm3, ymm3, ymm4 + LONG $0xd372ddc5; BYTE $0x17 // vpsrld ymm4, ymm3, 23 + LONG $0xf372e5c5; BYTE $0x09 // vpslld ymm3, ymm3, 9 + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + +LBB0_3: + LONG $0xe262e1c5 // vpunpckldq xmm4, xmm3, xmm2 + LONG $0xe862f1c5 // vpunpckldq xmm5, xmm1, xmm0 + LONG $0x385de3c4; WORD $0x01e5 // vinserti128 ymm4, ymm4, xmm5, 1 + LONG $0xeb70f9c5; BYTE $0x55 // vpshufd xmm5, xmm3, 85 + LONG $0x0251e3c4; WORD $0x02ea // vpblendd xmm5, xmm5, xmm2, 2 + LONG $0xf170f9c5; BYTE $0x55 // vpshufd xmm6, xmm1, 85 + LONG $0x0249e3c4; WORD $0x02f0 // vpblendd xmm6, xmm6, xmm0, 2 + LONG $0x3855e3c4; WORD $0x01ee // vinserti128 ymm5, ymm5, xmm6, 1 + LONG $0xe56cddc5 // vpunpcklqdq ymm4, ymm4, ymm5 + LONG $0x00fde3c4; WORD $0xd8e4 // vpermq ymm4, ymm4, 216 + LONG $0x277ffec5 // vmovdqu yword [rdi], ymm4 + LONG $0xe26ae1c5 // vpunpckhdq xmm4, xmm3, xmm2 + LONG $0xea70f9c5; BYTE $0xee // vpshufd xmm5, xmm2, 238 + LONG 
$0xf06af1c5 // vpunpckhdq xmm6, xmm1, xmm0 + LONG $0x385de3c4; WORD $0x01e6 // vinserti128 ymm4, ymm4, xmm6, 1 + LONG $0xf370f9c5; BYTE $0xff // vpshufd xmm6, xmm3, 255 + LONG $0x0249e3c4; WORD $0x02ed // vpblendd xmm5, xmm6, xmm5, 2 + LONG $0xf070f9c5; BYTE $0xee // vpshufd xmm6, xmm0, 238 + LONG $0xf970f9c5; BYTE $0xff // vpshufd xmm7, xmm1, 255 + LONG $0x0241e3c4; WORD $0x02f6 // vpblendd xmm6, xmm7, xmm6, 2 + LONG $0x3855e3c4; WORD $0x01ee // vinserti128 ymm5, ymm5, xmm6, 1 + LONG $0xe56cddc5 // vpunpcklqdq ymm4, ymm4, ymm5 + LONG $0x00fde3c4; WORD $0xd8e4 // vpermq ymm4, ymm4, 216 + LONG $0x677ffec5; BYTE $0x20 // vmovdqu yword [rdi + 32], ymm4 + LONG $0x397de3c4; WORD $0x01d2 // vextracti128 xmm2, ymm2, 1 + LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1 + LONG $0xe262e1c5 // vpunpckldq xmm4, xmm3, xmm2 + LONG $0x397de3c4; WORD $0x01c0 // vextracti128 xmm0, ymm0, 1 + LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1 + LONG $0xe862f1c5 // vpunpckldq xmm5, xmm1, xmm0 + LONG $0x385de3c4; WORD $0x01e5 // vinserti128 ymm4, ymm4, xmm5, 1 + LONG $0xeb70f9c5; BYTE $0x55 // vpshufd xmm5, xmm3, 85 + LONG $0x0251e3c4; WORD $0x02ea // vpblendd xmm5, xmm5, xmm2, 2 + LONG $0xf170f9c5; BYTE $0x55 // vpshufd xmm6, xmm1, 85 + LONG $0x0249e3c4; WORD $0x02f0 // vpblendd xmm6, xmm6, xmm0, 2 + LONG $0x3855e3c4; WORD $0x01ee // vinserti128 ymm5, ymm5, xmm6, 1 + LONG $0xe56cddc5 // vpunpcklqdq ymm4, ymm4, ymm5 + LONG $0x00fde3c4; WORD $0xd8e4 // vpermq ymm4, ymm4, 216 + LONG $0x677ffec5; BYTE $0x40 // vmovdqu yword [rdi + 64], ymm4 + LONG $0xe26ae1c5 // vpunpckhdq xmm4, xmm3, xmm2 + LONG $0xd270f9c5; BYTE $0xee // vpshufd xmm2, xmm2, 238 + LONG $0xe86af1c5 // vpunpckhdq xmm5, xmm1, xmm0 + LONG $0x385de3c4; WORD $0x01e5 // vinserti128 ymm4, ymm4, xmm5, 1 + LONG $0xdb70f9c5; BYTE $0xff // vpshufd xmm3, xmm3, 255 + LONG $0x0261e3c4; WORD $0x02d2 // vpblendd xmm2, xmm3, xmm2, 2 + LONG $0xc070f9c5; BYTE $0xee // vpshufd xmm0, xmm0, 238 + LONG $0xc970f9c5; BYTE $0xff // vpshufd xmm1, xmm1, 255 + LONG $0x0271e3c4; WORD $0x02c0 // vpblendd xmm0, xmm1, xmm0, 2 + LONG $0x386de3c4; WORD $0x01c0 // vinserti128 ymm0, ymm2, xmm0, 1 + LONG $0xc06cddc5 // vpunpcklqdq ymm0, ymm4, ymm0 + LONG $0x00fde3c4; WORD $0xd8c0 // vpermq ymm0, ymm0, 216 + LONG $0x477ffec5; BYTE $0x60 // vmovdqu yword [rdi + 96], ymm0 + VZEROUPPER + RET + +TEXT ·__lea_decrypt_8block(SB), $0-32 + + MOVQ pt+0(FP), DI + MOVQ ct+8(FP), SI + MOVQ rk+16(FP), DX + MOVQ round+24(FP), CX + + LONG $0x466ef9c5; BYTE $0x40 // vmovd xmm0, dword [rsi + 64] + LONG $0x2279e3c4; WORD $0x5046; BYTE $0x01 // vpinsrd xmm0, xmm0, dword [rsi + 80], 1 + LONG $0x2279e3c4; WORD $0x6046; BYTE $0x02 // vpinsrd xmm0, xmm0, dword [rsi + 96], 2 + LONG $0x2279e3c4; WORD $0x7046; BYTE $0x03 // vpinsrd xmm0, xmm0, dword [rsi + 112], 3 + LONG $0x0e6ef9c5 // vmovd xmm1, dword [rsi] + LONG $0x2271e3c4; WORD $0x104e; BYTE $0x01 // vpinsrd xmm1, xmm1, dword [rsi + 16], 1 + LONG $0x2271e3c4; WORD $0x204e; BYTE $0x02 // vpinsrd xmm1, xmm1, dword [rsi + 32], 2 + LONG $0x2271e3c4; WORD $0x304e; BYTE $0x03 // vpinsrd xmm1, xmm1, dword [rsi + 48], 3 + LONG $0x566ef9c5; BYTE $0x44 // vmovd xmm2, dword [rsi + 68] + LONG $0x2269e3c4; WORD $0x5456; BYTE $0x01 // vpinsrd xmm2, xmm2, dword [rsi + 84], 1 + LONG $0x2269e3c4; WORD $0x6456; BYTE $0x02 // vpinsrd xmm2, xmm2, dword [rsi + 100], 2 + LONG $0x2269e3c4; WORD $0x7456; BYTE $0x03 // vpinsrd xmm2, xmm2, dword [rsi + 116], 3 + LONG $0x5e6ef9c5; BYTE $0x04 // vmovd xmm3, dword [rsi + 4] + LONG $0x2261e3c4; WORD $0x145e; BYTE 
$0x01 // vpinsrd xmm3, xmm3, dword [rsi + 20], 1 + LONG $0x2261e3c4; WORD $0x245e; BYTE $0x02 // vpinsrd xmm3, xmm3, dword [rsi + 36], 2 + LONG $0x2261e3c4; WORD $0x3466; BYTE $0x03 // vpinsrd xmm4, xmm3, dword [rsi + 52], 3 + LONG $0x5e6ef9c5; BYTE $0x48 // vmovd xmm3, dword [rsi + 72] + LONG $0x2261e3c4; WORD $0x585e; BYTE $0x01 // vpinsrd xmm3, xmm3, dword [rsi + 88], 1 + LONG $0x2261e3c4; WORD $0x686e; BYTE $0x02 // vpinsrd xmm5, xmm3, dword [rsi + 104], 2 + LONG $0x3875e3c4; WORD $0x01d8 // vinserti128 ymm3, ymm1, xmm0, 1 + LONG $0x2251e3c4; WORD $0x7846; BYTE $0x03 // vpinsrd xmm0, xmm5, dword [rsi + 120], 3 + LONG $0x4e6ef9c5; BYTE $0x08 // vmovd xmm1, dword [rsi + 8] + LONG $0x2271e3c4; WORD $0x184e; BYTE $0x01 // vpinsrd xmm1, xmm1, dword [rsi + 24], 1 + LONG $0x2271e3c4; WORD $0x284e; BYTE $0x02 // vpinsrd xmm1, xmm1, dword [rsi + 40], 2 + LONG $0x385de3c4; WORD $0x01d2 // vinserti128 ymm2, ymm4, xmm2, 1 + LONG $0x2271e3c4; WORD $0x384e; BYTE $0x03 // vpinsrd xmm1, xmm1, dword [rsi + 56], 3 + LONG $0x666ef9c5; BYTE $0x4c // vmovd xmm4, dword [rsi + 76] + LONG $0x2259e3c4; WORD $0x5c66; BYTE $0x01 // vpinsrd xmm4, xmm4, dword [rsi + 92], 1 + LONG $0x2259e3c4; WORD $0x6c66; BYTE $0x02 // vpinsrd xmm4, xmm4, dword [rsi + 108], 2 + LONG $0x3875e3c4; WORD $0x01c8 // vinserti128 ymm1, ymm1, xmm0, 1 + LONG $0x2259e3c4; WORD $0x7c46; BYTE $0x03 // vpinsrd xmm0, xmm4, dword [rsi + 124], 3 + LONG $0x666ef9c5; BYTE $0x0c // vmovd xmm4, dword [rsi + 12] + LONG $0x2259e3c4; WORD $0x1c66; BYTE $0x01 // vpinsrd xmm4, xmm4, dword [rsi + 28], 1 + LONG $0x2259e3c4; WORD $0x2c66; BYTE $0x02 // vpinsrd xmm4, xmm4, dword [rsi + 44], 2 + LONG $0x2259e3c4; WORD $0x3c66; BYTE $0x03 // vpinsrd xmm4, xmm4, dword [rsi + 60], 3 + LONG $0x385de3c4; WORD $0x01c0 // vinserti128 ymm0, ymm4, xmm0, 1 + LONG $0x1df98348 // cmp rcx, 29 + JB LBB1_2 + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x0002e8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 744] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x0002ecaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 748] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0002f0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 752] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0002f4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 756] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x0002f8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 760] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0002fcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 764] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0002d0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 720] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0002d4a2587de2c4; 
BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 724] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x0002d8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 728] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0002dcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 732] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0002e0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 736] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0002e4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 740] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + QUAD $0x0002b8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 696] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0002bcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 700] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0002c0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 704] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0002c4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 708] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x0002c8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 712] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0002ccaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 716] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0002a0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 672] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0002a4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 676] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + QUAD $0x0002a8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 680] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0002acaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 684] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0002b0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 688] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, 
ymm0, ymm4 + QUAD $0x0002b4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 692] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + JMP LBB1_3 + +LBB1_2: + LONG $0x19f98348 // cmp rcx, 25 + JB LBB1_4 + +LBB1_3: + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x000288aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 648] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x00028caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 652] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000290a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 656] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000294a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 660] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x000298aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 664] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00029caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 668] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000270a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 624] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000274a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 628] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x000278aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 632] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00027caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 636] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000280a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 640] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000284a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 644] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + QUAD $0x000258aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 600] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00025caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 604] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000260a2587de2c4; BYTE $0x00 
// vpbroadcastd ymm4, dword [rdx + 608] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000264a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 612] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x000268aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 616] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00026caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 620] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000240a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 576] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000244a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 580] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + QUAD $0x000248aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 584] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00024caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 588] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000250a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 592] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000254a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 596] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + +LBB1_4: + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x000228aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 552] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe0efd5c5 // vpxor ymm4, ymm5, ymm0 + QUAD $0x00022caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 556] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000230a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 560] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000234a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 564] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x000238aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 568] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00023caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 572] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, 
ymm0, ymm4 + QUAD $0x000210a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 528] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000214a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 532] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x000218aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 536] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00021caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 540] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000220a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 544] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000224a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 548] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + QUAD $0x0001f8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 504] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0001fcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 508] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000200a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 512] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000204a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 516] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x000208aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 520] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00020caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 524] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0001e0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 480] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0001e4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 484] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + QUAD $0x0001e8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 488] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0001ecaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 492] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, 
ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0001f0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 496] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0001f4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 500] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x0001c8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 456] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0001ccaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 460] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0001d0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 464] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0001d4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 468] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x0001d8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 472] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0001dcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 476] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0001b0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 432] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0001b4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 436] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x0001b8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 440] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0001bcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 444] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0001c0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 448] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0001c4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 452] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + QUAD $0x000198aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 408] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00019caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 412] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG 
$0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0001a0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 416] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0001a4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 420] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x0001a8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 424] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0001acaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 428] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000180a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 384] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000184a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 388] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + QUAD $0x000188aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 392] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00018caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 396] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000190a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 400] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000194a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 404] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x000168aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 360] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00016caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 364] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000170a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 368] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000174a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 372] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x000178aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 376] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00017caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 380] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE 
$0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000150a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 336] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000154a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 340] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x000158aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 344] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00015caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 348] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000160a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 352] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000164a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 356] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + QUAD $0x000138aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 312] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00013caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 316] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000140a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 320] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000144a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 324] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x000148aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 328] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00014caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 332] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000120a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 288] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000124a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 292] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + QUAD $0x000128aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 296] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00012caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 300] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor 
ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000130a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 304] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000134a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 308] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x000108aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 264] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00010caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 268] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000110a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 272] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000114a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 276] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x000118aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 280] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x00011caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 284] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0000f0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 240] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0000f4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 244] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x0000f8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 248] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0000fcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 252] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x000100a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 256] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x000104a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 260] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + QUAD $0x0000d8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 216] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0000dcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 220] + LONG $0xccfaf5c5 // vpsubd ymm1, 
ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0000e0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 224] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0000e4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 228] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x0000e8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 232] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0000ecaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 236] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0000c0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 192] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0000c4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 196] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + QUAD $0x0000c8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 200] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0000ccaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 204] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x0000d0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 208] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x0000d4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 212] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + QUAD $0x0000a8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 168] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x0000acaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 172] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0000b0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 176] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0000b4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 180] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + QUAD $0x0000b8aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 184] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + QUAD $0x0000bcaa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 
188] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000090a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 144] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000094a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 148] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + QUAD $0x000098aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 152] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00009caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 156] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + QUAD $0x0000a0a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 160] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + QUAD $0x0000a4a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 164] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + LONG $0x587de2c4; WORD $0x786a // vpbroadcastd ymm5, dword [rdx + 120] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + LONG $0x587de2c4; WORD $0x7c6a // vpbroadcastd ymm5, dword [rdx + 124] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1b // vpsrld ymm4, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + QUAD $0x000080a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 128] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + QUAD $0x000084a2587de2c4; BYTE $0x00 // vpbroadcastd ymm4, dword [rdx + 132] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1d // vpsrld ymm4, ymm3, 29 + LONG $0xf372e5c5; BYTE $0x03 // vpslld ymm3, ymm3, 3 + QUAD $0x000088aa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 136] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + QUAD $0x00008caa587de2c4; BYTE $0x00 // vpbroadcastd ymm5, dword [rdx + 140] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x09 // vpsrld ymm4, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x6062 // vpbroadcastd ymm4, dword [rdx + 96] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x6462 // vpbroadcastd ymm4, dword [rdx + 100] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1b // vpsrld ymm4, ymm1, 27 + LONG $0xf172f5c5; BYTE $0x05 // vpslld ymm1, ymm1, 5 + LONG $0x587de2c4; WORD $0x686a // vpbroadcastd ymm5, dword [rdx + 104] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + LONG $0x587de2c4; WORD $0x6c6a // vpbroadcastd ymm5, dword [rdx + 
108] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x1d // vpsrld ymm4, ymm0, 29 + LONG $0xf072fdc5; BYTE $0x03 // vpslld ymm0, ymm0, 3 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0x587de2c4; WORD $0x7062 // vpbroadcastd ymm4, dword [rdx + 112] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + LONG $0x587de2c4; WORD $0x7462 // vpbroadcastd ymm4, dword [rdx + 116] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x09 // vpsrld ymm4, ymm3, 9 + LONG $0xf372e5c5; BYTE $0x17 // vpslld ymm3, ymm3, 23 + LONG $0x587de2c4; WORD $0x486a // vpbroadcastd ymm5, dword [rdx + 72] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + LONG $0x587de2c4; WORD $0x4c6a // vpbroadcastd ymm5, dword [rdx + 76] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1b // vpsrld ymm4, ymm2, 27 + LONG $0xf272edc5; BYTE $0x05 // vpslld ymm2, ymm2, 5 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x5062 // vpbroadcastd ymm4, dword [rdx + 80] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x5462 // vpbroadcastd ymm4, dword [rdx + 84] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x1d // vpsrld ymm4, ymm1, 29 + LONG $0xf172f5c5; BYTE $0x03 // vpslld ymm1, ymm1, 3 + LONG $0x587de2c4; WORD $0x586a // vpbroadcastd ymm5, dword [rdx + 88] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + LONG $0x587de2c4; WORD $0x5c6a // vpbroadcastd ymm5, dword [rdx + 92] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xcdeff5c5 // vpxor ymm1, ymm1, ymm5 + LONG $0xd072ddc5; BYTE $0x09 // vpsrld ymm4, ymm0, 9 + LONG $0xf072fdc5; BYTE $0x17 // vpslld ymm0, ymm0, 23 + LONG $0xc4ebfdc5 // vpor ymm0, ymm0, ymm4 + LONG $0x587de2c4; WORD $0x3062 // vpbroadcastd ymm4, dword [rdx + 48] + LONG $0xe4eff5c5 // vpxor ymm4, ymm1, ymm4 + LONG $0xc4fafdc5 // vpsubd ymm0, ymm0, ymm4 + LONG $0x587de2c4; WORD $0x3462 // vpbroadcastd ymm4, dword [rdx + 52] + LONG $0xc4effdc5 // vpxor ymm0, ymm0, ymm4 + LONG $0xd372ddc5; BYTE $0x1b // vpsrld ymm4, ymm3, 27 + LONG $0xf372e5c5; BYTE $0x05 // vpslld ymm3, ymm3, 5 + LONG $0x587de2c4; WORD $0x386a // vpbroadcastd ymm5, dword [rdx + 56] + LONG $0xdcebe5c5 // vpor ymm3, ymm3, ymm4 + LONG $0xe5effdc5 // vpxor ymm4, ymm0, ymm5 + LONG $0x587de2c4; WORD $0x3c6a // vpbroadcastd ymm5, dword [rdx + 60] + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0xddefe5c5 // vpxor ymm3, ymm3, ymm5 + LONG $0xd272ddc5; BYTE $0x1d // vpsrld ymm4, ymm2, 29 + LONG $0xf272edc5; BYTE $0x03 // vpslld ymm2, ymm2, 3 + LONG $0xd4ebedc5 // vpor ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x4062 // vpbroadcastd ymm4, dword [rdx + 64] + LONG $0xe4efe5c5 // vpxor ymm4, ymm3, ymm4 + LONG $0xd4faedc5 // vpsubd ymm2, ymm2, ymm4 + LONG $0x587de2c4; WORD $0x4462 // vpbroadcastd ymm4, dword [rdx + 68] + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd172ddc5; BYTE $0x09 // vpsrld ymm4, ymm1, 9 + LONG $0xf172f5c5; BYTE $0x17 // vpslld ymm1, ymm1, 23 + LONG $0x587de2c4; WORD $0x186a // vpbroadcastd ymm5, dword [rdx + 24] + LONG $0xccebf5c5 // vpor ymm1, ymm1, ymm4 + LONG $0xe5efedc5 // vpxor ymm4, ymm2, ymm5 + LONG $0x587de2c4; WORD $0x1c6a // vpbroadcastd ymm5, dword [rdx + 28] + LONG $0xccfaf5c5 // vpsubd ymm1, ymm1, ymm4 + LONG $0xe5eff5c5 // 
vpxor ymm4, ymm1, ymm5 + LONG $0xd072f5c5; BYTE $0x1b // vpsrld ymm1, ymm0, 27 + LONG $0xf072fdc5; BYTE $0x05 // vpslld ymm0, ymm0, 5 + LONG $0xc1ebfdc5 // vpor ymm0, ymm0, ymm1 + LONG $0x587de2c4; WORD $0x204a // vpbroadcastd ymm1, dword [rdx + 32] + LONG $0xc9efddc5 // vpxor ymm1, ymm4, ymm1 + LONG $0xc1fafdc5 // vpsubd ymm0, ymm0, ymm1 + LONG $0x587de2c4; WORD $0x244a // vpbroadcastd ymm1, dword [rdx + 36] + LONG $0xe9effdc5 // vpxor ymm5, ymm0, ymm1 + LONG $0xd372fdc5; BYTE $0x1d // vpsrld ymm0, ymm3, 29 + LONG $0xf372f5c5; BYTE $0x03 // vpslld ymm1, ymm3, 3 + LONG $0x587de2c4; WORD $0x285a // vpbroadcastd ymm3, dword [rdx + 40] + LONG $0xc0ebf5c5 // vpor ymm0, ymm1, ymm0 + LONG $0xcbefd5c5 // vpxor ymm1, ymm5, ymm3 + LONG $0x587de2c4; WORD $0x2c5a // vpbroadcastd ymm3, dword [rdx + 44] + LONG $0xc1fafdc5 // vpsubd ymm0, ymm0, ymm1 + LONG $0xc3effdc5 // vpxor ymm0, ymm0, ymm3 + LONG $0xd272f5c5; BYTE $0x09 // vpsrld ymm1, ymm2, 9 + LONG $0xf272edc5; BYTE $0x17 // vpslld ymm2, ymm2, 23 + LONG $0xc9ebedc5 // vpor ymm1, ymm2, ymm1 + LONG $0x587de2c4; BYTE $0x12 // vpbroadcastd ymm2, dword [rdx] + LONG $0xd2effdc5 // vpxor ymm2, ymm0, ymm2 + LONG $0xcafaf5c5 // vpsubd ymm1, ymm1, ymm2 + LONG $0x587de2c4; WORD $0x0452 // vpbroadcastd ymm2, dword [rdx + 4] + LONG $0xcaeff5c5 // vpxor ymm1, ymm1, ymm2 + LONG $0xd472edc5; BYTE $0x1b // vpsrld ymm2, ymm4, 27 + LONG $0xf472e5c5; BYTE $0x05 // vpslld ymm3, ymm4, 5 + LONG $0x587de2c4; WORD $0x0862 // vpbroadcastd ymm4, dword [rdx + 8] + LONG $0xd2ebe5c5 // vpor ymm2, ymm3, ymm2 + LONG $0xdceff5c5 // vpxor ymm3, ymm1, ymm4 + LONG $0x587de2c4; WORD $0x0c62 // vpbroadcastd ymm4, dword [rdx + 12] + LONG $0xd3faedc5 // vpsubd ymm2, ymm2, ymm3 + LONG $0xd4efedc5 // vpxor ymm2, ymm2, ymm4 + LONG $0xd572e5c5; BYTE $0x1d // vpsrld ymm3, ymm5, 29 + LONG $0xf572ddc5; BYTE $0x03 // vpslld ymm4, ymm5, 3 + LONG $0xdbebddc5 // vpor ymm3, ymm4, ymm3 + LONG $0x587de2c4; WORD $0x1062 // vpbroadcastd ymm4, dword [rdx + 16] + LONG $0xe4efedc5 // vpxor ymm4, ymm2, ymm4 + LONG $0xdcfae5c5 // vpsubd ymm3, ymm3, ymm4 + LONG $0x587de2c4; WORD $0x1462 // vpbroadcastd ymm4, dword [rdx + 20] + LONG $0xdcefe5c5 // vpxor ymm3, ymm3, ymm4 + LONG $0xe162f9c5 // vpunpckldq xmm4, xmm0, xmm1 + LONG $0xe870f9c5; BYTE $0x55 // vpshufd xmm5, xmm0, 85 + LONG $0x0251e3c4; WORD $0x02e9 // vpblendd xmm5, xmm5, xmm1, 2 + LONG $0xf270f9c5; BYTE $0x55 // vpshufd xmm6, xmm2, 85 + LONG $0x0249e3c4; WORD $0x02f3 // vpblendd xmm6, xmm6, xmm3, 2 + LONG $0x3855e3c4; WORD $0x01ee // vinserti128 ymm5, ymm5, xmm6, 1 + LONG $0xf362e9c5 // vpunpckldq xmm6, xmm2, xmm3 + LONG $0x385de3c4; WORD $0x01e6 // vinserti128 ymm4, ymm4, xmm6, 1 + LONG $0xe56cddc5 // vpunpcklqdq ymm4, ymm4, ymm5 + LONG $0x00fde3c4; WORD $0xd8e4 // vpermq ymm4, ymm4, 216 + LONG $0x277ffec5 // vmovdqu yword [rdi], ymm4 + LONG $0xe170f9c5; BYTE $0xee // vpshufd xmm4, xmm1, 238 + LONG $0xe96af9c5 // vpunpckhdq xmm5, xmm0, xmm1 + LONG $0xf36ae9c5 // vpunpckhdq xmm6, xmm2, xmm3 + LONG $0x3855e3c4; WORD $0x01ee // vinserti128 ymm5, ymm5, xmm6, 1 + LONG $0xf070f9c5; BYTE $0xff // vpshufd xmm6, xmm0, 255 + LONG $0x0249e3c4; WORD $0x02e4 // vpblendd xmm4, xmm6, xmm4, 2 + LONG $0xf370f9c5; BYTE $0xee // vpshufd xmm6, xmm3, 238 + LONG $0xfa70f9c5; BYTE $0xff // vpshufd xmm7, xmm2, 255 + LONG $0x0241e3c4; WORD $0x02f6 // vpblendd xmm6, xmm7, xmm6, 2 + LONG $0x385de3c4; WORD $0x01e6 // vinserti128 ymm4, ymm4, xmm6, 1 + LONG $0xe46cd5c5 // vpunpcklqdq ymm4, ymm5, ymm4 + LONG $0x00fde3c4; WORD $0xd8e4 // vpermq ymm4, ymm4, 216 + LONG $0x677ffec5; 
BYTE $0x20 // vmovdqu yword [rdi + 32], ymm4
+	LONG $0x397de3c4; WORD $0x01c9 // vextracti128 xmm1, ymm1, 1
+	LONG $0x397de3c4; WORD $0x01c0 // vextracti128 xmm0, ymm0, 1
+	LONG $0xe162f9c5 // vpunpckldq xmm4, xmm0, xmm1
+	LONG $0xe870f9c5; BYTE $0x55 // vpshufd xmm5, xmm0, 85
+	LONG $0x0251e3c4; WORD $0x02e9 // vpblendd xmm5, xmm5, xmm1, 2
+	LONG $0x397de3c4; WORD $0x01d2 // vextracti128 xmm2, ymm2, 1
+	LONG $0xf270f9c5; BYTE $0x55 // vpshufd xmm6, xmm2, 85
+	LONG $0x397de3c4; WORD $0x01db // vextracti128 xmm3, ymm3, 1
+	LONG $0x0249e3c4; WORD $0x02f3 // vpblendd xmm6, xmm6, xmm3, 2
+	LONG $0x3855e3c4; WORD $0x01ee // vinserti128 ymm5, ymm5, xmm6, 1
+	LONG $0xf362e9c5 // vpunpckldq xmm6, xmm2, xmm3
+	LONG $0x385de3c4; WORD $0x01e6 // vinserti128 ymm4, ymm4, xmm6, 1
+	LONG $0xe56cddc5 // vpunpcklqdq ymm4, ymm4, ymm5
+	LONG $0x00fde3c4; WORD $0xd8e4 // vpermq ymm4, ymm4, 216
+	LONG $0x677ffec5; BYTE $0x40 // vmovdqu yword [rdi + 64], ymm4
+	LONG $0xe170f9c5; BYTE $0xee // vpshufd xmm4, xmm1, 238
+	LONG $0xc96af9c5 // vpunpckhdq xmm1, xmm0, xmm1
+	LONG $0xeb6ae9c5 // vpunpckhdq xmm5, xmm2, xmm3
+	LONG $0x3875e3c4; WORD $0x01cd // vinserti128 ymm1, ymm1, xmm5, 1
+	LONG $0xc070f9c5; BYTE $0xff // vpshufd xmm0, xmm0, 255
+	LONG $0x0279e3c4; WORD $0x02c4 // vpblendd xmm0, xmm0, xmm4, 2
+	LONG $0xdb70f9c5; BYTE $0xee // vpshufd xmm3, xmm3, 238
+	LONG $0xd270f9c5; BYTE $0xff // vpshufd xmm2, xmm2, 255
+	LONG $0x0269e3c4; WORD $0x02d3 // vpblendd xmm2, xmm2, xmm3, 2
+	LONG $0x387de3c4; WORD $0x01c2 // vinserti128 ymm0, ymm0, xmm2, 1
+	LONG $0xc06cf5c5 // vpunpcklqdq ymm0, ymm1, ymm0
+	LONG $0x00fde3c4; WORD $0xd8c0 // vpermq ymm0, ymm0, 216
+	LONG $0x477ffec5; BYTE $0x60 // vmovdqu yword [rdi + 96], ymm0
+	VZEROUPPER
+	RET
diff --git a/lea/lea_amd64_sse2.s b/lea/lea_amd64_sse2.s
new file mode 100644
index 0000000..b370995
--- /dev/null
+++ b/lea/lea_amd64_sse2.s
@@ -0,0 +1,2213 @@
+//go:build amd64 && gc && !purego
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+TEXT ·__lea_encrypt_4block(SB), $0-32
+
+	MOVQ ct+0(FP), DI
+	MOVQ pt+8(FP), SI
+	MOVQ rk+16(FP), DX
+	MOVQ round+24(FP), CX
+
+	LONG $0x066f0ff3 // movdqu xmm0, oword [rsi]
+	LONG $0x4e6f0ff3; BYTE $0x10 // movdqu xmm1, oword [rsi + 16]
+	LONG $0x5e6f0ff3; BYTE $0x20 // movdqu xmm3, oword [rsi + 32]
+	LONG $0x6e6f0ff3; BYTE $0x30 // movdqu xmm5, oword [rsi + 48]
+	LONG $0xd06f0f66 // movdqa xmm2, xmm0
+	LONG $0xd1620f66 // punpckldq xmm2, xmm1
+	LONG $0xe36f0f66 // movdqa xmm4, xmm3
+	LONG $0xe5620f66 // punpckldq xmm4, xmm5
+	LONG $0xc16a0f66 // punpckhdq xmm0, xmm1
+	LONG $0xdd6a0f66 // punpckhdq xmm3, xmm5
+	LONG $0x4a6e0f66; BYTE $0x10 // movd xmm1, dword [rdx + 16]
+	LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0
+	LONG $0x6a6e0f66; BYTE $0x14 // movd xmm5, dword [rdx + 20]
+	LONG $0xed700f66; BYTE $0x00 // pshufd xmm5, xmm5, 0
+	LONG $0xf06f0f66 // movdqa xmm6, xmm0
+	LONG $0xf36c0f66 // punpcklqdq xmm6, xmm3
+	LONG $0xceef0f66 // pxor xmm1, xmm6
+	LONG $0xc36d0f66 // punpckhqdq xmm0, xmm3
+	LONG $0xc5ef0f66 // pxor xmm0, xmm5
+	LONG $0xc1fe0f66 // paddd xmm0, xmm1
+	LONG $0xc86f0f66 // movdqa xmm1, xmm0
+	LONG $0xd1720f66; BYTE $0x03 // psrld xmm1, 3
+	LONG $0xf0720f66; BYTE $0x1d // pslld xmm0, 29
+	LONG $0xc1eb0f66 // por xmm0, xmm1
+	LONG $0x4a6e0f66; BYTE $0x08 // movd xmm1, dword [rdx + 8]
+	LONG $0xd9700f66; BYTE $0x00 // pshufd xmm3, xmm1, 0
+	LONG $0x4a6e0f66; BYTE $0x0c // movd xmm1, dword [rdx + 12]
+	LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0
+	LONG $0xea6f0f66 // movdqa xmm5, xmm2
+	LONG $0xec6d0f66 //
punpckhqdq xmm5, xmm4 + LONG $0xddef0f66 // pxor xmm3, xmm5 + LONG $0xceef0f66 // pxor xmm1, xmm6 + LONG $0xcbfe0f66 // paddd xmm1, xmm3 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf1720f66; BYTE $0x1b // pslld xmm1, 27 + LONG $0xcbeb0f66 // por xmm1, xmm3 + LONG $0x1a6e0f66 // movd xmm3, dword [rdx] + LONG $0xf3700f66; BYTE $0x00 // pshufd xmm6, xmm3, 0 + LONG $0x5a6e0f66; BYTE $0x04 // movd xmm3, dword [rdx + 4] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xd46c0f66 // punpcklqdq xmm2, xmm4 + LONG $0xf2ef0f66 // pxor xmm6, xmm2 + LONG $0xddef0f66 // pxor xmm3, xmm5 + LONG $0xdefe0f66 // paddd xmm3, xmm6 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x28 // movd xmm4, dword [rdx + 40] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + LONG $0x626e0f66; BYTE $0x2c // movd xmm4, dword [rdx + 44] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe2eb0f66 // por xmm4, xmm2 + LONG $0x526e0f66; BYTE $0x20 // movd xmm2, dword [rdx + 32] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + LONG $0x526e0f66; BYTE $0x24 // movd xmm2, dword [rdx + 36] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd0eb0f66 // por xmm2, xmm0 + LONG $0x426e0f66; BYTE $0x18 // movd xmm0, dword [rdx + 24] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + LONG $0x426e0f66; BYTE $0x1c // movd xmm0, dword [rdx + 28] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23 + LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9 + LONG $0xc1eb0f66 // por xmm0, xmm1 + LONG $0x4a6e0f66; BYTE $0x40 // movd xmm1, dword [rdx + 64] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + LONG $0x4a6e0f66; BYTE $0x44 // movd xmm1, dword [rdx + 68] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x03 // psrld xmm3, 3 + LONG $0xf1720f66; BYTE $0x1d // pslld xmm1, 29 + LONG $0xcbeb0f66 // por xmm1, xmm3 + LONG $0x5a6e0f66; BYTE $0x38 // movd xmm3, dword [rdx + 56] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + LONG $0x5a6e0f66; BYTE $0x3c // movd xmm3, dword [rdx + 60] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x05 // psrld xmm4, 5 + LONG $0xf3720f66; BYTE $0x1b // pslld xmm3, 27 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x30 // movd xmm4, dword [rdx + 48] + LONG 
$0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + LONG $0x626e0f66; BYTE $0x34 // movd xmm4, dword [rdx + 52] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x17 // psrld xmm2, 23 + LONG $0xf4720f66; BYTE $0x09 // pslld xmm4, 9 + LONG $0xe2eb0f66 // por xmm4, xmm2 + LONG $0x526e0f66; BYTE $0x58 // movd xmm2, dword [rdx + 88] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + LONG $0x526e0f66; BYTE $0x5c // movd xmm2, dword [rdx + 92] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf2720f66; BYTE $0x1d // pslld xmm2, 29 + LONG $0xd0eb0f66 // por xmm2, xmm0 + LONG $0x426e0f66; BYTE $0x50 // movd xmm0, dword [rdx + 80] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + LONG $0x426e0f66; BYTE $0x54 // movd xmm0, dword [rdx + 84] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x05 // psrld xmm1, 5 + LONG $0xf0720f66; BYTE $0x1b // pslld xmm0, 27 + LONG $0xc1eb0f66 // por xmm0, xmm1 + LONG $0x4a6e0f66; BYTE $0x48 // movd xmm1, dword [rdx + 72] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + LONG $0x4a6e0f66; BYTE $0x4c // movd xmm1, dword [rdx + 76] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x17 // psrld xmm3, 23 + LONG $0xf1720f66; BYTE $0x09 // pslld xmm1, 9 + LONG $0xcbeb0f66 // por xmm1, xmm3 + LONG $0x5a6e0f66; BYTE $0x70 // movd xmm3, dword [rdx + 112] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + LONG $0x5a6e0f66; BYTE $0x74 // movd xmm3, dword [rdx + 116] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x03 // psrld xmm4, 3 + LONG $0xf3720f66; BYTE $0x1d // pslld xmm3, 29 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x68 // movd xmm4, dword [rdx + 104] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + LONG $0x626e0f66; BYTE $0x6c // movd xmm4, dword [rdx + 108] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x05 // psrld xmm2, 5 + LONG $0xf4720f66; BYTE $0x1b // pslld xmm4, 27 + LONG $0xe2eb0f66 // por xmm4, xmm2 + LONG $0x526e0f66; BYTE $0x60 // movd xmm2, dword [rdx + 96] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + LONG $0x526e0f66; BYTE $0x64 // movd xmm2, dword [rdx + 100] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf2720f66; BYTE $0x09 // pslld 
xmm2, 9 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000088826e0f66 // movd xmm0, dword [rdx + 136] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x0000008c826e0f66 // movd xmm0, dword [rdx + 140] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x03 // psrld xmm1, 3 + LONG $0xf0720f66; BYTE $0x1d // pslld xmm0, 29 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000808a6e0f66 // movd xmm1, dword [rdx + 128] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000000848a6e0f66 // movd xmm1, dword [rdx + 132] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf1720f66; BYTE $0x1b // pslld xmm1, 27 + LONG $0xcbeb0f66 // por xmm1, xmm3 + LONG $0x5a6e0f66; BYTE $0x78 // movd xmm3, dword [rdx + 120] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + LONG $0x5a6e0f66; BYTE $0x7c // movd xmm3, dword [rdx + 124] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000000a0a26e0f66 // movd xmm4, dword [rdx + 160] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x000000a4a26e0f66 // movd xmm4, dword [rdx + 164] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x00000098926e0f66 // movd xmm2, dword [rdx + 152] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x0000009c926e0f66 // movd xmm2, dword [rdx + 156] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000090826e0f66 // movd xmm0, dword [rdx + 144] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x00000094826e0f66 // movd xmm0, dword [rdx + 148] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23 + LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000b88a6e0f66 // movd xmm1, dword [rdx + 184] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000000bc8a6e0f66 // movd xmm1, dword [rdx + 188] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG 
$0xd3720f66; BYTE $0x03 // psrld xmm3, 3 + LONG $0xf1720f66; BYTE $0x1d // pslld xmm1, 29 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000000b09a6e0f66 // movd xmm3, dword [rdx + 176] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000000b49a6e0f66 // movd xmm3, dword [rdx + 180] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x05 // psrld xmm4, 5 + LONG $0xf3720f66; BYTE $0x1b // pslld xmm3, 27 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000000a8a26e0f66 // movd xmm4, dword [rdx + 168] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x000000aca26e0f66 // movd xmm4, dword [rdx + 172] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x17 // psrld xmm2, 23 + LONG $0xf4720f66; BYTE $0x09 // pslld xmm4, 9 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x000000d0926e0f66 // movd xmm2, dword [rdx + 208] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x000000d4926e0f66 // movd xmm2, dword [rdx + 212] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf2720f66; BYTE $0x1d // pslld xmm2, 29 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x000000c8826e0f66 // movd xmm0, dword [rdx + 200] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x000000cc826e0f66 // movd xmm0, dword [rdx + 204] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x05 // psrld xmm1, 5 + LONG $0xf0720f66; BYTE $0x1b // pslld xmm0, 27 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000c08a6e0f66 // movd xmm1, dword [rdx + 192] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000000c48a6e0f66 // movd xmm1, dword [rdx + 196] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x17 // psrld xmm3, 23 + LONG $0xf1720f66; BYTE $0x09 // pslld xmm1, 9 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000000e89a6e0f66 // movd xmm3, dword [rdx + 232] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000000ec9a6e0f66 // movd xmm3, dword [rdx + 236] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x03 // psrld xmm4, 3 + LONG $0xf3720f66; BYTE $0x1d // pslld xmm3, 29 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000000e0a26e0f66 // movd xmm4, dword [rdx + 224] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x000000e4a26e0f66 // movd xmm4, dword [rdx + 228] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // 
paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x05 // psrld xmm2, 5 + LONG $0xf4720f66; BYTE $0x1b // pslld xmm4, 27 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x000000d8926e0f66 // movd xmm2, dword [rdx + 216] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x000000dc926e0f66 // movd xmm2, dword [rdx + 220] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf2720f66; BYTE $0x09 // pslld xmm2, 9 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000100826e0f66 // movd xmm0, dword [rdx + 256] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x00000104826e0f66 // movd xmm0, dword [rdx + 260] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x03 // psrld xmm1, 3 + LONG $0xf0720f66; BYTE $0x1d // pslld xmm0, 29 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000f88a6e0f66 // movd xmm1, dword [rdx + 248] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000000fc8a6e0f66 // movd xmm1, dword [rdx + 252] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf1720f66; BYTE $0x1b // pslld xmm1, 27 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000000f09a6e0f66 // movd xmm3, dword [rdx + 240] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000000f49a6e0f66 // movd xmm3, dword [rdx + 244] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000118a26e0f66 // movd xmm4, dword [rdx + 280] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x0000011ca26e0f66 // movd xmm4, dword [rdx + 284] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x00000110926e0f66 // movd xmm2, dword [rdx + 272] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x00000114926e0f66 // movd xmm2, dword [rdx + 276] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000108826e0f66 // movd xmm0, dword [rdx + 264] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x0000010c826e0f66 // movd xmm0, dword [rdx + 268] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, 
xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23 + LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001308a6e0f66 // movd xmm1, dword [rdx + 304] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000001348a6e0f66 // movd xmm1, dword [rdx + 308] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x03 // psrld xmm3, 3 + LONG $0xf1720f66; BYTE $0x1d // pslld xmm1, 29 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000001289a6e0f66 // movd xmm3, dword [rdx + 296] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x0000012c9a6e0f66 // movd xmm3, dword [rdx + 300] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x05 // psrld xmm4, 5 + LONG $0xf3720f66; BYTE $0x1b // pslld xmm3, 27 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000120a26e0f66 // movd xmm4, dword [rdx + 288] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x00000124a26e0f66 // movd xmm4, dword [rdx + 292] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x17 // psrld xmm2, 23 + LONG $0xf4720f66; BYTE $0x09 // pslld xmm4, 9 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x00000148926e0f66 // movd xmm2, dword [rdx + 328] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x0000014c926e0f66 // movd xmm2, dword [rdx + 332] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf2720f66; BYTE $0x1d // pslld xmm2, 29 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000140826e0f66 // movd xmm0, dword [rdx + 320] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x00000144826e0f66 // movd xmm0, dword [rdx + 324] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x05 // psrld xmm1, 5 + LONG $0xf0720f66; BYTE $0x1b // pslld xmm0, 27 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001388a6e0f66 // movd xmm1, dword [rdx + 312] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x0000013c8a6e0f66 // movd xmm1, dword [rdx + 316] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x17 // psrld xmm3, 23 + LONG $0xf1720f66; BYTE $0x09 // pslld xmm1, 9 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000001609a6e0f66 // movd xmm3, dword [rdx + 352] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000001649a6e0f66 // movd xmm3, 
dword [rdx + 356] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x03 // psrld xmm4, 3 + LONG $0xf3720f66; BYTE $0x1d // pslld xmm3, 29 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000158a26e0f66 // movd xmm4, dword [rdx + 344] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x0000015ca26e0f66 // movd xmm4, dword [rdx + 348] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x05 // psrld xmm2, 5 + LONG $0xf4720f66; BYTE $0x1b // pslld xmm4, 27 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x00000150926e0f66 // movd xmm2, dword [rdx + 336] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x00000154926e0f66 // movd xmm2, dword [rdx + 340] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf2720f66; BYTE $0x09 // pslld xmm2, 9 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000178826e0f66 // movd xmm0, dword [rdx + 376] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x0000017c826e0f66 // movd xmm0, dword [rdx + 380] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x03 // psrld xmm1, 3 + LONG $0xf0720f66; BYTE $0x1d // pslld xmm0, 29 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001708a6e0f66 // movd xmm1, dword [rdx + 368] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000001748a6e0f66 // movd xmm1, dword [rdx + 372] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf1720f66; BYTE $0x1b // pslld xmm1, 27 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000001689a6e0f66 // movd xmm3, dword [rdx + 360] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x0000016c9a6e0f66 // movd xmm3, dword [rdx + 364] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000190a26e0f66 // movd xmm4, dword [rdx + 400] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x00000194a26e0f66 // movd xmm4, dword [rdx + 404] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x00000188926e0f66 // movd xmm2, dword [rdx + 392] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG 
$0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x0000018c926e0f66 // movd xmm2, dword [rdx + 396] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x00000180826e0f66 // movd xmm0, dword [rdx + 384] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x00000184826e0f66 // movd xmm0, dword [rdx + 388] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23 + LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001a88a6e0f66 // movd xmm1, dword [rdx + 424] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000001ac8a6e0f66 // movd xmm1, dword [rdx + 428] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x03 // psrld xmm3, 3 + LONG $0xf1720f66; BYTE $0x1d // pslld xmm1, 29 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000001a09a6e0f66 // movd xmm3, dword [rdx + 416] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000001a49a6e0f66 // movd xmm3, dword [rdx + 420] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x05 // psrld xmm4, 5 + LONG $0xf3720f66; BYTE $0x1b // pslld xmm3, 27 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000198a26e0f66 // movd xmm4, dword [rdx + 408] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x0000019ca26e0f66 // movd xmm4, dword [rdx + 412] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x17 // psrld xmm2, 23 + LONG $0xf4720f66; BYTE $0x09 // pslld xmm4, 9 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x000001c0926e0f66 // movd xmm2, dword [rdx + 448] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x000001c4926e0f66 // movd xmm2, dword [rdx + 452] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf2720f66; BYTE $0x1d // pslld xmm2, 29 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x000001b8826e0f66 // movd xmm0, dword [rdx + 440] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x000001bc826e0f66 // movd xmm0, dword [rdx + 444] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x05 // psrld xmm1, 5 + LONG $0xf0720f66; BYTE $0x1b // pslld xmm0, 27 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001b08a6e0f66 // movd xmm1, dword [rdx + 
432] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000001b48a6e0f66 // movd xmm1, dword [rdx + 436] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x17 // psrld xmm3, 23 + LONG $0xf1720f66; BYTE $0x09 // pslld xmm1, 9 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000001d89a6e0f66 // movd xmm3, dword [rdx + 472] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000001dc9a6e0f66 // movd xmm3, dword [rdx + 476] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x03 // psrld xmm4, 3 + LONG $0xf3720f66; BYTE $0x1d // pslld xmm3, 29 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000001d0a26e0f66 // movd xmm4, dword [rdx + 464] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x000001d4a26e0f66 // movd xmm4, dword [rdx + 468] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x05 // psrld xmm2, 5 + LONG $0xf4720f66; BYTE $0x1b // pslld xmm4, 27 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x000001c8926e0f66 // movd xmm2, dword [rdx + 456] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x000001cc926e0f66 // movd xmm2, dword [rdx + 460] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf2720f66; BYTE $0x09 // pslld xmm2, 9 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x000001f0826e0f66 // movd xmm0, dword [rdx + 496] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x000001f4826e0f66 // movd xmm0, dword [rdx + 500] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x03 // psrld xmm1, 3 + LONG $0xf0720f66; BYTE $0x1d // pslld xmm0, 29 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001e88a6e0f66 // movd xmm1, dword [rdx + 488] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000001ec8a6e0f66 // movd xmm1, dword [rdx + 492] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcbef0f66 // pxor xmm1, xmm3 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd96f0f66 // movdqa xmm3, xmm1 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf1720f66; BYTE $0x1b // pslld xmm1, 27 + LONG $0xcbeb0f66 // por xmm1, xmm3 + QUAD $0x000001e09a6e0f66 // movd xmm3, dword [rdx + 480] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000001e49a6e0f66 // movd xmm3, dword [rdx + 484] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por 
xmm3, xmm4 + QUAD $0x00000208a26e0f66 // movd xmm4, dword [rdx + 520] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x0000020ca26e0f66 // movd xmm4, dword [rdx + 524] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe2ef0f66 // pxor xmm4, xmm2 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xd46f0f66 // movdqa xmm2, xmm4 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe2eb0f66 // por xmm4, xmm2 + QUAD $0x00000200926e0f66 // movd xmm2, dword [rdx + 512] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x00000204926e0f66 // movd xmm2, dword [rdx + 516] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd0ef0f66 // pxor xmm2, xmm0 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd0eb0f66 // por xmm2, xmm0 + QUAD $0x000001f8826e0f66 // movd xmm0, dword [rdx + 504] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc3ef0f66 // pxor xmm0, xmm3 + QUAD $0x000001fcaa6e0f66 // movd xmm5, dword [rdx + 508] + LONG $0xed700f66; BYTE $0x00 // pshufd xmm5, xmm5, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + LONG $0xe8fe0f66 // paddd xmm5, xmm0 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf5720f66; BYTE $0x09 // pslld xmm5, 9 + LONG $0xe8eb0f66 // por xmm5, xmm0 + QUAD $0x00000220826e0f66 // movd xmm0, dword [rdx + 544] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + QUAD $0x000002248a6e0f66 // movd xmm1, dword [rdx + 548] + LONG $0xf1700f66; BYTE $0x00 // pshufd xmm6, xmm1, 0 + LONG $0xf3ef0f66 // pxor xmm6, xmm3 + LONG $0xf0fe0f66 // paddd xmm6, xmm0 + LONG $0xc66f0f66 // movdqa xmm0, xmm6 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf6720f66; BYTE $0x1d // pslld xmm6, 29 + LONG $0xf0eb0f66 // por xmm6, xmm0 + QUAD $0x00000218826e0f66 // movd xmm0, dword [rdx + 536] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, xmm2 + QUAD $0x0000021c8a6e0f66 // movd xmm1, dword [rdx + 540] + LONG $0xf9700f66; BYTE $0x00 // pshufd xmm7, xmm1, 0 + LONG $0xfcef0f66 // pxor xmm7, xmm4 + LONG $0xf8fe0f66 // paddd xmm7, xmm0 + LONG $0xc76f0f66 // movdqa xmm0, xmm7 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf7720f66; BYTE $0x1b // pslld xmm7, 27 + LONG $0xf8eb0f66 // por xmm7, xmm0 + QUAD $0x00000210826e0f66 // movd xmm0, dword [rdx + 528] + LONG $0xc8700f66; BYTE $0x00 // pshufd xmm1, xmm0, 0 + LONG $0xcdef0f66 // pxor xmm1, xmm5 + QUAD $0x00000214826e0f66 // movd xmm0, dword [rdx + 532] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, xmm2 + LONG $0xc1fe0f66 // paddd xmm0, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23 + LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002388a6e0f66 // movd xmm1, dword [rdx + 568] + LONG $0xd1700f66; BYTE $0x00 // pshufd xmm2, xmm1, 0 + LONG $0xd6ef0f66 // pxor xmm2, xmm6 + QUAD $0x0000023c8a6e0f66 // movd xmm1, dword [rdx + 572] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcdef0f66 // pxor xmm1, xmm5 + LONG $0xcafe0f66 // paddd xmm1, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG 
$0xf1720f66; BYTE $0x1d // pslld xmm1, 29 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000230926e0f66 // movd xmm2, dword [rdx + 560] + LONG $0xda700f66; BYTE $0x00 // pshufd xmm3, xmm2, 0 + LONG $0xdfef0f66 // pxor xmm3, xmm7 + QUAD $0x00000234926e0f66 // movd xmm2, dword [rdx + 564] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd6ef0f66 // pxor xmm2, xmm6 + LONG $0xd3fe0f66 // paddd xmm2, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002289a6e0f66 // movd xmm3, dword [rdx + 552] + LONG $0xe3700f66; BYTE $0x00 // pshufd xmm4, xmm3, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + QUAD $0x0000022c9a6e0f66 // movd xmm3, dword [rdx + 556] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdfef0f66 // pxor xmm3, xmm7 + LONG $0xdcfe0f66 // paddd xmm3, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x19f98348 // cmp rcx, 25 + JB LBB0_3 + QUAD $0x00000250a26e0f66 // movd xmm4, dword [rdx + 592] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x00000254a26e0f66 // movd xmm4, dword [rdx + 596] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000248826e0f66 // movd xmm0, dword [rdx + 584] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x0000024c826e0f66 // movd xmm0, dword [rdx + 588] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x05 // psrld xmm1, 5 + LONG $0xf0720f66; BYTE $0x1b // pslld xmm0, 27 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002408a6e0f66 // movd xmm1, dword [rdx + 576] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x000002448a6e0f66 // movd xmm1, dword [rdx + 580] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x17 // psrld xmm2, 23 + LONG $0xf1720f66; BYTE $0x09 // pslld xmm1, 9 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000268926e0f66 // movd xmm2, dword [rdx + 616] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x0000026c926e0f66 // movd xmm2, dword [rdx + 620] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x03 // psrld xmm3, 3 + LONG $0xf2720f66; BYTE $0x1d // pslld xmm2, 29 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002609a6e0f66 // movd xmm3, dword [rdx + 608] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x000002649a6e0f66 // movd xmm3, dword [rdx + 612] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // 
paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x05 // psrld xmm4, 5 + LONG $0xf3720f66; BYTE $0x1b // pslld xmm3, 27 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000258a26e0f66 // movd xmm4, dword [rdx + 600] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x0000025ca26e0f66 // movd xmm4, dword [rdx + 604] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf4720f66; BYTE $0x09 // pslld xmm4, 9 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000280826e0f66 // movd xmm0, dword [rdx + 640] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, xmm2 + QUAD $0x00000284aa6e0f66 // movd xmm5, dword [rdx + 644] + LONG $0xed700f66; BYTE $0x00 // pshufd xmm5, xmm5, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + LONG $0xe8fe0f66 // paddd xmm5, xmm0 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf5720f66; BYTE $0x1d // pslld xmm5, 29 + LONG $0xe8eb0f66 // por xmm5, xmm0 + QUAD $0x00000278826e0f66 // movd xmm0, dword [rdx + 632] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc3ef0f66 // pxor xmm0, xmm3 + QUAD $0x0000027c8a6e0f66 // movd xmm1, dword [rdx + 636] + LONG $0xf1700f66; BYTE $0x00 // pshufd xmm6, xmm1, 0 + LONG $0xf2ef0f66 // pxor xmm6, xmm2 + LONG $0xf0fe0f66 // paddd xmm6, xmm0 + LONG $0xc66f0f66 // movdqa xmm0, xmm6 + LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5 + LONG $0xf6720f66; BYTE $0x1b // pslld xmm6, 27 + LONG $0xf0eb0f66 // por xmm6, xmm0 + QUAD $0x00000270826e0f66 // movd xmm0, dword [rdx + 624] + LONG $0xc8700f66; BYTE $0x00 // pshufd xmm1, xmm0, 0 + LONG $0xccef0f66 // pxor xmm1, xmm4 + QUAD $0x00000274826e0f66 // movd xmm0, dword [rdx + 628] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc3ef0f66 // pxor xmm0, xmm3 + LONG $0xc1fe0f66 // paddd xmm0, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23 + LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002988a6e0f66 // movd xmm1, dword [rdx + 664] + LONG $0xd1700f66; BYTE $0x00 // pshufd xmm2, xmm1, 0 + LONG $0xd5ef0f66 // pxor xmm2, xmm5 + QUAD $0x0000029c8a6e0f66 // movd xmm1, dword [rdx + 668] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xccef0f66 // pxor xmm1, xmm4 + LONG $0xcafe0f66 // paddd xmm1, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3 + LONG $0xf1720f66; BYTE $0x1d // pslld xmm1, 29 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000290926e0f66 // movd xmm2, dword [rdx + 656] + LONG $0xda700f66; BYTE $0x00 // pshufd xmm3, xmm2, 0 + LONG $0xdeef0f66 // pxor xmm3, xmm6 + QUAD $0x00000294926e0f66 // movd xmm2, dword [rdx + 660] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd5ef0f66 // pxor xmm2, xmm5 + LONG $0xd3fe0f66 // paddd xmm2, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5 + LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002889a6e0f66 // movd xmm3, dword [rdx + 648] + LONG $0xe3700f66; BYTE $0x00 // pshufd xmm4, xmm3, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + QUAD $0x0000028c9a6e0f66 // movd xmm3, dword [rdx + 652] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, 
xmm3, 0 + LONG $0xdeef0f66 // pxor xmm3, xmm6 + LONG $0xdcfe0f66 // paddd xmm3, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23 + LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x1df98348 // cmp rcx, 29 + JB LBB0_3 + QUAD $0x000002b0a26e0f66 // movd xmm4, dword [rdx + 688] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x000002b4a26e0f66 // movd xmm4, dword [rdx + 692] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3 + LONG $0xf4720f66; BYTE $0x1d // pslld xmm4, 29 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000002a8826e0f66 // movd xmm0, dword [rdx + 680] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + QUAD $0x000002ac826e0f66 // movd xmm0, dword [rdx + 684] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xc5fe0f66 // paddd xmm0, xmm5 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x05 // psrld xmm1, 5 + LONG $0xf0720f66; BYTE $0x1b // pslld xmm0, 27 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002a08a6e0f66 // movd xmm1, dword [rdx + 672] + LONG $0xe9700f66; BYTE $0x00 // pshufd xmm5, xmm1, 0 + LONG $0xebef0f66 // pxor xmm5, xmm3 + QUAD $0x000002a48a6e0f66 // movd xmm1, dword [rdx + 676] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xcdfe0f66 // paddd xmm1, xmm5 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x17 // psrld xmm2, 23 + LONG $0xf1720f66; BYTE $0x09 // pslld xmm1, 9 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x000002c8926e0f66 // movd xmm2, dword [rdx + 712] + LONG $0xea700f66; BYTE $0x00 // pshufd xmm5, xmm2, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + QUAD $0x000002cc926e0f66 // movd xmm2, dword [rdx + 716] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xd5fe0f66 // paddd xmm2, xmm5 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x03 // psrld xmm3, 3 + LONG $0xf2720f66; BYTE $0x1d // pslld xmm2, 29 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002c09a6e0f66 // movd xmm3, dword [rdx + 704] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xe8ef0f66 // pxor xmm5, xmm0 + QUAD $0x000002c49a6e0f66 // movd xmm3, dword [rdx + 708] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xddfe0f66 // paddd xmm3, xmm5 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x05 // psrld xmm4, 5 + LONG $0xf3720f66; BYTE $0x1b // pslld xmm3, 27 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000002b8a26e0f66 // movd xmm4, dword [rdx + 696] + LONG $0xec700f66; BYTE $0x00 // pshufd xmm5, xmm4, 0 + LONG $0xe9ef0f66 // pxor xmm5, xmm1 + QUAD $0x000002bca26e0f66 // movd xmm4, dword [rdx + 700] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xe5fe0f66 // paddd xmm4, xmm5 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x17 // psrld xmm0, 23 + LONG $0xf4720f66; BYTE $0x09 // pslld xmm4, 9 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000002e0826e0f66 // movd xmm0, dword [rdx + 736] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, 
xmm2
+	QUAD $0x000002e4aa6e0f66 // movd xmm5, dword [rdx + 740]
+	LONG $0xed700f66; BYTE $0x00 // pshufd xmm5, xmm5, 0
+	LONG $0xe9ef0f66 // pxor xmm5, xmm1
+	LONG $0xe8fe0f66 // paddd xmm5, xmm0
+	LONG $0xc56f0f66 // movdqa xmm0, xmm5
+	LONG $0xd0720f66; BYTE $0x03 // psrld xmm0, 3
+	LONG $0xf5720f66; BYTE $0x1d // pslld xmm5, 29
+	LONG $0xe8eb0f66 // por xmm5, xmm0
+	QUAD $0x000002d8826e0f66 // movd xmm0, dword [rdx + 728]
+	LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+	LONG $0xc3ef0f66 // pxor xmm0, xmm3
+	QUAD $0x000002dc8a6e0f66 // movd xmm1, dword [rdx + 732]
+	LONG $0xf1700f66; BYTE $0x00 // pshufd xmm6, xmm1, 0
+	LONG $0xf2ef0f66 // pxor xmm6, xmm2
+	LONG $0xf0fe0f66 // paddd xmm6, xmm0
+	LONG $0xc66f0f66 // movdqa xmm0, xmm6
+	LONG $0xd0720f66; BYTE $0x05 // psrld xmm0, 5
+	LONG $0xf6720f66; BYTE $0x1b // pslld xmm6, 27
+	LONG $0xf0eb0f66 // por xmm6, xmm0
+	QUAD $0x000002d0826e0f66 // movd xmm0, dword [rdx + 720]
+	LONG $0xc8700f66; BYTE $0x00 // pshufd xmm1, xmm0, 0
+	LONG $0xccef0f66 // pxor xmm1, xmm4
+	QUAD $0x000002d4826e0f66 // movd xmm0, dword [rdx + 724]
+	LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+	LONG $0xc3ef0f66 // pxor xmm0, xmm3
+	LONG $0xc1fe0f66 // paddd xmm0, xmm1
+	LONG $0xc86f0f66 // movdqa xmm1, xmm0
+	LONG $0xd1720f66; BYTE $0x17 // psrld xmm1, 23
+	LONG $0xf0720f66; BYTE $0x09 // pslld xmm0, 9
+	LONG $0xc1eb0f66 // por xmm0, xmm1
+	QUAD $0x000002f88a6e0f66 // movd xmm1, dword [rdx + 760]
+	LONG $0xd1700f66; BYTE $0x00 // pshufd xmm2, xmm1, 0
+	LONG $0xd5ef0f66 // pxor xmm2, xmm5
+	QUAD $0x000002fc8a6e0f66 // movd xmm1, dword [rdx + 764]
+	LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0
+	LONG $0xccef0f66 // pxor xmm1, xmm4
+	LONG $0xcafe0f66 // paddd xmm1, xmm2
+	LONG $0xd16f0f66 // movdqa xmm2, xmm1
+	LONG $0xd2720f66; BYTE $0x03 // psrld xmm2, 3
+	LONG $0xf1720f66; BYTE $0x1d // pslld xmm1, 29
+	LONG $0xcaeb0f66 // por xmm1, xmm2
+	QUAD $0x000002f0926e0f66 // movd xmm2, dword [rdx + 752]
+	LONG $0xda700f66; BYTE $0x00 // pshufd xmm3, xmm2, 0
+	LONG $0xdeef0f66 // pxor xmm3, xmm6
+	QUAD $0x000002f4926e0f66 // movd xmm2, dword [rdx + 756]
+	LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0
+	LONG $0xd5ef0f66 // pxor xmm2, xmm5
+	LONG $0xd3fe0f66 // paddd xmm2, xmm3
+	LONG $0xda6f0f66 // movdqa xmm3, xmm2
+	LONG $0xd3720f66; BYTE $0x05 // psrld xmm3, 5
+	LONG $0xf2720f66; BYTE $0x1b // pslld xmm2, 27
+	LONG $0xd3eb0f66 // por xmm2, xmm3
+	QUAD $0x000002e89a6e0f66 // movd xmm3, dword [rdx + 744]
+	LONG $0xe3700f66; BYTE $0x00 // pshufd xmm4, xmm3, 0
+	LONG $0xe0ef0f66 // pxor xmm4, xmm0
+	QUAD $0x000002ec9a6e0f66 // movd xmm3, dword [rdx + 748]
+	LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0
+	LONG $0xdeef0f66 // pxor xmm3, xmm6
+	LONG $0xdcfe0f66 // paddd xmm3, xmm4
+	LONG $0xe36f0f66 // movdqa xmm4, xmm3
+	LONG $0xd4720f66; BYTE $0x17 // psrld xmm4, 23
+	LONG $0xf3720f66; BYTE $0x09 // pslld xmm3, 9
+	LONG $0xdceb0f66 // por xmm3, xmm4
+
+LBB0_3:
+	LONG $0xe36f0f66 // movdqa xmm4, xmm3
+	LONG $0xe2620f66 // punpckldq xmm4, xmm2
+	LONG $0xe96f0f66 // movdqa xmm5, xmm1
+	LONG $0xe8620f66 // punpckldq xmm5, xmm0
+	LONG $0xda6a0f66 // punpckhdq xmm3, xmm2
+	LONG $0xc86a0f66 // punpckhdq xmm1, xmm0
+	LONG $0xc46f0f66 // movdqa xmm0, xmm4
+	LONG $0xc56c0f66 // punpcklqdq xmm0, xmm5
+	LONG $0xe56d0f66 // punpckhqdq xmm4, xmm5
+	LONG $0xd36f0f66 // movdqa xmm2, xmm3
+	LONG $0xd16c0f66 // punpcklqdq xmm2, xmm1
+	LONG $0xd96d0f66 // punpckhqdq xmm3, xmm1
+	LONG $0x077f0ff3 // movdqu oword [rdi], xmm0
+	LONG $0x677f0ff3; BYTE $0x10 // movdqu oword [rdi + 16], xmm4
+	LONG $0x577f0ff3; BYTE $0x20 // movdqu oword [rdi + 32], xmm2
+	LONG $0x5f7f0ff3; BYTE $0x30 // movdqu oword [rdi + 48], xmm3
+	RET
+
+TEXT ·__lea_decrypt_4block(SB), $0-32
+
+	MOVQ pt+0(FP), DI
+	MOVQ ct+8(FP), SI
+	MOVQ rk+16(FP), DX
+	MOVQ round+24(FP), CX
+
+	LONG $0x066f0ff3 // movdqu xmm0, oword [rsi]
+	LONG $0x4e6f0ff3; BYTE $0x10 // movdqu xmm1, oword [rsi + 16]
+	LONG $0x666f0ff3; BYTE $0x20 // movdqu xmm4, oword [rsi + 32]
+	LONG $0x5e6f0ff3; BYTE $0x30 // movdqu xmm3, oword [rsi + 48]
+	LONG $0xd06f0f66 // movdqa xmm2, xmm0
+	LONG $0xd1620f66 // punpckldq xmm2, xmm1
+	LONG $0xec6f0f66 // movdqa xmm5, xmm4
+	LONG $0xeb620f66 // punpckldq xmm5, xmm3
+	LONG $0xc16a0f66 // punpckhdq xmm0, xmm1
+	LONG $0xe36a0f66 // punpckhdq xmm4, xmm3
+	LONG $0xda6f0f66 // movdqa xmm3, xmm2
+	LONG $0xdd6c0f66 // punpcklqdq xmm3, xmm5
+	LONG $0xd56d0f66 // punpckhqdq xmm2, xmm5
+	LONG $0xc86f0f66 // movdqa xmm1, xmm0
+	LONG $0xcc6c0f66 // punpcklqdq xmm1, xmm4
+	LONG $0xc46d0f66 // punpckhqdq xmm0, xmm4
+	LONG $0x1df98348 // cmp rcx, 29
+	JB LBB1_2
+	LONG $0xe36f0f66 // movdqa xmm4, xmm3
+	LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9
+	LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23
+	LONG $0xdceb0f66 // por xmm3, xmm4
+	QUAD $0x000002e8a26e0f66 // movd xmm4, dword [rdx + 744]
+	LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0
+	LONG $0xe0ef0f66 // pxor xmm4, xmm0
+	LONG $0xdcfa0f66 // psubd xmm3, xmm4
+	QUAD $0x000002eca26e0f66 // movd xmm4, dword [rdx + 748]
+	LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0
+	LONG $0xe3ef0f66 // pxor xmm4, xmm3
+	LONG $0xda6f0f66 // movdqa xmm3, xmm2
+	LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27
+	LONG $0xf2720f66; BYTE $0x05 // pslld xmm2, 5
+	LONG $0xd3eb0f66 // por xmm2, xmm3
+	QUAD $0x000002f09a6e0f66 // movd xmm3, dword [rdx + 752]
+	LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0
+	LONG $0xdcef0f66 // pxor xmm3, xmm4
+	LONG $0xd3fa0f66 // psubd xmm2, xmm3
+	QUAD $0x000002f49a6e0f66 // movd xmm3, dword [rdx + 756]
+	LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0
+	LONG $0xdaef0f66 // pxor xmm3, xmm2
+	LONG $0xd16f0f66 // movdqa xmm2, xmm1
+	LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29
+	LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3
+	LONG $0xcaeb0f66 // por xmm1, xmm2
+	QUAD $0x000002f8926e0f66 // movd xmm2, dword [rdx + 760]
+	LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0
+	LONG $0xd3ef0f66 // pxor xmm2, xmm3
+	LONG $0xcafa0f66 // psubd xmm1, xmm2
+	QUAD $0x000002fc926e0f66 // movd xmm2, dword [rdx + 764]
+	LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0
+	LONG $0xd1ef0f66 // pxor xmm2, xmm1
+	LONG $0xc86f0f66 // movdqa xmm1, xmm0
+	LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9
+	LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23
+	LONG $0xc1eb0f66 // por xmm0, xmm1
+	QUAD $0x000002d08a6e0f66 // movd xmm1, dword [rdx + 720]
+	LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0
+	LONG $0xcaef0f66 // pxor xmm1, xmm2
+	LONG $0xc1fa0f66 // psubd xmm0, xmm1
+	QUAD $0x000002d48a6e0f66 // movd xmm1, dword [rdx + 724]
+	LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0
+	LONG $0xc8ef0f66 // pxor xmm1, xmm0
+	LONG $0xc46f0f66 // movdqa xmm0, xmm4
+	LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27
+	LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5
+	LONG $0xe0eb0f66 // por xmm4, xmm0
+	QUAD $0x000002d8826e0f66 // movd xmm0, dword [rdx + 728]
+	LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+	LONG $0xc1ef0f66 // pxor xmm0, xmm1
+	LONG $0xe0fa0f66 // psubd xmm4, xmm0
+	QUAD $0x000002dc826e0f66 // movd
xmm0, dword [rdx + 732] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1d // psrld xmm4, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000002e0a26e0f66 // movd xmm4, dword [rdx + 736] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000002e4a26e0f66 // movd xmm4, dword [rdx + 740] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x09 // psrld xmm3, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002b89a6e0f66 // movd xmm3, dword [rdx + 696] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000002bc9a6e0f66 // movd xmm3, dword [rdx + 700] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1b // psrld xmm2, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x000002c0926e0f66 // movd xmm2, dword [rdx + 704] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd5ef0f66 // pxor xmm2, xmm5 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x000002c4926e0f66 // movd xmm2, dword [rdx + 708] + LONG $0xf2700f66; BYTE $0x00 // pshufd xmm6, xmm2, 0 + LONG $0xf1ef0f66 // pxor xmm6, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1d // psrld xmm1, 29 + LONG $0xf0720f66; BYTE $0x03 // pslld xmm0, 3 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002c88a6e0f66 // movd xmm1, dword [rdx + 712] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xceef0f66 // pxor xmm1, xmm6 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000002cc8a6e0f66 // movd xmm1, dword [rdx + 716] + LONG $0xd9700f66; BYTE $0x00 // pshufd xmm3, xmm1, 0 + LONG $0xd8ef0f66 // pxor xmm3, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000002a0826e0f66 // movd xmm0, dword [rdx + 672] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc3ef0f66 // pxor xmm0, xmm3 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000002a4826e0f66 // movd xmm0, dword [rdx + 676] + LONG $0xd0700f66; BYTE $0x00 // pshufd xmm2, xmm0, 0 + LONG $0xd4ef0f66 // pxor xmm2, xmm4 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf5720f66; BYTE $0x05 // pslld xmm5, 5 + LONG $0xe8eb0f66 // por xmm5, xmm0 + QUAD $0x000002a8826e0f66 // movd xmm0, dword [rdx + 680] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, xmm2 + LONG $0xe8fa0f66 // psubd xmm5, xmm0 + QUAD $0x000002ac826e0f66 // movd xmm0, dword [rdx + 684] + LONG $0xc8700f66; BYTE $0x00 // pshufd xmm1, xmm0, 0 + LONG $0xcdef0f66 // pxor xmm1, xmm5 + LONG $0xc66f0f66 // movdqa xmm0, xmm6 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf6720f66; BYTE $0x03 // pslld xmm6, 3 + LONG $0xf0eb0f66 // por xmm6, xmm0 + QUAD $0x000002b0826e0f66 // movd xmm0, dword [rdx + 688] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG 
$0xf0fa0f66 // psubd xmm6, xmm0 + QUAD $0x000002b4826e0f66 // movd xmm0, dword [rdx + 692] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc6ef0f66 // pxor xmm0, xmm6 + JMP LBB1_3 + +LBB1_2: + LONG $0x19f98348 // cmp rcx, 25 + JB LBB1_4 + +LBB1_3: + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9 + LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000288a26e0f66 // movd xmm4, dword [rdx + 648] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x0000028ca26e0f66 // movd xmm4, dword [rdx + 652] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27 + LONG $0xf2720f66; BYTE $0x05 // pslld xmm2, 5 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002909a6e0f66 // movd xmm3, dword [rdx + 656] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000002949a6e0f66 // movd xmm3, dword [rdx + 660] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29 + LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000298926e0f66 // movd xmm2, dword [rdx + 664] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x0000029c926e0f66 // movd xmm2, dword [rdx + 668] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002708a6e0f66 // movd xmm1, dword [rdx + 624] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000002748a6e0f66 // movd xmm1, dword [rdx + 628] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000278826e0f66 // movd xmm0, dword [rdx + 632] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x0000027c826e0f66 // movd xmm0, dword [rdx + 636] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1d // psrld xmm4, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000280a26e0f66 // movd xmm4, dword [rdx + 640] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x00000284a26e0f66 // movd xmm4, dword [rdx + 644] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x09 // psrld xmm3, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD 
$0x000002589a6e0f66 // movd xmm3, dword [rdx + 600] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x0000025c9a6e0f66 // movd xmm3, dword [rdx + 604] + LONG $0xeb700f66; BYTE $0x00 // pshufd xmm5, xmm3, 0 + LONG $0xeaef0f66 // pxor xmm5, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1b // psrld xmm2, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000260926e0f66 // movd xmm2, dword [rdx + 608] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd5ef0f66 // pxor xmm2, xmm5 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x00000264926e0f66 // movd xmm2, dword [rdx + 612] + LONG $0xf2700f66; BYTE $0x00 // pshufd xmm6, xmm2, 0 + LONG $0xf1ef0f66 // pxor xmm6, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1d // psrld xmm1, 29 + LONG $0xf0720f66; BYTE $0x03 // pslld xmm0, 3 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002688a6e0f66 // movd xmm1, dword [rdx + 616] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xceef0f66 // pxor xmm1, xmm6 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x0000026c8a6e0f66 // movd xmm1, dword [rdx + 620] + LONG $0xd9700f66; BYTE $0x00 // pshufd xmm3, xmm1, 0 + LONG $0xd8ef0f66 // pxor xmm3, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000240826e0f66 // movd xmm0, dword [rdx + 576] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc3ef0f66 // pxor xmm0, xmm3 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x00000244826e0f66 // movd xmm0, dword [rdx + 580] + LONG $0xd0700f66; BYTE $0x00 // pshufd xmm2, xmm0, 0 + LONG $0xd4ef0f66 // pxor xmm2, xmm4 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf5720f66; BYTE $0x05 // pslld xmm5, 5 + LONG $0xe8eb0f66 // por xmm5, xmm0 + QUAD $0x00000248826e0f66 // movd xmm0, dword [rdx + 584] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, xmm2 + LONG $0xe8fa0f66 // psubd xmm5, xmm0 + QUAD $0x0000024c826e0f66 // movd xmm0, dword [rdx + 588] + LONG $0xc8700f66; BYTE $0x00 // pshufd xmm1, xmm0, 0 + LONG $0xcdef0f66 // pxor xmm1, xmm5 + LONG $0xc66f0f66 // movdqa xmm0, xmm6 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf6720f66; BYTE $0x03 // pslld xmm6, 3 + LONG $0xf0eb0f66 // por xmm6, xmm0 + QUAD $0x00000250826e0f66 // movd xmm0, dword [rdx + 592] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xf0fa0f66 // psubd xmm6, xmm0 + QUAD $0x00000254826e0f66 // movd xmm0, dword [rdx + 596] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc6ef0f66 // pxor xmm0, xmm6 + +LBB1_4: + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9 + LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000228a26e0f66 // movd xmm4, dword [rdx + 552] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x0000022ca26e0f66 // movd xmm4, dword [rdx + 556] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27 + LONG 
$0xf2720f66; BYTE $0x05 // pslld xmm2, 5 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000002309a6e0f66 // movd xmm3, dword [rdx + 560] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000002349a6e0f66 // movd xmm3, dword [rdx + 564] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29 + LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000238926e0f66 // movd xmm2, dword [rdx + 568] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x0000023c926e0f66 // movd xmm2, dword [rdx + 572] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002108a6e0f66 // movd xmm1, dword [rdx + 528] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000002148a6e0f66 // movd xmm1, dword [rdx + 532] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000218826e0f66 // movd xmm0, dword [rdx + 536] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x0000021c826e0f66 // movd xmm0, dword [rdx + 540] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1d // psrld xmm4, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000220a26e0f66 // movd xmm4, dword [rdx + 544] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x00000224a26e0f66 // movd xmm4, dword [rdx + 548] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x09 // psrld xmm3, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001f89a6e0f66 // movd xmm3, dword [rdx + 504] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000001fc9a6e0f66 // movd xmm3, dword [rdx + 508] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1b // psrld xmm2, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000200926e0f66 // movd xmm2, dword [rdx + 512] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x00000204926e0f66 // movd xmm2, dword [rdx + 516] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa 
xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1d // psrld xmm1, 29 + LONG $0xf0720f66; BYTE $0x03 // pslld xmm0, 3 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000002088a6e0f66 // movd xmm1, dword [rdx + 520] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x0000020c8a6e0f66 // movd xmm1, dword [rdx + 524] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000001e0826e0f66 // movd xmm0, dword [rdx + 480] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000001e4826e0f66 // movd xmm0, dword [rdx + 484] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1b // psrld xmm4, 27 + LONG $0xf3720f66; BYTE $0x05 // pslld xmm3, 5 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000001e8a26e0f66 // movd xmm4, dword [rdx + 488] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000001eca26e0f66 // movd xmm4, dword [rdx + 492] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1d // psrld xmm3, 29 + LONG $0xf2720f66; BYTE $0x03 // pslld xmm2, 3 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001f09a6e0f66 // movd xmm3, dword [rdx + 496] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000001f49a6e0f66 // movd xmm3, dword [rdx + 500] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x09 // psrld xmm2, 9 + LONG $0xf1720f66; BYTE $0x17 // pslld xmm1, 23 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x000001c8926e0f66 // movd xmm2, dword [rdx + 456] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x000001cc926e0f66 // movd xmm2, dword [rdx + 460] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1b // psrld xmm1, 27 + LONG $0xf0720f66; BYTE $0x05 // pslld xmm0, 5 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001d08a6e0f66 // movd xmm1, dword [rdx + 464] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000001d48a6e0f66 // movd xmm1, dword [rdx + 468] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf4720f66; BYTE $0x03 // pslld xmm4, 3 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000001d8826e0f66 // movd xmm0, dword [rdx + 472] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000001dc826e0f66 // movd xmm0, dword [rdx + 476] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + 
LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9 + LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000001b0a26e0f66 // movd xmm4, dword [rdx + 432] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000001b4a26e0f66 // movd xmm4, dword [rdx + 436] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27 + LONG $0xf2720f66; BYTE $0x05 // pslld xmm2, 5 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001b89a6e0f66 // movd xmm3, dword [rdx + 440] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000001bc9a6e0f66 // movd xmm3, dword [rdx + 444] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29 + LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x000001c0926e0f66 // movd xmm2, dword [rdx + 448] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x000001c4926e0f66 // movd xmm2, dword [rdx + 452] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001988a6e0f66 // movd xmm1, dword [rdx + 408] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x0000019c8a6e0f66 // movd xmm1, dword [rdx + 412] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000001a0826e0f66 // movd xmm0, dword [rdx + 416] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000001a4826e0f66 // movd xmm0, dword [rdx + 420] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1d // psrld xmm4, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000001a8a26e0f66 // movd xmm4, dword [rdx + 424] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000001aca26e0f66 // movd xmm4, dword [rdx + 428] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x09 // psrld xmm3, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001809a6e0f66 // movd xmm3, dword [rdx + 384] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000001849a6e0f66 // movd xmm3, dword [rdx 
+ 388] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1b // psrld xmm2, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000188926e0f66 // movd xmm2, dword [rdx + 392] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x0000018c926e0f66 // movd xmm2, dword [rdx + 396] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1d // psrld xmm1, 29 + LONG $0xf0720f66; BYTE $0x03 // pslld xmm0, 3 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001908a6e0f66 // movd xmm1, dword [rdx + 400] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000001948a6e0f66 // movd xmm1, dword [rdx + 404] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000168826e0f66 // movd xmm0, dword [rdx + 360] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x0000016c826e0f66 // movd xmm0, dword [rdx + 364] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1b // psrld xmm4, 27 + LONG $0xf3720f66; BYTE $0x05 // pslld xmm3, 5 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000170a26e0f66 // movd xmm4, dword [rdx + 368] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x00000174a26e0f66 // movd xmm4, dword [rdx + 372] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1d // psrld xmm3, 29 + LONG $0xf2720f66; BYTE $0x03 // pslld xmm2, 3 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001789a6e0f66 // movd xmm3, dword [rdx + 376] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x0000017c9a6e0f66 // movd xmm3, dword [rdx + 380] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x09 // psrld xmm2, 9 + LONG $0xf1720f66; BYTE $0x17 // pslld xmm1, 23 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000150926e0f66 // movd xmm2, dword [rdx + 336] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x00000154926e0f66 // movd xmm2, dword [rdx + 340] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1b // psrld xmm1, 27 + LONG $0xf0720f66; BYTE $0x05 // pslld xmm0, 5 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001588a6e0f66 // movd xmm1, dword [rdx + 344] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // 
psubd xmm0, xmm1 + QUAD $0x0000015c8a6e0f66 // movd xmm1, dword [rdx + 348] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf4720f66; BYTE $0x03 // pslld xmm4, 3 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000160826e0f66 // movd xmm0, dword [rdx + 352] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x00000164826e0f66 // movd xmm0, dword [rdx + 356] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9 + LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000138a26e0f66 // movd xmm4, dword [rdx + 312] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x0000013ca26e0f66 // movd xmm4, dword [rdx + 316] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27 + LONG $0xf2720f66; BYTE $0x05 // pslld xmm2, 5 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001409a6e0f66 // movd xmm3, dword [rdx + 320] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000001449a6e0f66 // movd xmm3, dword [rdx + 324] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29 + LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000148926e0f66 // movd xmm2, dword [rdx + 328] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x0000014c926e0f66 // movd xmm2, dword [rdx + 332] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001208a6e0f66 // movd xmm1, dword [rdx + 288] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000001248a6e0f66 // movd xmm1, dword [rdx + 292] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x00000128826e0f66 // movd xmm0, dword [rdx + 296] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x0000012c826e0f66 // movd xmm0, dword [rdx + 300] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1d // psrld xmm4, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000130a26e0f66 // movd xmm4, dword [rdx + 304] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, 
xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x00000134a26e0f66 // movd xmm4, dword [rdx + 308] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x09 // psrld xmm3, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001089a6e0f66 // movd xmm3, dword [rdx + 264] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x0000010c9a6e0f66 // movd xmm3, dword [rdx + 268] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1b // psrld xmm2, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000110926e0f66 // movd xmm2, dword [rdx + 272] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x00000114926e0f66 // movd xmm2, dword [rdx + 276] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1d // psrld xmm1, 29 + LONG $0xf0720f66; BYTE $0x03 // pslld xmm0, 3 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000001188a6e0f66 // movd xmm1, dword [rdx + 280] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x0000011c8a6e0f66 // movd xmm1, dword [rdx + 284] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000000f0826e0f66 // movd xmm0, dword [rdx + 240] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000000f4826e0f66 // movd xmm0, dword [rdx + 244] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1b // psrld xmm4, 27 + LONG $0xf3720f66; BYTE $0x05 // pslld xmm3, 5 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000000f8a26e0f66 // movd xmm4, dword [rdx + 248] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000000fca26e0f66 // movd xmm4, dword [rdx + 252] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1d // psrld xmm3, 29 + LONG $0xf2720f66; BYTE $0x03 // pslld xmm2, 3 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000001009a6e0f66 // movd xmm3, dword [rdx + 256] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000001049a6e0f66 // movd xmm3, dword [rdx + 260] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x09 // psrld xmm2, 9 + LONG $0xf1720f66; BYTE $0x17 // pslld xmm1, 23 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x000000d8926e0f66 // movd xmm2, 
dword [rdx + 216] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x000000dc926e0f66 // movd xmm2, dword [rdx + 220] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1b // psrld xmm1, 27 + LONG $0xf0720f66; BYTE $0x05 // pslld xmm0, 5 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000e08a6e0f66 // movd xmm1, dword [rdx + 224] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000000e48a6e0f66 // movd xmm1, dword [rdx + 228] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf4720f66; BYTE $0x03 // pslld xmm4, 3 + LONG $0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000000e8826e0f66 // movd xmm0, dword [rdx + 232] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000000ec826e0f66 // movd xmm0, dword [rdx + 236] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9 + LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000000c0a26e0f66 // movd xmm4, dword [rdx + 192] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000000c4a26e0f66 // movd xmm4, dword [rdx + 196] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27 + LONG $0xf2720f66; BYTE $0x05 // pslld xmm2, 5 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000000c89a6e0f66 // movd xmm3, dword [rdx + 200] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000000cc9a6e0f66 // movd xmm3, dword [rdx + 204] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29 + LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x000000d0926e0f66 // movd xmm2, dword [rdx + 208] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x000000d4926e0f66 // movd xmm2, dword [rdx + 212] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000a88a6e0f66 // movd xmm1, dword [rdx + 168] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000000ac8a6e0f66 // movd xmm1, dword [rdx + 172] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5 + LONG 
$0xe0eb0f66 // por xmm4, xmm0 + QUAD $0x000000b0826e0f66 // movd xmm0, dword [rdx + 176] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + QUAD $0x000000b4826e0f66 // movd xmm0, dword [rdx + 180] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1d // psrld xmm4, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x000000b8a26e0f66 // movd xmm4, dword [rdx + 184] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x000000bca26e0f66 // movd xmm4, dword [rdx + 188] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x09 // psrld xmm3, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000000909a6e0f66 // movd xmm3, dword [rdx + 144] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x000000949a6e0f66 // movd xmm3, dword [rdx + 148] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1b // psrld xmm2, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xcaeb0f66 // por xmm1, xmm2 + QUAD $0x00000098926e0f66 // movd xmm2, dword [rdx + 152] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + QUAD $0x0000009c926e0f66 // movd xmm2, dword [rdx + 156] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1d // psrld xmm1, 29 + LONG $0xf0720f66; BYTE $0x03 // pslld xmm0, 3 + LONG $0xc1eb0f66 // por xmm0, xmm1 + QUAD $0x000000a08a6e0f66 // movd xmm1, dword [rdx + 160] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + QUAD $0x000000a48a6e0f66 // movd xmm1, dword [rdx + 164] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe0eb0f66 // por xmm4, xmm0 + LONG $0x426e0f66; BYTE $0x78 // movd xmm0, dword [rdx + 120] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + LONG $0x426e0f66; BYTE $0x7c // movd xmm0, dword [rdx + 124] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1b // psrld xmm4, 27 + LONG $0xf3720f66; BYTE $0x05 // pslld xmm3, 5 + LONG $0xdceb0f66 // por xmm3, xmm4 + QUAD $0x00000080a26e0f66 // movd xmm4, dword [rdx + 128] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + QUAD $0x00000084a26e0f66 // movd xmm4, dword [rdx + 132] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1d 
// psrld xmm3, 29 + LONG $0xf2720f66; BYTE $0x03 // pslld xmm2, 3 + LONG $0xd3eb0f66 // por xmm2, xmm3 + QUAD $0x000000889a6e0f66 // movd xmm3, dword [rdx + 136] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + QUAD $0x0000008c9a6e0f66 // movd xmm3, dword [rdx + 140] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x09 // psrld xmm2, 9 + LONG $0xf1720f66; BYTE $0x17 // pslld xmm1, 23 + LONG $0xcaeb0f66 // por xmm1, xmm2 + LONG $0x526e0f66; BYTE $0x60 // movd xmm2, dword [rdx + 96] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + LONG $0x526e0f66; BYTE $0x64 // movd xmm2, dword [rdx + 100] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x1b // psrld xmm1, 27 + LONG $0xf0720f66; BYTE $0x05 // pslld xmm0, 5 + LONG $0xc1eb0f66 // por xmm0, xmm1 + LONG $0x4a6e0f66; BYTE $0x68 // movd xmm1, dword [rdx + 104] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + LONG $0x4a6e0f66; BYTE $0x6c // movd xmm1, dword [rdx + 108] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf4720f66; BYTE $0x03 // pslld xmm4, 3 + LONG $0xe0eb0f66 // por xmm4, xmm0 + LONG $0x426e0f66; BYTE $0x70 // movd xmm0, dword [rdx + 112] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + LONG $0x426e0f66; BYTE $0x74 // movd xmm0, dword [rdx + 116] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x09 // psrld xmm4, 9 + LONG $0xf3720f66; BYTE $0x17 // pslld xmm3, 23 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x48 // movd xmm4, dword [rdx + 72] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe0ef0f66 // pxor xmm4, xmm0 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x4c // movd xmm4, dword [rdx + 76] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1b // psrld xmm3, 27 + LONG $0xf2720f66; BYTE $0x05 // pslld xmm2, 5 + LONG $0xd3eb0f66 // por xmm2, xmm3 + LONG $0x5a6e0f66; BYTE $0x50 // movd xmm3, dword [rdx + 80] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + LONG $0x5a6e0f66; BYTE $0x54 // movd xmm3, dword [rdx + 84] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd16f0f66 // movdqa xmm2, xmm1 + LONG $0xd2720f66; BYTE $0x1d // psrld xmm2, 29 + LONG $0xf1720f66; BYTE $0x03 // pslld xmm1, 3 + LONG $0xcaeb0f66 // por xmm1, xmm2 + LONG $0x526e0f66; BYTE $0x58 // movd xmm2, dword [rdx + 88] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + LONG $0xd3ef0f66 // pxor xmm2, xmm3 + LONG $0xcafa0f66 // psubd xmm1, xmm2 + LONG $0x526e0f66; BYTE $0x5c // movd xmm2, dword [rdx + 92] + LONG $0xd2700f66; BYTE $0x00 // pshufd xmm2, xmm2, 0 + 
LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc86f0f66 // movdqa xmm1, xmm0 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf0720f66; BYTE $0x17 // pslld xmm0, 23 + LONG $0xc1eb0f66 // por xmm0, xmm1 + LONG $0x4a6e0f66; BYTE $0x30 // movd xmm1, dword [rdx + 48] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xcaef0f66 // pxor xmm1, xmm2 + LONG $0xc1fa0f66 // psubd xmm0, xmm1 + LONG $0x4a6e0f66; BYTE $0x34 // movd xmm1, dword [rdx + 52] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xc46f0f66 // movdqa xmm0, xmm4 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf4720f66; BYTE $0x05 // pslld xmm4, 5 + LONG $0xe0eb0f66 // por xmm4, xmm0 + LONG $0x426e0f66; BYTE $0x38 // movd xmm0, dword [rdx + 56] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc1ef0f66 // pxor xmm0, xmm1 + LONG $0xe0fa0f66 // psubd xmm4, xmm0 + LONG $0x426e0f66; BYTE $0x3c // movd xmm0, dword [rdx + 60] + LONG $0xe8700f66; BYTE $0x00 // pshufd xmm5, xmm0, 0 + LONG $0xecef0f66 // pxor xmm5, xmm4 + LONG $0xc36f0f66 // movdqa xmm0, xmm3 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf3720f66; BYTE $0x03 // pslld xmm3, 3 + LONG $0xd8eb0f66 // por xmm3, xmm0 + LONG $0x426e0f66; BYTE $0x40 // movd xmm0, dword [rdx + 64] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc5ef0f66 // pxor xmm0, xmm5 + LONG $0xd8fa0f66 // psubd xmm3, xmm0 + LONG $0x426e0f66; BYTE $0x44 // movd xmm0, dword [rdx + 68] + LONG $0xe0700f66; BYTE $0x00 // pshufd xmm4, xmm0, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xc26f0f66 // movdqa xmm0, xmm2 + LONG $0xd0720f66; BYTE $0x09 // psrld xmm0, 9 + LONG $0xf2720f66; BYTE $0x17 // pslld xmm2, 23 + LONG $0xd0eb0f66 // por xmm2, xmm0 + LONG $0x426e0f66; BYTE $0x18 // movd xmm0, dword [rdx + 24] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc4ef0f66 // pxor xmm0, xmm4 + LONG $0xd0fa0f66 // psubd xmm2, xmm0 + LONG $0x426e0f66; BYTE $0x1c // movd xmm0, dword [rdx + 28] + LONG $0xd8700f66; BYTE $0x00 // pshufd xmm3, xmm0, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xc16f0f66 // movdqa xmm0, xmm1 + LONG $0xd0720f66; BYTE $0x1b // psrld xmm0, 27 + LONG $0xf1720f66; BYTE $0x05 // pslld xmm1, 5 + LONG $0xc8eb0f66 // por xmm1, xmm0 + LONG $0x426e0f66; BYTE $0x20 // movd xmm0, dword [rdx + 32] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc3ef0f66 // pxor xmm0, xmm3 + LONG $0xc8fa0f66 // psubd xmm1, xmm0 + LONG $0x426e0f66; BYTE $0x24 // movd xmm0, dword [rdx + 36] + LONG $0xd0700f66; BYTE $0x00 // pshufd xmm2, xmm0, 0 + LONG $0xd1ef0f66 // pxor xmm2, xmm1 + LONG $0xc56f0f66 // movdqa xmm0, xmm5 + LONG $0xd0720f66; BYTE $0x1d // psrld xmm0, 29 + LONG $0xf5720f66; BYTE $0x03 // pslld xmm5, 3 + LONG $0xe8eb0f66 // por xmm5, xmm0 + LONG $0x426e0f66; BYTE $0x28 // movd xmm0, dword [rdx + 40] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc2ef0f66 // pxor xmm0, xmm2 + LONG $0xe8fa0f66 // psubd xmm5, xmm0 + LONG $0x426e0f66; BYTE $0x2c // movd xmm0, dword [rdx + 44] + LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 + LONG $0xc5ef0f66 // pxor xmm0, xmm5 + LONG $0xcc6f0f66 // movdqa xmm1, xmm4 + LONG $0xd1720f66; BYTE $0x09 // psrld xmm1, 9 + LONG $0xf4720f66; BYTE $0x17 // pslld xmm4, 23 + LONG $0xe1eb0f66 // por xmm4, xmm1 + LONG $0x0a6e0f66 // movd xmm1, dword [rdx] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xc8ef0f66 // pxor xmm1, xmm0 + LONG $0xe1fa0f66 // psubd xmm4, xmm1 + LONG $0x4a6e0f66; BYTE 
$0x04 // movd xmm1, dword [rdx + 4] + LONG $0xc9700f66; BYTE $0x00 // pshufd xmm1, xmm1, 0 + LONG $0xccef0f66 // pxor xmm1, xmm4 + LONG $0xe36f0f66 // movdqa xmm4, xmm3 + LONG $0xd4720f66; BYTE $0x1b // psrld xmm4, 27 + LONG $0xf3720f66; BYTE $0x05 // pslld xmm3, 5 + LONG $0xdceb0f66 // por xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x08 // movd xmm4, dword [rdx + 8] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe1ef0f66 // pxor xmm4, xmm1 + LONG $0xdcfa0f66 // psubd xmm3, xmm4 + LONG $0x626e0f66; BYTE $0x0c // movd xmm4, dword [rdx + 12] + LONG $0xe4700f66; BYTE $0x00 // pshufd xmm4, xmm4, 0 + LONG $0xe3ef0f66 // pxor xmm4, xmm3 + LONG $0xda6f0f66 // movdqa xmm3, xmm2 + LONG $0xd3720f66; BYTE $0x1d // psrld xmm3, 29 + LONG $0xf2720f66; BYTE $0x03 // pslld xmm2, 3 + LONG $0xd3eb0f66 // por xmm2, xmm3 + LONG $0x5a6e0f66; BYTE $0x10 // movd xmm3, dword [rdx + 16] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdcef0f66 // pxor xmm3, xmm4 + LONG $0xd3fa0f66 // psubd xmm2, xmm3 + LONG $0x5a6e0f66; BYTE $0x14 // movd xmm3, dword [rdx + 20] + LONG $0xdb700f66; BYTE $0x00 // pshufd xmm3, xmm3, 0 + LONG $0xdaef0f66 // pxor xmm3, xmm2 + LONG $0xd06f0f66 // movdqa xmm2, xmm0 + LONG $0xd1620f66 // punpckldq xmm2, xmm1 + LONG $0xec6f0f66 // movdqa xmm5, xmm4 + LONG $0xeb620f66 // punpckldq xmm5, xmm3 + LONG $0xc16a0f66 // punpckhdq xmm0, xmm1 + LONG $0xe36a0f66 // punpckhdq xmm4, xmm3 + LONG $0xca6f0f66 // movdqa xmm1, xmm2 + LONG $0xcd6c0f66 // punpcklqdq xmm1, xmm5 + LONG $0xd56d0f66 // punpckhqdq xmm2, xmm5 + LONG $0xd86f0f66 // movdqa xmm3, xmm0 + LONG $0xdc6c0f66 // punpcklqdq xmm3, xmm4 + LONG $0xc46d0f66 // punpckhqdq xmm0, xmm4 + LONG $0x0f7f0ff3 // movdqu oword [rdi], xmm1 + LONG $0x577f0ff3; BYTE $0x10 // movdqu oword [rdi + 16], xmm2 + LONG $0x5f7f0ff3; BYTE $0x20 // movdqu oword [rdi + 32], xmm3 + LONG $0x477f0ff3; BYTE $0x30 // movdqu oword [rdi + 48], xmm0 + RET
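The generated SSE2 above is dense but highly regular: each movdqa/psrld/pslld/por group is a 32-bit rotate built from two shifts, each movd plus pshufd pair broadcasts one round-key word into all four lanes, and the pxor/psubd pairs undo the key mixing for four blocks at once; the punpck* runs at entry and exit transpose four 16-byte blocks into word-sliced form and back. A scalar sketch of the decryption round being vectorized, assuming the standard LEA round function (the helper name is illustrative, not part of the package):

package lea

import "math/bits"

// leaDecRound undoes one LEA encryption round for a single block.
// x is the four-word state; rk holds this round's six key words.
func leaDecRound(x *[4]uint32, rk []uint32) {
	x0 := x[3]                                                 // X[0] was carried through unchanged
	x1 := (bits.RotateLeft32(x[0], -9) - (x0 ^ rk[0])) ^ rk[1] // undo ROL 9
	x2 := (bits.RotateLeft32(x[1], 5) - (x1 ^ rk[2])) ^ rk[3]  // undo ROR 5
	x3 := (bits.RotateLeft32(x[2], 3) - (x2 ^ rk[4])) ^ rk[5]  // undo ROR 3
	x[0], x[1], x[2], x[3] = x0, x1, x2, x3
}

The cmp rcx, 29 and cmp rcx, 25 branches near the top skip the rounds that only exist for the larger key sizes: LEA runs 24, 28, or 32 rounds for 128-, 192-, and 256-bit keys, and decryption consumes the key schedule from the highest round down, which is why the [rdx + ...] offsets shrink toward zero.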
diff --git a/lea/lea_amd64_stubs.go b/lea/lea_amd64_stubs.go
index a1203a5..dcc1eb7 100644
--- a/lea/lea_amd64_stubs.go
+++ b/lea/lea_amd64_stubs.go
@@ -1,13 +1,17 @@
-// Code generated by command: go run lea.go -out ../lea_amd64.s -stubs ../lea_amd64_stubs.go -pkg lea. DO NOT EDIT.
-
-//go:build amd64 && gc && !purego
-
-package lea
-
-func leaEnc4SSE2(ctx *leaContext, dst []byte, src []byte)
-
-func leaDec4SSE2(ctx *leaContext, dst []byte, src []byte)
-
-func leaEnc8AVX2(ctx *leaContext, dst []byte, src []byte)
-
-func leaDec8AVX2(ctx *leaContext, dst []byte, src []byte)
+//go:build amd64 && gc && !purego
+
+package lea
+
+import "unsafe"
+
+//go:noescape
+func __lea_encrypt_4block(ct, pt, rk unsafe.Pointer, round uint64)
+
+//go:noescape
+func __lea_decrypt_4block(pt, ct, rk unsafe.Pointer, round uint64)
+
+//go:noescape
+func __lea_encrypt_8block(ct, pt, rk unsafe.Pointer, round uint64)
+
+//go:noescape
+func __lea_decrypt_8block(pt, ct, rk unsafe.Pointer, round uint64)
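The substantive change in these stubs is how the round count travels: the old declarations smuggled it through a fourth unsafe.Pointer, built as unsafe.Pointer(uintptr(ctx.round)), which converts a plain integer into a pointer and is invalid under Go's unsafe.Pointer conversion rules. Passing round uint64 by value fixes that. A sketch of the matching amd64 wrapper, mirroring the arm64 wrappers later in this diff; the file it would live in (something like lea_amd64.go) is outside this patch, though the tests below do reference leaEnc4SSE2:

//go:build amd64 && gc && !purego

package lea

import "unsafe"

// leaEnc4SSE2 encrypts four consecutive 16-byte blocks with the
// SSE2 kernel. Sketch only; the real wrapper is not part of this diff.
func leaEnc4SSE2(ctx *leaContext, dst, src []byte) {
	__lea_encrypt_4block(
		unsafe.Pointer(&dst[0]), // ct
		unsafe.Pointer(&src[0]), // pt
		unsafe.Pointer(&ctx.rk[0]),
		uint64(ctx.round),
	)
}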
diff --git a/lea/lea_arm64.go b/lea/lea_arm64.go
index b87bc0a..abe5e39 100644
--- a/lea/lea_arm64.go
+++ b/lea/lea_arm64.go
@@ -7,35 +7,35 @@ import (
 )
 
 func init() {
-	leaEnc4 = leaEnc4ARM64
-	leaDec4 = leaDec4ARM64
+	leaEnc4 = leaEnc4NEON
+	leaDec4 = leaDec4NEON
 
-	leaEnc8 = leaEnc8ARM64
-	leaDec8 = leaDec8ARM64
+	leaEnc8 = leaEnc8NEON
+	leaDec8 = leaDec8NEON
 }
 
-func leaEnc4ARM64(ctx *leaContext, dst, src []byte) {
+func leaEnc4NEON(ctx *leaContext, dst, src []byte) {
 	lea_encrypt_4block(
 		unsafe.Pointer(&dst[0]),
 		unsafe.Pointer(&src[0]),
 		unsafe.Pointer(&ctx.rk[0]),
-		unsafe.Pointer(uintptr(ctx.round)),
+		uint64(ctx.round),
 	)
 }
 
-func leaDec4ARM64(ctx *leaContext, dst, src []byte) {
+func leaDec4NEON(ctx *leaContext, dst, src []byte) {
 	lea_decrypt_4block(
 		unsafe.Pointer(&dst[0]),
 		unsafe.Pointer(&src[0]),
 		unsafe.Pointer(&ctx.rk[0]),
-		unsafe.Pointer(uintptr(ctx.round)),
+		uint64(ctx.round),
 	)
 }
 
-func leaEnc8ARM64(ctx *leaContext, dst, src []byte) {
-	leaEnc4ARM64(ctx, dst[0x00:], src[0x00:])
-	leaEnc4ARM64(ctx, dst[0x40:], src[0x40:])
+func leaEnc8NEON(ctx *leaContext, dst, src []byte) {
+	leaEnc4NEON(ctx, dst[0x00:], src[0x00:])
+	leaEnc4NEON(ctx, dst[0x40:], src[0x40:])
 }
 
-func leaDec8ARM64(ctx *leaContext, dst, src []byte) {
-	leaDec4ARM64(ctx, dst[0x00:], src[0x00:])
-	leaDec4ARM64(ctx, dst[0x40:], src[0x40:])
+func leaDec8NEON(ctx *leaContext, dst, src []byte) {
+	leaDec4NEON(ctx, dst[0x00:], src[0x00:])
+	leaDec4NEON(ctx, dst[0x40:], src[0x40:])
 }
diff --git a/lea/lea_arm64.s b/lea/lea_arm64_neon.s
similarity index 99%
rename from lea/lea_arm64.s
rename to lea/lea_arm64_neon.s
index 8f1e910..efe50d6 100644
--- a/lea/lea_arm64.s
+++ b/lea/lea_arm64_neon.s
@@ -1,4 +1,4 @@
-//go:build !noasm && arm64
+//go:build arm64 && gc && !purego
 // AUTO-GENERATED BY GOAT -- DO NOT EDIT
 
 TEXT ·lea_encrypt_4block(SB), $0-32
@@ -657,7 +657,7 @@
 WORD $0x6f290480 // ushr.4s v0, v4, #23
 WORD $0x6f295480 // sli.4s v0, v4, #9
 WORD $0xf100647f // cmp x3, #25
- WORD $0x54001b6b // b.lt LBB0_3
+ WORD $0x54001b63 // b.lo LBB0_3
 WORD $0x91094048 // add x8, x2, #592
 WORD $0x4d40c904 // ld1r.4s { v4 }, [x8]
 WORD $0x6e221c84 // eor.16b v4, v4, v2
@@ -889,7 +889,7 @@ TEXT ·lea_decrypt_4block(SB), $0-32
 WORD $0x910003fd // mov x29, sp
 WORD $0x4c400820 // ld4.4s { v0, v1, v2, v3 }, [x1]
 WORD $0xf100747f // cmp x3, #29
- WORD $0x54000dcb // b.lt LBB1_2
+ WORD $0x54000dc3 // b.lo LBB1_2
 WORD $0x6f370404 // ushr.4s v4, v0, #9
 WORD $0x6f375404 // sli.4s v4, v0, #23
 WORD $0x910ba048 // add x8, x2, #744
@@ -1000,7 +1000,7 @@
 WORD $0x6e251c83 // eor.16b v3, v4, v5
 WORD $0x14000003 // b LBB1_3
 WORD $0xf100647f // cmp x3, #25
- WORD $0x54000dab // b.lt LBB1_4
+ WORD $0x54000da3 // b.lo LBB1_4
 WORD $0x6f370404 // ushr.4s v4, v0, #9
 WORD $0x6f375404 // sli.4s v4, v0, #23
 WORD $0x910a2048 // add x8, x2, #648
diff --git a/lea/lea_arm64_stubs.go b/lea/lea_arm64_stubs.go
index 43d7d1a..1c43305 100644
--- a/lea/lea_arm64_stubs.go
+++ b/lea/lea_arm64_stubs.go
@@ -1,4 +1,4 @@
-//go:build !noasm && arm64
+//go:build arm64 && gc && !purego
 // AUTO-GENERATED BY GOAT -- DO NOT EDIT
 
 package lea
@@ -6,7 +6,7 @@ package lea
 import "unsafe"
 
 //go:noescape
-func lea_encrypt_4block(ct, pt, rk, round unsafe.Pointer)
+func lea_encrypt_4block(ct, pt, rk unsafe.Pointer, round uint64)
 
 //go:noescape
-func lea_decrypt_4block(pt, ct, rk, round unsafe.Pointer)
+func lea_decrypt_4block(pt, ct, rk unsafe.Pointer, round uint64)
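The b.lt → b.lo edits in lea_arm64_neon.s above are the arm64 counterpart of the same type change: b.lt branches on a signed less-than, b.lo on an unsigned one, and the round count arriving in x3 is now a uint64 (the two encodings differ only in the condition nibble, 0xb versus 0x3). For the legal round counts both conditions happen to agree, but the unsigned branch matches the operand's type. What each compare-and-branch pair implements is a small dispatch over the key-size-dependent round count; a hypothetical helper, for illustration only:

// extraRounds reports which optional round groups execute for a given
// count: LEA-128 runs 24 rounds, LEA-192 runs 28, LEA-256 runs 32.
// The generated code skips a group via b.lo (or JB on amd64) whenever
// round is below the bound.
func extraRounds(round uint64) (over24, over28 bool) {
	return round >= 25, round >= 29
}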
diff --git a/lea/lea_asm.go b/lea/lea_asm.go
index 663219a..62456c6 100644
--- a/lea/lea_asm.go
+++ b/lea/lea_asm.go
@@ -7,6 +7,18 @@ import (
 	"fmt"
 )
 
+type funcBlock func(ctx *leaContext, dst, src []byte)
+
+var (
+	leaEnc1 funcBlock = leaEnc1Go
+	leaEnc4 funcBlock = leaEnc4Go
+	leaEnc8 funcBlock = leaEnc8Go
+
+	leaDec1 funcBlock = leaDec1Go
+	leaDec4 funcBlock = leaDec4Go
+	leaDec8 funcBlock = leaDec8Go
+)
+
 func init() {
 	leaNew = newCipherAsm
 	leaNewECB = newCipherAsmECB
diff --git a/lea/test_amd64_test.go b/lea/test_amd64_test.go
index 03ff565..ab532e2 100644
--- a/lea/test_amd64_test.go
+++ b/lea/test_amd64_test.go
@@ -6,8 +6,8 @@ import (
 	"testing"
 )
 
-func Test_Encrypt_4Blocks_SSE2(t *testing.T) { testAll(t, tb(4, leaEnc4Go, leaEnc4, false)) }
-func Test_Decrypt_4Blocks_SSE2(t *testing.T) { testAll(t, tb(4, leaDec4Go, leaDec4, false)) }
+func Test_Encrypt_4Blocks_SSE2(t *testing.T) { testAll(t, tb(4, leaEnc4Go, leaEnc4SSE2, false)) }
+func Test_Decrypt_4Blocks_SSE2(t *testing.T) { testAll(t, tb(4, leaDec4Go, leaDec4SSE2, false)) }
 
 func Test_Encrypt_8Blocks_SSE2(t *testing.T) { testAll(t, tb(8, leaEnc8Go, leaEnc8SSE2, false)) }
 func Test_Decrypt_8Blocks_SSE2(t *testing.T) { testAll(t, tb(8, leaDec8Go, leaDec8SSE2, false)) }
@@ -15,11 +15,11 @@ func Test_Decrypt_8Blocks_SSE2(t *testing.T) { testAll(t, tb(8, leaDec8Go, leaDe
 func Test_Encrypt_8Blocks_AVX2(t *testing.T) { testAll(t, tb(8, leaEnc8Go, leaEnc8AVX2, !hasAVX2)) }
 func Test_Decrypt_8Blocks_AVX2(t *testing.T) { testAll(t, tb(8, leaDec8Go, leaDec8AVX2, !hasAVX2)) }
 
-func Benchmark_Encrypt_4Blocks_SSE2(b *testing.B) { benchAll(b, bb(4, leaEnc4Go, false)) }
-func Benchmark_Decrypt_4Blocks_SSE2(b *testing.B) { benchAll(b, bb(4, leaDec4Go, false)) }
+func Benchmark_Encrypt_4Blocks_SSE2(b *testing.B) { benchAll(b, bb(4, leaEnc4SSE2, false)) }
+func Benchmark_Decrypt_4Blocks_SSE2(b *testing.B) { benchAll(b, bb(4, leaDec4SSE2, false)) }
 
-func Benchmark_Encrypt_8Blocks_SSE2(b *testing.B) { benchAll(b, bb(8, leaEnc8Go, hasAVX2)) }
-func Benchmark_Decrypt_8Blocks_SSE2(b *testing.B) { benchAll(b, bb(8, leaDec8Go, hasAVX2)) }
+func Benchmark_Encrypt_8Blocks_SSE2(b *testing.B) { benchAll(b, bb(8, leaEnc8SSE2, !hasAVX2)) }
+func Benchmark_Decrypt_8Blocks_SSE2(b *testing.B) { benchAll(b, bb(8, leaDec8SSE2, !hasAVX2)) }
 
-func Benchmark_Encrypt_8Blocks_AVX2(b *testing.B) { benchAll(b, bb(8, leaEnc8Go, hasAVX2)) }
-func Benchmark_Decrypt_8Blocks_AVX2(b *testing.B) { benchAll(b, bb(8, leaDec8Go, hasAVX2)) }
+func Benchmark_Encrypt_8Blocks_AVX2(b *testing.B) { benchAll(b, bb(8, leaEnc8AVX2, !hasAVX2)) }
+func Benchmark_Decrypt_8Blocks_AVX2(b *testing.B) { benchAll(b, bb(8, leaDec8AVX2, !hasAVX2)) }
diff --git a/lea/test_arm64_test.go b/lea/test_arm64_test.go
index 6e80c2b..af1aff3 100644
--- a/lea/test_arm64_test.go
+++ b/lea/test_arm64_test.go
@@ -6,8 +6,14 @@ import (
 	"testing"
 )
 
-func Test_Encrypt_4Blocks_ARM64(t *testing.T) { testAll(t, tb(4, leaEnc4Go, leaEnc4ARM64, false)) }
-func Test_Decrypt_4Blocks_ARM64(t *testing.T) { testAll(t, tb(4, leaDec4Go, leaDec4ARM64, false)) }
+func Test_Encrypt_4Blocks_NEON(t *testing.T) { testAll(t, tb(4, leaEnc4Go, leaEnc4NEON, false)) }
+func Test_Decrypt_4Blocks_NEON(t *testing.T) { testAll(t, tb(4, leaDec4Go, leaDec4NEON, false)) }
 
-func Benchmark_Encrypt_4Blocks_ARM64(b *testing.B) { benchAll(b, bb(4, leaEnc4ARM64, false)) }
-func Benchmark_Decrypt_4Blocks_ARM64(b *testing.B) { benchAll(b, bb(4, leaDec4ARM64, false)) }
+func Test_Encrypt_8Blocks_NEON(t *testing.T) { testAll(t, tb(8, leaEnc8Go, leaEnc8NEON, false)) }
+func Test_Decrypt_8Blocks_NEON(t *testing.T) { testAll(t, tb(8, leaDec8Go, leaDec8NEON, false)) }
+
+func Benchmark_Encrypt_4Blocks_NEON(b *testing.B) { benchAll(b, bb(4, leaEnc4NEON, false)) }
+func Benchmark_Decrypt_4Blocks_NEON(b *testing.B) { benchAll(b, bb(4, leaDec4NEON, false)) }
+
+func Benchmark_Encrypt_8Blocks_NEON(b *testing.B) { benchAll(b, bb(8, leaEnc8NEON, false)) }
+func Benchmark_Decrypt_8Blocks_NEON(b *testing.B) { benchAll(b, bb(8, leaDec8NEON, false)) }
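tb and bb are table-test helpers defined elsewhere in the package's test files; from the call sites, tb pairs a block count, the pure-Go reference, the function under test, and a skip flag, while bb takes a block count, the benchmarked function, and a skip flag. The invariant every one of these tests drives is worth spelling out; a hypothetical, self-contained distillation using the funcBlock type introduced in lea_asm.go above:

package lea

import (
	"bytes"
	"crypto/rand"
	"testing"
)

// assertSameOutput checks the property behind the table tests: the
// SIMD path must produce byte-identical output to the pure-Go
// reference for the same expanded key. ctx must already hold a
// valid key schedule.
func assertSameOutput(t *testing.T, blocks int, ctx *leaContext, ref, simd funcBlock) {
	src := make([]byte, blocks*16) // LEA's block size is 16 bytes
	if _, err := rand.Read(src); err != nil {
		t.Fatal(err)
	}
	want := make([]byte, len(src))
	got := make([]byte, len(src))
	ref(ctx, want, src)
	simd(ctx, got, src)
	if !bytes.Equal(want, got) {
		t.Fatal("SIMD path disagrees with pure-Go reference")
	}
}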